#!/usr/bin/env python
import base64
import sys

# The bitextor utilities directory must be on sys.path before unicodepunct
# can be imported.
sys.path.append('/home/buck/net/build/bitextor/share/bitextor/utils')
from unicodepunct import get_unicode_punct
from nltk import wordpunct_tokenize

punctuation_chars = get_unicode_punct()


# same tokenization as used in bitextor-lett2idx
def get_words(untokenized_text):
    """Return the lowercased tokens of *untokenized_text*.

    The text is split with NLTK's ``wordpunct_tokenize``, lowercased, and
    every token has the characters in ``punctuation_chars`` stripped from
    both ends. Tokens that end up empty are dropped.
    """
    tokens = " ".join(wordpunct_tokenize(untokenized_text)).lower().split()
    stripped = (token.strip(punctuation_chars) for token in tokens)
    return [token for token in stripped if token]


def read_lett(filename, lang=None, as_set=False):
    # Reads `filename` line by line as tab-separated records. When `lang` is
    # given, records whose first field differs from it are skipped.
    # with codecs.open(filename, 'r', 'utf-8') as lettfile:
    with open(filename, 'r') as lettfile:
        for linenr, line in enumerate(lettfile):
            fields = line.strip().split("\t")
            if lang is not None and lang != fields[0]:
                continue
            # The seventh field carries the base64-encoded text.
            text = base64.b64decode(fields[6])
            # print repr(text)
oparser.add_argument(
    "--morphanalyser_sl",
    help="Path to the Apertium's morphological analyser for SL to TL",
    dest="morphanal1",
    default=None,
)
oparser.add_argument(
    "--morphanalyser_tl",
    help="Path to the Apertium's morphological analyser for TL to SL",
    dest="morphanal2",
    default=None,
)
oparser.add_argument(
    "--lang1",
    help="Two-characters-code for language 1 in the pair of languages",
    dest="lang1",
    required=True,
)
oparser.add_argument(
    "--lang2",
    help="Two-characters-code for language 2 in the pair of languages",
    dest="lang2",
    required=True,
)

options = oparser.parse_args()

# Read the input from the file given in options.lett, or from standard input
# when that option is unset.
if options.lett is not None:
    reader = open(options.lett, "r")
else:
    reader = sys.stdin

docnumber = 0
word_map = {}
punctuation = get_unicode_punct()

for line in reader:
    ##################
    # Parsing the text:
    ##################
    fields = line.strip().split("\t")
    if len(fields) >= 6:
        lang = fields[0]
        # Decoding base 64: the sixth field holds the UTF-8 text.
        text = base64.b64decode(fields[5]).decode("utf-8")
        if text.strip() and options.morphanal1 is not None and lang == options.lang1:
            # NOTE(review): "__BASH__" is presumably a placeholder replaced
            # with the shell path at build/install time -- confirm.
            morphanalyser = ["__BASH__", options.morphanal1]
            spmorph = subprocess.Popen(
                morphanalyser,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
            )
            # The pipes are opened in binary mode, so the decoded text must
            # be encoded back to bytes before it is sent to the analyser;
            # passing the str directly fails on non-ASCII input.
            morph_stdout, error = spmorph.communicate(input=text.encode("utf-8"))
            # Only replace the text when the analyser wrote nothing to
            # stderr; otherwise the raw text is kept.
            if len(error.strip()) == 0:
                # Inner sub: drop everything from the first "/" or "<" up to
                # and including "$". Outer sub: drop "^" and an optional "*"
                # after it. Presumably this strips the analyser's stream
                # markup, leaving the surface forms -- confirm.
                text = re.sub(r"\^\*?", r"", re.sub(r"[/<][^$]*\$", r"", morph_stdout.decode("utf-8")))