def _load_or_init_segmenter(lang):
    """Return the pickled segmenter model for *lang*, or a fresh one.

    Tries to unpickle the gzip-compressed model shipped under
    ``__PREFIX__/share/bitextor/ulysses-data/``. If that fails for any
    ordinary reason, a new ``ulysses.Ulysses`` instance is created and
    ``init_model()`` is called on it instead.
    """
    model_path = "__PREFIX__/share/bitextor/ulysses-data/{0}.pickle.gz".format(lang)
    try:
        # The context manager closes the gzip handle even if unpickling fails.
        with gzip.open(model_path, "rb") as model_file:
            # NOTE: pickle executes arbitrary code on load; this path must
            # only ever point at trusted, installation-provided data.
            return pickle.load(model_file)
    except Exception:
        # Deliberate best-effort fallback: any loading problem means "start
        # from an empty model". `Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt / SystemExit propagate.
        model = ulysses.Ulysses()
        model.init_model()
        return model


def trainSegmenters(reader, l1, l2):
    """Load (or create) one segmenter per language and train both on *reader*.

    Args:
        reader: iterable of lines; each line is decoded as UTF-8, stripped
            and split on tabs. Field 2 is base64-decoded as the text fed to
            the *l1* model, field 3 as the text fed to the *l2* model.
        l1: language identifier used to locate the first pickled model.
        l2: language identifier used to locate the second pickled model.

    Returns:
        A tuple ``(mitok_l1, mitok_l2, reader_list)`` where ``reader_list``
        holds every decoded and stripped line consumed from *reader*, in
        order.

    Raises:
        IndexError: if a line has fewer than four tab-separated fields.
    """
    mitok_l1 = _load_or_init_segmenter(l1)
    mitok_l2 = _load_or_init_segmenter(l2)

    reader_list = []
    for line in reader:
        decoded_line = line.decode("utf-8").strip()
        # Keep the decoded line so the caller can re-read the consumed input.
        reader_list.append(decoded_line)
        fields = decoded_line.split("\t")

        text1 = base64.b64decode(fields[2]).decode("utf-8")
        mitok_l1.feed_model(ulysses.splitinwords(text1))

        text2 = base64.b64decode(fields[3]).decode("utf-8")
        mitok_l2.feed_model(ulysses.splitinwords(text2))

    # Update each model once, after all the input has been fed.
    mitok_l1.update_model()
    mitok_l2.update_model()
    return mitok_l1, mitok_l2, reader_list
def splitSegs(mitok, text):
    """Split *text* with the segmenter *mitok*.

    The text is first broken into words with ``ulysses.splitinwords``; the
    words are then handed to ``mitok.split`` and its result is returned
    unchanged.
    """
    words = ulysses.splitinwords(text)
    return mitok.split(words)