Ejemplo n.º 1
0
def _loadSegmenter(lang):
  """Load the pickled segmenter model for `lang`, or build a fresh one.

  The model is read from the gzip-compressed pickle
  ``__PREFIX__/share/bitextor/ulysses-data/<lang>.pickle.gz``. If that file
  cannot be opened or unpickled for any reason, a new ``ulysses.Ulysses``
  instance is created and ``init_model()`` is called on it instead.
  """
  model_path = "__PREFIX__/share/bitextor/ulysses-data/{0}.pickle.gz".format(lang)
  try:
    # The context manager closes the gzip handle even when unpickling fails.
    with gzip.open(model_path, "r") as model_file:
      # NOTE: pickle can execute arbitrary code while loading; the model
      # files under the install prefix must come from a trusted source.
      return pickle.load(model_file)
  except Exception:
    # Deliberate best-effort: a missing or unreadable model is not fatal.
    # `Exception` (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate.
    segmenter = ulysses.Ulysses()
    segmenter.init_model()
    return segmenter


def trainSegmenters(reader, l1, l2):
  """Feed both segmenter models with the text found in `reader`.

  Args:
    reader: iterable of byte strings. Each one is decoded as UTF-8,
      stripped and split on tabs; fields 2 and 3 (0-based) must hold
      base64-encoded UTF-8 text, for `l1` and `l2` respectively.
    l1: language identifier used to locate the first model file.
    l2: language identifier used to locate the second model file.

  Returns:
    A tuple ``(mitok_l1, mitok_l2, reader_list)`` where ``reader_list`` is
    the list of decoded, stripped lines, in input order.

  Raises:
    IndexError: if a line has fewer than four tab-separated fields.
    UnicodeDecodeError: if a line or a decoded field is not valid UTF-8.
  """
  reader_list = []

  mitok_l1 = _loadSegmenter(l1)
  mitok_l2 = _loadSegmenter(l2)

  for line in reader:
    # The line is stored before parsing, so it is kept even if parsing of
    # its fields raises afterwards.
    reader_list.append(line.decode("utf-8").strip())
    fields = reader_list[-1].split("\t")

    text1 = base64.b64decode(fields[2]).decode("utf-8")
    mitok_l1.feed_model(ulysses.splitinwords(text1))

    text2 = base64.b64decode(fields[3]).decode("utf-8")
    mitok_l2.feed_model(ulysses.splitinwords(text2))

  mitok_l1.update_model()
  mitok_l2.update_model()

  return mitok_l1, mitok_l2, reader_list
Ejemplo n.º 2
0
def splitSegs(mitok, text):
  """Split `text` into segments using the segmenter `mitok`.

  The text is first passed through ``ulysses.splitinwords`` and the result
  is handed to ``mitok.split``, whose return value is returned unchanged.
  """
  words = ulysses.splitinwords(text)
  segments = mitok.split(words)
  return segments