#!/usr/bin/env python

import base64

import sys
sys.path.append('/home/buck/net/build/bitextor/share/bitextor/utils')
from unicodepunct import get_unicode_punct
from nltk import wordpunct_tokenize

# Characters stripped from both ends of every token in get_words().
# NOTE(review): presumably a string of Unicode punctuation marks, since it is
# passed to str.strip() -- confirm against unicodepunct.get_unicode_punct.
punctuation_chars = get_unicode_punct()

# same tokenization as used in bitextor-lett2idx


def get_words(untokenized_text):
    """Split raw text into lowercased words with edge punctuation removed.

    The text is tokenized with NLTK's ``wordpunct_tokenize`` and lowercased.
    Each token then has the characters in ``punctuation_chars`` stripped from
    both ends; tokens left empty by that stripping are dropped.

    Returns a list of the remaining tokens in their original order.
    """
    tokens = wordpunct_tokenize(untokenized_text)
    # Lowercase the space-joined token stream as a whole, then split it back
    # on whitespace.
    lowered_tokens = " ".join(tokens).lower().split()
    trimmed = (token.strip(punctuation_chars) for token in lowered_tokens)
    return [token for token in trimmed if token]


def read_lett(filename, lang=None, as_set=False):
    # with codecs.open(filename, 'r', 'utf-8') as lettfile:
    with open(filename, 'r') as lettfile:
        for linenr, line in enumerate(lettfile):
            fields = line.strip().split("\t")
            if lang is not None and lang != fields[0]:
                continue
            text = base64.b64decode(fields[6])
            # print repr(text)
# Example #2
# 0
# Optional morphological analysers, one per direction; both default to None.
oparser.add_argument(
    "--morphanalyser_sl",
    dest="morphanal1",
    default=None,
    help="Path to the Apertium's morphological analyser for SL to TL"
)
oparser.add_argument(
    "--morphanalyser_tl",
    dest="morphanal2",
    default=None,
    help="Path to the Apertium's morphological analyser for TL to SL"
)

# Mandatory codes identifying the two languages of the pair.
oparser.add_argument(
    "--lang1",
    dest="lang1",
    required=True,
    help="Two-characters-code for language 1 in the pair of languages"
)
oparser.add_argument(
    "--lang2",
    dest="lang2",
    required=True,
    help="Two-characters-code for language 2 in the pair of languages"
)

# Parse the command line; the results are read below as options.<dest>.
options = oparser.parse_args()

# Pick the input stream: the file named by options.lett when one was given,
# otherwise standard input. None is a singleton, so it is tested by identity
# ("is not None") rather than by equality.
if options.lett is not None:
  reader = open(options.lett, "r")
else:
  reader = sys.stdin

# Accumulators initialised before the input is read.
# NOTE(review): presumably updated while the input lines are processed;
# their use is not visible in this excerpt -- confirm.
word_map = {}
docnumber = 0

# Punctuation characters provided by the unicodepunct helper.
punctuation = get_unicode_punct()
for line in reader:
  ##################
  #Parsing the text:
  ##################
  fields=line.strip().split("\t")
  if len(fields)>=6:
    lang=fields[0]
    #Decoding base 64:
    text = base64.b64decode(fields[5]).decode("utf-8")
    if len(text.strip()) != 0 and options.morphanal1 != None and lang == options.lang1:
      morphanalyser = ["__BASH__", options.morphanal1]
      spmorph = subprocess.Popen(morphanalyser, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
      morph_stdout,error = spmorph.communicate(input=text)
      if len(error.strip()) == 0:
        text =  re.sub(r"\^\*?", r"", re.sub(r"[/<][^$]*\$", r"", morph_stdout.decode("utf-8")))