def process(fileName, encoding):
    u"""Scan a text file and pass every word unknown to CLP to __processWord.

    According to the original notes, words not found in CLP are stored in
    __results and probable abbreviations in __shortcuts; both containers are
    cleared at the start of every call.

    Lines that start with '#' followed by six digits are skipped. Every other
    line is split on brackets, runs of digits, whitespace and punctuation;
    tokens of two characters or fewer are ignored.

    fileName - name of the file to read
    encoding - encoding used to decode each line of the file
    Return None
    """
    __results.clear()
    __shortcuts.clear()

    # A context manager guarantees the file handle is closed even when
    # decoding or the CLP lookup raises part-way through the file.
    with open(fileName, "r") as source:
        for line in source:
            # prepare line
            line = line.strip().decode(encoding)
            if re.match('^#\d{6}', line):
                continue

            # process line
            for word in re.split('\[|\]|\d+|\s+|[-&=#`;!.:?,\")(\'\\_/]', line):
                if len(word) <= 2:
                    continue

                # plp_rec gets the word encoded in the default encoding; a
                # falsy result is treated as "not recognised".
                if not plp.plp_rec(word.encode(default_encoding())):
                    __processWord(word)
    def __loadEntityFromClp(self, similar_form):
        """Build an Entity from the first CLP identifier found for a form.

        similar_form - word form to look up; it is encoded with
                       default_encoding() before being passed to plp.plp_rec
        Return the Entity. When plp.plp_rec yields at least one identifier,
        base, label and forms are filled from the first one only. prefix is
        always set from self.__getPrefix(base, forms).
        """
        def to_text(raw):
            # Values coming back from plp are decoded with the default encoding.
            return raw.decode(default_encoding())

        result = Entity()
        encoded_form = similar_form.encode(default_encoding())

        # Only the first identifier is consumed; the loop exits right after it.
        for clp_id in plp.plp_rec(encoded_form):
            result.base = to_text(plp.plp_bform(clp_id))
            result.label = to_text(plp.plp_label(clp_id))
            for raw_form in plp.plp_forms(clp_id):
                result.forms.append(to_text(raw_form))
            break

        result.prefix = self.__getPrefix(result.base, result.forms)
        return result
Esempio n. 3
0
def get_base_word(word):
    """Return the base form of ``word`` as reported by plp, or "undefined".

    The word is UTF-8 encoded and passed to plp.plp_rec; the first identifier
    of the result is handed to plp.plp_bform and the answer is decoded from
    UTF-8.

    word - word to look up
    Return the decoded base form, or the string "undefined" when any step of
    the lookup fails (for example when plp_rec returns nothing to index).
    """
    try:
        ids = plp.plp_rec(word.encode('UTF-8'))
        return plp.plp_bform(ids[0]).decode('UTF-8')
    except Exception:
        # Best-effort lookup: ordinary errors map to the sentinel value, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return "undefined"