def process(fileName, encoding):
    """Scan a text file and report every word that CLP does not recognize.

    Both module-level collections ``__results`` and ``__shortcuts`` are
    cleared at the start of every call.  Each word that ``plp.plp_rec`` does
    not recognize is passed to ``__processWord``; according to the original
    documentation, unknown words end up in ``__results`` and probable
    abbreviations in ``__shortcuts``.

    Lines starting with ``#`` followed by six digits are skipped, and words
    of two characters or fewer are ignored.

    fileName - name of the file to read
    encoding - character encoding of the file

    Return None
    """
    __results.clear()
    __shortcuts.clear()

    # The context manager guarantees the handle is closed even when decoding
    # or a dictionary lookup raises.  The file is read as bytes because every
    # line is explicitly decoded with the caller-supplied encoding below.
    with open(fileName, "rb") as source:
        for line in source:
            # prepare line
            line = line.strip().decode(encoding)
            if re.match('^#\d{6}', line):
                continue

            # process line: split on brackets, digit runs, whitespace and
            # the listed punctuation characters
            for word in re.split('\[|\]|\d+|\s+|[-&=#`;!.:?,\")(\'\\_/]', line):
                if len(word) <= 2:
                    continue
                # plp_rec returns a falsy value for words unknown to CLP
                if not plp.plp_rec(word.encode(default_encoding())):
                    __processWord(word)
def __loadEntityFromClp(self, similar_form):
    """Build an Entity from the first CLP id recognized for similar_form.

    similar_form is encoded with default_encoding() and looked up with
    plp.plp_rec.  Only the first id returned is used: its plp_bform and
    plp_label results are decoded into ``base`` and ``label``, and every
    entry of plp_forms is decoded and appended to ``forms``.  When nothing
    is recognized the Entity keeps its initial base, label and forms.

    In both cases ``prefix`` is set from self.__getPrefix(base, forms)
    before the Entity is returned.
    """
    loaded = Entity()

    encoded_form = similar_form.encode(default_encoding())
    # Only the first recognized id matters; the remaining ones are ignored.
    first_id = next(iter(plp.plp_rec(encoded_form)), None)

    if first_id is not None:
        raw_base = plp.plp_bform(first_id)
        loaded.base = raw_base.decode(default_encoding())

        raw_label = plp.plp_label(first_id)
        loaded.label = raw_label.decode(default_encoding())

        for raw_form in plp.plp_forms(first_id):
            decoded_form = raw_form.decode(default_encoding())
            loaded.forms.append(decoded_form)

    loaded.prefix = self.__getPrefix(loaded.base, loaded.forms)
    return loaded
def get_base_word(word):
    """Return the base form of ``word`` as reported by CLP.

    The word is encoded as UTF-8, looked up with plp.plp_rec, and the
    plp_bform result for the first returned id is decoded from UTF-8.

    This is a best-effort lookup: if anything goes wrong (no id returned,
    encoding or decoding failure, an error in the dictionary call) the
    literal string "undefined" is returned instead of raising.
    """
    try:
        ids = plp.plp_rec(word.encode('UTF-8'))
        return plp.plp_bform(ids[0]).decode('UTF-8')
    except Exception:
        # Deliberately broad, but no longer a bare ``except``, so
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        return "undefined"