def stemm(self, word):
    """Stem ``word`` and return the list of entities built for it.

    The entity list stored on the instance is reset on every call.

    Words ending in 'x' are never looked up (the reason is not visible in
    this block).  Such words, and words for which ``__findSimilarForms``
    returns None, are recorded in ``self.__onlybase`` through
    ``utility.add_to_dict`` and the method returns None.

    Otherwise ``__createEntities(word, similar_forms)`` is invoked --
    presumably it fills ``self.__entities``; confirm against its
    definition -- and ``self.__entities`` is returned.

    An empty ``word`` is handled like any other word that does not end
    in 'x' instead of raising IndexError.
    """
    self.__entities = []

    similar_forms = None
    # endswith() is safe for an empty string, unlike indexing the last char.
    if not word.endswith('x'):
        similar_forms = self.__findSimilarForms(word)

    if similar_forms is None:
        # Nothing to build entities from: remember the bare word only.
        utility.add_to_dict(self.__onlybase, word)
        return None

    self.__createEntities(word, similar_forms)
    return self.__entities
def __processWord(word):
    """Classify ``word`` and record it in the matching module-level collection.

    A word that is exactly three characters long or entirely upper-case is
    passed to ``__isRome``: when that returns a true value the word is added
    to ``__rome``, otherwise to ``__shortcuts``.  Every other word is added
    to ``__results``.  All additions go through ``utility.add_to_dict``.
    """
    # Ordinary word: neither three characters long nor all upper-case.
    if len(word) != 3 and not word.isupper():
        utility.add_to_dict(__results, word)
        return

    bucket = __rome if __isRome(word) else __shortcuts
    utility.add_to_dict(bucket, word)
#!/usr/bin/python
#-*- coding: utf-8 -*-
#
# Tally the words of a text file for which plp_rec() returns a false value
# (presumably: words the PLP library does not recognise -- confirm against
# the plp module) and print them ordered by their tally, highest first.
#
# Usage: <script> INPUT_FILE [u]
#
#   INPUT_FILE  text file read line by line and decoded as ISO-8859-2
#   u           optional; keep the collected words as decoded (unicode)
#               strings instead of ISO-8859-2 byte strings
#
# NOTE: this is Python 2 code (str.decode, dict.iteritems).
from plp import *
import sys
import re
import operator
import utility

if len(sys.argv) < 2:
    sys.exit("usage: %s INPUT_FILE [u]" % sys.argv[0])

# The flag is optional; a missing second argument simply means "no".
should_be_utf = len(sys.argv) > 2 and sys.argv[2] == 'u'

# Separators between words: brackets, digit runs, whitespace runs and
# assorted punctuation.  Compiled once instead of on every line.
word_separators = re.compile("\[|\]|\\d+|\\s+|[-=`;!.:?,\")(\'\\_/]")

# Maps word -> value maintained by utility.add_to_dict (presumably an
# occurrence count; the output below sorts on it).
unknown_words = {}

plp_init()

with open(sys.argv[1], "r") as source:
    for line in source:
        # Decode before lower() so that non-ASCII letters are lowercased too.
        for word in word_separators.split(line.decode('iso-8859-2').lower()):
            if not word:
                continue
            # plp_rec() is given the ISO-8859-2 encoded form of the word.
            encoded = word.encode('iso-8859-2')
            if plp_rec(encoded):
                continue
            utility.add_to_dict(unknown_words, word if should_be_utf else encoded)

for key, value in sorted(unknown_words.iteritems(),
                         key=operator.itemgetter(1), reverse=True):
    print(key + ' : ' + str(value))