def __init__(self, methodName='runTest'):
    """
    Prepare the dictionary fixtures shared by the test methods.

    Sets the following attributes before delegating to the parent
    initialiser:

    * ``current_dir`` -- directory containing this file (symlinks resolved).
    * ``temp_dict1_path`` -- a freshly created temporary directory.
    * ``dict1_file`` -- ``dict1.txt`` from ``current_dir``, opened for
      reading.
    * ``dict1`` -- ``PyDic`` built from ``temp_dict1_path`` after
      ``PyDicCreator.generate`` has been run against it.
    * ``dict1m`` -- ``PyDic`` built directly from the string ``'dict1.txt'``.

    :param methodName: name of the test method, forwarded unchanged to the
        parent class initialiser.
    """
    this_file = os.path.realpath(__file__)
    self.current_dir = os.path.dirname(this_file)
    self.temp_dict1_path = tempfile.mkdtemp()

    source_path = os.path.join(self.current_dir, 'dict1.txt')
    # NOTE(review): this handle is kept on the instance and is not closed
    # here, and the temporary directory is not removed here either.
    self.dict1_file = open(source_path)

    creator = PyDicCreator()
    creator.generate(
        self.dict1_file,
        self.temp_dict1_path,
        'dict1',
        verbose=False,
    )

    self.dict1 = PyDic(self.temp_dict1_path)
    self.dict1m = PyDic('dict1.txt')
    return super(TestPyDicBase, self).__init__(methodName)
def run(self):
    """
    Runs as a command line tool.

    Parses the command line, loads the dictionary given by
    ``--dictionary-file`` into ``self.dictionary``, builds ``self.index``
    with ``self.load_index`` and then handles the input line by line.

    Each input line is decoded as UTF-8 and stripped. Non-empty lines that
    do not start with ``#`` are passed to ``self.process`` and the returned
    items are joined with the ``--delimiter`` value; empty lines and ``#``
    lines are written back as they are (after stripping). Every output line
    is encoded as UTF-8.

    Input comes from the optional positional ``FILE`` argument, or from
    standard input when it is omitted. Output goes to ``--output``, or to
    standard output when it is omitted. Files opened by this method are
    closed before it returns, even when processing raises; the standard
    streams are never closed.
    """
    parser = argparse.ArgumentParser(
        description='Makes inflection of a flat text file with words.')
    parser.add_argument('-d', '--delimiter', default=u',')
    parser.add_argument('-f', '--dictionary-file',
                        help="path to file with text dictionary",
                        required=True)
    parser.add_argument('-t', '--output', help="output file name")
    # NOTE(review): --base-forms is parsed but not read anywhere in this
    # method.
    parser.add_argument('-b', '--base-forms', action="store_true",
                        help="only base forms")
    parser.add_argument('-v', '--verbose', action="store_true",
                        help="debug verbose mode")
    parser.add_argument('input', metavar='FILE',
                        help="filename to process", nargs='?')
    args = parser.parse_args()

    input_stream = sys.stdin
    if args.input:
        input_stream = open(args.input)
    try:
        output_stream = sys.stdout
        if args.output:
            output_stream = open(args.output, 'w')
        try:
            self.dictionary = PyDic(args.dictionary_file)
            self.index = self.load_index(self.dictionary)

            for line in input_stream:
                line = line.decode('utf-8').strip()
                if line and line[0] != '#':
                    print >> output_stream, args.delimiter.join(
                        self.process(self.dictionary, self.index, line,
                                     debug=args.verbose)).encode('utf-8')
                else:
                    # Blank lines and comments are passed through.
                    print >> output_stream, line.encode('utf-8')
        finally:
            # Only close what was opened here; leave sys.stdout alone.
            if output_stream is not sys.stdout:
                output_stream.close()
    finally:
        # Only close what was opened here; leave sys.stdin alone.
        if input_stream is not sys.stdin:
            input_stream.close()
def setUp(self):
    """
    Create the per-test fixtures.

    Stores a new ``PydicStemmer`` on ``self.stemmer``, a ``PyDic`` built
    from ``'dict1.txt'`` on ``self.dictionary``, and the result of the
    stemmer's ``build_index`` for that dictionary on ``self.index``.
    """
    self.stemmer = stemmer = PydicStemmer()
    self.dictionary = dictionary = PyDic('dict1.txt')
    self.index = stemmer.build_index(dictionary)
def load_dictionary(self, path):
    """
    Build a ``PyDic`` from ``path`` and register it.

    The new dictionary is stored in ``self.dictionaries`` under its own
    ``name`` attribute; an existing entry with the same name is replaced.

    :param path: value passed straight to the ``PyDic`` constructor.
    """
    loaded = PyDic(path)
    key = loaded.name
    self.dictionaries[key] = loaded
### LEMMATISING ### from nltk.stem import WordNetLemmatizer from nltk.corpus import wordnet as wn from pattern3.fr import parse as frparse from pattern3.nl import parse as nlparse from pattern3.de import parse as deparse from pattern3.it import parse as itparse from pydic import PyDic from pymystem3 import Mystem if __name__ == "__main__": # Initialising Lemmatisers with logs print("Initialising lemmatiser for Polish... ", end='\r') pl_dict = PyDic('pydic/odm.txt') print("Initialising lemmatiser for Russian... ", end='\r') ru_lemmatiser = Mystem() print("Initialising lemmatiser for English... ", end='\r') en_lemmatiser = WordNetLemmatizer() print("Done initialising lemmatisers. ") def pl_lemmatise(word): """ Lemmatiser for Polish :param word: string :return: string """ word_forms = pl_dict.word_base(word) if word_forms: