def load(filename: str, lexicon, model, **kwargs) -> 'Analyzer':
    """Load a pre-compiled analyzer transducer from *filename*.

    Builds an Analyzer with compilation disabled, attaches the FST read
    from *filename*, and prepares the inverted rules transducer.

    :param filename: path of the saved analyzer transducer.
    :param lexicon: lexicon object passed through to Analyzer.
    :param model: model object passed through to Analyzer.
    :param kwargs: extra Analyzer options; ``compile`` is forced to False
        because the FST is loaded instead of built.
    :return: the ready-to-use Analyzer instance.
    """
    # Fixed return annotation: the original declared ``-> None`` although
    # the function returns the constructed analyzer.
    kwargs['compile'] = False
    analyzer = Analyzer(lexicon, model, **kwargs)
    analyzer.fst = FST.load_transducer(filename)
    rules_tr = FST.load_transducer(shared.filenames['rules-tr'])
    # Invert a copy so the on-disk rules transducer stays usable as-is.
    analyzer.inv_rules_tr = hfst.HfstTransducer(rules_tr)
    analyzer.inv_rules_tr.invert()
    return analyzer
def _compile_fst(self) -> None:
    """Compose the lexicon transducer with the rules transducer and
    convert the result into optimized-lookup (weighted) format.

    Side effects: sets ``self.inv_rules_tr`` (inverted copy of the rules
    FST) and ``self.fst`` (minimized, inverted lexicon∘rules FST).
    """
    logger = logging.getLogger('main')
    rules = FST.load_transducer(shared.filenames['rules-tr'])
    # Keep an inverted copy of the rules for later use; invert the copy
    # so `rules` itself remains unchanged for the composition below.
    self.inv_rules_tr = hfst.HfstTransducer(rules)
    self.inv_rules_tr.invert()
    logger.info('Building lexicon transducer...')
    lexicon = FST.load_transducer(shared.filenames['lexicon-tr'])
    self.fst = hfst.HfstTransducer(lexicon)
    logger.info('Composing with rules...')
    self.fst.compose(rules)
    self.fst.minimize()
    self.fst.invert()
    self.fst.convert(hfst.ImplementationType.HFST_OLW_TYPE)
def load(filename: str, **kwargs) -> 'AlergiaRootModel':
    """Restore an AlergiaRootModel whose automaton was saved to *filename*.

    If smoothing is enabled on the restored model, the companion unigram
    smoothing model is loaded from ``filename + '.smoothing'``.
    """
    # TODO saving/loading smoothing and parameters
    model = AlergiaRootModel(**kwargs)
    model.automaton = FST.load_transducer(filename)
    if model.smoothing > 0:
        model.smoothing_model = UnigramRootModel.load(filename + '.smoothing')
    return model
def similar_words_with_pylookup_static(words, transducer_path):
    '''Not really feasible because of astronomical memory consumption.
    Implemented only for comparison.'''
    transducer = FST.load_transducer(transducer_path)
    transducer.minimize()
    # Optimized-lookup format is required for fast lookup() calls.
    transducer.convert(hfst.ImplementationType.HFST_OL_TYPE)
    for word in words:
        # Deduplicate output strings; weights/costs are discarded.
        matches = {out for out, _cost in transducer.lookup(word)}
        yield (word, list(matches))
def run() -> None:
    """Inflect lemma+tag pairs and print the analyses (tab-separated).

    In interactive mode the pairs are read from stdin, one
    ``<lemma> <tag>`` pair per line; otherwise they come from the
    configured ``analyze.wordlist`` file.
    """
    logger = logging.getLogger('main')
    # NOTE(review): `lexicon` and `lexicon_tr` are assigned but never read
    # below; kept in case loading has required side effects -- TODO confirm.
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    lexicon_tr = FST.load_transducer(shared.filenames['lexicon-tr'])
    rules_tr = FST.load_transducer(shared.filenames['rules-tr'])
    # Convert to optimized-lookup (weighted) format for fast lookup.
    rules_tr.convert(hfst.ImplementationType.HFST_OLW_TYPE)
    model = ModelSuite.load()
    max_results = shared.config['inflect'].getint('max_results')
    if shared.options['interactive']:
        for line in sys.stdin:
            try:
                lemma_str, tag = line.rstrip().split()
                lemma = LexiconEntry(lemma_str)
                for analysis in inflect_word(lemma, tag, rules_tr, model,
                                             max_results=max_results):
                    print(*analysis, sep='\t')
            except Exception as e:
                logger.warning(e)
    else:
        pairs = []
        # FIXME is there a better solution for creating lists of LexiconEntry
        # objects and skipping the ones for which exceptions are thrown?
        for lemma, tag in read_tsv_file(shared.filenames['analyze.wordlist']):
            try:
                pairs.append((LexiconEntry(lemma), tag))
            except Exception as e:
                # Was `logging.warning` (root logger); use the 'main' logger
                # for consistency with the interactive branch.
                logger.warning(e)
        for lemma, tag in tqdm.tqdm(pairs):
            for analysis in inflect_word(lemma, tag, rules_tr, model,
                                         max_results=max_results):
                print(*analysis, sep='\t')