def run() -> None:
    """Build the candidate-edge graph from the wordlist and write the rule file.

    Side effects (all paths taken from ``shared.filenames``):
      * saves the lexicon transducer to 'lexicon-tr',
      * writes, sorts and filters the edge graph file 'graph',
      * writes '(rule, domain-size)' rows to 'rules'.
    """
    logger = logging.getLogger('main')

    logger.info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logger.info('Building the lexicon transducer...')
    lexicon_tr = lexicon.to_fst()
    FST.save_transducer(lexicon_tr, shared.filenames['lexicon-tr'])

    if shared.config['General'].getboolean('supervised'):
        logger.info('Building graph...')
        build_graph_from_training_edges(lexicon,
                                        shared.filenames['wordlist'],
                                        shared.filenames['graph'])
    else:
        logger.info('Building graph...')
        write_tsv_file(
            shared.filenames['graph'],
            build_graph_fstfastss(lexicon, shared.filenames['lexicon-tr']))

    # Graph rows are keyed by rule string in column 3; sort so that
    # read_tsv_file_by_key below can group them.
    sort_file(shared.filenames['graph'], key=3)
    update_file_size(shared.filenames['graph'])
    run_filters(shared.filenames['graph'])
    update_file_size(shared.filenames['graph'])

    # Write the rules file: one Rule per distinct rule string that survived
    # the filters (the grouped edges themselves are not needed here).
    rules = [Rule.from_string(rule_str)
             for rule_str, edges in read_tsv_file_by_key(
                 shared.filenames['graph'], key=3, show_progressbar=False)]

    # NOTE(review): the original rebuilt and re-saved the lexicon transducer
    # here, but the lexicon has not changed since the build above, so the
    # second to_fst()/save_transducer() round-trip was redundant and is
    # dropped; the existing `lexicon_tr` is reused.
    logger.info('Computing rule domain sizes...')
    write_tsv_file(shared.filenames['rules'],
                   ((str(rule), domsize)
                    for rule, domsize in compute_rule_domsizes(lexicon_tr,
                                                               rules)))
def create_new_words_acceptor_if_not_exists(filename, analyzer, lexicon):
    """Build and save an acceptor for analyzable words missing from the lexicon.

    Projects the analyzer's FST onto its input side, subtracts the words
    already present in ``lexicon`` and saves the result under ``filename``.
    Does nothing if ``filename`` already exists.
    """
    if file_exists(filename):
        return
    acceptor = hfst.HfstTransducer(analyzer.fst)
    # subtract() requires the tropical-OpenFST implementation
    acceptor.convert(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
    acceptor.input_project()
    acceptor.minimize()
    acceptor.subtract(lexicon.to_fst())
    acceptor.minimize()
    FST.save_transducer(acceptor, filename)
def run() -> None:
    """Compile the rule transducer (and, in supervised mode, the root
    transducer) and save them under the configured filenames."""
    log = logging.getLogger('main')
    rules = load_rules()
    roots = load_roots()

    log.info('Building the rule transducer...')
    FST.save_transducer(build_rule_transducer(rules),
                        shared.filenames['rules-tr'])

    if shared.config['General'].getboolean('supervised'):
        log.info('Building the root transducer...')
        FST.save_transducer(build_root_transducer(roots),
                            shared.filenames['roots-tr'])
def _compute_leaf_prob(self):
    """Fill ``self.leaf_prob`` with per-(word, tag) leaf probabilities.

    Enumerates all candidate edges by composing the tagged lexicon
    transducer with the rule transducer, streaming the string pairs out of
    ``hfst-fst2strings``, and for every edge whose rule belongs to the
    model's rule set multiplies the corresponding leaf probability by
    ``1 - P(edge)``.  Edges are processed in batches to bound memory use.
    """
    logging.getLogger('main').info('Computing leaf probabilities...')
    self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                             dtype=np.float64)
    # FIX: the original constructed EdgeSet(lexicon) from an undefined
    # global name; the instance's lexicon is what is meant here.
    edge_set = EdgeSet(self.lexicon)

    def _apply_batch_and_reset(edge_set):
        """Fold the batched edges into self.leaf_prob; return an empty set."""
        lexicon = edge_set.lexicon
        probs = 1 - self.model.edges_prob(edge_set)
        for e_id, edge in enumerate(edge_set):
            word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
            w_id = lexicon.get_id(word)
            t_id = self.tag_idx[edge.source.tag]
            self.leaf_prob[w_id, t_id] *= probs[e_id]
        # FIX: removed a leftover debug print of the batch size.
        return EdgeSet(lexicon)

    lexicon_tr = self.lexicon.to_fst()
    lexicon_tr.concatenate(FST.generator(self.tagset))
    rules_tr = self.model.rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')

    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    while True:
        line = p.stdout.readline().strip()
        if not line:
            break
        w1, w2 = line.split(':')
        n1 = LexiconEntry(w1)
        n2 = LexiconEntry(w2)
        for rule in extract_all_rules(n1, n2):
            # FIX: the original tested membership in an undefined name
            # `rule_set`; the model's rule set is the intended container
            # (consistent with compute_possible_edges).
            if rule in self.model.rule_set:
                edge_set.add(GraphEdge(n1, n2, rule))
        # cap the in-memory batch size before applying it
        if len(edge_set) > 300000:
            edge_set = _apply_batch_and_reset(edge_set)
    # flush the final (partial) batch and reap the subprocess
    edge_set = _apply_batch_and_reset(edge_set)
    p.wait()
def compute_possible_edges(lexicon: Lexicon, rule_set: RuleSet) -> EdgeSet:
    """Enumerate all edges derivable from `lexicon` words via rules in
    `rule_set`.

    Composes lexicon ∘ rules ∘ lexicon⁻¹ into a transducer, streams its
    string pairs from the external `hfst-fst2strings` tool, and collects a
    GraphEdge for every pair whose connecting rule belongs to `rule_set`.
    Writes the intermediate transducer to the file 'tr.fsm' as a side effect.
    """
    # build the transducer
    lexicon_tr = lexicon.to_fst()
    tag_seqs = extract_tag_symbols_from_rules(rule_set)
    if tag_seqs:
        # append tag sequences so the FST emits tagged word forms
        lexicon_tr.concatenate(FST.generator(tag_seqs))
    rules_tr = rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    # restrict outputs to forms that are themselves lexicon words
    lexicon_tr.invert()
    tr.compose(lexicon_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')

    # stream 'input:output' pairs from the external hfst tool line by line
    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    edge_set = EdgeSet(lexicon)
    while True:
        line = p.stdout.readline().strip()
        if line:
            # assumes neither word contains ':' — TODO confirm upstream
            w1, w2 = line.split(':')
            w1_without_tag = re.sub(shared.compiled_patterns['tag'], '', w1)
            w2_without_tag = re.sub(shared.compiled_patterns['tag'], '', w2)
            # skip identity pairs (same surface form, possibly different tag)
            if w1_without_tag != w2_without_tag:
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                rules = algorithms.align.extract_all_rules(n1, n2)
                for rule in rules:
                    if rule in rule_set:
                        # map back to the canonical lexicon entries
                        n1_wt = lexicon.get_by_symstr(w1_without_tag)[0]
                        n2_wt = lexicon.get_by_symstr(w2_without_tag)[0]
                        edge_set.add(GraphEdge(n1_wt, n2_wt, rule))
        else:
            # EOF on the subprocess output
            break
    return edge_set
def save(self, filename: str) -> None:
    """Write the automaton to `filename`; if smoothing is enabled, also
    save the smoothing model next to it under '<filename>.smoothing'."""
    # TODO saving/loading smoothing and parameters
    FST.save_transducer(self.automaton, filename)
    smoothing_enabled = self.smoothing > 0
    if smoothing_enabled:
        self.smoothing_model.save(filename + '.smoothing')
def save(self, filename: str) -> None:
    """Serialize the wrapped transducer to `filename`."""
    transducer = self.fst
    FST.save_transducer(transducer, filename)