def inflect_word(lemma: LexiconEntry, tag: str, rules_tr, model, **kwargs):
    '''Generate inflected forms of `lemma` carrying the given tag.'''

    def _extract_tag(word):
        return ''.join(tokenize_word(word)[1])

    max_results = kwargs.get('max_results')
    # Look up candidate inflections in the rules transducer and keep only
    # those carrying the requested tag.
    lookup_results = rules_tr.lookup(lemma.symstr)
    inflections = []
    for w, c in lookup_results:
        if _extract_tag(w) == tag:
            try:
                inflections.append(LexiconEntry(unnormalize_word(w)))
            except Exception as e:
                logging.getLogger('main').warning(e)
    # Build an edge for every (lemma, inflection, rule) triple whose rule is
    # known to the model and score it with the model's edge cost.
    edges = []
    for infl in inflections:
        for rule in extract_all_rules(lemma, infl):
            if rule in model.rule_set:
                edge = GraphEdge(lemma, infl, rule)
                edge.attr['cost'] = model.edge_cost(edge)
                edges.append(edge)
    edges.sort(key=lambda x: x.attr['cost'])
    if max_results is not None:
        edges = edges[:max_results]
    if not edges:
        # No inflection found: return a placeholder result.
        return [(lemma, '---' + tag, '---')]
    return [(lemma, e.target, e.attr['cost']) for e in edges]

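# A minimal usage sketch (hypothetical names: assumes a `lexicon`, a trained
# `model` and a compiled rules transducer `rules_tr` from earlier pipeline
# steps; the tag string below is illustrative and must follow the lexicon's
# own tag notation):
#
#     lemma = lexicon['machen']
#     for lemma, form, cost in inflect_word(lemma, '<V.3.Sg>', rules_tr,
#                                           model, max_results=5):
#         print(form, cost)
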
def build_graph_from_training_edges(lexicon, training_file, graph_file):
    with open_to_write(graph_file) as fp:
        for word_1, word_2 in read_tsv_file(training_file, (str, str)):
            if not word_1:
                continue
            try:
                n1, n2 = lexicon[word_1], lexicon[word_2]
                for rule in extract_all_rules(n1, n2):
                    write_line(fp, (str(n1), str(n2), str(rule)))
            except KeyError:
                # Report whichever of the two words is missing from the
                # lexicon (the KeyError may come from either lookup).
                if word_1 not in lexicon:
                    logging.getLogger('main').warning(
                        '%s not in lexicon' % word_1)
                if word_2 not in lexicon:
                    logging.getLogger('main').warning(
                        '%s not in lexicon' % word_2)

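# Usage sketch: `training_file` is assumed to be a TSV file with one
# (word_1, word_2) pair per line, as read by read_tsv_file() above; the
# file names are hypothetical:
#
#     build_graph_from_training_edges(lexicon, 'training.tsv', 'graph.txt')
#
# Each output line then holds one (source, target, rule) triple in TSV form.
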
def _compute_leaf_prob(self):
    logging.getLogger('main').info('Computing leaf probabilities...')
    self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                             dtype=np.float64)
    edge_set = EdgeSet(self.lexicon)

    def _empty_edge_set(edge_set):
        # Flush the accumulated edges: multiply the probability of each
        # edge *not* applying into the leaf probability of its source
        # word, then start over with a fresh edge set.
        lexicon = edge_set.lexicon
        n = len(edge_set)
        probs = 1 - self.model.edges_prob(edge_set)
        for e_id, edge in enumerate(edge_set):
            word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
            w_id = lexicon.get_id(word)
            t_id = self.tag_idx[edge.source.tag]
            self.leaf_prob[w_id, t_id] *= probs[e_id]
        logging.getLogger('main').debug('processed %d edges' % n)
        return EdgeSet(lexicon)

    # Compose the lexicon (concatenated with tag generators) with the rule
    # transducer and enumerate all resulting (source, target) string pairs
    # with hfst-fst2strings.
    lexicon_tr = self.lexicon.to_fst()
    lexicon_tr.concatenate(FST.generator(self.tagset))
    rules_tr = self.model.rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')

    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    while True:
        line = p.stdout.readline().strip()
        if not line:
            break
        w1, w2 = line.split(':')
        n1, n2 = LexiconEntry(w1), LexiconEntry(w2)
        for rule in extract_all_rules(n1, n2):
            if rule in self.model.rule_set:
                edge_set.add(GraphEdge(n1, n2, rule))
        # Flush periodically to keep memory usage bounded.
        if len(edge_set) > 300000:
            edge_set = _empty_edge_set(edge_set)
    p.wait()
    # Flush the remaining edges.
    edge_set = _empty_edge_set(edge_set)

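# In effect, the procedure above computes, for every word w and tag t,
#
#     leaf_prob[w, t] = product over all edges e with source (w, t)
#                       of (1 - P(e))
#
# with P(e) taken from self.model.edges_prob(): the probability that no
# candidate edge applies to (w, t), i.e. that the entry remains a leaf in
# the derivation graph. The periodic flushing keeps at most ~300,000
# candidate edges in memory at any time.
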
def _extract_candidate_edges(words: Iterable[str],
                             output_fun: Callable[..., None],
                             lexicon: Lexicon,
                             transducer_path: str) -> None:
    # For each input word, find similar words with the FST and emit every
    # (similar word, rule) pair that could link the two lexicon entries.
    sw = similar_words(words, transducer_path)
    for word_1, simwords in sw:
        for v1 in lexicon.get_by_symstr(word_1):
            results_for_v1 = []
            for word_2 in simwords:
                for v2 in lexicon.get_by_symstr(word_2):
                    if v1 != v2 and _is_possible_edge(v1, v2):
                        for rule in extract_all_rules(v1, v2):
                            results_for_v1.append((v2.literal, str(rule)))
            output_fun((v1.literal, results_for_v1))

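# Usage sketch (hypothetical driver): output_fun only needs to accept one
# (word, candidates) tuple per lexicon entry, so collecting results in a
# list works; the words and transducer path are illustrative:
#
#     candidates = []
#     _extract_candidate_edges(['walked', 'walking'], candidates.append,
#                              lexicon, 'similar_words.fsm')
#     # candidates == [(word, [(similar_word, rule_string), ...]), ...]
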
def analyze(self, target: LexiconEntry, compute_cost=True, **kwargs) \
        -> List[GraphEdge]:
    # TODO 1a. if predict_tag: get possible tags from the tag predictor
    # 1. get possible sources for the given target
    sources = set(sum([self.lexicon.get_by_symstr(word) \
                       for word, cost in self.fst.lookup(target.symstr)],
                      []))
    # 2. get possible (source, rule) pairs by extracting rules
    edge_set = EdgeSet(self.lexicon)
    for source in sources:
        for rule in extract_all_rules(source, target):
            if rule in self.model.rule_set:
                if self.predict_vec:
                    target_pr = target.copy()
                    edge = GraphEdge(source, target_pr, rule)
                    target_pr.vec = self.model.predict_target_feature_vec(
                        edge)
                    edge_set.add(edge)
                else:
                    edge_set.add(GraphEdge(source, target, rule))
    # back-formation: additionally look up sources through the inverted
    # rule transducer
    if self.enable_back_formation and \
            (self.max_results is None or len(edge_set) < self.max_results):
        lookup_results = set()
        for w, c in self.inv_rules_tr.lookup(target.symstr):
            try:
                lookup_results.add(unnormalize_word(
                    re.sub(hfst.EPSILON, '', w)))
            except Exception as e:
                logging.getLogger('main').warning(str(e))
        sources = []
        for word in lookup_results:
            try:
                sources.append(LexiconEntry(word))
            except Exception as e:
                logging.getLogger('main').warning(str(e))
        for source in sources:
            for rule in extract_all_rules(source, target):
                if rule in self.model.rule_set:
                    edge_set.add(GraphEdge(source, target, rule))
    # analysis as root
    if self.include_roots:
        edge_set.add(GraphEdge(None, target, None))
    # 3. score the edges
    # FIXME this is inefficient and may break on some model components
    # that don't have the method .edge_cost()
    for edge in edge_set:
        edge.attr['cost'] = 0
        if edge.source is not None:
            edge.attr['cost'] += self.model.edge_cost(edge)
            if edge.source not in self.lexicon:
                edge.attr['cost'] += self.model.root_cost(edge.source)
        else:
            edge.attr['cost'] += self.model.root_cost(edge.target)
    results = list(edge_set)
    # 4. sort the analyses according to the cost
    results.sort(key=lambda r: r.attr['cost'])
    if self.max_results is not None:
        results = results[:self.max_results]
    return results

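# Usage sketch: with an analyzer object configured as above (its lexicon,
# model and transducers are built elsewhere in the pipeline); the word is
# illustrative and the attribute names follow the GraphEdge constructor
# used above:
#
#     for edge in analyzer.analyze(LexiconEntry('unhappiness')):
#         print(edge.source, edge.target, edge.rule, edge.attr['cost'])
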