def inflect_word(lemma: LexiconEntry, tag: str, rules_tr, model, **kwargs):

    def _extract_tag(word):
        return ''.join(tokenize_word(word)[1])

    max_results = kwargs.get('max_results')
    lookup_results = rules_tr.lookup(lemma.symstr)
    # collect the lookup results that carry the requested tag
    inflections = []
    for w, c in lookup_results:
        if _extract_tag(w) == tag:
            try:
                inflections.append(LexiconEntry(unnormalize_word(w)))
            except Exception as e:
                logging.getLogger('main').warning(e)
    # build and score an edge for every (lemma, inflection, rule) combination
    # whose rule is known to the model
    edges = []
    for infl in inflections:
        for rule in extract_all_rules(lemma, infl):
            if rule in model.rule_set:
                edge = GraphEdge(lemma, infl, rule)
                edge.attr['cost'] = model.edge_cost(edge)
                edges.append(edge)
    edges = sorted(edges, key=lambda x: x.attr['cost'])
    if max_results is not None:
        edges = edges[:max_results]
    if not edges:
        return [(lemma, '---' + tag, '---')]
    return [(lemma, e.target, e.attr['cost']) for e in edges]
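# A minimal usage sketch (hypothetical: the names `model` and `rules_tr` and
# the tag string below are assumptions; `rules_tr` is expected to be a lookup
# transducer built from `model.rule_set`):
#
#     lemma = LexiconEntry('machen')
#     for lemma_out, form, cost in inflect_word(lemma, '<VVFIN>', rules_tr,
#                                               model, max_results=5):
#         print(lemma_out, form, cost)
#
# Each result is a (lemma, inflected entry, cost) triple; the placeholder
# (lemma, '---' + tag, '---') is returned when no inflection is found.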
def _sample_process(rules: List[Rule], _output_fun: Callable[..., None],
                    lexicon: Lexicon, sample_size: int) -> None:
    transducers = [r.to_fst() for r in rules]
    for tr in transducers:
        tr.convert(hfst.ImplementationType.HFST_OL_TYPE)
    seen_ids = set()
    num = 0
    while num < sample_size:
        w_id = random.randrange(len(lexicon))
        r_id = random.randrange(len(rules))
        source = lexicon[w_id]
        rule = rules[r_id]
        lookup_results = \
            sorted(list(map(lambda x: x[0].replace(hfst.EPSILON, ''),
                            transducers[r_id].lookup(source.symstr))))
        if lookup_results:
            t_id = random.randrange(len(lookup_results))
            if (w_id, r_id, t_id) in seen_ids:
                continue
            seen_ids.add((w_id, r_id, t_id))
            target = None
            try:
                target = LexiconEntry(lookup_results[t_id])
                if target.symstr not in lexicon.items_by_symstr:
                    _output_fun(GraphEdge(source, target, rule))
                    num += 1
            except Exception as e:
                logging.getLogger('main').debug(
                    'Exception during negative sampling: {}'.format(e))
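# A hedged sketch of a direct call (the function's signature suggests it is
# normally driven by a parallel sampling helper that supplies `_output_fun`;
# the call below is illustrative only):
#
#     sampled_edges = []
#     _sample_process(rules, sampled_edges.append, lexicon, sample_size=1000)
#
# Every accepted negative example is a GraphEdge whose target string does not
# occur in the lexicon; repeated (word, rule, output) triples are skipped via
# `seen_ids`.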
def _compute_leaf_prob(self):
    logging.getLogger('main').info('Computing leaf probabilities...')
    self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                             dtype=np.float64)
    edge_set = EdgeSet(self.lexicon)

    def _empty_edge_set(edge_set):
        # apply the probabilities of the collected edges to the leaf
        # probability matrix and return a fresh, empty edge set
        lexicon = edge_set.lexicon
        n = len(edge_set)
        probs = 1 - self.model.edges_prob(edge_set)
        for e_id, edge in enumerate(edge_set):
            word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
            w_id = lexicon.get_id(word)
            t_id = self.tag_idx[edge.source.tag]
            self.leaf_prob[w_id, t_id] *= probs[e_id]
        logging.getLogger('main').debug('Processed {} edges.'.format(n))
        return EdgeSet(lexicon)

    lexicon_tr = self.lexicon.to_fst()
    lexicon_tr.concatenate(FST.generator(self.tagset))
    rules_tr = self.model.rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')
    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    while True:
        line = p.stdout.readline().strip()
        if line:
            w1, w2 = line.split(':')
            n1 = LexiconEntry(w1)
            n2 = LexiconEntry(w2)
            for rule in extract_all_rules(n1, n2):
                if rule in self.model.rule_set:
                    edge_set.add(GraphEdge(n1, n2, rule))
        else:
            break
        # flush periodically so the edge set does not grow unboundedly
        if len(edge_set) > 300000:
            edge_set = _empty_edge_set(edge_set)
    edge_set = _empty_edge_set(edge_set)
def compute_possible_edges(lexicon: Lexicon, rule_set: RuleSet) -> EdgeSet:
    # build the transducer: lexicon (plus tag sequences) composed with the
    # rules and then with the inverted lexicon, so that only outputs that are
    # themselves lexicon words remain
    lexicon_tr = lexicon.to_fst()
    tag_seqs = extract_tag_symbols_from_rules(rule_set)
    if tag_seqs:
        lexicon_tr.concatenate(FST.generator(tag_seqs))
    rules_tr = rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    lexicon_tr.invert()
    tr.compose(lexicon_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')
    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    edge_set = EdgeSet(lexicon)
    while True:
        line = p.stdout.readline().strip()
        if line:
            w1, w2 = line.split(':')
            w1_without_tag = re.sub(shared.compiled_patterns['tag'], '', w1)
            w2_without_tag = re.sub(shared.compiled_patterns['tag'], '', w2)
            if w1_without_tag != w2_without_tag:
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                rules = algorithms.align.extract_all_rules(n1, n2)
                for rule in rules:
                    if rule in rule_set:
                        n1_wt = lexicon.get_by_symstr(w1_without_tag)[0]
                        n2_wt = lexicon.get_by_symstr(w2_without_tag)[0]
                        edge_set.add(GraphEdge(n1_wt, n2_wt, rule))
        else:
            break
    return edge_set
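# Hedged usage sketch: the loader calls below are assumptions, not part of
# this module; the point is that compute_possible_edges() yields an EdgeSet
# of lexicon-internal candidate edges that can then be scored or written out.
#
#     lexicon = Lexicon.load('lexicon.tsv')      # hypothetical loader
#     rule_set = RuleSet.load('rules.txt')       # hypothetical loader
#     for edge in compute_possible_edges(lexicon, rule_set):
#         print(edge.source, edge.target, edge.rule)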
def load_graph(filename, lexicon, threshold=0.0):
    edge_set = EdgeSet(lexicon)
    weights = []
    rules = {}
    for word_1, word_2, rule_str, edge_freq_str in read_tsv_file(filename):
        try:
            edge_freq = float(edge_freq_str)
            if edge_freq < threshold:
                continue
            if rule_str not in rules:
                rules[rule_str] = Rule.from_string(rule_str)
            edge = GraphEdge(lexicon[word_1], lexicon[word_2],
                             rules[rule_str], weight=edge_freq)
            edge_set.add(edge)
            weights.append(edge_freq)
        except ValueError:
            pass
    return FullGraph(lexicon, edge_set), np.array(weights)
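# The input is expected to be a TSV file with one edge per row:
# word_1 <TAB> word_2 <TAB> rule <TAB> edge_frequency.
# A minimal usage sketch (the file name and threshold value are illustrative):
#
#     graph, weights = load_graph('edges.tsv', lexicon, threshold=0.1)
#
# Rows whose frequency is below the threshold, or whose fields cannot be
# parsed (ValueError), are skipped silently.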
def _untag_edge(lexicon, edge):
    source = lexicon.get_by_symstr(''.join(edge.source.word))[0]
    target = lexicon.get_by_symstr(''.join(edge.target.word))[0]
    rule = Rule(edge.rule.subst)
    return GraphEdge(source, target, rule)
def analyze(self, target: LexiconEntry, compute_cost=True, **kwargs) \
        -> List[GraphEdge]:
    # TODO 1a. if predict_tag: get possible tags from the tag predictor
    # 1. get possible sources for the given target
    sources = set(sum([self.lexicon.get_by_symstr(word) \
                       for word, cost in self.fst.lookup(target.symstr)],
                      []))
    results = []
    # 2. get possible (source, rule) pairs (extract rules) and score them
    edge_set = EdgeSet(self.lexicon)
    for source in sources:
        rules = extract_all_rules(source, target)
        for rule in rules:
            if rule in self.model.rule_set:
                if self.predict_vec:
                    target_pr = target.copy()
                    edge = GraphEdge(source, target_pr, rule)
                    target_pr.vec = \
                        self.model.predict_target_feature_vec(edge)
                    edge_set.add(edge)
                else:
                    edge_set.add(GraphEdge(source, target, rule))
    # back-formation
    if self.enable_back_formation and \
            (self.max_results is None or len(edge_set) < self.max_results):
        lookup_results = set()
        for w, c in self.inv_rules_tr.lookup(target.symstr):
            try:
                lookup_results.add(unnormalize_word(
                    re.sub(hfst.EPSILON, '', w)))
            except Exception as e:
                logging.getLogger('main').warning(str(e))
        sources = []
        for word in lookup_results:
            try:
                sources.append(LexiconEntry(word))
            except Exception as e:
                logging.getLogger('main').warning(str(e))
        for source in sources:
            rules = extract_all_rules(source, target)
            for rule in rules:
                if rule in self.model.rule_set:
                    edge_set.add(GraphEdge(source, target, rule))
    # analysis as root
    if self.include_roots:
        edge_set.add(GraphEdge(None, target, None))
    # scoring
    # FIXME this is inefficient and may break on some model components
    # that don't have the method .edge_cost()
    for edge in edge_set:
        edge.attr['cost'] = 0
        if edge.source is not None:
            edge.attr['cost'] += self.model.edge_cost(edge)
            if edge.source not in self.lexicon:
                edge.attr['cost'] += self.model.root_cost(edge.source)
        else:
            edge.attr['cost'] += self.model.root_cost(edge.target)
    results = [edge for edge in edge_set]
    # 4. sort the analyses according to the cost
    results.sort(key=lambda r: r.attr['cost'])
    if self.max_results is not None:
        results = results[:self.max_results]
    return results
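# A hedged usage sketch (the analyzer construction below is hypothetical; the
# actual class name and constructor arguments may differ in this project):
#
#     analyzer = Analyzer(lexicon, model, max_results=10)
#     for edge in analyzer.analyze(LexiconEntry('machte')):
#         source = edge.source if edge.source is not None else 'ROOT'
#         print(source, edge.rule, edge.attr['cost'])
#
# Each returned GraphEdge links a hypothesized source (or None for an analysis
# as root) to the target, with the model cost stored in edge.attr['cost'].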
def test_complete_sample(self) -> None:
    'Test a sample consisting of all possible negative edges.'
    words = [
        'machen', 'macht', 'mache', 'Sachen', 'Sache',
        'anwinkeln', 'anzuwinkeln'
    ]
    rules = [
        ':/en:t___:',
        ':/n:___:',
        ':/a:ä/:er___:',
        ':/:zu/:___:'
    ]
    positive_edges = [
        ('machen', 'macht', ':/en:t___:'),
        ('machen', 'mache', ':/n:___:'),
        ('Sachen', 'Sache', ':/n:___:'),
        ('anwinkeln', 'anzuwinkeln', ':/:zu/:___:'),
    ]
    expected_negative_edges = [
        ('Sachen', '{CAP}sacht', ':/en:t___:'),
        ('anwinkeln', 'anwinkel', ':/n:___:'),
        ('anzuwinkeln', 'anzuwinkel', ':/n:___:'),
        ('machen', 'mächener', ':/a:ä/:er___:'),
        ('macht', 'mächter', ':/a:ä/:er___:'),
        ('mache', 'mächeer', ':/a:ä/:er___:'),
        ('Sachen', '{CAP}sächener', ':/a:ä/:er___:'),
        ('Sache', '{CAP}sächeer', ':/a:ä/:er___:'),
        ('machen', 'mzuachen', ':/:zu/:___:'),
        ('machen', 'mazuchen', ':/:zu/:___:'),
        ('machen', 'maczuhen', ':/:zu/:___:'),
        ('machen', 'machzuen', ':/:zu/:___:'),
        ('machen', 'machezun', ':/:zu/:___:'),
        ('mache', 'mzuache', ':/:zu/:___:'),
        ('mache', 'mazuche', ':/:zu/:___:'),
        ('mache', 'maczuhe', ':/:zu/:___:'),
        ('mache', 'machzue', ':/:zu/:___:'),
        ('macht', 'mzuacht', ':/:zu/:___:'),
        ('macht', 'mazucht', ':/:zu/:___:'),
        ('macht', 'maczuht', ':/:zu/:___:'),
        ('macht', 'machzut', ':/:zu/:___:'),
        ('Sachen', '{CAP}zusachen', ':/:zu/:___:'),
        ('Sachen', '{CAP}szuachen', ':/:zu/:___:'),
        ('Sachen', '{CAP}sazuchen', ':/:zu/:___:'),
        ('Sachen', '{CAP}saczuhen', ':/:zu/:___:'),
        ('Sachen', '{CAP}sachzuen', ':/:zu/:___:'),
        ('Sachen', '{CAP}sachezun', ':/:zu/:___:'),
        ('Sache', '{CAP}zusache', ':/:zu/:___:'),
        ('Sache', '{CAP}szuache', ':/:zu/:___:'),
        ('Sache', '{CAP}sazuche', ':/:zu/:___:'),
        ('Sache', '{CAP}saczuhe', ':/:zu/:___:'),
        ('Sache', '{CAP}sachzue', ':/:zu/:___:'),
        ('anwinkeln', 'azunwinkeln', ':/:zu/:___:'),
        ('anwinkeln', 'anwzuinkeln', ':/:zu/:___:'),
        ('anwinkeln', 'anwizunkeln', ':/:zu/:___:'),
        ('anwinkeln', 'anwinzukeln', ':/:zu/:___:'),
        ('anwinkeln', 'anwinkzueln', ':/:zu/:___:'),
        ('anwinkeln', 'anwinkezuln', ':/:zu/:___:'),
        ('anwinkeln', 'anwinkelzun', ':/:zu/:___:'),
        ('anzuwinkeln', 'azunzuwinkeln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzuzuwinkeln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzzuuwinkeln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzuwzuinkeln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzuwizunkeln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzuwinzukeln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzuwinkzueln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzuwinkezuln', ':/:zu/:___:'),
        ('anzuwinkeln', 'anzuwinkelzun', ':/:zu/:___:')
    ]
    expected_weights = {
        ':/en:t___:': 1.0,
        ':/n:___:': 1.0,
        ':/a:ä/:er___:': 1.0,
        ':/:zu/:___:': 41/40   # the word "anzuzuwinkeln" can be derived in
                               # two different ways, so it is counted double
                               # in the domsize computation, but sampled only
                               # once; such cases are very rare, so they
                               # shouldn't influence the weights much
    }
    lexicon = Lexicon(LexiconEntry(word) for word in words)
    lex_fst = lexicon.to_fst()
    rule_set = RuleSet()
    for rule_str in rules:
        rule = Rule.from_string(rule_str)
        rule_set.add(rule, rule.compute_domsize(lex_fst))
    edge_iter = (GraphEdge(lexicon[source], lexicon[target], rule_set[rule])
                 for (source, target, rule) in positive_edges)
    edge_set = EdgeSet(lexicon, edge_iter)
    negex_sampler = NegativeExampleSampler(rule_set)
    sample_size = len(expected_negative_edges)
    sample = negex_sampler.sample(lexicon, sample_size,
                                  show_progressbar=False)
    sample_weights = negex_sampler.compute_sample_weights(sample, edge_set)
    self.assertEqual(rule_set.get_domsize(rule_set[0]), 2)
    self.assertEqual(rule_set.get_domsize(rule_set[1]), 4)
    self.assertEqual(rule_set.get_domsize(rule_set[2]), 5)
    self.assertEqual(rule_set.get_domsize(rule_set[3]), 42)
    self.longMessage = False
    for edge in edge_set:
        self.assertNotIn(edge, sample,
                         msg='positive edge: {} in sample'.format(edge))
    for source, target, rule in expected_negative_edges:
        edge = GraphEdge(lexicon[source], LexiconEntry(target),
                         rule_set[rule])
        self.assertIn(edge, sample, msg='{} not in sample'.format(edge))
    self.longMessage = True
    for i, edge in enumerate(sample):
        self.assertAlmostEqual(sample_weights[i],
                               expected_weights[str(edge.rule)],
                               msg='for edge {}'.format(edge))