Code Example #1
def inflect_word(lemma: LexiconEntry, tag: str, rules_tr, model, **kwargs):
    # _extract_tag() pulls the tag symbols out of an analyzed word string.
    def _extract_tag(word):
        return ''.join(tokenize_word(word)[1])

    max_results = kwargs.get('max_results')
    lookup_results = rules_tr.lookup(lemma.symstr)
    inflections = []
    for w, c in lookup_results:
        if _extract_tag(w) == tag:
            try:
                inflections.append(LexiconEntry(unnormalize_word(w)))
            except Exception as e:
                logging.getLogger('main').warning(e)
    # Build a (lemma -> inflection) edge for every rule known to the model
    # and score it.
    edges = []
    for infl in inflections:
        for rule in extract_all_rules(lemma, infl):
            if rule in model.rule_set:
                edge = GraphEdge(lemma, infl, rule)
                edge.attr['cost'] = model.edge_cost(edge)
                edges.append(edge)
    edges.sort(key=lambda e: e.attr['cost'])
    if max_results is not None:
        edges = edges[:max_results]
    # No derivation found: return a placeholder analysis.
    if not edges:
        return [(lemma, '---' + tag, '---')]
    return [(lemma, e.target, e.attr['cost']) for e in edges]
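A hedged usage sketch (not part of the project sources): it assumes `rules_tr` and `model` have been built elsewhere, e.g. from a trained model's rule set, and the tag string is invented for illustration.

# Hypothetical usage; `rules_tr` and `model` stand in for objects built
# elsewhere, and the tag '<pl>' is an invented placeholder.
lemma = LexiconEntry('dog')
for source, inflection, cost in inflect_word(lemma, '<pl>', rules_tr,
                                             model, max_results=3):
    print(source, inflection, cost)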
Code Example #2
File: preprocess.py Project: maciejjan/morle
def build_graph_from_training_edges(lexicon, training_file, graph_file):
    with open_to_write(graph_file) as fp:
        for word_1, word_2 in read_tsv_file(training_file, (str, str)):
            if word_1:
                try:
                    n1, n2 = lexicon[word_1], lexicon[word_2]
                    for rule in extract_all_rules(n1, n2):
                        write_line(fp, (str(n1), str(n2), str(rule)))
                except KeyError:
                    # Only a missing word_1 is reported; a missing
                    # word_2 is skipped silently.
                    if word_1 not in lexicon:
                        logging.getLogger('main').warning(
                            '%s not in lexicon' % word_1)
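A hedged usage sketch: the file names are invented, and `lexicon` is assumed to be a Lexicon already loaded by the surrounding pipeline. The training file would hold tab-separated (source, target) word pairs; the graph file receives one (source, target, rule) triple per extractable rule.

# Hypothetical call with invented file names.
build_graph_from_training_edges(lexicon,
                                training_file='training.tsv',
                                graph_file='graph.txt')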
Code Example #3
File: samplers.py Project: maciejjan/morle
    def _compute_leaf_prob(self):
        logging.getLogger('main').info('Computing leaf probabilities...')
        # leaf_prob[w, t]: probability that the word w tagged t is a leaf,
        # i.e. the product of (1 - p(edge)) over all edges pointing to it.
        self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                                 dtype=np.float64)
        edge_set = EdgeSet(self.lexicon)

        def _empty_edge_set(edge_set):
            # Flush a batch of edges into leaf_prob and return a fresh,
            # empty EdgeSet; batching keeps memory usage bounded.
            lexicon = edge_set.lexicon
            n = len(edge_set)
            probs = 1 - self.model.edges_prob(edge_set)
            for e_id, edge in enumerate(edge_set):
                word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
                w_id = lexicon.get_id(word)
                t_id = self.tag_idx[edge.source.tag]
                self.leaf_prob[w_id, t_id] *= probs[e_id]
            logging.getLogger('main').debug('flushed %d edges' % n)
            return EdgeSet(lexicon)

        # Compose lexicon (with tags appended) and rules into a single
        # transducer and save it for enumeration by hfst-fst2strings.
        lexicon_tr = self.lexicon.to_fst()
        lexicon_tr.concatenate(FST.generator(self.tagset))
        rules_tr = self.model.rule_set.to_fst()
        tr = hfst.HfstTransducer(lexicon_tr)
        tr.compose(rules_tr)
        tr.determinize()
        tr.minimize()
        FST.save_transducer(tr, 'tr.fsm')

        # Stream the (source:target) pairs from an external process
        # instead of materializing all paths in memory.
        tr_path = full_path('tr.fsm')
        cmd = ['hfst-fst2strings', tr_path]
        p = subprocess.Popen(cmd,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.DEVNULL,
                             universal_newlines=True,
                             bufsize=1)
        while True:
            line = p.stdout.readline().strip()
            if not line:
                break
            # Assumes neither word contains ':'.
            w1, w2 = line.split(':')
            n1 = LexiconEntry(w1)
            n2 = LexiconEntry(w2)
            for rule in extract_all_rules(n1, n2):
                if rule in self.model.rule_set:
                    edge_set.add(GraphEdge(n1, n2, rule))
            if len(edge_set) > 300000:
                edge_set = _empty_edge_set(edge_set)
        edge_set = _empty_edge_set(edge_set)
        p.wait()
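A hedged sketch of consuming the resulting table, assuming `sampler` is the object whose `_compute_leaf_prob()` ran above and `entry` is a LexiconEntry; the attribute names follow the snippet, not a documented API.

# Hypothetical lookup: probability that (word, tag) is a leaf, i.e.
# receives no incoming edge under the model.
w_id = sampler.lexicon.get_id(entry)
t_id = sampler.tag_idx[entry.tag]
print(sampler.leaf_prob[w_id, t_id])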
Code Example #4
File: preprocess.py Project: maciejjan/morle
def _extract_candidate_edges(words: Iterable[str],
                             output_fun: Callable[..., None],
                             lexicon: Lexicon,
                             transducer_path: str) -> None:
    # For each input word, retrieve graphically similar words via the
    # transducer and emit all candidate (target, rule) pairs.
    sw = similar_words(words, transducer_path)
    for word_1, simwords in sw:
        v1_list = lexicon.get_by_symstr(word_1)
        for v1 in v1_list:
            results_for_v1 = []
            for word_2 in simwords:
                for v2 in lexicon.get_by_symstr(word_2):
                    if v1 != v2 and _is_possible_edge(v1, v2):
                        rules = extract_all_rules(v1, v2)
                        for rule in rules:
                            results_for_v1.append((v2.literal, str(rule)))
            output_fun((v1.literal, results_for_v1))
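A hedged usage sketch: `output_fun` can be any callable taking one result tuple, so a list's `append` works for collecting; the word list and transducer path are invented.

# Hypothetical driver that collects results in memory.
collected = []
_extract_candidate_edges(['walked', 'walking'],
                         output_fun=collected.append,
                         lexicon=lexicon,
                         transducer_path='similar-words.fsm')
for word, candidates in collected:
    print(word, candidates[:3])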
Code Example #5
    def analyze(self, target: LexiconEntry, compute_cost=True,
                **kwargs) -> List[GraphEdge]:
        # NOTE: compute_cost is currently unused in this snippet.
        # TODO 1a. if predict_tag: get possible tags from the tag predictor
        # 1. get possible sources for the given target
        sources = set(sum([self.lexicon.get_by_symstr(word)
                           for word, cost in self.fst.lookup(target.symstr)],
                          []))
        # 2. get possible (source, rule) pairs (extract rules) and score them
        edge_set = EdgeSet(self.lexicon)
        for source in sources:
            rules = extract_all_rules(source, target)
            for rule in rules:
                if rule in self.model.rule_set:
                    if self.predict_vec:
                        target_pr = target.copy()
                        edge = GraphEdge(source, target_pr, rule)
                        target_pr.vec = \
                            self.model.predict_target_feature_vec(edge)
                        edge_set.add(edge)
                    else:
                        edge_set.add(GraphEdge(source, target, rule))
        # back-formation
        if self.enable_back_formation and \
                (self.max_results is None or
                 len(edge_set) < self.max_results):
            lookup_results = set()
            for w, c in self.inv_rules_tr.lookup(target.symstr):
                try:
                    lookup_results.add(
                        unnormalize_word(re.sub(hfst.EPSILON, '', w)))
                except Exception as e:
                    logging.getLogger('main').warning(str(e))
            sources = []
            for word in lookup_results:
                try:
                    sources.append(LexiconEntry(word))
                except Exception as e:
                    logging.getLogger('main').warning(str(e))
            for source in sources:
                rules = extract_all_rules(source, target)
                for rule in rules:
                    if rule in self.model.rule_set:
                        edge_set.add(GraphEdge(source, target, rule))
        # analysis as root
        if self.include_roots:
            edge_set.add(GraphEdge(None, target, None))
        # 3. compute the cost of each analysis
        # FIXME this is inefficient and may break on some model components
        #   that don't have the method .edge_cost()
        for edge in edge_set:
            edge.attr['cost'] = 0
            if edge.source is not None:
                edge.attr['cost'] += self.model.edge_cost(edge)
                if edge.source not in self.lexicon:
                    edge.attr['cost'] += self.model.root_cost(edge.source)
            else:
                edge.attr['cost'] += self.model.root_cost(edge.target)
        results = list(edge_set)
        # 4. sort the analyses according to the cost
        results.sort(key=lambda r: r.attr['cost'])
        if self.max_results is not None:
            results = results[:self.max_results]
        return results
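A hedged usage sketch, assuming `analyzer` is an instance of the class defining analyze(); each returned GraphEdge carries its source, target and cost, as set above. The example word is invented.

# Hypothetical call: print the cheapest analyses of one word.
target = LexiconEntry('unhappiness')
for edge in analyzer.analyze(target):
    print(edge.source, '->', edge.target, edge.attr['cost'])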