コード例 #1
0
ファイル: preprocess.py プロジェクト: maciejjan/morle
def run() -> None:
    """Run preprocessing: lexicon transducer, graph file and rules file.

    File paths are read from ``shared.filenames`` and the ``supervised``
    flag from the ``General`` section of ``shared.config``. The steps are:

    1. load the lexicon from the word list and save its transducer;
    2. build the graph file, either from training edges (supervised) or
       with ``build_graph_fstfastss``;
    3. sort the graph file by key 3, run the filters on it and refresh
       its recorded file size after each of these steps;
    4. collect one rule per key-3 group of the graph file and write the
       rules together with their domain sizes to the rules file.
    """
    log = logging.getLogger('main')
    filenames = shared.filenames

    log.info('Loading lexicon...')
    lexicon = Lexicon.load(filenames['wordlist'])
    log.info('Building the lexicon transducer...')
    lexicon_tr = lexicon.to_fst()
    FST.save_transducer(lexicon_tr, filenames['lexicon-tr'])

    supervised = shared.config['General'].getboolean('supervised')
    log.info('Building graph...')
    if supervised:
        build_graph_from_training_edges(
            lexicon, filenames['wordlist'], filenames['graph'])
    else:
        graph_rows = build_graph_fstfastss(lexicon, filenames['lexicon-tr'])
        write_tsv_file(filenames['graph'], graph_rows)

    sort_file(filenames['graph'], key=3)
    update_file_size(filenames['graph'])
    run_filters(filenames['graph'])
    update_file_size(filenames['graph'])

    # Write the rules file: one rule per group of graph rows sharing key 3.
    rules = [
        Rule.from_string(rule_str)
        for rule_str, _edges in read_tsv_file_by_key(
            filenames['graph'], key=3, show_progressbar=False)
    ]

    # NOTE(review): the lexicon transducer is rebuilt and saved a second
    # time here -- confirm whether the lexicon can change in the steps above.
    lexicon_tr = lexicon.to_fst()
    FST.save_transducer(lexicon_tr, filenames['lexicon-tr'])

    log.info('Computing rule domain sizes...')
    rule_rows = ((str(rule), domsize)
                 for rule, domsize in compute_rule_domsizes(lexicon_tr, rules))
    write_tsv_file(filenames['rules'], rule_rows)
コード例 #2
0
def create_new_words_acceptor_if_not_exists(filename, analyzer, lexicon):
    """Build and save the new-words acceptor unless `filename` exists.

    The acceptor is constructed from ``analyzer.fst``, converted to the
    tropical OpenFst implementation type, projected onto its input side
    and minimized. The lexicon transducer (``lexicon.to_fst()``) is then
    subtracted from it, and the minimized result is saved to `filename`.

    Nothing is done when `filename` already exists.
    """
    if file_exists(filename):
        return

    acceptor = hfst.HfstTransducer(analyzer.fst)
    acceptor.convert(hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
    acceptor.input_project()
    acceptor.minimize()

    # Remove everything the lexicon transducer already covers.
    known_words_tr = lexicon.to_fst()
    acceptor.subtract(known_words_tr)
    acceptor.minimize()

    FST.save_transducer(acceptor, filename)
コード例 #3
0
def run() -> None:
    """Build and save the rule transducer and, if supervised, the root one.

    Rules and roots are loaded with ``load_rules()`` and ``load_roots()``.
    The rule transducer is always written to
    ``shared.filenames['rules-tr']``; the root transducer is written to
    ``shared.filenames['roots-tr']`` only when the ``supervised`` flag in
    the ``General`` section of ``shared.config`` is set.
    """
    rules = load_rules()
    roots = load_roots()
    log = logging.getLogger('main')

    log.info('Building the rule transducer...')
    rules_transducer = build_rule_transducer(rules)
    FST.save_transducer(rules_transducer, shared.filenames['rules-tr'])

    if not shared.config['General'].getboolean('supervised'):
        return

    log.info('Building the root transducer...')
    roots_transducer = build_root_transducer(roots)
    FST.save_transducer(roots_transducer, shared.filenames['roots-tr'])
コード例 #4
0
ファイル: samplers.py プロジェクト: maciejjan/morle
    def _compute_leaf_prob(self):
        """Compute ``self.leaf_prob`` from all edges licensed by the rules.

        ``self.leaf_prob`` is initialised to ones with shape
        ``(len(self.lexicon), len(self.tagset))``. The lexicon transducer
        (concatenated with a generator over the tagset) is composed with
        the rule transducer, and the string pairs of the result are
        enumerated with the external ``hfst-fst2strings`` program. For
        every extracted rule that belongs to the model's rule set an edge
        is collected, and for every collected edge the entry for its
        (source word, source tag) is multiplied by ``1 - p(edge)``, where
        the edge probabilities come from ``self.model.edges_prob``.

        Edges are processed in batches: once more than 300000 edges have
        been collected they are folded into ``self.leaf_prob`` and the
        edge set is replaced by an empty one.
        """
        logger = logging.getLogger('main')
        logger.info('Computing leaf probabilities...')
        self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                                 dtype=np.float64)
        # The lexicon and the rule set live on the instance; the bare names
        # `lexicon` / `rule_set` are not defined in this method's scope.
        rule_set = self.model.rule_set
        edge_set = EdgeSet(self.lexicon)

        def _empty_edge_set(edge_set):
            # Fold the collected edges into self.leaf_prob and hand back a
            # fresh, empty edge set over the same lexicon.
            lexicon = edge_set.lexicon
            n = len(edge_set)
            probs = 1 - self.model.edges_prob(edge_set)
            for e_id, edge in enumerate(edge_set):
                word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
                w_id = lexicon.get_id(word)
                t_id = self.tag_idx[edge.source.tag]
                self.leaf_prob[w_id, t_id] *= probs[e_id]
            logger.debug('Processed a batch of %d edges.', n)
            return EdgeSet(lexicon)

        lexicon_tr = self.lexicon.to_fst()
        lexicon_tr.concatenate(FST.generator(self.tagset))
        rules_tr = rule_set.to_fst()
        tr = hfst.HfstTransducer(lexicon_tr)
        tr.compose(rules_tr)
        tr.determinize()
        tr.minimize()
        FST.save_transducer(tr, 'tr.fsm')

        tr_path = full_path('tr.fsm')
        cmd = ['hfst-fst2strings', tr_path]
        # The context manager closes the pipes and waits for the child
        # process, so no file descriptors or zombie processes are left over.
        with subprocess.Popen(cmd,
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.DEVNULL,
                              universal_newlines=True,
                              bufsize=1) as p:
            while True:
                line = p.stdout.readline().strip()
                if not line:
                    # End of output (a blank line also stops the reading).
                    break
                w1, w2 = line.split(':')
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                for rule in extract_all_rules(n1, n2):
                    if rule in rule_set:
                        edge_set.add(GraphEdge(n1, n2, rule))
                if len(edge_set) > 300000:
                    edge_set = _empty_edge_set(edge_set)
        # Process whatever is left in the last, partially filled batch.
        edge_set = _empty_edge_set(edge_set)
コード例 #5
0
ファイル: possible-edges.py プロジェクト: maciejjan/morle
def compute_possible_edges(lexicon: Lexicon, rule_set: RuleSet) -> EdgeSet:
    """Return the edges between lexicon entries licensed by `rule_set`.

    The lexicon transducer (concatenated with a generator over the tag
    symbols extracted from the rules, if there are any) is composed with
    the rule transducer and then with the inverted lexicon transducer.
    The string pairs of the result are enumerated with the external
    ``hfst-fst2strings`` program. Pairs whose two sides are equal after
    removing the tag pattern are skipped. For the remaining pairs all
    rules are extracted, and for each extracted rule contained in
    `rule_set` an edge between the corresponding lexicon entries is added
    to the result.

    The composed transducer is saved as ``tr.fsm`` as a side effect.
    """
    # build the transducer
    lexicon_tr = lexicon.to_fst()
    tag_seqs = extract_tag_symbols_from_rules(rule_set)
    if tag_seqs:
        lexicon_tr.concatenate(FST.generator(tag_seqs))
    rules_tr = rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    lexicon_tr.invert()
    tr.compose(lexicon_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')

    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    tag_pattern = shared.compiled_patterns['tag']
    edge_set = EdgeSet(lexicon)
    # The context manager closes the pipes and waits for the child process,
    # so no file descriptors or zombie processes are left over.
    with subprocess.Popen(cmd,
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.DEVNULL,
                          universal_newlines=True,
                          bufsize=1) as p:
        while True:
            line = p.stdout.readline().strip()
            if not line:
                # End of output (a blank line also stops the reading).
                break
            w1, w2 = line.split(':')
            w1_without_tag = re.sub(tag_pattern, '', w1)
            w2_without_tag = re.sub(tag_pattern, '', w2)
            if w1_without_tag == w2_without_tag:
                continue
            n1 = LexiconEntry(w1)
            n2 = LexiconEntry(w2)
            rules = algorithms.align.extract_all_rules(n1, n2)
            for rule in rules:
                if rule in rule_set:
                    # Edges connect the entries stored in the lexicon,
                    # looked up by their tag-less symbol strings.
                    n1_wt = lexicon.get_by_symstr(w1_without_tag)[0]
                    n2_wt = lexicon.get_by_symstr(w2_without_tag)[0]
                    edge_set.add(GraphEdge(n1_wt, n2_wt, rule))
    return edge_set
コード例 #6
0
ファイル: root.py プロジェクト: maciejjan/morle
 def save(self, filename: str) -> None:
     """Save the automaton to `filename`.

     When ``self.smoothing`` is greater than zero, the smoothing model is
     additionally saved to `filename` with the suffix ``.smoothing``.
     """
     # TODO saving/loading smoothing and parameters
     FST.save_transducer(self.automaton, filename)
     if not self.smoothing > 0:
         return
     smoothing_path = filename + '.smoothing'
     self.smoothing_model.save(smoothing_path)
コード例 #7
0
 def save(self, filename: str) -> None:
     """Save ``self.fst`` to `filename` using ``FST.save_transducer``."""
     transducer = self.fst
     FST.save_transducer(transducer, filename)