Example #1
def load_cascade(filename):
    transducers = []
    istr = hfst.HfstInputStream(full_path(filename))
    while not istr.is_eof():
        transducers.append(istr.read())
    istr.close()
    return tuple(transducers)
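
A minimal round-trip sketch for load_cascade, assuming full_path resolves names the same way as in the example; the file name and the two regex transducers are illustrative, not from the original project:

import hfst

# Write two transducers into one stream (hypothetical file name), then
# read them back as a cascade.
ostr = hfst.HfstOutputStream(filename=full_path('cascade.hfst'))
ostr.write(hfst.regex('a:b'))
ostr.write(hfst.regex('b:c'))
ostr.close()

cascade = load_cascade('cascade.hfst')
assert len(cascade) == 2  # one tuple entry per transducer in the stream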
Example #2
def load(filename: str,
         rule_set: RuleSet) -> 'LogNormalEdgeFrequencyModel':
    result = LogNormalEdgeFrequencyModel(rule_set)
    with np.load(full_path(filename)) as data:
        result.means = data['means']
        result.sdevs = data['sdevs']
    return result
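
This loader is the counterpart of the save method in Example #8. A hedged round-trip sketch, assuming a trained model instance m and a RuleSet rs from the surrounding project (both names are illustrative):

import numpy as np

m.save('freq_model.npz')  # see Example #8
restored = LogNormalEdgeFrequencyModel.load('freq_model.npz', rs)
assert np.array_equal(restored.means, m.means)
assert np.array_equal(restored.sdevs, m.sdevs)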
Example #3
    def _compute_leaf_prob(self):
        logging.getLogger('main').info('Computing leaf probabilities...')
        self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                                 dtype=np.float64)
        edge_set = EdgeSet(self.lexicon)

        def _empty_edge_set(edge_set):
            lexicon = edge_set.lexicon
            n = len(edge_set)
            probs = 1 - self.model.edges_prob(edge_set)
            for e_id, edge in enumerate(edge_set):
                word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
                w_id = lexicon.get_id(word)
                t_id = self.tag_idx[edge.source.tag]
                self.leaf_prob[w_id, t_id] *= probs[e_id]
            logging.getLogger('main').debug('Flushed %d edges', n)
            return EdgeSet(lexicon)

        lexicon_tr = self.lexicon.to_fst()
        lexicon_tr.concatenate(FST.generator(self.tagset))
        rules_tr = self.model.rule_set.to_fst()
        tr = hfst.HfstTransducer(lexicon_tr)
        tr.compose(rules_tr)
        tr.determinize()
        tr.minimize()
        FST.save_transducer(tr, 'tr.fsm')

        tr_path = full_path('tr.fsm')
        cmd = ['hfst-fst2strings', tr_path]
        p = subprocess.Popen(cmd,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.DEVNULL,
                             universal_newlines=True,
                             bufsize=1)
        while True:
            line = p.stdout.readline().strip()
            if line:
                w1, w2 = line.split(':')
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                rules = extract_all_rules(n1, n2)
                for rule in rules:
                    if rule in self.model.rule_set:
                        edge_set.add(GraphEdge(n1, n2, rule))
            else:
                break
            if len(edge_set) > 300000:
                edge_set = _empty_edge_set(edge_set)
        edge_set = _empty_edge_set(edge_set)
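
The method above streams string pairs from hfst-fst2strings and flushes the edge set whenever it exceeds 300000 entries, so memory stays bounded. A self-contained sketch of that batching idiom, with an illustrative command and threshold:

import subprocess

def stream_pairs(cmd, flush_threshold=300000):
    # Read 'input:output' lines from a subprocess and yield them in
    # bounded batches instead of accumulating everything in memory.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         universal_newlines=True, bufsize=1)
    batch = []
    for line in p.stdout:
        line = line.strip()
        if not line:
            continue
        batch.append(tuple(line.split(':', 1)))
        if len(batch) >= flush_threshold:
            yield batch   # hand off a full batch...
            batch = []    # ...and start a fresh one
    if batch:
        yield batch
    p.wait()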
Example #4
def build_fastss_cascade(lexicon_tr_file,
                         alphabet,
                         max_word_len=20,
                         use_tag_absorber=True,
                         static_compose=False):
    delenv_file = 'delenv.att'
    delfilter_file = 'delfilter.att'
    tag_absorber_file = 'tag_absorber.att'

    write_delenv_transducer(
        delenv_file, shared.config['preprocess'].getint('max_affix_length'),
        shared.config['preprocess'].getint('max_infix_length'),
        shared.config['preprocess'].getint('max_infix_slots'))
    write_delfilter_transducer(delfilter_file, max_word_len)
    write_tag_absorber(tag_absorber_file, alphabet)

    # Alternative backend: cmd = ['hfst-xfst', '-f', 'sfst']
    cmd = ['hfst-xfst', '-p']
    p = subprocess.Popen(cmd,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.DEVNULL,
                         stderr=None,
                         universal_newlines=True)
    p.stdin.write('read att {}\n'.format(full_path(delfilter_file)))
    p.stdin.write('read att {}\n'.format(full_path(delenv_file)))
    if use_tag_absorber:
        p.stdin.write('read att {}\n'.format(full_path(tag_absorber_file)))
    p.stdin.write('compose\n')
    p.stdin.write('minimize\n')
    if static_compose:
        p.stdin.write('load stack {}\n'.format(full_path(lexicon_tr_file)))
        p.stdin.write('compose\n')
        p.stdin.write('minimize\n')
        p.stdin.write('define T\n')
        p.stdin.write('push T\n')
        p.stdin.write('invert\n')
        p.stdin.write('push T\n')
        p.stdin.write('compose\n')
        p.stdin.write('minimize\n')
    else:
        p.stdin.write('define T\n')
        p.stdin.write('push T\n')
        p.stdin.write('load stack {}\n'.format(full_path(lexicon_tr_file)))
        p.stdin.write('compose\n')
        p.stdin.write('minimize\n')
        p.stdin.write('invert\n')
        p.stdin.write('push T\n')
        p.stdin.write('rotate stack\n')
    fastss_tr_path = full_path(shared.filenames['fastss-tr'])
    p.stdin.write('save stack {}\n'.format(fastss_tr_path))
    p.stdin.write('quit\n')
    p.stdin.close()
    p.wait()

    # cleanup
    remove_file(delenv_file)
    remove_file(delfilter_file)
    remove_file(tag_absorber_file)
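
An illustrative call, assuming lexicon.hfst already holds the compiled lexicon transducer and that the alphabet is a plain set of symbols; both are assumptions, not values from the original:

build_fastss_cascade('lexicon.hfst',
                     alphabet={'a', 'b', 'c'},
                     max_word_len=20,
                     static_compose=False)
# The cascade is saved under shared.filenames['fastss-tr'] and can be
# queried with hfst-lookup (see Example #7).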
Example #5
def to_fst(self) -> hfst.HfstTransducer:
    lexc_file = shared.filenames['lexicon-tr'] + '.lex'
    tags = set()
    for entry in self.items:
        for t in entry.tag:
            tags.add(t)
    with open_to_write(lexc_file) as lexfp:
        lexfp.write('Multichar_Symbols ' +
                    ' '.join(self._lexc_escape(s)
                             for s in shared.multichar_symbols + list(tags)) +
                    '\n\n')
        lexfp.write('LEXICON Root\n')
        for entry in self.items:
            lexfp.write('\t' + self._lexc_escape(entry.symstr) + ' # ;\n')
    transducer = hfst.compile_lexc_file(full_path(lexc_file))
    remove_file(lexc_file)
    return transducer
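
A hedged usage sketch: compile the lexicon and inspect a few accepted strings. extract_paths is part of the hfst Python API; the lexicon variable is assumed to be a populated Lexicon instance:

tr = lexicon.to_fst()
tr.minimize()
# Peek at up to ten entries accepted by the compiled lexicon.
for input_str, outputs in tr.extract_paths(max_number=10).items():
    print(input_str)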
Example #6
def compute_possible_edges(lexicon: Lexicon, rule_set: RuleSet) -> EdgeSet:
    # build the transducer
    lexicon_tr = lexicon.to_fst()
    tag_seqs = extract_tag_symbols_from_rules(rule_set)
    if tag_seqs:
        lexicon_tr.concatenate(FST.generator(tag_seqs))
    rules_tr = rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    lexicon_tr.invert()
    tr.compose(lexicon_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')

    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL,
                         universal_newlines=True,
                         bufsize=1)
    edge_set = EdgeSet(lexicon)
    while True:
        line = p.stdout.readline().strip()
        if line:
            w1, w2 = line.split(':')
            w1_without_tag = re.sub(shared.compiled_patterns['tag'], '', w1)
            w2_without_tag = re.sub(shared.compiled_patterns['tag'], '', w2)
            if w1_without_tag != w2_without_tag:
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                rules = algorithms.align.extract_all_rules(n1, n2)
                for rule in rules:
                    if rule in rule_set:
                        n1_wt = lexicon.get_by_symstr(w1_without_tag)[0]
                        n2_wt = lexicon.get_by_symstr(w2_without_tag)[0]
                        edge_set.add(GraphEdge(n1_wt, n2_wt, rule))
        else:
            break
    return edge_set
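
An illustrative driver, assuming a populated Lexicon and RuleSet from the surrounding project:

edge_set = compute_possible_edges(lexicon, rule_set)
logging.getLogger('main').info('%d candidate edges', len(edge_set))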
Example #7
def similar_words_with_lookup(words, transducer_path):
    cmd = [
        'hfst-lookup', '-i',
        full_path(transducer_path), '-C', 'composition'
    ]
    p = subprocess.Popen(cmd,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL,
                         universal_newlines=True,
                         bufsize=1)
    count = 0
    restart_interval = \
        shared.config['preprocess'].getint('hfst_restart_interval')
    for word in words:
        p.stdin.write(word + '\n')
        p.stdin.flush()
        similar_words = set()
        while True:
            line = p.stdout.readline().strip()
            if line:
                cols = line.split('\t')
                if len(cols) == 3 and cols[2].startswith('0'):
                    similar_words.add(cols[1])
            else:
                break
        for sim_word in similar_words:
            yield (word, sim_word)
        count += 1
        if restart_interval > 0 and count % restart_interval == 0:
            # restart the HFST subprocess to counter the memory leak
            p.stdin.close()
            p.wait()
            p = subprocess.Popen(cmd,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.DEVNULL,
                                 universal_newlines=True,
                                 bufsize=1)
    p.stdin.close()
    p.wait()
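
Because similar_words_with_lookup is a generator, pairs stream out as hfst-lookup produces them; nothing is buffered beyond one word's result set. A hedged usage sketch with illustrative words and transducer path:

words = ['walk', 'walked', 'walking']
for word, sim_word in similar_words_with_lookup(words, 'fastss.hfst'):
    print(word, '->', sim_word)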
Example #8
def save(self, filename: str) -> None:
    np.savez(full_path(filename), means=self.means, sdevs=self.sdevs)
Example #9
def generate_words(tr_file: str,
                   analyzer: Analyzer,
                   model: ModelSuite,
                   freq_model: bool = True,
                   sum_analyses: bool = True,
                   min_freq: float = 1,
                   max_freq: float = 2):
    logging.getLogger('main').info(
        'Precomputing the Gaussian distribution table...')
    _normcdf_cache = norm.cdf(np.arange(-10000, 10001) / 1000)
    max_cost = shared.config['generate'].getfloat('max_cost')

    def _normcdf(x):
        if x < -10:
            x = -10
        elif x > 10:
            x = 10
        return _normcdf_cache[int((x + 10) * 1000)]

    def _edge_prob_ratio(edge: GraphEdge) -> float:
        prob = model.edge_model.edge_prob(edge)
        return prob / (1 - prob)

    def _edge_freq_prob(edge: GraphEdge) -> float:
        r_id = model.rule_set.get_id(edge.rule)
        mean = model.edge_frequency_model.means[r_id]
        sdev = model.edge_frequency_model.sdevs[r_id]
        norm_min_freq = (log_min_freq - edge.source.logfreq - mean) / sdev
        norm_max_freq = (log_max_freq - edge.source.logfreq - mean) / sdev
        # Probability mass of the [min_freq, max_freq] band under the
        # rule's log-normal frequency model.
        return _normcdf(norm_max_freq) - _normcdf(norm_min_freq)

    logging.getLogger('main').info('Generating...')
    cmd = ['hfst-fst2strings', full_path(tr_file)]
    p = subprocess.Popen(cmd,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL,
                         universal_newlines=True,
                         bufsize=1)
    log_max_freq = math.log(max_freq)
    log_min_freq = math.log(min_freq)
    while True:
        try:
            line = p.stdout.readline().strip()
            if line:
                word = unnormalize_word(line)
                analyses = analyzer.analyze(LexiconEntry(word),
                                            compute_cost=False)
                word_prob_ratio = 0
                for edge in analyses:
                    prob_ratio = _edge_prob_ratio(edge)
                    if freq_model:
                        prob_ratio *= _edge_freq_prob(edge)
                    if sum_analyses:
                        word_prob_ratio += prob_ratio
                    else:
                        word_prob_ratio = max(word_prob_ratio, prob_ratio)
                if word_prob_ratio > 0:
                    cost = -math.log(word_prob_ratio)
                    if cost < max_cost:
                        yield (word, cost)
            else:
                break
        except Exception as e:
            logging.getLogger('main').warning(str(e))
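
A hedged consumption sketch: generate_words is also a generator, so its results can be ranked by cost as they are collected. The transducer file name is illustrative, and analyzer and model are assumed to be a trained Analyzer and ModelSuite from the project:

results = sorted(generate_words('wordgen.fsm', analyzer, model),
                 key=lambda pair: pair[1])
for word, cost in results[:20]:  # twenty cheapest generations
    print('{}\t{:.3f}'.format(word, cost))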
Example #10
def save(self, filename: str) -> None:
    np.savez(full_path(filename), means=self.means, vars=self.vars)