def load_cascade(filename):
    transducers = []
    istr = hfst.HfstInputStream(full_path(filename))
    while not istr.is_eof():
        transducers.append(istr.read())
    istr.close()
    return tuple(transducers)
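# Usage sketch (hypothetical file name; assumes an HFST binary transducer
# file produced earlier, e.g. by build_fastss_cascade):
#
#     cascade = load_cascade('fastss.fsm')
#     for tr in cascade:
#         print(tr.number_of_states())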
def load(filename: str, rule_set: RuleSet) -> 'LogNormalEdgeFrequencyModel':
    result = LogNormalEdgeFrequencyModel(rule_set)
    with np.load(full_path(filename)) as data:
        result.means = data['means']
        result.sdevs = data['sdevs']
    return result
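# Usage sketch (hypothetical file name; assumes load() is exposed as a
# static method of LogNormalEdgeFrequencyModel): a save/load round trip
# of the fitted parameters.
#
#     model.save('edge-frequency-model.npz')
#     model2 = LogNormalEdgeFrequencyModel.load('edge-frequency-model.npz',
#                                               rule_set)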
def _compute_leaf_prob(self):
    logging.getLogger('main').info('Computing leaf probabilities...')
    self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                             dtype=np.float64)
    edge_set = EdgeSet(self.lexicon)

    def _empty_edge_set(edge_set):
        # Multiply the accumulated "no incoming edge" probabilities into
        # leaf_prob and return a fresh, empty edge set.
        lexicon = edge_set.lexicon
        n = len(edge_set)
        probs = 1 - self.model.edges_prob(edge_set)
        for e_id, edge in enumerate(edge_set):
            word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
            w_id = lexicon.get_id(word)
            t_id = self.tag_idx[edge.source.tag]
            self.leaf_prob[w_id, t_id] *= probs[e_id]
        edge_set = EdgeSet(lexicon)
        logging.getLogger('main').debug('Processed %d edges', n)
        return edge_set

    lexicon_tr = self.lexicon.to_fst()
    lexicon_tr.concatenate(FST.generator(self.tagset))
    rules_tr = self.model.rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')
    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    while True:
        line = p.stdout.readline().strip()
        if line:
            w1, w2 = line.split(':')
            n1 = LexiconEntry(w1)
            n2 = LexiconEntry(w2)
            rules = extract_all_rules(n1, n2)
            for rule in rules:
                if rule in self.model.rule_set:
                    edge_set.add(GraphEdge(n1, n2, rule))
        else:
            break
        # flush the edge set periodically to limit memory usage
        if len(edge_set) > 300000:
            edge_set = _empty_edge_set(edge_set)
    edge_set = _empty_edge_set(edge_set)
def build_fastss_cascade(lexicon_tr_file, alphabet, max_word_len=20,
                         use_tag_absorber=True, static_compose=False):
    delenv_file = 'delenv.att'
    delfilter_file = 'delfilter.att'
    tag_absorber_file = 'tag_absorber.att'
    write_delenv_transducer(
        delenv_file,
        shared.config['preprocess'].getint('max_affix_length'),
        shared.config['preprocess'].getint('max_infix_length'),
        shared.config['preprocess'].getint('max_infix_slots'))
    write_delfilter_transducer(delfilter_file, max_word_len)
    write_tag_absorber(tag_absorber_file, alphabet)

    # build the cascade by scripting hfst-xfst through its stdin
    # cmd = ['hfst-xfst', '-f', 'sfst']
    cmd = ['hfst-xfst', '-p']
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                         stdout=subprocess.DEVNULL, stderr=None,
                         universal_newlines=True)
    p.stdin.write('read att {}\n'.format(full_path(delfilter_file)))
    p.stdin.write('read att {}\n'.format(full_path(delenv_file)))
    if use_tag_absorber:
        p.stdin.write('read att {}\n'.format(full_path(tag_absorber_file)))
    p.stdin.write('compose\n')
    p.stdin.write('minimize\n')
    if static_compose:
        p.stdin.write('load stack {}\n'.format(full_path(lexicon_tr_file)))
        p.stdin.write('compose\n')
        p.stdin.write('minimize\n')
        p.stdin.write('define T\n')
        p.stdin.write('push T\n')
        p.stdin.write('invert\n')
        p.stdin.write('push T\n')
        p.stdin.write('compose\n')
        p.stdin.write('minimize\n')
    else:
        p.stdin.write('define T\n')
        p.stdin.write('push T\n')
        p.stdin.write('load stack {}\n'.format(full_path(lexicon_tr_file)))
        p.stdin.write('compose\n')
        p.stdin.write('minimize\n')
        p.stdin.write('invert\n')
        p.stdin.write('push T\n')
        p.stdin.write('rotate stack\n')
    fastss_tr_path = full_path(shared.filenames['fastss-tr'])
    p.stdin.write('save stack {}\n'.format(fastss_tr_path))
    p.stdin.write('quit\n')
    p.stdin.close()
    p.wait()
    # cleanup
    remove_file(delenv_file)
    remove_file(delfilter_file)
    remove_file(tag_absorber_file)
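# Usage sketch (hypothetical file name; assumes the lexicon transducer has
# already been written to disk, e.g. with FST.save_transducer):
#
#     build_fastss_cascade('lexicon.fsm', alphabet,
#                          max_word_len=20, use_tag_absorber=False)
#     # the resulting cascade is written to shared.filenames['fastss-tr']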
def to_fst(self) -> hfst.HfstTransducer:
    lexc_file = shared.filenames['lexicon-tr'] + '.lex'
    tags = set()
    for entry in self.items:
        for t in entry.tag:
            tags.add(t)
    with open_to_write(lexc_file) as lexfp:
        lexfp.write('Multichar_Symbols ' +
                    ' '.join(self._lexc_escape(s)
                             for s in shared.multichar_symbols + list(tags)) +
                    '\n\n')
        lexfp.write('LEXICON Root\n')
        for entry in self.items:
            lexfp.write('\t' + self._lexc_escape(entry.symstr) + ' # ;\n')
    transducer = hfst.compile_lexc_file(full_path(lexc_file))
    remove_file(lexc_file)
    return transducer
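# Usage sketch (hypothetical file name): compile the lexicon to a
# transducer and store it on disk with FST.save_transducer, as done for
# other transducers in this codebase.
#
#     lexicon_tr = lexicon.to_fst()
#     FST.save_transducer(lexicon_tr, 'lexicon.fsm')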
def compute_possible_edges(lexicon: Lexicon, rule_set: RuleSet) -> EdgeSet:
    # build the transducer
    lexicon_tr = lexicon.to_fst()
    tag_seqs = extract_tag_symbols_from_rules(rule_set)
    if tag_seqs:
        lexicon_tr.concatenate(FST.generator(tag_seqs))
    rules_tr = rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    lexicon_tr.invert()
    tr.compose(lexicon_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')
    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    edge_set = EdgeSet(lexicon)
    while True:
        line = p.stdout.readline().strip()
        if line:
            w1, w2 = line.split(':')
            w1_without_tag = re.sub(shared.compiled_patterns['tag'], '', w1)
            w2_without_tag = re.sub(shared.compiled_patterns['tag'], '', w2)
            if w1_without_tag != w2_without_tag:
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                rules = algorithms.align.extract_all_rules(n1, n2)
                for rule in rules:
                    if rule in rule_set:
                        n1_wt = lexicon.get_by_symstr(w1_without_tag)[0]
                        n2_wt = lexicon.get_by_symstr(w2_without_tag)[0]
                        edge_set.add(GraphEdge(n1_wt, n2_wt, rule))
        else:
            break
    return edge_set
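# Usage sketch (the loader calls below are hypothetical; assumes a Lexicon
# and a RuleSet have been read from the configured input files):
#
#     lexicon = Lexicon.load(shared.filenames['wordlist'])
#     rule_set = RuleSet.load(shared.filenames['rules'])
#     edges = compute_possible_edges(lexicon, rule_set)
#     logging.getLogger('main').info('{} possible edges'.format(len(edges)))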
def similar_words_with_lookup(words, transducer_path):
    cmd = ['hfst-lookup', '-i', full_path(transducer_path),
           '-C', 'composition']
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    count = 0
    restart_interval = \
        shared.config['preprocess'].getint('hfst_restart_interval')
    for word in words:
        p.stdin.write(word + '\n')
        p.stdin.flush()
        similar_words = set()
        while True:
            line = p.stdout.readline().strip()
            if line:
                cols = line.split('\t')
                if len(cols) == 3 and cols[2].startswith('0'):
                    similar_words.add(cols[1])
            else:
                break
        for sim_word in similar_words:
            yield (word, sim_word)
        count += 1
        if restart_interval > 0 and count % restart_interval == 0:
            # restart the HFST subprocess to counter the memory leak
            p.stdin.close()
            p.wait()
            p = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.DEVNULL,
                                 universal_newlines=True, bufsize=1)
    p.stdin.close()
    p.wait()
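# Usage sketch (hypothetical transducer path; the generator is consumed
# lazily, one input word at a time):
#
#     for word, sim_word in similar_words_with_lookup(words, 'fastss.fsm'):
#         print(word, sim_word, sep='\t')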
def save(self, filename: str) -> None:
    np.savez(full_path(filename), means=self.means, sdevs=self.sdevs)
def generate_words(tr_file: str, analyzer: Analyzer, model: ModelSuite,
                   freq_model: bool = True, sum_analyses: bool = True,
                   min_freq: float = 1, max_freq: float = 2):
    logging.getLogger('main').info(
        'Precomputing the Gaussian distribution table...')
    _normcdf_cache = norm.cdf(np.array(range(-10000, 10001)) / 1000)
    max_cost = shared.config['generate'].getfloat('max_cost')
    log_max_freq = math.log(max_freq)
    log_min_freq = math.log(min_freq)

    def _normcdf(x):
        # look up the precomputed table; clip the argument to [-10, 10]
        if x < -10:
            x = -10
        elif x > 10:
            x = 10
        return _normcdf_cache[int((x + 10) * 1000)]

    def _edge_prob_ratio(edge: GraphEdge) -> float:
        r_id = model.rule_set.get_id(edge.rule)
        prob = model.edge_model.edge_prob(edge)
        return prob / (1 - prob)

    def _edge_freq_prob(edge: GraphEdge) -> float:
        # probability that the target word's log-frequency falls into
        # [log_min_freq, log_max_freq] under the log-normal edge
        # frequency model
        r_id = model.rule_set.get_id(edge.rule)
        mean = model.edge_frequency_model.means[r_id]
        sdev = model.edge_frequency_model.sdevs[r_id]
        norm_min_freq = (log_min_freq - edge.source.logfreq - mean) / sdev
        norm_max_freq = (log_max_freq - edge.source.logfreq - mean) / sdev
        freq_prob = _normcdf(norm_max_freq) - _normcdf(norm_min_freq)
        return freq_prob

    logging.getLogger('main').info('Generating...')
    cmd = ['hfst-fst2strings', full_path(tr_file)]
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL, universal_newlines=True,
                         bufsize=1)
    while True:
        try:
            line = p.stdout.readline().strip()
            if line:
                word = unnormalize_word(line.rstrip())
                analyses = analyzer.analyze(LexiconEntry(word),
                                            compute_cost=False)
                word_prob_ratio = 0
                for edge in analyses:
                    prob_ratio = _edge_prob_ratio(edge)
                    if freq_model:
                        prob_ratio *= _edge_freq_prob(edge)
                    if sum_analyses:
                        word_prob_ratio += prob_ratio
                    else:
                        word_prob_ratio = max(word_prob_ratio, prob_ratio)
                if word_prob_ratio > 0:
                    cost = -math.log(word_prob_ratio)
                    if cost < max_cost:
                        yield (word, cost)
            else:
                break
        except Exception as e:
            logging.getLogger('main').warning(str(e))
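# Usage sketch (hypothetical transducer file name; assumes an Analyzer and
# a ModelSuite have been constructed elsewhere):
#
#     for word, cost in generate_words('wordgen.fsm', analyzer, model,
#                                      freq_model=True,
#                                      min_freq=1, max_freq=2):
#         print('{}\t{:.4f}'.format(word, cost))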
def save(self, filename) -> None:
    np.savez(full_path(filename), means=self.means, vars=self.vars)