def generate_inflections(self, q, lemma):
    """Compose the query FST with the inflection lattice and return a
    mapping from each inflected form of ``lemma`` to its normalized
    probability.

    Args:
        q: query automaton (Fst) to compose with ``self.lattice``.
        lemma: base word form; edit labels of the form
            "<delete>_<insert>" are applied to its tail to produce
            each surface inflection.

    Returns:
        dict: inflected form -> normalized probability.
    """
    paths = fst.compose(q, self.lattice)
    two_state = fst.compose(paths, self.refiner)
    output = two_state.project(project_output=True)
    output.rmepsilon()
    output = fst.determinize(output)
    output.minimize()

    # Read the arcs leaving the start state: each arc carries one edit
    # label and a weight in the -log (tropical) semiring.
    weights = []
    labels = []
    state = output.start()
    for arc in output.arcs(state):
        labels.append(self.input_syms.find(arc.ilabel))
        # Convert the -log weight back into a probability.
        weights.append(math.exp(-float(arc.weight)))

    total = sum(weights)
    norm_dist = [w / total for w in weights]

    inf_d = {}
    for label, prob in zip(labels, norm_dist):
        delete, insert = label.split("_")
        n = len(delete)
        # BUG FIX: lemma[:-n] with n == 0 evaluates to "" (empty slice),
        # losing the whole stem when nothing is deleted. Slice with an
        # explicit end index instead.
        stem = lemma[:len(lemma) - n] if n else lemma
        inf_d[stem + insert] = prob
    return inf_d
def generate_inflections(self, q, lemma):
    """Variant of ``generate_inflections`` that returns a printable
    string of ``(inflection, probability)`` pairs sorted by probability
    (ascending).

    NOTE(review): this redefinition shadows the earlier method of the
    same name — only one of the two is live at runtime; consider
    renaming one of them.

    Args:
        q: query automaton (Fst) to compose with ``self.lattice``.
        lemma: base word form to which "<delete>_<insert>" edit labels
            are applied.

    Returns:
        str: repr of the sorted list of (form, probability) pairs.
    """
    paths = fst.compose(q, self.lattice)
    two_state = fst.compose(paths, self.refiner)
    output = two_state.project(project_output=True)
    output.rmepsilon()
    output = fst.determinize(output)
    output.minimize()

    weights = []
    labels = []
    state = output.start()
    for arc in output.arcs(state):
        labels.append(self.input_syms.find(arc.ilabel))
        # BUG FIX: arc weights are -log probabilities (tropical
        # semiring); normalizing the raw values inverts the ranking
        # (larger -log = LESS probable). Convert with exp(-w) first,
        # matching the sibling generate_inflections.
        weights.append(math.exp(-float(arc.weight)))

    total = sum(weights)
    norm_dist = [w / total for w in weights]

    relabels = []
    for label in labels:
        delete, insert = label.split("_")
        n = len(delete)
        # BUG FIX: lemma[:-0] is "" — slice with an explicit end index
        # so an empty delete-part keeps the whole lemma.
        stem = lemma[:len(lemma) - n] if n else lemma
        relabels.append(stem + insert)
    return str(sorted(zip(relabels, norm_dist), key=lambda x: x[1]))
def apply_fst(elements, automata_op, is_project=True, **kwargs):
    """Apply `automata_op` to a linear automaton built from `elements`.

    Args:
        elements (list): ordered edge symbols for the linear automaton.
        automata_op (Fst): automaton to apply via composition.
        is_project (bool, optional): if True, keep only output labels.
        kwargs: extra arguments forwarded to the linear-automaton compiler.

    Returns:
        The composed (and optionally output-projected) Fst.
    """
    chain = linear_fst(elements, automata_op, **kwargs)
    result = fst.compose(chain, automata_op)
    if is_project:
        result.project(project_output=True)
    return result
def main():
    """Build the lexicon FST (L), compose it with the grammar (G), and
    write both L and the composed LG graph to the paths given on the
    command line."""
    parser = argparse.ArgumentParser("")
    # All options are plain string paths.
    for flag in ("--lexicon", "--word", "--phone", "--G", "--L", "--LG"):
        parser.add_argument(flag, type=str)
    args = parser.parse_args()

    word_sym = load_symbols(args.word)
    phone_sym = load_symbols(args.phone)

    lexicon_fst = make_fst(word_sym, phone_sym, args.lexicon)
    lexicon_fst.write(args.L)

    grammar_fst = fst.Fst.read(args.G)
    composed = fst.compose(lexicon_fst, grammar_fst)
    composed.write(args.LG)
def make_graph(self,
               lexicon_file,
               graphemes_file,
               grammar_file,
               sil_symbol,
               disambig_graphemes_file='graphemes_disambig.txt',
               words_file='words.txt',
               graph_file='LG.fst'):
    """build decode graph and write to disk

    Args:
        lexicon_file: lexicon file from word to graphemes sequence
        graphemes_file: graphemes id file
        grammar_file: arpa language model file
        sil_symbol: silence symbol used by the lexicon builder
        disambig_graphemes_file: output graphemes table from grapheme to id
            isymbols_table for result WFST
        words_file: output words table from word to id
            osymbols_table for result WFST
        graph_file: write result graph to graph_file, default: LG.fst

    Returns:
        the final decode graph (also written to graph_file).

    Raises:
        ValueError: if self.graph_type is neither 'TLG' nor 'LG'.
    """
    # BUG FIX: an unrecognized graph_type previously skipped both
    # branches and crashed later with UnboundLocalError on L; fail fast
    # with a clear error instead.
    if self.graph_type not in ('TLG', 'LG'):
        raise ValueError(f'unsupported graph type: {self.graph_type}')

    # Both graph types build the lexicon FST the same way; only the
    # guidance logged to the user differs.
    L = self.lexicon_builder(lexicon_file, graphemes_file, sil_symbol)
    if self.graph_type == 'TLG':
        logging.info('grapheme should be phones or syllables')
        logging.info(
            'please confirm the sil symbol is "SIL" or some other you defined'
        )
    else:  # 'LG'
        logging.info('grapheme should be characters')
        logging.info(
            'please confirm the sil symbol is <space> or some other you defined'
        )
    self.lexicon_builder.write_words_table(words_file)
    self.lexicon_builder.write_disambig_graphemes_table(
        disambig_graphemes_file)
    logging.info('make lexicon fst successfully')

    G = self.grammar_builder(grammar_file, words_file)
    logging.info('make grammar fst successfully')

    LG = fst.compose(L, G)
    logging.info('LG compose successfully')
    LG = fst.determinize(LG)
    logging.info('LG determinize successfully')
    LG.minimize()
    logging.info('LG minimize successfully')
    LG.arcsort(sort_type='ilabel')

    if self.graph_type == 'TLG':
        T = self.token_builder(disambig_graphemes_file)  # specify blank
        self.graph = fst.compose(T, LG)
        logging.info('TLG compose successfully')
        remove_unk_arc(self.graph, self.lexicon_builder.unk_ids)
        logging.info('TLG fst remove unk successfully')
    else:  # 'LG'
        self.graph = LG
        remove_unk_arc(self.graph, self.lexicon_builder.unk_ids)
        logging.info('LG fst remove unk successfully')
        remove_disambig_symbol(self.graph,
                               self.lexicon_builder.disambig_ids)
        logging.info('LG fst remove disambig successfully')

    self.graph.arcsort(sort_type='ilabel')
    self.graph.write(graph_file)
    logging.info('write graph successfully')
    return self.graph
def decode(input_fst, model):
    """Compose `input_fst` with `model` and return the shortest path
    through the resulting lattice."""
    lattice = fst.compose(input_fst, model)
    best_path = fst.shortestpath(lattice)
    return best_path