def generate_inflections(self, q, lemma):
    """Return a normalised distribution over inflected forms of ``lemma``.

    ``q`` is composed with ``self.lattice`` and then with ``self.refiner``.
    The output projection of the result is epsilon-removed, determinized and
    minimized. Each arc leaving the start state is then read: its input label
    is looked up in ``self.input_syms`` and is expected to have the form
    ``"<delete>_<insert>"``, and its weight ``w`` is converted to
    ``exp(-w)``. The converted weights are normalised to sum to one.

    Args:
        q: FST composed with ``self.lattice`` (presumably encodes the lemma
            and its constraints -- confirm against the caller).
        lemma: The lemma, as a string or a sequence of string symbols.

    Returns:
        dict mapping each inflected form (the lemma with ``len(delete)``
        trailing items removed, followed by ``insert``) to its normalised
        score.

    Raises:
        ValueError: if a label does not contain exactly one ``"_"``.
    """
    paths = fst.compose(q, self.lattice)
    two_state = fst.compose(paths, self.refiner)
    output = two_state.project(project_output=True)
    output.rmepsilon()
    output = fst.determinize(output)
    output.minimize()

    # Read the arcs leaving the start state.
    labels = []
    scores = []
    for arc in output.arcs(output.start()):
        labels.append(self.input_syms.find(arc.ilabel))
        scores.append(math.exp(-float(arc.weight)))

    total = sum(scores)
    probabilities = [score / total for score in scores]

    inflections = {}
    for label, probability in zip(labels, probabilities):
        delete, insert = label.split("_")
        n_deleted = len(delete)
        # ``lemma[:-0]`` is empty, so an empty deletion must keep the whole
        # lemma rather than slicing with a zero offset.
        stem = lemma[:-n_deleted] if n_deleted else lemma
        # NOTE(review): if two labels yield the same form, the later entry
        # overwrites the earlier one -- confirm this is intended.
        inflections["".join(stem) + insert] = probability
    return inflections
def generate_inflections(self, q, lemma):
    """Return the ranked inflection candidates of ``lemma`` as a string.

    ``q`` is composed with ``self.lattice`` and then with ``self.refiner``.
    The output projection of the result is epsilon-removed, determinized and
    minimized. Each arc leaving the start state is then read: its input label
    is looked up in ``self.input_syms`` and is expected to have the form
    ``"<delete>_<insert>"``. The arc weights are normalised to sum to one.

    Args:
        q: FST composed with ``self.lattice`` (presumably encodes the lemma
            and its constraints -- confirm against the caller).
        lemma: The lemma as a string.

    Returns:
        ``str()`` of the list of ``(inflected_form, normalised_weight)``
        pairs sorted by ascending normalised weight.

    Raises:
        ValueError: if a label does not contain exactly one ``"_"``.
        ZeroDivisionError: if the arc weights sum to zero.
    """
    paths = fst.compose(q, self.lattice)
    two_state = fst.compose(paths, self.refiner)
    output = two_state.project(project_output=True)
    output.rmepsilon()
    output = fst.determinize(output)
    output.minimize()

    # Read the arcs leaving the start state.
    labels = []
    weights = []
    for arc in output.arcs(output.start()):
        labels.append(self.input_syms.find(arc.ilabel))
        # NOTE(review): weights are normalised as-is (not exponentiated);
        # confirm the arcs carry probabilities rather than costs.
        weights.append(float(arc.weight))

    total = sum(weights)
    norm_dist = [weight / total for weight in weights]

    relabels = []
    for label in labels:
        delete, insert = label.split("_")
        n_deleted = len(delete)
        # ``lemma[:-0]`` is empty, so an empty deletion must keep the whole
        # lemma rather than slicing with a zero offset.
        stem = lemma[:-n_deleted] if n_deleted else lemma
        relabels.append(stem + insert)

    return str(sorted(zip(relabels, norm_dist), key=lambda x: x[1]))
def build_lm(dev_fname, isyms_fname, constraints, lattice_output, refiner_fname):
    """Build a lattice mapping lemmas under ``constraints`` to inflection edits.

    Every line of ``dev_fname`` whose constraint field equals ``constraints``
    (after ``"_"`` has been rewritten to ``";"``) contributes one path. The
    path starts from the FST returned by ``create_priors``, consumes the
    lemma from the first position where it differs from the inflection, and
    ends with an arc labelled ``"<lemma residual>_<inflection residual>"``
    that outputs ``'</s>'``. The union of all paths is epsilon-removed,
    determinized, extended with self-loops, inverted and written to
    ``lattice_output``.

    Args:
        dev_fname: Path to a whitespace-separated file; the last three fields
            of each line are read as constraints, lemma and inflection.
        isyms_fname: Path to the text symbol table.
        constraints: Constraint string selecting which lines are used.
        lattice_output: Path the resulting lattice is written to.
        refiner_fname: Not used by this function; kept for interface
            compatibility.
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms and build the symbol -> id lookup
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst()
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # The context manager closes the data file even if a line fails to parse.
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            cns, lemma, inflection = line.split()[-3:]
            if cns != constraints:
                continue
            print(cns, lemma, inflection)

            # Find the index where the strings diverge. It stays 0 when no
            # differing position is found, so the whole lemma is the residual.
            idx = 0
            for i, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = i
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            keep = old
            for ltr in lemma[idx:]:
                new = f.add_state()
                f.add_arc(old, fst.Arc(code[ltr], code[ltr],
                                       fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()

            # the residual of the lemma is mapped to the inflection residual
            # (indirectly, through a single combined symbol)
            sym = lemma[idx:] + "_" + inflection[idx:]
            print(lemma, inflection, sym)
            f.add_arc(old, fst.Arc(code[sym], code[s_fin],
                                   fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)

    f_big = fst.determinize(f_big.rmepsilon())

    # Add <sigma> self-loops at the <sigma> placeholder state.
    # NOTE(review): ``keep`` is a state id taken from the last per-line FST
    # and reused on the determinized union; it is unbound if no line matched
    # ``constraints``. Confirm both are acceptable.
    for c, ltr in code.items():
        # Hard-coded symbol-id range (per the original author: the Russian
        # alphabet plus 2 more symbols).
        if 1 < int(ltr) < 36:
            f_big.add_arc(keep, fst.Arc(code[c], code[c],
                                        fst.Weight(f_big.weight_type(), 1.0),
                                        keep))
    f_big.invert()

    # save lattice
    f_big.write(lattice_output)
def from_vocab(cls, vocab, tokenizer):
    """Build a lexicon FST from ``vocab`` and wrap it in ``cls``.

    Each word is tokenized with ``tokenizer.token2idx`` and terminated with
    ``tokenizer.space_idx``; the resulting index sequence becomes a chain of
    arcs leaving the shared start state, ending in a final state. The FST is
    then epsilon-removed, determinized and minimized.

    Args:
        vocab: Iterable of words.
        tokenizer: Object providing ``token2idx(word)`` (returning a list of
            indices) and a ``space_idx`` attribute.

    Returns:
        ``cls(fst_path=None, fst=<built FST>)``.
    """
    lexicon = openfst.Fst()

    def insert_word(word):
        token_ids = tokenizer.token2idx(word) + [tokenizer.space_idx]

        # The start state is created lazily, on the first inserted word.
        if lexicon.num_states() == 0:
            initial_state = lexicon.add_state()
            assert initial_state == 0
            lexicon.set_start(initial_state)

        current = lexicon.start()
        last_state = None
        for token_id in token_ids:
            # Labels are shifted by one so that index 0 is never used as an
            # arc label (OpenFst reserves label 0 for epsilon); unshifted
            # labels would give wrong decoding results.
            arc_label = token_id + 1
            last_state = lexicon.add_state()
            lexicon.add_arc(
                current,
                openfst.Arc(arc_label, arc_label, 0, last_state))
            current = last_state
        lexicon.set_final(last_state, openfst.Weight.One('tropical'))

    for word in vocab:
        insert_word(word)

    # Drop epsilon transitions (those taken without consuming input). This is
    # required before determinization, though it may grow the FST a lot.
    lexicon.rmepsilon()

    # Determinize: for any input string there is then at most one state the
    # FST can be in, so lookups never have to follow several transitions.
    compact = openfst.determinize(lexicon)

    # Minimization is optional; it only reduces the memory footprint.
    compact.minimize()
    return cls(fst_path=None, fst=compact)
def make_graph(self,
               lexicon_file,
               graphemes_file,
               grammar_file,
               sil_symbol,
               disambig_graphemes_file='graphemes_disambig.txt',
               words_file='words.txt',
               graph_file='LG.fst'):
    """Build the decoding graph, write it to disk and return it.

    The lexicon FST (L) and grammar FST (G) are composed, determinized,
    minimized and arc-sorted into LG. For graph type 'TLG' the token FST (T)
    is additionally composed in front of LG. Arcs for unknown words are
    removed in both cases; disambiguation symbols are removed explicitly only
    for 'LG'.

    Args:
        lexicon_file: lexicon file from word to graphemes sequence
        graphemes_file: graphemes id file
        grammar_file: arpa language model file
        sil_symbol: silence symbol passed to the lexicon builder
        disambig_graphemes_file: output graphemes table from grapheme to id
            isymbols_table for result WFST
        words_file: output words table from word to id
            osymbols_table for result WFST
        graph_file: write result graph to graph_file, default: LG.fst

    Returns:
        The built graph, which is also stored in ``self.graph``.

    Raises:
        ValueError: if ``self.graph_type`` is neither 'TLG' nor 'LG'.
    """
    # Fail early and explicitly: an unsupported type would otherwise leave
    # ``L`` unbound and only crash after output files had been written.
    if self.graph_type not in ('TLG', 'LG'):
        raise ValueError(
            "unsupported graph_type %r; expected 'TLG' or 'LG'"
            % (self.graph_type,))
    is_tlg = self.graph_type == 'TLG'

    L = self.lexicon_builder(lexicon_file, graphemes_file, sil_symbol)
    if is_tlg:
        logging.info('grapheme should be phones or syllables')
        logging.info(
            'please confirm the sil symbol is "SIL" or some other you defined'
        )
    else:
        logging.info('grapheme should be characters')
        logging.info(
            'please confirm the sil symbol is <space> or some other you defined'
        )

    # Write the tables before building G and T, which read them back.
    self.lexicon_builder.write_words_table(words_file)
    self.lexicon_builder.write_disambig_graphemes_table(
        disambig_graphemes_file)
    logging.info('make lexicon fst successfully')

    G = self.grammar_builder(grammar_file, words_file)
    logging.info('make grammar fst successfully')

    LG = fst.compose(L, G)
    logging.info('LG compose successfully')
    LG = fst.determinize(LG)
    logging.info('LG determinize successfully')
    LG.minimize()
    logging.info('LG minimize successfully')
    LG.arcsort(sort_type='ilabel')

    if is_tlg:
        T = self.token_builder(disambig_graphemes_file)  # specify blank
        self.graph = fst.compose(T, LG)
        logging.info('TLG compose successfully')
        remove_unk_arc(self.graph, self.lexicon_builder.unk_ids)
        logging.info('TLG fst remove unk successfully')
    else:
        self.graph = LG
        remove_unk_arc(self.graph, self.lexicon_builder.unk_ids)
        logging.info('LG fst remove unk successfully')
        remove_disambig_symbol(self.graph,
                               self.lexicon_builder.disambig_ids)
        logging.info('LG fst remove disambig successfully')

    self.graph.arcsort(sort_type='ilabel')
    self.graph.write(graph_file)
    logging.info('write graph successfully')
    return self.graph
def build_lm(dev_fname, isyms_fname, constraints, lattice_output):
    """Build a log-semiring lattice mapping lemmas to inflection edits.

    Every line of ``dev_fname`` whose constraint field equals ``constraints``
    (after ``"_"`` has been rewritten to ``";"``) contributes one path. The
    path starts from the FST returned by ``create_priors``, consumes the
    lemma symbols from the first position where lemma and inflection differ,
    and ends with an arc labelled
    ``"<lemma residual>_<inflection residual>"`` that outputs ``'</s>'``.
    The union of all paths is epsilon-removed, determinized, extended with
    self-loops, inverted and written to ``lattice_output``.

    Args:
        dev_fname: Path to a tab-separated file. Each line must have exactly
            five fields; the first three are read as lemma, inflection and
            constraints. Lemma and inflection are whitespace-separated symbol
            sequences.
        isyms_fname: Path to the text symbol table.
        constraints: Constraint string selecting which lines are used.
        lattice_output: Path the resulting lattice is written to.
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms and build the symbol -> id lookup
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst("log")
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # All possible inflections are added, regardless of the prior (applying
    # the prior can make for a more efficient computation).
    # The context manager closes the data file even if a line fails to parse.
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            line = line.strip()
            lemma, inflection, cns = line.split("\t")[:-2]
            if cns != constraints:
                continue

            lemma = lemma.split()
            inflection = inflection.split()

            # Find the index where the sequences diverge. It stays 0 when no
            # differing position is found, so the whole lemma is the residual.
            idx = 0
            for j, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = j
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            keep = old
            for symbol in lemma[idx:]:
                new = f.add_state()
                f.add_arc(
                    old,
                    fst.Arc(code[symbol], code[symbol],
                            fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()

            # the residual of the lemma is mapped to the inflection residual
            # (indirectly, through a single combined symbol)
            sym = "".join(lemma[idx:]) + "_" + "".join(inflection[idx:])
            f.add_arc(
                old,
                fst.Arc(code[sym], code[s_fin],
                        fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)

    f_big = fst.determinize(f_big.rmepsilon())

    # Add <sigma> self-loops at the <sigma> placeholder state.
    # NOTE(review): ``keep`` is a state id taken from the last per-line FST
    # and reused on the determinized union; it is unbound if no line matched
    # ``constraints``. Confirm both are acceptable.
    for c, ltr in code.items():
        # Hard-coded symbol-id range (per the original author: the Russian
        # alphabet plus 2 more symbols).
        if 1 < int(ltr) < 51:
            f_big.add_arc(
                keep,
                fst.Arc(code[c], code[c],
                        fst.Weight(f_big.weight_type(), 1.0), keep))
    f_big.invert()

    # save lattice
    f_big.write(lattice_output)