def read_fst(inp):
    compiler = fst.Compiler()
    while True:
        key = inp.readline().strip()
        if len(key) == 0:
            return
        while True:
            line = inp.readline()
            if len(line.strip()) == 0:
                break
            parts = line.split()
            if len(parts) > 4:
                parts = parts[:4]
            if len(parts) == 2:
                parts = parts[:1]
            print(" ".join(parts), file=compiler)
        yield key, compiler.compile()
def _fst1():
    compiler = fst.Compiler()
    print("0 1 1 1 -1", file=compiler)
    print("0 1 2 2 -2", file=compiler)
    print("0 1 3 3 -3", file=compiler)
    print("1 2 1 1 1", file=compiler)
    print("1 2 2 2 0", file=compiler)
    print("1 2 3 3 -2", file=compiler)
    print("2 3 1 1 -2", file=compiler)
    print("2 3 2 2 0", file=compiler)
    print("2 3 3 3 0", file=compiler)
    print("3 4 1 1 1", file=compiler)
    print("3 4 2 2 2", file=compiler)
    print("3 4 3 3 3", file=compiler)
    print("4", file=compiler)
    f = compiler.compile()
    return f
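# A minimal, self-contained sketch (not one of the collected snippets) of the
# AT&T text format that fst.Compiler consumes in the functions above and below:
# each arc line is "src dest ilabel olabel [weight]", and a line holding only a
# state id (optionally followed by a weight) marks that state as final.
# Assumes pywrapfst is available as fst.
import pywrapfst as fst

def _tiny_example():
    compiler = fst.Compiler()
    print("0 1 1 1 0.5", file=compiler)  # arc from state 0 to 1, labels 1:1, weight 0.5
    print("1 2 2 2", file=compiler)      # omitted weight defaults to semiring One (0 in tropical)
    print("2", file=compiler)            # state 2 is final
    return compiler.compile()            # compile() returns the FST and resets the compiler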
def general():
    st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
    compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
    compiler.write('0 0 <other> <other>\n')
    compiler.write('0 0 <#> <#>\n')
    compiler.write('0\n')
    # for special cases
    # compiler.write('0 1 i i\n')
    # compiler.write('1 2 n n\n')
    # compiler.write('2 3 g g\n')
    # compiler.write('3 4 <^> <^>\n')
    compiler.write('0 1 <^> <epsilon>\n')
    compiler.write('1 0 <other> <other>\n')
    compiler.write('1 0 <#> <#>\n')
    c = compiler.compile()
    fststr.expand_other_symbols(c)
    return c
def write_hypos(self, all_hypos):
    """Writes FST files with standard arcs for each sentence in
    ``all_hypos``. The created lattices are not optimized in any way:
    We create a distinct path for each entry in ``all_hypos``. We
    advise you to determinize/minimize them if you are planning to use
    them for further processing.

    Args:
        all_hypos (list): list of nbest lists of hypotheses

    Raises:
        OSError. If the directory could not be created
        IOError. If something goes wrong while writing to the disk
    """
    try:
        os.makedirs(self.path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
        else:
            logging.warn("Output FST directory %s already exists." % self.path)
    fst_idx = self.start_sen_id
    for hypos in all_hypos:
        fst_idx += 1
        c = fst.Compiler()
        # state ID 0 is start, 1 is final state
        next_free_id = 2
        for hypo in hypos:
            # Connect with start node
            c.write("0\t%d\t%d\t%d\t%f\n" % (next_free_id, utils.GO_ID, utils.GO_ID, -hypo.total_score))
            next_free_id += 1
            for sym in hypo.trgt_sentence:
                c.write("%d\t%d\t%d\t%d\n" % (next_free_id - 1, next_free_id, sym, sym))
                next_free_id += 1
            # Connect with final node
            c.write("%d\t1\t%d\t%d\n" % (next_free_id - 1, utils.EOS_ID, utils.EOS_ID))
        c.write("1\n")
        f = c.compile()
        f.write(self.file_pattern % fst_idx)
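# A minimal sketch (not part of the writer above) of how one of the lattice
# files written by write_hypos could be read back and its best hypothesis
# extracted. The file path and the meaning of the integer labels are assumptions.
def best_hypo(fst_path):
    lattice = fst.Fst.read(fst_path)
    best = fst.shortestpath(lattice)   # single shortest path in the tropical semiring
    best.rmepsilon()
    best.topsort()                     # for a single linear path this puts states in path order
    labels = []
    for state in best.states():
        for arc in best.arcs(state):
            if arc.olabel != 0:        # skip epsilon arcs
                labels.append(arc.olabel)
    return labels                      # integer word IDs, e.g. GO_ID ... EOS_ID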
def write_hypos(self, all_hypos, sen_indices):
    """Writes FST files with sparse tuples for each sentence in
    ``all_hypos``. The created lattices are not optimized in any way:
    We create a distinct path for each entry in ``all_hypos``. We
    advise you to determinize/minimize them if you are planning to use
    them for further processing.

    Args:
        all_hypos (list): list of nbest lists of hypotheses
        sen_indices (list): List of sentence indices (0-indexed)

    Raises:
        OSError. If the directory could not be created
        IOError. If something goes wrong while writing to the disk
    """
    _mkdir(self.path, "FST")
    for fst_idx, hypos in zip(sen_indices, all_hypos):
        fst_idx += 1
        c = fst.Compiler(arc_type="tropicalsparsetuple")
        # state ID 0 is start, 1 is final state
        next_free_id = 2
        for hypo in hypos:
            syms = hypo.trgt_sentence
            # Connect with start node
            c.write("0\t%d\t%d\t%d\n" % (next_free_id, utils.GO_ID, utils.GO_ID))
            next_free_id += 1
            for pos in xrange(len(hypo.score_breakdown) - 1):
                c.write("%d\t%d\t%d\t%d\t%s\n" % (
                    next_free_id - 1,      # last state id
                    next_free_id,          # next state id
                    syms[pos], syms[pos],  # arc labels
                    self.write_weight(hypo.score_breakdown[pos])))
                next_free_id += 1
            # Connect with final node
            c.write("%d\t1\t%d\t%d\t%s\n" % (next_free_id - 1, utils.EOS_ID, utils.EOS_ID,
                                             self.write_weight(hypo.score_breakdown[-1])))
        c.write("1\n")  # Add final node
        f = c.compile()
        f.write(self.file_pattern % fst_idx)
def _get_basic_word_fst(self, text_arr):
    self.current_oov_queue = queue.Queue()
    compiler = fst.Compiler()
    state_counter = 0
    next_state = 0
    for arr in text_arr:
        if not arr:
            continue
        # single word, single expansion
        if len(arr) == 1:
            for w in arr:
                int_val = self._get_int_value_word(w)
                from_state = state_counter
                if next_state != 0:
                    to_state = next_state
                    state_counter = next_state - 1
                    next_state = 0
                else:
                    to_state = state_counter + 1
                self._compile_entry(compiler, from_state, int_val, to_state)
                state_counter += 1
        # multiple verbalization possibilities
        # we are working char by char, so store the state_counter, such that
        # all possibilities have the same from_state (starting state)
        else:
            from_state = state_counter
            to_state = state_counter + 1
            for i, w in enumerate(arr):
                int_val = self._get_int_value_word(w)
                self._compile_entry(compiler, from_state, int_val, to_state)
                state_counter += 1
            next_state = to_state + 1
            state_counter = to_state
    compiler.write("{}\n\n".format(state_counter))
    input_fst = compiler.compile()
    return input_fst
def make_mapping_loop_fst(mapping):
    lines = []
    start = '0'
    mid = '1'
    end = '2'
    lines.append('%s %s 0 0' % (start, mid))
    for (fro, to) in mapping:
        lines.append('%s %s %s %s' % (mid, mid, fro + 1, to + 1))
    lines.append('%s %s 0 0' % (mid, end))
    lines.append(str(end))
    compiler = openfst.Compiler()
    for line in lines:
        print >> compiler, line
        #compiler.write(line)  ## TODO test
    f = compiler.compile()
    f.arcsort(sort_type="olabel")  # pywrapfst's arcsort keyword is sort_type
    return f
def linear_fst(elements, automata_op, keep_isymbols=True, **kwargs):
    """Produce a linear automata.

    Based on code from
    https://stackoverflow.com/questions/9390536/how-do-you-even-give-an-openfst-made-fst-input-where-does-the-output-go.

    Args:
        elements (list): ordered list of input symbols
        automata_op (Fst): automaton to apply
        keep_isymbols (bool): whether to keep the input symbols
    """
    compiler = fst.Compiler(isymbols=automata_op.input_symbols().copy(),
                            acceptor=keep_isymbols,
                            keep_isymbols=keep_isymbols,
                            **kwargs)
    for i, el in enumerate(elements):
        print("{} {} {}".format(i, i + 1, el), file=compiler)
    print(str(i + 1), file=compiler)
    return compiler.compile()
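# A companion sketch in the spirit of the same StackOverflow answer (an
# assumption, not part of the snippet above): "apply" automata_op to a symbol
# sequence by composing it with the linear FST built by linear_fst.
def apply_fst(elements, automata_op, **kwargs):
    linear = linear_fst(elements, automata_op, **kwargs)
    linear.arcsort(sort_type="olabel")       # compose expects at least one side label-sorted
    return fst.compose(linear, automata_op)

# Hypothetical usage: apply_fst(["h", "e", "l", "l", "o"], my_transducer)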
def fst_stringcompile(self, text):
    compiler = fst.Compiler()
    state_counter = 0
    for c in text:
        uni = self.utf8_symbols.find(c)
        if uni == -1:
            conv = '0x%04x' % ord(c)
            uni = self.utf8_symbols.find(conv)
        from_state = state_counter
        to_state = state_counter + 1
        entry = "{} {} {} {}\n".format(from_state, to_state, uni, uni)
        compiler.write(entry)
        state_counter += 1
    compiler.write("{}\n\n".format(state_counter))
    input_fst = compiler.compile()
    # need to convert to pynini-fst for the combination with the grammar fst
    pynini_fst = pn.Fst.from_pywrapfst(input_fst)
    return pynini_fst
def build_automa(self, features_file, lex_in, lex_out):
    labels, tokens = self.load_data(features_file)
    #corpus = load_data(text_file)
    token_label_pair = [
        self.__filter(token, lex_in) + " " + labels[id]
        for id, token in enumerate(tokens)
    ]
    labels_counter = Counter(labels)
    token_label_pair_counter = Counter(token_label_pair)
    mean = np.asarray(list(token_label_pair_counter.values())).mean()
    sum = np.asarray(list(token_label_pair_counter.values())).sum()
    std = np.asarray(list(token_label_pair_counter.values())).std()
    # Compute probabilities
    token_label_pair_probabilities = {}
    for pair, count in token_label_pair_counter.items():
        token, label = pair.split(" ")
        token_label_pair_probabilities[pair] = -math.log(
            float(count) / float(labels_counter[label]))
    for label in labels:
        key = "<unk> " + label
        if key not in token_label_pair_probabilities.keys():
            token_label_pair_probabilities["<unk> " + label] = -math.log(
                1 / float(len(labels_counter)))
            #token_label_pair_probabilities["<unk> "+label] = -math.log(uniform(mean, std) / float(labels_counter[label]))
    compiler = fst.Compiler(isymbols=lex_in, osymbols=lex_out)
    corpus = ""
    for pair, prob in token_label_pair_probabilities.items():
        token, label = pair.split(" ")
        corpus += "0 0 {0} {1} {2}\n".format(token, label, str(prob))
    print >> compiler, corpus
    print >> compiler, "0"  # mark state 0 as final
    with open("bin/automa.txt", "w") as file:
        file.write(corpus)
    automaton = compiler.compile()
    return automaton
def buildNgramCounter(wmap, maxngram=1):
    '''Build an n-gram counting transducer
    @wmap: file containing the vocabulary
    @maxngram: maximum order of the grams to be counted'''
    filenames = []
    counters = []
    for order in range(1, maxngram + 1):
        initial_state = 0
        final_state = order + 1
        filename = 'counter' + str(order)
        filenames.append(filename)
        with open(filename, 'w') as outfile:
            with open(wmap, 'r') as infile:
                line = infile.readline()
                while line:
                    line = line.strip()
                    outfile.write(str(initial_state) + " " + str(initial_state) + " " + line + " " + str(0) + "\n")
                    for state in range(order):
                        outfile.write(str(state) + " " + str(state + 1) + " " + line + " " + line + "\n")
                    outfile.write(str(final_state - 1) + " " + str(final_state - 1) + " " + line + " " + str(0) + "\n")
                    line = infile.readline()
            outfile.write(str(order) + "\n")
    compiler = fst.Compiler()
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                compiler.write(line)
        tmp = compiler.compile()
        tmp.write(filename + ".fst")
    for filename in filenames:
        counters.append(fst.Fst.read(filename + ".fst"))
    # NOTE: the pairwise unions below assume maxngram == 4 (four counter FSTs)
    elem1 = counters[0].union(counters[1])
    elem2 = counters[2].union(counters[3])
    tmp = elem1.union(elem2).rmepsilon().arcsort()
    # tmp = fst.determinize(tmp).minimize()
    ngramCounter = fst.arcmap(tmp, map_type='to_log64', delta=0.0000001)
    ngramCounter.write("ngramCounter.fst")
    return ngramCounter
def linear_fst(
    elements: List[str],
    automata_op: fst.Fst,
    keep_isymbols: bool = True,
    **kwargs: Mapping[Any, Any],
) -> fst.Fst:
    """Produce a linear automata."""
    assert len(elements) > 0, "No elements"
    compiler = fst.Compiler(
        isymbols=automata_op.input_symbols().copy(),
        acceptor=keep_isymbols,
        keep_isymbols=keep_isymbols,
        **kwargs,
    )
    num_elements = 0
    for i, el in enumerate(elements):
        print("{} {} {}".format(i, i + 1, el), file=compiler)
        num_elements += 1
    print(str(num_elements), file=compiler)
    return compiler.compile()
def alternatives(sequence):
    # sequence is a list of words
    # produces the n_best alternative to sequence made of sub-units that are in words
    # Build FST
    compiler_sequence = fst.Compiler(isymbols=printable_ST, osymbols=printable_ST,
                                     keep_isymbols=True, keep_osymbols=True)
    c = 0
    for word in sequence:
        for char in word:
            print >> compiler_sequence, str(c) + ' ' + str(c + 1) + ' ' + char + ' ' + char
            c = c + 1
        print >> compiler_sequence, str(c) + ' ' + str(c + 1) + ' </w> </w>'
        c = c + 1
    print >> compiler_sequence, str(c)
    fst_sequence = compiler_sequence.compile()
    fst_sequence = fst_sequence.set_input_symbols(printable_ST)
    fst_sequence = fst_sequence.set_output_symbols(printable_ST)
    composition = fst.compose(fst_vocab, fst.compose(grapheme_confusion, fst_sequence)).rmepsilon().arcsort()
    # composition.prune(weight = 3)
    alters = printstrings(composition, nshortest=n_best, syms=printable_ST, weight=True)
    scores = []
    if alters:
        print alters
        scores = [float(alt[1]) for alt in alters]
        alters = [alt[0].split(' </w>')[:-1] for alt in alters]
        alters = [[''.join(word.split(' ')) for word in alt] for alt in alters]
    return alters, scores
def make_t_lattice_SIMP2(dist, ind):
    '''
    version 2 -- don't go via strings
    '''
    #print 'target indexes'
    #print ind
    lines = []  # arc lines in AT&T text format: src dest ilabel olabel weight
    start = 0
    frames, cands = np.shape(dist)
    #frames = 3
    for i in range(frames):
        end = start + 1
        for j in range(cands):
            frame_ix = ind[i, j]
            weight = dist[i, j]
            lines.append('%s %s %s %s %s' % (start, end, frame_ix + 1, frame_ix + 1, weight))
        start = end
    lines.append('%s' % (end))
    compiler = openfst.Compiler()
    for line in lines:
        print >> compiler, line
        #compiler.write(line)  ## TODO test
    f = compiler.compile()
    f.arcsort(sort_type="olabel")
    return f
def write_hypos(self, all_hypos, sen_indices):
    """Writes FST files with standard arcs for each sentence in
    ``all_hypos``. The created lattices are not optimized in any way:
    We create a distinct path for each entry in ``all_hypos``. We
    advise you to determinize/minimize them if you are planning to use
    them for further processing.

    Args:
        all_hypos (list): list of nbest lists of hypotheses
        sen_indices (list): List of sentence indices (0-indexed)

    Raises:
        OSError. If the directory could not be created
        IOError. If something goes wrong while writing to the disk
    """
    _mkdir(self.path, "FST")
    for fst_idx, hypos in zip(sen_indices, all_hypos):
        fst_idx += 1
        c = fst.Compiler()
        # state ID 0 is start, 1 is final state
        next_free_id = 2
        for hypo in hypos:
            # Connect with start node
            c.write("0\t%d\t%d\t%d\t%f\n" % (next_free_id, utils.GO_ID, utils.GO_ID, -hypo.total_score))
            next_free_id += 1
            for sym in hypo.trgt_sentence:
                c.write("%d\t%d\t%d\t%d\n" % (next_free_id - 1, next_free_id, sym, sym))
                next_free_id += 1
            # Connect with final node
            c.write("%d\t1\t%d\t%d\n" % (next_free_id - 1, utils.EOS_ID, utils.EOS_ID))
        c.write("1\n")
        f = c.compile()
        f.write(self.file_pattern % fst_idx)
def process_line(line):
    global isym
    global osym
    global tm
    global lm
    # Read input
    compiler = fst.Compiler()
    arr = line.strip().split() + ["</s>"]
    unks = []
    for i, x in enumerate(arr):
        if x not in isym:
            unks.append(x)
        xsym = isym[x] if x in isym else isym["<unk>"]
        print >> compiler, "%d %d %s %s" % (i, i + 1, xsym, xsym)
    print >> compiler, "%s" % (len(arr))
    ifst = compiler.compile()
    # Create the search graph and do search
    graph = fst.compose(ifst, tm)
    graph = fst.compose(graph, lm)
    graph = fst.shortestpath(graph)
    # Read off the output
    out = []
    unkspot = 0
    for state in graph.states():
        for arc in graph.arcs(state):
            if arc.olabel != 0:
                tok = osym[arc.olabel]
                # unk substitution (original words in same order)
                if unkspot < len(unks) and tok == "<unk>":
                    out.append(unks[unkspot])
                    unkspot += 1
                else:
                    out.append(tok)
    return " ".join(reversed(out[1:]))
c.write("%d\t%d\t%d\t%d\t0,1,%f\n" % ( out_root, arc.nextstate, arc.ilabel, arc.olabel, w2f(arc.weight))) dfs(arc.nextstate, hist + [str(arc.ilabel)]) idx = 0 while True: idx += 1 input_path = get_path(args.input, idx) if not input_path or not os.path.isfile(input_path): break lat = fst.Fst.read(input_path) c = fst.Compiler(arc_type="tropicalsparsetuple") hist2node = {} visited = {} dfs(lat.start(), []) # Add context states next_context_id = 20000000 for key,cluster in hist2node.iteritems(): hist_len = len(key.split()) if len(cluster) < 2: continue # We don't need this context elif len(cluster) == 2: # Directly connect both nodes c.write("%d\t%d\t%d\t%d\t0,%d,1.0\n" % ( cluster[0], out_state(cluster[1]), 0, 0, hist_len+1)) c.write("%d\t%d\t%d\t%d\t0,%d,1.0\n" % ( cluster[1], out_state(cluster[0]), 0, 0, hist_len+1)) else: # Introduce a context node
from fststr import fststr
import pywrapfst as fst

# Init FST
st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB)
compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True)
fst_file = open('e-insertion.txt').read()
print(fst_file, file=compiler)
c = compiler.compile()
fststr.expand_other_symbols(c)

# Test FST
test_in = 'fox<^>s<#>'
print("input:", test_in)
print("output:", fststr.apply(test_in, c))
def _get_basic_tag_fst(self, text_arr): # create an fst from text_arr, extracting pos-tags where applicable # only use pos-tags where found, store the words for reconstruction of the utterance #self.current_oov_queue = queue.Queue() self.oov_dict = {} self.replacement_dict = {} compiler = fst.Compiler() state_counter = 0 next_state = 0 for i, arr in enumerate(text_arr): if not arr: continue # single word, single expansion if len(arr) == 1: for w in arr: if '_' in w: wrd = w[:w.index('_')] w = w[w.index('_') + 1:] if i in self.replacement_dict: d = self.replacement_dict[i] d[w] = wrd else: self.replacement_dict[i] = {} d = self.replacement_dict[i] d[w] = wrd #elif w == 'og': # #TODO: better solution ... tag in grammar # wrd = 'og' # w = 'c' # if i in self.replacement_dict: # d = self.replacement_dict[i] # d[w] = wrd # else: # self.replacement_dict[i] = {} # d = self.replacement_dict[i] # d[w] = wrd int_val = self._get_int_value_word(w, i) from_state = state_counter if next_state != 0: to_state = next_state state_counter = next_state - 1 next_state = 0 else: to_state = state_counter + 1 self._compile_entry(compiler, from_state, int_val, to_state) state_counter += 1 # multiple verbalization possibilities # we are working char by char, so store the state_counter, such that # all possibilities have the same from_state (starting state) else: from_state = state_counter to_state = state_counter + 1 for w in arr: if '_' in w: wrd = w[:w.index('_')] w = w[w.index('_') + 1:] if i in self.replacement_dict: d = self.replacement_dict[i] d[w] = wrd else: self.replacement_dict[i] = {} d = self.replacement_dict[i] d[w] = wrd int_val = self._get_int_value_word(w, i) self._compile_entry(compiler, from_state, int_val, to_state) state_counter += 1 next_state = to_state + 1 state_counter = to_state compiler.write("{}\n\n".format(state_counter)) input_fst = compiler.compile() return input_fst
def buildpostProcessFST(self, input_str): # initialize a FSTpost st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) initposts = '0\n' print(initposts, file=compiler) initFSTpost = compiler.compile() fststr.expand_other_symbols(initFSTpost) # read post FST txt files post_files = [ filename for filename in os.listdir('.') if filename.startswith("FST_post_") ] # print(post_files) # compile txt files into FST, and union them into initFSTpost for f in post_files: compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) post = open(f).read() print(post, file=compiler) post_FST = compiler.compile() fststr.expand_other_symbols(post_FST) initFSTpost = initFSTpost.union(post_FST) #print("checkpoint: ", fststr.apply(input_str, initFSTpost), '\n') # Run indivdual FST file, for debugging purposes: # compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) # post = open('FST_post_withsign.txt').read() # print(post, file=compiler) # post_FST = compiler.compile() # fststr.expand_other_symbols(post_FST) # initFSTpost = initFSTpost.union(post_FST) # FST that take care of input is original form s = '' # loop through the character parts of the input tracker = 0 for i in range(len(input_str)): if (input_str[i] == '+'): s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1, tracker + 1) tracker += 1 break else: s += '{} {} {} {}\n'.format(tracker, tracker + 1, input_str[tracker], input_str[tracker]) tracker += 1 # take care of <#> in the end, change it to +Guess s += '{} {} <#> +Guess\n{}\n'.format(tracker, tracker + 1, tracker + 1) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) print(s, file=compiler) original_case_FST = compiler.compile() fststr.expand_other_symbols(original_case_FST) initFSTpost = initFSTpost.union(original_case_FST) # # Last FST, clear out any word ends with <#>, output words ends with +Guess and +Known compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) clear = open('FST_finalclearance.txt').read() print(clear, file=compiler) clear_FST = compiler.compile() fststr.expand_other_symbols(clear_FST) lastFST = fst.compose(initFSTpost.arcsort(sort_type="olabel"), clear_FST.arcsort(sort_type="ilabel")) return lastFST
def __init__(self):
    self._compiler = pywrapfst.Compiler()
def get_in_vocab_fst(self): alphabet = fststr.EN_SYMB st = fststr.symbols_table_from_alphabet( alphabet) # <class 'pywrapfst.SymbolTable'> compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) in_vocab_fst = compiler.compile() lemma = [] verb_in_form = [] with open('in_vocab_dictionary_verbs.txt', 'r') as f: for line in f.readlines(): lemma.append(line.split(',')[0]) verb_in_form.append(line.split(',')[1]) for idx in range(len(lemma)): compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) compiler.write('0 0 <other> <other>\n') compiler.write('0 1 <#> <epsilon>\n') compiler.write('1\n') a_fst = compiler.compile() fststr.expand_other_symbols(a_fst) lemma_word = lemma[idx] lemma_word_length = len(lemma_word) form_word = verb_in_form[idx] form_word_length = len(form_word) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) if form_word_length >= lemma_word_length: for i in range(form_word_length): if i < lemma_word_length: compiler.write( str(i) + ' ' + str(i + 1) + ' ' + form_word[i] + ' ' + lemma_word[i] + '\n') else: compiler.write( str(i) + ' ' + str(i + 1) + ' ' + form_word[i] + ' <epsilon>\n') compiler.write( str(form_word_length) + ' ' + str(form_word_length + 1) + ' <epsilon>' + ' +Known\n') compiler.write(str(form_word_length + 1)) b_fst = compiler.compile() fststr.expand_other_symbols(b_fst) c_fst = fst.compose(a_fst.arcsort(sort_type="olabel"), b_fst.arcsort(sort_type="ilabel")) in_vocab_fst.union(c_fst) else: for i in range(lemma_word_length): if i < form_word_length: compiler.write( str(i) + ' ' + str(i + 1) + ' ' + form_word[i] + ' ' + lemma_word[i] + '\n') else: compiler.write( str(i) + ' ' + str(i + 1) + ' <epsilon>' + ' ' + lemma_word[i] + '\n') compiler.write( str(lemma_word_length) + ' ' + str(lemma_word_length + 1) + ' <epsilon> +Known\n') compiler.write(str(lemma_word_length + 1)) b_fst = compiler.compile() fststr.expand_other_symbols(b_fst) c_fst = fst.compose(a_fst.arcsort(sort_type="olabel"), b_fst.arcsort(sort_type="ilabel")) in_vocab_fst.union(c_fst) return in_vocab_fst
def generateFst(self, data, st): lines = [] lineLst = data.split("\n") count = 0 for line in lineLst: curFst = "" stemNinf = line.split(",")[:2] curFst = "0\n" # 0 as final state curFst += "0 0 <other> <other>\n" stem = stemNinf[0] if stem == "": return rootFst #print("stem: %s",stem) #if len(stemNinf)>1: inf = stemNinf[1] #print("inf: %s",inf) for i in range(len(stem)): curFst += str(i) curFst += " " curFst += str(i + 1) curFst += " " if i >= len(inf): curFst += "<epsilon>" else: curFst += inf[i] curFst += " " curFst += stem[i] curFst += "\n" infLen = len(inf) stemLen = len(stem) index = stemLen if stemLen > infLen: continue else: toBeReplaced = inf[stemLen:] for i, s in enumerate(toBeReplaced): index = i + stemLen curFst += str(index) curFst += " " curFst += str(index + 1) curFst += " " curFst += s curFst += " " curFst += "<epsilon>" curFst += "\n" curFst += str(index + 1) curFst += " " curFst += "0" curFst += " " curFst += "<#>" curFst += " " curFst += "+Known" compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) #print("curFst",curFst) compiler.write(curFst) other = compiler.compile() fststr.expand_other_symbols(other) if count == 0: rootFst = other else: rootFst = rootFst.union(other) count += 1 return rootFst
def get_morph_fst(name):
    syms = ofst.SymbolTable.read_text(MORPH_PATH + 'symbols.txt')
    fst = ofst.Fst.read(MORPH_PATH + name.lower() + '.fst')
    return fst, ofst.Compiler(isymbols=syms, osymbols=syms, acceptor=True), syms
def getAlloFST(self): compiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write('') AlloFST = compiler.compile() yRepl = '0 \n0 0 <other> <other> \n0 1 i <epsilon> \n1 2 e <epsilon> \n1 12 <^> <epsilon> \n1 0 <epsilon> i \n2 3 <^> <epsilon> \n2 9 <epsilon> i \n3 4 s <epsilon> \n3 10 <epsilon> i \n4 5 <#> y \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n12 19 <epsilon> i \n13 14 d <epsilon> \n13 20 <epsilon> i \n14 15 <#> y \n15 16 <epsilon> <^> \n16 17 <epsilon> e \n17 18 <epsilon> d \n18 8 <epsilon> <^> \n19 0 <epsilon> <^> \n20 21 <epsilon> <^> \n21 0 <epsilon> e' kIns = '\n0 \n0 0 <other> <other> \n0 1 c c \n1 0 <other> <other> \n1 2 k <epsilon> \n2 0 <epsilon> k \n2 3 <^> <epsilon> \n3 4 i <epsilon> \n 3 10 e <epsilon> \n3 14 <epsilon> k \n4 5 n <^> \n5 6 g i \n6 7 <#> n \n7 8 <epsilon> g \n8 9 <epsilon> <#> \n9 \n10 11 d <^> \n11 12 <#> e \n12 8 <epsilon> d \n14 0 <epsilon> <^>' eDel = '0 \n0 0 <other> <other> \n0 1 <^> <epsilon> \n1 2 i <epsilon> \n1 11 e <epsilon> \n2 3 n <epsilon> \n3 4 g <epsilon> \n4 5 <#> e \n5 6 <epsilon> <^> \n6 7 <epsilon> i \n7 8 <epsilon> n \n8 9 <epsilon> g \n9 10 <epsilon> <#> \n10 \n11 12 d <epsilon> \n11 16 <epsilon> e \n12 13 <#> e \n13 14 <epsilon> <^> \n14 15 <epsilon> e \n15 9 <epsilon> d \n16 0 <epsilon> <^>' eInsch = '0 \n0 0 <other> <other> \n0 1 c <epsilon> \n1 2 h <epsilon> \n1 0 <epsilon> c \n2 3 e <epsilon> \n2 11 <epsilon> c \n3 4 <^> <epsilon> ' eInsch += '\n3 12 <epsilon> c \n4 5 s <epsilon> \n4 12 <epsilon> c \n5 6 <#> c \n6 7 <epsilon> h \n7 8 <epsilon> <^> \n8 9 <epsilon> s \n9 10 <epsilon> <#> \n10 \n10 10 <other> <other> \n11 0 <epsilon> h ' eInsch += '\n12 13 <epsilon> h \n13 0 <epsilon> e \n14 15 <epsilon> h \n15 16 <epsilon> e \n16 0 <epsilon> <^>' eInss = '\n0 \n0 0 <other> <other> \n0 1 s <epsilon> \n1 2 e <epsilon> \n1 12 h <epsilon> \n1 0 <epsilon> s \n2 3 <^> <epsilon> \n2 9 <epsilon> s \n3 4 s <epsilon> \n3 10 <epsilon> s \n4 5 <#> s \n5 6 <epsilon> <^> \n6 7 <epsilon> s \n7 8 <epsilon> <#> \n8 \n8 8 <other> <other> \n9 0 <epsilon> e \n10 11 <epsilon> e \n11 0 <epsilon> <^> \n12 13 e <epsilon> \n 12 20 <epsilon> s \n13 14 <^> <epsilon> \n13 21 <epsilon> s \n14 15 s <epsilon> \n14 23 <epsilon> s \n15 16 <#> s \n16 17 <epsilon> h \n17 18 <epsilon> <^> \n18 19 <epsilon> s \n19 8 <epsilon> <#> \n20 0 <epsilon> h \n21 22 <epsilon> h \n22 0 <epsilon> e \n23 24 <epsilon> h \n24 25 <epsilon> e \n25 0 <epsilon> <^>' xz = [*'xz'] eInsxz = '\n0 \n0 0 <other> <other> ' for i in range(len(xz)): c = xz[i] n = 11 * i eInsxz += '\n0 ' + str( n + 1) + ' ' + c + ' <epsilon> ' + '\n' + str( n + 1) + ' ' + str(n + 2) + ' e <epsilon> \n' + str( n + 1) + ' 0 <epsilon> ' + c + '\n' + str(n + 2) eInsxz += ' ' + str(n + 3) + ' <^> <epsilon> \n' + str( n + 2) + ' ' + str(n + 9) + ' <epsilon> ' + c + ' \n' + str( n + 3) + ' ' + str(n + 4) + ' s <epsilon> \n' + str(n + 4) eInsxz += ' ' + str(n + 5) + ' <#> ' + c + ' \n' + str( n + 3) + ' ' + str(n + 10) + ' <epsilon> ' + c + ' \n' + str( n + 5) + ' ' + str(n + 6) + ' <epsilon> <^> \n' eInsxz += str(n + 6) + ' ' + str(n + 7) + ' <epsilon> s \n' + str( n + 7) + ' ' + str(n + 8) + ' <epsilon> <#> \n' + str( n + 8) + ' \n' + str(n + 8) + ' ' + str(n + 8) + ' <other> <other>' eInsxz += ' \n' + str(n + 9) + ' 0 <epsilon> e \n' + str( n + 10) + ' ' + str(n + 11) + ' <epsilon> e \n' + str( n + 11) + ' 0 <epsilon> <^>' compiler = 
fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write(eInsch) fstInsch = compiler.compile() fststr.expand_other_symbols(fstInsch) compiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write(eInss) fstInss = compiler.compile() fststr.expand_other_symbols(fstInss) compiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) compiler.write(eInsxz) fstInsxz = compiler.compile() fststr.expand_other_symbols(fstInsxz) fsteIns = fstInsch.union(fstInss.union(fstInsxz)) consonants = [*'bcdfghjklmnpqrstvwxz'] consDoub = '' consDoub += '\n0 \n0 1 a a \n0 1 e e \n0 1 i i \n0 1 o o \n0 1 u u \n0 1 y y \n0 0 <other> <other>' consDoub += '\n1 0 a a \n1 0 e e \n1 0 i i \n1 0 o o \n1 0 u u \n1 0 y y' consDoub += '\n2 \n2 2 <other> <other>' for i in range(len(consonants)): c = consonants[i] consDoub += '\n1 ' + str(8 * i + 3) + ' ' + c + ' ' + c consDoub += '\n' + str(8 * i + 3) + ' ' + str(8 * i + 4) + ' ' + c + ' <epsilon>' consDoub += '\n' + str(8 * i + 3) + ' 0 <other> <other>' consDoub += '\n' + str(8 * i + 4) + ' ' + str(8 * i + 5) + ' <^> <^>' consDoub += '\n' + str(8 * i + 5) + ' ' + str(8 * i + 6) + ' i i' consDoub += '\n' + str(8 * i + 5) + ' ' + str(8 * i + 9) + ' e e' consDoub += '\n' + str(8 * i + 5) + ' 0 <other> <other>' consDoub += '\n' + str(8 * i + 6) + ' ' + str(8 * i + 7) + ' n n' consDoub += '\n' + str(8 * i + 7) + ' ' + str(8 * i + 8) + ' g g' consDoub += '\n' + str(8 * i + 8) + ' 2 <#> <#>' consDoub += '\n' + str(8 * i + 9) + ' ' + str(8 * i + 10) + ' d d' consDoub += '\n' + str(8 * i + 9) + ' 0 <other> <other>' consDoub += '\n' + str(8 * i + 10) + ' 2 <#> <#>' ycompiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) ycompiler.write(yRepl) yReplNew = ycompiler.compile() fststr.expand_other_symbols(yReplNew) kcompiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) kcompiler.write(kIns) kInsNew = kcompiler.compile() fststr.expand_other_symbols(kInsNew) edcompiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) edcompiler.write(eDel) eDelNew = edcompiler.compile() fststr.expand_other_symbols(eDelNew) cdcompiler = fst.Compiler(isymbols=self.st, osymbols=self.st, keep_isymbols=True, keep_osymbols=True) cdcompiler.write(consDoub) consDoubNew = cdcompiler.compile() fststr.expand_other_symbols(consDoubNew) AlloFST.union( yReplNew.union( kInsNew.union(fsteIns.union(eDelNew.union(consDoubNew))))) fststr.expand_other_symbols(AlloFST) return AlloFST
    offset = 10000  # Something larger than the longest sequence
    offset2 = 2 * offset
    for idx, t in enumerate(terminals):
        state_id = idx + 2
        fst_arc(c, state_id - 1, state_id, t)
        fst_arc(c, state_id - 1 + offset2, state_id, t)
        for nt in closing_non_terminals:
            # Self loop
            fst_arc(c, state_id, state_id + offset, nt)
            fst_arc(c, state_id + offset, state_id + offset, nt)
        if idx < len(terminals) - 1:  # No opening at last position
            for nt in opening_non_terminals:
                fst_arc(c, state_id + offset, state_id + offset2, nt)
                fst_arc(c, state_id + offset2, state_id + offset2, nt)
    fst_arc(c, state_id, state_id + 1, args.eos_id)
    fst_arc(c, state_id + offset, state_id + 1, args.eos_id)
    c.write("%d\n" % (state_id + 1,))


for line_idx, line in enumerate(sys.stdin):
    terminals = [int(i) for i in line.strip().split()]
    c = fst.Compiler()  # Debug with sys.stdout rather than c
    if args.format == 'layerbylayer':
        construct_layerbylayer_fst(c, non_terminals, terminals)
    elif args.format == 'layerbylayer_pop':
        construct_layerbylayer_fst(c, non_terminals, terminals, pop_id)
    else:  # flat_*
        construct_flat_fst(c, closing_non_terminals, opening_non_terminals, terminals)
    f = c.compile()
    f.write("%s/%d.fst" % (args.output_dir, line_idx + 1))
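# The fragment above calls a helper fst_arc(...) that is not shown. A plausible
# minimal definition (an assumption, not taken from the original script) would
# just write one acceptor arc in AT&T text format to the compiler:
def fst_arc(c, from_state, to_state, label):
    c.write("%d %d %d %d\n" % (from_state, to_state, label, label))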
# Rename sil by <eps> in those tables
index['<eps>'] = index['sil']
del index['sil']

# Create the symbol table
printable_ST = fst.SymbolTable()
printable_ST.add_symbol('<eps>')
for c in index.keys():
    if c != '<eps>':
        printable_ST.add_symbol(c)
# save the symbol table
printable_ST.write_text('FSTs/symbol_table.txt')

# Build a sigma FST: accepting any and outputting epsilon
compiler = fst.Compiler(isymbols=printable_ST, osymbols=printable_ST,
                        keep_isymbols=True, keep_osymbols=True)
for c in index.keys():
    print >> compiler, '0 0 %s <eps>' % c
print >> compiler, '0'
compiler.compile().write('FSTs/sigma.fst')

# Build an FST which corrects the errors: maps <estim> to <truth> with
# probability confusion_matrix[<truth>][<estim>]
compiler = fst.Compiler(isymbols=printable_ST, osymbols=printable_ST,
                        keep_isymbols=True, keep_osymbols=True)
for truth in index.keys():  #[0:4]:
    for estim in index.keys():  #[0:4]:
        score = confusion_matrix[index[truth]][index[estim]]
        # score = confusion_matrix[index[estim]][index[truth]]
class Lemmatizer: def e_insertion(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) fst_file = open('e-insertion.txt').read() print(fst_file, file=compiler) c = compiler.compile() fststr.expand_other_symbols(c) return c def k_insertion(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) fst_file = open('k-insertion.txt').read() print(fst_file, file=compiler) c = compiler.compile() fststr.expand_other_symbols(c) return c def get_morphotactics(): suffix = ['', 's', 'ed', 'en', 'ing'] st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) c = compiler.compile() for s in suffix: compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) compiler.write('0 0 <other> <other>\n') compiler.write('0 1 +Guess <^>\n') l = len(s) for i in range(l): compiler.write(str(i+1) + ' ' + str(i+2) + ' <epsilon> ' + s[i] + '\n') compiler.write(str(l+1) + ' ' + str(l+2) + ' <epsilon> <#>\n') compiler.write(str(l+2)) suffix_rule = compiler.compile() c = c.union(suffix_rule) fststr.expand_other_symbols(c) return c def general(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) compiler.write('0 0 <other> <other>\n') compiler.write('0 0 <#> <#>\n') compiler.write('0\n') # for special cases # compiler.write('0 1 i i\n') # compiler.write('1 2 n n\n') # compiler.write('2 3 g g\n') # compiler.write('3 4 <^> <^>\n') compiler.write('0 1 <^> <epsilon>\n') compiler.write('1 0 <other> <other>\n') compiler.write('1 0 <#> <#>\n') c = compiler.compile() fststr.expand_other_symbols(c) return c def e_deletion(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) fst_file = open('silent-e-deletion.txt').read() print(fst_file, file=compiler) c = compiler.compile() fststr.expand_other_symbols(c) return c def ch_sh_e_insertion(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) fst_file = open('ch_sh_e_insertion.txt').read() print(fst_file, file=compiler) c = compiler.compile() fststr.expand_other_symbols(c) return c def y_replacement(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) fst_file = open('y_replacement.txt').read() print(fst_file, file=compiler) c = compiler.compile() fststr.expand_other_symbols(c) return c def del_sharp(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) compiler.write('0\n') compiler.write('0 0 <other> <other>\n') compiler.write('0 1 <#> <epsilon>\n') compiler.write('1\n') c = compiler.compile() fststr.expand_other_symbols(c) return c def consonant_doubling(): st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) consonant = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z'] vowel = ['a', 'e', 'i', 'o', 'u'] compiler.write('0\n') compiler.write('0 0 <other> <other>\n') 
compiler.write('0 0 <#> <#>\n') for v in vowel: compiler.write('0 2 ' + v + ' ' + v + '\n') for c in consonant: compiler.write('0 1 ' + c + ' ' + c + '\n') compiler.write('1 1 ' + c + ' ' + c + '\n') for v in vowel: compiler.write('1 2 ' + v + ' ' + v + '\n') compiler.write('2 2 i i\n') compiler.write('2 2 u u\n') for i in range(len(consonant)): compiler.write('2 ' + str(i+3) + ' ' + consonant[i] + ' ' + consonant[i] + '\n') compiler.write(str(i+3) + ' ' + str(len(consonant)+3) + ' ' + '<^>' + ' ' + consonant[i] + '\n') compiler.write(str(i+3) + ' ' + str(len(consonant)+6) + ' ' + '<^>' + ' ' + consonant[i] + '\n') for c in consonant: compiler.write(str(i+3) + ' 1 ' + c + ' ' + c + '\n') for v in vowel: compiler.write(str(i+3) + ' 2 ' + v + ' ' + v + '\n') compiler.write(str(len(consonant)+3) + ' ' + str(len(consonant)+4) + ' e e' + '\n') compiler.write(str(len(consonant)+4) + ' ' + str(len(consonant)+5) + ' d d' + '\n') compiler.write(str(len(consonant)+5) + ' 0 <#> <#>' + '\n') compiler.write(str(len(consonant)+6) + ' ' + str(len(consonant)+7) + ' i i' + '\n') compiler.write(str(len(consonant)+7) + ' ' + str(len(consonant)+8) + ' n n' + '\n') compiler.write(str(len(consonant)+8) + ' ' + str(len(consonant)+5) + ' g g' + '\n') c = compiler.compile() fststr.expand_other_symbols(c) return c st = fststr.symbols_table_from_alphabet(fststr.EN_SYMB) lemma = [] allomorphy = [] with open("in_vocab_dictionary_verbs.txt", "r") as f: for line in f.readlines(): lemma.append(line.split(',')[0]) allomorphy.append(line.split(',')[1]) compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) rule = compiler.compile() for index in range(len(lemma)): compiler = fst.Compiler(isymbols=st, osymbols=st, keep_isymbols=True, keep_osymbols=True) if len(allomorphy[index]) >= len(lemma[index]): for i in range(len(allomorphy[index])): if i < len(lemma[index]): compiler.write(str(i) + ' ' + str(i+1) + ' ' + allomorphy[index][i] + ' ' + lemma[index][i] + '\n') else: compiler.write(str(i) + ' ' + str(i+1) + ' ' + allomorphy[index][i] + ' <epsilon>' + '\n') l = len(allomorphy[index]) compiler.write(str(l) + ' ' + str(l+1) + ' <epsilon>' + ' +Known\n') compiler.write(str(l+1)) rule.union(compiler.compile()) else: for i in range(len(lemma[index])): if i < len(allomorphy[index]): compiler.write(str(i) + ' ' + str(i+1) + ' ' + allomorphy[index][i] + ' ' + lemma[index][i] + '\n') else: compiler.write(str(i) + ' ' + str(i+1) + ' <epsilon>' + ' ' + lemma[index][i] + '\n') l = len(lemma[index]) compiler.write(str(l) + ' ' + str(l+1) + ' <epsilon>' + ' +Known\n') compiler.write(str(l+1)) rule.union(compiler.compile()) de_iv_rule = rule.copy().invert() morphotactics_rule = get_morphotactics() e_insertion_rule = e_insertion() k_insertion_rule = k_insertion() e_deletion_rule = e_deletion() general_rule = general() ch_sh_e_insertion_rule = ch_sh_e_insertion() y_replacement_rule = y_replacement() del_sharp_rule = del_sharp() consonant_doubling_rule = consonant_doubling() new_rule = k_insertion_rule.union(e_insertion_rule).union(general_rule).union(e_deletion_rule).union(ch_sh_e_insertion_rule).union(y_replacement_rule).union(consonant_doubling_rule) de_oov = fst.compose(morphotactics_rule.arcsort(sort_type="olabel"), new_rule.arcsort(sort_type="ilabel")) de_oov_rule = fst.compose(de_oov.arcsort(sort_type="olabel"), del_sharp_rule.arcsort(sort_type="ilabel")) oov_rule = de_oov_rule.copy().invert() # The final rules for lemmatizer rule = rule.union(oov_rule) # The final rules for delemmatizer de_rule = 
de_iv_rule.union(de_oov_rule) def lemmatize(self, in_str): out_set = set() for i in fststr.apply(in_str, self.rule): out_set.add(i) if in_str[-3:] == 'ing' or in_str[-2:] == 'ed' or in_str[-2:] == 'en' or (in_str[-1] == 's' and in_str[-2] != 's'): out_set.remove(in_str+'+Guess') return out_set def delemmatize(self, in_str): out_set = set() for i in fststr.apply(in_str, self.de_rule): out_set.add(i) return out_set
def get_fst(name):
    syms = ofst.SymbolTable.read_text(FST_PATH + name + '/symbols.txt')
    fst = ofst.Fst.read(FST_PATH + name + '/' + name.lower() + '.fst')
    return fst, ofst.Compiler(isymbols=syms, osymbols=syms, acceptor=True), syms
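# A minimal usage sketch (hypothetical, not part of the loader above) for the
# (fst, compiler, syms) triple returned by get_fst: compile a linear acceptor
# for an input string with the shared acceptor compiler and compose it with the
# loaded FST. Assumes every symbol of the input is present in syms.
def analyze(word, morph_fst, compiler, syms):
    for i, ch in enumerate(word):
        print("{} {} {}".format(i, i + 1, ch), file=compiler)
    print(str(len(word)), file=compiler)   # final state
    acceptor = compiler.compile()
    acceptor.arcsort(sort_type="olabel")   # compose wants label-sorted input
    return ofst.compose(acceptor, morph_fst)

# morph_fst, compiler, syms = get_fst("noun")   # hypothetical FST name
# lattice = analyze("dogs", morph_fst, compiler, syms)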
def create_spelling_fst(word_table, alphabet_table, repeat_char, self_transition_prob): assert isinstance(word_table, openfst.SymbolTable) assert isinstance(alphabet_table, openfst.SymbolTable) assert self_transition_prob > 0. and self_transition_prob < 1. WORD_EPSILON_STR = word_table.Find(EPSILON_INT) ALPHABET_EPSILON_STR = alphabet_table.Find(EPSILON_INT) next_transition_prob = 1. - self_transition_prob self_transition_cost = -math.log(self_transition_prob) next_transition_cost = -math.log(next_transition_prob) compiler = openfst.Compiler(fst_type="const", arc_type="log", isymbols=alphabet_table, osymbols=word_table, keep_isymbols=False, keep_osymbols=False) build_fst = partial(print, file=compiler) start_state_index = 0 final_state_index = 1 state_index = 2 edge_format_str = "{start} {end} {char} {word} {weight:.5f}" for word in openfst.SymbolTableIterator(word_table): if word not in word_table: raise ValueError("Word {0} not in vocabulary {1}".format( word, word_table)) for i, char in enumerate(word): if char not in alphabet_table: raise ValueError("Character {0} not in alphabet {1}".format( char, alphabet_table.name())) # Edge case: single-letter word? if i == 0: build_fst( edge_format_str.format(start=start_state_index, end=state_index if len(word) != 1 else final_state_index, char=char, word=WORD_EPSILON_STR, weight=next_transition_cost)) build_fst( edge_format_str.format(start=start_state_index, end=start_state_index, char=ALPHABET_EPSILON_STR, word=WORD_EPSILON_STR, weight=self_transition_cost)) else: # It is possible that this letter could be a repeat. Warning: # This doesn't check if a letter happens 3 times in a row, but # I don't know of any words in English that ever do that. last_char = word[i - 1] char = repeat_char if char == last_char else char if i == len(word) - 1: build_fst( edge_format_str.format(start=state_index, end=final_state_index, char=char, word=word, weight=next_transition_cost)) build_fst( edge_format_str.format(start=state_index, end=state_index, char=ALPHABET_EPSILON_STR, word=WORD_EPSILON_STR, weight=self_transition_cost)) else: build_fst( edge_format_str.format(start=state_index, end=state_index + 1, char=char, word=word, weight=next_transition_cost)) build_fst( edge_format_str.format(start=state_index, end=state_index, char=ALPHABET_EPSILON_STR, word=WORD_EPSILON_STR, weight=self_transition_cost)) state_index += 1 # Add final state build_fst(str(final_state_index)) S = compiler.compile().determinize() S.minimize() S = S.arcsort("olabel") return S