import fst


def fstbuild(words):
    trie = fst.Transducer()
    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms

    def bs(s):  # unused helper: look up a symbol's label id
        letter_syms = fst.read_symbols("ascii.syms.bin")
        return letter_syms[s]

    biggest = 0
    for w in words:
        p = 0
        c = 0
        # first character: emit epsilon, deferring the word label to the last arc
        trie.add_arc(p, biggest + 1, w[c], "<epsilon>", 0)
        p = biggest + 1
        c += 1
        # middle characters (assumes words of length >= 2)
        while c < len(w) - 1:
            trie.add_arc(p, p + 1, w[c], "<epsilon>", 0)
            p += 1
            c += 1
        # last character emits the whole word as its output label
        trie.add_arc(p, p + 1, w[c], w, 0)
        p += 1
        trie[p].final = True  # every word's end state is final
        biggest = max(p, biggest)
    det_trie = trie.determinize()
    det_trie.arc_sort_input()
    det_trie.remove_epsilon()
    return det_trie
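# A minimal usage sketch for fstbuild above, assuming pyfst is installed and
# "ascii.syms.bin" is present in the working directory; the word list is
# hypothetical. paths() and osyms.find() follow the pyfst API used elsewhere
# in these snippets.
import fst

det_trie = fstbuild(["cat", "car", "cart"])
for path in det_trie.paths():
    # the non-epsilon output labels spell out the matched word
    word = [det_trie.osyms.find(arc.olabel) for arc in path if arc.olabel != 0]
    print word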
import fst


def fstbuild(words):
    # variant that keeps per-character output labels instead of epsilons
    trie = fst.Transducer()
    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms

    def bs(s):  # unused helper: look up a symbol's label id
        letter_syms = fst.read_symbols("ascii.syms.bin")
        return letter_syms[s]

    biggest = 0
    for w in words:
        p = 0
        c = 0
        trie.add_arc(p, biggest + 1, w[c], w[c], 0)
        p = biggest + 1
        c += 1
        while c < len(w) - 1:
            trie.add_arc(p, p + 1, w[c], w[c], 0)
            p += 1
            c += 1
        # last character emits the whole word as its output label
        trie.add_arc(p, p + 1, w[c], w, 0)
        p += 1
        trie[p].final = True  # every word's end state is final
        biggest = max(p, biggest)
    det_trie = trie.determinize()
    det_trie.arc_sort_input()
    return det_trie
import fst


def levenshtein(w, editdst):
    wts = keyweights()  # external helper: per-key substitution/insertion weights
    trie = fst.Transducer()
    letter_syms = fst.read_symbols("ascii.syms.bin")
    trie.isyms = letter_syms
    trie.osyms = letter_syms
    letters = [let[0] for let in letter_syms.items()]

    class StateCounter(object):
        """Maps state names to consecutive integer ids on first use."""

        def __init__(self):
            self.set = {}
            self.count = -1

        def __contains__(self, obj):
            return obj in self.set

        def __getitem__(self, obj):
            if obj not in self.set:
                self.count += 1
                self.set[obj] = self.count
            return self.set[obj]

    states = StateCounter()
    for x in range(0, len(w)):
        for y in range(0, editdst + 1):
            # match: consume w[x] at no cost
            trie.add_arc(states[str(x) + "^" + str(y)],
                         states[str(x + 1) + "^" + str(y)], w[x], w[x], 0)
            if y != editdst:
                # deletion: skip w[x] without consuming input
                trie.add_arc(states[str(x) + "^" + str(y)],
                             states[str(x + 1) + "^" + str(y + 1)],
                             "<epsilon>", "<epsilon>", 1.5)
                for i in letters:
                    # substitution: consume i in place of w[x]
                    trie.add_arc(states[str(x) + "^" + str(y)],
                                 states[str(x + 1) + "^" + str(y + 1)], i, i, wts[w[x], i])
                    # insertion: consume i without advancing in w
                    trie.add_arc(states[str(x) + "^" + str(y)],
                                 states[str(x) + "^" + str(y + 1)], i, i, wts[w[x], i])
    for y in range(0, editdst + 1):
        trie[states[str(len(w)) + "^" + str(y)]].final = True
    trie.remove_epsilon()
    trie.arc_sort_input()
    return trie
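# Sketch of fuzzy dictionary lookup with the transducer above: compose the
# edit-distance machine with a word trie and take the cheapest path. The word
# list is hypothetical, keyweights() must be available, and >> and
# shortest_path are the pyfst composition and search operations.
lev = levenshtein("cart", 1)
trie = fstbuild(["cat", "cart", "chart"])
matches = lev >> trie           # words reachable within one edit of "cart"
best = matches.shortest_path()  # keep only the cheapest correction(s)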
import os
from collections import defaultdict

import fst


def _prepare_resource(self, dir_to_tagger, dir_to_phrase):
    '''Build lookup dictionaries from the tagger and constraint FSTs.

    Arguments:
        dir_to_tagger {string} -- directory holding the tagger FST plus isyms.fst/osyms.fst
        dir_to_phrase {string} -- directory holding the phrase/constraint FSTs plus isyms.fst/osyms.fst

    Returns:
        tagger_dict -- tagger_dict['${.concept}'] = string list, each string a path
        constraint_dict -- constraint_dict['${@constraint}'] = list of (string path, mapped value)
    '''
    # deal with entities (tagger)
    files = os.listdir(dir_to_tagger)
    isyms = fst.read_symbols(os.path.join(dir_to_tagger, 'isyms.fst'))
    osyms = fst.read_symbols(os.path.join(dir_to_tagger, 'osyms.fst'))
    filepath = os.path.join(dir_to_tagger,
                            [each for each in files
                             if each not in ['isyms.fst', 'osyms.fst'] and each.endswith('.fst')][0])
    lexicon = fst.read_std(filepath)
    lexicon.isyms = isyms
    lexicon.osyms = osyms
    self.tagger_dict = defaultdict(list)
    for each_path in lexicon.paths():
        input_string = [lexicon.isyms.find(arc.ilabel) for arc in each_path if arc.ilabel != 0]
        if len(input_string) != 1:
            raise ValueError('[Error]: error in resolving tagger name!')
        output_string = [lexicon.osyms.find(arc.olabel) for arc in each_path if arc.olabel != 0]
        # reverse_preproc is an external helper that undoes preprocessing
        self.tagger_dict[input_string[0]].append(reverse_preproc(output_string))
    # deal with constraints
    files = os.listdir(dir_to_phrase)
    isyms = fst.read_symbols(os.path.join(dir_to_phrase, 'isyms.fst'))
    osyms = fst.read_symbols(os.path.join(dir_to_phrase, 'osyms.fst'))
    fst_dict = {}
    for each in files:
        if each not in ['isyms.fst', 'osyms.fst'] and each.endswith('.fst'):
            fst_dict[each[0]] = fst.read_std(os.path.join(dir_to_phrase, each))
            fst_dict[each[0]].isyms = isyms
            fst_dict[each[0]].osyms = osyms
    self.constraint_dict = defaultdict(list)
    # hierarchical phrase FSTs are organized in 0-1-2-... order
    for each in sorted(fst_dict.keys()):
        tmp_fst = fst_dict[each]
        for path in tmp_fst.paths():
            name, item_list = self._get_path_and_mapped_value(path, tmp_fst)
            self.constraint_dict[name].extend(item_list)
    return (self.tagger_dict, self.constraint_dict)
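# Hypothetical call sketch: given directories laid out as the docstring
# describes (isyms.fst, osyms.fst, and one or more *.fst files each), the
# method fills and returns both lookup tables. The instance name and the
# paths below are invented.
tagger_dict, constraint_dict = parser._prepare_resource('resources/tagger',
                                                        'resources/phrase')
for concept in tagger_dict:
    print concept, tagger_dict[concept][:3]  # e.g. '${.city}' -> surface strings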
import json

import fst


def generate_suggestions(prefix):
    """
    To extract suggestions, first traverse the FST in fstfile following the
    characters of the given prefix. The state reached at the last letter of
    the prefix is saved, and the next step constructs an FST of the branch
    that grows from that state, using a BFS approach. All paths are then
    extracted from the acceptor in DFS order, with path weights computed
    along the way. Finally, the paths are sorted by weight and the first
    three are written out as JSON.

    INPUT: a string
    OUTPUT: a json file with up to three values for the Suggestions entry
    """
    fstfile = "/Users/dudy/CSLU/summerIntern/src/prfx_tree.fst"
    sym = fst.read_symbols("/Users/dudy/CSLU/summerIntern/src/syms")
    lm = fst.read(fstfile)
    prefix = prefix.lower()
    # follow the prefix characters to find its subtree
    stateid = 0
    for ch in prefix:
        state = lm[stateid]
        for arc in state.arcs:
            if sym.find(arc.ilabel) == ch:
                print ch
                stateid = arc.nextstate
                break
    # construct the desired subtree (BFS); bfs and dfs are defined elsewhere
    reduced = bfs(stateid, lm, sym)
    # read strings (DFS)
    top3 = dfs(reduced, sym)
    # take the first three (if they exist)
    suggest = []
    for (suffix, _) in top3:
        suggest.append(suffix)
    # dict it
    result = {}
    result["Suggestions:"] = suggest
    # json it
    json_file = "auto.json"
    with open(json_file, "w") as fp:
        json.dump(result, fp)
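# Usage sketch: generate_suggestions writes auto.json in the working
# directory; reading it back yields up to three completions. The prefix and
# expected output are illustrative, not from a real run, and the bfs/dfs
# helpers must be defined for the call to work.
import json

generate_suggestions("hel")
with open("auto.json") as fp:
    print json.load(fp)["Suggestions:"]  # up to three suffixes of "hel"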
import sys

import fst


def parseargs(a):
    try:
        # -fst handling inferred from the usage message below
        fst_path = a[a.index('-fst') + 1]
        #print fst_path
        # gather every token between -str and the next flag, since the usage
        # message allows multi-token strings with no quotes
        i = a.index('-str') + 1
        toks = []
        while i < len(a) and not a[i].startswith('-'):
            toks.append(a[i])
            i += 1
        s = ' '.join(toks).strip()
        #print s
        sym = a[a.index('-sym') + 1]
        #print sym
        return [s, fst_path, sym]
    except (ValueError, IndexError):
        sys.stderr.write('Usage: -fst [name of fst] -str '
                         '[string to encode as fst (no quotes, tokens separated by white space)] '
                         '-sym [binary symbol file]\n'
                         'e.g. -fst sentence.fst -str hello world -sym mysym.bin\n')
        exit()


def log_linear_chain(txt, sym_f):
    # map sentence-boundary placeholders to the symbols used in the table
    txt = txt.replace('__s__', '<s>')
    txt = txt.replace('_s_', '</s>')
    txt = txt.split()
    lc = fst.Transducer(sym_f, sym_f)
    for idx, t in enumerate(txt):
        lc.add_arc(idx, idx + 1, t, t, 0.0)
    lc[idx + 1].final = True
    return lc


if __name__ == '__main__':
    [s, fst_path, sym_path] = parseargs(sys.argv)
    sym = fst.read_symbols(sym_path)
    lc = log_linear_chain(s, sym)
    lc.write(fst_path, sym, sym)
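# Invocation sketch, following the script's own usage message (the script
# file name is assumed, not given in the snippet):
#
#   python linear_chain.py -fst sentence.fst -str hello world -sym mysym.bin
#
# This writes sentence.fst, a linear chain with one identity arc per token,
# ready to be composed with other machines over the same symbol table.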
import fst


def bs(s):
    # look up the integer label for symbol s in the ASCII symbol table
    letter_syms = fst.read_symbols("ascii.syms.bin")
    return letter_syms[s]
import re

import fst


def add_arc_pr(sym, lmfst, fid, tid, isy, osy, wt):
    lmfst.add_arc(fid, tid, isy, osy, wt)
    # disabled debug trace
    '''
    print 'added arc', fid, tid, sym[isy], sym[osy]
    if fid == 1961:
        count = 0
        for c, s in enumerate(lmfst.states):
            count = c
        print 'number of total states', count
    '''


if __name__ == '__main__':
    sym_e = fst.read_symbols('data/syme.bin')
    lm_txt = open('data/lm', 'r').read()
    # split the ARPA-format language model into its n-gram sections
    [bs, unigrams, bigrams, trigrams] = re.split('1-grams:|2-grams:|3-grams:', lm_txt)
    unigrams = re.split('\n+', unigrams)
    bigrams = re.split('\n+', bigrams)
    trigrams = re.split('\n+', trigrams)
    lm_id = {}
    lm_id[INITIAL] = len(lm_id)  # INITIAL and NULL are constants defined elsewhere
    lm_fst = fst.Transducer(sym_e, sym_e)
    lm_fst.add_state()
    lm_id[NULL] = len(lm_id)
    for uni_line in unigrams:
        if uni_line.strip() != '' and len(uni_line.split('\t')) > 1:
            [p, ng, bk] = trysplit(uni_line)
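# trysplit is not shown in this snippet. In the ARPA format each entry reads
# "log10-prob <tab> n-gram [<tab> backoff]", so a plausible sketch -- an
# assumption, not the original helper -- is:
def trysplit(line):
    parts = line.strip().split('\t')
    if len(parts) == 2:
        parts.append('')  # no backoff weight on this line
    return parts          # [prob, ngram, backoff]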
__author__ = 'arenduchintala'
import itertools

import fst

if __name__ == '__main__':
    tokens = open('data/input', 'r').read().split()
    tokens = set(tokens)
    symf = fst.read_symbols('data/symf.bin')
    reorder_list = []
    reorder = fst.Transducer(symf, symf)
    reorder[0].final = True
    for s, v in symf.items():
        reorder.add_arc(0, 0, s, s, 0.0)  # identity arc: keep the symbol in place
        # keep only symbols that share at least one token with the input
        st = set(s.split('_'))
        if len(st.intersection(tokens)) > 0:
            print 'keep', s
            reorder_list.append(s)
        else:
            print 'reject', s
    print 'filtered down to', len(set(reorder_list))
    n = 1
    c = 0
    for a, b in itertools.product(reorder_list, reorder_list):
        c += 1
        if c % 1000 == 0:
            print int(c / 1000), 'of', int((len(reorder_list) ** 2) / 1000)
        if a != b:
            # swap gadget: read a then b while emitting nothing, then emit b first
            reorder.add_arc(0, n, a, fst.EPSILON, 0.0)
            reorder.add_arc(n, n + 1, b, fst.EPSILON, 0.0)
            reorder.add_arc(n + 1, n + 2, fst.EPSILON, b, 0.0)
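# The snippet above is truncated mid-gadget. A self-contained toy version of
# the same swap construction (pyfst calls as above; the two symbols are
# invented) shows the intended effect, including the closing arc a complete
# gadget needs:
import fst

syms = fst.SymbolTable()
toy = fst.Transducer(syms, syms)
toy[0].final = True
toy.add_arc(0, 0, 'a', 'a', 0.0)          # keep symbols in place
toy.add_arc(0, 0, 'b', 'b', 0.0)
toy.add_arc(0, 1, 'a', fst.EPSILON, 0.0)  # read a, hold it
toy.add_arc(1, 2, 'b', fst.EPSILON, 0.0)  # read b, hold it
toy.add_arc(2, 3, fst.EPSILON, 'b', 0.0)  # emit b first
toy.add_arc(3, 0, fst.EPSILON, 'a', 0.0)  # then emit a: "a b" -> "b a"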