def __init__(self):
    """Load the old/new training and test corpora and create an empty FST.

    Each corpus file is read into a list of raw lines (line endings are
    kept, exactly as iterating over the file yields them).
    """
    def read_lines(path):
        # Use a context manager so the handle is closed right away instead
        # of being left for the garbage collector.
        with open(path) as handle:
            return list(handle)

    self.old_train = read_lines("data/train.old")
    self.new_train = read_lines("data/train.new")
    self.old_test = read_lines("data/test.old")
    self.new_test = read_lines("data/test.new")
    self.num_lines = len(self.old_test)  # used for the status bar
    self.fst_m = fst.FST()  # composition of lm and tm
def get_fst_mtm(old_data, new_data, initialize=True):
    """Build the character-level typo-model FST from parallel training data.

    Args:
        old_data: iterable of lines whose items are added to the output
            alphabet.
        new_data: iterable of lines whose items are added to the input
            alphabet.
        initialize: when true, every transition is reweighted (100 if its
            input and output symbols are equal, 1 otherwise).

    Returns:
        The constructed ``fst.FST`` with start state "q0" and accept
        state "q2".
    """
    m = fst.FST()
    m.set_start("q0")
    # get the old and modern alphabets from the training data
    # NOTE(review): input_alphabet / output_alphabet are not defined in this
    # function; presumably they are module-level sets -- confirm.
    for old_line in old_data:
        for w in old_line:
            output_alphabet.add(w)
    for new_line in new_data:
        for w in new_line:
            input_alphabet.add(w)
    # generate the typo model
    for output_w in output_alphabet:
        # insert: emit an output symbol without consuming input
        m.add_transition(fst.Transition("q0", (fst.EPSILON, output_w), "q0"))
    for input_w in input_alphabet:
        # delete: consume an input symbol without emitting output
        m.add_transition(fst.Transition("q0", (input_w, fst.EPSILON), "q1"))
        # NOTE(review): nesting reconstructed from a collapsed source line;
        # both substitute transitions are assumed to sit in the inner loop.
        for output_w in output_alphabet:  # substitute
            m.add_transition(fst.Transition("q1", (input_w, output_w), "q0"))
            m.add_transition(fst.Transition("q0", (input_w, output_w), "q0"))
    # add terminal transitions
    m.add_transition(fst.Transition("q0", (fst.STOP, fst.STOP), "q2"))
    m.add_transition(fst.Transition("q1", (fst.STOP, fst.STOP), "q2"))
    m.set_accept("q2")
    # initialize the weights
    if initialize:
        for state in m.states:
            for transition in m.transitions_from[state].keys():
                # higher probability if going to the same character
                if transition.a[0] == transition.a[1]:
                    m.reweight_transition(transition, 100)
                else:
                    m.reweight_transition(transition, 1)
        # NOTE(review): assumed to belong to the `if initialize` branch --
        # the collapsed source does not show its indentation; confirm.
        m.normalize_cond()
    return m
def make_tm(t, testfile, top_k=10):
    """Build a single-state translation-model FST, pruned to the best options.

    Args:
        t: mapping from ``(source_word, target_word)`` pairs to a
            probability; a target of '∅' is stored as ``fst.EPSILON``.
        testfile: path of a whitespace-tokenised test file. Every token not
            seen as a source word in ``t`` gets a transition to
            ``fst.EPSILON`` with weight ``10 ** -100``.
        top_k: how many of the highest-probability translations to keep per
            source word (default 10).

    Returns:
        The constructed ``fst.FST`` (start 'q0', accept 'q1').
    """
    tm = fst.FST()
    tm.set_start('q0')
    tm.set_accept('q1')
    tm.add_transition(fst.Transition('q0', (fst.STOP, fst.STOP), 'q1'))

    known_words = set()

    # Regroup the flat table as {source_word: {target_word: prob}}.
    top_trans = defaultdict(dict)
    for trans, prob in t.items():
        if trans[1] == '∅':
            top_trans[trans[0]][fst.EPSILON] = prob
        else:
            top_trans[trans[0]][trans[1]] = prob
        known_words.add(trans[0])

    # Insert only the top_k best translations per source word. The earlier
    # `if i > 10: break` guard let an eleventh entry through.
    by_prob = operator.itemgetter(1)
    for fw, trans in top_trans.items():
        ranked = sorted(trans.items(), key=by_prob, reverse=True)
        for ew, prob in ranked[:top_k]:
            tm.add_transition(fst.Transition('q0', (fw, ew), 'q0'), prob)

    # Add unknown words from the test data
    with open(testfile) as f:
        prob = math.pow(10, -100)
        for line in f:
            for w in line.rstrip().split():
                if w not in known_words:
                    tm.add_transition(
                        fst.Transition('q0', (w, fst.EPSILON), 'q0'), prob)
    return tm
def __init__(self):
    """Load the corpora and set up the typo model and bigram language model.

    Each corpus file is read into a list of raw lines (line endings are
    kept). The language model is built with ``fst.make_ngram`` from the
    new-text training lines, with order 2.
    """
    def read_lines(path):
        # Use a context manager so the handle is closed right away instead
        # of being left for the garbage collector.
        with open(path) as handle:
            return list(handle)

    self.old_train = read_lines("data/train.old")
    self.new_train = read_lines("data/train.new")
    self.old_test = read_lines("data/test.old")
    self.new_test = read_lines("data/test.new")
    self.fst_mtm = fst.FST()  # typo model
    self.fst_mlm = fst.make_ngram(self.new_train, 2)  # language model
    self.num_lines = 0  # used for the status bar
def generate(self, analysis):
    """Produce the inflected surface form for a stem-plus-tag analysis.

    e.g.
    p = Parser()
    analysis = ['p','a','n','i','c','+past form']
    p.generate(analysis) ---> 'panicked'
    """
    # (stem, output for '+past form', output for '+present participle')
    lexicon = (
        ('want', 'ed', 'ing'),
        ('sync', 'ed', 'ing'),
        ('panic', 'ked', 'king'),
        ('havoc', 'ked', 'king'),
        ('lick', 'ed', 'ing'),
    )

    machine = fst.FST('word_generator')
    for number in range(1, 34):
        machine.add_state(str(number))
    machine.initial_state = '1'

    accepting = []
    fresh = 2  # next unused state number; '1' is the shared start state
    for stem, past_out, participle_out in lexicon:
        # Spell out the stem letter by letter, copying each to the output.
        here = '1'
        for letter in stem:
            there = str(fresh)
            fresh += 1
            machine.add_arc(here, there, letter, letter)
            here = there
        # Two sibling end states, one per supported tag.
        past_state, participle_state = str(fresh), str(fresh + 1)
        fresh += 2
        machine.add_arc(here, past_state, '+past form', past_out)
        machine.add_arc(here, participle_state,
                        '+present participle', participle_out)
        accepting += [past_state, participle_state]

    for state in accepting:
        machine.set_final(state)

    return ''.join(machine.transduce(analysis)[0])
def make_fm(f):
    """Build a linear-chain FST that accepts exactly the sequence ``f``.

    Args:
        f: a sized sequence of symbols (``len(f)`` is used).

    Returns:
        An ``fst.FST`` with states ``0 .. len(f) + 1``: symbol ``i`` is
        copied on the transition ``i -> i + 1``, followed by a
        ``fst.STOP`` transition into the accept state ``len(f) + 1``.
    """
    fm = fst.FST()
    fm.set_start(0)
    for i, fw in enumerate(f):
        fm.add_transition(fst.Transition(i, (fw, fw), i + 1))
    # The original used `len(fs)` here, a name that is not defined in this
    # function; the length of `f` is what the accept state below expects.
    fm.add_transition(
        fst.Transition(len(f), (fst.STOP, fst.STOP), len(f) + 1))
    fm.set_accept(len(f) + 1)
    return fm
def get_fst_mw(word):
    """Build a linear-chain FST that copies the symbols of ``word``.

    States are named "q0", "q1", ...; symbol number k (1-based) is copied
    on the transition q(k-1) -> qk, and a final ``fst.STOP`` transition
    leads into the accept state.
    """
    chain = fst.FST()
    chain.set_start("q0")
    consumed = 0  # stays 0 when `word` is empty
    for consumed, symbol in enumerate(word, 1):
        chain.add_transition(
            fst.Transition("q%d" % (consumed - 1),
                           (symbol, symbol),
                           "q%d" % consumed))
    last_state = "q%d" % consumed
    accept_state = "q%d" % (consumed + 1)
    chain.add_transition(
        fst.Transition(last_state, (fst.STOP, fst.STOP), accept_state))
    chain.set_accept(accept_state)
    return chain
def make_f(f):
    """Build a linear-chain FST for the whitespace-separated tokens of ``f``.

    Token ``i`` is copied on the transition ``i -> i + 1``; a ``fst.STOP``
    transition then leads into the accept state.
    """
    # Adapted from Homework 2 Solutions
    tokens = f.split()
    end = len(tokens)
    chain = fst.FST()
    chain.set_start(0)
    for position, token in enumerate(tokens):
        chain.add_transition(
            fst.Transition(position, (token, token), position + 1))
    chain.add_transition(fst.Transition(end, (fst.STOP, fst.STOP), end + 1))
    chain.set_accept(end + 1)
    return chain
def make_kneserney(data, n): """Create a Kneser-Ney smoothed language model of order `n`, trained on `data`, as a `FST`. Note that the returned FST has epsilon transitions. To iterate over states in topological order, sort them using `lambda q: -len(q)` as the key. """ # Estimate KN-smoothed models for orders 1, ..., n kn = {} for i in range(1, n + 1): kn[i] = KneserNey(data, i) # Create the FST. It has a state for every possible k-gram for k = 0, ..., n-1. m = fst.FST() m.set_start(("<s>", ) * (n - 1)) m.set_accept(("</s>", )) for i in range(1, n + 1): for u in kn[i]._prob: if i > 1: # Add an epsilon transition that backs off from the i-gram model to the (i-1)-gram model m.add_transition( fst.Transition(u, (fst.EPSILON, fst.EPSILON), u[1:]), kn[i]._bow[u]) else: # Smooth 1-gram model with uniform distribution types = len(kn[i]._prob[u]) + 1 for w in kn[i]._prob[u]: m.add_transition(fst.Transition(u, (w, w), (w, )), 1 / types) m.add_transition(fst.Transition(u, ("<unk>", "<unk>"), ()), 1 / types) # Create transitions for word probabilities for w in kn[i]._prob[u]: # If we are in state u and read w, then v is the new state. # This should be the longest suffix of uw that is observed # in the training data. if w == "</s>": v = ("</s>", ) else: v = u + (w, ) while len(v) > 0 and (len(v) >= n or v not in kn[len(v) + 1]._prob): v = v[1:] m.add_transition(fst.Transition(u, (w, w), v), kn[i]._prob[u][w]) return m
def make_tm(t, testfile):
    """Build a single-state translation-model FST from a translation table.

    Every ``(source, target)`` key of ``t`` becomes a 'q0' -> 'q0'
    transition weighted by its value; a target of '∅' is replaced by
    ``fst.EPSILON``. Tokens of ``testfile`` that never occur as a source
    word get a transition to ``fst.EPSILON`` with weight ``10 ** -100``.
    """
    tm = fst.FST()
    tm.set_start('q0')
    tm.set_accept('q1')
    tm.add_transition(fst.Transition('q0', (fst.STOP, fst.STOP), 'q1'))

    seen_sources = set()
    for pair, weight in t.items():
        source_word = pair[0]
        seen_sources.add(source_word)
        label = (source_word, fst.EPSILON) if pair[1] == '∅' else pair
        tm.add_transition(fst.Transition('q0', label, 'q0'), weight)

    # Add unknown words from the test data
    with open(testfile) as handle:
        tiny = math.pow(10, -100)
        for line in handle:
            for token in line.rstrip().split():
                if token in seen_sources:
                    continue
                tm.add_transition(
                    fst.Transition('q0', (token, fst.EPSILON), 'q0'), tiny)
    return tm
def tAutomata():
    """Build the small 'epsilon_test' FST and return it.

    It has one arc with an empty input list out of 'start', followed by
    an arc on 'E' into 'ep_final_state', plus a direct arc on '1' into
    '1_state'. Both '1_state' and 'ep_final_state' are final; 'start'
    is not.
    """
    automaton = fst.FST('epsilon_test')
    automaton.add_state('start')
    automaton.initial_state = 'start'

    for name in ('1_state', 'ep_final_state', 'EPSILON_Intermediate'):
        automaton.add_state(name)
    for name in ('1_state', 'ep_final_state'):
        automaton.set_final(name)

    arcs = (
        ('start', 'EPSILON_Intermediate', [], ['ep_path']),
        ('EPSILON_Intermediate', 'ep_final_state', ['E'], ['ep_to_Final']),
        ('start', '1_state', ['1'], ['1_path']),
    )
    for source, target, consumed, emitted in arcs:
        automaton.add_arc(source, target, consumed, emitted)
    return automaton
def make_TM():
    """Build the single-state translation-model FST.

    Translations come from ``read_translations()``; each entry for a
    source token is indexed as ``(target, weight)``. Every distinct token
    of the test file additionally gets a transition to 'ε' with weight
    1.0e-100.

    Returns:
        The constructed ``fst.FST`` (start state 0, accept state 1).
    """
    # Code adapted from Homework 2 Solution
    translations = read_translations()
    tm = fst.FST()
    tm.set_start(0)
    tm.set_accept(1)
    tm.add_transition(fst.Transition(0, ("</s>", "</s>"), 1), wt=1)
    for t in translations:
        for option in translations[t]:
            tm.add_transition(
                fst.Transition(0, (t, option[0]), 0), wt=float(option[1]))

    test = '../data/final_data/test.tr'
    test_set = set()
    # Read through a context manager so the file handle is closed; the
    # original iterated over a bare open() and never closed it.
    with open(test) as test_file:
        for test_line in test_file:
            test_set.update(test_line.strip().split())

    for char in test_set:
        tm.add_transition(
            fst.Transition(0, (char, 'ε'), 0), wt=float('1.0e-100'))
    return tm
def generate(self, analysis):
    """Produce the inflected surface form for a stem-plus-tag analysis.

    e.g.
    p = Parser()
    analysis = ['p','a','n','i','c','+past form']
    p.generate(analysis) ---> 'panicked'
    """
    state_names = (
        'start',
        'a1', 'a2', 'a3',
        'b1', 'b2', 'b3',
        'c1', 'c2', 'c3', 'c4',
        'd1', 'd2', 'd3', 'd4',
        'e1', 'e2', 'e3',
        'insertion', 'progressive', 'past', 'end',
    )
    # (source, target, input symbol, output string)
    arcs = (
        # want
        ('start', 'a1', 'w', 'w'), ('a1', 'a2', 'a', 'a'),
        ('a2', 'a3', 'n', 'n'),
        ('a3', 'past', 't', 't'), ('a3', 'progressive', 't', 't'),
        # sync
        ('start', 'b1', 's', 's'), ('b1', 'b2', 'y', 'y'),
        ('b2', 'b3', 'n', 'n'),
        ('b3', 'past', 'c', 'c'), ('b3', 'progressive', 'c', 'c'),
        # panic: the stem ends in 'insertion', which emits an extra 'k'
        ('start', 'c1', 'p', 'p'), ('c1', 'c2', 'a', 'a'),
        ('c2', 'c3', 'n', 'n'), ('c3', 'c4', 'i', 'i'),
        ('c4', 'insertion', 'c', 'c'),
        ('insertion', 'past', '', 'k'),
        ('insertion', 'progressive', '', 'k'),
        # havoc: shares the 'insertion' state with panic
        ('start', 'd1', 'h', 'h'), ('d1', 'd2', 'a', 'a'),
        ('d2', 'd3', 'v', 'v'), ('d3', 'd4', 'o', 'o'),
        ('d4', 'insertion', 'c', 'c'),
        # lick
        ('start', 'e1', 'l', 'l'), ('e1', 'e2', 'i', 'i'),
        ('e2', 'e3', 'c', 'c'),
        ('e3', 'past', 'k', 'k'), ('e3', 'progressive', 'k', 'k'),
        # suffixes
        ('past', 'end', '+past form', 'ed'),
        ('progressive', 'end', '+present participle', 'ing'),
    )

    machine = fst.FST('generator')
    for name in state_names:
        machine.add_state(name)
    machine.initial_state = 'start'
    machine.set_final('end')
    for source, target, consumed, emitted in arcs:
        machine.add_arc(source, target, consumed, emitted)

    first_result = machine.transduce(analysis)[0]
    return ''.join(first_result)
# import the fst module
import fst
# import the string module
import string

# Define a list of all vowels for convenience
vowels = ['a', 'e', 'i', 'o', 'u']

# Instantiate an FST object with some name
f = fst.FST('devowelizer')

# All we need is a single state ...
f.add_state('1')

# and this same state is the initial and the final state
f.initial_state = '1'
f.set_final('1')

# Now, we need to add an arc for each letter; if the letter is a vowel
# then the transition outputs nothing but otherwise it outputs the same
# letter that it consumed.
for letter in string.ascii_lowercase:
    if letter in vowels:
        _ = f.add_arc('1', '1', (letter), ())
    else:
        _ = f.add_arc('1', '1', (letter), (letter))

# Evaluate it on some example words.
# print is called with a single parenthesised argument so the script runs
# under Python 3 (where the print statement is a SyntaxError) and prints
# the same text under Python 2.
print(''.join(f.transduce(['v', 'o', 'w', 'e', 'l'])))
print(''.join(f.transduce('e x c e p t i o n'.split())))
print(''.join(f.transduce('c o n s o n a n t'.split())))
print(f.transduce(['a', 'w']))

# Pass the string S to composechars together with the devowelizer three
# times (presumably it chains the transducers -- confirm in fsmutils).
from fsmutils import composechars
S = "vowels"
output = composechars(S, f, f, f)
def parse(self, word):
    """Parse an inflected form of want, sync, panic, havoc or lick.

    Handles the -ed (past form) and -ing (present participle) forms.

    e.g.
    p = Parser()
    word = ['p', 'a', 'n', 'i', 'c', 'k','e','d']
    p.parse(word) ---> 'panic+past form'

    Two transducers are built and handed to ``fsmutils.compose``:
    'lexicon' rewrites the surface form with a '^' morpheme boundary and a
    trailing '#', and 'rule' turns that intermediate string into the stem
    followed by '+past form' or '+present participle'.
    """
    # (source, target, input symbol, output string); '' = nothing consumed
    lexicon_arcs = (
        # want -> w a n t ^ d #   /   w a n t ^ n g #
        ('1', '2', 'w', 'w'), ('2', '3', 'a', 'a'), ('3', '4', 'n', 'n'),
        ('4', '5', 't', 't'),
        ('5', '6', 'e', '^'), ('6', '7', 'd', 'd'), ('7', '8', '', '#'),
        ('5', '9', 'i', '^'), ('9', '10', 'n', 'n'), ('10', '11', 'g', 'g'),
        ('11', '8', '', '#'),
        # sync
        ('1', '12', 's', 's'), ('12', '13', 'y', 'y'), ('13', '14', 'n', 'n'),
        ('14', '15', 'c', 'c'),
        ('15', '16', 'e', '^'), ('16', '17', 'd', 'd'), ('17', '18', '', '#'),
        ('15', '19', 'i', '^'), ('19', '20', 'n', 'n'), ('20', '21', 'g', 'g'),
        ('21', '18', '', '#'),
        # panic: the inserted 'k' becomes the boundary marker
        ('1', '22', 'p', 'p'), ('22', '23', 'a', 'a'), ('23', '24', 'n', 'n'),
        ('24', '25', 'i', 'i'), ('25', '26', 'c', 'c'), ('26', '27', 'k', '^'),
        ('27', '28', 'e', 'e'), ('28', '29', 'd', 'd'), ('29', '30', '', '#'),
        ('27', '31', 'i', 'i'), ('31', '32', 'n', 'n'), ('32', '33', 'g', 'g'),
        ('33', '30', '', '#'),
        # havoc: same pattern as panic
        ('1', '34', 'h', 'h'), ('34', '35', 'a', 'a'), ('35', '36', 'v', 'v'),
        ('36', '37', 'o', 'o'), ('37', '38', 'c', 'c'), ('38', '39', 'k', '^'),
        ('39', '40', 'e', 'e'), ('40', '41', 'd', 'd'), ('41', '42', '', '#'),
        ('39', '43', 'i', 'i'), ('43', '44', 'n', 'n'), ('44', '45', 'g', 'g'),
        ('45', '42', '', '#'),
        # lick
        ('1', '46', 'l', 'l'), ('46', '47', 'i', 'i'), ('47', '48', 'c', 'c'),
        ('48', '49', 'k', 'k'),
        ('49', '50', 'e', '^'), ('50', '51', 'd', 'd'), ('51', '52', '', '#'),
        ('49', '53', 'i', '^'), ('53', '54', 'n', 'n'), ('54', '55', 'g', 'g'),
        ('55', '52', '', '#'),
    )
    lexicon_finals = ('8', '18', '30', '42', '52')

    rule_arcs = (
        # panic
        ('1', '2', 'p', 'p'), ('2', '3', 'a', 'a'), ('3', '4', 'n', 'n'),
        ('4', '5', 'i', 'i'), ('5', '6', 'c', 'c'), ('6', '7', '^', ''),
        ('7', '8', 'e', ''), ('8', '9', 'd', '+past form'),
        ('9', '10', '#', ''),
        ('7', '11', 'i', ''), ('11', '12', 'n', ''),
        ('12', '13', 'g', '+present participle'), ('13', '10', '#', ''),
        # havoc
        ('1', '14', 'h', 'h'), ('14', '15', 'a', 'a'), ('15', '16', 'v', 'v'),
        ('16', '17', 'o', 'o'), ('17', '18', 'c', 'c'), ('18', '19', '^', ''),
        ('19', '20', 'e', ''), ('20', '21', 'd', '+past form'),
        ('21', '22', '#', ''),
        ('19', '23', 'i', ''), ('23', '24', 'n', ''),
        ('24', '25', 'g', '+present participle'), ('25', '22', '#', ''),
        # lick
        ('1', '26', 'l', 'l'), ('26', '27', 'i', 'i'), ('27', '28', 'c', 'c'),
        ('28', '29', 'k', 'k'), ('29', '30', '^', ''),
        ('30', '31', 'd', '+past form'), ('31', '32', '#', ''),
        ('30', '33', 'n', ''), ('33', '34', 'g', '+present participle'),
        ('34', '32', '#', ''),
        # sync
        ('1', '35', 's', 's'), ('35', '36', 'y', 'y'), ('36', '37', 'n', 'n'),
        ('37', '38', 'c', 'c'), ('38', '39', '^', ''),
        ('39', '40', 'd', '+past form'), ('40', '41', '#', ''),
        # The 'n' branch must leave state '39' (after '^'), as in the lick
        # branch; it used to leave '38', so "syncing" had no path.
        ('39', '42', 'n', ''), ('42', '43', 'g', '+present participle'),
        ('43', '41', '#', ''),
        # want
        ('1', '44', 'w', 'w'), ('44', '45', 'a', 'a'), ('45', '46', 'n', 'n'),
        ('46', '47', 't', 't'), ('47', '48', '^', ''),
        ('48', '49', 'd', '+past form'), ('49', '50', '#', ''),
        # Same fix: branch from '48' (after '^') instead of '47'.
        ('48', '51', 'n', ''), ('51', '52', 'g', '+present participle'),
        ('52', '50', '#', ''),
    )
    rule_finals = ('10', '22', '32', '41', '50')

    f2 = fst.FST('lexicon')
    for i in range(1, 56):
        f2.add_state(str(i))
    f2.initial_state = '1'
    for source, target, consumed, emitted in lexicon_arcs:
        f2.add_arc(source, target, consumed, emitted)
    for state in lexicon_finals:
        f2.set_final(state)

    f3 = fst.FST('rule')
    for i in range(1, 53):
        f3.add_state(str(i))
    f3.initial_state = '1'
    for source, target, consumed, emitted in rule_arcs:
        f3.add_arc(source, target, consumed, emitted)
    for state in rule_finals:
        f3.set_final(state)

    output = ''.join(fsmutils.compose(word, f2, f3)[0])
    return output
    # output = ['p','a','n','i','c','+past form']
def parse(self, word):
    """Recover the stem and inflection tag from an inflected word.

    e.g.
    p = Parser()
    word = ['p', 'a', 'n', 'i', 'c', 'k','e','d']
    p.parse(word) ---> 'panic+past form'
    """
    state_names = (
        'start',
        'a1', 'a2', 'a3',
        'b1', 'b2', 'b3',
        'c1', 'c2', 'c3', 'c4',
        'd1', 'd2', 'd3', 'd4',
        'e1', 'e2', 'e3',
        'deletion',
        'progressive1', 'progressive2', 'progressive3',
        'past1', 'past2',
        'end',
    )
    # (source, target, input symbol, output string)
    arcs = (
        # want
        ('start', 'a1', 'w', 'w'), ('a1', 'a2', 'a', 'a'),
        ('a2', 'a3', 'n', 'n'),
        ('a3', 'past1', 't', 't'), ('a3', 'progressive1', 't', 't'),
        # sync
        ('start', 'b1', 's', 's'), ('b1', 'b2', 'y', 'y'),
        ('b2', 'b3', 'n', 'n'),
        ('b3', 'past1', 'c', 'c'), ('b3', 'progressive1', 'c', 'c'),
        # panic: the stem ends in 'deletion', which swallows the extra 'k'
        ('start', 'c1', 'p', 'p'), ('c1', 'c2', 'a', 'a'),
        ('c2', 'c3', 'n', 'n'), ('c3', 'c4', 'i', 'i'),
        ('c4', 'deletion', 'c', 'c'),
        ('deletion', 'past1', 'k', ''),
        ('deletion', 'progressive1', 'k', ''),
        # havoc: shares the 'deletion' state with panic
        ('start', 'd1', 'h', 'h'), ('d1', 'd2', 'a', 'a'),
        ('d2', 'd3', 'v', 'v'), ('d3', 'd4', 'o', 'o'),
        ('d4', 'deletion', 'c', 'c'),
        # lick
        ('start', 'e1', 'l', 'l'), ('e1', 'e2', 'i', 'i'),
        ('e2', 'e3', 'c', 'c'),
        ('e3', 'past1', 'k', 'k'), ('e3', 'progressive1', 'k', 'k'),
        # suffixes: 'ed' -> '+' 'past form', 'ing' -> '+' 'present participle'
        ('past1', 'past2', 'e', '+'),
        ('past2', 'end', 'd', 'past form'),
        ('progressive1', 'progressive2', 'i', '+'),
        ('progressive2', 'progressive3', 'n', ''),
        ('progressive3', 'end', 'g', 'present participle'),
    )

    machine = fst.FST('parser')
    for name in state_names:
        machine.add_state(name)
    machine.initial_state = 'start'
    machine.set_final('end')
    for source, target, consumed, emitted in arcs:
        machine.add_arc(source, target, consumed, emitted)

    first_result = machine.transduce(word)[0]
    return ''.join(first_result)
# The fst module provides the transducer class; string provides the alphabet.
import fst
import string

# Vowels are the letters the transducer drops.
vowels = ['a', 'e', 'i', 'o', 'u']

# A single state, both initial and final, is all the devowelizer needs.
f = fst.FST('devowelizer')
f.add_state('1')
f.initial_state = '1'
f.set_final('1')

# One self-loop per lowercase letter: a vowel is consumed with an empty
# output, any other letter is copied through unchanged.
for letter in string.ascii_lowercase:
    _ = f.add_arc('1', '1', letter, () if letter in vowels else letter)

# Try it on a few example words, one symbol per list element.
print(''.join(f.transduce(list('vowel'))))
print(''.join(f.transduce(list('exception'))))
print(''.join(f.transduce(list('consonant'))))