def predict(self, output_file):
    self.num_lines = len(self.old_test)  # for the status bar
    with open(output_file, 'w') as f:
        fst_m = fst.compose(self.fst_mlm, self.fst_mtm)
        l = 0
        for old_line, new_line in zip(self.old_test, self.new_test):
            fst_mw = fst_wrapper.get_fst_mw(old_line)
            # compose the lm, tm, and wm, and find the best path
            _fst = fst.compose(fst_m, fst_mw)
            path = viterbi.viterbi_path(_fst)
            predicted_line = ''
            for p in path[0]:
                if p[0][0][0] in self.fst_mlm.input_alphabet:
                    predicted_line += p[0][0][0]
            # print out the first 10 lines
            if l < 10:
                sys.stdout.write(predicted_line)
                print(path[1])
            else:
                # show the progress bar
                self.status_bar(l)
            l += 1
            # write the prediction to the file
            f.write(predicted_line)
    # print out the transitions with weight of at least 0.1
    for state, transitions in self.fst_mtm.transitions_to.items():
        for transition, weight in transitions.items():
            if weight >= 0.1:
                print(str(transition) + ' = ' + str(weight))
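# The nested indexing above (p[0][0][0]) reads the input symbol off each
# transition in the Viterbi path. A small helper makes that intent explicit;
# this is a hypothetical sketch assuming the same path structure that
# viterbi.viterbi_path returns, not part of the original code.
def path_to_string(path, alphabet):
    """Concatenate the input symbols along a Viterbi path, skipping any
    symbol (e.g. epsilon) that is not in the given alphabet."""
    return ''.join(p[0][0][0] for p in path if p[0][0][0] in alphabet)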
def train(self, output_file, iterations):
    # construct the initial unweighted typo model
    self.fst_mtm = fst_wrapper.get_fst_mtm(self.old_train, self.new_train, False)
    for _ in range(iterations):
        self.num_lines = len(self.old_train)  # for the status bar
        l = 0
        self.status_bar(l)  # track progress with a status bar
        # train on the parallel text
        for old_line, new_line in zip(self.old_train, self.new_train):
            # construct the fst models for the modern and old lines
            fst_mm = fst_wrapper.get_fst_mw(new_line)
            fst_me = fst_wrapper.get_fst_mw(old_line)
            # compose the models and collect counts along the best path
            _fst = fst.compose(fst.compose(fst_mm, self.fst_mtm), fst_me)
            viterbi.viterbi_path(fst=_fst, get_counts=True)
            # reweight the tm with the new counts, then renormalize
            for t, count in viterbi.counts.items():
                self.fst_mtm.reweight_transition(t, count)
            self.fst_mtm.normalize_cond(0.01)
            l += 1
            self.status_bar(l)
        print()  # add a newline after the status bar
    self.predict(output_file)
    # print the overall score
    print('SCORE: ', end='')
    print(cer.cer(zip(self.new_test, list(open(output_file)))))
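# cer.cer above is project-specific. As a rough sketch of what a character
# error rate typically computes (an assumption, not the project's actual
# implementation): the Levenshtein distance between each prediction and its
# reference, summed over all pairs and divided by the total reference length.
def cer_sketch(pairs):
    def edit_distance(a, b):
        # standard dynamic-programming Levenshtein distance
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            cur = [i]
            for j, cb in enumerate(b, 1):
                cur.append(min(prev[j] + 1,                  # deletion
                               cur[j - 1] + 1,               # insertion
                               prev[j - 1] + (ca != cb)))    # substitution
            prev = cur
        return prev[-1]

    total_dist = total_len = 0
    for reference, prediction in pairs:
        total_dist += edit_distance(prediction, reference)
        total_len += len(reference)
    return total_dist / total_len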
def predict(self, old_line, print_lines=True):
    fst_mw = fst_wrapper.get_fst_mw(old_line)
    _fst = fst.compose(self.fst_m, fst_mw)
    path = viterbi.viterbi_path(_fst)
    # reconstruct the predicted line from the best path
    predicted_line = ''
    for p in path[0]:
        if p[0][0][0] in _fst.input_alphabet:
            predicted_line += p[0][0][0]
    # print out the modern line with its log probability
    if print_lines:
        print(predicted_line, end='')
        print(path[1])
    return predicted_line
        q = ptr[q].q
    path.reverse()
    return vit[m.accept], path


if __name__ == '__main__':
    print('========== CONSTRUCT MODELS ==========')
    mlm = lm.make_kneserney(process('data/train.en'), 2)
    mtm = make_tm(ibm_model1.make(), 'data/test.zh')

    # Iterate through each line of the data
    print('========== TESTING ===================')
    with open('data/test.out', 'w') as wf:
        for i, fs in enumerate(process('data/test.zh')):
            mf = make_fm(fs)
            # Compose all fst models
            m = fst.compose(fst.compose(mf, mtm), mlm)
            # Calculate the shortest path
            try:
                wt, path = viterbi(m, key=lambda q: (q[0][0], -1 * len(q[1])))
                out = " ".join(
                    [t.a[1] for t in path[:-1] if t.a[1] != fst.EPSILON])
                # Print the first 10 translations
                if i < 10:
                    print('%s' % out)
                # Print output to help with tracking how far along translations are
                # else:
                #     print('\rLine #: %d' % i, end='')
                wf.write('%s\n' % out)
            except ValueError:
                if i < 10:
                    print('')
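# The viterbi() used above is project-specific (note the key= argument, which
# orders states for the dynamic program, and the ptr back-pointers). As a
# self-contained sketch of the underlying idea, assuming an acyclic machine
# with negative-log-probability weights: relax transitions in topological
# order, keep the best weight and a back-pointer per state, then walk the
# pointers back from the accept state. An illustration, not the project's
# implementation.
import math

def viterbi_sketch(states, start, accept, transitions):
    """states: list in topological order; transitions: dict mapping each
    state to a list of (next_state, weight, output_label) triples."""
    best = {q: math.inf for q in states}
    back = {}
    best[start] = 0.0
    for q in states:
        for r, w, label in transitions.get(q, []):
            if best[q] + w < best[r]:
                best[r] = best[q] + w
                back[r] = (q, label)
    labels = []
    q = accept
    while q != start:
        q, label = back[q]
        labels.append(label)
    labels.reverse()
    return best[accept], labels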
# We can encode a string into a sequence of digits.
# To create an FST that encodes a string, we first get a
# list with each symbol.
s = "mary saw the dog in the park with the telescope"
letters = list(s)

# Then use linearchain().
# The first argument is the list of symbols,
# the second argument is an optional FST to take the symbol
# table from, and the last argument is an optional list of
# symbols to ignore.
infst = fst.linearchain(letters, let2dig, [' '])

# Now encode the input FST (letters) into digits
encfst = fst.compose(infst, let2dig)
print("Input string: ", encfst.get_in_string())
print()
print("Output string:", encfst.get_out_string())
print()
print("Now reconstructing the input string from the output string...\n")

# Then go from digits back to letters
decfst = fst.compose(encfst, dig2let)

# The mapping is now one-to-many, so we can choose output
# strings based on unigram score.
# Compose the decoded-string FST with the unigram model FST
decscored = fst.compose(decfst, let2word)
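# Why the decode step is one-to-many: several letters can share a digit
# (phone-keypad style), so one digit string has many letter expansions, and
# the unigram model is what chooses among them. A tiny dict-based analogue
# of the ambiguity (a hypothetical mapping, independent of the let2dig and
# dig2let FSTs above):
keypad = {'2': 'abc', '3': 'def'}
digits = '23'
expansions = ['']
for d in digits:
    expansions = [prefix + letter for prefix in expansions for letter in keypad[d]]
print(expansions)  # 9 candidate letter strings for the 2-digit input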
        prev = i.q[1][0]
    return ' '.join(reconstructed)


if __name__ == "__main__":
    M_LM = make_knes()
    M_TM = make_TM()

    test_file = '../data/final_data/test.tr'
    to_write = []
    for i, line in enumerate(open(test_file)):
        line = line.strip()
        M_f = make_f(line)
        composed = fst.compose(fst.compose(M_f, M_TM), M_LM)
        out = viterbi(composed)
        print(i, out)
        to_write.append(out)

    with open('test.translations', 'w') as write_file:
        for line in to_write:
            write_file.write(line + '\n')
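# A variant of the driver above that streams each translation to disk as it
# is produced instead of buffering everything in to_write; the output is the
# same, but nothing is lost if a later line raises. Assumes the same
# test_file, make_f, viterbi, M_TM, and M_LM as the loop above.
with open('test.translations', 'w') as write_file:
    for i, line in enumerate(open(test_file)):
        out = viterbi(fst.compose(fst.compose(make_f(line.strip()), M_TM), M_LM))
        print(i, out)
        write_file.write(out + '\n')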
def train(self):
    # build a bigram language model over the modern training text
    fst_mlm = fst.make_ngram(self.new_train, 2)
    # build the typo model from the parallel old/modern training text
    fst_mtm = fst_wrapper.get_fst_mtm(self.old_train, self.new_train)
    # compose lm and tm once; predict() composes the result with a
    # word model for each input line
    self.fst_m = fst.compose(fst_mlm, fst_mtm)
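# Hypothetical end-to-end usage (the Modernizer name and the constructor
# arguments are assumptions; train() is the method above and the per-line
# predict(old_line) is the version defined earlier):
#
#     model = Modernizer(old_train, new_train)
#     model.train()
#     model.predict('a line of olde spelling')  # prints the modernized line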