def add_bitext(self, src, tgt):
    '''
    Adds a training bitext.

    Input:
        src: filepath containing an n-best list of source hypotheses
        tgt: string containing the translation
    Postcondition:
        A source word lattice is constructed from src and compiled into a
        weighted FSA. A bitext containing the weighted source FSA and the
        string tgt is stored.
    '''
    src_lattice = Lattice(syms=self.src_syms)
    src_lattice.load_delimited(src)
    # Weight the edges
    src_lattice.forward_backward_weights()
    # Add the NULL token at the front
    src_lattice.prepend_epsilon()
    self.src_data.append(src_lattice.fsa)
    # TODO: Figure out how to extract vocabulary from FSA (src_lattice.sigma)
    for arc in common.arcs(src_lattice.fsa):
        self.src_vocab.add(arc.ilabel)
    self.tgt_data.append(tgt)
    self.tgt_vocab = self.tgt_vocab.union(set(tgt.split()))
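# For reference, the per-iteration EM updates that IBM1 below implements,
# adapted to weighted lattice arcs. (This summary is mine, not from the
# original code; f ranges over source arc labels, e over target words, and
# w(f) is the arc weight.)
#
#   s_total(f)   = sum_e t(f, e) * w(f)          (per sentence pair, per arc f)
#   count(f, e) += t(f, e) / s_total(f)
#   total(e)    += t(f, e) / s_total(f)
#   t(f, e)      = count(f, e) / total(e)        (re-estimated after all pairs)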
def IBM1(self, src, tgt):
    '''
    IBM Model 1: t(tgt|src)
    '''
    # src is already segmented (as fsa/lattice)
    # NULL (epsilon) already prepended to src
    tgt = [i.split() for i in tgt]
    print(self.src_vocab)
    print(self.tgt_vocab)

    num_probs = len(self.src_vocab) * len(self.tgt_vocab)
    default_prob = 1.0 / len(self.tgt_vocab)
    t = defaultdict(lambda: default_prob)

    convergent_threshold = 1e-2
    globally_converged = False
    iteration_count = 0
    while not globally_converged:
        count = defaultdict(float)  # count(e|f)
        total = defaultdict(float)  # total(f)
        for src_fsa, tgt_str in zip(src, tgt):
            s_total = {}
            # Walk through each arc
            for arc in common.arcs(src_fsa):
                s_total[arc.ilabel] = 0.0
                for tgt_word in tgt_str:
                    s_total[arc.ilabel] += t[arc.ilabel, tgt_word] * float(arc.weight)
            for arc in common.arcs(src_fsa):
                for tgt_word in tgt_str:
                    # Normalize probabilities
                    if s_total[arc.ilabel] == 0:
                        # print(arc.ilabel, tgt_word, 'uh-oh')
                        # TODO: Epsilons (NULLs) aren't working
                        continue
                    cnt = t[arc.ilabel, tgt_word] / s_total[arc.ilabel]
                    # Summing the expected count of each src word given tgt_word
                    count[arc.ilabel, tgt_word] += cnt
                    total[tgt_word] += cnt

        num_converged = 0
        for tgt_word in self.tgt_vocab:
            for src_word in self.src_vocab:
                new_prob = count[src_word, tgt_word] / total[tgt_word]
                delta = abs(t[src_word, tgt_word] - new_prob)
                if delta < convergent_threshold:
                    num_converged += 1
                t[src_word, tgt_word] = new_prob
        iteration_count += 1
        if num_converged == num_probs:
            globally_converged = True
    return t
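# A minimal plain-string sketch of the EM loop that IBM1 above applies to
# lattices, shown for comparison. It drops the lattice/arc-weight machinery
# and uses a fixed iteration count instead of the convergence test; the
# helper name ibm1_plain and the toy bitext in the usage note are
# illustrative only. The defaultdict import is repeated so the sketch is
# self-contained.
from collections import defaultdict


def ibm1_plain(bitext, iterations=10):
    '''bitext: list of (src_words, tgt_words) pairs, each a list of tokens.'''
    tgt_vocab = {w for _, tgt_words in bitext for w in tgt_words}
    # Uniform initialization of t[src_word, tgt_word]
    t = defaultdict(lambda: 1.0 / len(tgt_vocab))
    for _ in range(iterations):
        count = defaultdict(float)
        total = defaultdict(float)
        for src_words, tgt_words in bitext:
            # E-step: expected counts, normalized per source word
            s_total = {s: sum(t[s, w] for w in tgt_words) for s in src_words}
            for s in src_words:
                for w in tgt_words:
                    c = t[s, w] / s_total[s]
                    count[s, w] += c
                    total[w] += c
        # M-step: re-estimate the translation table
        for (s, w), c in count.items():
            t[s, w] = c / total[w]
    return t

# Example with toy data:
#   ibm1_plain([('das haus'.split(), 'the house'.split()),
#               ('das buch'.split(), 'the book'.split())])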