def parse_refs(filename):
    """Read a reference-alignment file into two parallel lists.

    Every line of ``filename`` is parsed twice:

    * ``refs``: the line with every token matching ``[0-9]*p[0-9]*``
      removed, i.e. only the links that are not written with a ``p``
      separator remain.
    * ``poss``: the line with every ``p`` replaced by ``-``, so that all
      links (including the ``p``-separated ones) are kept.

    NOTE(review): the ``p`` separator presumably marks "possible" links as
    opposed to "sure" ones -- confirm against the data format.

    :param filename: path of the reference alignment file, one sentence
        pair per line
    :return: tuple ``(refs, poss)`` of lists of ``Alignment`` objects, one
        entry per input line
    """
    p_link = re.compile(r'[0-9]*p[0-9]*')
    refs = []
    poss = []
    # Context manager so the file handle is released even if parsing fails.
    with open(filename) as ref_file:
        for line in ref_file:
            line = line.strip()
            refs.append(Alignment.fromstring(p_link.sub("", line)))
            poss.append(Alignment.fromstring(line.replace('p', '-')))
    return refs, poss
def __align(self, sentence_pair):
    """
    Pick, for every target word, the source position with the highest
    lexical translation probability and store the result on
    ``sentence_pair.alignment``.

    A target word stays aligned to NULL (``None``) unless some source
    word has a translation probability at least as high as the NULL
    probability (floored at ``IBMModel.MIN_PROB``). On ties the later
    source word wins.

    The stored ``Alignment`` uses zero-based word indices.

    :param sentence_pair: A sentence in the source language and its
        counterpart sentence in the target language
    :type sentence_pair: AlignedSent
    """
    alignment_points = []
    for trg_index, trg_word in enumerate(sentence_pair.words):
        # Baseline: alignment to the NULL token
        top_prob = max(
            self.translation_table[trg_word][None], IBMModel.MIN_PROB
        )
        top_index = None

        for src_index, src_word in enumerate(sentence_pair.mots):
            candidate = self.translation_table[trg_word][src_word]
            # ">=" so that a later word replaces an equally good earlier one
            if candidate >= top_prob:
                top_prob, top_index = candidate, src_index

        alignment_points.append((trg_index, top_index))

    sentence_pair.alignment = Alignment(alignment_points)
def train(self, parallel_corpus):
    """
    Run one count-and-re-estimate pass over ``parallel_corpus``.

    For every sentence pair the alignment space is sampled, the best
    sampled alignment is stored on the pair (zero-indexed), and each
    sampled alignment contributes counts weighted by its probability
    relative to the total probability of all samples. Afterwards the
    lexical translation, distortion, fertility and NULL generation
    probabilities are re-estimated from those counts; the existing
    ``alignment_table`` is carried over unchanged.

    :param parallel_corpus: iterable of sentence pairs exposing ``words``
        and ``mots``
    """
    counts = Model3Counts()
    for sentence_pair in parallel_corpus:
        src_len = len(sentence_pair.mots)
        trg_len = len(sentence_pair.words)

        # Sample the alignment space and remember the best alignment
        samples, best = self.sample(sentence_pair)
        sentence_pair.alignment = Alignment(best.zero_indexed_alignment())

        # Normalization factor shared by all samples of this pair
        normalizer = self.prob_of_alignments(samples)

        for info in samples:
            weight = self.prob_t_a_given_s(info) / normalizer

            for j in range(1, trg_len + 1):
                counts.update_lexical_translation(weight, info, j)
                counts.update_distortion(weight, info, j, src_len, trg_len)

            counts.update_null_generation(weight, info)
            counts.update_fertility(weight, info)

    # Resetting wipes every table, so put the alignment table back
    saved_alignment_table = self.alignment_table
    self.reset_probabilities()
    self.alignment_table = saved_alignment_table

    self.maximize_lexical_translation_probabilities(counts)
    self.maximize_distortion_probabilities(counts)
    self.maximize_fertility_probabilities(counts)
    self.maximize_null_generation_probabilities(counts)
def load_alignments(input_file_path: Path) -> List[Alignment]:
    """Parse every non-comment line of a corpus file into an ``Alignment``.

    Lines starting with ``#`` are skipped; every other line yielded by
    ``load_corpus`` is handed to ``Alignment.fromstring``.

    :param input_file_path: path of the alignments file
    :return: the parsed alignments, in file order
    """
    return [
        Alignment.fromstring(line)
        for line in load_corpus(input_file_path)
        if not line.startswith("#")
    ]
def evaluate_model(f_sents, e_sents_orig, nr_f_words, e_sents, e_dict_inv,
                   trans_probs, gold_alignments):
    """
    Score the current model on a set of sentence pairs.

    ``align_sentences`` is run once over the data; for each returned
    (predicted sentence, alignment) pair the perplexity, likelihood and
    alignment error rate against the gold alignment are computed.

    :param f_sents: the french sentences, indexable by position
    :param e_sents_orig: reference translations, only used for verbose output
    :param nr_f_words: forwarded to the alignment / scoring helpers
    :param e_sents: english sentences passed to ``align_sentences``
    :param e_dict_inv: forwarded to ``decode_sentence`` for verbose output
        when ``INDEX_WORDS`` is set
    :param trans_probs: translation probabilities used by the helpers
    :param gold_alignments: gold standard alignments, indexable by position
    :return: list ``[negated sum of sentence perplexities,
        mean sentence likelihood, mean sentence AER]``
    """
    n_sents = len(f_sents)
    sent_perplexities = np.zeros(n_sents)
    sent_likelihoods = np.zeros(n_sents)
    sent_aers = np.zeros(n_sents)

    model_output = align_sentences(e_sents, f_sents, trans_probs, nr_f_words)
    for index, pair in enumerate(model_output):
        e_pred_sent, alignment = pair[0], pair[1]
        f_sent = f_sents[index]
        gold = gold_alignments[index]

        if VERBOSE:
            print("Sentence: {}".format(f_sent))
            if INDEX_WORDS:
                print("Predicted translation: {}".format(
                    decode_sentence(e_pred_sent, e_dict_inv)))
            else:
                print("Predicted translation: {}".format(e_pred_sent))
            print("Actual translation: {}".format(e_sents_orig[index]))
            print("Alignment: {}".format(alignment))
            print("Gold standard alignment: {}".format(gold))

        sent_perplexities[index] = get_perplexity(
            e_pred_sent, f_sent, trans_probs, nr_f_words)
        sent_likelihoods[index] = get_likelihood(
            e_pred_sent, f_sent, trans_probs, nr_f_words)
        sent_aers[index] = metrics.alignment_error_rate(
            Alignment(gold), Alignment(alignment))

    total_perplexity = -np.sum(sent_perplexities)
    mean_likelihood = sum(sent_likelihoods) / len(sent_likelihoods)
    mean_aer = sum(sent_aers) / len(sent_aers)
    return [total_perplexity, mean_likelihood, mean_aer]
def read_block(self, stream):
    """
    Read one aligned-sentence block from ``stream`` and tokenize it.

    Every string produced by the block reader is split into sentences and
    every sentence into words. What is returned depends on the reader's
    flags:

    * ``_aligned``: a one-element list holding an ``AlignedSent`` built
      from the tokenized pieces (the third piece is turned back into an
      ``Alignment``);
    * ``_group_by_sent``: a one-element list holding the first tokenized
      sentence;
    * otherwise: the first tokenized sentence itself.
    """
    pieces = []
    for alignedsent_str in self._alignedsent_block_reader(stream):
        for sent_str in self._sent_tokenizer.tokenize(alignedsent_str):
            pieces.append(self._word_tokenizer.tokenize(sent_str))

    if self._aligned:
        # kludge; we shouldn't have tokenized the alignment string
        pieces[2] = Alignment.fromstring(" ".join(pieces[2]))
        return [AlignedSent(*pieces)]

    if self._group_by_sent:
        return [pieces[0]]

    return pieces[0]
def remove_nones(bitext):
    """Return copies of the sentence pairs without NULL alignment points.

    Every alignment pair in which either index is ``None`` is dropped; the
    remaining pairs are kept unchanged. The input sentence pairs are not
    modified -- a new ``AlignedSent`` is created for each one, reusing its
    ``words`` and ``mots``.

    The pairs are filtered directly instead of rewriting the textual repr
    of the alignment, so the result does not depend on the repr format or
    on the ``unicode_repr`` compatibility method being available.

    :param bitext: iterable of sentence pairs exposing ``words``, ``mots``
        and an iterable ``alignment`` of index tuples
    :return: list of new ``AlignedSent`` objects, in input order
    """
    bitext_new = []
    for b in bitext:
        kept_pairs = [
            pair for pair in b.alignment
            if all(index is not None for index in pair)
        ]
        bitext_new.append(AlignedSent(b.words, b.mots, Alignment(kept_pairs)))
    return bitext_new
def train(self, parallel_corpus):
    """
    Run one E step / M step pass over ``parallel_corpus``.

    For every sentence pair the alignment space is sampled, the best
    sampled alignment is stored on the pair (zero-indexed), and each
    sampled alignment contributes counts weighted by its probability
    relative to the total probability of all samples. The distortion
    counts additionally receive ``self.src_classes`` and
    ``self.trg_classes``. Afterwards all probabilities except the
    alignment table are re-estimated from the counts.

    :param parallel_corpus: iterable of sentence pairs exposing ``words``
    """
    counts = Model4Counts()
    for sentence_pair in parallel_corpus:
        trg_len = len(sentence_pair.words)

        # Sample the alignment space and keep the most probable alignment
        samples, best = self.sample(sentence_pair)
        sentence_pair.alignment = Alignment(best.zero_indexed_alignment())

        # E step (a): normalization factor for weighing the counts
        normalizer = self.prob_of_alignments(samples)

        # E step (b): accumulate the weighted counts
        for info in samples:
            weight = self.prob_t_a_given_s(info) / normalizer

            for j in range(1, trg_len + 1):
                counts.update_lexical_translation(weight, info, j)
                counts.update_distortion(
                    weight, info, j, self.src_classes, self.trg_classes
                )

            counts.update_null_generation(weight, info)
            counts.update_fertility(weight, info)

    # M step: maximum likelihood re-estimation. The alignment table is not
    # retrained, so it is saved across the reset and restored afterwards.
    saved_alignment_table = self.alignment_table
    self.reset_probabilities()
    self.alignment_table = saved_alignment_table

    self.maximize_lexical_translation_probabilities(counts)
    self.maximize_distortion_probabilities(counts)
    self.maximize_fertility_probabilities(counts)
    self.maximize_null_generation_probabilities(counts)
def align(self, sentence_pair):
    """
    Compute the most probable word alignment for ``sentence_pair`` from
    the lexical translation table and store it on
    ``sentence_pair.alignment``.

    Each target word is linked to the source position whose translation
    probability is highest; it stays linked to ``None`` when no source
    word reaches the NULL probability (floored at ``IBMModel.MIN_PROB``).
    Ties go to the later source word.

    :param sentence_pair: object exposing ``words`` (target side) and
        ``mots`` (source side); its ``alignment`` attribute is overwritten
    """
    links = []
    for j, trg_word in enumerate(sentence_pair.words):
        # Start from the NULL alignment and try to beat it
        winner = None
        winner_prob = max(
            self.translation_table[trg_word][None], IBMModel.MIN_PROB
        )

        for i, src_word in enumerate(sentence_pair.mots):
            prob = self.translation_table[trg_word][src_word]
            if prob >= winner_prob:
                winner = i
                winner_prob = prob

        links.append((j, winner))

    sentence_pair.alignment = Alignment(links)
def main(args):
    """Merge forward and backward alignment dumps and save the result.

    Both dumps are loaded with ``torch.load`` and must have the same
    length. For each pair of entries the ``weights`` and ``metrics`` dicts
    are combined with ``merge_dict``. When ``args.bialign`` is given, the
    bidirectional alignments from that file are scored against the
    references in ``args.ref`` and added under the ``'bi_align'`` key of
    both ``weights`` and ``metrics``.

    The merged list is written to ``args.output``; if that is empty the
    file at ``args.forward`` is overwritten.

    :param args: namespace with ``forward``, ``backward``, ``bialign``,
        ``ref`` and ``output`` attributes (also passed to ``merge_dict``)
    """
    # Each dump is a list of dicts with keys 'src', 'tgt', 'weights', 'metrics'
    forward = torch.load(args.forward, map_location='cpu')
    backward = torch.load(args.backward, map_location='cpu')
    assert len(forward) == len(backward)

    res = []
    if args.bialign is not None:
        assert args.ref is not None
        refs, poss = parse_refs(args.ref)
        # Context manager so the alignment file is closed after reading.
        with open(args.bialign) as bialign_file:
            bi_aligns = [
                Alignment.fromstring(line.strip()) for line in bialign_file
            ]
        bi_metrics = [
            alignment_merics([hyp], [ref], [pos])
            for hyp, ref, pos in zip(bi_aligns, refs, poss)
        ]
        assert len(forward) == len(backward) == len(bi_aligns) == len(
            bi_metrics)
        for f, b, bi_align, bi_metric in zip(forward, backward, bi_aligns,
                                             bi_metrics):
            res_t = {}
            # Both directions must describe the same sentence pair
            assert f['src'] == b['src'] and f['tgt'] == b['tgt']
            res_t['src'] = f['src']
            res_t['tgt'] = f['tgt']
            res_t['weights'] = merge_dict(f['weights'], b['weights'], args)
            res_t['metrics'] = merge_dict(f['metrics'], b['metrics'], args)
            res_t['weights']['bi_align'] = align_to_weights(
                bi_align, bi_align, f['src'], f['tgt'])
            res_t['metrics']['bi_align'] = bi_metric
            res.append(res_t)
    else:
        for f, b in zip(forward, backward):
            res_t = {}
            res_t['src'] = f['src']
            res_t['tgt'] = f['tgt']
            res_t['weights'] = merge_dict(f['weights'], b['weights'], args)
            res_t['metrics'] = merge_dict(f['metrics'], b['metrics'], args)
            res.append(res_t)

    # Falls back to overwriting the forward dump when no output is given
    output = args.output or args.forward
    torch.save(res, output)
def get_direct_lexicon(self, include_special_tokens: bool = False) -> Lexicon:
    """Build a normalized lexicon from the model directory's aligned corpus.

    Reads ``src.txt``, ``trg.txt`` and ``alignments.txt`` (ignoring lines
    that start with ``#``) from ``self.model_dir`` in lockstep. Every
    aligned (source word, target word) pair increments the lexicon;
    alignment points whose index lies beyond either sentence are ignored.
    The lexicon is normalized before it is returned.

    :param include_special_tokens: accepted for interface compatibility;
        not used by this implementation
    :return: the populated, normalized ``Lexicon``
    """
    lexicon = Lexicon()
    src_lines: Iterable[str] = load_corpus(self.model_dir / "src.txt")
    trg_lines: Iterable[str] = load_corpus(self.model_dir / "trg.txt")
    alignment_lines: Iterable[str] = (
        entry
        for entry in load_corpus(self.model_dir / "alignments.txt")
        if not entry.startswith("#")
    )

    for src_str, trg_str, alignment_str in zip(
        src_lines, trg_lines, alignment_lines
    ):
        src_words = src_str.split()
        trg_words = trg_str.split()
        for src_index, trg_index in Alignment.fromstring(alignment_str):
            # Only count points that fall inside both sentences
            if src_index < len(src_words) and trg_index < len(trg_words):
                lexicon.increment(src_words[src_index], trg_words[trg_index])

    lexicon.normalize()
    return lexicon
def eval(test_alignments):
    """Print per-sentence and average alignment scores for IBM1 and IBM2.

    Each line of ``test_alignments`` is split on tabs into: foreign
    sentence, second sentence, IBM1 alignment, IBM2 alignment and hand
    alignment. For both models precision, recall, AER and F1 against the
    hand alignment are printed per line, followed by the averages over all
    lines. Nothing is returned.

    If the file contains no lines, a notice is printed instead of the
    averages (which would otherwise divide by zero).

    :param test_alignments: path of the tab-separated evaluation file
    """
    # Counters used for the aggregate scores
    sentence_pairs = 0
    ibm1_precision_sum, ibm1_recall_sum, ibm1_aer_sum, ibm1_f1_sum = 0, 0, 0, 0
    ibm2_precision_sum, ibm2_recall_sum, ibm2_aer_sum, ibm2_f1_sum = 0, 0, 0, 0

    # Context manager so the file is closed even if a line fails to parse.
    with open(test_alignments, "r") as f:
        for line in f:
            sentence_pairs += 1
            strs = line.split("\t")
            print("-" * 47)
            print("Length of foreign sentence: ", len(strs[0].split()))
            print(strs[0])
            print(strs[1], "\n")

            ibm1_aligns = Alignment.fromstring(strs[2])
            ibm2_aligns = Alignment.fromstring(strs[3])
            hand_aligns = Alignment.fromstring(strs[4])

            # Evaluate the sentence pair with the metric functions. Their
            # arguments are:
            #   1. Reference ("gold standard"): the hand alignments, in the
            #      same format as the system-produced alignments.
            #   2. Test: the alignments produced by the model, which are
            #      compared against the hand alignments.
            ibm1_precision = precision(hand_aligns, ibm1_aligns)
            ibm1_recall = recall(hand_aligns, ibm1_aligns)
            ibm1_aer = alignment_error_rate(hand_aligns, ibm1_aligns)
            ibm1_f1 = f_measure(hand_aligns, ibm1_aligns)

            ibm2_precision = precision(hand_aligns, ibm2_aligns)
            ibm2_recall = recall(hand_aligns, ibm2_aligns)
            ibm2_aer = alignment_error_rate(hand_aligns, ibm2_aligns)
            ibm2_f1 = f_measure(hand_aligns, ibm2_aligns)

            # Add the scores to the aggregate calculations
            ibm1_precision_sum += ibm1_precision
            ibm1_recall_sum += ibm1_recall
            ibm1_aer_sum += ibm1_aer
            ibm1_f1_sum += ibm1_f1
            ibm2_precision_sum += ibm2_precision
            ibm2_recall_sum += ibm2_recall
            ibm2_aer_sum += ibm2_aer
            ibm2_f1_sum += ibm2_f1

            print("IBM1 Precision: ", ibm1_precision, "\t", "IBM2 Precision: ", ibm2_precision)
            print("IBM1 Recall: ", ibm1_recall, "\t", "IBM2 Recall: ", ibm2_recall)
            print("IBM1 AER:", ibm1_aer, "\t", "IBM2 AER: ", ibm2_aer)
            print("IBM1 F1: ", ibm1_f1, "\t", "IBM2 F1: ", ibm2_f1)
            print("-" * 47, "\n")

    # Print the total statistics of the dataset
    print("-" * 23, "AVERAGE STATS", "-" * 23)
    if sentence_pairs == 0:
        # Averages are undefined without data; avoid dividing by zero.
        print("No sentence pairs found; nothing to average.")
        return

    print("Average IBM1 Precision: ", ibm1_precision_sum / sentence_pairs, "\t" * 2,
          "Average IBM2 Precision: ", ibm2_precision_sum / sentence_pairs)
    print("Average IBM1 Recall: ", ibm1_recall_sum / sentence_pairs, "\t" * 2,
          "Average IBM2 Recall: ", ibm2_recall_sum / sentence_pairs)
    print("Average IBM1 AER:", ibm1_aer_sum / sentence_pairs, "\t" * 2,
          "Average IBM2 AER: ", ibm2_aer_sum / sentence_pairs)
    print("Average IBM1 F1: ", ibm1_f1_sum / sentence_pairs, "\t" * 2,
          "Average IBM2 F1: ", ibm2_f1_sum / sentence_pairs)
if word.isspace(): pass if word[0].isdigit(): alignment.append(word.replace('\n', '')) elif word[0].isupper(): pron.append(word) alignments.append((list(line[0]), pron, " ".join(alignment))) #else: # break #iterations = iterations - 1 f.close() return alignments align_sents = read_corpus(file) bitext = [] for alignment in align_sents: bitext.append(AlignedSent(alignment[0], alignment[1], Alignment.fromstring(alignment[2]))) model = IBMModel1(bitext, 5) test_sentence = bitext[2] print test_sentence.words print test_sentence.mots print test_sentence.alignment #print('{0:.3f}'.format(model.translation_table['a']['AA'])) #print bitext[365]
from pprint import pprint

import goslate

# Dictionary lookup for "sun" -> French through the translate.google.fr
# service URL; a second client with default settings is created as well.
gsd = goslate.Goslate(service_urls=['http://translate.google.fr'])
gs = goslate.Goslate()
sun_fr = gsd.lookup_dictionary('sun', 'fr')
print('Goslate')
pprint(sun_fr)

from nltk.translate import AlignedSent, Alignment

# An AlignedSent needs a parallel sentence pair plus the word alignment
# between the two sides.
algnsent = AlignedSent(
    words=['klein', 'ist', 'das', 'Haus'],
    mots=['the', 'house', 'is', 'small'],
    alignment=Alignment.fromstring('0-2 1-3 2-1 3-0'),
)
print("nltk translate")
print(algnsent.words, algnsent.mots)