def evaluate(self, data, ref_alignments, batch_size=4, training=False):
    """Evaluate the model on a data set."""
    ref_align = read_naacl_alignments(ref_alignments)
    ref_iterator = iter(ref_align)
    metric = AERSufficientStatistics()
    accuracy_correct = 0
    accuracy_total = 0
    loss_total = 0
    steps = 0.

    for batch_id, batch in enumerate(iterate_minibatches(data, batch_size=batch_size)):
        x, y = prepare_data(batch, self.x_vocabulary, self.y_vocabulary)
        y_len = np.sum(np.sign(y), axis=1, dtype="int64")

        align, prob, acc_correct, acc_total, loss = self.get_viterbi(x, y, training)
        accuracy_correct += acc_correct
        accuracy_total += acc_total
        loss_total += loss
        steps += 1

        for alignment, N, (sure, probable) in zip(align, y_len, ref_iterator):
            # the evaluation ignores NULL links, so we discard them;
            # j is 1-based in the NAACL format
            pred = set((aj, j) for j, aj in enumerate(alignment[:N], 1) if aj > 0)
            metric.update(sure=sure, probable=probable, predicted=pred)

    accuracy = accuracy_correct / float(accuracy_total)
    return metric.aer(), accuracy, loss_total / float(steps)
def calculate_aer(predictions):
    # 1. Read in the gold alignments
    gold_sets = read_naacl_alignments('data/validation/dev.wa.nonullalign')

    # 2. Compute AER from the sufficient statistics
    metric = AERSufficientStatistics()
    for gold, pred in zip(gold_sets, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)
    return metric.aer()
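A minimal usage sketch for calculate_aer above; toy_predictions and its contents are illustrative assumptions, not real model output. The only requirement is one set of 1-based (english, french) links per validation sentence, with NULL links already dropped, matching the convention used in the snippets here.

# Hypothetical call to calculate_aer; the alignments below are made-up examples.
toy_predictions = [
    {(1, 1), (2, 2), (3, 4)},  # hypothetical links for the first dev sentence
    {(1, 2), (2, 1)},          # hypothetical links for the second dev sentence
]
print("AER: {:.4f}".format(calculate_aer(toy_predictions)))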
def calculate_aer(
        self,
        validation_corpus: List[Tuple[str, str]],
        validation_gold: List[List[Tuple[Set[int], Set[int]]]]) -> float:
    """Calculate AER on validation corpus using gold standard"""
    predictions = map(self.align, validation_corpus)

    # Compute AER
    metric = AERSufficientStatistics()
    for gold, pred in zip(validation_gold, predictions):
        sure, probable = gold
        metric.update(sure=sure, probable=probable, predicted=pred)
    return metric.aer()
def get_validation_metrics(self) -> Metrics:
    log_data_probability = 0
    entropy = 0
    predicted_alignments = []

    for sentence in self.data.validation_data:
        log_sentence_probability = 0
        sentence_alignment = []
        alignment = self.get_best_alignment(sentence, False)

        for word_alignment in alignment.word_alignments:
            log_sentence_probability += math.log(word_alignment.probability)
            if word_alignment.english != 0:  # skip NULL alignments
                sentence_alignment.append(
                    # french alignments start from 1
                    (word_alignment.english, word_alignment.french + 1)
                )

        entropy += -log_sentence_probability
        log_data_probability += log_sentence_probability
        predicted_alignments.append(set(sentence_alignment))

    data_probability = math.exp(log_data_probability)
    aer = AERSufficientStatistics(self.validation_gold_alignments,
                                  predicted_alignments).aer()
    perplexity = entropy
    return Metrics(data_probability, aer, perplexity)
def calculate_aer(self, eval_alignment_path, test_alignments):
    gold_standard = read_naacl_alignments(eval_alignment_path)

    metric = AERSufficientStatistics()
    for gold_alignments, test_alignment in zip(gold_standard, test_alignments):
        metric.update(sure=gold_alignments[0],
                      probable=gold_alignments[1],
                      predicted=test_alignment)

    aer = metric.aer()
    self.aer.append(aer)
    print("AER: {}".format(aer))
def evaluate_model(model, alignment_path, parallel_corpus, predictions_file_path=None):
    # 1. Read in gold alignments
    gold_sets = read_naacl_alignments(alignment_path)  # pairs are in format (e_w_indx, f_w_indx)

    # 2. Here I have the predictions of my own algorithm
    predictions = []
    sentence_number = 0
    if predictions_file_path:
        write_file = open(predictions_file_path, 'w')

    for (french_sentence, english_sentence), (s, _) in zip(parallel_corpus, gold_sets):
        sentence_number += 1
        alignment = model.infer_alignment(french_sentence, english_sentence)
        temp_pred = []
        for i, a in enumerate(alignment):
            # skip null-token alignments
            if a == 0:
                continue
            temp_pred.append((a, i + 1))
            if predictions_file_path:
                write_file.write("%04d %d %d %s\n" % (sentence_number, a, i + 1, "P"))
        predictions.append(set(temp_pred))

    if predictions_file_path:
        write_file.close()

    # 3. Compute AER
    # first we get an object that manages sufficient statistics
    metric = AERSufficientStatistics()
    # then we iterate over the corpus
    for gold, pred in zip(gold_sets, predictions):
        metric.update(sure=gold[0], probable=gold[1], predicted=pred)

    # AER
    return metric.aer()
# Write alignments for each trained Model 2 variant
write_alignments(model, 'ibm2-uniform.mle.naacl')

model = Model2(data, None, 'random')
model.load_parameters('parameters')
write_alignments(model, 'ibm2-random.mle.naacl')

model = Model2(data, None, 'ibm1')
model.load_parameters('parameters')
write_alignments(model, 'ibm2-ibm1.mle.naacl')

model = BayesianModel2(data, None, 0.1)
model.load_parameters('parameters')
write_alignments(model, 'ibm2.vb.naacl')

model = JumpingModel2(data, None, 'random')
model.load_parameters('parameters')
write_alignments(model, 'ibm2-jumps.mle.naacl')

# Load the gold test alignments and score every prediction file
testing_gold_alignment_pickle = 'pickles/testing_gold_alignments.pickle'
with open(testing_gold_alignment_pickle, 'rb') as file:
    testing_gold_alignments = pickle.load(file)

for file in os.listdir('predictions'):
    if file.endswith('.naacl'):
        predictions = []
        for prediction in read_naacl_alignments('predictions/{}'.format(file)):
            predictions.append(prediction[0])
        aer = AERSufficientStatistics(testing_gold_alignments, predictions).aer()
        print('{}: {}'.format(file, round(aer, 5)))