def evaluate(self, file_name, line, original_sequence, corrupt_sequence, predicted_sequence,
             ground_truth_corruptions=None, probability_predicted_corruption_pairs=None,
             runtime=0, evaluate_ed=True):
    """Score one predicted sequence against the ground truth and record the result.

    Ground-truth and predicted corruptions are derived from the sequences when
    not supplied by the caller. Appends a SequenceResult to
    self.sequence_results and returns it.
    """
    # Fall back to deriving the ground truth from the original/corrupt pair.
    if ground_truth_corruptions is None:
        ground_truth_corruptions = get_space_corruptions(original_sequence, corrupt_sequence)

    # Fall back to deriving predictions from the predicted sequence, assigning
    # probability 1 to every detected corruption.
    if probability_predicted_corruption_pairs is None:
        detected = get_space_corruptions(predicted_sequence, corrupt_sequence,
                                         ignore_other_insertions=False)
        probability_predicted_corruption_pairs = [(1, corruption) for corruption in detected]

    # Map each predicted corruption to its probability; the key view doubles
    # as the set of predicted corruptions.
    probabilities = {
        corruption: probability
        for probability, corruption in probability_predicted_corruption_pairs
    }
    predicted_corruptions = probabilities.keys()

    # Edit distances (no substitutions) before and after correction; skipping
    # the computation when evaluate_ed is False yields zeros.
    if evaluate_ed:
        ed_before = levenshtein(original_sequence, corrupt_sequence, substitutions=False)
        ed_after = levenshtein(original_sequence, predicted_sequence, substitutions=False)
    else:
        ed_before = ed_after = 0

    # True positives, false positives and false negatives per corruption type.
    tp_insertions, fp_insertions, fn_insertions = tp_fp_fn_by_type(
        ground_truth_corruptions, predicted_corruptions, CorruptionType.INSERTION)
    tp_deletions, fp_deletions, fn_deletions = tp_fp_fn_by_type(
        ground_truth_corruptions, predicted_corruptions, CorruptionType.DELETION)

    # Bundle everything into one result record and keep it.
    sequence_result = SequenceResult(
        file_name=file_name,
        line=line,
        original_sequence=original_sequence,
        corrupt_sequence=corrupt_sequence,
        predicted_sequence=predicted_sequence,
        ground_truth_corruptions=ground_truth_corruptions,
        predicted_corruption_probabilities=probabilities,
        tp_insertions=tp_insertions,
        fp_insertions=fp_insertions,
        fn_insertions=fn_insertions,
        tp_deletions=tp_deletions,
        fp_deletions=fp_deletions,
        fn_deletions=fn_deletions,
        ed_before=ed_before,
        ed_after=ed_after,
        runtime=runtime)
    self.sequence_results.append(sequence_result)
    return sequence_result
def add_predictions(self, key: str, predicted_sequence: str):
    """Register one predictor's output sequence and attribute its corruptions to it.

    Any corruption detected in the predicted sequence that is not yet tracked
    is added as a non-ground-truth prediction; the predictor key is attached
    to every corruption it produced.
    """
    self.predicted_sequences[key] = predicted_sequence
    for corruption in get_space_corruptions(predicted_sequence, self.corrupt_sequence):
        entry = self.predictions.get(corruption)
        if entry is None:
            # First predictor to produce this corruption: track it as a
            # prediction that is not part of the ground truth.
            entry = Prediction(ground_truth=False)
            self.predictions[corruption] = entry
        entry.add_predictor_key(key)
def __init__(self, correct_sequence: str, corrupt_sequence: str):
    """Store the sequence pair and seed the prediction table with the ground truth.

    Every corruption needed to turn correct_sequence into corrupt_sequence
    starts out as a ground-truth Prediction with no predictor keys attached.
    """
    self.correct_sequence = correct_sequence
    self.corrupt_sequence = corrupt_sequence
    self.ground_truth_corruptions = get_space_corruptions(correct_sequence, corrupt_sequence)
    self.predictions = {}
    for corruption in self.ground_truth_corruptions:
        self.predictions[corruption] = Prediction(ground_truth=True)
    self.predicted_sequences = {}
def test_get_space_corruptions(self):
    """get_space_corruptions reports each inserted and deleted space with its position."""
    original = "This is a test sequence. "
    corrupt = " This isa test seq uence."
    # Leading space inserted, two spaces dropped, one spurious space inside a word.
    expected = [
        Corruption(CorruptionType.INSERTION, 0, ' '),
        Corruption(CorruptionType.DELETION, 8, ' '),
        Corruption(CorruptionType.INSERTION, 18, ' '),
        Corruption(CorruptionType.DELETION, 25, ' ')
    ]
    self.assertEqual(expected, get_space_corruptions(original, corrupt))
def predict(self, sequence):
    """Run a candidate-queue search for the best correction of `sequence`.

    Returns a pair: an iterator of (probability, corruption) tuples (all
    probabilities are dummy 1s) and the best sequence found by the queue.
    Prints each popped candidate's log-likelihood and the final best sequence.
    """
    q = CandidateQueue(self.corrector, self.score, tolerance_steps=self.tolerance_steps)
    # Seed the queue with the input sequence itself.
    q.add_candidate(sequence, q.get_log_likelihood(sequence))
    while not q.terminated():
        candidate = q.pop()
        log_likelihood = q.get_log_likelihood(candidate)
        print(log_likelihood, candidate)
        # Expand the popped candidate; the spelling mode uses a different
        # candidate generator.
        if self.spelling:
            expansions = q.get_candidates_spelling(candidate)
        else:
            expansions = q.get_candidates(candidate)
        for expansion_score, expansion in expansions:
            q.add_candidate(expansion, expansion_score)
    if self.spelling:
        predictions = []  # TODO spelling predictions
    else:
        predictions = get_space_corruptions(q.best_sequence, sequence)
    dummy_probs = [1 for _ in predictions]
    print(q.best_sequence)
    return zip(dummy_probs, predictions), q.best_sequence
# Script: measure the tokenization (whitespace) error distribution of the ACL
# benchmark (development + test) and save/plot it as a histogram.
if __name__ == "__main__":
    # NOTE(review): exclude_zero is parsed but never used in the visible span
    # — presumably consumed further down; verify against the full file.
    exclude_zero = "no-zero" in sys.argv
    out_folder = "acl_error_distribution/"
    absolute_values = []  # whitespace-edit counts per sequence
    error_rates = []      # whitespace edits divided by sequence length
    for subset in (Subset.DEVELOPMENT, Subset.TEST):
        benchmark = Benchmark("ACL", subset)
        for correct, corrupt in benchmark.get_sequence_pairs(
                BenchmarkFiles.CORRUPT):
            print(corrupt)
            print(correct)
            # Whitespace edits turning the correct sequence into the corrupt one.
            edits = get_space_corruptions(correct, corrupt)
            n_edits = len(edits)
            n_chars = len(correct)
            ratio = n_edits / n_chars
            absolute_values.append(n_edits)
            error_rates.append(ratio)
    # NOTE(review): placement after the subset loop inferred from the
    # "development+test" subtitle — confirm against the original file's
    # indentation.
    save_histogram_data(error_rates,
                        out_folder + "tokenization_character_error_rates.txt")
    plot_rate_histogram(
        error_rates,
        title="Tokenization character error rates",
        subtitle="ACL development+test",
        xlabel="Tokenization character error rate (whitespace errors / characters)",
        # NOTE(review): source is truncated here mid-expression; the remainder
        # of this call is outside the visible span.
        save_path=out_folder +
def get_ground_truth(self):
    """Return the set of whitespace corruptions between self.correct and self.corrupt."""
    corruptions = get_space_corruptions(self.correct, self.corrupt)
    return set(corruptions)