def evaluate_cosine_similarity(evaluated_sentences, reference_sentences):
    """Return the cosine similarity between two collections of sentences.

    The ``words`` of every sentence in each collection are flattened into one
    tuple, each tuple is wrapped in a ``TfDocumentModel``, and the two models
    are handed to ``cosine_similarity``.
    """
    def flatten(sentences):
        # Concatenate the per-sentence word sequences into a single tuple.
        return tuple(chain.from_iterable(sentence.words for sentence in sentences))

    evaluated_terms = flatten(evaluated_sentences)
    reference_terms = flatten(reference_sentences)

    evaluated_tf = TfDocumentModel(evaluated_terms)
    reference_tf = TfDocumentModel(reference_terms)
    return cosine_similarity(evaluated_tf, reference_tf)
def analyze_summaries(book_id, args):
    """
    Analyze the summary.

    Compares the created summary with the ground truth summary. The ground
    truth summary is loaded from the path returned by
    ``get_data_filename(book_id, 'summaries')`` and the created summary from
    ``get_results_filename(book_id, args)``.

    A metric is only recorded when both of its inputs are not the empty
    string; ``load_summary`` presumably returns ``''`` for a summary it could
    not load (confirm against its definition).

    Parameters:
        book_id: (str) the book identifier
        args: the command line arguments, used to determine the filename
            for the created summary.

    Outputs:
        Saves the analysis to a csv file at the path returned by
        ``get_analysis_filename(book_id, args)``; the ``../results/analysis``
        directory is created if it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    makedirs('../results/analysis', exist_ok=True)

    summary_doc, summary_model = load_summary(
        get_data_filename(book_id, 'summaries'))
    new_summary_doc, new_summary_model = load_summary(
        get_results_filename(book_id, args))

    analysis_data = []
    if summary_doc != '' and new_summary_doc != '':
        analysis_data.append([
            'word embeddings similarity',
            summary_doc.similarity(new_summary_doc),
        ])
    if summary_model != '' and new_summary_model != '':
        analysis_data.append([
            'cosine similarity',
            cosine_similarity(summary_model, new_summary_model),
        ])

    # The csv module requires newline='' so that it controls line endings
    # itself; otherwise blank rows appear on platforms that translate '\n'.
    with open(get_analysis_filename(book_id, args), 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(analysis_data)
def test_cosine_half_match(self):
    # The two sentences have four of their eight words in common
    # ("sa", "tu", "veľmi", "hľadá"); the expected similarity is 0.5.
    czech = Tokenizer("czech")
    first = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", czech)
    second = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", czech)

    similarity = cosine_similarity(first, second)

    self.assertAlmostEqual(similarity, 0.5)
def test_cosine_no_match(self):
    # The two texts have no word in common; the expected similarity is 0.0.
    czech = Tokenizer("czech")
    first = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", czech)
    second = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", czech)

    similarity = cosine_similarity(first, second)

    self.assertAlmostEqual(similarity, 0.0)
def test_cosine_no_match():
    """Texts without any word in common are expected to score about 0.0."""
    czech = Tokenizer("czech")
    first = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", czech)
    second = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", czech)

    similarity = cosine_similarity(first, second)

    assert similarity == approx(0.0)
def test_wrong_arguments():
    """``cosine_similarity`` must raise ``ValueError`` when an argument is a raw string.

    Checked for string/string, string/model and model/string, in that order.
    """
    sentence = "Toto je moja veta, to sa nedá poprieť."
    document = TfDocumentModel(sentence, Tokenizer("czech"))

    invalid_pairs = (
        (sentence, sentence),
        (sentence, document),
        (document, sentence),
    )
    for left, right in invalid_pairs:
        with pytest.raises(ValueError):
            cosine_similarity(left, right)
def test_empty_model():
    """``cosine_similarity`` must raise ``ValueError`` when a model was built from ``[]``.

    Checked for empty/empty, empty/non-empty and non-empty/empty, in that order.
    """
    sentence = "Toto je moja veta, to sa nedá poprieť."
    document = TfDocumentModel(sentence, Tokenizer("czech"))
    blank = TfDocumentModel([])

    invalid_pairs = (
        (blank, blank),
        (blank, document),
        (document, blank),
    )
    for left, right in invalid_pairs:
        with pytest.raises(ValueError):
            cosine_similarity(left, right)
def evaluate(summary, sumref, debug=False):
    """Score ``summary`` against the reference summary ``sumref``.

    Both inputs are converted with ``sumtostr`` (fed to a ``TfDocumentModel``
    using an English ``Tokenizer``) and with ``sumtotup`` (fed to the
    precision, recall and f-score functions).

    When ``debug`` is true, the reference tuple and then the summary tuple
    are printed before scoring.

    Returns:
        A 5-tuple ``(cosine similarity, unit overlap, precision, recall,
        f-score)``.
    """
    candidate_text, candidate_items = sumtostr(summary), sumtotup(summary)
    reference_text, reference_items = sumtostr(sumref), sumtotup(sumref)

    candidate_model = TfDocumentModel(candidate_text, Tokenizer("english"))
    reference_model = TfDocumentModel(reference_text, Tokenizer("english"))

    if debug:
        print(reference_items)
        print(candidate_items)

    # Tuple elements are evaluated left to right, matching the original
    # order in which the metrics were computed.
    return (
        cosine_similarity(candidate_model, reference_model),
        unit_overlap(candidate_model, reference_model),
        precision(candidate_items, reference_items),
        recall(candidate_items, reference_items),
        f_score(candidate_items, reference_items),
    )
def test_cosine_exact_match(self):
    # A model compared with itself is expected to score 1.0.
    sentence = "Toto je moja veta, to sa nedá poprieť."
    document = TfDocumentModel(sentence, Tokenizer("czech"))

    similarity = cosine_similarity(document, document)

    self.assertAlmostEqual(similarity, 1.0)
def test_cosine_half_match():
    """Sentences with four of their eight words in common are expected to score about 0.5."""
    czech = Tokenizer("czech")
    first = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", czech)
    second = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", czech)

    similarity = cosine_similarity(first, second)

    assert similarity == approx(0.5)
def test_cosine_exact_match():
    """A model compared with itself is expected to score about 1.0."""
    sentence = "Toto je moja veta, to sa nedá poprieť."
    document = TfDocumentModel(sentence, Tokenizer("czech"))

    similarity = cosine_similarity(document, document)

    assert similarity == approx(1.0)