def evaluate_unit_overlap(evaluated_sentences, reference_sentences):
    # Flatten each list of sentences into one word sequence, build a TF model
    # for each side, and score how much their vocabularies overlap.
    evaluated_words = tuple(chain(*(s.words for s in evaluated_sentences)))
    reference_words = tuple(chain(*(s.words for s in reference_sentences)))
    evaluated_model = TfDocumentModel(evaluated_words)
    reference_model = TfDocumentModel(reference_words)
    return unit_overlap(evaluated_model, reference_model)
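# A minimal usage sketch (an assumption, not taken from the snippets above): the
# helper expects sumy Sentence-like objects exposing a ``words`` attribute, so one
# way to obtain them is via PlaintextParser. The demo texts are made up.
from itertools import chain

from sumy.evaluation import unit_overlap
from sumy.models import TfDocumentModel
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser

def _demo_unit_overlap():
    tokenizer = Tokenizer("english")
    summary = PlaintextParser.from_string("The cat sat on the mat.", tokenizer).document.sentences
    reference = PlaintextParser.from_string("A cat was sitting on the mat.", tokenizer).document.sentences
    # Returns a value in [0, 1]; higher means more shared vocabulary.
    return evaluate_unit_overlap(summary, reference)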
def test_unit_overlap_half_match(self):
    tokenizer = Tokenizer("czech")
    model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", tokenizer)
    model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", tokenizer)
    self.assertAlmostEqual(unit_overlap(model1, model2), 1 / 3)
def test_unit_overlap_no_match(self):
    tokenizer = Tokenizer("czech")
    model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", tokenizer)
    model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", tokenizer)
    self.assertAlmostEqual(unit_overlap(model1, model2), 0.0)
def test_unit_overlap_no_match():
    tokenizer = Tokenizer("czech")
    model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", tokenizer)
    model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", tokenizer)
    assert unit_overlap(model1, model2) == approx(0.0)
def test_unit_overlap_wrong_arguments():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)
    with pytest.raises(ValueError):
        unit_overlap("model", "model")
    with pytest.raises(ValueError):
        unit_overlap("model", model)
    with pytest.raises(ValueError):
        unit_overlap(model, "model")
def evaluate(summary, sumref, debug=False):
    # Build string and tuple representations of the summary and its reference,
    # then compute content-based metrics (cosine similarity, unit overlap) and
    # sentence-set metrics (precision, recall, F-score).
    sumstring = sumtostr(summary)
    sumtuple = sumtotup(summary)
    refstring = sumtostr(sumref)
    reftuple = sumtotup(sumref)
    summodel = TfDocumentModel(sumstring, Tokenizer("english"))
    refmodel = TfDocumentModel(refstring, Tokenizer("english"))
    if debug:
        print(reftuple)
        print(sumtuple)
    cos_val = cosine_similarity(summodel, refmodel)
    unit_val = unit_overlap(summodel, refmodel)
    precision_val = precision(sumtuple, reftuple)
    recall_val = recall(sumtuple, reftuple)
    f_val = f_score(sumtuple, reftuple)
    return cos_val, unit_val, precision_val, recall_val, f_val
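# A hedged usage sketch: ``sumtostr`` and ``sumtotup`` are helpers defined
# elsewhere in the same codebase (not shown here), so this only illustrates how
# the returned 5-tuple might be consumed. The function name is illustrative.
def report(summary, sumref):
    cos_val, unit_val, precision_val, recall_val, f_val = evaluate(summary, sumref)
    print(f"cosine={cos_val:.3f} overlap={unit_val:.3f} "
          f"P={precision_val:.3f} R={recall_val:.3f} F={f_val:.3f}")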
def test_unit_overlap_exact_match(self):
    tokenizer = Tokenizer("czech")
    model = TfDocumentModel("Veta aká sa len veľmi ťažko hľadá.", tokenizer)
    self.assertAlmostEqual(unit_overlap(model, model), 1.0)
def test_unit_overlap_half_match():
    tokenizer = Tokenizer("czech")
    model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", tokenizer)
    model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", tokenizer)
    assert unit_overlap(model1, model2) == approx(1 / 3)
def test_unit_overlap_exact_match():
    tokenizer = Tokenizer("czech")
    model = TfDocumentModel("Veta aká sa len veľmi ťažko hľadá.", tokenizer)
    assert unit_overlap(model, model) == approx(1.0)
def test_unit_overlap_empty():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)
    with pytest.raises(ValueError):
        unit_overlap(model, model)
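# The expected values in the tests above (1.0 for an exact match, 1/3 for the
# "half match", 0.0 for no match, ValueError for empty or non-model arguments)
# are consistent with a Jaccard-style overlap of the two term sets. The sketch
# below is an assumption inferred from those expectations, not sumy's actual
# implementation: in the "half match" sentences, 4 of 12 distinct words are
# shared, hence 1/3.
def term_set_overlap(terms_a, terms_b):
    a, b = set(terms_a), set(terms_b)
    if not a or not b:
        raise ValueError("Both term sets must be non-empty.")
    return len(a & b) / len(a | b)

assert abs(term_set_overlap(
    "Veta aká sa tu len veľmi ťažko hľadá".lower().split(),
    "Teta ktorá sa tu iba veľmi zle hľadá".lower().split(),
) - 1 / 3) < 1e-9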