Example #1
0
def evaluate_cosine_similarity(evaluated_sentences, reference_sentences):
    """Return the cosine similarity between two collections of sentences.

    The ``words`` of every sentence in each collection are flattened into a
    single tuple, each tuple is wrapped in a ``TfDocumentModel``, and the two
    models are compared with ``cosine_similarity``.
    """
    # Flatten the per-sentence word sequences into one flat tuple per side.
    evaluated_flat = tuple(
        chain.from_iterable(sentence.words for sentence in evaluated_sentences)
    )
    reference_flat = tuple(
        chain.from_iterable(sentence.words for sentence in reference_sentences)
    )

    evaluated_tf = TfDocumentModel(evaluated_flat)
    reference_tf = TfDocumentModel(reference_flat)
    return cosine_similarity(evaluated_tf, reference_tf)
Example #2
0
def analyze_summaries(book_id, args):
    """
    Analyze the summary.
    Compares the created summary with the ground truth summary.
    The ground truth summary is located via
    ``get_data_filename(book_id, 'summaries')`` and the created summary via
    ``get_results_filename(book_id, args)``.

    Parameters:
    book_id: (str) the book identifier
    args: the command line arguments, used to determine the filename for the created summary.

    Outputs:
    Saves the analysis to a csv file (path from ``get_analysis_filename``);
    the ``../results/analysis`` directory is created if it is missing.
    Each row is ``[metric name, value]``. A metric is only written when both
    the reference and the created summary provide the needed object.
    """
    # exist_ok=True replaces the exists()/makedirs() pair, which could raise
    # if the directory appeared between the check and the creation.
    makedirs('../results/analysis', exist_ok=True)

    analysis_data = []
    summary_doc, summary_model = load_summary(
        get_data_filename(book_id, 'summaries'))
    new_summary_doc, new_summary_model = load_summary(
        get_results_filename(book_id, args))

    # NOTE(review): '' appears to be load_summary's "not available" marker;
    # confirm against load_summary.
    if summary_doc != '' and new_summary_doc != '':
        analysis_data.append([
            'word embeddings similarity',
            summary_doc.similarity(new_summary_doc)
        ])
    if summary_model != '' and new_summary_model != '':
        analysis_data.append([
            'cosine similarity',
            cosine_similarity(summary_model, new_summary_model)
        ])

    # newline='' is required by the csv module so the writer controls line
    # endings itself (otherwise blank rows appear on \r\n platforms).
    with open(get_analysis_filename(book_id, args), 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(analysis_data)
Example #3
0
    def test_cosine_half_match(self):
        """Sentences sharing half of their words should score about 0.5."""
        czech = Tokenizer("czech")
        first = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", czech)
        second = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", czech)

        similarity = cosine_similarity(first, second)
        self.assertAlmostEqual(similarity, 0.5)
Example #4
0
    def test_cosine_no_match(self):
        """Sentences with no word in common should score about 0.0."""
        czech = Tokenizer("czech")
        first = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", czech)
        second = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", czech)

        similarity = cosine_similarity(first, second)
        self.assertAlmostEqual(similarity, 0.0)
Example #5
0
    def test_cosine_half_match(self):
        """Check that two partially overlapping sentences yield roughly 0.5."""
        shared_tokenizer = Tokenizer("czech")
        left_model = TfDocumentModel(
            "Veta aká sa tu len veľmi ťažko hľadá", shared_tokenizer)
        right_model = TfDocumentModel(
            "Teta ktorá sa tu iba veľmi zle hľadá", shared_tokenizer)

        score = cosine_similarity(left_model, right_model)
        self.assertAlmostEqual(score, 0.5)
Example #6
0
    def test_cosine_no_match(self):
        """Check that two sentences without shared words yield roughly 0.0."""
        shared_tokenizer = Tokenizer("czech")
        left_model = TfDocumentModel(
            "Toto je moja veta. To sa nedá poprieť!", shared_tokenizer)
        right_model = TfDocumentModel(
            "Hento bolo jeho slovo, ale možno klame.", shared_tokenizer)

        score = cosine_similarity(left_model, right_model)
        self.assertAlmostEqual(score, 0.0)
Example #7
0
def test_cosine_no_match():
    """Sentences with no word in common should score approximately 0.0."""
    czech = Tokenizer("czech")
    first = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", czech)
    second = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", czech)

    similarity = cosine_similarity(first, second)
    assert similarity == approx(0.0)
def test_wrong_arguments():
    """cosine_similarity must raise ValueError when either argument is a plain string."""
    text = "Toto je moja veta, to sa nedá poprieť."
    model = TfDocumentModel(text, Tokenizer("czech"))

    # Every combination in which at least one argument is not a model.
    invalid_pairs = (
        (text, text),
        (text, model),
        (model, text),
    )
    for left, right in invalid_pairs:
        with pytest.raises(ValueError):
            cosine_similarity(left, right)
def test_empty_model():
    """cosine_similarity must raise ValueError when either model was built from no words."""
    text = "Toto je moja veta, to sa nedá poprieť."
    model = TfDocumentModel(text, Tokenizer("czech"))
    empty_model = TfDocumentModel([])

    # Every combination in which at least one side is the empty model.
    invalid_pairs = (
        (empty_model, empty_model),
        (empty_model, model),
        (model, empty_model),
    )
    for left, right in invalid_pairs:
        with pytest.raises(ValueError):
            cosine_similarity(left, right)
Example #10
0
def evaluate(summary, sumref, debug=False):
    """Score a summary against a reference summary.

    Both inputs are converted with ``sumtostr`` and ``sumtotup``. The string
    forms are wrapped in English-tokenized ``TfDocumentModel`` objects for the
    model-based metrics; the tuple forms feed precision, recall and F-score.

    Parameters:
    summary: the summary being evaluated
    sumref: the reference summary
    debug: when true, print the reference tuple and then the summary tuple

    Returns:
    A 5-tuple ``(cosine similarity, unit overlap, precision, recall, f-score)``.
    """
    summary_text = sumtostr(summary)
    summary_items = sumtotup(summary)
    reference_text = sumtostr(sumref)
    reference_items = sumtotup(sumref)
    summary_model = TfDocumentModel(summary_text, Tokenizer("english"))
    reference_model = TfDocumentModel(reference_text, Tokenizer("english"))

    if debug:
        for shown in (reference_items, summary_items):
            print(shown)

    model_pair = (summary_model, reference_model)
    item_pair = (summary_items, reference_items)
    # Tuple elements are evaluated left to right, matching the metric order.
    return (
        cosine_similarity(*model_pair),
        unit_overlap(*model_pair),
        precision(*item_pair),
        recall(*item_pair),
        f_score(*item_pair),
    )
Example #11
0
    def test_cosine_exact_match(self):
        """A model compared with itself should score about 1.0."""
        sentence = "Toto je moja veta, to sa nedá poprieť."
        document = TfDocumentModel(sentence, Tokenizer("czech"))

        similarity = cosine_similarity(document, document)
        self.assertAlmostEqual(similarity, 1.0)
def test_cosine_half_match():
    """Sentences sharing half of their words should score approximately 0.5."""
    czech = Tokenizer("czech")
    first = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", czech)
    second = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", czech)

    similarity = cosine_similarity(first, second)
    assert similarity == approx(0.5)
def test_cosine_no_match():
    """Check that two sentences without shared words yield approximately 0.0."""
    shared_tokenizer = Tokenizer("czech")
    left_model = TfDocumentModel(
        "Toto je moja veta. To sa nedá poprieť!", shared_tokenizer)
    right_model = TfDocumentModel(
        "Hento bolo jeho slovo, ale možno klame.", shared_tokenizer)

    score = cosine_similarity(left_model, right_model)
    assert score == approx(0.0)
def test_cosine_exact_match():
    """A model compared with itself should score approximately 1.0."""
    sentence = "Toto je moja veta, to sa nedá poprieť."
    document = TfDocumentModel(sentence, Tokenizer("czech"))

    similarity = cosine_similarity(document, document)
    assert similarity == approx(1.0)
Example #15
0
    def test_cosine_exact_match(self):
        """Check that comparing a model with itself yields roughly 1.0."""
        source_text = "Toto je moja veta, to sa nedá poprieť."
        tf_model = TfDocumentModel(source_text, Tokenizer("czech"))

        score = cosine_similarity(tf_model, tf_model)
        self.assertAlmostEqual(score, 1.0)