Ejemplo n.º 1
0
def test_similarity_of_two_sets_using_w_shingles():

    print ".....Testing w-shingles (shingling, minhash & calc jaccard similarity)\n"

    min_values_list_w_shingles = None
    for shingle, original_document in shingle_generator(faux_generator_string_words(), type=ShingleType.W_SHINGLES):
        print shingle
        min_values_list_w_shingles = run(shingle)
        print "number of min_hash values -> %s" % str(len(min_values_list_w_shingles))
        print min_values_list_w_shingles
        print

    min_values_list_w_shingles_2 = None
    for shingle, original_document in shingle_generator(faux_generator_string_words_2(), type=ShingleType.W_SHINGLES):
        print shingle
        min_values_list_w_shingles_2 = run(shingle)
        print "number of min_hash values -> %s" % str(len(min_values_list_w_shingles_2))
        print min_values_list_w_shingles_2
        print

    # calculate jaccard similarity - should be approx 44% similar
    similarity_ratio = jaccard_similarity(set(min_values_list_w_shingles), set(min_values_list_w_shingles_2))
    print "Asserting jaccard similarity should be ~44%\n"

    assert similarity_ratio >= .44
def test_sets_exact_match_returns_1():
    # Two sets holding the same single element are expected to score 1.0.

    # arrange
    left = {"abcdef"}
    right = {"abcdef"}

    # act
    score = jaccard_similarity(left, right)

    # verify
    nt.eq_(score, 1.0)
def test_both_sets_empty():
    # Comparing two empty sets is expected to score 0.

    # arrange
    left = set()
    right = set()

    # act
    score = jaccard_similarity(left, right)

    # verify
    nt.eq_(score, 0)
def test_sets_not_similar_returns_0():
    # Sets that share no element are expected to score 0.0.

    # arrange
    left = {"abcdef"}
    right = {"test_set_not_similar"}

    # act
    score = jaccard_similarity(left, right)

    # verify
    nt.eq_(score, 0.0)
def test_sets_at_least_50_percent_similar():
    # One shared element out of two distinct elements overall: the score
    # must not drop below one half.

    # arrange
    smaller = {"abcdef"}
    larger = {"abcd", "abcdef"}

    # act
    score = jaccard_similarity(smaller, larger)

    # verify
    assert score >= 0.5
def test_both_sets_empty():
    # With nothing in either set the similarity is expected to be 0.

    # given
    empty_a = set()
    empty_b = set()

    # when
    similarity = jaccard_similarity(empty_a, empty_b)

    # then
    nt.eq_(similarity, 0)
def test_sets_exact_match_returns_1():
    # Identical one-element sets are expected to give a similarity of 1.0.

    # given
    set_a = {"abcdef"}
    set_b = {"abcdef"}

    # when
    similarity = jaccard_similarity(set_a, set_b)

    # then
    nt.eq_(similarity, 1.0)
def test_sets_at_least_50_percent_similar():
    # The second set contains the first set's only element plus one extra,
    # so the similarity must be at least 0.5.

    # given
    set_a = {"abcdef"}
    set_b = {"abcd", "abcdef"}

    # when
    similarity = jaccard_similarity(set_a, set_b)

    # then
    assert similarity >= 0.5
def test_sets_not_similar_returns_0():
    # Completely disjoint sets are expected to give a similarity of 0.0.

    # given
    set_a = {"abcdef"}
    set_b = {"test_set_not_similar"}

    # when
    similarity = jaccard_similarity(set_a, set_b)

    # then
    nt.eq_(similarity, 0.0)
    def _calculate_similarity_score(self, document_1, document_2):
        """
            Calculate similarity score for givens documents.
            :param document_1:
            :param document_2:
            :return: 0.0 if score can't be calculated otherwise returns calculated value
        """
        score = 0.0

        if document_1 and document_2:
            shingles_set_1 = document_1.get_shingles_as_set()
            shingles_set_2 = document_2.get_shingles_as_set()

            score = jaccard_similarity(shingles_set_1, shingles_set_2)

        return score
    def _calculate_similarity_score(self, document_1, document_2):
        """
            Calculate similarity score for givens documents.
            :param document_1:
            :param document_2:
            :return: 0.0 if score can't be calculated otherwise returns calculated value
        """
        score = 0.0

        if document_1 and document_2:
            shingles_set_1 = document_1.get_shingles_as_set()
            shingles_set_2 = document_2.get_shingles_as_set()

            score = jaccard_similarity(shingles_set_1, shingles_set_2)

        return score