Beispiel #1
0
def test_compare_hex():
    """
    Check compare_digests against two stored digests whose nilsimsa
    score is known in advance.
    """
    first_sid = "1352396387-81c1161097f9f00914e1b152ca4c0f46"
    second_sid = "1338103128-006193af403dcc90c962184df08960a3"
    # look the digests up by stream id, then compare them
    first_digest = sid_to_nil[first_sid]
    second_digest = sid_to_nil[second_sid]
    observed = compare_digests(first_digest, second_digest)
    assert observed == 95
Beispiel #2
0
        def accumulating_predicate(item):
            """Decide whether one item passes the near-duplicate filter.

            ``item`` is a ``(content_id, fc)`` pair.  Returns ``True`` to
            let the item through and ``False`` to filter it out.  When an
            item with nilsimsa hashes is let through, each of its hashes
            is stored in the enclosing ``accumulator`` (mapped to the
            item's ``content_id``) so later items are compared against it.
            """
            # Unpack in the body rather than in the signature: tuple
            # parameters are Python 2 only syntax (removed by PEP 3113).
            content_id, fc = item

            sim_feature = get_string_counter(fc, self.nilsimsa_feature_name)
            if not sim_feature:
                # no hashes to compare, so allow it through untracked
                return True

            if any(nhash in accumulator for nhash in sim_feature):
                # either exact duplicate, or darn close (see
                # test_nilsimsa_exact_match), so filter it and no
                # need to update accumulator
                return False

            # NOTE(review): self.threshold is handed to compare_digests
            # unscaled but is compared below against the score after it
            # has been divided by 128 -- confirm the intended unit.
            for hash1, hash2 in product(sim_feature, accumulator):
                score = nilsimsa.compare_digests(hash1,
                                                 hash2,
                                                 threshold=self.threshold)
                score /= 128.0
                if score > self.threshold:
                    # near duplicate, so filter and do not accumulate
                    return False

            # not a duplicate of anything seen so far: remember its hashes
            for nhash in sim_feature:
                accumulator[nhash] = content_id

            # allow it through
            return True
Beispiel #3
0
def test_compare_hex():
    """
    Compare two stored digests with compare_digests and verify the
    result equals the known nilsimsa score for that document pair.
    """
    expected_score = 95
    sid_a = "1352396387-81c1161097f9f00914e1b152ca4c0f46"
    sid_b = "1338103128-006193af403dcc90c962184df08960a3"
    digest_a = sid_to_nil[sid_a]
    digest_b = sid_to_nil[sid_b]
    assert compare_digests(digest_a, digest_b) == expected_score
Beispiel #4
0
        def accumulating_predicate(item):
            """Decide whether one item passes the near-duplicate filter.

            ``item`` is a ``(content_id, fc)`` pair.  Returns ``True`` to
            let the item through and ``False`` to filter it out.  When an
            item with nilsimsa hashes is let through, each of its hashes
            is stored in the enclosing ``accumulator`` (mapped to the
            item's ``content_id``) so later items are compared against it.
            """
            # Unpack in the body rather than in the signature: tuple
            # parameters are Python 2 only syntax (removed by PEP 3113).
            content_id, fc = item

            sim_feature = get_string_counter(fc, self.nilsimsa_feature_name)
            if not sim_feature:
                # no hashes to compare, so allow it through untracked
                return True

            if any(nhash in accumulator for nhash in sim_feature):
                # either exact duplicate, or darn close (see
                # test_nilsimsa_exact_match), so filter it and no
                # need to update accumulator
                return False

            # NOTE(review): self.threshold is handed to compare_digests
            # unscaled but is compared below against the score after it
            # has been divided by 128 -- confirm the intended unit.
            for hash1, hash2 in product(sim_feature, accumulator):
                score = nilsimsa.compare_digests(hash1, hash2,
                                                 threshold=self.threshold)
                score /= 128.0
                if score > self.threshold:
                    # near duplicate, so filter and do not accumulate
                    return False

            # not a duplicate of anything seen so far: remember its hashes
            for nhash in sim_feature:
                accumulator[nhash] = content_id

            # allow it through
            return True
Beispiel #5
0
def test_compare_threshold():
    """
    Check the early bail-out of compare_digests: the two documents have
    a known score and the threshold is set well above it, so the
    comparison stops early and reports one less than the threshold.
    """
    threshold = 110
    first_sid = "1352396387-81c1161097f9f00914e1b152ca4c0f46"
    second_sid = "1338103128-006193af403dcc90c962184df08960a3"
    first_digest = sid_to_nil[first_sid]
    second_digest = sid_to_nil[second_sid]
    score = compare_digests(first_digest, second_digest,
                            threshold=threshold)
    expected = threshold - 1
    assert score == expected
Beispiel #6
0
 def calc_nilsimsa(self, gold_surface_form, comp_surface_form):
     """Return the compare_digests result for the two surface forms.

     A Nilsimsa object is built from each surface form and the two hex
     digests are passed to compare_digests.
     """
     gold_hasher = Nilsimsa(gold_surface_form)
     comp_hasher = Nilsimsa(comp_surface_form)
     gold_digest = gold_hasher.hexdigest()
     comp_digest = comp_hasher.hexdigest()
     return compare_digests(gold_digest, comp_digest)
Beispiel #7
0
def compare_hash(hash1, hash2):
    """Return 128 minus the compare_digests result for the two hashes."""
    digest_score = compare_digests(hash1, hash2)
    return 128 - digest_score
Beispiel #8
0
def simNil(h1, h2):
    """Return the compare_digests result for the digests h1 and h2."""
    similarity = compare_digests(h1, h2)
    return similarity
Beispiel #9
0
except ImportError:
    import pickle

# Locate the pickled test fixture next to this file.  The path components
# are joined individually so the platform's own separator is used; the
# empty last component keeps the trailing separator on the directory.
test_data_dir = os.path.join(os.path.dirname(__file__),
                             "nilsimsa", "test_data", "")
test_data = "test_dict.p"
test_dict = os.path.join(test_data_dir, test_data)

# NOTE: unpickling can execute arbitrary code; only load trusted fixtures.
# The context manager makes sure the file handle is closed after loading.
with open(test_dict, "rb") as pickle_file:
    sid_to_nil = pickle.load(pickle_file)

# Digest two 64-character inputs that differ only in the last character
# and print both digests followed by their compare_digests result.
nil = Nilsimsa('0' * 64)
s1 = nil.hexdigest()
nil = Nilsimsa('0' * 63 + '1')
s2 = nil.hexdigest()
# Formatted so the output is the same under Python 2 and Python 3.
print("%s %s" % (s1, s2))
print(compare_digests(s1, s2))

# for i in range(1,30):
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i)
#     s1 = Nilsimsa(cloneGroup[0]).hexdigest()
#     s2 = Nilsimsa(cloneGroup[1]).hexdigest()
#     #print s1,s2
#     print compare_digests(s1,s2)
#     if compare_digests(s1,s2) <0:
#         getCodeFragment.printCloneClass('1.2.txt', i)

# for i in range(1,50):
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i)
#     s1 = Nilsimsa(cloneGroup[0]).hexdigest()
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i+1)
#     s2 = Nilsimsa(cloneGroup[0]).hexdigest()