def test_compare_hex():
    """
    Check compare_digests on a pair of documents whose nilsimsa score
    is already known.
    """
    known_score = 95
    first_sid = "1352396387-81c1161097f9f00914e1b152ca4c0f46"
    second_sid = "1338103128-006193af403dcc90c962184df08960a3"
    # Look the stored digests up by stream id before comparing them.
    first_digest = sid_to_nil[first_sid]
    second_digest = sid_to_nil[second_sid]
    assert compare_digests(first_digest, second_digest) == known_score
def accumulating_predicate(content_id_and_fc):
    """Decide whether an item passes the near-duplicate filter.

    :param content_id_and_fc: a ``(content_id, fc)`` pair; ``fc`` is the
        object handed to ``get_string_counter`` to fetch the feature
        named by ``self.nilsimsa_feature_name``.
    :returns: ``True`` to let the item through, ``False`` to filter it.

    An item with no (or an empty) nilsimsa feature is always let
    through and leaves ``accumulator`` untouched.  An item is filtered
    when one of its hashes is already in ``accumulator``, or when any
    of its hashes scores above ``self.threshold`` against any
    accumulated hash.  Only items that are let through get their
    hashes recorded in ``accumulator`` (mapped to ``content_id``).
    """
    # Unpack in the body instead of the signature: tuple parameters are
    # Python-2-only syntax (removed by PEP 3113).  Callers still pass a
    # single 2-tuple positionally.
    content_id, fc = content_id_and_fc

    sim_feature = get_string_counter(fc, self.nilsimsa_feature_name)
    if not sim_feature:
        return True

    if any(nhash in accumulator for nhash in sim_feature):
        # either exact duplicate, or darn close (see
        # test_nilsimsa_exact_match), so filter it and no
        # need to update accumulator
        return False

    for hash1, hash2 in product(sim_feature, accumulator):
        # NOTE(review): self.threshold is handed to compare_digests
        # unchanged, yet below it is compared against the score after
        # that score has been divided by 128.0 -- confirm that both
        # uses expect the same scale.
        score = nilsimsa.compare_digests(
            hash1, hash2, threshold=self.threshold)
        score /= 128.0
        if score > self.threshold:
            # near duplicate, so filter and do not accumulate
            return False

    # Nothing similar seen so far: remember every hash of this item.
    for nhash in sim_feature:
        accumulator[nhash] = content_id

    # allow it through
    return True
def test_compare_threshold():
    """
    Check compare_digests with a threshold set well above the known
    score of the two documents, so that the comparison bails out early;
    the score returned is expected to be one below the threshold.
    """
    cutoff = 110
    first_digest = sid_to_nil["1352396387-81c1161097f9f00914e1b152ca4c0f46"]
    second_digest = sid_to_nil["1338103128-006193af403dcc90c962184df08960a3"]
    result = compare_digests(first_digest, second_digest, threshold=cutoff)
    assert result == cutoff - 1
def calc_nilsimsa(self, gold_surface_form, comp_surface_form):
    """Return the compare_digests score between two surface forms.

    Each surface form is wrapped in a Nilsimsa object and the two hex
    digests are passed to compare_digests.
    """
    # Build both objects first, then take their digests, in that order.
    gold, comp = Nilsimsa(gold_surface_form), Nilsimsa(comp_surface_form)
    return compare_digests(gold.hexdigest(), comp.hexdigest())
def compare_hash(hash1, hash2):
    """Return 128 minus the compare_digests score of the two hashes."""
    similarity = compare_digests(hash1, hash2)
    return 128 - similarity
def simNil(h1, h2):
    """Return the compare_digests score of the digests h1 and h2."""
    score = compare_digests(h1, h2)
    return score
# Fallback branch of an import attempt whose `try:` header lies outside
# this excerpt: fall back to the plain pickle module.
except ImportError:
    import pickle

# Build the path of the pickled test fixture relative to this file.
# NOTE(review): the directory component hard-codes backslash separators,
# so this presumably only resolves on Windows -- confirm.
test_data_dir = os.path.join(os.path.dirname(__file__), "nilsimsa\\test_data\\")
test_data = "test_dict.p"
test_dict = os.path.join(test_data_dir, test_data)
# NOTE(review): the file object handed to pickle.load is never closed
# explicitly, and pickle.load should only ever be fed trusted data.
sid_to_nil = pickle.load(open(test_dict, "rb"))
# print sid_to_nil

# Quick manual check: digest two 64-character strings that differ only
# in their last character, then print both hex digests and the score
# compare_digests gives them.
nil = Nilsimsa('0' * 64)
s1 = nil.hexdigest()
nil = Nilsimsa('0' * 63 + '1')
s2 = nil.hexdigest()
print s1, s2
print compare_digests(s1, s2)

# Disabled experiments over clone groups read from '1.2.txt', kept
# commented out as in the original.
# for i in range(1,30):
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i)
#     s1 = Nilsimsa(cloneGroup[0]).hexdigest()
#     s2 = Nilsimsa(cloneGroup[1]).hexdigest()
#     #print s1,s2
#     print compare_digests(s1,s2)
#     if compare_digests(s1,s2) <0:
#         getCodeFragment.printCloneClass('1.2.txt', i)
# for i in range(1,50):
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i)
#     s1 = Nilsimsa(cloneGroup[0]).hexdigest()
#     cloneGroup = getCodeFragment.getCloneClass('1.2.txt', i+1)
#     s2 = Nilsimsa(cloneGroup[0]).hexdigest()