# Shared imports for the snippets below.
import pickle
from unittest.mock import patch

import numpy as np
from datasketch import MinHash, MinHashLSH, WeightedMinHashGenerator


def test_insert_redis(self):
    # 'fake_redis' is an in-memory Redis stand-in defined elsewhere in the
    # test module; patching redis.Redis keeps the test self-contained.
    with patch('redis.Redis', fake_redis) as mock_redis:
        lsh = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis',
            'redis': {'host': 'localhost', 'port': 6379}
        })
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            # Keys are pickled before being stored in Redis.
            self.assertTrue(pickle.dumps("a") in items)
            self.assertTrue(pickle.dumps("b") in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys[pickle.dumps("a")]):
            self.assertTrue(pickle.dumps("a") in lsh.hashtables[i][H])
        # A MinHash with a mismatched num_perm must be rejected.
        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)
def find_more_then_threshold(self, threshold, curr_file_name):
    lsh = MinHashLSH(threshold)
    current_m = self.min_hash_text(
        set(self.file_to_words('/'.join([self.doc_dir, curr_file_name]))))
    # dict.iteritems() is Python 2 only; use items() under Python 3.
    for k, v in self.min_hash_dict.items():
        lsh.insert(k, v)
    result = lsh.query(current_m)
    print("Candidates with Jaccard similarity > " + str(threshold), result)
def test_get_counts(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    counts = lsh.get_counts()
    self.assertEqual(len(counts), lsh.b)
    for table in counts:
        self.assertEqual(sum(table.values()), 2)
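# For reference: lsh.b is the number of bands and lsh.r the rows per band,
# with b * r <= num_perm; get_counts() returns one count table per band.
# They can be inspected on any index, e.g.:
#   lsh = MinHashLSH(threshold=0.5, num_perm=16)
#   print(lsh.b, lsh.r)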
def test__H(self):
    '''
    Check that _H produces byte strings of consistent length
    given the same concatenated hash value size.
    '''
    mg = WeightedMinHashGenerator(100, sample_size=128)
    for l in range(2, mg.sample_size + 1, 16):
        m = mg.minhash(np.random.randint(1, 99999999, 100))
        lsh = MinHashLSH(num_perm=128)
        lsh.insert("m", m)
        sizes = [len(H) for ht in lsh.hashtables for H in ht]
        self.assertTrue(all(sizes[0] == s for s in sizes))
def test_pickle(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=4)
    mg = WeightedMinHashGenerator(10, 4)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    # Round-trip the index through pickle and query the restored copy;
    # without this step the test would not exercise pickling at all.
    lsh2 = pickle.loads(pickle.dumps(lsh))
    result = lsh2.query(m1)
    self.assertTrue("a" in result)
    result = lsh2.query(m2)
    self.assertTrue("b" in result)
def test_pickle(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    lsh2 = pickle.loads(pickle.dumps(lsh))
    # Query the unpickled copy, not the original, to verify the round trip.
    result = lsh2.query(m1)
    self.assertTrue("a" in result)
    result = lsh2.query(m2)
    self.assertTrue("b" in result)
def test__H(self):
    '''
    Check that _H produces byte strings of consistent length
    given the same concatenated hash value size.
    '''
    for l in range(2, 128 + 1, 16):
        lsh = MinHashLSH(num_perm=128)
        m = MinHash()
        m.update("abcdefg".encode("utf8"))
        m.update("1234567".encode("utf8"))
        lsh.insert("m", m)
        sizes = [len(H) for ht in lsh.hashtables for H in ht]
        self.assertTrue(all(sizes[0] == s for s in sizes))
def eg2():
    mg = WeightedMinHashGenerator(10, 5)
    m1 = mg.minhash(v1)
    m2 = mg.minhash(v2)
    m3 = mg.minhash(v3)
    print("Estimated Jaccard m1, m2", m1.jaccard(m2))
    print("Estimated Jaccard m1, m3", m1.jaccard(m3))
    # Create LSH index
    lsh = MinHashLSH(threshold=0.1, num_perm=5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", result)
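# eg2 reads v1, v2, v3 from module scope; they are not shown in this snippet.
# A minimal, hypothetical setup that makes it runnable: each vector must have
# length 10 to match the generator's dim, and weights must be non-negative.
v1 = np.random.uniform(0, 10, 10)
v2 = np.random.uniform(0, 10, 10)
v3 = np.random.uniform(0, 10, 10)
eg2()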
def test_query(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    result = lsh.query(m1)
    self.assertTrue("a" in result)
    result = lsh.query(m2)
    self.assertTrue("b" in result)
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.query, m3)
def test_remove(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=4)
    mg = WeightedMinHashGenerator(10, 4)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    lsh.remove("a")
    self.assertTrue("a" not in lsh.keys)
    for table in lsh.hashtables:
        for H in table:
            self.assertGreater(len(table[H]), 0)
            self.assertTrue("a" not in table[H])
    self.assertRaises(ValueError, lsh.remove, "c")
def test_remove(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    lsh.remove("a")
    self.assertTrue("a" not in lsh.keys)
    for table in lsh.hashtables:
        for H in table:
            self.assertGreater(len(table[H]), 0)
            self.assertTrue("a" not in table[H])
    self.assertRaises(ValueError, lsh.remove, "c")
def _create_min_hashes(self):
    # print_now and NUM_PERM are defined elsewhere in this module.
    print_now('Start creating min hashes')
    min_hashes = []
    for (event_id, _, stacktrace) in self.data:
        if stacktrace is None:
            continue
        # Shingle the stacktrace into a set of lowercase tokens.
        l_set = set(stacktrace.lower().replace(',', ' ').split())
        m = MinHash(num_perm=NUM_PERM)
        for d in l_set:
            m.update(d.encode('utf8'))
        min_hashes.append((event_id, m))
    lsh = MinHashLSH(threshold=0.5, num_perm=NUM_PERM)
    for event_id, m in min_hashes:
        lsh.insert(event_id, m)
    return (min_hashes, lsh)
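# A sketch of how the returned pair might be used downstream: query the index
# with each event's own MinHash to collect likely-duplicate stacktraces. The
# method name and grouping logic here are illustrative, not part of the
# original class.
def _group_similar_events(self):
    min_hashes, lsh = self._create_min_hashes()
    groups = {}
    for event_id, m in min_hashes:
        # The query always returns at least the event itself.
        groups[event_id] = lsh.query(m)
    return groups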
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    for d in data3:
        m3.update(d.encode('utf8'))
    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
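# eg1 reads data1, data2, data3 from module scope. A hypothetical setup in
# the spirit of the datasketch examples (any token lists work):
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
eg1()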
def test_insert(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    for t in lsh.hashtables:
        self.assertTrue(len(t) >= 1)
        items = []
        for H in t:
            items.extend(t[H])
        self.assertTrue("a" in items)
        self.assertTrue("b" in items)
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.insert, "c", m3)
def test_insert(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    for t in lsh.hashtables:
        self.assertTrue(len(t) >= 1)
        items = []
        for H in t:
            items.extend(t[H])
        self.assertTrue("a" in items)
        self.assertTrue("b" in items)
    self.assertTrue("a" in lsh)
    self.assertTrue("b" in lsh)
    for i, H in enumerate(lsh.keys["a"]):
        self.assertTrue("a" in lsh.hashtables[i][H])
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.insert, "c", m3)
def test_insert(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=4)
    mg = WeightedMinHashGenerator(10, 4)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    for t in lsh.hashtables:
        self.assertTrue(len(t) >= 1)
        items = []
        for H in t:
            items.extend(t[H])
        self.assertTrue("a" in items)
        self.assertTrue("b" in items)
    self.assertTrue("a" in lsh)
    self.assertTrue("b" in lsh)
    for i, H in enumerate(lsh.keys["a"]):
        self.assertTrue("a" in lsh.hashtables[i][H])
    # A generator with a different sample_size yields an incompatible MinHash.
    mg = WeightedMinHashGenerator(10, 5)
    m3 = mg.minhash(np.random.uniform(1, 10, 10))
    self.assertRaises(ValueError, lsh.insert, "c", m3)
def test_query_redis(self):
    # Patch redis.Redis with the in-memory fake_redis stand-in (defined
    # elsewhere in the test module) so no server is needed.
    with patch('redis.Redis', fake_redis) as mock_redis:
        lsh = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis',
            'redis': {'host': 'localhost', 'port': 6379}
        })
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        result = lsh.query(m1)
        self.assertTrue("a" in result)
        result = lsh.query(m2)
        self.assertTrue("b" in result)
        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.query, m3)
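# Outside of tests, the same storage_config connects to a real server. A
# hedged sketch, assuming a Redis instance is reachable on localhost:6379;
# make_redis_lsh is a hypothetical helper name:
def make_redis_lsh():
    return MinHashLSH(threshold=0.5, num_perm=16, storage_config={
        'type': 'redis',
        'redis': {'host': 'localhost', 'port': 6379}
    })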
Questions with similar MinHashes are candidates to be similar. To check
whether two candidate sentences really are similar, we use Jaccard similarity.
"""
# Assumes: import pandas as pd, and a tokenize_sentence() helper defined
# elsewhere in the module.
df = pd.read_csv("proccessed.csv")
total_questions = df.shape[0]
threshold_jaccard = 0.30
lsh = MinHashLSH(threshold=threshold_jaccard)

# Calculate a MinHash for each sentence in column question1.
for index, row in df.iterrows():
    min_Hash = MinHash()
    question = tokenize_sentence(str(row['question1']))
    for word in question:
        min_Hash.update(word.encode('utf8'))
    lsh.insert(str(index), min_Hash)

total = 0
return_result = 0
correct = 0
total_correct = 0

# For each sentence in column question2, find similar questions.
for i in range(0, total_questions):
    question_minHash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))
    for word in question:
        question_minHash.update(word.encode('utf8'))
    candidates = lsh.query(question_minHash)
    result = []
    # Check which candidates are similar to the sentence.
    for j in range(len(candidates)):
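# The snippet is truncated mid-loop. A hedged sketch of the verification step
# the docstring describes (exact Jaccard over token sets); the bookkeeping in
# the original code may differ, so this is left commented out:
#
#     for j in range(len(candidates)):
#         cand = tokenize_sentence(str(df['question1'][int(candidates[j])]))
#         inter = len(set(question) & set(cand))
#         union = len(set(question) | set(cand))
#         if union and inter / union >= threshold_jaccard:
#             result.append(candidates[j])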