def test_pickle(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=4)
    mg = WeightedMinHashGenerator(10, 4)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    # round-trip through pickle and query the restored index
    lsh2 = pickle.loads(pickle.dumps(lsh))
    result = lsh2.query(m1)
    self.assertTrue("a" in result)
    result = lsh2.query(m2)
    self.assertTrue("b" in result)
def test_pickle(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    # round-trip through pickle and query the restored index
    lsh2 = pickle.loads(pickle.dumps(lsh))
    result = lsh2.query(m1)
    self.assertTrue("a" in result)
    result = lsh2.query(m2)
    self.assertTrue("b" in result)
def test_query(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    result = lsh.query(m1)
    self.assertTrue("a" in result)
    result = lsh.query(m2)
    self.assertTrue("b" in result)
    # querying with a MinHash whose num_perm does not match the index raises ValueError
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.query, m3)
def find_more_than_threshold(self, threshold, curr_file_name):
    lsh = MinHashLSH(threshold)
    current_m = self.min_hash_text(
        set(self.file_to_words('/'.join([self.doc_dir, curr_file_name]))))
    for k, v in self.min_hash_dict.items():
        lsh.insert(k, v)
    result = lsh.query(current_m)
    print("Candidates with Jaccard similarity > " + str(threshold), result)
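# The helper methods above (file_to_words, min_hash_text) are not shown in this
# snippet. A minimal sketch of what they might look like on the same class,
# assuming plain-text documents and the default num_perm of 128 (both are
# assumptions, not part of the original code):

def file_to_words(self, file_path):
    # Hypothetical: read a document and split it into whitespace-delimited words.
    with open(file_path) as f:
        return f.read().split()

def min_hash_text(self, words):
    # Hypothetical: build a MinHash from a set of words.
    m = MinHash(num_perm=128)
    for word in words:
        m.update(word.encode('utf8'))
    return m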
def eg2():
    mg = WeightedMinHashGenerator(10, 5)
    m1 = mg.minhash(v1)
    m2 = mg.minhash(v2)
    m3 = mg.minhash(v3)
    print("Estimated Jaccard m1, m2", m1.jaccard(m2))
    print("Estimated Jaccard m1, m3", m1.jaccard(m3))
    # Create LSH index
    lsh = MinHashLSH(threshold=0.1, num_perm=5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", result)
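# v1, v2 and v3 are defined outside eg2. A minimal sketch of suitable inputs,
# assuming 10-dimensional non-negative weight vectors to match
# WeightedMinHashGenerator(10, 5) (example values, not from the original):

import numpy as np

v1 = np.array([1, 3, 4, 5, 6, 7, 8, 9, 10, 4], dtype=np.float64)
v2 = np.array([2, 4, 3, 8, 4, 7, 10, 9, 0, 0], dtype=np.float64)
v3 = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.float64)

eg2()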
def test_query_redis(self):
    with patch('redis.Redis', fake_redis) as mock_redis:
        lsh = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis',
            'redis': {'host': 'localhost', 'port': 6379}
        })
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        result = lsh.query(m1)
        self.assertTrue("a" in result)
        result = lsh.query(m2)
        self.assertTrue("b" in result)
        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.query, m3)
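# fake_redis is a stand-in supplied by the test module so that no real Redis
# server is needed. One way to provide it, assuming the third-party fakeredis
# package (an assumption; the original test may define its own in-memory fake):

import fakeredis

fake_redis = fakeredis.FakeRedis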
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    for d in data3:
        m3.update(d.encode('utf8'))
    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
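# data1, data2 and data3 are defined outside eg1. A minimal sketch of suitable
# inputs as token lists (example values, not from the original):

data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']

eg1()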
    question = tokenize_sentence(str(row['question1']))
    for word in question:
        min_Hash.update(word.encode('utf8'))
    lsh.insert(str(index), min_Hash)

total = 0
return_result = 0
correct = 0
total_correct = 0
# for each sentence in column question2, find similar questions
for i in range(0, total_questions):
    question_minHash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))
    for word in question:
        question_minHash.update(word.encode('utf8'))
    candidates = lsh.query(question_minHash)
    result = []
    # check which candidates are similar to the sentence
    for j in range(len(candidates)):
        candidate = df['question1'][int(candidates[j])]
        cand = set(tokenize_sentence(str(candidate)))
        cand_minHash = MinHash()
        for word in cand:
            cand_minHash.update(word.encode('utf8'))
        if cand_minHash.jaccard(question_minHash) >= threshold_jacard:
            result.append(str(candidates[j]))
    # statistics
    if df['is_duplicate'][i] == 1:
        total_correct += 1
        if len(result) > 0:
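# tokenize_sentence is not shown in this snippet. A minimal sketch of one
# possible implementation, assuming lowercased word tokens split on
# non-alphanumeric characters (an assumption, not the original helper):

import re

def tokenize_sentence(sentence):
    # Hypothetical: lowercase the sentence and return its word tokens.
    return re.findall(r'\w+', sentence.lower())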