async def test_insert_redis(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(await t.size() >= 1)
            items = []
            for H in await t.keys():
                items.extend(await t.get(H))
            self.assertTrue(pickle.dumps("a") in items)
            self.assertTrue(pickle.dumps("b") in items)
        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        for i, H in enumerate(await lsh.keys.get(pickle.dumps("a"))):
            res = await lsh.hashtables[i].get(H)
            self.assertTrue(pickle.dumps("a") in res)

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            await lsh.insert("c", m3)

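# The async tests in this file assume storage configs created in setUp().
# A minimal sketch of what those dicts could look like; the 'aioredis' /
# 'aiomongo' type names and the connection values are assumptions, with the
# mongo keys (host/port/db) inferred from the DSN format string used later
# in test_arbitrary_collection:
_storage_config_redis = {
    'type': 'aioredis',
    'redis': {'host': 'localhost', 'port': 6379},
}
_storage_config_mongo = {
    'type': 'aiomongo',
    'mongo': {'host': 'localhost', 'port': 27017, 'db': 'lsh_test'},
}
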
def _data(self, count):
    sizes = np.random.randint(1, 100, count)
    for key, size in enumerate(sizes):
        m = MinHash()
        for i in range(size):
            m.update(("%d" % i).encode("utf8"))
        yield (key, m, size)

def test_insert_redis(self):
    with patch('redis.Redis', fake_redis) as mock_redis:
        lsh = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis',
            'redis': {'host': 'localhost', 'port': 6379}
        })
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue(pickle.dumps("a") in items)
            self.assertTrue(pickle.dumps("b") in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys[pickle.dumps("a")]):
            self.assertTrue(pickle.dumps("a") in lsh.hashtables[i][H])
        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)

def mh2(data1, data2):
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    return m1.jaccard(m2)

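# A minimal usage sketch for mh2(); the token lists are made-up examples.
# The MinHash estimate should land close to the exact set Jaccard.
tokens_a = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure']
tokens_b = ['minhash', 'is', 'a', 'probability', 'data', 'structure']
estimated = mh2(tokens_a, tokens_b)
exact = len(set(tokens_a) & set(tokens_b)) / len(set(tokens_a) | set(tokens_b))
print("estimated=%.3f exact=%.3f" % (estimated, exact))
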
def _run_acc(size, seed, num_perm):
    m = MinHash(num_perm=num_perm)
    s = set()
    random.seed(seed)
    for i in range(size):
        v = int_bytes(random.randint(1, size))
        m.update(v)
        s.add(v)
    return (m, s)

def _generate_minhash_list(data, shingle_length=2):
    minhash_list = []
    for text in data:
        m = MinHash()
        for d in _extract_shingles(text, shingle_length):
            m.update(d.encode('utf8'))
        minhash_list.append(m)
    return minhash_list

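# _generate_minhash_list() relies on an _extract_shingles() helper that is
# not shown here. A plausible character-shingle implementation, purely an
# assumption about what the original does:
def _extract_shingles(text, shingle_length):
    # Slide a window of shingle_length characters over the text; texts
    # shorter than the window yield a single (truncated) shingle.
    for i in range(max(len(text) - shingle_length + 1, 1)):
        yield text[i:i + shingle_length]
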
def prepare_domain(vals):
    permutations = config.MINHASH_PARAMS['num_permutations']
    encoding = config.MINHASH_PARAMS['encoding']
    m_set = MinHash(num_perm=permutations)
    for elem in vals:
        m_set.update(elem.encode(encoding))
    return m_set

def get_packet_seq_min_hash(packets, packet_range_begin, step):
    packet_seq_min_hash = MinHash()
    for packet_index in range(packet_range_begin, packet_range_begin + step):
        try:
            payload = get_payload(packets[packet_index])
        except IndexError:
            break
        packet_seq_min_hash.update(payload.encode('utf-8'))
    return packet_seq_min_hash.hashvalues

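# get_payload() is assumed to extract a printable payload string from one
# packet object; its real definition is not part of this snippet. A trivial
# hypothetical stand-in for experimenting with plain-string "packets":
def get_payload(packet):
    # The original presumably decodes the packet's payload bytes; here we
    # simply treat the packet itself as its payload.
    return str(packet)
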
def run_perf(card, num_perm):
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions" % num_perm)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for interval timing.
    start = time.perf_counter()
    for i in range(card):
        m.update(int_bytes(i))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration

def _run_minhash(A, B, data, seed, num_perm, b):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    # xrange is Python 2 only; range is the Python 3 equivalent.
    for i in range(a_start, a_end):
        m1.update(hasher(data[i], seed=seed))
    for i in range(b_start, b_end):
        m2.update(hasher(data[i], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]

def minHashing(splitedString):
    shringleLength = 5
    startIndex = 0
    m1 = MinHash(num_perm=minHashPermmutations)
    for x in range(0, int(round(len(splitedString) / shringleLength))):
        m1.update(splitedString[startIndex:(startIndex + shringleLength)].encode('utf8'))
        startIndex = startIndex + shringleLength
    return m1.hashvalues

def _setup(self):
    d = "abcdefghijklmnopqrstuvwxyz"
    forest = MinHashLSHForest()
    for i in range(len(d) - 2):
        key = d[i]
        m = MinHash()
        j = i + 3
        for s in d[i:j]:
            m.update(s.encode("utf8"))
        forest.add(key, m)
    forest.index()
    return forest

def test_get_counts(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    counts = lsh.get_counts()
    self.assertEqual(len(counts), lsh.b)
    for table in counts:
        self.assertEqual(sum(table.values()), 2)

def test__H(self):
    '''
    Check _H output consistent bytes length given the same
    concatenated hash value size
    '''
    for l in range(2, 128 + 1, 16):
        forest = MinHashLSHForest(num_perm=128, l=l)
        m = MinHash()
        m.update("abcdefg".encode("utf8"))
        m.update("1234567".encode("utf8"))
        forest.add("m", m)
        sizes = [len(H) for ht in forest.hashtables for H in ht]
        self.assertTrue(all(sizes[0] == s for s in sizes))

def _minhash_from_text(self, text):
    """Calculate minhash of text.

    Args:
        text: String to calculate minhash of.

    Returns:
        A minhash (instance of datasketch.minhash.MinHash)
    """
    minhash = MinHash(self._config.num_perm)
    for word in self._shingles_from_text(text):
        minhash.update(word.encode('utf8'))
    return minhash

def test_pickle(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    lsh2 = pickle.loads(pickle.dumps(lsh))
    # Query the unpickled copy so the round trip is actually exercised.
    result = lsh2.query(m1)
    self.assertTrue("a" in result)
    result = lsh2.query(m2)
    self.assertTrue("b" in result)

async def test_get_counts_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        counts = await lsh.get_counts()
        self.assertEqual(len(counts), lsh.b)
        for table in counts:
            self.assertEqual(sum(table.values()), 2)

def test_query(self):
    m1 = MinHash()
    m1.update("a".encode("utf8"))
    m1.update("b".encode("utf8"))
    m1.update("c".encode("utf8"))
    forest = self._setup()
    result = forest.query(m1, 3)
    self.assertTrue("a" in result)
    self.assertTrue("b" in result)
    self.assertTrue("c" in result)
    m3 = MinHash(18)
    self.assertRaises(ValueError, forest.query, m3, 1)

def test_pickle(self):
    forest = MinHashLSHForest()
    m1 = MinHash()
    m1.update("a".encode("utf8"))
    m2 = MinHash()
    m2.update("b".encode("utf8"))
    forest.add("a", m1)
    forest.add("b", m2)
    forest.index()
    forest2 = pickle.loads(pickle.dumps(forest))
    result = forest2.query(m1, 1)
    self.assertTrue("a" in result)
    result = forest2.query(m2, 1)
    self.assertTrue("b" in result)

def eg1():
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)

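# eg1() assumes module-level token lists data1 and data2. Example values in
# the spirit of the datasketch docs (the exact lists here are an assumption):
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
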
def get_hash_values(atomic_table, vectorization_type="simple"):
    table_vector = []
    if vectorization_type == "simple":
        table_vector = vectorize_atomic_table(atomic_table)
    elif vectorization_type == "lemmatize":
        table_vector = lemmatize_atomic_table(atomic_table)
    elif vectorization_type == "categorize":
        table_vector = categorize_atomic_table(atomic_table)
    _hash = MinHash()
    for item in table_vector:
        if isinstance(item, str):
            item = item.encode()
        _hash.update(item)
    return _hash.hashvalues

def prepare_query(filename):
    # Use a context manager so the file is closed (the original left the
    # handle open and shadowed the built-in name `file`).
    with open(filename) as f:
        first_line = f.readline()
    vals_string = first_line.split('\t')[1]
    vals = vals_string.split(',')
    length = len(vals)
    permutations = config.MINHASH_PARAMS['num_permutations']
    encoding = config.MINHASH_PARAMS['encoding']
    m_query = MinHash(num_perm=permutations)
    for elem in vals:
        m_query.update(elem.encode(encoding))
    query_set = [m_query, length]
    return query_set

def test_query(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    result = lsh.query(m1)
    self.assertTrue("a" in result)
    result = lsh.query(m2)
    self.assertTrue("b" in result)
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.query, m3)

async def test_pickle_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        pickled = pickle.dumps(lsh)

    async with pickle.loads(pickled) as lsh2:
        result = await lsh2.query(m1)
        self.assertTrue("a" in result)
        result = await lsh2.query(m2)
        self.assertTrue("b" in result)
        await lsh2.close()

def minhash_from_text(text, num_perm, delimiters):
    """Calculate minhash of text.

    Args:
        text: string to calculate minhash of.
        num_perm: number of random permutation functions used by MinHash
            to be indexed.
        delimiters: list of strings used as delimiters for splitting text
            into words.

    Returns:
        A minhash (instance of datasketch.minhash.MinHash)
    """
    minhash = MinHash(num_perm)
    for word in _shingles_from_text(text, delimiters):
        minhash.update(word.encode('utf8'))
    return minhash

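# _shingles_from_text() is an internal helper not included above. A plausible
# implementation, assuming it splits on the given delimiter strings via a
# regular expression (the exact behaviour of the original is an assumption):
import re

def _shingles_from_text(text, delimiters):
    # Build an alternation pattern from the delimiters and split on it,
    # dropping empty tokens.
    pattern = '|'.join(map(re.escape, delimiters))
    return [w for w in re.split(pattern, text.lower()) if w]
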
async def test__H_redis(self):
    """
    Check _H output consistent bytes length given the same
    concatenated hash value size
    """
    for _ in range(2, 128 + 1, 16):
        m = MinHash()
        m.update("abcdefg".encode("utf8"))
        m.update("1234567".encode("utf8"))
        async with AsyncMinHashLSH(
                storage_config=self._storage_config_redis,
                num_perm=128) as lsh:
            await lsh.insert("m", m)
            sizes = [len(H) for ht in lsh.hashtables for H in await ht.keys()]
            self.assertTrue(all(sizes[0] == s for s in sizes))

def test_remove(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    lsh.remove("a")
    self.assertTrue("a" not in lsh.keys)
    for table in lsh.hashtables:
        for H in table:
            self.assertGreater(len(table[H]), 0)
            self.assertTrue("a" not in table[H])
    self.assertRaises(ValueError, lsh.remove, "c")

def _create_min_hashes(self):
    print_now('Start creating min hashes')
    min_hashes = []
    for (event_id, _, stacktrace) in self.data:
        if stacktrace is None:
            continue
        l_set = set(stacktrace.lower().replace(',', ' ').split())
        m = MinHash(num_perm=NUM_PERM)
        for d in l_set:
            m.update(d.encode('utf8'))
        min_hashes.append((event_id, m))
    lsh = MinHashLSH(threshold=0.5, num_perm=NUM_PERM)
    for event_id, m in min_hashes:
        lsh.insert(event_id, m)
    return (min_hashes, lsh)

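# A sketch of how the index above might be consumed to surface near-duplicate
# stack traces. _create_min_hashes() and the 0.5 threshold come from the
# method itself; the caller below is an assumption:
#
#     min_hashes, lsh = self._create_min_hashes()
#     for event_id, m in min_hashes:
#         similar = [k for k in lsh.query(m) if k != event_id]
#         if similar:
#             print_now('%s resembles %s' % (event_id, similar))
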
def main():
    with open('plato1.txt', 'r') as f:
        tokens1 = [l for l in f]
    with open('plato2.txt', 'r') as f:
        tokens2 = [l for l in f]

    start = time.time()
    m1 = MinHash(num_perm=64, seed=0)
    for t in tokens1:
        m1.update(t.encode('utf8'))
    # Reusing m1's permutations avoids regenerating them and guarantees the
    # two sketches are directly comparable.
    m2 = MinHash(num_perm=64, seed=0, permutations=m1.permutations)
    for t in tokens2:
        m2.update(t.encode('utf8'))
    similarity = m2.jaccard(m1)
    elapsed = time.time() - start
    # The original passed the values as extra print() arguments instead of
    # %-formatting them into the string.
    print("Similar %f and Took %f ms" % (similarity, elapsed * 1000))

def test_insert(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    for t in lsh.hashtables:
        self.assertTrue(len(t) >= 1)
        items = []
        for H in t:
            items.extend(t[H])
        self.assertTrue("a" in items)
        self.assertTrue("b" in items)
    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.insert, "c", m3)

async def test_arbitrary_collection(self):
    self._storage_config_mongo["mongo"]["collection_name"] = "unit_test_collection"
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update(b"a")
        await lsh.insert("a", m1)

        dsn = MONGO_URL or "mongodb://{host}:{port}/{db}".format(
            **self._storage_config_mongo["mongo"])
        collection = AsyncIOMotorClient(dsn).get_default_database(
            "lsh_test").get_collection("unit_test_collection")
        count = await collection.count_documents({})
        self.assertGreaterEqual(count, 1)

    del self._storage_config_mongo["mongo"]["collection_name"]

def _get_minhash_from_domain(domain):
    """Get the Minhash value from a domain name.

    This function takes a domain, removes the TLD extension from it and
    then creates a MinHash object from every remaining character in the
    domain. If a domain starts with www., the www. prefix is stripped from
    the domain before the Minhash is calculated.

    Args:
        domain: string with a full domain, eg. www.google.com

    Returns:
        A minhash (instance of datasketch.minhash.MinHash)
    """
    domain_items = domain.split('.')
    domain_part = '.'.join(domain_items[:-1])
    minhash = MinHash(similarity.DEFAULT_PERMUTATIONS)
    for char in domain_part:
        minhash.update(char.encode('utf8'))
    return minhash

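# A minimal usage sketch; the domains are illustrative, and
# similarity.DEFAULT_PERMUTATIONS is assumed to be importable as above.
minhash_a = _get_minhash_from_domain('www.google.com')
minhash_b = _get_minhash_from_domain('www.go0gle.com')
# Character-level sketches make typo-squatted domains score high.
print('estimated similarity: %.2f' % minhash_a.jaccard(minhash_b))
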
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    for d in data3:
        m3.update(d.encode('utf8'))

    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)

" thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark of", " the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the ", " barkn of the bors. and they were al anr oo the bark of the bark of the boos of the boos of the boos of the", " boos afd the nererland thet thet whre hev back on the barkn of the bors. and they were al anr oo the bark of the bark ", " of the boos of the boos of the boos of the boos afd the nererland thet thet whre hev back on the barkn of the bors." ] array=[0]*len(seq) for i in range (len (dataX)): for j in range(len(seq)): data1=dataX[i].split() data2=seq[j].split() m1 = MinHash() m2 = MinHash() for d in data1: m1.update(d.encode('utf8')) for d in data2: m2.update(d.encode('utf8')) s1 = set(data1) s2 = set(data2) actual_jaccard = float(len(s1.intersection(s2))) /\ float(len(s1.union(s2))) if array[j]<actual_jaccard: array[j]=actual_jaccard print array[j] print array #m1 = MinHash()/* #print dataX[1],len(seq) #for i in range(len(seq)):
def min_hash_text(self, sm_text):
    m = MinHash()
    for d in sm_text:
        m.update(d.encode('utf8'))
    return m