async def test_insert_redis(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(await t.size() >= 1)
            items = []
            for H in await t.keys():
                items.extend(await t.get(H))
            self.assertTrue(pickle.dumps("a") in items)
            self.assertTrue(pickle.dumps("b") in items)
        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        for i, H in enumerate(await lsh.keys.get(pickle.dumps("a"))):
            res = await lsh.hashtables[i].get(H)
            self.assertTrue(pickle.dumps("a") in res)

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            await lsh.insert("c", m3)

async def test_insert_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        seq = ['aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg',
               'kkd', 'yow', 'ppi', 'eer']
        objs = [MinHash(16) for _ in range(len(seq))]
        for e, obj in zip(seq, objs):
            for i in e:
                obj.update(i.encode('utf-8'))

        data = [(e, m) for e, m in zip(seq, objs)]
        for key, minhash in data:
            await lsh.insert(key, minhash)
        for t in lsh.hashtables:
            self.assertTrue(await t.size() >= 1)
            items = []
            for H in await t.keys():
                items.extend(await t.get(H))
            self.assertTrue('aahh' in items)
            self.assertTrue('bhg' in items)
        self.assertTrue(await lsh.has_key('aahh'))
        self.assertTrue(await lsh.has_key('bhg'))
        for i, H in enumerate(await lsh.keys.get('aahhb')):
            self.assertTrue('aahhb' in await lsh.hashtables[i].get(H))

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            await lsh.insert("c", m3)

async def test_remove_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        m3 = MinHash(16)
        m3.update("a".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        await lsh.insert("a1", m3)

        await lsh.remove("a")
        self.assertTrue(not await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key('a1'))

        hashtable_correct = False
        for table in lsh.hashtables:
            for H in await table.keys():
                table_vals = await table.get(H)
                self.assertGreater(len(table_vals), 0)
                self.assertTrue("a" not in table_vals)
                if 'a1' in table_vals:
                    hashtable_correct = True
        self.assertTrue(hashtable_correct, 'Hashtable broken')

        with self.assertRaises(ValueError):
            await lsh.remove("c")

def test_insert_redis(self):
    with patch('redis.Redis', fake_redis) as mock_redis:
        lsh = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis',
            'redis': {'host': 'localhost', 'port': 6379}
        })
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue(pickle.dumps("a") in items)
            self.assertTrue(pickle.dumps("b") in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys[pickle.dumps("a")]):
            self.assertTrue(pickle.dumps("a") in lsh.hashtables[i][H])

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)

def mh2(data1, data2):
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    return m1.jaccard(m2)

def _run_minhash(A, B, data, seed, num_perm, b):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    # Python 2's xrange replaced with range.
    for i in range(a_start, a_end):
        m1.update(hasher(data[i], seed=seed))
    for i in range(b_start, b_end):
        m2.update(hasher(data[i], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]

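# _run_minhash above passes hashobj=Hash, a wrapper class that is not shown
# in this snippet. The definition below is a hypothetical sketch of what it
# could look like: datasketch's hashobj protocol only needs a digest() method
# returning bytes, so a minimal wrapper around pyhash's 32-bit integer output
# would pack that integer into 4 little-endian bytes.
import struct

class Hash(object):
    """Hypothetical hashobj wrapper: packs a 32-bit int hash into bytes."""
    def __init__(self, h):
        self.h = h

    def digest(self):
        return struct.pack('<I', self.h)
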
def test_get_counts(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    counts = lsh.get_counts()
    self.assertEqual(len(counts), lsh.b)
    for table in counts:
        self.assertEqual(sum(table.values()), 2)

def test_pickle(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)

    lsh2 = pickle.loads(pickle.dumps(lsh))
    # Query the unpickled copy so the round-trip is actually exercised.
    result = lsh2.query(m1)
    self.assertTrue("a" in result)
    result = lsh2.query(m2)
    self.assertTrue("b" in result)

def test_query(self):
    m1 = MinHash()
    m1.update("a".encode("utf8"))
    m1.update("b".encode("utf8"))
    m1.update("c".encode("utf8"))
    forest = self._setup()
    result = forest.query(m1, 3)
    self.assertTrue("a" in result)
    self.assertTrue("b" in result)
    self.assertTrue("c" in result)

    m3 = MinHash(18)
    self.assertRaises(ValueError, forest.query, m3, 1)

async def test_get_counts_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)

        counts = await lsh.get_counts()
        self.assertEqual(len(counts), lsh.b)
        for table in counts:
            self.assertEqual(sum(table.values()), 2)

def test_pickle(self):
    forest = MinHashLSHForest()
    m1 = MinHash()
    m1.update("a".encode("utf8"))
    m2 = MinHash()
    m2.update("b".encode("utf8"))
    forest.add("a", m1)
    forest.add("b", m2)
    forest.index()

    forest2 = pickle.loads(pickle.dumps(forest))
    result = forest2.query(m1, 1)
    self.assertTrue("a" in result)
    result = forest2.query(m2, 1)
    self.assertTrue("b" in result)

def eg1():
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)

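# eg1 reads data1 and data2 from module-level globals that are not shown in
# this snippet. Any two word lists work; the values below are an assumed
# example mirroring the datasketch README:
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
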
def test_query(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)
    result = lsh.query(m1)
    self.assertTrue("a" in result)
    result = lsh.query(m2)
    self.assertTrue("b" in result)

    m3 = MinHash(18)
    self.assertRaises(ValueError, lsh.query, m3)

def _data(self, count):
    sizes = np.random.randint(1, 100, count)
    for key, size in enumerate(sizes):
        m = MinHash()
        for i in range(size):
            m.update(("%d" % i).encode("utf8"))
        yield (key, m, size)

async def test_pickle_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)

        pickled = pickle.dumps(lsh)
        # The async context manager closes lsh2 on exit, so no explicit
        # close() call is needed.
        async with pickle.loads(pickled) as lsh2:
            result = await lsh2.query(m1)
            self.assertTrue("a" in result)
            result = await lsh2.query(m2)
            self.assertTrue("b" in result)

async def test_insertion_session_mongo(self):
    def chunk(it, size):
        it = iter(it)
        return iter(lambda: tuple(islice(it, size)), ())

    _chunked_str = chunk((random.choice(string.ascii_lowercase)
                          for _ in range(10000)), 4)
    seq = frozenset(chain((''.join(s) for s in _chunked_str),
                          ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg',
                           'kkd', 'yow', 'ppi', 'eer')))
    objs = [MinHash(16) for _ in range(len(seq))]
    for e, obj in zip(seq, objs):
        for i in e:
            obj.update(i.encode('utf-8'))

    data = [(e, m) for e, m in zip(seq, objs)]

    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        async with lsh.insertion_session(batch_size=1000) as session:
            fs = (session.insert(key, minhash, check_duplication=False)
                  for key, minhash in data)
            await asyncio.gather(*fs)

        for t in lsh.hashtables:
            self.assertTrue(await t.size() >= 1)
            items = []
            for H in await t.keys():
                items.extend(await t.get(H))
            self.assertTrue('aahhb' in items)
            self.assertTrue('kld' in items)
        self.assertTrue(await lsh.has_key('aahhb'))
        self.assertTrue(await lsh.has_key('kld'))
        for i, H in enumerate(await lsh.keys.get('aahh')):
            self.assertTrue('aahh' in await lsh.hashtables[i].get(H))

def read_pcap(self, filename):
    """
    Reads PCAP files using scapy's rdpcap, divides data into sessions
    and creates a hash for each session; a hash can be updated if a
    session reappears in subsequent files.

    :param filename: PCAP file path
    """
    packets = rdpcap(filename)[IP]
    sessions = packets.sessions()
    for key in sessions:
        if key not in self.session_collection:
            parts = key.split()
            protocol = parts[0]
            ip1_parts = parts[1].split(':')
            ip1 = ip1_parts[0]
            port1 = int(ip1_parts[1]) if len(ip1_parts) > 1 else 0
            ip2_parts = parts[3].split(':')
            ip2 = ip2_parts[0]
            port2 = int(ip2_parts[1]) if len(ip2_parts) > 1 else 0
            entropy = entropy_domain_names(sessions[key])
            self.session_collection[key] = [
                hash(protocol),
                entropy,
                # int(ipaddress.ip_address(ip1)),
                port1,
                # int(ipaddress.ip_address(ip2)),
                port2,
                MinHash()
            ]
        self.session_collection[key] = calculate_hash(
            self.session_collection[key], sessions[key])

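# read_pcap relies on two helpers, entropy_domain_names and calculate_hash,
# defined elsewhere in the project. The sketches below are assumptions about
# their behaviour, not the originals: despite its name hinting at DNS domain
# names, entropy_domain_names is sketched here as a generic Shannon-entropy
# estimate over the session's payload bytes, and calculate_hash as an updater
# that feeds each packet's payload into the session record's trailing MinHash.
import math
from collections import Counter

def entropy_domain_names(session_packets):
    """Hypothetical helper: Shannon entropy of the raw payload bytes."""
    data = b''.join(bytes(p.payload) for p in session_packets)
    if not data:
        return 0.0
    counts = Counter(data)
    total = float(len(data))
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())

def calculate_hash(record, session_packets):
    """Hypothetical helper: update the record's trailing MinHash in place."""
    m = record[-1]
    for p in session_packets:
        m.update(bytes(p.payload))
    return record
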
def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    for d in data3:
        m3.update(d.encode('utf8'))

    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)

def main():
    with open('plato1.txt', 'r') as f:
        tokens1 = [l for l in f]
    with open('plato2.txt', 'r') as f:
        tokens2 = [l for l in f]

    start = time.time()
    m1 = MinHash(num_perm=64, seed=0)
    for t in tokens1:
        m1.update(t.encode('utf8'))
    # Reuse m1's permutations so both sketches are directly comparable.
    m2 = MinHash(num_perm=64, seed=0, permutations=m1.permutations)
    for t in tokens2:
        m2.update(t.encode('utf8'))
    similarity = m2.jaccard(m1)
    elapsed = time.time() - start
    print("Similarity %f, took %f ms" % (similarity, elapsed * 1000))

def test_remove(self):
    lsh = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf8"))
    lsh.insert("a", m1)
    lsh.insert("b", m2)

    lsh.remove("a")
    self.assertTrue("a" not in lsh.keys)
    for table in lsh.hashtables:
        for H in table:
            self.assertGreater(len(table[H]), 0)
            self.assertTrue("a" not in table[H])

    self.assertRaises(ValueError, lsh.remove, "c")

def _run_acc(size, seed, num_perm):
    m = MinHash(num_perm=num_perm)
    s = set()
    random.seed(seed)
    for i in range(size):
        v = int_bytes(random.randint(1, size))
        m.update(v)
        s.add(v)
    return (m, s)

def run_perf(card, num_perm):
    m = MinHash(num_perm=num_perm)
    logging.info("MinHash using %d permutation functions" % num_perm)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for wall-clock benchmarking.
    start = time.perf_counter()
    for i in range(card):
        m.update(int_bytes(i))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration

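# Both _run_acc and run_perf call int_bytes, which is not shown in these
# snippets. A plausible sketch (an assumption, not the original) simply
# renders the integer as bytes so MinHash.update can hash it:
def int_bytes(x):
    """Hypothetical helper: encode an int as UTF-8 bytes for hashing."""
    return ("%d" % x).encode("utf8")
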
def _generate_minhash_list(data, shingle_length=2):
    minhash_list = []
    for text in data:
        m = MinHash()
        for d in _extract_shingles(text, shingle_length):
            m.update(d.encode('utf8'))
        minhash_list.append(m)
    return minhash_list

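# _generate_minhash_list depends on _extract_shingles, which is not shown.
# A minimal sketch, assuming character-level shingling (a sliding window of
# length shingle_length over the text):
def _extract_shingles(text, shingle_length):
    """Hypothetical helper: yield overlapping character n-grams."""
    for i in range(max(len(text) - shingle_length + 1, 1)):
        yield text[i:i + shingle_length]
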
def prepare_domain(vals):
    permutations = config.MINHASH_PARAMS['num_permutations']
    encoding = config.MINHASH_PARAMS['encoding']
    m_set = MinHash(num_perm=permutations)
    for elem in vals:
        m_set.update(elem.encode(encoding))
    return m_set

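# prepare_domain reads its parameters from a project config module that is
# not shown here. An assumed minimal shape for it:
# config.py
MINHASH_PARAMS = {
    'num_permutations': 128,  # matches MinHash's default num_perm
    'encoding': 'utf-8',
}
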
def get_packet_seq_min_hash(packets, packet_range_begin, step):
    packet_seq_min_hash = MinHash()
    for packet_index in range(packet_range_begin, packet_range_begin + step):
        try:
            payload = get_payload(packets[packet_index])
        except IndexError:
            break
        packet_seq_min_hash.update(payload.encode('utf-8'))
    return packet_seq_min_hash.hashvalues

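# get_payload is defined elsewhere; a plausible sketch (an assumption) for
# scapy packets, returning the raw payload as text so the caller can
# re-encode it:
def get_payload(packet):
    """Hypothetical helper: return the packet payload as a string."""
    return bytes(packet.payload).decode('utf-8', errors='replace')
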
async def test_query_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        m3 = MinHash(16)
        m3.update("b".encode("utf8"))
        fs = (lsh.insert("a", m1, check_duplication=False),
              lsh.insert("b", m2, check_duplication=False),
              lsh.insert("b", m3, check_duplication=False))
        await asyncio.gather(*fs)

        result = await lsh.query(m1)
        self.assertTrue("a" in result)
        result = await lsh.query(m2)
        self.assertTrue("b" in result)

        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            await lsh.query(m3)

async def test_remove_mongo(self):
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)

        await lsh.remove("a")
        self.assertTrue(not await lsh.has_key("a"))
        for table in lsh.hashtables:
            for H in await table.keys():
                self.assertGreater(len(await table.get(H)), 0)
                self.assertTrue("a" not in await table.get(H))

        with self.assertRaises(ValueError):
            await lsh.remove("c")

def minHashing(splitedString):
    shingleLength = 5
    startIndex = 0
    m1 = MinHash(num_perm=minHashPermmutations)
    # Hash consecutive, non-overlapping shingles of the input string.
    for x in range(0, int(round(len(splitedString) / shingleLength))):
        m1.update(splitedString[startIndex:startIndex + shingleLength]
                  .encode('utf8'))
        startIndex += shingleLength
    return m1.hashvalues

async def test_remove_session_mongo(self):
    def chunk(it, size):
        it = iter(it)
        return iter(lambda: tuple(islice(it, size)), ())

    _chunked_str = chunk((random.choice(string.ascii_lowercase)
                          for _ in range(10000)), 4)
    seq = frozenset(chain((''.join(s) for s in _chunked_str),
                          ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg',
                           'kkd', 'yow', 'ppi', 'eer')))
    objs = [MinHash(16) for _ in range(len(seq))]
    for e, obj in zip(seq, objs):
        for i in e:
            obj.update(i.encode('utf-8'))

    data = [(e, m) for e, m in zip(seq, objs)]
    keys_to_remove = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg',
                      'kkd', 'yow', 'ppi', 'eer')
    keys_left = frozenset(seq) - frozenset(keys_to_remove)

    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        async with lsh.insertion_session(batch_size=1000) as session:
            fs = (session.insert(key, minhash, check_duplication=False)
                  for key, minhash in data)
            await asyncio.gather(*fs)

        async with lsh.delete_session(batch_size=3) as session:
            fs = (session.remove(key) for key in keys_to_remove)
            await asyncio.gather(*fs)

        for t in lsh.hashtables:
            self.assertTrue(await t.size() >= 1)
            items = []
            for H in await t.keys():
                items.extend(await t.get(H))
            for key in keys_to_remove:
                self.assertTrue(key not in items,
                                '{0} in items, but should not be'.format(key))
            for key in keys_left:
                self.assertTrue(key in items,
                                '{0} not in items, but should be'.format(key))
        for key in keys_to_remove:
            self.assertTrue(not (await lsh.has_key(key)),
                            '<{0}> key should not be in LSH index'.format(key))
        for key in keys_left:
            self.assertTrue(await lsh.has_key(key),
                            '<{0}> key should be in LSH index'.format(key))

def _setup(self):
    d = "abcdefghijklmnopqrstuvwxyz"
    forest = MinHashLSHForest()
    for i in range(len(d) - 2):
        key = d[i]
        m = MinHash()
        j = i + 3
        for s in d[i:j]:
            m.update(s.encode("utf8"))
        forest.add(key, m)
    forest.index()
    return forest