# Query script: build one random MinHash and look up approximate neighbours
# in an existing Cassandra-backed MinHashLSH index.
import random
import string

from datasketch import MinHash, MinHashLSH
from tqdm import tqdm

import config  # local module providing KEY_SPACE


def main() -> None:
    for _ in tqdm(range(1), desc="Create finding example:"):
        minhash = MinHash(num_perm=256)
        list_strings = []
        for _ in range(200):
            rand_string = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            list_strings.append(rand_string)
        minhash.update_batch([s.encode('utf-8') for s in list_strings])

    for _ in tqdm(range(1), desc="Connect to existing db:"):
        lsh = MinHashLSH(threshold=0.5, num_perm=256, storage_config={
            'type': 'cassandra',
            'basename': b'perftest',
            'cassandra': {
                'seeds': ['127.0.0.1'],
                'keyspace': config.KEY_SPACE,
                'replication': {
                    'class': 'SimpleStrategy',
                    'replication_factor': '1',
                },
                'drop_keyspace': False,
                'drop_tables': False,
            }
        })

    try:
        for _ in tqdm(range(1), desc="Find minHash similarity:"):
            result = lsh.query(minhash)
            print("Approximate neighbours with Jaccard similarity > 0.5", result)
    except BaseException as e:
        print(str(e))
        print("Error")
from operator import itemgetter
from typing import List, Tuple, Union

from datasketch import MinHash
from pyroaring import BitMap


def search(self, query: List[int]) -> Union[List[int], List[Tuple[int, float]]]:
    # Build a MinHash of the query and pull candidate keys out of the LSH index.
    h = MinHash(num_perm=self._lsh.h, hashfunc=hash)
    h.update_batch(query)
    found = self._lsh.query(h)
    if self._threshold is not None:
        # Re-rank candidates by exact Jaccard similarity of their fingerprint
        # bitmaps and keep only those at or above the configured threshold.
        threshold = self._threshold
        fps = self._fingerprints
        bm = BitMap(query)
        return sorted(
            ((x, j) for x in found
             if (j := bm.jaccard_index(fps[x])) >= threshold),
            key=itemgetter(1), reverse=True)
    return found
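# search() above is a fragment of a larger index class. The class below is a
# hypothetical reconstruction of the minimal state it relies on
# (self._lsh, self._threshold, self._fingerprints); everything except the
# datasketch and pyroaring APIs is an assumption.
from typing import Dict, List, Optional

from datasketch import MinHash, MinHashLSH
from pyroaring import BitMap


class SimilarityIndex:
    def __init__(self, num_perm: int = 128, lsh_threshold: float = 0.5,
                 jaccard_threshold: Optional[float] = None) -> None:
        self._lsh = MinHashLSH(threshold=lsh_threshold, num_perm=num_perm)
        self._threshold = jaccard_threshold
        self._fingerprints: Dict[int, BitMap] = {}

    def add(self, key: int, tokens: List[int]) -> None:
        # Keep the exact token set for re-ranking and index its MinHash.
        self._fingerprints[key] = BitMap(tokens)
        h = MinHash(num_perm=self._lsh.h, hashfunc=hash)
        h.update_batch(tokens)
        self._lsh.insert(key, h)

    search = search  # bind the module-level search() above as a method (illustrative)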
# Insert script: generate COUNT_UNQ_MHS random "files", index their MinHashes
# into a Cassandra-backed MinHashLSH, and log both to disk.
import random
import string

from datasketch import MinHash, MinHashLSH
from tqdm import tqdm

import config  # local module providing COUNT_UNQ_MHS and KEY_SPACE


def main() -> None:
    minhashes = []
    files = []
    for iterator in tqdm(range(config.COUNT_UNQ_MHS), desc="Generate minHashes:"):
        minhash = MinHash(num_perm=256)
        file = []
        for _ in range(200):
            rand_string = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(5))
            file.append(rand_string)
        files.append(file)
        minhash.update_batch([s.encode('utf-8') for s in file])
        minhashes.append(("key" + str(iterator), minhash))

    lsh = MinHashLSH(threshold=0.5, num_perm=256, storage_config={
        'type': 'cassandra',
        'basename': b'perftest',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': config.KEY_SPACE,
            'replication': {
                'class': 'SimpleStrategy',
                'replication_factor': '1',
            },
            'drop_keyspace': False,
            'drop_tables': False,
        }
    })

    for _ in tqdm(range(1), desc="Insert 100 minHashes:"):
        with lsh.insertion_session(buffer_size=100) as session:
            for key, minhash in minhashes:
                session.insert(key, minhash)

    # log() is a local helper (defined elsewhere) that writes a key and its
    # payload to the open file.
    f_disc_mhs = open('minhashes.txt', 'w+')
    for minhash in tqdm(minhashes, desc="Log minHashes:"):
        log(f_disc_mhs, minhash[0], minhash[1].digest())
    f_disc_mhs.close()

    f_disc_files = open('files.txt', 'w+')
    for iterator in tqdm(range(len(files)), desc="Log files:"):
        log(f_disc_files, minhashes[iterator][0], files[iterator])
    f_disc_files.close()
def get_minhash(args):
    # Unpack ((name, tokens), num_perm) and build the MinHash for one item.
    (n, x), num_perm = args
    h = MinHash(num_perm=num_perm, hashfunc=hash)
    h.update_batch(x)
    return n, h
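# get_minhash() takes one packed argument, which suggests it is meant to be
# mapped over a process pool. The helper below is a hypothetical usage sketch;
# build_minhashes and its parameters are illustrative, not from the original.
from multiprocessing import Pool


def build_minhashes(items, num_perm=128, processes=4):
    # items: iterable of (name, token_list) pairs; returns {name: MinHash}.
    # Note: hashfunc=hash is only stable across processes for ints, not strings.
    with Pool(processes) as pool:
        args = [(item, num_perm) for item in items]
        return dict(pool.imap_unordered(get_minhash, args))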
# Split two Keras models into weight blocks and MinHash each block so that
# near-duplicate blocks can later be found with LSH.
import numpy as np
import tensorflow as tf
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm

from .matrix_utils import split, split_model, dedup_blocks

vgg16 = tf.keras.applications.VGG16()
vgg19 = tf.keras.applications.VGG19()

s1 = split_model(vgg16, 500, 500, 32)
s2 = split_model(vgg19, 500, 500, 32)

num_perm = 128
min_dict = {}

pbar = tqdm(total=len(s1))
for i, val in enumerate(s1):
    # hashfunc=hash lets update_batch consume raw integers (the default SHA1
    # hash expects bytes); the block is quantized so equal values hash equally.
    m = MinHash(num_perm=num_perm, hashfunc=hash)
    m.update_batch(np.floor(val.flatten() * 10000).astype(int))
    min_dict[f"s1-{i}"] = m
    pbar.update(1)
    # if i > 20:
    #     break

pbar = tqdm(total=len(s2))
for i, val in enumerate(s2):
    m = MinHash(num_perm=num_perm, hashfunc=hash)
    m.update_batch(np.floor(val.flatten() * 10000).astype(int))
    min_dict[f"s2-{i}"] = m
    pbar.update(1)
    # if i > 20:
    #     break

lsh2 = MinHashLSH(threshold=0.9, num_perm=num_perm)
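# The snippet stops right after creating lsh2. A plausible continuation, shown
# as an assumption rather than the original code, indexes the vgg16 blocks and
# queries with the vgg19 blocks to surface near-duplicate weight blocks.
for key, m in min_dict.items():
    if key.startswith("s1-"):
        lsh2.insert(key, m)

for key, m in min_dict.items():
    if key.startswith("s2-"):
        candidates = lsh2.query(m)
        if candidates:
            print(key, "~", candidates)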