def constructNNList(self, studentSets, sampledSets, studentData, sampledData):
    print('Creating min-hashes for student data')
    self.studentMinHashes = self.createMinHash(studentSets)
    print('Creating min-hashes for rubric data')
    self.sampledMinHashes = self.createMinHash(sampledSets)

    self.forest = MinHashLSHForest(num_perm=128)
    for i, minHash in enumerate(self.sampledMinHashes):
        self.forest.add(str(i), minHash)
    self.forest.index()

    print('Calculating nearest neighbors')
    scores = []
    for i, query in enumerate(tqdm(self.studentMinHashes)):
        result = self.forest.query(query, 1)
        indexMatch = int(result[0])
        # Print examples of student code and their nearest neighbor.
        print(result)
        print('Student Code: \n')
        print(studentData[i])
        print('\n')
        print('Closest Sampled Code: \n')
        print(sampledData[indexMatch])
        print('\n')
        score = self.sampledMinHashes[indexMatch].jaccard(query)
        print('Score: \n')
        print(score)
        scores.append(score)
    return scores
def fit(self, X):
    self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    for i, x in enumerate(X):
        m = MinHash(num_perm=self._n_perm)
        for e in x:
            m.update(str(e).encode('utf8'))
        self._index.add(str(i), m)
    self._index.index()
def __init__(self, num_permutation=60):
    self.__num_permutation = num_permutation
    self.__forest = MinHashLSHForest(num_perm=self.__num_permutation)
    self.__lem = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    stop_words.add('—')
    stop_words.add('And')
    self.__stop_words = stop_words
def clustering(self, data_tag):
    """
    Params:
        :data_tag: Whether it's source or target data
    """
    # create a min hash forest to quickly find nearest neighbours
    self.forest = MinHashLSHForest(num_perm=self.num_perm)

    # initialize clusters with randomly chosen medoids
    medoids = random.sample(range(len(self.data_points[data_tag])),
                            self.num_clusters[data_tag])
    for i in range(self.num_clusters[data_tag]):
        cl = self.ClusterClass(self.data_points[data_tag][medoids[i]])
        self.clusters[data_tag].append(cl)
        # put the medoids in the forest
        self.forest.add(i, self.clusters[data_tag][-1].medoid.min_hash)
    self.forest.index()

    # for each data point find a cluster
    self.cluster_points(data_tag)

    # these will be needed for the stopping criterion
    cluster_names = [self.clusters[data_tag][i].medoid.string
                     for i in range(self.num_clusters[data_tag])]
    cluster_names_old = list(cluster_names)
    count = 0
    counts = []
    exit = False

    # clustering loop
    while not exit:
        count += 1
        # find the point that minimizes the mean distance within a cluster
        self.find_medoid(data_tag)
        # create a new forest over the new medoids
        self.forest = MinHashLSHForest(num_perm=self.num_perm)
        for i in range(self.num_clusters[data_tag]):
            self.forest.add(i, self.clusters[data_tag][i].medoid.min_hash)
        self.forest.index()
        # assign each point to the new medoids
        self.cluster_points(data_tag)
        # check the stopping criteria
        exit, cluster_names, cluster_names_old, counts = \
            self.stop_clustering(data_tag, cluster_names, cluster_names_old,
                                 count, counts)
def fit(self, X):
    self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    for i, x in enumerate(X):
        m = MinHash(num_perm=self._n_perm)
        for e in x:
            # MinHash.update expects bytes, so encode the string form.
            m.update(str(e).encode('utf8'))
        self._index.add(str(i), m)
    self._index.index()
def search_lshforest_jaccard_topk(index_data, query_data, b, r, k):
    (index_sets, index_keys, index_minhashes) = index_data
    (query_sets, query_keys, query_minhashes) = query_data
    num_perm = b * r
    print("Building LSH Forest Index.")
    start = time.perf_counter()
    index = MinHashLSHForest(num_perm=num_perm, l=b)
    # Use the indices of the indexed sets as keys in LSH.
    for i in range(len(index_keys)):
        index.add(i, index_minhashes[num_perm][i])
    index.index()
    end = time.perf_counter()
    print("Indexing time: {:.3f}.".format(end - start))
    print("Querying.")
    times = []
    results = []
    for query_minhash, query_key, query_set in \
            zip(query_minhashes[num_perm], query_keys, query_sets):
        start = time.perf_counter()
        result = index.query(query_minhash, k * 2)
        # Recover the retrieved indexed sets and
        # compute the exact Jaccard similarities.
        result = [[index_keys[i], compute_jaccard(query_set, index_sets[i])]
                  for i in result]
        # Sort by similarity.
        result.sort(key=lambda x: x[1], reverse=True)
        # Take the top k.
        result = result[:k]
        duration = time.perf_counter() - start
        times.append(duration)
        results.append((query_key, result))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
def mylshforest(corpus):
    forest = MinHashLSHForest(num_perm=32)
    score_res = [0]
    mh = []
    for i in range(len(corpus) - 1):
        doc = corpus[i]
        doc2 = corpus[i + 1]
        m = MinHash(num_perm=32)
        for d in doc:
            m.update(d.encode('utf8'))
        forest.add(str(i), m)
        # Re-index after each insertion so the new key is searchable
        # before querying with the next document.
        forest.index()
        mh.append(m)
        m2 = MinHash(num_perm=32)
        for d in doc2:
            m2.update(d.encode('utf8'))
        result = forest.query(m2, 10)
        score = 0.0
        for j in range(len(result)):
            score = score + m2.jaccard(mh[int(result[j])])
        if len(result) > 0:
            score = score / len(result)
        score_res.append(score)
    return score_res
def create_lsh_forest(self):
    cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')
    if not self.evict_cache and os.path.isfile(cache_file):
        # load precomputed forest
        print('Loading cached forest')
        self.forest = load_pickle(cache_file)
    else:
        sampledSets = self.processData(self.sampledData)
        self.sampledMinHashes = self.createMinHashSet(sampledSets)
        self.forest = MinHashLSHForest(num_perm=self.num_perm)
        for prog_idx, minHash in enumerate(self.sampledMinHashes):
            self.forest.add(prog_idx, minHash)
        self.forest.index()
        os.makedirs(self.CACHE_DIR, exist_ok=True)
        save_pickle(self.forest, cache_file)
def fit(self, X):
    # Each MinHash digest has length _n_perm, so size the matrix accordingly.
    self.index = numpy.empty([0, self._n_perm])
    self._index_minhash = []
    self._ball_index = []
    self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    for i, x in enumerate(X):
        m = MinHash(num_perm=self._n_perm)
        for e in x:
            m.update(str(e).encode('utf-8'))
        self._index.add(str(i), m)
        self.index = numpy.vstack((self.index, m.digest()))
        self._ball_index.append(m.digest())
        self._index_minhash.append(m)
    self._index.index()
    self._X = X
    self.tree = BallTree(self.index, leaf_size=self._n_leaves)
def create_LSH_Forest():
    global forest
    if os.path.isfile(LSH_FOREST_FILE):
        load_forest()
    else:
        forest = MinHashLSHForest(num_perm=128)
        train_records = glob.glob("dataset/train*.tfrecord")
        validate_records = glob.glob("dataset/validate*.tfrecord")
        all_records = train_records + validate_records
        dataset = tf.data.TFRecordDataset(all_records)
        iterator = dataset.make_one_shot_iterator()
        count = 0
        next_element = iterator.get_next()
        updated = False
        with tf.Session() as sess:
            try:
                while True:
                    if count % 10000 == 0:
                        print("[SimpleVideoSearch][{}] Processed {} records from the dataset so far".format(
                            datetime.now(), count))
                    if updated and count % 100000 == 0:
                        with open(LSH_FOREST_FILE, 'wb') as forest_file:
                            forest.index()
                            pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
                        print("[SimpleVideoSearch][{}] Updated LSH Forest file".format(datetime.now()))
                    exampleBinaryString = sess.run(next_element)
                    example = tf.train.Example.FromString(exampleBinaryString)
                    count += 1
                    example_id = example.features.feature["id"].bytes_list.value[0]
                    if example_id not in forest:
                        if not updated:
                            updated = True
                            print('[SimpleVideoSearch][{}] First update at record {}'.format(
                                datetime.now(), count))
                        dataset_labels_full = convert_dataset_labels_to_list(
                            example.features.feature["labels"].int64_list.value)
                        minhash = MinHash(num_perm=128)
                        for label in dataset_labels_full:
                            # MinHash.update expects bytes, so encode the integer label.
                            minhash.update(str(label).encode('utf8'))
                        forest.add(example_id, minhash)
            except tf.errors.OutOfRangeError:
                print("[SimpleVideoSearch][{}] Done iterating through dataset".format(datetime.now()))
            finally:
                print("[SimpleVideoSearch][{}] Processed {} records from the dataset".format(
                    datetime.now(), count))
                forest.index()
                with open(LSH_FOREST_FILE, 'wb') as forest_file:
                    pickle.dump(forest, forest_file, pickle.HIGHEST_PROTOCOL)
                print("[SimpleVideoSearch][{}] Finished creating LSH Forest file".format(datetime.now()))
def lsh_forest(self, algoritm_type=None, use_components=None,
               type_option=None, n_char=None, n_word=None):
    """
    LSH Function.

    Parameters
    --------
    use_components: list, optional: ['name', 'addr'] or ['name'] or ['addr'].
        Components to use.
    type_option: list, optional: ['char', 'word'] or ['char'] or ['word'].
        Gram types to use.
    n_char: list of int
        Sizes of char grams.
    n_word: list of int
        Sizes of word grams.
    algoritm_type: list, optional: [weighed] or [not_weighed]
        Type of algorithm.
    """
    algoritm_type = algoritm_type or 'not_weighed'
    use_components = use_components or ['name']
    type_option = type_option or ['char']
    n_char = n_char or [3]
    n_word = n_word or [1]
    if 'char' not in type_option and 'word' not in type_option:
        assert False, "Check the value of the type_option parameter."
    if 'name' not in use_components and 'addr' not in use_components:
        assert False, "Check the value of the use_components parameter."
    for i in use_components:
        for j in type_option:
            n_list = n_char if j == 'char' else n_word
            for n in n_list:
                key = '{}_{}_{}_{}'.format(algoritm_type, i, j, n)
                forest = MinHashLSHForest(self.num_perm)
                for idx, minhash in enumerate(self.features[key + 'minhash']):
                    forest.add(self.indices[idx], minhash)
                forest.index()
                LpuList.lsh[key + 'lsh'] = forest
    return self
class MinHas(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        # Use a one-element tuple; ('jaccard') is just a string and would
        # make this a substring check.
        if metric not in ('jaccard',):
            raise NotImplementedError(
                "Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        return map(int, self._index.query(m, n))
def _get_forest(self, data, perms):
    # START Time
    self.START_TIME = time.time()

    minhash_list = []
    for text in data['text']:
        min_hashtext = _create_hashtex(text=text, perms=perms,
                                       language=self.LANGUAGE)
        minhash_list.append(min_hashtext)

    forest = MinHashLSHForest(num_perm=perms)
    for item_index, list_item in enumerate(minhash_list):
        forest.add(item_index, list_item)
    forest.index()

    # END Time
    self.END_TIME = time.time()
    # TIMING LIST
    self.TIMING = [self.END_TIME, self.START_TIME]
    print('It took %s seconds to build forest.' % (calculate_duration(self.TIMING)))
    return forest
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        # Use a one-element tuple; ('jaccard') is just a string and would
        # make this a substring check.
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                # MinHash.update expects bytes, so encode the string form.
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))
        return map(int, self._index.query(m, n))
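# A minimal usage sketch for the DataSketch wrapper above, assuming each input
# is an iterable of hashable tokens. The example data and variable names here
# are illustrative, not taken from the original benchmark harness.
index = DataSketch('jaccard', n_perm=128, n_rep=8)
index.fit([['a', 'b', 'c'], ['b', 'c', 'd'], ['x', 'y', 'z']])
# query() returns up to n approximate nearest neighbours as integer indices.
neighbours = list(index.query(['b', 'c', 'e'], n=2))
print(neighbours)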
def getMinhashforest2(minhashs):
    # Create a MinHash LSH Forest with the same num_perm parameter
    forest = MinHashLSHForest(num_perm=128)
    for i in range(len(minhashs)):
        # Add each MinHash into the index, keyed by its position
        forest.add(i, minhashs[i])
    # IMPORTANT: must call index() otherwise the keys won't be searchable
    forest.index()
    return forest
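# A short sketch of querying the forest returned above, assuming `minhashs`
# is a list of datasketch MinHash objects built with num_perm=128.
forest = getMinhashforest2(minhashs)
# Retrieve the (approximate) top-5 most similar entries for the first MinHash;
# the results are the integer keys passed to forest.add().
top5 = forest.query(minhashs[0], 5)
print(top5)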
def build_lsh_forest_hash(game_data):
    forest = MinHashLSHForest(num_perm=_utils.HASH_REZ)
    for ind, row in game_data.iterrows():
        try:
            forest.add(f"{row['title']} (id:{row['id']})", row['_sim_hash'])
        except ValueError:
            print(f"{row['title']} already added")
    forest.index()
    return forest
def construct_lsh(obj_dict):
    forest = MinHashLSHForest(num_perm=128)
    # Convert the dict views to lists so they can be indexed (Python 3).
    keys = list(obj_dict.keys())
    values = list(obj_dict.values())
    ms = []
    for i in range(len(keys)):
        temp = MinHash(num_perm=128)
        for d in values[i]:
            temp.update(d.encode('utf8'))
        ms.append(temp)
        forest.add(keys[i], temp)
    forest.index()
    return forest, keys, ms
def store_lsh():
    forest = MinHashLSHForest(num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    for item in documents_en:
        minhash = MinHash(num_perm=128)
        ngrams = ngrams_token(remove_punctuation(item['content']), 3)
        for ngram in ngrams:
            minhash.update(ngram.encode("utf-8"))
        forest.add(str(item["_id"]), minhash)
    forest.index()
    with open('pickle_ngram.txt', 'wb') as ouf:
        cPickle.dump(forest, ouf)
    return forest
def target_lsh(grams):
    lsh_forest = MinHashLSHForest(num_perm=4000, l=200)
    lsh = MinHashLSH(threshold=0.5, num_perm=4000)
    for c, i in enumerate(grams):
        minhash = MinHash(num_perm=4000)
        i = i.replace(' ', '')
        for d in ngrams(i, 3):
            # MinHash.update expects bytes, so encode the joined n-gram.
            minhash.update(''.join(d).encode('utf8'))
        lsh_forest.add(c, minhash)
        lsh.insert(c, minhash)
    # Index once after all keys have been added so they become searchable.
    lsh_forest.index()
    return lsh_forest, lsh
def build_lsh_forest(self, company_name_column_name):
    """
    Build the LSH forest data structure from the sets of parsed description
    words for each company

    Parameters:
        company_name_column_name - string; name of the company name column
        in the company corpus dataframe
    """
    # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity.
    # 256 has been found to be a good amount. Increasing it may increase accuracy,
    # but will decrease speed and increase memory usage. Decreasing it will decrease accuracy.
    lsh_forest = MinHashLSHForest(num_perm=256)
    iteration = 1
    self.company_name_column_name = company_name_column_name
    self.name_to_index_map = dict(
        zip(self.company_corpus.corpus.loc[:, company_name_column_name],
            self.company_corpus.corpus.index))
    self.index_to_name_map = dict(
        zip(self.company_corpus.corpus.index,
            self.company_corpus.corpus.loc[:, company_name_column_name]))
    sys.stdout.write("Performing LSH...")
    for company in self.company_corpus.corpus.iterrows():
        # Use the 'datasketch' library to minhash the company descriptions
        # and hash them into the LSH forest
        company_name = company[1][company_name_column_name]
        if company_name in self.dict_of_minhash_keys:
            continue
        mh = MinHash(num_perm=256)
        if type(company[1]['rare_words']) is float:
            mh.update(str(company[1]['rare_words']).encode('utf8'))
        else:
            for word in company[1]['rare_words']:
                mh.update(str(word).encode('utf8'))
        self.dict_of_minhash_keys[company_name] = mh
        lsh_forest.add(company_name, mh)
        iteration += 1
    sys.stdout.write('\n')
    sys.stdout.write("Done performing LSH!\n")
    # Need this line below to be able to query the LSH forest!
    # (See datasketch docs on LSH forest for reasoning)
    lsh_forest.index()
    self.lsh_forest = lsh_forest
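# A hedged sketch of how the forest built above could be queried for similar
# companies. The method name, `query_description_words`, and `top_k` are
# illustrative assumptions, not names taken from the original class.
def find_similar_companies(self, query_description_words, top_k=10):
    query_mh = MinHash(num_perm=256)
    for word in query_description_words:
        query_mh.update(str(word).encode('utf8'))
    # Returns up to top_k company names (the keys used in lsh_forest.add).
    return self.lsh_forest.query(query_mh, top_k)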
def benchmark_lshforest(num_perm, l, k, index_data, query_data):
    print("Building LSH Forest index")
    forest = MinHashLSHForest(num_perm=num_perm, l=l)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        forest.add(key, minhash)
    forest.index()
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; use perf_counter() instead.
        start = time.perf_counter()
        result = forest.query(minhash, k)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(
            sorted([[key, _compute_jaccard(qs, index_data.sets[key])]
                    for key in result],
                   key=lambda x: x[1], reverse=True))
    return times, results
def __train_LSH(self, data):
    start_time = time.time()
    forest = MinHashLSHForest(num_perm=config.permutations)
    for item in tqdm(data, desc="MinHash Docs.."):
        tag = item['tag']
        tokens = item['data']
        if self.type == 'trigram':
            tokens = self.normalizer.generate_ngrams_char(tokens[0])
        m = MinHash(num_perm=config.permutations)
        for s in tokens:
            m.update(s.encode('utf8'))
        forest.add(tag, m)
    forest.index()
    print('It took %.2f seconds to build forest.' % (time.time() - start_time))
    return forest
def build_lsh_forest(self, company_name_column_name):
    # Note: num_perm is a tuning parameter, but has been abstracted away for simplicity.
    # 256 has been found to be a good amount. Increasing it may increase accuracy,
    # but will decrease speed and increase memory usage. Decreasing it will decrease accuracy.
    lsh_forest = MinHashLSHForest(num_perm=256)
    iteration = 0
    self.company_name_column_name = company_name_column_name
    self.name_to_index_map = dict(
        zip(self.company_corpus.corpus.loc[:, company_name_column_name],
            self.company_corpus.corpus.index))
    self.index_to_name_map = dict(
        zip(self.company_corpus.corpus.index,
            self.company_corpus.corpus.loc[:, company_name_column_name]))
    graph_size = self.company_corpus.corpus.shape[0]
    for company in self.company_corpus.corpus.iterrows():
        company_name = company[1][company_name_column_name]
        if company_name in self.dict_of_minhash_keys:
            continue
        mh = MinHash(num_perm=256)
        if type(company[1]['rare_words']) is float:
            mh.update(str(company[1]['rare_words']).encode('utf8'))
        else:
            for word in company[1]['rare_words']:
                mh.update(str(word).encode('utf8'))
        self.dict_of_minhash_keys[company_name] = mh
        lsh_forest.add(company_name, mh)
        # Use == rather than 'is' for integer comparisons.
        if iteration % 10000 == 0 or (iteration + 1) == graph_size:
            if (iteration + 1) == graph_size:
                iteration += 1
            sys.stdout.write('\r')
            sys.stdout.write(
                "LSH Forest Build Percent Complete: {0:0.2f}%".format(
                    round((iteration / graph_size) * 100)))
            sys.stdout.flush()
        iteration += 1
    sys.stdout.write('\n')
    # index() must be called before the forest can be queried.
    lsh_forest.index()
    self.lsh_forest = lsh_forest
def toBuildLSH(cleanSongs):
    '''
    :param cleanSongs
    :return: forest, min_hash_list
    '''
    forest = MinHashLSHForest(num_perm=128)
    min_hash_list = []
    for songIndex, song in enumerate(cleanSongs):
        minhash = MinHash(num_perm=128)
        for word in song:
            # encode each word before hashing
            minhash.update(word.encode('utf8'))
        # add each song's minhash to the forest as well as min_hash_list
        forest.add(str(songIndex), minhash)
        min_hash_list.append(minhash)
    forest.index()
    return forest, min_hash_list
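# Illustrative usage of toBuildLSH, assuming `cleanSongs` is a list of
# tokenized songs (lists of words); the query words below are invented.
forest, min_hash_list = toBuildLSH(cleanSongs)
query = MinHash(num_perm=128)
for word in ['hello', 'world']:
    query.update(word.encode('utf8'))
# Retrieve the indices of the 3 most similar songs and estimate their
# Jaccard similarity to the query from the stored MinHashes.
for idx in forest.query(query, 3):
    print(idx, min_hash_list[int(idx)].jaccard(query))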
def get_forest(self, data, perms):
    minhash = []
    for text in data['err']:
        tokens = self.preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    return forest
def form_lsh(self):
    minhash = []
    for s in self.__items:
        m = MinHash(num_perm=256)
        for q in s:
            m.update(q.encode('utf8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=256)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    self.__forest = forest
    self.__hashlist = minhash
    return forest
def build_lsh_forest(columns, override=False):
    """
    Builds a minHash LSH forest which can be used to query top-k columns
    with maximum Jaccard similarity
    @param override:
    @param columns:
    @return:
    """
    file_path = f'{os.environ["WORKING_DIRECTORY"]}/results/forest.obj'
    if override or not os.path.isfile(file_path):
        forest = MinHashLSHForest(num_perm=NUM_PERM)
        for column in columns:
            forest.add(f'{column["table"]}.{column["column"]}',
                       deserialize_minhash(column))
        forest.index()
        with open(file_path, 'wb') as file:
            pickle.dump(forest, file)
        return forest
    with open(file_path, 'rb') as file:
        forest = pickle.load(file)
    return forest
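# A minimal sketch of a top-k column lookup against the forest built above,
# assuming a query MinHash created with the same NUM_PERM; the sample values
# and k here are illustrative assumptions.
forest = build_lsh_forest(columns)
query_minhash = MinHash(num_perm=NUM_PERM)
for value in ['alice', 'bob', 'carol']:
    query_minhash.update(value.encode('utf8'))
# Keys come back in the 'table.column' format used when the forest was built.
print(forest.query(query_minhash, 5))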
def get_forest(records, perms):
    start_time = time.time()
    minhash = []
    for record in records:
        for text in record:
            tokens = preprocess(text)
            m = MinHash(num_perm=perms)
            for s in tokens:
                m.update(s.encode('utf8'))
            minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    print('It took %s seconds to build forest.' % (time.time() - start_time))
    return forest
def __datasketch_fit(self):
    if self.kwargs['create']:
        # Create a list of MinHash objects
        min_hash_obj_list = []
        forest = MinHashLSHForest(num_perm=self.kwargs['num_perm'])
        for i in range(len(self.features)):
            min_hash_obj_list.append(MinHash(num_perm=self.kwargs['num_perm']))
            for d in self.features[i]:
                # Assumes the feature values are already bytes-like.
                min_hash_obj_list[i].update(d)
            forest.add(i, min_hash_obj_list[i])
        # IMPORTANT: must call index() otherwise the keys won't be searchable
        forest.index()
        with open(self.kwargs['file_path'], "wb") as f:
            pickle.dump(forest, f)
            pickle.dump(min_hash_obj_list, f)
        self.predictor = [forest, min_hash_obj_list]
    else:
        with open(self.kwargs['file_path'], "rb") as f:
            forest = pickle.load(f)
            min_hash_obj_list = pickle.load(f)
        self.predictor = [forest, min_hash_obj_list]
def get_forest(data, perms):
    start_time = time.time()
    minhash = []
    for text in data:
        tokens = p.preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf-8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    for i, m in enumerate(minhash):
        forest.add(i, m)
    forest.index()
    print('time to build forest: ', (time.time() - start_time))
    return forest
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']

dataset = [[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [0., 1., 1.]]

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)
for i, data in enumerate(dataset):
    m = MinHash(num_perm=128)
    for d in data:
        m.update(str(d).encode('utf8'))
    forest.add(str(i), m)

# The forest can be pickled before indexing and indexed after it is reloaded.
pickle.dump(forest, open('forest.lsh', 'wb'))
del forest
forest = pickle.load(open('forest.lsh', 'rb'))

# IMPORTANT: must call index() otherwise the keys won't be searchable
forest.index()
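# Querying the reloaded forest from the snippet above; the query vector here
# is an illustrative assumption following the same encoding scheme.
query = MinHash(num_perm=128)
for d in [0., 1., 1.]:
    query.update(str(d).encode('utf8'))
# Returns up to 2 keys ('0'..'3') whose MinHashes are most similar to the query.
print(forest.query(query, 2))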
    return (hotelName, newMinHash)


def growForest(forest, minHahs):
    for hotel, hash in minHahs:
        forest.add(hotel, hash)


allHotels = getHotelsDict()
file_path = join("data", "lshforrest.p")

# Create minhashes for all the words
if not isfile(file_path):
    # Create a MinHash LSH Forest with the same num_perm parameter
    forest = MinHashLSHForest(num_perm=NN_PERM)
    allKeys = list(allHotels.keys())
    print(len(allKeys))
    with Pool(4) as pool:
        minHahs = pool.map(makeMinHash, allKeys[0:100000])
        print("Done 1!")
        growForest(forest, minHahs)
        minHahs = pool.map(makeMinHash, allKeys[100000:200000])
        print("Done 2!")
        growForest(forest, minHahs)
        minHahs = pool.map(makeMinHash, allKeys[200000:300000])
        print("Done 3!")
        growForest(forest, minHahs)
        minHahs = pool.map(makeMinHash, allKeys[300000:400000])
        print("Done 4!")
        growForest(forest, minHahs)