def fit(self, X):
    print('PANNG: start indexing...')
    t0 = time.time()
    dim = len(X[0])
    print('PANNG: # of data=' + str(len(X)))
    print('PANNG: Dimensionality=' + str(dim))
    index_dir = 'indexes'
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    index = os.path.join(index_dir, 'NGT-' + str(self._edge_size))
    print(index)
    if os.path.exists(index):
        print('PANNG: index already exists! ' + str(index))
        self.index = ngtpy.Index(index)
        opentime = time.time() - t0
        print('PANNG: open time(sec)=' + str(opentime))
    else:
        ngtpy.create(path=index, dimension=dim,
                     edge_size_for_creation=self._edge_size,
                     distance_type=self._metric,
                     object_type=self._object_type)
        idx = ngtpy.Index(path=index)
        idx.batch_insert(X, num_threads=24, debug=False)
        idx.save(index)
        idx.close()
        if self._pathadj_size > 0:
            print('PANNG: path adjustment')
            args = ['ngt', 'prune', '-s ' + str(self._pathadj_size), index]
            subprocess.call(args)
        self.index = ngtpy.Index(path=index)
        indexingtime = time.time() - t0
        print('PANNG: indexing, adjustment and saving time(sec)=' + str(indexingtime))
def fit(self, X):
    print('ONNG: start indexing...')
    dim = len(X[0])
    print('ONNG: # of data=' + str(len(X)))
    print('ONNG: dimensionality=' + str(dim))
    index_dir = 'indexes'
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    index = os.path.join(
        index_dir,
        'ONNG-{}-{}-{}'.format(self._edge_size, self._outdegree, self._indegree))
    anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
    print('ONNG: index=' + index)
    if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
        print('ONNG: create ANNG')
        t = time.time()
        args = ['ngt', 'create',
                '-it', '-p8', '-b500', '-ga', '-of',
                '-D' + self._metric,
                '-d' + str(dim),
                '-E' + str(self._edge_size),
                '-S' + str(self._edge_size_for_search),
                '-e' + str(self._epsilon),
                '-P0', '-B30',
                '-T' + str(self._build_time_limit),
                anngIndex]
        subprocess.call(args)
        idx = ngtpy.Index(path=anngIndex)
        idx.batch_insert(X, num_threads=24, debug=False)
        print('ONNG: ANNG construction time(sec)=' + str(time.time() - t))
        t = time.time()
        if self._refine_enabled:
            idx.refine_anng(epsilon=self._epsilon,
                            num_of_edges=self._edge_size,
                            num_of_explored_edges=self._edge_size_for_search)
        print('ONNG: RNNG construction time(sec)=' + str(time.time() - t))
        idx.save()
        idx.close()
    if not os.path.exists(index):
        print('ONNG: degree adjustment')
        t = time.time()
        args = ['ngt', 'reconstruct-graph', '-mS',
                '-o ' + str(self._outdegree),
                '-i ' + str(self._indegree),
                anngIndex, index]
        subprocess.call(args)
        print('ONNG: degree adjustment time(sec)=' + str(time.time() - t))
    if os.path.exists(index):
        print('ONNG: index already exists! ' + str(index))
        t = time.time()
        print(self._tree_disabled)
        self.index = ngtpy.Index(index, read_only=True,
                                 tree_disabled=self._tree_disabled)
        self.indexName = index
        print('ONNG: open time(sec)=' + str(time.time() - t))
    else:
        print('ONNG: something wrong.')
    print('ONNG: end of fit')
def create_ANN(e_id, pos_data, neg_data, is_extend):
    if is_extend:
        ngtpy.create(path=str(e_id) + '_extend.anng', dimension=768, distance_type="L2")
        index = ngtpy.Index(str(e_id) + '_extend.anng')
    else:
        ngtpy.create(path=str(e_id) + '.anng', dimension=768, distance_type="L2")
        index = ngtpy.Index(str(e_id) + '.anng')
    nX1 = np.array(list(pos_data['vec']))
    nX2 = np.array(list(neg_data['vec']))
    objects = np.concatenate((nX1, nX2))
    index.batch_insert(objects)
    index.build_index()
    y = np.concatenate((np.ones(len(nX1), dtype=int), np.zeros(len(nX2), dtype=int)))
    return index, y
def search_NN_ngt(test_emb, train_emb_flat, NN=1):
    import ngtpy

    Ntest, I, J, D = test_emb.shape
    closest_inds = np.empty((Ntest, I, J, NN), dtype=np.int32)
    l2_maps = np.empty((Ntest, I, J, NN), dtype=np.float32)

    # os.makedirs('tmp', exist_ok=True)
    dpath = f'/tmp/{os.getpid()}'

    ngtpy.create(dpath, D)
    index = ngtpy.Index(dpath)
    index.batch_insert(train_emb_flat)

    for n in range(Ntest):
        for i in range(I):
            for j in range(J):
                query = test_emb[n, i, j, :]
                results = index.search(query, NN)
                inds = [result[0] for result in results]
                closest_inds[n, i, j, :] = inds
                vecs = np.asarray([index.get_object(inds[nn]) for nn in range(NN)])
                dists = np.linalg.norm(query - vecs, axis=-1)
                l2_maps[n, i, j, :] = dists

    shutil.rmtree(dpath)
    return l2_maps, closest_inds
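# A minimal usage sketch for search_NN_ngt above. The shapes and the random
# data are illustrative assumptions, not part of the original code: test_emb
# is a 4-D array of patch embeddings and train_emb_flat holds the flattened
# training embeddings with the same feature dimension D.
import numpy as np

test_emb = np.random.rand(2, 4, 4, 64).astype(np.float32)    # (Ntest, I, J, D)
train_emb_flat = np.random.rand(500, 64).astype(np.float32)  # (Ntrain, D)

l2_maps, closest_inds = search_NN_ngt(test_emb, train_emb_flat, NN=1)
print(l2_maps.shape, closest_inds.shape)  # both (2, 4, 4, 1)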
def ngt(self, train_data, train_labels, test_data, test_labels):
    """
    Run NGT. Compute training and testing times.
    [1] https://github.com/yahoojapan/NGT
    """
    train_time = 0
    time0 = time.time()
    ngtpy.create(b"data", len(train_data[0]), distance_type=self.ngt_distance)
    model = ngtpy.Index(b"data")
    model.batch_insert(train_data)
    time1 = time.time()
    model.save()
    predicted_labels = self.ngt_predict(model, test_data, train_labels)
    if self.test_time_ms is None:
        time2 = time.time()
        self.ngt_predict(model, train_data[0:self.test_time_samples], train_labels)
        time3 = time.time()
        self.test_time_ms = 1000.0 * (time3 - time2) / self.test_time_samples
    train_time = time1 - time0
    return predicted_labels, train_time
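# self.ngt_predict is called above but not shown in this snippet. A minimal
# sketch of a 1-nearest-neighbour implementation it could correspond to; this
# is an assumption for illustration, not the original method.
def ngt_predict(self, model, test_data, train_labels):
    predicted = []
    for query in test_data:
        results = model.search(query, size=1)  # list of (object_id, distance)
        nearest_id = results[0][0]
        predicted.append(train_labels[nearest_id])
    return predicted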
def test_simple_ngt(tmpdir):
    import ngtpy

    path = os.path.join(tmpdir, 'ngt-index')
    dimension, queries, top_k, batch_size, num_batch = 10, 3, 5, 8, 3

    ngtpy.create(path=path, dimension=dimension, distance_type='L2')
    _index = ngtpy.Index(path=path)
    for i in range(num_batch):
        _index.batch_insert(np.random.random((batch_size, dimension)), num_threads=4)
    assert os.path.exists(path)

    idx = []
    dist = []
    for key in np.random.random((queries, dimension)):
        results = _index.search(key, size=top_k, epsilon=0.1)
        index_k = []
        distance_k = []
        for result in results:
            index_k.append(result[0])
            distance_k.append(result[1])
        idx.append(index_k)
        dist.append(distance_k)

    idx = np.array(idx)
    dist = np.array(dist)
    assert idx.shape == dist.shape
    assert idx.shape == (queries, top_k)
def __init__(self, db, f_class=None, d_type='L1'):
    self.NGT_dir = 'NGT_{}_{}'.format(f_class, d_type)
    self.NGT_path = b''
    self.fearure = f_class
    self.SQLdb = SQLite()
    if f_class == 'daisy':
        self.f_c = Daisy()
        self.NGT_path = b'NGT/NGT_daisy_' + d_type.encode()
    elif f_class == 'edge':
        self.f_c = Edge()
        self.NGT_path = b'NGT/NGT_edge_' + d_type.encode()
    elif f_class == 'hog':
        self.f_c = HOG()
        self.NGT_path = b'NGT/NGT_hog_' + d_type.encode()
    elif f_class == 'vgg':
        self.f_c = VGGNetFeat()
        self.NGT_path = b'NGT/NGT_vgg_' + d_type.encode()
    elif f_class == 'res':
        self.f_c = ResNetFeat()
        self.NGT_path = b'NGT/NGT_res_' + d_type.encode()
    if not os.path.exists(os.path.join(NGT_dir, self.NGT_dir)):
        samples = self.f_c.make_samples(db, verbose=False)
        dim = 0
        try:
            dim = samples[0]['hist'].shape[0]
        except:
            pass
        images = []
        objects = []
        for i, row in enumerate(samples):
            vector = row['hist']
            link = row['img']
            lable = row['cls']
            data = {'index': i, 'link': link, 'lable': lable}
            images.append(data)
            objects.append(vector)
        self.SQLdb.updateMuti(f_class, images)
        # cPickle.dump(images, open(os.path.join(NGT_dir, sample_cache), "wb", True))
        ngtpy.create(path=self.NGT_path, dimension=dim, distance_type=d_type)
        self.index = ngtpy.Index(self.NGT_path)
        self.index.batch_insert(objects)
        self.index.save()
    self.index = ngtpy.Index(self.NGT_path)
def __init__(self, dims, edge_size_for_search=40, epsilon=0.1):
    self.dims = dims
    self.vectors_map = {}
    index_path = 'livemint.anng'
    ngtpy.create(index_path, dims, edge_size_for_search=edge_size_for_search)  # create an empty index
    self.index = ngtpy.Index(index_path)  # open the index
    self.content_id_to_ngt_id = {}
    self.epsilon = epsilon
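# Only the constructor is shown above. A hedged sketch of how content could be
# added and queried while keeping the content_id_to_ngt_id mapping up to date;
# the method names and the sequential-id assumption are illustrative, not part
# of the original class.
def add(self, content_id, vector):
    ngt_id = len(self.content_id_to_ngt_id)  # assumes ids are assigned sequentially (zero-based)
    self.index.insert(vector)
    self.index.build_index()
    self.content_id_to_ngt_id[content_id] = ngt_id
    self.vectors_map[content_id] = vector

def similar(self, vector, k=10):
    # Returns a list of (ngt_id, distance) pairs for the k nearest neighbours.
    return self.index.search(vector, size=k, epsilon=self.epsilon)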
def build_advanced_index(self, vecs: 'np.ndarray'):
    import ngtpy
    ngtpy.create(path=self.index_path, dimension=self.num_dim, distance_type=self.metric)
    _index = ngtpy.Index(self.index_path)
    _index.batch_insert(vecs, num_threads=1)
    return _index
def fit(self, X):
    print('QG: start indexing...')
    dim = len(X[0])
    print('QG: # of data=' + str(len(X)))
    print('QG: dimensionality=' + str(dim))
    index_dir = 'indexes'
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    index = os.path.join(
        index_dir,
        'ONNG-{}-{}-{}'.format(self._edge_size, self._outdegree, self._indegree))
    anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
    print('QG: index=' + index)
    if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
        print('QG: create ANNG')
        t = time.time()
        args = [
            'ngt', 'create',
            '-it', '-p8', '-b500', '-ga', '-of',
            '-D' + self._metric,
            '-d' + str(dim),
            '-E' + str(self._edge_size),
            '-S40',
            '-e' + str(self._epsilon),
            '-P0', '-B30',
            '-T' + str(self._build_time_limit),
            anngIndex
        ]
        subprocess.call(args)
        idx = ngtpy.Index(path=anngIndex)
        idx.batch_insert(X, num_threads=24, debug=False)
        idx.save()
        idx.close()
        print('QG: ANNG construction time(sec)=' + str(time.time() - t))
    if not os.path.exists(index):
        print('QG: degree adjustment')
        t = time.time()
        args = [
            'ngt', 'reconstruct-graph', '-mS',
            '-E ' + str(self._outdegree),
            '-o ' + str(self._outdegree),
            '-i ' + str(self._indegree),
            anngIndex, index
        ]
        subprocess.call(args)
        print('QG: degree adjustment time(sec)=' + str(time.time() - t))
    if not os.path.exists(index + '/qg'):
        print('QG: quantization')
        t = time.time()
        args = [
            'ngtqg', 'quantize',
            '-C1', '-c16',
            '-N ' + str(dim),
            '-Ms', '-lk',
            index
        ]
        subprocess.call(args)
        print('QG: quantization time(sec)=' + str(time.time() - t))
    if os.path.exists(index):
        print('QG: index already exists! ' + str(index))
        t = time.time()
        self.index = ngtpy.QuantizedIndex(index, self._max_edge_size)
        self.index.set_with_distance(False)
        self.indexName = index
        print('QG: open time(sec)=' + str(time.time() - t))
    else:
        print('QG: something wrong.')
    print('QG: end of fit')
def _load_index(self):
    """Create or open the NGT index."""
    path = os.path.join(self.data_directory, "ngt")
    if not os.path.exists(path):
        ngtpy.create(path, dimension=300)  # Spacy word vectors are 300D
    return ngtpy.Index(path)
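# A hedged sketch of how the index returned by _load_index might be populated
# with spaCy vectors and persisted. The method name, the nlp object, and the
# word list are illustrative assumptions, not part of the original class.
def _index_tokens(self, nlp, words):
    index = self._load_index()
    for word in words:
        index.insert(nlp.vocab[word].vector)  # 300-d spaCy word vector
    index.build_index()
    index.save()
    return index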
def get_query_handler(self):
    """Index all vectors; if already indexed, return the NGT index handle."""
    import ngtpy
    vecs = super().get_query_handler()
    if vecs is not None:
        ngtpy.create(path=self.index_path, dimension=self.num_dim, distance_type=self.metric)
        _index = ngtpy.Index(self.index_path)
        _index.batch_insert(vecs, num_threads=self.num_threads)
        return _index
    else:
        return None
def __init__(self, data):
    import tempfile, ngtpy
    data = np.asarray(data)
    self.index_directory = tempfile.TemporaryDirectory()
    self.index_name = bytes(self.index_directory.name, 'ascii')
    # Can manually delete the index file with:
    #   tree = NGT( <data> )
    #   tree.index_directory.cleanup()
    ngtpy.create(self.index_name, data.shape[1])
    self.index = ngtpy.Index(self.index_name)
    self.index.batch_insert(data)
    self.index.save()
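# The wrapper above only builds the index; a query helper is not shown. A
# minimal sketch of one, assuming the same self.index attribute; the method
# name and return format are illustrative, not part of the original class.
def query(self, vector, k=10):
    # Returns (ids, distances) of the k approximate nearest neighbours.
    results = self.index.search(vector, size=k)
    ids = [r[0] for r in results]
    dists = [r[1] for r in results]
    return ids, dists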
def main():
    dim = len(idolvecs[1][0])
    ngtpy.create(b"tmp", dim)
    index = ngtpy.Index(b"tmp")
    index.batch_insert(idolvecs[1])
    index.save()
    i = int(sys.argv[2])
    target_name = idolvecs[0][i]
    target_vec = idolvecs[1][i]
    print(target_name)
    similar_v(index, target_vec)
def fit(self, X):
    print('ONNG: start indexing...')
    dim = len(X[0])
    print('ONNG: # of data=' + str(len(X)))
    print('ONNG: dimensionality=' + str(dim))
    index_dir = 'indexes'
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    index = os.path.join(index_dir,
                         'ONNG-' + str(self._edge_size) + '-' +
                         str(self._outdegree) + '-' + str(self._indegree))
    anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
    print('ONNG: index=' + index)
    if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
        print('ONNG: create ANNG')
        t = time.time()
        args = ['ngt', 'create',
                '-it', '-p8', '-b500', '-ga', '-of',
                '-D' + self._metric,
                '-d' + str(dim),
                '-E' + str(self._edge_size),
                '-S0',
                '-e' + str(self._epsilon),
                '-P0', '-B30',
                anngIndex]
        subprocess.call(args)
        idx = ngtpy.Index(path=anngIndex)
        idx.batch_insert(X, num_threads=1, debug=False)
        idx.save()
        idx.close()
        print('ONNG: ANNG construction time(sec)=' + str(time.time() - t))
    if not os.path.exists(index):
        print('ONNG: degree adjustment')
        t = time.time()
        args = ['ngt', 'reconstruct-graph', '-mS',
                '-o ' + str(self._outdegree),
                '-i ' + str(self._indegree),
                anngIndex, index]
        subprocess.check_call(args)
        print('ONNG: degree adjustment time(sec)=' + str(time.time() - t))
    if os.path.exists(index):
        print('ONNG: index already exists! ' + str(index))
        t = time.time()
        self.index = ngtpy.Index(index, read_only=True)
        self.indexName = index
        print('ONNG: open time(sec)=' + str(time.time() - t))
    else:
        print('ONNG: something wrong.')
    print('ONNG: end of fit')
def build_advanced_index(self, vecs: 'np.ndarray'):
    """
    Build an advanced index structure from a numpy array.

    :param vecs: numpy array containing the vectors to index
    :return: advanced NGT index
    """
    import ngtpy
    ngtpy.create(path=self.index_path, dimension=self.num_dim, distance_type=self._metric)
    _index = ngtpy.Index(self.index_path)
    _index.batch_insert(vecs, num_threads=self._num_threads)
    return _index
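# A hedged sketch of querying the index returned by build_advanced_index.
# `indexer` stands for an already-constructed instance of the surrounding
# class; it, the array shapes, and top_k=10 are illustrative assumptions.
import numpy as np

vecs = np.random.rand(1000, indexer.num_dim).astype(np.float32)
_index = indexer.build_advanced_index(vecs)
for q in vecs[:3]:
    results = _index.search(q, size=10)  # list of (object_id, distance)
    print([r[0] for r in results])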
def ANN_cal(e_id, vec, y):
    index = ngtpy.Index(str(e_id) + '.anng')
    label = []
    for i in vec:
        results = index.search(i, size=8)
        # Sum of distances to the 8 nearest neighbours, used to weight votes.
        total = 0
        for j in results:
            total += j[1]
        if total == 0:
            pos = 0
            neg = 1
        else:
            pos = 0
            neg = 0
            for j in results:
                # Distance-weighted vote: closer neighbours contribute more.
                if y[j[0]] == 1:
                    pos += 1 - j[1] / total
                else:
                    neg += 1 - j[1] / total
        if pos > neg:
            label.append(1)
        else:
            label.append(0)
    return label
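# A hedged end-to-end sketch tying create_ANN and ANN_cal together. The
# DataFrame columns, the entity id 42, and the random vectors are illustrative
# assumptions, not data from the original project.
import numpy as np
import pandas as pd

pos_df = pd.DataFrame({'vec': list(np.random.rand(50, 768))})
neg_df = pd.DataFrame({'vec': list(np.random.rand(50, 768))})
index, y = create_ANN(e_id=42, pos_data=pos_df, neg_data=neg_df, is_extend=False)

test_vectors = np.random.rand(5, 768)
labels = ANN_cal(42, test_vectors, y)  # 1 = closer to positive neighbours, 0 otherwise
print(labels)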
f = open(q_embedding_file, "rb+")
all_question_embedding = list((pickle.load(f)).values())
f.close()
train_question_embedding = all_question_embedding[:-5000]
eval_question_embedding = all_question_embedding[-5000:]

f = open(qa_embedding_file, "rb+")
all_question_answer_embedding = list((pickle.load(f)).values())
f.close()
train_qa_embedding = all_question_answer_embedding[:-5000]
eval_qa_embedding = all_question_answer_embedding[-5000:]

ngtpy.create(b"evaluation_questions_index", len(all_question_embedding[0]))
index = ngtpy.Index(b"evaluation_questions_index")
index.batch_insert(train_question_embedding)
index.save()

total_cosine_similarity = 0.0
QA = open(qa_file, "r").readlines()
eval_qa_pairs = open(eval_qa_file, "r").readlines()
eval_questions = open(eval_q_file, "r").readlines()

print("Start evaluation")
for i in tqdm.tqdm(range(len(eval_question_embedding))):
    question = eval_questions[i]
    q_embedding = eval_question_embedding[i]
    result = index.search(q_embedding, opt.beam)
def dedupe(self, args):
    if not self.load_hashcache():
        self.dump_hashcache()

    # check num_proc
    if args.num_proc is None:
        num_proc = max(cpu_count() - 1, 1)
    else:
        num_proc = args.num_proc

    if self.ngt:
        try:
            import ngtpy
        except:
            logger.error(colored("Error: Unable to load NGT. Please install NGT and python binding first.", 'red'))
            sys.exit(1)
        index_path = self.get_ngt_index_path()
        logger.warning("Building NGT index (dimension={}, num_proc={})".format(self.hash_bits, num_proc))
        ngtpy.create(path=index_path.encode(), dimension=self.hash_bits,
                     edge_size_for_creation=args.ngt_edges,
                     edge_size_for_search=args.ngt_edges_for_search,
                     object_type="Byte", distance_type="Hamming")
        ngt_index = ngtpy.Index(index_path.encode())
        ngt_index.batch_insert(self.hashcache.hshs(), num_proc)

        # NGT Approximate neighbor search
        logger.warning("Approximate neighbor searching using NGT")
        hshs = self.hashcache.hshs()
        check_list = [0] * len(hshs)
        current_group_num = 1
        if not args.query:
            for i in tqdm(range(len(hshs))):
                new_group_found = False
                if check_list[i] != 0:
                    # already grouped image
                    continue
                for res in ngt_index.search(hshs[i], size=args.ngt_k, epsilon=args.ngt_epsilon):
                    if res[0] == i:
                        continue
                    else:
                        if res[1] <= self.hamming_distance:
                            if check_list[res[0]] == 0:
                                if check_list[i] == 0:
                                    # new group
                                    new_group_found = True
                                    check_list[i] = current_group_num
                                    check_list[res[0]] = current_group_num
                                    self.group[current_group_num] = [self.image_filenames[i]]
                                    self.group[current_group_num].extend([self.image_filenames[res[0]]])
                                else:
                                    # exists group
                                    exists_group_num = check_list[i]
                                    check_list[res[0]] = exists_group_num
                                    self.group[exists_group_num].extend([self.image_filenames[res[0]]])
                if new_group_found:
                    current_group_num += 1
        else:
            # query image
            new_group_found = False
            hsh = self.hashcache.gen_hash(args.query)
            self.group[current_group_num] = []
            for res in ngt_index.search(hsh, size=args.ngt_k, epsilon=args.ngt_epsilon):
                if res[1] <= self.hamming_distance:
                    new_group_found = True
                    self.group[current_group_num].extend([self.image_filenames[res[0]]])
            if new_group_found:
                current_group_num += 1

        # remove ngt index
        if index_path:
            os.system("rm -rf {}".format(index_path))

    elif self.hnsw:
        try:
            import hnswlib
        except:
            logger.error(colored("Error: Unable to load hnsw. Please install hnsw python binding first.", 'red'))
            sys.exit(1)
        hshs = self.hashcache.hshs()
        num_elements = len(hshs)
        hshs_labels = np.arange(num_elements)
        hnsw_index = hnswlib.Index(space='l2', dim=self.hash_bits)  # Squared L2
        hnsw_index.init_index(max_elements=num_elements, ef_construction=args.hnsw_ef_construction, M=args.hnsw_m)
        hnsw_index.set_ef(max(args.hnsw_ef, args.hnsw_k - 1))  # ef should always be > k
        hnsw_index.set_num_threads(num_proc)
        logger.warning("Building hnsw index (dimension={}, num_proc={})".format(self.hash_bits, num_proc))
        hnsw_index.add_items(hshs, hshs_labels, num_proc)

        # hnsw Approximate neighbor search
        logger.warning("Approximate neighbor searching using hnsw")
        check_list = [0] * num_elements
        current_group_num = 1
        if not args.query:
            for i in tqdm(range(num_elements)):
                new_group_found = False
                if check_list[i] != 0:
                    # already grouped image
                    continue
                labels, distances = hnsw_index.knn_query(hshs[i], k=args.hnsw_k, num_threads=num_proc)
                for label, distance in zip(labels[0], distances[0]):
                    if label == i:
                        continue
                    else:
                        if distance <= self.hamming_distance:
                            if check_list[label] == 0:
                                if check_list[i] == 0:
                                    # new group
                                    new_group_found = True
                                    check_list[i] = current_group_num
                                    check_list[label] = current_group_num
                                    self.group[current_group_num] = [self.image_filenames[i]]
                                    self.group[current_group_num].extend([self.image_filenames[label]])
                                else:
                                    # exists group
                                    exists_group_num = check_list[i]
                                    check_list[label] = exists_group_num
                                    self.group[exists_group_num].extend([self.image_filenames[label]])
                if new_group_found:
                    current_group_num += 1
        else:
            # query image
            new_group_found = False
            hsh = self.hashcache.gen_hash(args.query)
            self.group[current_group_num] = []
            labels, distances = hnsw_index.knn_query(hsh, k=args.hnsw_k, num_threads=num_proc)
            for label, distance in zip(labels[0], distances[0]):
                if distance <= self.hamming_distance:
                    new_group_found = True
                    self.group[current_group_num].extend([self.image_filenames[label]])
            if new_group_found:
                current_group_num += 1

    elif self.faiss_flat:
        try:
            import faiss
        except:
            logger.error(colored("Error: Unable to load faiss. Please install faiss python binding first.", 'red'))
            sys.exit(1)
        hshs = self.hashcache.hshs()
        faiss.omp_set_num_threads(num_proc)
        logger.warning("Building faiss index (dimension={}, num_proc={})".format(self.hash_bits, num_proc))
        data = np.array(hshs).astype('float32')
        index = faiss.IndexFlatL2(self.hash_bits)  # Exact search
        index.add(data)

        # faiss Exact neighbor search
        logger.warning("Exact neighbor searching using faiss")
        check_list = [0] * index.ntotal
        current_group_num = 1
        if not args.query:
            for i in tqdm(range(index.ntotal)):
                new_group_found = False
                if check_list[i] != 0:
                    # already grouped image
                    continue
                distances, labels = index.search(data[[i]], args.faiss_flat_k)
                for label, distance in zip(labels[0], distances[0]):
                    if label == i:
                        continue
                    else:
                        if distance <= self.hamming_distance:
                            if check_list[label] == 0:
                                if check_list[i] == 0:
                                    # new group
                                    new_group_found = True
                                    check_list[i] = current_group_num
                                    check_list[label] = current_group_num
                                    self.group[current_group_num] = [self.image_filenames[i]]
                                    self.group[current_group_num].extend([self.image_filenames[label]])
                                else:
                                    # exists group
                                    exists_group_num = check_list[i]
                                    check_list[label] = exists_group_num
                                    self.group[exists_group_num].extend([self.image_filenames[label]])
                if new_group_found:
                    current_group_num += 1
        else:
            # query image
            new_group_found = False
            hsh = np.array([self.hashcache.gen_hash(args.query)]).astype('float32')
            self.group[current_group_num] = []
            distances, labels = index.search(hsh, args.faiss_flat_k)
            for label, distance in zip(labels[0], distances[0]):
                if distance <= self.hamming_distance:
                    new_group_found = True
                    self.group[current_group_num].extend([self.image_filenames[label]])
            if new_group_found:
                current_group_num += 1

    else:
        logger.warning("Searching similar images")
        hshs = self.hashcache.hshs()
        check_list = [0] * len(hshs)
        current_group_num = 1
        if not args.query:
            for i in tqdm(range(len(hshs))):
                new_group_found = False
                hshi = hshs[i]
                for j in range(i + 1, len(hshs)):
                    hshj = hshs[j]
                    hamming_distance = np.count_nonzero(hshi != hshj)
                    if hamming_distance <= self.hamming_distance:
                        if check_list[j] == 0:
                            if check_list[i] == 0:
                                # new group
                                new_group_found = True
                                check_list[i] = current_group_num
                                check_list[j] = current_group_num
                                self.group[current_group_num] = [self.image_filenames[i]]
                                self.group[current_group_num].extend([self.image_filenames[j]])
                            else:
                                # exists group
                                exists_group_num = check_list[i]
                                # check hamming distances of exists group
                                for filename in self.group[exists_group_num]:
                                    h = hshs[self.image_filenames.index(filename)]
                                    hamming_distance = np.count_nonzero(h != hshj)
                                    if not hamming_distance <= self.hamming_distance:
                                        continue
                                check_list[j] = exists_group_num
                                self.group[exists_group_num].extend([self.image_filenames[j]])
                if new_group_found:
                    current_group_num += 1
        else:
            # query image
            new_group_found = False
            hsh = self.hashcache.gen_hash(args.query)
            self.group[current_group_num] = []
            for i in tqdm(range(len(hshs))):
                hshi = hshs[i]
                hamming_distance = np.count_nonzero(hshi != hsh)
                if hamming_distance <= self.hamming_distance:
                    new_group_found = True
                    self.group[current_group_num].extend([self.image_filenames[i]])
            if new_group_found:
                current_group_num += 1

    # sort self.group
    self.sort_group()

    # write duplicate log file
    self.num_duplicate_set = current_group_num - 1
    if self.num_duplicate_set > 0 and args.log:
        now = datetime.now().strftime('%Y%m%d%H%M%S')
        duplicate_log_file = "{}_{}".format(now, self.get_duplicate_log_name())
        with open(duplicate_log_file, 'w') as f:
            if args.query:
                f.write("Query: {}\n".format(args.query))
            for k in range(1, self.num_duplicate_set + 1):
                img_list = self.group[k]
                if len(img_list) > 1:
                    sorted_img_list, _, _, _ = self.sort_image_list(img_list)
                    if args.sameline:
                        f.write(" ".join(sorted_img_list) + "\n")
                    else:
                        f.write("\n".join(sorted_img_list) + "\n")
                    if k != len(self.group):
                        f.write("\n")
def kneighbors(
        self, X=None, n_candidates=None, return_distance=True
) -> Union[Tuple[np.array, np.array], np.array]:
    """ Retrieve k nearest neighbors.

    Parameters
    ----------
    X: np.array or None, optional, default = None
        Query objects. If None, search among the indexed objects.
    n_candidates: int or None, optional, default = None
        Number of neighbors to retrieve.
        If None, use the value passed during construction.
    return_distance: bool, default = True
        If return_distance, will return distances and indices to neighbors.
        Else, only return the indices.
    """
    check_is_fitted(self, 'index_')
    if X is not None:
        X = check_array(X)

    n_test = self.n_samples_fit_ if X is None else X.shape[0]
    dtype = self.X_dtype_ if X is None else X.dtype

    if n_candidates is None:
        n_candidates = self.n_candidates
    n_candidates = check_n_candidates(n_candidates)

    # For compatibility reasons, as each sample is considered as its own
    # neighbor, one extra neighbor will be computed.
    if X is None:
        n_neighbors = n_candidates + 1
        start = 1
    else:
        n_neighbors = n_candidates
        start = 0

    # If fewer candidates than required are found for a query,
    # we save index=-1 and distance=NaN
    neigh_ind = -np.ones((n_test, n_candidates), dtype=np.int32)
    if return_distance:
        neigh_dist = np.empty_like(neigh_ind, dtype=dtype) * np.nan

    if isinstance(self.index_, str):
        index = ngtpy.Index(self.index_)
    else:
        index = self.index_

    disable_tqdm = False if self.verbose else True

    if X is None:
        for i in tqdm(
                range(n_test),
                desc='Query NNG',
                disable=disable_tqdm,
        ):
            query = index.get_object(i)
            response = index.search(
                query=query,
                size=n_neighbors,
                with_distance=return_distance,
                epsilon=self.epsilon,
            )
            if return_distance:
                ind, dist = [np.array(arr) for arr in zip(*response)]
            else:
                ind = response
            ind = ind[start:]
            neigh_ind[i, :len(ind)] = ind
            if return_distance:
                dist = dist[start:]
                neigh_dist[i, :len(dist)] = dist
    else:  # if X was provided
        for i, x in tqdm(
                enumerate(X),
                desc='Query NNG',
                disable=disable_tqdm,
        ):
            response = index.search(
                query=x,
                size=n_neighbors,
                with_distance=return_distance,
                epsilon=self.epsilon,
            )
            if return_distance:
                ind, dist = [np.array(arr) for arr in zip(*response)]
            else:
                ind = response
            ind = ind[start:]
            neigh_ind[i, :len(ind)] = ind
            if return_distance:
                dist = dist[start:]
                neigh_dist[i, :len(dist)] = dist

    if return_distance and self.metric == 'sqeuclidean':
        neigh_dist **= 2

    if return_distance:
        return neigh_dist, neigh_ind
    else:
        return neigh_ind
def fit(self, X, y=None):
    """ Build the ngtpy.Index and insert data from X.

    Parameters
    ----------
    X: np.array
        Data to be indexed
    y: any
        Ignored

    Returns
    -------
    self: NNG
        An instance of NNG with a built index
    """
    if y is None:
        X = check_array(X)
    else:
        X, y = check_X_y(X, y)
        self.y_train_ = y

    self.n_samples_fit_ = X.shape[0]
    self.n_features_ = X.shape[1]
    self.X_dtype_ = X.dtype

    # Map common distance names to names used by ngt
    try:
        self.effective_metric_ = NNG.internal_distance_type[self.metric]
    except KeyError:
        self.effective_metric_ = self.metric
    if self.effective_metric_ not in NNG.valid_metrics:
        raise ValueError(
            f'Unknown distance/similarity measure: {self.effective_metric_}. '
            f'Please use one of: {NNG.valid_metrics}.')

    # Set up a directory to save the index to
    prefix = 'skhubness_'
    suffix = '.anng'
    if self.index_dir in ['auto']:
        index_path = create_tempfile_preferably_in_dir(
            prefix=prefix, suffix=suffix, directory='/dev/shm')
        logging.warning(
            f'The index will be stored in {index_path}. '
            f'It will NOT be deleted automatically, when this instance is destructed.')
    elif isinstance(self.index_dir, str):
        index_path = create_tempfile_preferably_in_dir(
            prefix=prefix, suffix=suffix, directory=self.index_dir)
    elif self.index_dir is None:
        index_path = create_tempfile_preferably_in_dir(prefix=prefix, suffix=suffix)
    else:
        raise TypeError(
            f'NNG requires to write an index to the filesystem. '
            f'Please provide a valid path with parameter `index_dir`.')

    # Create the ANNG index, insert data
    ngtpy.create(
        path=index_path,
        dimension=self.n_features_,
        edge_size_for_creation=self.edge_size_for_creation,
        edge_size_for_search=self.edge_size_for_search,
        distance_type=self.effective_metric_,
    )
    index_obj = ngtpy.Index(index_path)
    index_obj.batch_insert(X, num_threads=self.n_jobs)
    index_obj.save()

    # Convert ANNG to ONNG
    if self.optimize:
        optimizer = ngtpy.Optimizer()
        optimizer.set(num_of_outgoings=self.num_outgoing,
                      num_of_incomings=self.num_incoming)
        index_path_onng = str(pathlib.Path(index_path).with_suffix('.onng'))
        optimizer.execute(index_path, index_path_onng)
        index_path = index_path_onng

    # Keep index in memory or store in path
    if self.index_dir is None:
        self.index_ = index_obj
    else:
        # index_obj.save()
        self.index_ = index_path

    return self
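# A hedged usage sketch for the NNG estimator whose fit/kneighbors methods are
# shown above. The constructor arguments and data shapes are illustrative
# assumptions.
import numpy as np

X = np.random.rand(1000, 64).astype(np.float32)
nng = NNG(n_candidates=10, metric='euclidean', index_dir=None)
nng.fit(X)
dist, ind = nng.kneighbors()            # neighbors among the indexed data
dist_q, ind_q = nng.kneighbors(X[:5])   # neighbors of explicit query vectors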
def build_similarity_structure(model_file, viable_lines, n_items, strategy,
                               n_top=10, n_candidates=1, num_trees=None,
                               epsilon=None, df=None):
    t_start = dt.datetime.now()
    most_similar = {}
    c = 1
    nodes = [_.split(' ', maxsplit=1)[0] for _ in viable_lines]
    # viable_lines = []

    if strategy == 'annoy' and ANNOY_NOT_FOUND:
        warnings.warn(
            'Chosen strategy = \'annoy\', but the module is not installed. Falling back to basic.')
        strategy = 'basic'
    if strategy == 'ngt' and NGT_NOT_FOUND:
        warnings.warn(
            'Chosen strategy = \'NGT\', but the module is not installed. Falling back to basic.')
        strategy = 'basic'
    if strategy == 'faiss' and FAISS_NOT_FOUND:
        warnings.warn(
            'Chosen strategy = \'faiss\', but the module is not installed. Falling back to basic.')
        strategy = 'basic'

    if strategy == 'basic':
        model = models.KeyedVectors.load_word2vec_format(model_file, unicode_errors='ignore')
        for n in tqdm(nodes):
            ms = model.most_similar(str(n), topn=n_top)
            mm = [item[0] for item in ms]
            idx = int(n.split('__')[1])
            if idx < n_items:
                candidates = [_ for _ in mm if int(_.split('__')[1]) >= n_items]
            else:
                candidates = [_ for _ in mm if int(_.split('__')[1]) < n_items]
            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            c += 1
        print('')
    elif strategy == 'annoy':
        assert num_trees is not None
        assert type(num_trees) == int
        assert num_trees > 0
        print('Using ANNOY indexing.')
        model = models.KeyedVectors.load_word2vec_format(model_file, unicode_errors='ignore')
        annoy_index = AnnoyIndexer(model, num_trees=num_trees)
        for n in tqdm(nodes):
            ms = model.most_similar(str(n), topn=n_top, indexer=annoy_index)
            mm = [item[0] for item in ms]
            idx = int(n.split('__')[1])
            if idx < n_items:
                candidates = [_ for _ in mm if int(_.split('__')[1]) >= n_items]
            else:
                candidates = [_ for _ in mm if int(_.split('__')[1]) < n_items]
            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.format(
                c / len(nodes) * 100, c, len(nodes)), end='')
            c += 1
        print('')
    elif strategy == 'lsh':
        print('Using DeepER LSH blocking.')
        blocking_candidates = execute_blocking(model_file)
        model = models.KeyedVectors.load_word2vec_format(model_file, unicode_errors='ignore')
        for n in blocking_candidates:
            ms = []
            bucket = blocking_candidates[n]
            for cand in bucket:
                ms.append((cand, model.similarity(n, cand)))
            ms.sort(key=itemgetter(1), reverse=True)
            mm = [item[0] for item in ms]
            idx = int(n.split('_')[1])
            if idx < n_items:
                candidates = [_ for _ in mm if idx >= n_items]
            else:
                candidates = [_ for _ in mm if idx < n_items]
            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.format(
                c / len(nodes) * 100, c, len(nodes)), end='')
            c += 1
        print('')
    elif strategy == 'ngt':
        assert epsilon is not None
        assert type(epsilon) == float
        assert 0 <= epsilon <= 1
        print('Using NGT indexing.')
        ngt_index_path = 'pipeline/dump/ngt_index.nn'
        words = []
        with open(model_file, 'r') as fp:
            n, dim = map(int, fp.readline().split())
            ngtpy.create(ngt_index_path, dim, distance_type='Cosine')
            index = ngtpy.Index(ngt_index_path)
            for idx, line in enumerate(fp):
                k, v = line.rstrip().split(' ', maxsplit=1)
                vector = list(map(float, v.split(' ')))
                index.insert(vector)  # insert objects
                words.append(k)
        index.build_index()
        index.save()
        most_similar = {}
        for n in tqdm(nodes):
            query = index.get_object(words.index(n))
            ms = index.search(query, size=n_top, epsilon=epsilon)  # approximate nearest neighbor search
            mm = [item[0] for item in ms[1:]]
            mm = list(map(words.__getitem__, mm))
            idx = int(n.split('_')[1])
            if idx < n_items:
                candidates = [_ for _ in mm if idx >= n_items]
            else:
                candidates = [_ for _ in mm if idx < n_items]
            candidates = candidates[:n_candidates]
            most_similar[n] = candidates
            print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.format(
                c / len(nodes) * 100, c, len(nodes)), end='')
            c += 1
        print('')
    elif strategy == 'faiss':
        print('Using faiss indexing.')
        # ngt_index_path = 'pipeline/dump/ngt_index.nn'
        words = []
        with open(model_file, 'r') as fp:
            n, dim = map(int, fp.readline().split())
            mat = []
            index = faiss.IndexFlatL2(dim)
            for idx, line in enumerate(fp):
                k, v = line.rstrip().split(' ', maxsplit=1)
                vector = np.array(list(map(float, v.split(' '))), ndmin=1).astype('float32')
                mat.append(vector)
                words.append(k)
            mat = np.array(mat)
            index.add(mat)
        most_similar = {}
        D, I = index.search(mat, k=n_top + 1)
        # D, I = index.search(query, size=n_top, epsilon=epsilon)
        # mm = [item[0] for item in ms[1:]]
        # mm = list(map(words.__getitem__, mm))
        for n in tqdm(nodes):
            idx = int(n.split('__')[1])
            mm = I[idx]
            if idx < n_items:
                candidates = [_ for _ in mm if idx >= n_items]
            else:
                candidates = [_ for _ in mm if idx < n_items]
            candidates = candidates[:n_candidates]
            most_similar[n] = ['idx__{}'.format(_) for _ in candidates]
            # print('\rBuilding similarity structure: {:0.1f} - {}/{} tuples'.format(c / len(nodes) * 100, c, len(nodes)),
            #       end='')
            c += 1
        print('')
    else:
        raise ValueError('Unknown strategy {0}'.format(strategy))

    t_end = dt.datetime.now()
    diff = t_end - t_start
    print('Time required to build sim struct: {}'.format(diff.total_seconds()))
    pickle.dump(most_similar, open('most_similar.pickle', 'wb'))
    return most_similar
corpus = os.path.join(corpus_name)
a_file = os.path.join(corpus, "preprocessed_a.txt")
a_embedding_file = os.path.join("embeddings_a.pkl")
model_path = os.path.join("best-model.pt")

answer_embedding_dict = pickle.load(open(a_embedding_file, "rb+"))
dataset = CornellMovieDialogsDataset(
    question_text_filepath='cornell movie-dialogs corpus/preprocessed_q_train.txt',
    answer_embedding_dict=answer_embedding_dict,
    maxlen=30)
tokenizer = dataset.tokenizer
max_len = dataset.maxlen

ngtpy.create(b"answers_index", 768)
index = ngtpy.Index(b"answers_index")
all_answer_embeddings = list(answer_embedding_dict.values())
index.batch_insert(all_answer_embeddings)
index.save()


def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?,])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?,\']+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s


answers = open(a_file, "r").readlines()
if tokens[0].startswith('idx_'):
    fow.write(tokens[0] + '\n')
    fov.write('{0}\n'.format('\t'.join(tokens[1:])))


if __name__ == '__main__':
    emb_path = 'pipeline/embeddings/movies-ER-golden.emb'
    # prepare_files(emb_path)
    index_path = 'index.anng'
    with open('objects.tsv', 'r') as fin:
        n, dim = map(int, fin.readline().split())
        ngtpy.create(index_path, dim, distance_type='Cosine')  # create an empty index
        index = ngtpy.Index(index_path)  # open the index
        for line in fin:
            object = list(map(float, line.rstrip().split('\t')))
            index.insert(object)  # insert objects
    index.build_index()  # build the index
    index.save()  # save the index

    with open('words.tsv', 'r') as fin:
        words = list(map(lambda x: x.rstrip('\n'), fin.readlines()))

    index = ngtpy.Index('index.anng')  # open the index
    query_id = 31
    query_object = index.get_object(query_id)  # get the object

    result = index.search(query_object, epsilon=0.10)  # approximate nearest neighbor search
import ngtpy
import random

dim = 10
objects = []
for i in range(0, 100):
    vector = random.sample(range(100), dim)
    objects.append(vector)

query = objects[0]

ngtpy.create(b"tmp", dim)
index = ngtpy.Index(b"tmp")
index.batch_insert(objects)
index.save()

result = index.search(query, 3)

for i, o in enumerate(result):
    print(str(i) + ": " + str(o[0]) + ", " + str(o[1]))
    object = index.get_object(o[0])
    print(object)
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join(corpus_name)
q_file = os.path.join(corpus, "preprocessed_q.txt")
qa_file = os.path.join(corpus, "preprocessed_qa.txt")
q_embedding_file = os.path.join("embeddings_q.pkl")
qa_embedding_file = os.path.join("embeddings_qa.pkl")

bc = BertClient(check_length=False)

f = open(q_embedding_file, "rb+")
all_question_embedding = list((pickle.load(f)).values())

f = open(qa_embedding_file, "rb+")
all_question_answer_embedding = list((pickle.load(f)).values())

ngtpy.create(b"questions_index", len(all_question_embedding[0]))
index = ngtpy.Index(b"questions_index")
index.batch_insert(all_question_embedding)
index.save()


def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?,])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?,\']+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s


QA = open(qa_file, "r").readlines()

while (True):
import csv
import ngtpy

# create an index framework in the filesystem.
ngtpy.create(path=b'index', dimension=128, distance_type="L2")

# load objects.
objects = []
with open(b'../../data/sift-dataset-5k.tsv', 'r') as fp:
    for object in csv.reader(fp, delimiter='\t'):
        objects.append(object[0:128])

# open index.
index = ngtpy.Index(b'index')

# insert the objects.
index.batch_insert(objects)

# save the index.
index.save()

# close the index.
index.close()

# open the index.
index = ngtpy.Index(b'index')

# load query data.
with open(b'../../data/sift-query-3.tsv', 'r') as fp:
    query = list(csv.reader(fp, delimiter='\t'))
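# The snippet above stops after loading the queries. A hedged sketch of the
# search step that would typically follow; the result formatting and k=3 are
# illustrative assumptions, not part of the original snippet.
for i, q in enumerate(query):
    results = index.search(q, size=3)  # approximate k-NN search: list of (object_id, distance)
    print('Query {}'.format(i))
    for rank, (object_id, distance) in enumerate(results):
        print('  {}: id={}, distance={}'.format(rank, object_id, distance))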
path[i] = re.sub(r"\)", "", path[i])
path[i] = re.sub('[0-9]', '', path[i])
path[i] = re.sub(':', '', path[i])
path[i] = path[i].lower()
path = " ||| ".join(path[1:])
embedding = bc.encode([path])
all_embeddings.append(embedding.tolist())

with open("full_path_embeddings.pkl", "wb") as write_file:
    pickle.dump(all_embeddings, write_file)

all_embeddings = pickle.load(open("full_path_embeddings.pkl", "rb"))
print(len(all_embeddings[0][0]))

ngtpy.create(b"event_index", len(all_embeddings[0][0]))
index = ngtpy.Index(b"event_index")
for i in range(len(all_embeddings)):
    all_embeddings[i] = all_embeddings[i][0]
index.batch_insert(all_embeddings)
index.save()

all_event_mappings = open("all_event_mappings_bert_base_uncased.txt", "w")
all_events_ntsb = open("all_events.txt").readlines()

for event in all_events_ntsb:
    event_embedding = bc.encode([event.lower()])
    result = index.search(event_embedding, 1)
    for i, o in enumerate(result):
        event_mapping = " --> ".join(paths_to_leaves[int(str(o[0]))])
        all_event_mappings.write(event + "===" + event_mapping + '\n')

event_path_mappings_non_leaf = dict()