def _set_mips_index(self):
    """
    Create a Faiss Flat index with inner product as the metric
    to search against
    """
    try:
        import faiss
    except ImportError:
        raise Exception("Error: Please install faiss to use FaissMIPSIndex")

    if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
        print("\n> Building index", flush=True)

    cpu_index = faiss.IndexFlatIP(self.embed_size)

    if self.use_gpu:
        # create resources and config for GpuIndex
        config = faiss.GpuMultipleClonerOptions()
        config.shard = True
        config.useFloat16 = True

        gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config)
        self.mips_index = faiss.IndexIDMap(gpu_index)
        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print(">> Initialized index on GPU", flush=True)
    else:
        # CPU index supports IDs so wrap with IDMap
        self.mips_index = faiss.IndexIDMap(cpu_index)
        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print(">> Initialized index on CPU", flush=True)

    # if we were constructed with a BlockData, then automatically load it
    # when the FAISS structure is built
    if self.embed_data is not None:
        self.add_embed_data(self.embed_data)
def build_index(self):
    if self.text_embedding_path.endswith(".gz"):
        f = gzip.open(self.text_embedding_path, mode='rt')
    else:
        f = open(self.text_embedding_path)

    ids = []
    vectors = []
    for line in f:
        vals = line.split('\t')
        if vals[0].startswith('Q'):
            qnode = vals[0]
            # use the number part of Qnodes as id
            id = int(qnode[1:])
            if vals[1] == 'embedding_sentence':
                self.qnode_to_sentence_dict[qnode] = vals[2]
            if vals[1] == 'text_embedding':
                x = vals[2].strip().split(',')
                x = [np.float32(r) for r in x]
                self.qnode_to_vector_dict[qnode] = np.array([x])
                ids.append(id)
                vectors.append(x)

    index = faiss.IndexFlatL2(len(x))
    if self.index is None:
        self.index = faiss.IndexIDMap(index)
    self.index.add_with_ids(np.array(vectors), np.array(ids))
def save_faiss_index(self):
    try:
        diary_cover_pic_face_vec_fd = open("./diary_cover_pic_face_vec.txt", "r")
        xb, ids = [], []
        for line in diary_cover_pic_face_vec_fd.readlines():
            line_term_list = line.split("\t")
            diary_id = line_term_list[0]
            face_feature = json.loads(line_term_list[1])
            face_feature_vec = np.array(face_feature)
            xb.append(face_feature_vec)
            ids.append(diary_id)

        xb_np = np.array(xb).astype('float32')
        ids_np = np.array(ids).astype('int')
        index = faiss.IndexHNSWFlat(128, 32)
        index = faiss.IndexIDMap(index)
        index.add_with_ids(xb_np, ids_np)
        faiss.write_index(index, settings.INDEX_PATH)
        diary_cover_pic_face_vec_fd.close()
    except:
        logging.error("catch exception, err_msg:%s" % traceback.format_exc())
def load_from_db(index_file, version_id):
    global feature_api

    log.info('load_from_db')
    VECTOR_SIZE = 2048

    if index_file is None:
        log.debug('Create a new index file')
        index = faiss.IndexFlatL2(VECTOR_SIZE)
        index2 = faiss.IndexIDMap(index)
    else:
        log.debug('Load from index file')
        index2 = faiss.read_index(index_file)

    offset = 0
    limit = 100
    id_num = 1
    file = os.path.join(os.getcwd(), INDEX_FILE)
    i = 0
    try:
        while True:
            queue_size = rconn.llen(REDIS_OBJECT_INDEX_QUEUE)
            if queue_size != 0:
                time.sleep(60)
                continue

            res = feature_api.get_features(offset=offset, limit=limit)
            if len(res) == 0:
                save_index_file(file)
                time.sleep(60 * 60)
                continue

            objects = []
            for obj in res:
                feature = np.fromstring(obj['vector'], dtype=np.float32)
                xb = np.expand_dims(np.array(feature, dtype=np.float32), axis=0)
                id_array = []
                id_array.append(id_num)
                id_set = np.array(id_array)
                index2.add_with_ids(xb, id_set)

                new_obj = {}
                new_obj['object_id'] = obj['object_id']
                new_obj['index'] = id_num
                objects.append(new_obj)
                id_num = id_num + 1

            save_objects_to_db(objects)
            faiss.write_index(index2, file)
            if i % 100 == 0:
                save_index_file(file)
            offset = offset + limit
            i = i + 1
    except Exception as e:
        log.error(str(e))
def __init__(self, epsilon_b: float, epsilon_n: float, lam: int, beta: float,
             alpha: float, max_age: int, r0: float, dimensions: int = 2,
             random_state: int = 42) -> None:
    self.graph = Graph()
    self.epsilon_b = epsilon_b
    self.epsilon_n = epsilon_n
    self.lam = lam
    self.beta = beta
    self.alpha = alpha
    self.max_age = max_age
    self.dimensions = dimensions
    self.r0 = r0
    self.index = faiss.IndexIDMap(faiss.IndexFlatL2(dimensions))
    self.next_id = 2
    self.point_to_cluster = {}
    self.cycle = 0
    self.step = 1

    np.random.seed(random_state)
    node_1 = Node(np.random.rand(1, dimensions).astype('float32')[0],
                  0, id=0, error_cycle=0, radius=r0)
    node_2 = Node(np.random.rand(1, dimensions).astype('float32')[0],
                  0, id=1, error_cycle=0, radius=r0)
    self.graph.insert_node(node_1)
    self.graph.insert_node(node_2)
    self.index.add_with_ids(
        np.array([node_1.protype, node_2.protype]), np.array([0, 1]))
def buildindex(self):
    try:
        if self.em is None:
            print("No imported encoded text database.")
            dec = input(
                "Would you like to encode? (it may take ~ 1 hour)\n (y/n): ")
            if dec.lower()[0] == 'y':
                self.encoder.max_seq_length = 512
                self.em = self.encoder.encode(
                    self.df[self.target].to_list(), show_progress_bar=True)
                self.em = np.array([emi for emi in self.em]).astype("float32")
                self.vecdim = self.em.shape[1]
            else:
                path = input("Enter the path to encoded text base: ")
                self.importencoded(path)

        # self.index = faiss.IndexFlatL2(self.vecdim)
        self.index = faiss.IndexFlatIP(self.vecdim)
        self.index = faiss.IndexIDMap(self.index)
        self.normalizeencoded()
        self.index.add_with_ids(self.em, self.df.id.values)
        print("FAISS index was built successfully")
        print("Number of articles:", self.index.ntotal)
    except:
        print("ERROR: CANNOT build index")
def __init__(self, target, nprobe=128, num_gpu=None, index_factory_str=None,
             verbose=False, mode='proxy', using_gpu=True):
    self._res_list = []

    found_gpu = len(os.environ['CUDA_VISIBLE_DEVICES'].split(","))
    if found_gpu == 0:
        raise RuntimeError("No GPU found. Please export CUDA_VISIBLE_DEVICES")
    if num_gpu is None or num_gpu > found_gpu:
        num_gpu = found_gpu
    print('[faiss gpu] #GPU: {}'.format(num_gpu))

    size, dim = target.shape
    assert size > 0, "size: {}".format(size)
    index_factory_str = "IVF{},PQ{}".format(
        min(8192, 16 * round(np.sqrt(size))),
        32) if index_factory_str is None else index_factory_str
    cpu_index = faiss.index_factory(dim, index_factory_str)
    cpu_index.nprobe = nprobe

    if mode == 'proxy':
        co = faiss.GpuClonerOptions()
        co.useFloat16 = True
        co.usePrecomputed = False

        index = faiss.IndexProxy()
        for i in range(num_gpu):
            res = faiss.StandardGpuResources()
            self._res_list.append(res)
            sub_index = faiss.index_cpu_to_gpu(
                res, i, cpu_index, co) if using_gpu else cpu_index
            index.addIndex(sub_index)
    elif mode == 'shard':
        raise NotImplementedError
    else:
        raise KeyError("Unknown index mode")

    index = faiss.IndexIDMap(index)
    index.verbose = verbose

    # get nlist to decide how many samples used for training
    nlist = int([
        item for item in index_factory_str.split(",") if 'IVF' in item
    ][0].replace("IVF", ""))

    # training
    if not index.is_trained:
        indexes_sample_for_train = np.random.randint(0, size, nlist * 256)
        index.train(target[indexes_sample_for_train])

    # add with ids
    target_ids = np.arange(0, size)
    index.add_with_ids(target, target_ids)
    self.index = index
def ivf_searsh_2():
    '''
    Custom IVF index with external ids
    :return:
    '''
    nlist = 4  # number of cells the vectors are partitioned into; more cells means slower partitioning
    index = faiss.IndexFlatL2(d)
    iv_index = faiss.IndexIVFFlat(index, d, nlist, faiss.METRIC_L2)
    indexIdMap = faiss.IndexIDMap(iv_index)
    idx = [int(str(int(time.time() * 1000)) + str(i)) for i in range(nb)]
    # print(xb)
    # print(xb.size)
    # print(np.array(idx))
    if not indexIdMap.is_trained:
        indexIdMap.train(xb)
    indexIdMap.add_with_ids(xb, np.array(idx))  # ids must be integers (long int)
    # nprobe must be set on the underlying IVF index; setting it on the IndexIDMap
    # wrapper has no effect on the search.
    # nprobe = number of cells visited at search time; more cells is more accurate but
    # slower, i.e. of the 4 inverted-list partitions only 3 are scanned here.
    iv_index.nprobe = 3
    start = time.time()
    D, I = indexIdMap.search(xq, K)
    print('ivf_search_time', time.time() - start)
    print('ivf_search nearest ids', I)
    print('ivf_search distances', D)
def _build_faiss_model(self):
    sample = next(self._descriptor_set.iterdescriptors())
    sample_v = sample.vector()
    n, d = self.count(), sample_v.size

    data = np.empty((n, d), dtype=np.float32)
    elements_to_matrix(
        self._descriptor_set, mat=data,
        use_multiprocessing=self.use_multiprocessing,
        report_interval=1.0,
    )
    self._uuids = np.array(list(self._descriptor_set.keys()))
    self.faiss_flat = faiss.IndexFlatL2(d)

    if self.exhaustive:
        self._faiss_index = faiss.IndexIDMap(self.faiss_flat)
    else:
        nlist = 10000
        self._faiss_index = faiss.IndexIVFFlat(
            self.faiss_flat, d, nlist, faiss.METRIC_L2)
        self._faiss_index.train(data)
        self._faiss_index.nprobe = 5000

    self._log.info("data shape, type: %s, %s", data.shape, data.dtype)
    self._log.info("uuid shape, type: %s, %s",
                   self._uuids.shape, self._uuids.dtype)
    self._faiss_index.add_with_ids(data, self._uuids)
    self._log.info("FAISS index has been constructed with %d vectors",
                   self._faiss_index.ntotal)
def init_sentence_index(self):
    self.logger.info("Initializing sentence index")
    empty_embedding = np.array([self.embedder.encode("Vole")]).astype("float32")
    self.sentences_list = []
    print(empty_embedding.shape[1])
    self.sentence_index = faiss.IndexFlatL2(empty_embedding.shape[1])
    self.sentence_index = faiss.IndexIDMap(self.sentence_index)
def faiss_flat_ip(encoded_data):
    """Build an exact inner-product FAISS index over L2-normalized vectors."""
    dim = encoded_data.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
    faiss.normalize_L2(encoded_data)
    index.add_with_ids(encoded_data, np.arange(len(encoded_data)))
    return index
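A minimal usage sketch for the helper above, assuming faiss and numpy are available; the random arrays stand in for real embeddings and are not from the source. Because the vectors are L2-normalized before being added to the IndexFlatIP, the scores returned by search are cosine similarities, provided the query is normalized the same way.

import numpy as np
import faiss

# stand-in for real embeddings (e.g. 384-d sentence vectors)
encoded_data = np.random.rand(1000, 384).astype('float32')
index = faiss_flat_ip(encoded_data)

query = np.random.rand(1, 384).astype('float32')
faiss.normalize_L2(query)              # queries must be normalized the same way
scores, ids = index.search(query, 5)   # cosine similarities and the ids passed to add_with_ids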
def load_from_queue(index_file):
    log.info('load_from_queue')
    VECTOR_SIZE = 2048

    if index_file is None:
        log.debug('Create a new index file')
        index = faiss.IndexFlatL2(VECTOR_SIZE)
        index2 = faiss.IndexIDMap(index)
    else:
        log.debug('Load from index file')
        index2 = faiss.read_index(index_file)

    def items():
        while True:
            yield rconn.blpop([REDIS_OBJECT_FEATURE_QUEUE])

    def request_stop(signum, frame):
        log.info('stopping')
        rconn.connection_pool.disconnect()
        log.info('connection closed')
        sys.exit()

    signal.signal(signal.SIGINT, request_stop)
    signal.signal(signal.SIGTERM, request_stop)

    i = 0
    for item in items():
        key, obj_data = item
        obj = pickle.loads(obj_data)
        # log.debug(obj)
        feature = obj['feature']
        xb = np.expand_dims(np.array(feature, dtype=np.float32), axis=0)
        obj['feature'] = None
        rconn.rpush(REDIS_OBJECT_LIST, obj['name'])
        d = pickle.dumps(obj)
        rconn.hset(REDIS_OBJECT_HASH, obj['name'], obj['product_id'])
        # xb = np.array(features)
        id_num = rconn.llen(REDIS_OBJECT_LIST)
        # log.debug(id_num)
        id_array = []
        id_array.append(id_num)
        id_set = np.array(id_array)
        # print(xb)
        # print(np.shape(xb))
        # print(xb.shape)
        # print(id_set.shape)
        # print(id_set)
        start_time = time.time()
        index2.add_with_ids(xb, id_set)
        elapsed_time = time.time() - start_time
        log.info('indexing time: ' + str(elapsed_time))

        file = os.path.join(os.getcwd(), INDEX_FILE)
        if i % 50 == 0:
            faiss.write_index(index2, file)
            save_index_file(file)
        i = i + 1

    log.info('index done')
def test_IDMap(self):
    sub_index = faiss.IndexFlatL2(d)
    index = faiss.IndexIDMap(sub_index)
    index.add_with_ids(xb, np.arange(len(xb)))

    del sub_index
    gc.collect()
    index.add_with_ids(xb, np.arange(len(xb)))
def get_persona_faiss_selected(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = get_dataset_with_no_tokenizer(
        tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    persona_faiss_selected = []
    history_faiss_selected = []
    persona_faiss_index = []
    history_faiss_index = []
    persona_complete = parse_data('./Dataset/train_self_original.txt')
    persona_complete = persona_complete[:20]

    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            # persona = dialog["personality"].copy()
            persona = dialog["persona_info"]
            # persona2 = dialog["persona_info2"].copy()
            # persona_selected = faiss(replyanddialog)
            # index: all persona1 sentences or all personalities
            # model1 = SentenceTransformer('bert-large-nli-mean-tokens')
            # model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased')
            model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
            embeddings_persona = model.encode(persona_complete,
                                              show_progress_bar=True)
            # data_train list of set of list of all personalities (not duplicated)
            # Step 1: Change data type
            # embeddings_persona = np.array([embedding for embedding in embeddings_persona]).astype("float32")
            # Step 2: Instantiate the index
            index = faiss.IndexFlatL2(embeddings_persona.shape[1])
            # Step 3: Pass the index to IndexIDMap
            index = faiss.IndexIDMap(index)
            # Step 4: Add vectors and their IDs
            index.add_with_ids(
                embeddings_persona,
                np.array(list(range(0, embeddings_persona.shape[0]))))

            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2 * args.max_history + 1):]
                    for j, candidate in enumerate(
                            utterance["candidates"][-num_candidates:]):
                        history_encoded = model.encode(history,
                                                       show_progress_bar=True)
                        D, I = index.search(np.array(history_encoded), k=5)
                        history_faiss_selected.append(history)
                        persona_faiss_selected.append(persona_complete[I[0][1]])
                # persona = [persona[-1]] + persona[:-1]  # permuted personalities
    return persona_faiss_selected
def fit(self, item_matrix, ids):
    num, vec_dim = item_matrix.shape
    # build the index, using Euclidean (L2) distance as the metric
    self.faiss_index = faiss.IndexFlatL2(vec_dim)
    # wrap it so that custom integer ids can be attached
    self.faiss_index1 = faiss.IndexIDMap(self.faiss_index)
    # add the data
    self.faiss_index1.add_with_ids(item_matrix, ids)
def __init__(self, batch_size, name, data_location, read_from_file,
             path=FAISS_PATH, model='document_embeddings', pooling='mean',
             index_size=INDEX_SIZE, index_start=0, index_number=0,
             fail_mode=0, fail_size=0, previous_time=0., failed_list=None):
    # make all arguments class fields
    if not failed_list:
        self.failed_list = []
    else:
        with open(failed_list, 'r') as f:
            self.failed_list = [tuple(map(int, line.split(' '))) for line in f]
    self.model = model
    self.pooling = pooling
    self.fail_size = fail_size
    self.fail_mode = fail_mode
    self.index_number = index_number
    self.index_start = index_start
    # index_start is still needed, which is why it is not increased
    self.index_position = index_start
    self.index_size = index_size
    self.path = path
    self.read_from_file = read_from_file
    if batch_size:
        self.batch_size = batch_size
    self.name = name
    self.data_location = data_location
    self.previous_time = previous_time
    if not self.path.endswith(f'/{name}'):
        self.path += f'/{name}'

    # if index_number is 0, it was started from the user, not from the script itself.
    if self.index_number == 0 and not self.fail_mode:
        try:
            os.mkdir(self.path)
        except FileExistsError:
            print('directory already exists and I am just deleting it.')
            shutil.rmtree(self.path)
            os.mkdir(self.path)

        # write basic information into the index information file
        self.write_index_information(
            f'--- Index Information for Testcase {self.name} ---')
        self.write_index_information(f'Dataset: {self.data_location}')
        self.write_index_information(f'Model: {self.model}')
        self.write_index_information(f'Batch Size : {self.batch_size}')

    # read in the dataset. The dataset is always read completely and spliced afterwards
    self.document_pairs = get_dataset(data_location, read_from_file,
                                      write=f'{name}_dataset')

    # creation of the index and the ID map which wraps it.
    # In fail mode, pick up from the previously written index instead.
    if self.fail_mode:
        self.id_index = faiss.read_index(f'{path}/{name}_{index_number}')
    else:
        if not self.model == 'distiluse-base-multilingual-cased':
            self.index = faiss.IndexFlatIP(768)  # metric: inner product
        else:
            self.index = faiss.IndexFlatIP(512)  # metric: inner product
        self.id_index = faiss.IndexIDMap(self.index)
def build_index(self): """:returns an inverted index for the search documents""" vectors = [self.encode(document) for document in self.documents] index = faiss.IndexIDMap( faiss.IndexFlatIP(768)) # dimensionality of vector space # Add document vectors into index after transforming into numpy arrays. IDs should match len(documents) index.add_with_ids(np.array([vec.numpy() for vec in vectors]), np.array(range(0, len(self.documents)))) return index
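A hedged sketch of how the index returned by build_index above might be queried. It assumes self.encode also yields a 768-dimensional torch tensor for a query string; the search_documents name and the top_k parameter are illustrative, not taken from the source.

def search_documents(self, index, query, top_k=5):
    # Encode the query with the same encoder used for the documents (assumed 768-d torch tensor).
    query_vec = np.array([self.encode(query).numpy()]).astype('float32')
    # IndexIDMap.search returns inner-product scores and the ids assigned in build_index;
    # -1 marks empty result slots when fewer than top_k vectors are indexed.
    scores, ids = index.search(query_vec, top_k)
    return [self.documents[i] for i in ids[0] if i != -1]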
def __init__(self, index_file_path=None, id_dict_path=None, dim=128,
             index_types='Flat'):
    if index_file_path and id_dict_path:
        default_logger.info('loading index from %s' % index_file_path)
        self.index = faiss.read_index(index_file_path, 0)
        self.id2key = pickle.load(open(id_dict_path, 'rb'))
    else:
        self.index = faiss.index_factory(dim, index_types)
        self.index = faiss.IndexIDMap(self.index)
        self.id2key = {}
    default_logger.info('index inited, is_trained=%s' % (self.index.is_trained))
def __build_index(self, index_dimension):
    if self.index_type is IndexType.L2_INDEX:
        log.debug("Building L2 index")
        index = faiss.IndexFlatL2(index_dimension)
    elif self.index_type is IndexType.COSINE_INDEX:
        log.debug("Building cosine index")
        index = faiss.IndexFlatIP(index_dimension)
    else:
        raise ValueError(f"Unknown index type {self.index_type}")
    self.__index = faiss.IndexIDMap(index)
def make_faiss_index_idmap(self, n_dimensions):
    """
    Make a fairly general-purpose FAISS index

    :param n_dimensions:
    :return:
    """
    print("Making index ...")
    tmp_index = faiss.IndexFlatL2(n_dimensions)
    index = faiss.IndexIDMap(tmp_index)
    return index
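A short sketch of how an index produced by the factory above might be used, under the assumption that the caller supplies arbitrary 64-bit integer ids (for example database keys), which is the main reason to wrap the flat index in an IndexIDMap. The array names below are illustrative.

import numpy as np
import faiss

n_dimensions = 128
index = faiss.IndexIDMap(faiss.IndexFlatL2(n_dimensions))  # same structure the method returns

vectors = np.random.rand(10, n_dimensions).astype('float32')
db_ids = np.arange(1000, 1010, dtype='int64')  # arbitrary external ids, e.g. database keys
index.add_with_ids(vectors, db_ids)

distances, ids = index.search(vectors[:1], 3)  # returns the external ids, not row positions

# unlike a bare IndexFlatL2, the IDMap wrapper also supports removal by id
index.remove_ids(np.array([1003], dtype='int64'))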
def create_naas_faiss_index(self):
    intent_df = pd.read_pickle('data/awesome-notebooks.pkl').reset_index()
    db_ids = intent_df["intent_id"].values
    for prefix, dimension in zip(['tf', 'st'], [512, 384]):
        db_vectors = np.stack(
            intent_df[f"{prefix}_embedding"].values).astype(np.float32)
        faiss.normalize_L2(db_vectors)
        intent_index = faiss.IndexIDMap(faiss.IndexFlatIP(dimension))
        intent_index.add_with_ids(db_vectors, db_ids)
        faiss.write_index(intent_index,
                          f"data/{prefix}_naas_intent_index.idx")
def test_shards(self):
    k = 32
    ref_index = faiss.IndexFlatL2(d)

    print('ref search')
    ref_index.add(xb)
    _Dref, Iref = ref_index.search(xq, k)
    print(Iref[:5, :6])

    shard_index = faiss.IndexShards(d)
    shard_index_2 = faiss.IndexShards(d, True, False)

    ni = 3
    for i in range(ni):
        i0 = int(i * nb / ni)
        i1 = int((i + 1) * nb / ni)
        index = faiss.IndexFlatL2(d)
        index.add(xb[i0:i1])
        shard_index.add_shard(index)

        index_2 = faiss.IndexFlatL2(d)
        irm = faiss.IndexIDMap(index_2)
        shard_index_2.add_shard(irm)

    # test parallel add
    shard_index_2.verbose = True
    shard_index_2.add(xb)

    for test_no in range(3):
        with_threads = test_no == 1

        print('shard search test_no = %d' % test_no)
        if with_threads:
            remember_nt = faiss.omp_get_max_threads()
            faiss.omp_set_num_threads(1)
            shard_index.threaded = True
        else:
            shard_index.threaded = False

        if test_no != 2:
            _D, I = shard_index.search(xq, k)
        else:
            _D, I = shard_index_2.search(xq, k)
        print(I[:5, :6])

        if with_threads:
            faiss.omp_set_num_threads(remember_nt)

        ndiff = (I != Iref).sum()
        print('%d / %d differences' % (ndiff, nq * k))
        assert (ndiff < nq * k / 1000.)
def __init__(self, index_name: str, embeddin_dim: int, embedding_type: str):
    self.index_name = index_name
    self.last_id = 0
    self.index_map = dict()
    self.encoder = encoder
    self.embeddin_dim = embeddin_dim
    self.embeddin_type = embedding_type
    self.index: faiss.IndexIDMap = faiss.IndexIDMap(
        faiss.IndexFlatIP(embeddin_dim))
def build_index(self, model, texts, embedding_dim, model_type, batch_size,
                from_saved=None):
    if from_saved is not None:
        with open(from_saved, 'rb') as f:
            vectors = pickle.load(f)
    else:
        vectors = calculate_embeddings(model, texts, embedding_dim,
                                       model_type=model_type,
                                       batch_size=batch_size)
    faiss.normalize_L2(vectors)
    index = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_dim))
    index.add_with_ids(vectors, np.array(range(0, vectors.shape[0])))
    return index
def get_results(self):
    embed = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    # Compute sentence embeddings for every text in the documents
    corpus = [d['text'] for d in self.documents]
    corpus_embeddings = np.array(embed.encode(corpus, convert_to_tensor=True))

    index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
    index.add_with_ids(corpus_embeddings, np.array(range(0, len(corpus))))

    # Write index for future usage
    faiss.write_index(index, 'pandemics')

    encoded_query = embed.encode([self.query])
    top_k = index.search(encoded_query, self.k)
    answers = [corpus[_id] for _id in top_k[1].tolist()[0]]
    return answers
def __init__(self, color_cursor):
    self.index = faiss.IndexIDMap(faiss.IndexFlatL2(3 * 6))

    # Query all the cover image palettes
    self.id_to_arr = {
        row[0]: np.array(json.loads(row[1])).flatten()
        for row in color_cursor
    }

    # Build the index
    arr = np.stack(list(self.id_to_arr.values())).astype('float32')
    ids = np.array(list(self.id_to_arr.keys())).astype('int')
    self.index.add_with_ids(arr, ids)
def gen_faiss(s3, ssapp_docs, s3_bucket_name, paper_all, model,
              win_size: int = 3, max_words: int = 100):
    """
    Generate the FAISS index at initialization.

    Args:
        ssapp_docs: bucket
        paper_all: all papers in the database
        model: sentence_bert model
        win_size: sliding window, default is 3
        max_words: max words per segment, default is 100

    Returns:
        the FAISS index (or None if nothing was indexed)
    """
    # remove all the html in
    # gen the faiss indexs
    paper_titles = set([i.title for i in paper_all])
    faiss_indexs = None

    # if the database has data, load it into faiss_indexs
    if paper_all:
        paper_ids = np.array([i.id for i in paper_all])
        paper_embeddings = np.array(
            [i.e1 + i.e2 + i.e3 + i.e4 for i in paper_all]).astype("float32")
        faiss_indexs = faiss.IndexFlatIP(paper_embeddings.shape[1])
        faiss_indexs = faiss.IndexIDMap(faiss_indexs)
        faiss_indexs.add_with_ids(paper_embeddings, paper_ids)

    for file in ssapp_docs.objects.filter(Prefix='docs/'):
        file_key = file.key
        if file_key.split('.')[-1] in cf.ALLOWED_EXTENSIONS:
            # if the file name is not legal, rename the AWS S3 file
            # legal_key has the format: docs/...
            legal_key = file_key.encode('utf-8', 'ignore').decode('utf-8')
            if legal_key != file_key:
                s3.Object(s3_bucket_name, legal_key).copy_from(
                    CopySource=f'{s3_bucket_name}/{file_key}')
                s3.Object(s3_bucket_name, file_key).delete()
            legal_name = legal_key[5:]
            if legal_name not in paper_titles:
                # write to db
                body = BytesIO(file.get()['Body'].read())
                faiss_indexs = write_to_db(legal_name, body, model,
                                           win_size, max_words, faiss_indexs)
                write_to_html(legal_name, body, s3, s3_bucket_name)
    return faiss_indexs
def navicode_init():
    print("\nInitializing model . . .")
    embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')
    cur_dir = os.getcwd()

    python_files = []
    for dirpath, _, files in os.walk(cur_dir):
        for filename in files:
            fname = os.path.join(dirpath, filename)
            if fname.endswith('.py'):
                python_files.append(fname)

    print(f"\nFound {len(python_files)} python sources\n")

    if len(python_files) > 0:
        dirname = os.path.basename(cur_dir)
        navi_dir = os.path.join(cur_dir, ".navi")
        if not os.path.exists(navi_dir):
            os.mkdir(navi_dir)

        corpus = []
        comments_dump = {}
        for i, python_file in enumerate(python_files):
            print(f"[{i+1}/{len(python_files)}] Scanning {python_file}")
            comments = comment_parser(python_file)
            filename = python_file[python_file.index(dirname):]
            for comment in comments:
                comments_dump[len(corpus)] = str(filename) + '---' + str(
                    comment[1]) + "---" + re.sub(r'[^a-zA-Z0-9]+', ' ',
                                                 comment[0])
                corpus.append(re.sub(r'[^a-zA-Z0-9]+', ' ', comment[0]))

        print(f"\nComputing comment embeddings for {len(corpus)} comments . . .")
        corpus_embeddings = embedder.encode(corpus, show_progress_bar=True)

        print("\nIndexing comment embeddings . . .")
        index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
        index.add_with_ids(corpus_embeddings, np.array(range(0, len(corpus))))
        faiss.write_index(index, os.path.join(navi_dir, dirname + '_navi'))

        with open(os.path.join(navi_dir, dirname + '_navi.json'), 'w') as file:
            json.dump(comments_dump, file, indent=4)
def test_int64(self):
    # see https://github.com/facebookresearch/faiss/issues/1529
    v = faiss.Int64Vector()
    for i in range(10):
        v.push_back(i)
    a = faiss.vector_to_array(v)
    assert a.dtype == 'int64'
    np.testing.assert_array_equal(a, np.arange(10, dtype='int64'))

    # check if it works in an IDMap
    idx = faiss.IndexIDMap(faiss.IndexFlatL2(32))
    idx.add_with_ids(
        np.random.rand(10, 32).astype('float32'),
        np.random.randint(1000, size=10, dtype='int64'))
    faiss.vector_to_array(idx.id_map)
def test_stress(self):
    # a mixture of the above, from issue #631
    target = np.random.rand(50, 16).astype('float32')

    index = faiss.IndexReplicas()
    size, dim = target.shape
    num_gpu = 4
    for _i in range(num_gpu):
        config = faiss.GpuIndexFlatConfig()
        config.device = 0   # simulate on a single GPU
        sub_index = faiss.GpuIndexFlatIP(faiss.StandardGpuResources(), dim,
                                         config)
        index.addIndex(sub_index)

    index = faiss.IndexIDMap(index)
    ids = np.arange(size)
    index.add_with_ids(target, ids)