def build_annoy_index(X, path, ntrees=50, build_index_on_disk=True, verbose=1):
    index = AnnoyIndex(X.shape[1], metric='angular')
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
               " may solve the issue, especially on Windows.")
        raise IndexBuildingError(msg)
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
def nn_annoy(ds1, ds2, names1, names2, knn=20, metric='euclidean', n_trees=50, save_on_disk=True):
    """ Assumes that Y is zero-indexed. """
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    if save_on_disk:
        a.on_disk_build('annoy.index')
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((names1[a], names2[b_i]))

    return match
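# A minimal usage sketch for nn_annoy above (not from the original source):
# it assumes numpy is available and that names1/names2 are plain string labels;
# the data and label names here are made-up placeholders.
import numpy as np

ds1 = np.random.rand(5, 16).astype('float32')    # query vectors
ds2 = np.random.rand(8, 16).astype('float32')    # target vectors
names1 = ['q%d' % i for i in range(ds1.shape[0])]
names2 = ['t%d' % i for i in range(ds2.shape[0])]

# Each query row is matched to its `knn` nearest target rows; the result is a
# set of (query_name, target_name) pairs.
matches = nn_annoy(ds1, ds2, names1, names2, knn=3, save_on_disk=False)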
def _make_new_manifold(self, *, embedding_id: int, metric: str = 'euclidean', dim: Optional[int] = None) -> int:
    """Will commit multiple write transactions"""
    if dim is None:
        embedding = self.get_embedding(embedding_id)
        dim = embedding.get_dim()

    self.begin_exclusive_transaction()
    c = self._db.execute('INSERT INTO Manifolds (embedding_id, building, metadata) VALUES (?, ?, ?);',
                         (embedding_id, False, '{}'))
    manifold_id = c.lastrowid
    self.__manifolds_embedding[manifold_id] = embedding_id

    full_fn = mkstemp(dir=self.get_data_dir(), prefix=f'{manifold_id:06d}.', suffix='.annoy', text=False)[1]
    index = AnnoyIndex(dim, metric)
    index.on_disk_build(full_fn)
    self.__manifolds_annoy_index[manifold_id] = index  # must put in cache, otherwise it will try to index.load(fn)

    fn = os.path.relpath(full_fn, self.get_data_dir())
    metadata = dict(
        fn=fn,
        metric=metric,
        utc=str(datetime.utcnow())
    )
    self._db.execute('UPDATE Manifolds SET building = ?, metadata = ? WHERE manifold_id = ?;',
                     (True, json.dumps(metadata), manifold_id))
    self.commit()
    return manifold_id
def on_disk_build_annoy(file_name, trees=1, dim=128):
    vectors = ujson.loads(open(file_name + ".json", "r").read())
    index = AnnoyIndex(dim)
    index.on_disk_build(file_name + ".ann")
    for i in range(len(vectors)):
        index.add_item(i, vectors[i])
    index.build(trees)
    return index
def test_on_disk(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.on_disk_build('on_disk.ann')
    self.add_items(i)
    i.build(10)
    self.check_nns(i)
    i.unload()
    i.load('on_disk.ann')
    self.check_nns(i)
    j = AnnoyIndex(f, 'euclidean')
    j.load('on_disk.ann')
    self.check_nns(j)
def load_data(path_data):
    ids = []
    with open_fn(path_data, 'rb') as f:
        for i, record in enumerate(avro.reader(f)):
            v = record[FACTORS_KEY]
            if i == 0:
                n_dim = len(v)
                ann = AnnoyIndex(n_dim, metric=METRIC)
                ann.on_disk_build(PATH_DISK_SAVE)
            ann.add_item(i, v)
            ids.append(record[ID_KEY])
    return ann, ids
def build_annoy_index(X, path, ntrees=50, build_index_on_disk=True, verbose=True):
    """ Build a standalone Annoy index.

    Parameters
    ----------
    X: np.array with shape (n_samples, n_features)
    path: str or Path
        The filepath of a trained annoy index file saved on disk
    ntrees: int
        The number of random projections trees built by Annoy to approximate KNN.
        The more trees, the higher the memory usage, but the better the accuracy
        of results (default 50)
    build_index_on_disk: bool
        Whether to build the annoy index directly on disk. Building on disk should
        allow for bigger datasets to be indexed, but may cause issues. If None,
        on-disk building will be enabled for Linux, but not Windows due to issues
        on Windows.
    verbose: bool
    """
    verbose = int(verbose)

    index = AnnoyIndex(X.shape[1], metric='angular')
    if build_index_on_disk:
        index.on_disk_build(str(path))

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        raise IndexBuildingError(
            'Error building Annoy Index. Try setting `build_index_on_disk` to False.'
        )
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
def test_on_disk(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.on_disk_build('test.ann')
    i.add_item(0, [2, 2])
    i.add_item(1, [3, 2])
    i.add_item(2, [3, 3])
    i.build(10)
    i.unload()
    i.load('test.ann')
    self.assertEqual(i.get_nns_by_vector([4, 4], 3), [2, 1, 0])
    self.assertEqual(i.get_nns_by_vector([1, 1], 3), [0, 1, 2])
    self.assertEqual(i.get_nns_by_vector([4, 2], 3), [1, 2, 0])
def build_index(matrix, indices, num_trees, metric, index_path, verbose=True):
    total_len = len(indices)
    proj_dim = matrix.shape[1]

    # compute neighbors using annoy
    t0 = time.time()
    index = AnnoyIndex(proj_dim, metric=metric)  # Length of item vector that will be indexed
    index.on_disk_build(index_path)
    for i in range(total_len):
        index.add_item(i, matrix[indices[i], :])
    index.build(num_trees)
    if verbose:
        my_print('time to build ' + str(num_trees) + ' trees = ' + str(time.time() - t0))
    return index
def build_annoy_index(X, path, ntrees=50, verbose=1):
    index = AnnoyIndex(X.shape[1])
    index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    # Build n trees
    index.build(ntrees)
    return index
def build_annoy_index(X, path, ntrees=50, build_index_on_disk=True, metric="euclidean", verbose=1):
    """ Build a standalone annoy index.

    :param array X: numpy array with shape (n_samples, n_features)
    :param str path: The filepath of a trained annoy index file saved on disk.
    :param int ntrees: The number of random projections trees built by Annoy to
        approximate KNN. The more trees, the higher the memory usage, but the
        better the accuracy of results.
    :param bool build_index_on_disk: Whether to build the annoy index directly
        on disk. Building on disk should allow for bigger datasets to be indexed,
        but may cause issues. If None, on-disk building will be enabled for Linux,
        but not Windows due to issues on Windows.
    :param str metric: Which distance metric Annoy should use when building the
        KNN index. Supports "angular", "euclidean", "manhattan", "hamming", or "dot".
    :param int verbose: Controls the volume of logging output the model produces
        when training. When set to 0, silences outputs, when above 0 will print
        outputs.
    """
    index = AnnoyIndex(X.shape[1], metric=metric)
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
               " may solve the issue, especially on Windows.")
        raise IndexBuildingError(msg)
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
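# A minimal, hypothetical invocation of build_annoy_index above; it assumes
# numpy, scipy and annoy are installed and that IndexBuildingError is defined
# in the surrounding module. The file name and data are placeholders.
import numpy as np

X = np.random.rand(1000, 64).astype('float32')
index = build_annoy_index(X, 'example.index', ntrees=50, build_index_on_disk=True, verbose=0)

# The returned index can be queried directly once built.
neighbours = index.get_nns_by_vector(X[0], 10)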
def make_ann(n_dim=N_DIM, n_items=100):
    ids = []
    ann = AnnoyIndex(n_dim, METRIC)
    ann.on_disk_build(PATH_DISK_SAVE)
    for ind in range(n_items):
        v = [random.gauss(0, 1) for _ in range(n_dim)]
        ann.add_item(ind, v)
        ids.append(str(ind))
    ann.build(N_TREES)
    meta_d = {
        'vec_src': Path(__file__).name,
        'metric': METRIC,
        'n_dim': n_dim,
        'timestamp_utc': datetime.utcnow().isoformat(),
    }
    return ids, meta_d
def build_annoy_index(X, path, ntrees=50, verbose=1):
    index = AnnoyIndex(X.shape[1], metric='angular')
    if platform.system() != 'Windows':
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    # Build n trees
    index.build(ntrees)

    if platform.system() == 'Windows':
        index.save(path)

    return index
def compute_and_store_similarity(self):
    start = time.time()
    sessions_VSM, sessions_id = self._driver.session_vectors()
    print("Time to create the vector:", time.time() - start)

    t = AnnoyIndex(sessions_VSM.shape[1], 'angular')
    t.on_disk_build('/tmp/test.ann')

    start = time.time()
    i = 0
    overall_size = sessions_VSM.shape[0]
    for ix in range(overall_size):
        x = sessions_VSM.getrow(ix)
        t.add_item(ix, x.toarray()[0])
        i += 1
        if i % 1000 == 0:
            print(i, "rows processed over", overall_size)
    print("Time to index:", time.time() - start)

    del sessions_VSM
    gc.collect()

    start = time.time()
    t.build(5)  # 5 trees
    print("Time to build:", time.time() - start)

    knn_start = time.time()
    i = 0
    for ix in range(overall_size):
        knn = self.compute_knn(ix, sessions_id, t, 50)
        start = time.time()
        self.store_knn(sessions_id[ix], knn)
        self.__time_to_store.append(time.time() - start)
        i += 1
        if i % 100 == 0:
            print(i, "rows processed over", overall_size)
            print(mean(self.__time_to_query), mean(self.__time_to_knn),
                  mean(self.__time_to_sort), mean(self.__time_to_store))
            self.__time_to_query = []
            self.__time_to_knn = []
            self.__time_to_sort = []
            self.__time_to_store = []
    print("Time to compute knn:", time.time() - knn_start)
def _build_annoy_index(self, annoy_index_path):
    annoy_index = AnnoyIndex(self.encoder.dimension, 'angular')
    if os.path.exists(annoy_index_path):
        print(f"Loading Annoy index from {annoy_index_path}...")
        annoy_index.load(annoy_index_path, prefault=True)
    else:
        print("Building Annoy index...")
        annoy_index.on_disk_build(annoy_index_path)
        for starting_index in tqdm(
                range(0, len(self.target_sentences), _BATCH_SIZE)):
            target_sentences = self.target_sentences[
                starting_index:starting_index + _BATCH_SIZE]
            target_vectors = self.encoder.get_vectors(target_sentences)
            for i, vector in enumerate(target_vectors, start=starting_index):
                annoy_index.add_item(i, vector)
        annoy_index.build(_N_TREES)
    return annoy_index
def reindex_manifold(self, manifold_id: int, metric: str = 'euclidean', n_trees: int = 10) -> int:
    old_index = self._get_annoy_index(manifold_id)

    with self._db:
        self.begin_exclusive_transaction()
        if not self._db.execute('SELECT ready FROM Manifolds WHERE manifold_id = ?',
                                (manifold_id,)).fetchone()[0]:
            raise RuntimeError(f'Could not reindex manifold #{manifold_id} which is not ready.')

        full_fn = mkstemp(dir=self.get_data_dir(), prefix=f'{manifold_id:06d}.', suffix='.annoy', text=False)[1]
        index = AnnoyIndex(old_index.f, metric)
        index.on_disk_build(full_fn)
        self.__manifolds_annoy_index[manifold_id] = index

        fn = os.path.relpath(full_fn, self.get_data_dir())
        metadata = self._get_manifold_metadata(manifold_id)
        metadata.update(dict(
            fn=fn,
            metric=metric,
            utc=str(datetime.utcnow()),
        ))
        self.__logger.debug(f"Created new index on {fn}")
        self._db.execute(
            'UPDATE Manifolds SET building = 1, ready = 0, metadata = ? '
            'WHERE manifold_id = ?;',
            (json.dumps(metadata), manifold_id))
        self.commit()

    self.__logger.info("Copying items from old index...")
    for item_i in range(old_index.get_n_items()):
        index.add_item(item_i, old_index.get_item_vector(item_i))

    return self._build_manifold_index(manifold_id, n_trees=n_trees)
class Annoy(VectorIndex):
    def __init__(self, path, dims=None, metric='angular', build_on_disk=True):
        self.path = path
        self.is_mutable = None
        self.is_built = None
        self.build_on_disk = build_on_disk
        self.metric = metric
        if os.path.isfile(self.path):
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            assert self.dims == dims or not dims, \
                'Passed path to existing index but dims do not match'
            assert self.metric == metric or not metric, \
                'Passed path to existing index but metrics do not match'
            self.index = AnnoyIndex(self.dims, metric=self.metric)
        elif dims:
            logging.debug(
                f'Creating new index with {dims} dimensions and {self.metric} metric'
            )
            self.dims = dims
            self.index = AnnoyIndex(self.dims, metric=self.metric)
            if build_on_disk:
                self.index.on_disk_build(self.path)
        else:
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            self.index = AnnoyIndex(self.dims, metric=self.metric)

    @property
    def meta_path(self):
        return self.path + '.meta.json'

    @property
    def files(self):
        return [self.path, self.meta_path]

    def load_meta(self):
        self.__dict__.update(load_json(self.meta_path))

    def save_meta(self):
        d = {**self.__dict__}
        d.pop('index')
        save_json(d, self.meta_path)

    def build(self, num_trees=10):
        logging.debug(f'starting to build index: {self.path}')
        self.index.build(num_trees)
        logging.debug(f'finished building index: {self.path}')
        self.is_mutable = False
        self.is_built = True
        self.save_meta()

    def save(self):
        self.index.save(self.path)
        self.is_mutable = False
        self.save_meta()

    def load(self, memory=False):
        self.index.load(self.path, prefault=memory)
        self.is_mutable = False

    def unload(self):
        self.index.unload()

    def __del__(self):
        self.unload()

    def __setitem__(self, idx, vector):
        self.index.add_item(idx, vector)

    def __getitem__(self, idx):
        return self.index.get_item_vector(idx)

    def __len__(self):
        return self.index.get_n_items()

    def add(self, vector):
        idx = len(self)
        self[idx] = vector
        return idx

    def add_bulk(self, vectors):
        start = len(self)
        for n, v in enumerate(vectors):
            self[start + n] = v
        return self

    def set_bulk(self, indices, vectors):
        for idx, vector in zip(indices, vectors):
            self[idx] = vector

    def search(self, vector, num=10, depth=None, distances=True):
        return self.index.get_nns_by_vector(vector, num, depth or -1, distances)

    def search_index(self, idx, num=10, depth=None, distances=True):
        return self.index.get_nns_by_item(idx, num, depth or -1, distances)

    def distance(self, i, j):
        return self.index.get_distance(i, j)
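# A short usage sketch for the Annoy wrapper class above, under the assumption
# that VectorIndex, load_json and save_json come from the surrounding project;
# the path and vectors are illustrative only.
idx = Annoy('/tmp/example.ann', dims=64, metric='angular')
first = idx.add([0.0] * 64)                       # add one vector, returns its id
idx.add_bulk([[float(i)] * 64 for i in range(1, 10)])
idx.build(num_trees=10)                           # freezes the index and saves metadata
ids, dists = idx.search([0.5] * 64, num=5)        # nearest ids and their distances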
def vectorize_batch_chunk(lbatch, vector_index_chunk):
    global doc_counter
    doc_idxs = []
    for i in range(lbatch.shape[0]):
        doc_idxs.append(doc_counter)
        doc_counter += 1
    vectors = generate_embeddings(lbatch["text"])
    if len(vectors.shape) >= 2 and vectors.shape[1] > 0:
        for vec, page_num in zip(vectors, doc_idxs):
            vector_index_chunk.add_item(page_num, vec)


vector_index_chunk = AnnoyIndex(vector_dims, 'angular')
vector_index_chunk.on_disk_build(ES_INDEX_CHUNK + "_annoy.bin")

with tqdm(total=total_chunks) as pbar:
    for j, batch in enumerate(
            pd.read_json('nyc_docs-sentences15.json', lines=True, chunksize=batch_size)):
        batch["smallenough"] = batch["text"].apply(lambda x: len(x) < 100000)
        batch = batch[batch["smallenough"]]
        try:
            vectorize_batch_chunk(batch, vector_index_chunk)
        except ResourceExhaustedError:
            minibatches = np.array_split(batch, batch_size)
            for i, minibatch in enumerate(minibatches):
                try:
                    vectorize_batch_chunk(minibatch, vector_index_chunk)
sys.path.append('../../SimDocSin/')
from datetime import datetime
from preprocess.filename import get_file_paths

start = datetime.now()
args = sys.argv
lang = args[1]

print("Start Loading Target Documents")
paths = get_file_paths(lang)

sent_to_doc_map = {}
f = 1024
t = AnnoyIndex(f, 'euclidean')
t.on_disk_build("../index/test_" + lang + ".ann")

sent_count = {}
sent_count[0] = 0
count = 0
i = 0
document_count = 0

for file_name in paths:
    file = open(file_name, encoding='utf-8')
    embed_data = json.load(file)
    for j in range(len(embed_data)):
        # si_doc = Embeddings[j]['content_si']
        si_doc_embed = embed_data[j]['embed_' + lang]
def build_annoy_index(X, path, metric='angular', ntrees=50, build_index_on_disk=True, verbose=1):
    """ Build a standalone annoy index.

    :param array X: numpy array with shape (n_samples, n_features)
    :param str path: The filepath of a trained annoy index file saved on disk.
    :param int ntrees: The number of random projections trees built by Annoy to
        approximate KNN. The more trees, the higher the memory usage, but the
        better the accuracy of results.
    :param bool build_index_on_disk: Whether to build the annoy index directly
        on disk. Building on disk should allow for bigger datasets to be indexed,
        but may cause issues.
    :param str metric: Which distance metric Annoy should use when building the
        KNN index. Supports "angular", "euclidean", "manhattan", "hamming", or "dot".
    :param int verbose: Controls the volume of logging output the model produces
        when training. When set to 0, silences outputs, when above 0 will print
        outputs.
    """
    if verbose:
        print("Building KNN index")

    if len(X.shape) > 2:
        if "reshape" in dir(X):
            if verbose:
                print('Flattening multidimensional input before building KNN index using Annoy')
            X = X.reshape((X.shape[0], -1))
        else:
            raise ValueError("Attempting to build AnnoyIndex on multi-dimensional data"
                             " without providing a reshape method. AnnoyIndexes require"
                             " 2D data - rows and columns.")

    index = AnnoyIndex(X.shape[1], metric=metric)
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            vector = X[i].toarray()[0]
            index.add_item(i, vector)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            vector = X[i]
            index.add_item(i, vector)

    try:
        index.build(ntrees)
    except Exception as e:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
               " may solve the issue, especially on Windows.")
        raise Exception(msg) from e
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
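# A hypothetical call exercising the reshape branch of build_annoy_index above:
# a 3-D array of per-sample embeddings is flattened to 2-D before indexing.
# The shapes and output path are illustrative assumptions.
import numpy as np

X = np.random.rand(200, 8, 16).astype('float32')   # (n_samples, 8, 16)
index = build_annoy_index(X, 'flattened.index', ntrees=10, verbose=0)
# Internally X is reshaped to (200, 128), so queries use 128-d vectors.
neighbours = index.get_nns_by_vector(X[0].reshape(-1), 5)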
def create_annoy_index(filename, vector_filepaths, dims=300, n_trees=10, check_dupes=False, on_disk=True):
    '''
    Build an Annoy index for approximate nearest neighbours, ingesting one or
    more of the pySRP vector files.

    Uses the on-disk build. Includes an index of id numbers and mtids, saved as
    {filename}.index.pq
    '''
    import time
    start = time.time()

    if type(vector_filepaths) is not list:
        vector_filepaths = [vector_filepaths]

    t = AnnoyIndex(dims)
    if on_disk:
        t.on_disk_build(filename)

    # List of mtids, where the list index matches the index given to annoy
    ind = []
    unique = set()
    lasthtid = None
    i = 0

    for path in vector_filepaths:
        with Vector_file(path, mode='r') as vecf:
            assert dims == vecf.dims
            for ix, vec in vecf:
                norm = np.linalg.norm(vec)
                if norm == 0 or np.isnan(norm) or np.isinf(norm):
                    continue
                vec = vec / norm
                if check_dupes:
                    # Does two things - avoids duplicated pages / chunks,
                    # and only allows consecutive streams of a book - once
                    # the stream has moved on, that book can't be added again
                    mtid_split = split_mtid(ix)
                    htid = mtid_split[0]
                    seq = "-".join([str(x) for x in mtid_split[1:]])
                    if lasthtid != htid:
                        if htid in unique:
                            continue
                        else:
                            lasthtid = htid
                            unique.add(htid)
                            currentseqs = set([seq])
                    elif seq in currentseqs:
                        continue
                    else:
                        currentseqs.add(seq)
                assert i == len(ind)
                ind.append(ix)
                t.add_item(i, vec)
                i += 1
        print("Total vecs", len(ind), end=',')

    print("Done ingesting. Time: %.0f seconds; Building" % (time.time() - start))
    t.build(n_trees)
    if not on_disk:
        t.save(filename)

    print("Done build. Time: %.0f seconds; Saving Index" % (time.time() - start))

    #ind = pd.Series(ind).to_frame('mtid')
    ind = (pd.Series(ind).apply(lambda x: x.split('-', 1)[0])
           .reset_index()
           .rename(columns={0: 'htid'})
           .groupby('htid')['index']
           .aggregate(['min', 'max'])
           .sort_index())
    ind.to_parquet('%s.index.pq' % filename, compression='snappy')
def build_index(sheets_path, restrict_class=None, restrict_range=None, store_desckp=True):
    print("building index...")

    if restrict_class and restrict_range:
        bboxes = restrict_bboxes(sheets_path, restrict_class, restrict_range)
    else:
        bboxes_dict = find_sheet.get_dict(sheets_path)
        bboxes = list(bboxes_dict.values())

    keypoint_dict = {}
    t = AnnoyIndex(config.index_descriptor_length, config.index_annoydist)
    t.on_disk_build(config.reference_index_path)

    idx_id = 0
    index_dict = {}
    progress = progressbar.ProgressBar(maxval=len(bboxes))
    for bbox in progress(bboxes):
        try:
            rivers_json = osm.get_from_osm(bbox)
        except JSONDecodeError:
            print("error in OSM data for bbox %s, skipping sheet" % bbox)
            continue
        reference_river_image = osm.paint_features(rivers_json, bbox)

        # reduce image size for performance with fixed aspect ratio
        processing_size = resize_by_width(reference_river_image.shape, config.index_img_width_train)
        reference_image_small = cv2.resize(reference_river_image, processing_size, config.resizing_index_building)

        if config.index_border_train:
            reference_image_small = cv2.copyMakeBorder(
                reference_image_small,
                config.index_border_train, config.index_border_train,
                config.index_border_train, config.index_border_train,
                cv2.BORDER_CONSTANT, None, 0)

        # get class label
        # class_label = find_sheet.find_name_for_bbox(sheets_path, bbox)
        class_label = list(bboxes_dict.keys())[bboxes.index(bbox)]
        if not class_label:
            print("error in class name. skipping bbox", bbox)
            continue

        # extract features of sheet
        try:
            keypoints, descriptors = extract_features(
                reference_image_small, first_n=config.index_n_descriptors_train)
        except ValueError as e:
            print(type(e), e)
            print("error in descriptors. skipping sheet", class_label)
            continue
        if descriptors is None or len(descriptors) == 0 or descriptors[0] is None:
            print("no descriptors in bbox ", bbox)
            print("error in descriptors. skipping sheet", class_label)
            continue

        # add features and class=sheet to index
        index_dict[class_label] = descriptors
        keypoint_dict[class_label] = [x.pt for x in keypoints]
        for x in descriptors:
            t.add_item(idx_id, x)
            idx_id += 1
        sheet_names[class_label] = len(descriptors)

    t.build(config.index_num_trees, n_jobs=-1)  # compile index and save to disk

    # save other data to disk
    joblib.dump(sheet_names, config.reference_sheets_path)
    if store_desckp:
        for sheet, descs in index_dict.items():
            joblib.dump(descs, config.reference_descriptors_folder + "/%s.clf" % sheet)
        for sheet, kps in keypoint_dict.items():
            joblib.dump(kps, config.reference_keypoints_folder + "/%s.clf" % sheet)
def gen_cbir():
    """Generate structures needed for content-based image retrieval"""
    global kmeans

    # parse config.yaml
    print("parsing config")
    try:
        dirpath = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(dirpath, 'config.yaml')
        with open(path) as f:
            config = yaml.safe_load(f)
    except IOError:
        print("error loading config file")
        sys.exit(1)
    try:
        num_cpus = config['cpus']
    except KeyError:
        num_cpus = cpu_count()

    # connect to sqlite database
    print("connecting to databases")
    conn = sqlite3.connect('working/twitter_scraper.db')
    c = conn.cursor()

    # load descriptors
    descriptors = bsddb3.db.DB()
    if os.path.exists("working/descriptors.bdb"):
        descriptors.open("working/descriptors.bdb")
    else:
        descriptors.open("working/descriptors.bdb", dbtype=bsddb3.db.DB_BTREE,
                         flags=bsddb3.db.DB_CREATE)

    # calculate descriptors of new images
    print("determine files to compute")
    c.execute('SELECT path, filename FROM info')
    files = c.fetchall()
    files = [os.path.join(a, b) for a, b in files]
    compute_files = set()
    for i, f in enumerate(files):
        if descriptors.get(f.encode()) is None:
            compute_files.add(f)
        if i % 10000 == 0:
            print(i)
    print('files to compute: {}'.format(len(compute_files)))
    files = enumerate(compute_files)

    # extract features from new images
    print("computing descriptors")
    new_descriptors = {}
    with Pool(processes=num_cpus) as pool:
        for r in pool.imap(extract_features, files, chunksize=64):
            if not isinstance(r, Exception):
                des = deserialize(r[2])
                descriptors[r[1].encode()] = des
                new_descriptors[r[1]] = des

    # create clusters
    try:
        kmeans = joblib.load('working/kmeans.pkl')
        n_clusters = kmeans.cluster_centers_.shape[0]
    except:
        n_clusters = 512
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=2048)

    # calculate kmeans
    print("calculating kmeans")
    cur = None
    for i, des in enumerate(new_descriptors.items()):
        if des[1] is not None:
            print(f'calculating kmeans, image: {i:08d}')
            if des[1].shape[0] < n_clusters:
                if cur is None:
                    cur = des[1]
                else:
                    cur = np.concatenate((cur, des[1]), axis=0)
                if cur is not None and cur.shape[0] > n_clusters:
                    kmeans = kmeans.partial_fit(np.float32(cur))
                    cur = None
            else:
                if cur is not None:
                    cur = np.concatenate((cur, des[1]), axis=0)
                    kmeans = kmeans.partial_fit(np.float32(cur))
                    cur = None
                else:
                    kmeans = kmeans.partial_fit(np.float32(des[1]))
    if cur is not None:
        kmeans = kmeans.partial_fit(np.float32(cur))
    del new_descriptors
    gc.collect()

    # save kmeans
    print("saving kmeans")
    joblib.dump(kmeans, 'working/kmeans.pkl')

    # set up structures for annoy index
    print("setting up annoy structures")
    c.execute('SELECT path, filename FROM info')
    all_images = c.fetchall()
    files = []
    for f in all_images:
        fullpath = os.path.join(f[0], f[1])
        if descriptors.get(fullpath.encode()) is not None:
            files.append(fullpath)
    BOW_annoy_map = {}
    for i, f in enumerate(files):
        BOW_annoy_map[i] = f

    index = AnnoyIndex(n_clusters, 'angular')
    index.on_disk_build('working/BOW_index.ann')

    # add histograms to annoy index
    print("computing histograms")
    for i, f in enumerate(files):
        r = compute_histograms(i, f, descriptors)
        if not isinstance(r, Exception):
            index.add_item(r[0], r[2])

    # build index
    print("building index")
    index.build(50)
    descriptors.sync()
    descriptors.close()

    # save index map
    print("saving annoy map")
    joblib.dump(BOW_annoy_map, 'working/BOW_annoy_map.pkl')