Example #1
File: knn.py Project: sbhadade/ivis
def build_annoy_index(X, path, ntrees=50, build_index_on_disk=True, verbose=1):

    index = AnnoyIndex(X.shape[1], metric='angular')
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
               " may solve the issue, especially on Windows.")
        raise IndexBuildingError(msg)
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
Example #2
def nn_annoy(ds1,
             ds2,
             names1,
             names2,
             knn=20,
             metric='euclidean',
             n_trees=50,
             save_on_disk=True):
    """ Assumes that Y is zero-indexed. """
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    if save_on_disk:
        a.on_disk_build('annoy.index')
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for i, neighbors in enumerate(ind):
        for b_i in neighbors:
            match.add((names1[i], names2[b_i]))

    return match
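A minimal sketch of calling nn_annoy on synthetic data (the arrays, names, and knn value here are made up for illustration; numpy, AnnoyIndex, and the function above are assumed to be in scope):

import numpy as np

# Hypothetical toy data: 100 query rows matched against 200 reference rows in 16 dimensions.
ds1 = np.random.rand(100, 16)
ds2 = np.random.rand(200, 16)
names1 = ['query_%d' % i for i in range(ds1.shape[0])]
names2 = ['ref_%d' % i for i in range(ds2.shape[0])]

# Returns a set of (query_name, reference_name) pairs, one per neighbour found.
matches = nn_annoy(ds1, ds2, names1, names2, knn=20, save_on_disk=False)
print(len(matches))  # at most 100 * 20 pairs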
Example #3
    def _make_new_manifold(self, *, embedding_id: int, metric: str = 'euclidean', dim: Optional[int] = None) -> int:
        """Will commit multiple write transactions"""
        if dim is None:
            embedding = self.get_embedding(embedding_id)
            dim = embedding.get_dim()

        self.begin_exclusive_transaction()
        c = self._db.execute('INSERT INTO Manifolds (embedding_id, building, metadata) VALUES (?, ?, ?);',
                             (embedding_id, False, '{}'))
        manifold_id = c.lastrowid

        self.__manifolds_embedding[manifold_id] = embedding_id

        full_fn = mkstemp(dir=self.get_data_dir(),
                          prefix=f'{manifold_id:06d}.',
                          suffix='.annoy', text=False)[1]

        index = AnnoyIndex(dim, metric)
        index.on_disk_build(full_fn)
        self.__manifolds_annoy_index[manifold_id] = index  # must put in cache, otherwise it will try to index.load(fn)

        fn = os.path.relpath(full_fn, self.get_data_dir())
        metadata = dict(
            fn=fn,
            metric=metric,
            utc=str(datetime.utcnow())
        )

        self._db.execute('UPDATE Manifolds SET building = ?, metadata = ? WHERE manifold_id = ?;',
                         (True, json.dumps(metadata), manifold_id))
        self.commit()
        return manifold_id
Example #4
def on_disk_build_annoy(file_name, trees=1, dim=128):
    vectors = ujson.loads(open(file_name + ".json", "r").read())
    index = AnnoyIndex(dim)
    index.on_disk_build(file_name + ".ann")

    for i in range(len(vectors)):
        index.add_item(i, vectors[i])
    index.build(trees)
    return index
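A rough usage sketch for the function above, assuming the sidecar .json file holds a plain list of vectors (the file names and sizes here are hypothetical):

import random
import ujson

# Write 1000 random 128-dim vectors to embeddings.json, the format the loader expects.
vectors = [[random.gauss(0, 1) for _ in range(128)] for _ in range(1000)]
with open('embeddings.json', 'w') as fh:
    fh.write(ujson.dumps(vectors))

# Builds embeddings.ann directly on disk from embeddings.json.
index = on_disk_build_annoy('embeddings', trees=10, dim=128)
print(index.get_nns_by_item(0, 5))  # 5 approximate neighbours of the first vector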
Example #5
    def test_on_disk(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.on_disk_build('on_disk.ann')
        self.add_items(i)
        i.build(10)
        self.check_nns(i)
        i.unload()
        i.load('on_disk.ann')
        self.check_nns(i)
        j = AnnoyIndex(f, 'euclidean')
        j.load('on_disk.ann')
        self.check_nns(j)
Example #6
def load_data(path_data):
    ids = []
    with open_fn(path_data, 'rb') as f:
        for i, record in enumerate(avro.reader(f)):
            v = record[FACTORS_KEY]
            if i == 0:
                n_dim = len(v)
                ann = AnnoyIndex(n_dim, metric=METRIC)
                ann.on_disk_build(PATH_DISK_SAVE)

            ann.add_item(i, v)
            ids.append(record[ID_KEY])
    return ann, ids
Example #7
def build_annoy_index(X,
                      path,
                      ntrees=50,
                      build_index_on_disk=True,
                      verbose=True):
    """
    Build a standalone Annoy index.

    Parameters
    -------------
    X: np.array with shape (n_samples, n_features)
    path: str or Path
        The filepath of a trained annoy index file saved on disk
    ntrees: int
        The number of random projection trees built by Annoy to approximate KNN. The more trees, the
        higher the memory usage, but the better the accuracy of results (default 50)
    build_index_on_disk: bool
        Whether to build the annoy index directly on disk. Building on disk should allow bigger
        datasets to be indexed, but may cause issues. If None, on-disk building is enabled on
        Linux but disabled on Windows, where it is known to cause problems.
    verbose: bool

    """
    verbose = int(verbose)

    index = AnnoyIndex(X.shape[1], metric='angular')
    if build_index_on_disk:
        index.on_disk_build(str(path))

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        raise IndexBuildingError(
            'Error building Annoy Index. Try setting `build_index_on_disk` to False.'
        )
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
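A minimal sketch of building the index and memory-mapping it again in a later process (the matrix and path are hypothetical; AnnoyIndex.load is the standard Annoy call for opening a saved index):

import numpy as np
from annoy import AnnoyIndex

X = np.random.rand(10000, 32).astype(np.float32)  # hypothetical embedding matrix
build_annoy_index(X, 'neighbours.annoy', ntrees=50, verbose=True)

# Later, memory-map the on-disk index instead of rebuilding it.
index = AnnoyIndex(X.shape[1], metric='angular')
index.load('neighbours.annoy')
print(index.get_nns_by_item(0, 10))  # 10 approximate nearest neighbours of row 0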
Example #8
    def test_on_disk(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.on_disk_build('test.ann')
        i.add_item(0, [2, 2])
        i.add_item(1, [3, 2])
        i.add_item(2, [3, 3])

        i.build(10)
        i.unload()

        i.load('test.ann')

        self.assertEqual(i.get_nns_by_vector([4, 4], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 1], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([4, 2], 3), [1, 2, 0])
Example #9
def build_index(matrix, indices, num_trees, metric, index_path, verbose=True):
    total_len = len(indices)
    proj_dim = matrix.shape[1]
    # compute neighbors using annoy
    t0 = time.time()

    index = AnnoyIndex(proj_dim, metric=metric)  # Length of item vector that will be indexed
    index.on_disk_build(index_path)
    for i in range(total_len):
        index.add_item(i, matrix[indices[i], :])
    index.build(num_trees)

    if verbose:
        my_print('time to build ' + str(num_trees) + ' trees = ' + str(time.time() - t0))

    return index
Example #10
def build_annoy_index(X, path, ntrees=50, verbose=1):

    index = AnnoyIndex(X.shape[1])
    index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    # Build n trees
    index.build(ntrees)
    return index
Example #11
def build_annoy_index(X,
                      path,
                      ntrees=50,
                      build_index_on_disk=True,
                      metric="euclidean",
                      verbose=1):
    """ Build a standalone annoy index.
    :param array X: numpy array with shape (n_samples, n_features)
    :param str path: The filepath of a trained annoy index file
        saved on disk.
    :param int ntrees: The number of random projection trees built by Annoy to
        approximate KNN. The more trees, the higher the memory usage, but the
        better the accuracy of results.
    :param bool build_index_on_disk: Whether to build the annoy index directly
        on disk. Building on disk should allow bigger datasets to be indexed,
        but may cause issues. If None, on-disk building is enabled on Linux
        but disabled on Windows, where it is known to cause problems.
    :param int verbose: Controls the volume of logging output the model
        produces when training. When set to 0, silences outputs, when above 0
        will print outputs.
    """

    index = AnnoyIndex(X.shape[1], metric=metric)
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
               " may solve the issue, especially on Windows.")
        raise IndexBuildingError(msg)
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
Example #12
def make_ann(n_dim=N_DIM, n_items=100):
    ids = []
    ann = AnnoyIndex(n_dim, METRIC)
    ann.on_disk_build(PATH_DISK_SAVE)

    for ind in range(n_items):
        v = [random.gauss(0, 1) for _ in range(n_dim)]
        ann.add_item(ind, v)
        ids.append(str(ind))

    ann.build(N_TREES)

    meta_d = {
        'vec_src': Path(__file__).name,
        'metric': METRIC,
        'n_dim': n_dim,
        'timestamp_utc': datetime.utcnow().isoformat(),
    }

    return ids, meta_d
Example #13
File: knn.py Project: kevinrue/ivis
def build_annoy_index(X, path, ntrees=50, verbose=1):

    index = AnnoyIndex(X.shape[1], metric='angular')
    if platform.system() != 'Windows':
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    # Build n trees
    index.build(ntrees)
    if platform.system() == 'Windows':
        index.save(path)

    return index
Example #14
    def compute_and_store_similarity(self):
        start = time.time()
        sessions_VSM, sessions_id = self._driver.session_vectors()
        print("Time to create the vector:", time.time() - start)
        t = AnnoyIndex(sessions_VSM.shape[1], 'angular')
        t.on_disk_build('/tmp/test.ann')
        start = time.time()
        i = 0
        overall_size = sessions_VSM.shape[0]
        for ix in range(overall_size):
            x = sessions_VSM.getrow(ix)
            t.add_item(ix, x.toarray()[0])
            i += 1
            if i % 1000 == 0:
                print(i, "rows processed over", overall_size)
        print("Time to index:", time.time() - start)
        del sessions_VSM
        gc.collect()
        start = time.time()
        t.build(5)  # 5 trees
        print("Time to build:", time.time() - start)
        knn_start = time.time()
        i = 0
        for ix in range(overall_size):
            knn = self.compute_knn(ix, sessions_id, t, 50)
            start = time.time()
            self.store_knn(sessions_id[ix], knn)
            self.__time_to_store.append(time.time() - start)
            i += 1
            if i % 100 == 0:
                print(i, "rows processed over", overall_size)
                print(mean(self.__time_to_query),
                      mean(self.__time_to_knn),
                      mean(self.__time_to_sort),
                      mean(self.__time_to_store))
                self.__time_to_query = []
                self.__time_to_knn = []
                self.__time_to_sort = []
                self.__time_to_store = []
        print("Time to compute knn:", time.time() - knn_start)
Example #15
    def _build_annoy_index(self, annoy_index_path):
        annoy_index = AnnoyIndex(self.encoder.dimension, 'angular')

        if os.path.exists(annoy_index_path):
            print(f"Loading Annoy index from {annoy_index_path}...")
            annoy_index.load(annoy_index_path, prefault=True)
        else:
            print("Building Annoy index...")
            annoy_index.on_disk_build(annoy_index_path)

            for starting_index in tqdm(
                    range(0, len(self.target_sentences), _BATCH_SIZE)):
                target_sentences = self.target_sentences[
                    starting_index:starting_index + _BATCH_SIZE]
                target_vectors = self.encoder.get_vectors(target_sentences)
                for i, vector in enumerate(target_vectors,
                                           start=starting_index):
                    annoy_index.add_item(i, vector)

            annoy_index.build(_N_TREES)

        return annoy_index
Example #16
    def reindex_manifold(self, manifold_id: int, metric: str = 'euclidean', n_trees: int = 10) -> int:

        old_index = self._get_annoy_index(manifold_id)

        with self._db:
            self.begin_exclusive_transaction()
            if not self._db.execute('SELECT ready FROM Manifolds WHERE manifold_id = ?', (manifold_id,)).fetchone()[0]:
                raise RuntimeError(f'Could not reindex manifold #{manifold_id} which is not ready.')

            full_fn = mkstemp(dir=self.get_data_dir(),
                              prefix=f'{manifold_id:06d}.',
                              suffix='.annoy', text=False)[1]

            index = AnnoyIndex(old_index.f, metric)
            index.on_disk_build(full_fn)
            self.__manifolds_annoy_index[manifold_id] = index

            fn = os.path.relpath(full_fn, self.get_data_dir())
            metadata = self._get_manifold_metadata(manifold_id)
            metadata.update(dict(
                fn=fn,
                metric=metric,
                utc=str(datetime.utcnow()),
            ))
            self.__logger.debug(f"Created new index on {fn}")

            self._db.execute(
                'UPDATE Manifolds SET building = 1, ready = 0, metadata = ? '
                'WHERE manifold_id = ?;', (json.dumps(metadata), manifold_id))

            self.commit()

        self.__logger.info("Copying items from old index...")
        for item_i in range(old_index.get_n_items()):
            index.add_item(item_i, old_index.get_item_vector(item_i))

        return self._build_manifold_index(manifold_id, n_trees=n_trees)
Example #17
class Annoy(VectorIndex):
    def __init__(self, path, dims=None, metric='angular', build_on_disk=True):
        self.path = path
        self.is_mutable = None
        self.is_built = None
        self.build_on_disk = build_on_disk
        self.metric = metric

        if os.path.isfile(self.path):
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            assert self.dims == dims or not dims, \
              'Passed path to existing index but dims do not match'
            assert self.metric == metric or not metric, \
              'Passed path to existing index but metrics do not match'
            self.index = AnnoyIndex(self.dims, metric=self.metric)
        elif dims:
            logging.debug(
                f'Creating new index with {dims} dimensions and {self.metric} metric'
            )
            self.dims = dims
            self.index = AnnoyIndex(self.dims, metric=self.metric)
            if build_on_disk:
                self.index.on_disk_build(self.path)
        else:
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            self.index = AnnoyIndex(self.dims, metric=self.metric)

    @property
    def meta_path(self):
        return self.path + '.meta.json'

    @property
    def files(self):
        return [self.path, self.meta_path]

    def load_meta(self):
        self.__dict__.update(load_json(self.meta_path))

    def save_meta(self):
        d = {**self.__dict__}
        d.pop('index')
        save_json(d, self.meta_path)

    def build(self, num_trees=10):
        logging.debug(f'starting to build index: {self.path}')
        self.index.build(num_trees)
        logging.debug(f'finished building index: {self.path}')
        self.is_mutable = False
        self.is_built = True
        self.save_meta()

    def save(self):
        self.index.save(self.path)
        self.is_mutable = False
        self.save_meta()

    def load(self, memory=False):
        self.index.load(self.path, prefault=memory)
        self.is_mutable = False

    def unload(self):
        self.index.unload()

    def __del__(self):
        self.unload()

    def __setitem__(self, idx, vector):
        self.index.add_item(idx, vector)

    def __getitem__(self, idx):
        return self.index.get_item_vector(idx)

    def __len__(self):
        return self.index.get_n_items()

    def add(self, vector):
        idx = len(self)
        self[idx] = vector
        return idx

    def add_bulk(self, vectors):
        start = len(self)
        for n, v in enumerate(vectors):
            self[start + n] = v
        return self

    def set_bulk(self, indices, vectors):
        for idx, vector in zip(indices, vectors):
            self[idx] = vector

    def search(self, vector, num=10, depth=None, distances=True):
        return self.index.get_nns_by_vector(vector, num, depth or -1,
                                            distances)

    def search_index(self, idx, num=10, depth=None, distances=True):
        return self.index.get_nns_by_item(idx, num, depth or -1, distances)

    def distance(self, i, j):
        return self.index.get_distance(i, j)
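A short sketch of driving the wrapper end to end (the path and data are hypothetical, and load_json/save_json are assumed to be small helpers defined elsewhere in the project):

import random

vectors = [[random.gauss(0, 1) for _ in range(64)] for _ in range(1000)]

idx = Annoy('items.annoy', dims=64, metric='angular')  # sets up on_disk_build('items.annoy')
idx.add_bulk(vectors)
idx.build(num_trees=10)  # builds the trees and writes items.annoy.meta.json

ids, dists = idx.search(vectors[0], num=5)  # neighbours of the first vector, with distances
print(ids, dists)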
Example #18
def vectorize_batch_chunk(lbatch, vector_index_chunk):
    global doc_counter

    doc_idxs = []
    for i in range(lbatch.shape[0]):
        doc_idxs.append(doc_counter)
        doc_counter += 1

    vectors = generate_embeddings(lbatch["text"])
    if len(vectors.shape) >= 2 and vectors.shape[1] > 0:
        for vec, page_num in zip(vectors, doc_idxs):
            vector_index_chunk.add_item(page_num, vec)


vector_index_chunk = AnnoyIndex(vector_dims, 'angular')
vector_index_chunk.on_disk_build(ES_INDEX_CHUNK + "_annoy.bin")

with tqdm(total=total_chunks) as pbar:
    for j, batch in enumerate(
            pd.read_json('nyc_docs-sentences15.json',
                         lines=True,
                         chunksize=batch_size)):
        batch["smallenough"] = batch["text"].apply(lambda x: len(x) < 100000)
        batch = batch[batch["smallenough"]]
        try:
            vectorize_batch_chunk(batch, vector_index_chunk)
        except ResourceExhaustedError:
            minibatches = np.array_split(batch, batch_size)
            for i, minibatch in enumerate(minibatches):
                try:
                    vectorize_batch_chunk(minibatch, vector_index_chunk)
Example #19
sys.path.append('../../SimDocSin/')
from datetime import datetime
from preprocess.filename import get_file_paths

start = datetime.now()

args = sys.argv
lang = args[1]

print("Start Loading Target Documents")
paths = get_file_paths(lang)
sent_to_doc_map = {}

f = 1024
t = AnnoyIndex(f, 'euclidean')
t.on_disk_build("../index/test_" + lang + ".ann")

sent_count = {}
sent_count[0] = 0
count = 0
i = 0
document_count = 0

for file_name in paths:
    file = open(file_name, encoding='utf-8')
    embed_data = json.load(file)

    for j in range(len(embed_data)):
        # si_doc = Embeddings[j]['content_si']
        si_doc_embed = embed_data[j]['embed_' + lang]
Example #20
File: knn.py Project: zhuye88/ivis
def build_annoy_index(X,
                      path,
                      metric='angular',
                      ntrees=50,
                      build_index_on_disk=True,
                      verbose=1):
    """ Build a standalone annoy index.

    :param array X: numpy array with shape (n_samples, n_features)
    :param str path: The filepath of a trained annoy index file
        saved on disk.
    :param int ntrees: The number of random projection trees built by Annoy to
        approximate KNN. The more trees, the higher the memory usage, but the
        better the accuracy of results.
    :param bool build_index_on_disk: Whether to build the annoy index directly
        on disk. Building on disk should allow for bigger datasets to be indexed,
        but may cause issues.
    :param str metric: Which distance metric Annoy should use when building KNN index.
        Supports "angular", "euclidean", "manhattan", "hamming", or "dot".
    :param int verbose: Controls the volume of logging output the model
        produces when training. When set to 0, silences outputs, when above 0
        will print outputs.

    """

    if verbose:
        print("Building KNN index")

    if len(X.shape) > 2:
        if "reshape" in dir(X):
            if verbose:
                print(
                    'Flattening multidimensional input before building KNN index using Annoy'
                )
            X = X.reshape((X.shape[0], -1))
        else:
            raise ValueError(
                "Attempting to build AnnoyIndex on multi-dimensional data"
                " without providing a reshape method. AnnoyIndexes require"
                " 2D data - rows and columns.")

    index = AnnoyIndex(X.shape[1], metric=metric)
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            vector = X[i].toarray()[0]
            index.add_item(i, vector)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            vector = X[i]
            index.add_item(i, vector)

    try:
        index.build(ntrees)
    except Exception as e:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
               " may solve the issue, especially on Windows.")
        raise Exception(msg) from e
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
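Because this variant flattens higher-dimensional input, it can be pointed at something like an image stack directly (a sketch with made-up shapes and path):

import numpy as np

images = np.random.rand(500, 28, 28)  # hypothetical stack of 500 images
# The (500, 28, 28) array is reshaped to (500, 784) before indexing.
index = build_annoy_index(images, 'images.annoy', metric='euclidean', ntrees=20)
print(index.get_nns_by_item(0, 5))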
Example #21
def create_annoy_index(filename,
                       vector_filepaths,
                       dims=300,
                       n_trees=10,
                       check_dupes=False,
                       on_disk=True):
    '''
    Build an Annoy index for approximate nearest neighbours, ingesting one
    or more pySRP vector files. Uses the on-disk build.

    Also writes an index of id numbers and mtids, saved as {filename}.index.pq
    '''
    import time
    start = time.time()

    if type(vector_filepaths) is not list:
        vector_filepaths = [vector_filepaths]

    t = AnnoyIndex(dims)
    if on_disk:
        t.on_disk_build(filename)

    # List of mtids, where the list index matches the index given to annoy
    ind = []
    unique = set()
    lasthtid = None

    i = 0
    for path in vector_filepaths:
        with Vector_file(path, mode='r') as vecf:
            assert dims == vecf.dims
            for ix, vec in vecf:
                norm = np.linalg.norm(vec)
                if norm == 0 or np.isnan(norm) or np.isinf(norm):
                    continue
                vec = vec / norm
                if check_dupes:
                    # Does two things - avoids duplicated pages / chunks,
                    # and only allows consecutive streams of a book - once
                    # the stream has moved on, that book can't be added again
                    mtid_split = split_mtid(ix)
                    htid = mtid_split[0]
                    seq = "-".join([str(x) for x in mtid_split[1:]])

                    if lasthtid != htid:
                        if htid in unique:
                            continue
                        else:
                            lasthtid = htid
                            unique.add(htid)
                            currentseqs = set([seq])
                    elif seq in currentseqs:
                        continue
                    else:
                        currentseqs.add(seq)

                assert i == len(ind)
                ind.append(ix)
                t.add_item(i, vec)
                i += 1

        print("Total vecs", len(ind), end=',')

    print("Done ingesting. Time: %.0f seconds; Building" %
          (time.time() - start))
    t.build(n_trees)

    if not on_disk:
        t.save(filename)

    print("Done build. Time: %.0f seconds; Saving Index" %
          (time.time() - start))
    #ind = pd.Series(ind).to_frame('mtid')

    ind = (pd.Series(ind).apply(
        lambda x: x.split('-', 1)[0]).reset_index().rename(columns={
            0: 'htid'
        }).groupby('htid')['index'].aggregate(['min', 'max']).sort_index())
    ind.to_parquet('%s.index.pq' % filename, compression='snappy')
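A rough sketch of reading the sidecar table back and mapping Annoy item ids to volumes (file names are hypothetical; the parquet written above stores, per htid, the min and max Annoy index assigned to it, and AnnoyIndex(dims) defaults to the angular metric):

import pandas as pd
from annoy import AnnoyIndex

lookup = pd.read_parquet('vectors.ann.index.pq')  # index: htid, columns: min, max

index = AnnoyIndex(300, 'angular')
index.load('vectors.ann')

for item_id in index.get_nns_by_item(0, 10):
    htid = lookup[(lookup['min'] <= item_id) & (lookup['max'] >= item_id)].index[0]
    print(item_id, htid)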
Example #22
def build_index(sheets_path,
                restrict_class=None,
                restrict_range=None,
                store_desckp=True):
    print("building index...")

    if restrict_class and restrict_range:
        bboxes_dict = restrict_bboxes(sheets_path, restrict_class, restrict_range)
    else:
        bboxes_dict = find_sheet.get_dict(sheets_path)

    bboxes = list(bboxes_dict.values())

    keypoint_dict = {}
    t = AnnoyIndex(config.index_descriptor_length, config.index_annoydist)
    t.on_disk_build(config.reference_index_path)
    idx_id = 0

    index_dict = {}
    sheet_names = {}  # number of descriptors indexed per sheet label
    progress = progressbar.ProgressBar(maxval=len(bboxes))
    for bbox in progress(bboxes):
        try:
            rivers_json = osm.get_from_osm(bbox)
        except JSONDecodeError:
            print("error in OSM data for bbox %s, skipping sheet" % bbox)
            continue
        reference_river_image = osm.paint_features(rivers_json, bbox)

        # reduce image size for performance with fixed aspect ratio
        processing_size = resize_by_width(reference_river_image.shape,
                                          config.index_img_width_train)
        reference_image_small = cv2.resize(reference_river_image,
                                           processing_size,
                                           config.resizing_index_building)
        if config.index_border_train:
            reference_image_small = cv2.copyMakeBorder(
                reference_image_small, config.index_border_train,
                config.index_border_train, config.index_border_train,
                config.index_border_train, cv2.BORDER_CONSTANT, None, 0)
        # get class label
        # class_label = find_sheet.find_name_for_bbox(sheets_path, bbox)
        class_label = list(bboxes_dict.keys())[bboxes.index(bbox)]
        if not class_label:
            print("error in class name. skipping bbox", bbox)
            continue

        # extract features of sheet
        try:
            keypoints, descriptors = extract_features(
                reference_image_small,
                first_n=config.index_n_descriptors_train)
        except ValueError as e:
            print(type(e), e)
            print("error in descriptors. skipping sheet", class_label)
            continue
        if descriptors is None or len(
                descriptors) == 0 or descriptors[0] is None:
            print("no descriptors in bbox ", bbox)
            print("error in descriptors. skipping sheet", class_label)
            continue
        # add features and class=sheet to index
        index_dict[class_label] = descriptors
        keypoint_dict[class_label] = [x.pt for x in keypoints]

        for x in descriptors:
            t.add_item(idx_id, x)
            idx_id += 1
        sheet_names[class_label] = len(descriptors)

    t.build(config.index_num_trees,
            n_jobs=-1)  # compile index and save to disk
    # save other data to disk
    joblib.dump(sheet_names, config.reference_sheets_path)
    if store_desckp:
        for sheet, descs in index_dict.items():
            joblib.dump(
                descs, config.reference_descriptors_folder + "/%s.clf" % sheet)
        for sheet, kps in keypoint_dict.items():
            joblib.dump(kps,
                        config.reference_keypoints_folder + "/%s.clf" % sheet)
Example #23
def gen_cbir():
    """Generate structures needed for content-based image retrieval"""

    global kmeans

    # parse config.yaml
    print("parsing config")
    try:
        dirpath = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(dirpath, 'config.yaml')
        with open(path) as f:
            config = yaml.safe_load(f)
    except IOError:
        print("error loading config file")
        sys.exit(1)
    try:
        num_cpus = config['cpus']
    except KeyError:
        num_cpus = cpu_count()

    # connect to sqlite database
    print("connecting to databases")
    conn = sqlite3.connect('working/twitter_scraper.db')
    c = conn.cursor()

    # load descriptors
    descriptors = bsddb3.db.DB()
    if os.path.exists("working/descriptors.bdb"):
        descriptors.open("working/descriptors.bdb")
    else:
        descriptors.open("working/descriptors.bdb",
                         dbtype=bsddb3.db.DB_BTREE,
                         flags=bsddb3.db.DB_CREATE)

    # calculate descriptors of new images
    print("determine files to compute")
    c.execute('SELECT path, filename FROM info')
    files = c.fetchall()
    files = [os.path.join(a, b) for a, b in files]
    compute_files = set()
    for i, f in enumerate(files):
        if descriptors.get(f.encode()) is None:
            compute_files.add(f)
        if i % 10000 == 0:
            print(i)
    print('files to compute: {}'.format(len(compute_files)))
    files = enumerate(compute_files)

    # extract features from new images
    print("computing descriptors")
    new_descriptors = {}
    with Pool(processes=num_cpus) as pool:
        for r in pool.imap(extract_features, files, chunksize=64):
            if not isinstance(r, Exception):
                des = deserialize(r[2])
                descriptors[r[1].encode()] = des
                new_descriptors[r[1]] = des

    # create clusters
    try:
        kmeans = joblib.load('working/kmeans.pkl')
        n_clusters = kmeans.cluster_centers_.shape[0]
    except Exception:
        n_clusters = 512
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=2048)

    # calculate kmeans
    print("calculating kmeans")
    cur = None
    for i, des in enumerate(new_descriptors.items()):
        if des[1] is not None:
            print(f'calculating kmeans, image: {i:08d}')
            if des[1].shape[0] < n_clusters:
                if cur is None:
                    cur = des[1]
                else:
                    cur = np.concatenate((cur, des[1]), axis=0)
                if cur is not None and cur.shape[0] > n_clusters:
                    kmeans = kmeans.partial_fit(np.float32(cur))
                    cur = None
            else:
                if cur is not None:
                    cur = np.concatenate((cur, des[1]), axis=0)
                    kmeans = kmeans.partial_fit(np.float32(cur))
                    cur = None
                else:
                    kmeans = kmeans.partial_fit(np.float32(des[1]))
    if cur is not None:
        kmeans = kmeans.partial_fit(np.float32(cur))

    del new_descriptors
    gc.collect()

    # save kmeans
    print("saving kmeans")
    joblib.dump(kmeans, 'working/kmeans.pkl')

    # set up structures for annoy index
    print("setting up annoy structures")
    c.execute('SELECT path, filename FROM info')
    all_images = c.fetchall()
    files = []
    for f in all_images:
        fullpath = os.path.join(f[0], f[1])
        if descriptors.get(fullpath.encode()) is not None:
            files.append(fullpath)
    BOW_annoy_map = {}
    for i, f in enumerate(files):
        BOW_annoy_map[i] = f

    index = AnnoyIndex(n_clusters, 'angular')
    index.on_disk_build('working/BOW_index.ann')

    # add histograms to annoy index
    print("computing histograms")
    for i, f in enumerate(files):
        r = compute_histograms(i, f, descriptors)
        if not isinstance(r, Exception):
            index.add_item(r[0], r[2])

    # build index
    print("building index")
    index.build(50)

    descriptors.sync()
    descriptors.close()

    # save index map
    print("saving annoy map")
    joblib.dump(BOW_annoy_map, 'working/BOW_annoy_map.pkl')