Example #1
import annoy

def build_forests(tree_count=250):
    # `db` is assumed to be a module-level pymongo database handle
    path = '/home/developer/spaciotesting/'
    db.testSpacio.update_many({}, {'$unset': {"AnnoyIndex": 1}})
    dis_func = 'angular'
    # forest for the 696-dimensional histogram fingerprints
    t = annoy.AnnoyIndex(696, dis_func)
    items = db.testSpacio.find({})
    for x, item in enumerate(items):
        v = item['fingerprint']
        t.add_item(x, v)
        db.testSpacio.update_one({'_id': item['_id']},
                                 {'$set': {
                                     "AnnoyIndex.fp": x
                                 }})
    t.build(tree_count)
    t.save(path + 'histo250.ann')
    """
    forest for spacio
    """
    t = annoy.AnnoyIndex(2304, dis_func)
    items = db.testSpacio.find({})
    for x, item in enumerate(items):
        v = item['sp']
        # flatten the list-of-lists into one 2304-element vector
        vector = []
        for i in range(len(v)):
            vector += v[i]
        t.add_item(x, vector)
        db.testSpacio.update_one({'_id': item['_id']},
                                 {'$set': {
                                     "AnnoyIndex.sp": x
                                 }})
    t.build(tree_count)
    t.save(path + 'spacio250.ann')
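A minimal companion sketch (not part of the original snippet): load one of the forests saved above and query it. The dimensionality and metric must match the build-time values.

import annoy

# hypothetical usage of the saved histogram forest
u = annoy.AnnoyIndex(696, 'angular')
u.load('/home/developer/spaciotesting/histo250.ann')  # mmaps the file on disk
# ten nearest neighbours of item 0, with distances
ids, dists = u.get_nns_by_item(0, 10, include_distances=True)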
Example #2
import annoy
import spacy
from itertools import islice

def prepare_nlp():
    nlp = spacy.load('en_core_web_md')  # medium English model with word vectors
    qualified = [item for item in nlp.vocab if item.has_vector and item.is_alpha]

    lexmap = []
    t = annoy.AnnoyIndex(300, 'angular')  # metric argument is required in recent annoy
    for i, item in enumerate(islice(sorted(qualified, key=lambda x: x.prob, reverse=True), 100000)):
        t.add_item(i, item.vector)
        lexmap.append(item)
    t.build(25)

    p = annoy.AnnoyIndex(50, 'angular')
    phonmap = []
    phonlookup = {}

    for i, line in enumerate(open("./cmudict-0.7b-simvecs")):
        word, vec_raw = line.split("  ")
        word = word.lower().rstrip("(0123)")
        vec = [float(v) for v in vec_raw.split()]
        p.add_item(i, vec)
        phonmap.append(word)
        phonlookup[word] = vec
    p.build(25)


    return nlp, lexmap, phonmap, phonlookup, t, p
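A hedged usage sketch for the values returned above: Annoy ids map back to spaCy lexemes through lexmap, giving distributionally similar words. The query word is illustrative.

# hypothetical usage of prepare_nlp()'s return values
nlp, lexmap, phonmap, phonlookup, t, p = prepare_nlp()
query = nlp.vocab['ocean']
for i in t.get_nns_by_vector(query.vector, 5):
    print(lexmap[i].text)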
Example #3
    def fit(self, Ciu):
        # delay loading the annoy library in case it's not installed here
        import annoy

        # train the model
        super(AnnoyAlternatingLeastSquares, self).fit(Ciu)

        # build up an Annoy Index with all the item_factors (for calculating
        # similar items)
        if self.approximate_similar_items:
            log.debug("Building annoy similar items index")

            self.similar_items_index = annoy.AnnoyIndex(
                self.item_factors.shape[1], 'angular')
            for i, row in enumerate(self.item_factors):
                self.similar_items_index.add_item(i, row)
            self.similar_items_index.build(self.n_trees)

        # build up a separate index for the inner product (for recommend
        # methods)
        if self.approximate_recommend:
            log.debug("Building annoy recommendation index")
            self.max_norm, extra = augment_inner_product_matrix(
                self.item_factors)
            self.recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
            for i, row in enumerate(extra):
                self.recommend_index.add_item(i, row)
            self.recommend_index.build(self.n_trees)
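The recommend index works because maximum-inner-product search reduces to cosine search after padding each row with one extra component. Below is an illustrative sketch of that reduction, not the implicit library's exact augment_inner_product_matrix:

import numpy as np

def augment_inner_product(factors):
    # pad each row so all rows share the same norm; angular/cosine ranking on
    # the augmented vectors then matches inner-product ranking on the originals
    norms = np.linalg.norm(factors, axis=1)
    max_norm = norms.max()
    extra = np.sqrt(max_norm ** 2 - norms ** 2)
    return max_norm, np.append(factors, extra.reshape(-1, 1), axis=1)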
Example #4
    def build_annoy_recommender(als_model: AlternatingLeastSquares,
                                user_labels: np.ndarray, item_labels: np.ndarray,
                                approximate_similar_items=True, approximate_recommend=True,
                                n_trees: int = 1000):
        # build up an Annoy Index with all the item_factors (for calculating similar items)
        if approximate_similar_items:
            log.info("Building annoy similar items index")

            similar_items_index = annoy.AnnoyIndex(
                als_model.item_factors.shape[1], 'angular')
            for i, row in enumerate(als_model.item_factors):
                similar_items_index.add_item(i, row)
            similar_items_index.build(n_trees)

        # build up a separate index for the inner product (for recommend methods)
        if approximate_recommend:
            log.info("Building annoy recommendation index")
            max_norm, extra = augment_inner_product_matrix(als_model.item_factors)
            recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
            for i, row in enumerate(extra):
                recommend_index.add_item(i, row)
            recommend_index.build(n_trees)

        return ImplicitAnnoyRecommender(als_model, recommend_index=recommend_index, max_norm=max_norm,
                                        user_labels=user_labels, item_labels=item_labels)
Example #5
import annoy

def match(descriptor1, descriptor2):
	f = 128
	matches = []
	t = annoy.AnnoyIndex(f, "euclidean")
	nFeatures = len(descriptor1)
	for j in range(nFeatures):
		t.add_item(j, descriptor1[j][1])
	# build() must be called after all add_item() calls, not before
	t.build(1000)

	nFeatures = len(descriptor2)
	for n in range(nFeatures):
		fd = descriptor2[n][1]
		# search for the best match for a feature in the other image
		ind, dist = t.get_nns_by_vector(fd, 2, search_k=-1, include_distances=True)
		# Lowe's ratio test: keep only clearly-best matches
		if dist[1] == 0 or dist[0] / dist[1] < 0.8:
			matches.append([descriptor1[ind[0]][0][0], descriptor1[ind[0]][0][1], descriptor2[n][0][0], descriptor2[n][0][1]])

	t2 = annoy.AnnoyIndex(f, "euclidean")
	nFeatures = len(descriptor2)
	for j in range(nFeatures):
		t2.add_item(j, descriptor2[j][1])
	t2.build(1000)

	nFeatures = len(descriptor1)
	for n in range(nFeatures):
		fd = descriptor1[n][1]
		# symmetric pass: best match in the other direction
		ind2, dist2 = t2.get_nns_by_vector(fd, 2, search_k=-1, include_distances=True)
		if dist2[1] == 0 or dist2[0] / dist2[1] < 0.8:
			matches.append([descriptor1[n][0][0], descriptor1[n][0][1], descriptor2[ind2[0]][0][0], descriptor2[ind2[0]][0][1]])
	print("Feature matching complete...")
	return matches
Example #6
import annoy

def build_index(vecs, file_name):
    t = annoy.AnnoyIndex(512, "dot")
    for i in range(len(vecs)):
        t.add_item(i, vecs[i])
    t.build(n_trees=100)  # tested 100 on bdd, works well, could do more.
    t.save(file_name)
    u = annoy.AnnoyIndex(512, "dot")
    u.load(file_name)  # verify can load.
    return u
Example #7
    def fit(self, X, y=None) -> RandomProjectionTree:
        """ Build the annoy.Index and insert data from X.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Ignored

        Returns
        -------
        self: RandomProjectionTree
            An instance of RandomProjectionTree with a built index
        """
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)
            self.y_train_ = y

        self.n_samples_fit_ = X.shape[0]
        self.n_features_ = X.shape[1]
        self.X_dtype_ = X.dtype
        if self.metric == 'minkowski':  # for compatibility
            self.metric = 'euclidean'
        metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean'
        self.effective_metric_ = metric
        annoy_index = annoy.AnnoyIndex(X.shape[1], metric=metric)
        if self.mmap_dir == 'auto':
            self.annoy_ = create_tempfile_preferably_in_dir(
                prefix='skhubness_', suffix='.annoy', directory='/dev/shm')
            logging.warning(
                f'The index will be stored in {self.annoy_}. '
                f'It will NOT be deleted automatically, when this instance is destructed.'
            )
        elif isinstance(self.mmap_dir, str):
            self.annoy_ = create_tempfile_preferably_in_dir(
                prefix='skhubness_', suffix='.annoy', directory=self.mmap_dir)
        else:  # e.g. None
            self.mmap_dir = None

        if self.verbose:
            enumerate_X = tqdm(
                enumerate(X),
                desc='Build RPtree',
                total=len(X),
            )
        else:
            enumerate_X = enumerate(X)
        for i, x in enumerate_X:
            annoy_index.add_item(i, x.tolist())
        annoy_index.build(self.n_trees)

        if self.mmap_dir is None:
            self.annoy_ = annoy_index
        else:
            annoy_index.save(self.annoy_)

        return self
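A hedged usage sketch for the estimator above; the constructor parameters are assumptions in the scikit-learn style this class follows:

import numpy as np

X = np.random.rand(100, 16)
tree = RandomProjectionTree(n_trees=10, metric='euclidean', mmap_dir=None)
tree.fit(X)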
Example #8
import os

import annoy
import lmdb

def create_index(fn, num_trees=30, verbose=False):
    fn_annoy = fn + '.annoy'
    fn_lmdb = fn + '.lmdb'  # stores word <-> id mapping

    word, vec = next(get_vectors(fn))
    size = len(vec)
    if verbose:
        print("Vector size: {}".format(size))

    env = lmdb.open(fn_lmdb, map_size=int(1e9))
    if not os.path.exists(fn_annoy) or not os.path.exists(fn_lmdb):
        i = 0
        a = annoy.AnnoyIndex(size, 'angular')  # metric is required in recent annoy
        with env.begin(write=True) as txn:
            for word, vec in get_vectors(fn):
                a.add_item(i, vec)
                # lmdb keys and values must be bytes
                id_key = ('i%d' % i).encode()
                word_key = ('w' + word).encode()
                txn.put(id_key, word_key)
                txn.put(word_key, id_key)
                i += 1
                if verbose:
                    if i % 1000 == 0:
                        print(i, '...')
        if verbose:
            print("Starting to build")
        a.build(num_trees)
        if verbose:
            print("Finished building")
        a.save(fn_annoy)
        if verbose:
            print("Annoy index saved to: {}".format(fn_annoy))
            print("lmdb map saved to: {}".format(fn_lmdb))
    else:
        print("Annoy index and lmdb map already in path")
Example #9
    def fit(self, corpus):
        """
        Fit a document similarity model

        Parameters
        ----------

        corpus : object
           a corpus object that follows DefaultJsonCorpus

        Returns
        -------
        
        trained DocumentSimilarity object
        """
        if self.model_type == 'sklearn_nmf':
            model = self.create_sklearn_model(corpus)
        else:
            model = self.create_gensim_model(corpus)

        self.index = similarities.Similarity(self.work_folder + "/gensim_index", model, self.vec_size)
        self.index_annoy = annoy.AnnoyIndex(self.vec_size, metric='angular')
        for i, vec in enumerate(model):
            self.index_annoy.add_item(i, list(gensim.matutils.sparse2full(vec, self.vec_size).astype(float)))
        self.index_annoy.build(self.annoy_trees)
        self.seq2meta = {}
        self.id2meta = {}
        for j in corpus.get_meta():
            self.seq2meta[j['corpus_seq_id']] = j
            self.id2meta[j['id']] = j
        return self
Example #10
    def load(
        self,
        ann_path: str,
        annoy_data_dimensionality: Optional[int] = None,
        annoy_metric: Optional[str] = None,
        annoy_prefault: bool = False,
    ) -> None:
        """
        Loads an approximate nearest neighbour (ANN) instance from disk.

        Parameters
        ----------
        ann_path : str
            Path of saved ANN instance (directory if ann_alg is "scann", filepath otherwise).
        annoy_data_dimensionality : int, optional
            Dimensionality of data (required if ann_alg is set to "annoy").
        annoy_metric : str, optional
            Distance metric (required if ann_alg is set to "annoy").
        annoy_prefault : bool, optional
            Whether or not to enable the `prefault` option when loading Annoy index
            (defaults to False).
        """
        if self._ann_alg == "scann":
            self._ann_index = scann.scann_ops_pybind.load_searcher(ann_path)
        elif self._ann_alg == "annoy":
            self._ann_index = annoy.AnnoyIndex(f=annoy_data_dimensionality,
                                               metric=annoy_metric)
            self._ann_index.load(fn=ann_path, prefault=annoy_prefault)
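A hypothetical call for the Annoy branch above (instance and path names are illustrative):

searcher.load(
    ann_path='output/word_embeddings.ann',
    annoy_data_dimensionality=300,
    annoy_metric='angular',
)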
Example #11
    def __build_index(self, index_file):
        self.embedding_size = self.embeddings.shape[1]

        self.index = an.AnnoyIndex(self.embedding_size, metric='angular')

        for embedding_ind in range(self.embeddings.shape[0]):
            embedding = self.embeddings[embedding_ind, :]
            self.index.add_item(embedding_ind, embedding)

        self.index.build(self.n_trees)

        if self.id_map is None:
            self.id_map = {i: i for i in range(self.embeddings.shape[0])}

        self.inverse_id_map = {v: k for k, v in self.id_map.items()}

        if index_file:
            embeddings_file = index_file + '.embeddings'
            state = {
                'embedding_size': self.embedding_size,
                'id_map': self.id_map,
            }

            self.index.save(embeddings_file)
            with open(index_file, 'wb') as _index_file:
                pickle.dump(state, _index_file)
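A sketch of a matching loader for the two files written above (assumed, not part of the original class): restore the pickled state, then mmap the Annoy file with the recorded dimensionality.

import pickle
import annoy as an

def load_index(index_file, metric='angular'):
    with open(index_file, 'rb') as f:
        state = pickle.load(f)
    index = an.AnnoyIndex(state['embedding_size'], metric=metric)
    index.load(index_file + '.embeddings')
    return index, state['id_map']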
Example #12
 def _build_index(self, documents, encodings):
     self.annoy_index = annoy.AnnoyIndex(self.dimension, metric=self.metric)
     for i, (document, encoding) in enumerate(zip(documents, encodings)):
         self.annoy_index.add_item(i, encoding)
         self.document_ids.append(document["id"])
     self.annoy_index.build(self.num_trees)
     self._save_annoy_index()
Example #13
 def read(self):
     with open(self.pathIds, 'rb') as f:
         data = cPickle.load(f)
         n = data[0]
         self.ids = data[1:]
     self.index = annoy.AnnoyIndex(n, 'angular')  # angular was the old default; must match the saved index
     self.index.load(self.pathAnnoy)
Example #14
 def __init__(self):
     self.tree = annoy.AnnoyIndex(Amadeus.ANNOY_WORD_EMBADDING_NUM)
     self.tree.load(Amadeus.ANNOY_TREE)
     self.dic_keys = np.load(Amadeus.ANNOY_DICT_KEYS)
     self.ws = Amadeus.brain.wordseg.base_wordseg.JiebaSeg()
     self.w2v = W2V()
     self.dic = np.load(Amadeus.ANNOY_DICT).item()
Example #15
    def __init__(self, gensim_emb, texts, trees_n=10):
        self.gensim_emb = gensim_emb
        self.morph = pymorphy2.MorphAnalyzer()
        self.tag_conv = converters.converter('opencorpora-int', 'ud20')
        self.tag_cache = {}

        self.id2text = list(sorted(set(texts)))

        textid2tokens = [[
            tok + '_' + self.get_tag(tok) for tok in txt.split(' ')
        ] for txt in self.id2text]
        tokenid2token = [
            tok for tok in sorted(
                set(tok for txt_toks in textid2tokens for tok in txt_toks))
            if tok in self.gensim_emb.vocab
        ]
        token2tokenid = {tok: i for i, tok in enumerate(tokenid2token)}
        self.tokenid2vec = [self.gensim_emb[tok] for tok in tokenid2token]

        self.tokenid2textid = collections.defaultdict(set)
        self.text2tokenid = collections.defaultdict(set)
        for txt_i, txt_toks in enumerate(textid2tokens):
            txt = self.id2text[txt_i]
            for tok in txt_toks:
                tok_id = token2tokenid.get(tok, None)
                if tok_id is not None:
                    self.tokenid2textid[tok_id].add(txt_i)
                    self.text2tokenid[txt].add(tok_id)

        self.vector_idx = annoy.AnnoyIndex(self.gensim_emb.vectors.shape[1],
                                           'angular')
        for tok_i, tok_vec in enumerate(self.tokenid2vec):
            self.vector_idx.add_item(tok_i, tok_vec)
        self.vector_idx.build(trees_n)
Example #16
 def build_index(self):
     i = 0
     print("Building Index ....")
     ann = annoy.AnnoyIndex(self.size, self.metric)
     with self.env.begin(write=True) as txn:
         for word, vec in get_vectors_file(self.fname):
             # add the vector to annoy index
             ann.add_item(i, vec)
             # use the same id to point to word; lmdb keys and values must be bytes
             id_key = ('i%d' % i).encode()
             word_key = ('w' + word).encode()
             # index by id
             txn.put(id_key, word_key)
             # index by word
             txn.put(word_key, id_key)
             i += 1
             # print the progress
             if i % 1000 == 0:
                 print(i, "...")

     # build the forest of trees. More trees give higher precision when querying
     ann.build(self.number_of_trees)
     # save the index to disk
     ann.save(self.anndb)
     # load the new index (assumes self.ann was created elsewhere with the same size/metric)
     self.ann.load(self.anndb)
     return "Built ann index of size: {}, and loaded it in memory".format(i)
Example #17
    def create_index(self, data_paths: List[RichPath],
                     metadata: Dict[str, Any]) -> None:
        def representation_iter():
            data_chunk_iterator = (r.read_by_file_suffix() for r in data_paths)
            with self.__model.sess.as_default():
                for raw_data_chunk in data_chunk_iterator:
                    for raw_sample in raw_data_chunk:
                        loaded_sample = {}
                        use_example = self.__model._load_data_from_sample(
                            self.__model.hyperparameters,
                            self.__model.metadata,
                            raw_sample=raw_sample,
                            result_holder=loaded_sample,
                            is_train=False)
                        if not use_example:
                            continue

                        _, fetches = self.__model._run_epoch_in_batches(
                            loaded_sample,
                            '(indexing)',
                            is_train=False,
                            quiet=True,
                            additional_fetch_dict={
                                'target_representations':
                                self.__model.ops['target_representations']
                            })
                        target_representations = fetches[
                            'target_representations']

                        idx = 0
                        for node_idx, annotation_data in raw_sample[
                                'supernodes'].items():
                            node_idx = int(node_idx)
                            if 'ignored_supernodes' in loaded_sample and node_idx in loaded_sample[
                                    'ignored_supernodes']:
                                continue

                            annotation = annotation_data['annotation']
                            if ignore_type_annotation(annotation):
                                idx += 1
                                continue

                            yield target_representations[idx], annotation
                            idx += 1

        index = annoy.AnnoyIndex(self.__type_representation_size, 'manhattan')
        indexed_element_types = []
        logging.info('Creating index...')
        for i, (representation, type) in enumerate(representation_iter()):
            index.add_item(i, representation)
            indexed_element_types.append(type)
        logging.info('Indexing...')
        index.build(20)
        logging.info('Index Created.')

        with tempfile.NamedTemporaryFile() as f:
            index.save(f.name)
            with open(f.name, 'rb') as fout:
                metadata['index'] = fout.read()
        metadata['indexed_element_types'] = indexed_element_types
Example #18
    def setup(self, data: torch.Tensor) -> None:
        """
        `data` denotes the "stored tensors". These are the tensors within which we
        want to find the nearest neighbors to a query tensor, via a call to the
        `get_nearest_neighbors` method. Before we can call `get_nearest_neighbors`,
        we need to first store the stored tensors, by doing processing that indexes
        the stored tensors in a form that enables nearest-neighbors computation.
        This method does that preprocessing, and is assumed to be called before any
        call to `get_nearest_neighbors`. In particular, it creates the trees used to
        index the stored tensors. This index is built to enable computation of
        vectors that have the largest dot-product with the query tensors. The tensors
        in the "stored tensors" can be of a common, but arbitrary shape, denoted *, so
        that `data` is of shape (N, *), where N is the number of tensors in the stored
        tensors. Therefore, the 0-th dimension indexes the tensors in the stored
        tensors.

        Args:
            data (tensor): A tensor of shape (N, *) representing the stored tensors.
                    The 0-th dimension indexes the tensors in the stored tensors,
                    so that `data[i]` is the tensor with index `i`. The nearest
                    neighbors of a query will be referred to by their index.
        """
        import annoy

        data = data.view((len(data), -1))
        projection_dim = data.shape[1]
        self.knn_index = annoy.AnnoyIndex(projection_dim, "dot")
        for (i, projection) in enumerate(data):
            self.knn_index.add_item(i, projection)
        self.knn_index.build(self.num_trees)
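A hedged usage sketch: it assumes the surrounding class exposes the get_nearest_neighbors method the docstring describes and accepts num_trees in its constructor (both are assumptions here).

import torch

stored = torch.randn(1000, 3, 8, 8)  # N stored tensors of arbitrary shape *
nn_search = AnnoyNearestNeighbors(num_trees=10)  # assumed class and constructor
nn_search.setup(stored)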
Example #19
import annoy
import numpy as np

def match(descriptors):
    num_image = len(descriptors)
    f = 128
    trees = []
    for i in range(num_image):
        t = annoy.AnnoyIndex(f, "euclidean")
        nFeatures = len(descriptors[i])
        for j in range(nFeatures):
            t.add_item(j, descriptors[i][j][1])
        # build() must be called after all add_item() calls, not before
        t.build(500)
        trees.append(t)
    best_matches = []
    for i in range(num_image):
        best_match = []
        nFeatures = len(descriptors[i])
        for n in range(nFeatures):
            best = np.array([-1] * num_image)
            fd = descriptors[i][n][1]
            # search for the best match for this feature in every other image
            for j in range(num_image):
                # skip the same image
                if i == j:
                    continue
                # query the tree built for image j
                ind, dist = trees[j].get_nns_by_vector(fd,
                                                       2,
                                                       search_k=-1,
                                                       include_distances=True)
                # Lowe's ratio test
                if dist[1] == 0 or dist[0] / dist[1] < 0.8:
                    best[j] = ind[0]
            best_match.append(best)
        best_matches.append(best_match)
    print("Feature matching complete...")
    return best_matches
Example #20
    def __init__(self, tree_path, database_path):

        self.model = annoy.AnnoyIndex(128, "angular")
        self.model.load(tree_path)

        with open(database_path, "r") as f:
            self.database = json.load(f)
Example #21
    def build(self, data, k, cp):
        n_items, vector_length = data.shape
        # initialize parameters
        self.method_param = init_method_param("annoy", data=data, cp=cp)
        ntrees = self.method_param["n_trees"]
        #build index
        self.index = annoy.AnnoyIndex(vector_length, metric=self.metric)
        for i in range(n_items):
            self.index.add_item(i, data[i])
        self.index.build(ntrees)

        #   def query_train(self, data, k):
        #add search_k parameter: tradeoff between speed and accuracy?
        #neighbors_single, distances_single = np.asarray(self.index.get_nns_by_vector(data[i], n=k, search_k=-1, include_distances=True))
        #output array with points x neighbors:
        neighbors = np.empty((data.shape[0], k), dtype=int)
        distances = np.empty((data.shape[0], k))
        for i in range(len(data)):
            neighbors_single, distances_single = np.asarray(
                self.index.get_nns_by_item(i,
                                           n=k,
                                           search_k=-1,
                                           include_distances=True))
            neighbors[i] = neighbors_single
            distances[i] = distances_single
        #print("neighbors.shape: {}".format(neighbors.shape))
        #print("neighbors[0]: {}".format(neighbors[0]))
        #print(neighbors.shape)
        #print("distances.shape: {}".format(distances.shape))
        #print("distances[0]: {}".format(distances[0]))
        return neighbors, distances
Example #22
 def fit(self, X):
     self.n_samples_fit_ = X.shape[0]
     self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=self.metric)
     for i, x in enumerate(X):
         self.annoy_.add_item(i, x.tolist())
     self.annoy_.build(self.n_trees)
     return self
Example #23
    def __init__(
        self,
        module_url,
        index_file_path,
        mapping_file_path,
        dimensions,
        random_projection_matrix_file,
    ):

        # Load the TF-Hub module
        print('Loading the TF-Hub module...')
        self.embed_fn = hub.load(module_url)
        print('TF-hub module is loaded.')

        dimensions = self.embed_fn(['']).shape[1]

        self.random_projection_matrix = None
        if tf.io.gfile.exists(random_projection_matrix_file):
            with open(random_projection_matrix_file, 'rb') as handle:
                self.random_projection_matrix = pickle.load(handle)
            dimensions = self.random_projection_matrix.shape[1]

        self.index = annoy.AnnoyIndex(dimensions, metric=_METRIC)
        self.index.load(index_file_path, prefault=True)
        print('Annoy index is loaded.')
        with open(mapping_file_path, 'rb') as handle:
            self.mapping = pickle.load(handle)
        print('Mapping file is loaded.')
Example #24
 def __create_annoy_index(self,
                          data,
                          space='angular',
                          n_trees=30,
                          load=True,
                          filepath=None,
                          save=False):
     """
     Create or Load Approximate Nearest Neighbors index 
     
     Args: 
         data (array): Thread word vectors 
         space (str): Distance (metric) function can be "angular", "euclidean", "manhattan", "hamming", or "dot"
         n_trees (int): Number of trees in a forest. More trees gives higher precision when querying.
         load (boolean): Load model (True) -- Create model (False)
         filepath (str): Path to Nearest Neighbors
         save (boolean): Save model (True) -- Only used if load=False
     
     Returns:
         index (object): Annoy object
     """
     index = annoy.AnnoyIndex(self.dim, metric=space)
     if load:
         # only need to init if index is saved
         index.load(filepath)
     else:
         for i, vect in enumerate(data):
             # add data
             index.add_item(i, vect)
         # build model
         index.build(n_trees)
         # save indexes
         if save: index.save(filepath)
     return index
Example #25
def musicnn_penultimate(args):
    import pickle

    import annoy
    import musicnn.extractor
    import numpy as np
    from joblib import Parallel, delayed
    from tqdm import tqdm

    index = annoy.AnnoyIndex(200, 'euclidean')

    def slow_embed(z):
        i, wav = z
        try:
            x = musicnn.extractor.extractor(str(wav))
            with open(f'musicnn/{wav.stem}.pickle', 'wb') as f:
                pickle.dump(x, f)
            x = x[2]['penultimate'].mean(axis=0)
            x = x / np.linalg.norm(x)
            return i, x
        except UnboundLocalError:
            return i, None

    results = Parallel(n_jobs=6)(
        delayed(slow_embed)(z)
        for z in tqdm(enumerate(sorted(args.wav_dir.glob('*.wav')))))

    for i, x in results:
        if x is None:
            continue
        index.add_item(i, x)

    index.build(100)
    index.save('musicnn.annoy')
    return index
Example #26
    def rebuild(self, keyTransform=None):
        vecs = self.readVectors(keyTransform)

        ids = []
        n = None
        for k, v in vecs.items():
            ids.append(k)
            if len(v['vector']) == 0:
                pass
            elif n is None:
                n = len(v['vector'])
            else:
                assert (n == len(v['vector']))
        ids.sort()

        ai = annoy.AnnoyIndex(n, 'angular')  # angular assumed; must match read()
        for i, (k, v) in enumerate(vecs.items()):
            if len(v['vector']) == 0: continue
            j = binary_search(ids, k)
            ai.add_item(j, v['vector'])
            if i % 10000 == 0:
                logger.info('loading vector %d into annoy index' % i)
        logger.info('building annoy datastructure')
        ai.build(10)
        logger.info('saving annoy datastructure')
        ai.save(self.pathAnnoy)

        with open(self.pathIds, 'wb') as f:
            cPickle.dump([n] + ids, f)
        self.ids = ids
        self.index = ai
Example #27
import annoy
from scipy.sparse import lil_matrix

def make_adj_mat(
    X,
    n_neighbors=15,
    metric="euclidean",
    n_trees=50,
    seed=None,
    use_dists=False,
    symmetrize=True,
    drop_first=True,
):
    t = annoy.AnnoyIndex(X.shape[1], metric)
    if seed is not None:
        t.set_seed(seed)

    for i, x in enumerate(X):
        t.add_item(i, x)
    t.build(n_trees)

    # construct the adjacency matrix for the graph
    adj = lil_matrix((X.shape[0], X.shape[0]))
    for i in range(X.shape[0]):
        neighs_, dists_ = t.get_nns_by_item(i,
                                            n_neighbors + 1,
                                            include_distances=True)
        if drop_first:
            neighs = neighs_[1:]
            dists = dists_[1:]
        else:
            neighs = neighs_[:n_neighbors]
            dists = dists_[:n_neighbors]

        adj[i, neighs] = dists if use_dists else 1
        if symmetrize:
            adj[neighs, i] = dists if use_dists else 1  # symmetrize on the fly

    return adj, t
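Usage of make_adj_mat as defined above, on random data:

import numpy as np

X = np.random.rand(500, 32)
adj, index = make_adj_mat(X, n_neighbors=15, metric='euclidean', seed=42)
print(adj.nnz)  # number of stored edges in the sparse adjacency matrix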
Example #28
    def add_index(self, user_rooms):
        # user_rooms = [{user_id: '', room_id: '', topic: ''}]
        self.ann = annoy.AnnoyIndex(self.size, self.metric)
        with self.env.begin(write=True) as txn:
            for user_room in user_rooms:
                phrase = user_room.get("topic")
                if phrase:
                    # lmdb keys and values must be bytes
                    id_key = ('i%d' % self.index_size).encode()
                    room_word = 'w' + str(
                        user_room.get("user_id")) + ':' + str(
                            user_room.get("room_id")) + ':' + str(phrase)
                    room_word = room_word.encode('utf-8')
                    # get the vector
                    _phrase, vec = get_vectors_cloud(phrase)
                    # add the vector to annoy index
                    self.ann.add_item(self.index_size, vec)
                    # use the same id to point to the user-room; index by id
                    txn.put(id_key, room_word)
                    # index by user_room
                    txn.put(room_word, id_key)
                    self.index_size += 1

        return "Added user rooms to index. New size: {}".format(
            self.index_size)
Example #29
import os
import sqlite3

import annoy
import photos_2_db  # project-local helpers
from PIL import Image

def run(database_path, index_path, n_items):
    connection = sqlite3.connect(database_path)
    index = annoy.AnnoyIndex(128, 'euclidean')
    index.load(index_path)

    image_loader = photos_2_db.ImageLoader()
    face_detector = photos_2_db.FaceDetector()
    landmarks_predictor = photos_2_db.LandmarksPredictor()
    face_recognizer = photos_2_db.FaceRecognizer()

    while True:
        input_image = input('Image path >>> ')

        if not os.path.isfile(input_image):
            print('File %s does not exist.' % input_image)
            continue

        images = image_loader(file_path=input_image, width=200, height=200)
        faces = face_detector(images=images)
        landmarks = landmarks_predictor(images=images, faces=faces)
        features = face_recognizer(images=images, landmarks=landmarks)

        for feature in features[0]:
            f = list(feature)
            nearest = index.get_nns_by_vector(f, n_items, search_k=-1, include_distances=True)

            for idx, distance in zip(nearest[0], nearest[1]):
                cur = connection.cursor()
                # parameterized query instead of string interpolation
                cur.execute('SELECT * FROM features WHERE id=?', (idx,))

                rows = cur.fetchall()

                for row in rows:
                    im = Image.open(row[1])
                    im.show()
Example #30
 def predict(self, annoytreepath):
     import annoy
     self._annoy = annoy.AnnoyIndex(f=200, metric=self._metric)
     self._annoy.load(annoytreepath)
     with open('word2id.pickle', 'rb') as f:
         self.word2id = cPickle.load(f)
     with open('id2word.pickle', 'rb') as f:
         self.id2word = cPickle.load(f)