Example #1
def characterize_dataset(model, sequences, entity2unique, entity2same,
                         unique_text, nnlens):
    predictions = model.predict(sequences)
    t = AnnoyIndex(
        len(predictions[0]),
        metric='euclidean')  # Length of item vector that will be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        # print(predictions[i])
        v = predictions[i]
        t.add_item(i, v)

    t.build(100)  # 100 trees

    for nnlen in nnlens:
        print("Characteristics at neighborhood length:" + str(nnlen))
        pos_distances = []
        neg_distances = []
        match = 0
        no_match = 0

        for key in entity2same:
            index = entity2unique[key]
            nearest = t.get_nns_by_vector(predictions[index], nnlen)
            nearest_text = set([unique_text[i] for i in nearest])
            expected_text = set(entity2same[key])
            overlap = expected_text.intersection(nearest_text)

            m = len(overlap)
            match += m
            # since we asked for only x nearest neighbors, and we get at most x-1 neighbors that are not the same as key (!)
            # make sure we adjust our estimate of no match appropriately
            no_match += min(len(expected_text), nnlen - 1) - m

            # annoy has this annoying habit of returning the queried item back as a nearest neighbor.  Remove it.
            if key in nearest_text:
                nearest_text.remove(key)

            # sample only the negatives that are true negatives,
            # i.e. those not in the expected set ('semi-hard' negative sampling is not done here)
            pos = expected_text
            neg = nearest_text - expected_text

            for i in pos:
                dist_pos = t.get_distance(index, entity2unique[i])
                pos_distances.append(dist_pos)
            for i in neg:
                dist_neg = t.get_distance(index, entity2unique[i])
                neg_distances.append(dist_neg)

        recall = match / (match + no_match)

        print("mean positive distance:" + str(statistics.mean(pos_distances)))
        print("stdev positive distance:" +
              str(statistics.stdev(pos_distances)))
        print("max positive distance:" + str(max(pos_distances)))
        print("mean neg distance:" + str(statistics.mean(neg_distances)))
        print("stdev neg distance:" + str(statistics.stdev(neg_distances)))
        print("max neg distance:" + str(max(neg_distances)))
        print("recall:" + str(recall))
Example #2
def _get_knn_graph_annoy(X,
                         n_neighbors=5,
                         dist_metric='euclidean',
                         random_seed=0):
    ''' 
    Build k-nearest-neighbor graph
    Return edge list and nearest neighbor matrix
    '''
    try:
        from annoy import AnnoyIndex
    except ImportError:
        raise ImportError('Please install the package "annoy". '
                          'Alternatively, set `knn_method=\'umap\'`.')
    npc = X.shape[1]
    ncell = X.shape[0]
    annoy_index = AnnoyIndex(npc, metric=dist_metric)
    annoy_index.set_seed(random_seed)

    for i in range(ncell):
        annoy_index.add_item(i, list(X[i, :]))
    annoy_index.build(10)  # 10 trees

    knn = []
    knn_dists = []
    for iCell in range(ncell):
        neighbors, dists = annoy_index.get_nns_by_item(iCell,
                                                       n_neighbors + 1,
                                                       include_distances=True)
        knn.append(neighbors[1:])
        knn_dists.append(dists[1:])
    knn = np.array(knn, dtype=int)
    knn_dists = np.array(knn_dists)

    return knn, knn_dists
Example #3
def annoy_build(df, id, metric='euclidean'):
    m = AnnoyIndex(VECTOR_SIZE, metric=metric) 
    m.set_seed(42)
    for _, row in df.iterrows():
        m.add_item(row[id], row['vectors'])
    m.build(TREE_QUERIES)
    return m
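`VECTOR_SIZE` and `TREE_QUERIES` are module-level constants defined elsewhere in the source project; the snippet assumes each row carries its Annoy id in the `id` column and its embedding in `vectors`. A minimal usage sketch, with illustrative constant values and a hypothetical id column name:

# Hypothetical usage of annoy_build; constants and column names are assumptions.
import numpy as np
import pandas as pd

VECTOR_SIZE = 100    # dimensionality of each entry in the 'vectors' column
TREE_QUERIES = 50    # number of trees passed to AnnoyIndex.build()

df = pd.DataFrame({
    'item_id': [0, 1, 2],
    'vectors': [np.random.rand(VECTOR_SIZE).tolist() for _ in range(3)],
})
index = annoy_build(df, 'item_id')
print(index.get_nns_by_item(0, 2))    # the two closest items to item 0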
Example #4
 def find_candidates_udf(u_factor):
     from annoy import AnnoyIndex  # must import here !
     u = AnnoyIndex(rank, 'dot')
     u.set_seed(random_seed)
     u.load(SparkFiles.get(
         tree_ann_path))  # tree_ann_path must be absolute path
     return u.get_nns_by_vector(u_factor,
                                n=nns,
                                search_k=-1,
                                include_distances=False)
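`find_candidates_udf` closes over `rank`, `random_seed`, `nns`, and `tree_ann_path` from the enclosing driver code and reloads the mmap'd tree on each executor. A hedged sketch of how it might be registered as a Spark UDF; the `spark` session, `user_factors` DataFrame, and column names are assumptions, not part of the original snippet:

# Hypothetical wiring of find_candidates_udf into a Spark job.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType

spark.sparkContext.addFile(tree_ann_path)   # ship the .ann file to every executor
find_candidates = F.udf(find_candidates_udf, ArrayType(IntegerType()))
user_recs = user_factors.withColumn('candidates', find_candidates('features'))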
Example #5
def generate_semi_hard_triplets_from_ANN(model, sequences, entity2unique,
                                         entity2same, unique_text, test):
    predictions = model.predict(sequences)
    t = AnnoyIndex(
        len(predictions[0]),
        metric='euclidean')  # Length of item vector that will be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        # print(predictions[i])
        v = predictions[i]
        t.add_item(i, v)

    t.build(100)  # 100 trees

    triplets = {}

    triplets['anchor'] = []
    triplets['positive'] = []
    triplets['negative'] = []

    if test:
        NNlen = TEST_NEIGHBOR_LEN
    else:
        NNlen = TRAIN_NEIGHBOR_LEN

    for key in entity2same:
        index = entity2unique[key]

        expected_text = set(entity2same[key])
        expected_ids = [entity2unique[i] for i in expected_text]

        for positive in expected_text:
            k = entity2unique[positive]
            nearest = t.get_nns_by_vector(predictions[k], NNlen)
            dist_k = t.get_distance(index, k)

            semi_hards = []
            for n in nearest:
                if n == index or n in expected_ids or n == k:
                    continue
                n_dist = t.get_distance(index, n)
                if n_dist > dist_k:
                    semi_hards.append(unique_text[n])

            # shuffle(semi_hards)
            # semi_hards = semi_hards[0:20]

            for i in semi_hards:
                triplets['anchor'].append(key)
                triplets['positive'].append(unique_text[k])
                triplets['negative'].append(i)

    return triplets
Example #6
def generate_extra_pair_basis(basis,
                              X,
                              n_neighbors,
                              tree: AnnoyIndex,
                              distance='euclidean',
                              verbose=True):
    '''Generate pairs that connects the extra set of data to the fitted basis.
    '''
    npr, dimp = X.shape

    assert (
        basis is not None or tree is not None
    ), "If the annoyindex is not cached, the original dataset must be provided."

    # Build the tree again if not cached
    if tree is None:
        n, dim = basis.shape
        assert dimp == dim, "The dimension of the original dataset is different from the new one's."
        tree = AnnoyIndex(dim, metric=distance)
        if _RANDOM_STATE is not None:
            tree.set_seed(_RANDOM_STATE)
        for i in range(n):
            tree.add_item(i, basis[i, :])
        tree.build(20)
    else:
        n = tree.get_n_items()

    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    nbrs = np.zeros((npr, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((npr, n_neighbors_extra), dtype=np.float32)

    for i in range(npr):
        nbrs[i, :], knn_distances[i, :] = tree.get_nns_by_vector(
            X[i, :], n_neighbors_extra, include_distances=True)

    print_verbose("Found nearest neighbor", verbose)
    # sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    # print_verbose("Calculated sigma", verbose)

    # Debug
    # print_verbose(f"Sigma is of the scale of {sig.shape}", verbose)
    # print_verbose(f"KNN dist is of shape scale of {knn_distances.shape}", verbose)
    # print_verbose(f"nbrs max: {nbrs.max()}", verbose)

    # scaling the distances is not possible since we don't always track the basis
    # scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)

    pair_neighbors = sample_neighbors_pair_basis(n, X, knn_distances, nbrs,
                                                 n_neighbors)
    return pair_neighbors
Example #7
def load_index(path_index: PathType,
               meta_d: Dict) \
        -> AnnoyIndex:
    """ We rely on ANNOY's usage of mmap to be fast loading
    (fast enough that we can load it on every single call)
    """
    n_dim = meta_d['n_dim']
    metric = meta_d['metric']
    u = AnnoyIndex(
        n_dim,
        metric=metric,
    )
    u.load(str(path_index))
    u.set_seed(SEED)
    return u
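Because Annoy memory-maps the index file, `load` is cheap enough to call on every request, which is what the docstring relies on. A small usage sketch; the path and metadata values are illustrative only:

# Hypothetical call site for load_index; file name and meta_d values are assumptions.
meta_d = {'n_dim': 64, 'metric': 'angular'}
index = load_index('items.ann', meta_d)
print(index.get_nns_by_item(0, 10))    # ten nearest neighbours of item 0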
Example #8
    def test_seeding(self):
        f = 10
        X = numpy.random.rand(1000, f)
        Y = numpy.random.rand(50, f)

        indexes = []
        for i in range(2):
            index = AnnoyIndex(f, 'angular')
            index.set_seed(42)
            for j in range(X.shape[0]):
                index.add_item(j, X[j])

            index.build(10)
            indexes.append(index)

        for k in range(Y.shape[0]):
            self.assertEqual(indexes[0].get_nns_by_vector(Y[k], 100),
                             indexes[1].get_nns_by_vector(Y[k], 100))
Example #9
    def test_seeding(self):
        f = 10
        X = numpy.random.rand(1000, f)
        Y = numpy.random.rand(50, f)

        indexes = []
        for i in range(2):
            index = AnnoyIndex(f)
            index.set_seed(42)
            for j in range(X.shape[0]):
                index.add_item(j, X[j])

            index.build(10)
            indexes.append(index)

        for k in range(Y.shape[0]):
            self.assertEqual(indexes[0].get_nns_by_vector(Y[k], 100),
                             indexes[1].get_nns_by_vector(Y[k], 100))
Example #10
def generate_pair(X,
                  n_neighbors,
                  n_MN,
                  n_FP,
                  distance='euclidean',
                  verbose=True):
    '''Generate pairs for the dataset.
    '''
    n, dim = X.shape
    # sample more neighbors than needed
    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)

    option = distance_to_option(distance=distance)

    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)

    for i in range(n):
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    print_verbose("Found nearest neighbor", verbose)
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    print_verbose("Calculated sigma", verbose)
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)
    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN, option)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE, option)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors,
                                               n_FP, _RANDOM_STATE)
    return pair_neighbors, pair_MN, pair_FP, tree
Example #11
def annoy_train(spark, dirname, rank, regParam, n_trees, random_seed):
    # Load model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # get item factors
    item_factors = model.itemFactors
    item_factors, annoy_index_map = convert_annoy_index(item_factors)

    # train annoy model
    tree = AnnoyIndex(rank, 'dot')
    for item in tqdm(item_factors.collect()):
        tree.add_item(item.annoy_id, item.features)
    tree.set_seed(random_seed)

    # build the tree
    # num of trees: higher n_trees gives higher precision
    tree.build(n_trees)

    # save annoy model and index map
    tree.save(f'{dirname}_{rank}_{regParam}_tree.ann')
    annoy_index_map.write.parquet(f'{dirname}_{rank}_{regParam}_annoy_index_map.parquet')
Example #12
def predict_similar_movies(
    review_vectors: pd.DataFrame,
    parameters: Dict
) -> pd.DataFrame:

    # Initialize the approximate nearest-neighbor index
    vector_size = review_vectors.iat[0, 1].size
    annoy_index = AnnoyIndex(vector_size, parameters["similarity_metrics"])
    annoy_index.set_seed(parameters["random_seed"])

    # TODO: proper logging
    print(f"review size: {len(review_vectors)}")

    # Build the index
    idx2movie = {}
    for i, row in enumerate(review_vectors.itertuples()):
        idx2movie[i] = row.movie_id
        annoy_index.add_item(i, row.vector)

        # TODO: proper logging
        if i % 10 == 0:
            print(f"Added item {i} to the index")
    annoy_index.build(parameters["n_tree"])

    # Predict the top-N similar movies
    similar_movies = {}
    for j in range(len(review_vectors)):
        # the movie itself is its own nearest neighbor, so skip the first result
        similar_movies[idx2movie[j]] = annoy_index.get_nns_by_item(j, parameters["predict_num"] + 1)[1:]

        if j % 10 == 0:
            print(f"{j}番目の推論完了")

    return pd.DataFrame({
        "movie_id": similar_movies.keys(),
        "similar_movie_ids": [[idx2movie[movie_id] for movie_id in movie_list] for movie_list in similar_movies.values()]
    })
Example #13
# 100%|############################| 402111/402111 [01:02<00:00, 6455.57it/s]
len(wv.vocab), len(wv[next(iter(wv.vocab))])
# (3000000, 300)
wv.vectors.shape
# (3000000, 300)


"""
>>> from annoy import AnnoyIndex
>>> num_words, num_dimensions = wv.vectors.shape  # <1>
>>> index = AnnoyIndex(num_dimensions)
"""
from annoy import AnnoyIndex
num_words, num_dimensions = wv.vectors.shape  # <1>
index = AnnoyIndex(num_dimensions)
index.set_seed(1983)

"""
>>> from tqdm import tqdm  # <1>
>>> for i, word in enumerate(tqdm(wv.index2word)):  # <2>
...     index.add_item(i, wv[word])
22%|#######▏                   | 649297/3000000 [00:26<01:35, 24587.52it/s]

<1> `tqdm()` takes an iterable and returns an iterable (like `enumerate()`) and inserts code in your loop to display a progress bar
<2> `.index2word` is an unsorted list of all 3M tokens in your vocabulary, equivalent to a map of the integer indexes (0-2999999) to tokens ('</s>' to 'snowcapped_Caucasus').

"""
from tqdm import tqdm
for i, word in enumerate(tqdm(wv.index2word)):
    index.add_item(i, wv[word])
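The excerpt stops after items are added; a minimal continuation, assuming the same `index` object (tree count and file name are illustrative, not from the original listing):

index.build(30)                     # more trees -> better recall, slower build and larger file
index.save('word2vec_index.ann')    # persist so the index can be mmap-loaded later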
Example #14
 def test_seed(self):
     i = AnnoyIndex(10, 'angular')
     i.load('test/test.tree')
     i.set_seed(42)
Example #15
class AnnoyDictionary(object):
    def __init__(self,
                 dict_size,
                 key_width,
                 new_value_shift_coefficient=0.1,
                 batch_size=100,
                 key_error_threshold=0.01):
        self.max_size = dict_size
        self.curr_size = 0
        self.new_value_shift_coefficient = new_value_shift_coefficient

        self.index = AnnoyIndex(key_width, metric='euclidean')
        self.index.set_seed(1)

        self.embeddings = np.zeros((dict_size, key_width))
        self.values = np.zeros(dict_size)

        self.lru_timestamps = np.zeros(dict_size)
        self.current_timestamp = 0.0

        # keys that are in this distance will be considered as the same key
        self.key_error_threshold = key_error_threshold

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.key_dimension = key_width
        self.value_dimension = 1
        self._reset_buffer()

        self.built_capacity = 0

    def add(self, keys, values):
        # Adds new embeddings and values to the dictionary
        indices = []
        indices_to_remove = []
        for i in range(keys.shape[0]):
            index = self._lookup_key_index(keys[i])
            if index:
                # update existing value
                self.values[index] += self.new_value_shift_coefficient * (
                    values[i] - self.values[index])
                self.lru_timestamps[index] = self.current_timestamp
                indices_to_remove.append(i)
            else:
                # add new
                if self.curr_size >= self.max_size:
                    # find the LRU entry
                    index = np.argmin(self.lru_timestamps)
                else:
                    index = self.curr_size
                    self.curr_size += 1
                self.lru_timestamps[index] = self.current_timestamp
                indices.append(index)

        for i in reversed(indices_to_remove):
            keys = np.delete(keys, i, 0)
            values = np.delete(values, i, 0)

        self.buffered_keys = np.vstack((self.buffered_keys, keys))
        self.buffered_values = np.vstack((self.buffered_values, values))
        self.buffered_indices = self.buffered_indices + indices

        if len(self.buffered_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       int(self.curr_size * 0.02))
            self._rebuild_index()

        self.current_timestamp += 1

    # Returns the stored embeddings and values of the closest embeddings
    def query(self, keys, k):
        if not self.has_enough_entries(k):
            # this will only happen when the DND is not yet populated with enough entries, which is only during heatup
            # these values won't be used and therefore they are meaningless
            return [0.0], [0.0], [0]

        _, indices = self._get_k_nearest_neighbors_indices(keys, k)

        embeddings = []
        values = []
        for ind in indices:
            self.lru_timestamps[ind] = self.current_timestamp
            embeddings.append(self.embeddings[ind])
            values.append(self.values[ind])

        self.current_timestamp += 1

        return embeddings, values, indices

    def has_enough_entries(self, k):
        return self.curr_size > k and (self.built_capacity > k)

    def _get_k_nearest_neighbors_indices(self, keys, k):
        distances = []
        indices = []
        for key in keys:
            index, distance = self.index.get_nns_by_vector(
                key, k, include_distances=True)
            distances.append(distance)
            indices.append(index)
        return distances, indices

    def _rebuild_index(self):
        self.index.unbuild()
        self.embeddings[self.buffered_indices] = self.buffered_keys
        self.values[self.buffered_indices] = np.squeeze(self.buffered_values)
        for idx, key in zip(self.buffered_indices, self.buffered_keys):
            self.index.add_item(idx, key)

        self._reset_buffer()

        self.index.build(50)
        self.built_capacity = self.curr_size

    def _reset_buffer(self):
        self.buffered_keys = np.zeros((0, self.key_dimension))
        self.buffered_values = np.zeros((0, self.value_dimension))
        self.buffered_indices = []

    def _lookup_key_index(self, key):
        distance, index = self._get_k_nearest_neighbors_indices([key], 1)
        if distance != [[]] and distance[0][0] <= self.key_error_threshold:
            return index
        return None
Example #16
class Annoy(KNNIndex):
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "manhattan",
        "hamming",
        "dot",
        "l1",
        "l2",
        "taxicab",
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__data = None

    def build(self, data, k):
        from annoy import AnnoyIndex

        N = data.shape[0]

        annoy_metric = self.metric
        annoy_aliases = {
            "cosine": "angular",
            "l1": "manhattan",
            "l2": "euclidean",
            "taxicab": "manhattan",
        }
        if annoy_metric in annoy_aliases:
            annoy_metric = annoy_aliases[annoy_metric]

        self.index = AnnoyIndex(data.shape[1], annoy_metric)

        if self.random_state:
            self.index.set_seed(self.random_state)

        for i in range(N):
            self.index.add_item(i, data[i])

        # Number of trees. FIt-SNE uses 50 by default.
        self.index.build(50)

        # Return the nearest neighbors in the training set
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        def getnns(i):
            # Annoy returns the query point itself as the first element
            indices_i, distances_i = self.index.get_nns_by_item(
                i, k + 1, include_distances=True
            )
            indices[i] = indices_i[1:]
            distances[i] = distances_i[1:]

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N)
            )

        return indices, distances

    def query(self, query, k):
        N = query.shape[0]
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        def getnns(i):
            # query points are not stored in the index, so there is no self-match to strip
            indices[i], distances[i] = self.index.get_nns_by_vector(
                query[i], k, include_distances=True
            )

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N)
            )

        return indices, distances
Example #17
    log.debug(f'df_click shape: {df_click.shape}')
    log.debug(f'{df_click.head()}')
    # Get the embedding vector for each article id
    article_vec_map = word2vec(df_click, 'user_id', 'click_article_id',
                               model_path)
    f = open(w2v_file, 'wb')
    # Save the article vectors to a file
    pickle.dump(article_vec_map, f)
    f.close()

    # In short: build a tree-structured index over the loaded vectors so that
    # nearest-neighbor lookups are fast, at the cost of some approximation accuracy.
    # Build an index over the embeddings
    article_index = AnnoyIndex(
        256, 'angular')  # metric='angular' uses angular (cosine) distance
    article_index.set_seed(2020)
    # Load the article_id -> vector mapping and add each item to the Annoy index
    for article_id, emb in tqdm(article_vec_map.items()):
        article_index.add_item(article_id, emb)
    # Build 100 trees; more trees give higher precision if memory allows
    article_index.build(100)

    user_item_ = df_click.groupby('user_id')['click_article_id'].agg(
        lambda x: list(x)).reset_index()
    user_item_dict = dict(
        zip(user_item_['user_id'], user_item_['click_article_id']))

    # Recall (candidate generation)
    n_split = max_threads
    all_users = df_query['user_id'].unique()
    shuffle(all_users)
Example #18
class Annoy_Dict(LRU_KNN_ANNOY):
    def __init__(self, config):
        super(Annoy_Dict, self).__init__(config)
        self.config = config
        self.key_dim = self.config.knn_key_dim

        self.index = AnnoyIndex(self.key_dim, metric='euclidean')
        self.index.set_seed(123)

        self.initial_update_size = self.config.knn_dict_update_step
        self.min_update_size = self.initial_update_size

        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []

        self.build_capacity = 0

    def _nn(self, keys, k):
        assert np.ndim(keys) == 2
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key,
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, terminal, keys_next, indices):
        self.cached_embs = self.cached_embs + keys
        self.cached_vals = self.cached_vals + values
        self.cached_terminals = self.cached_terminals + terminal
        self.cached_embs_next = self.cached_embs_next + keys_next
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            # self.min_update_size = max(self.initial_update_size, self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_emb = self.cached_embs[i]
            new_val = self.cached_vals[i]
            new_t = self.cached_terminals[i]
            new_emb_next = self.cached_embs_next[i]
            self.embs[ind] = new_emb
            self.values[ind] = new_val
            self.terminal[ind] = new_t
            self.embs_next[ind] = new_emb_next
            self.index.add_item(ind, new_emb)
        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []

        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _rebuild(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embs[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def queryable(self, k):
        return (LRU_KNN_ANNOY.queryable(self, k) and (self.build_capacity > k))

    @property
    def capacity_(self):
        # print("self.index.get_n_items: ", self.index.get_n_items())
        return self.index.get_n_items()
Example #19
def get_knn_graph(X,
                  k=5,
                  dist_metric='euclidean',
                  approx=False,
                  return_edges=True,
                  random_seed=0):
    '''
    Build k-nearest-neighbor graph
    Return edge list and nearest neighbor matrix
    '''

    t0 = time.time()
    if approx:
        try:
            from annoy import AnnoyIndex
        except ImportError:
            approx = False
            print(
                'Could not find library "annoy" for approx. nearest neighbor search'
            )
    if approx:
        #print('Using approximate nearest neighbor search')

        if dist_metric == 'cosine':
            dist_metric = 'angular'
        npc = X.shape[1]
        ncell = X.shape[0]
        annoy_index = AnnoyIndex(npc, metric=dist_metric)
        annoy_index.set_seed(random_seed)

        for i in range(ncell):
            annoy_index.add_item(i, list(X[i, :]))
        annoy_index.build(10)  # 10 trees

        knn = []
        for iCell in range(ncell):
            knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:])
        knn = np.array(knn, dtype=int)

    else:
        #print('Using sklearn NearestNeighbors')

        if dist_metric == 'cosine':
            nbrs = NearestNeighbors(n_neighbors=k,
                                    metric=dist_metric,
                                    algorithm='brute').fit(X)
        else:
            nbrs = NearestNeighbors(n_neighbors=k, metric=dist_metric).fit(X)
        knn = nbrs.kneighbors(return_distance=False)

    if return_edges:
        links = set([])
        for i in range(knn.shape[0]):
            for j in knn[i, :]:
                links.add(tuple(sorted((i, j))))

        t_elapse = time.time() - t0
        #print('kNN graph built in %.3f sec' %(t_elapse))

        return links, knn
    return knn
Example #20
class alpha_KNN:
    def __init__(self,
                 capacity,
                 key_dimension,
                 delta=0.001,
                 alpha=0.1,
                 batch_size=1000):
        self.capacity = capacity
        self.curr_capacity = 0
        self.delta = delta
        self.alpha = alpha

        self.embeddings = np.zeros((capacity, key_dimension))
        self.values = np.zeros(capacity)

        self.weights = np.zeros(capacity)

        from annoy import AnnoyIndex
        self.index = AnnoyIndex(key_dimension, metric='euclidean')
        self.index.set_seed(123)

        self.min_update_size = batch_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key + [1.0],
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            self._rebuild_index()

    def _rebuild_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            new_weight = self.weights[ind]  # weight was already set (to 1.0) by add()
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            self.index.add_item(ind, new_key + [new_weight])

        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return (self.built_capacity > k)

    # Returns the stored embeddings and values of the closest embeddings
    def query(self, keys, k):
        _, indices = self._nn(keys, k)

        embs = []
        values = []
        weights = []
        for ind in indices:
            embs.append(self.embeddings[ind])
            values.append(self.values[ind])
            weights.append(self.weights[ind])

        return embs, values, weights

    # Adds new embeddings (and values) to the dictionary
    def add(self, keys, values):

        if self.queryable(5):
            dists, inds = self._nn(keys, k=5)
            for ind, dist in enumerate(dists):
                for i, d in enumerate(dist):
                    index = inds[ind][i]
                    self.weights[index] *= (1 - self.alpha)

        indices, keys_, values_ = [], [], []
        for i, _ in enumerate(keys):
            if self.curr_capacity >= self.capacity:
                # find the LRU entry
                index = np.argmin(self.weights)
            else:
                index = self.curr_capacity
                self.curr_capacity += 1
            self.weights[index] = 1.0
            indices.append(index)
            keys_.append(keys[i])
            values_.append(values[i])

        self._insert(keys_, values_, indices)
Example #21
class annoy_dict(LRU_KNN):
    def __init__(self,
                 capacity,
                 key_dimension,
                 delta=0.001,
                 alpha=0.1,
                 batch_size=100):
        LRU_KNN.__init__(self, capacity, key_dimension, delta, alpha)

        from annoy import AnnoyIndex
        self.index = AnnoyIndex(key_dimension, metric='euclidean')
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key,
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            self.index.add_item(ind, new_key)

        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embeddings[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return (LRU_KNN.queryable(self, k) and (self.built_capacity > k))
Example #22
class LRU_KNN:
    def __init__(self, capacity, key_dim, value_dim, batch_size):
        self.capacity = capacity
        self.curr_capacity = 0

        self.states = np.zeros((capacity, key_dim))
        self.values = np.zeros((capacity, value_dim))
        self.lru = np.zeros(capacity)
        self.tm = 0.0

        self.index = AnnoyIndex(key_dim, metric="euclidean")
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key,
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def query(self, keys, k):
        _, indices = self.nn(keys, k)
        states = []
        values = []

        for ind in indices:
            self.lru[ind] = self.tm
            states.append(self.states[ind])
            values.append(self.values[ind])
        self.tm += 0.001
        return states, values

    def _insert(self, keys, values, indices):
        self.cached_states = self.cached_states + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_states) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_state = self.cached_states[i]
            new_value = self.cached_values[i]

            self.states[ind] = new_state
            self.values[ind] = new_value
            self.index.add_item(ind, new_state)

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.built_capacity = self.curr_capacity
Example #23
#!/usr/bin/env python
# encoding: utf-8
from annoy import AnnoyIndex

test_dims = [64]
for dim in test_dims:
    a = AnnoyIndex(dim, 'angular')
    d = AnnoyIndex(dim, 'dot')
    e = AnnoyIndex(dim, 'euclidean')
    a.set_seed(123)
    d.set_seed(123)
    e.set_seed(123)
    vectors = open('item_vector.txt').readlines()
    for index, vector in enumerate(vectors):
        v = [float(x) for x in vector.split(',')]
        a.add_item(index, v)
        d.add_item(index, v)
        e.add_item(index, v)
    a.build(3)
    a.save('points.angular.annoy.{}'.format(dim))
    d.build(3)
    d.save('points.dot.annoy.{}'.format(dim))
    e.build(3)
    e.save('points.euclidean.annoy.{}'.format(dim))
Example #24
File: test.py  Project: ijklr/annoy
import numpy as np
from annoy import AnnoyIndex

X = np.random.rand(100000, 60)
Y = np.random.rand(500, 60)

annoy1 = AnnoyIndex(60)
annoy1.set_seed(100)
for i in range(X.shape[0]):
    annoy1.add_item(i, X[i, :])

annoy1.build(10)

annoy2 = AnnoyIndex(60)
annoy2.set_seed(100)
for j in range(X.shape[0]):
    annoy2.add_item(j, X[j, :])

annoy2.build(10)

result1 = []
result2 = []

for k in range(Y.shape[0]):
    print "annoy1", annoy1.get_nns_by_vector(Y[k, :], 3)
    result1 += annoy1.get_nns_by_vector(Y[k, :], 3)
    print "annoy2", annoy2.get_nns_by_vector(Y[k, :], 3)
    result2 += annoy2.get_nns_by_vector(Y[k, :], 3)

Example #25
class Memory:
    def __init__(self, capacity, state_dim, value_dim):
        self.capacity = capacity
        print("state_dim:", state_dim)
        self.states = np.zeros((capacity, state_dim))
        self.values = np.zeros((capacity, value_dim))

        self.curr_capacity = 0
        self.curr_ = 0
        self.lru = np.zeros(capacity)
        self.tm = 0

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index = AnnoyIndex(state_dim)
        self.index.set_seed(123)
        self.update_size = 1
        self.build_capacity = 0

    def sample_knn_test(self, state, k):
        inds, dists = self.index.get_nns_by_vector(state,
                                                   k,
                                                   include_distances=True)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample_knn(self, states, k):
        dists = []
        inds = []
        for state in states:
            ind, dist = self.index.get_nns_by_vector(state,
                                                     k,
                                                     include_distances=True)
            inds.append(ind)
            dists.append(dist)
        # inds = np.reshape(np.array(inds), -1)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample(self, n_samples):
        if self.curr_capacity < n_samples or n_samples == 0:
            idx = np.random.choice(np.arange(len(self.states)),
                                   n_samples,
                                   replace=False)
        else:
            idx = np.random.choice(np.arange(self.curr_capacity),
                                   n_samples,
                                   replace=False)
        self.tm += 0.01
        self.lru[idx] = self.tm
        embs = self.states[idx]
        values = self.values[idx]

        return embs, values

    def add_knn(self, states, values):
        self._add_knn(states, values)

    def add_knn_lru(self, states, values):
        self._add_knn(states, values, lru=True)

    def add(self, states, values):
        self._add(states, values)

    def add_lru(self, states, values):
        self._add(states, values, lru=True)

    def add_rand(self, states, values):
        self._add(states, values, rand=True)

    def _insert(self, states, values, indices):
        self.cached_states = self.cached_states + states
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_states) >= self.update_size:
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            self.states[ind] = self.cached_states[i]
            self.values[ind] = self.cached_values[i]
            self.index.add_item(ind, self.cached_states[i])

        self.index.build(50)
        self.build_capacity = self.curr_capacity

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _add_knn(self, states, values, lru=False):
        # print(states)
        indices = []
        states_ = []
        values_ = []
        for i, _ in enumerate(states):
            if lru:
                if self.curr_capacity >= self.capacity:
                    ind = np.argmin(self.lru)
                else:

                    ind = self.curr_capacity
                    self.curr_capacity += 1
            else:
                if self.curr_capacity >= self.capacity:
                    self.curr_ = (self.curr_ + 1) % self.capacity
                    ind = self.curr_
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1

            self.lru[ind] = self.tm
            indices.append(ind)
            states_.append(states[i])
            values_.append(values[i])
        self._insert(states_, values_, indices)

    def _add(self, states, values, rand=False, lru=False):
        for i, state in enumerate(states):
            if self.curr_capacity < self.capacity:
                self.curr_ = (self.curr_ + 1) % self.capacity
                # self.states[self.curr_] = state
                # self.values[self.curr_] = values[i]
                if self.curr_capacity < self.capacity:
                    self.curr_capacity += 1
            else:
                if lru:
                    self.curr_ = np.argmin(self.lru)
                if rand:
                    self.curr_ = np.random.choice(np.arange(
                        self.curr_capacity),
                                                  1,
                                                  replace=False)

                if not lru and not rand:
                    self.curr_ = (self.curr_ + 1) % self.capacity
            self.states[self.curr_] = state
            self.values[self.curr_] = values[i]

    @property
    def length(self):
        # assert self.index.get_n_items() == self.curr_capacity
        # return self.curr_capacity
        return self.index.get_n_items()
Example #26
 def test_seed(self):
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     i.set_seed(42)
Example #27
def generate_triplets_from_ANN(model, sequences, entity2unique, entity2same, unique_text, test):
    predictions = model.predict(sequences)
    t = AnnoyIndex(len(predictions[0]), metric='euclidean')  # Length of item vector that will be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        # print(predictions[i])
        v = predictions[i]
        t.add_item(i, v)

    t.build(100) # 100 trees

    match = 0
    no_match = 0
    ann_accuracy = 0
    total = 0

    triplets = {}

    pos_distances = []
    neg_distances = []

    triplets['anchor'] = []
    triplets['positive'] = []
    triplets['negative'] = []

    if test:
        NNlen = TEST_NEIGHBOR_LEN
    else:
        NNlen = TRAIN_NEIGHBOR_LEN

    for key in entity2same:
        index = entity2unique[key]
        nearest = t.get_nns_by_vector(predictions[index], NNlen)
        nearest_text = set([unique_text[i] for i in nearest])
        expected_text = set(entity2same[key])
        # annoy has this annoying habit of returning the queried item back as a nearest neighbor.  Remove it.
        if key in nearest_text:
            nearest_text.remove(key)
        # print("query={} names = {} true_match = {}".format(unique_text[index], nearest_text, expected_text))
        overlap = expected_text.intersection(nearest_text)
        # collect up some statistics on how well we did on the match
        m = len(overlap)
        match += m
        # since we asked for only x nearest neighbors, and we get at most x-1 neighbors that are not the same as key (!)
        # make sure we adjust our estimate of no match appropriately
        no_match += min(len(expected_text), NNlen - 1) - m

        # sample only the negatives that are true negatives,
        # i.e. those not in the expected set ('semi-hard' negative sampling is not done here)
        # positives = expected_text - nearest_text
        positives = overlap
        negatives = nearest_text - expected_text

        # print(key + str(expected_text) + str(nearest_text))
        for i in negatives:
            for j in positives:
                dist_pos = t.get_distance(index, entity2unique[j])
                pos_distances.append(dist_pos)
                dist_neg = t.get_distance(index, entity2unique[i])
                neg_distances.append(dist_neg)
                if dist_pos < dist_neg:
                    ann_accuracy += 1
                total += 1
                # print(key + "|" +  j + "|" + i)
                # print(dist_pos)
                # print(dist_neg)               

        for i in negatives:
            for j in expected_text:
                triplets['anchor'].append(key)
                triplets['positive'].append(j)
                triplets['negative'].append(i)

    print("mean positive distance:" + str(statistics.mean(pos_distances)))
    print("stdev positive distance:" + str(statistics.stdev(pos_distances)))
    print("max positive distance:" + str(max(pos_distances)))
    print("mean neg distance:" + str(statistics.mean(neg_distances)))
    print("stdev neg distance:" + str(statistics.stdev(neg_distances)))
    print("max neg distance:" + str(max(neg_distances)))
    print("Accuracy in the ANN for triplets that obey the distance func:" + str(ann_accuracy / total))

    obj = {}
    obj['accuracy'] = ann_accuracy / total
    obj['steps'] = 1
    with open(output_file_name_for_hpo, 'w') as out:
        json.dump(obj, out)

    if test:
        return match/(match + no_match)
    else:
        return triplets, match/(match + no_match)
Example #28
class Annoy(KNNIndex):
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "manhattan",
        "hamming",
        "dot",
        "l1",
        "l2",
        "taxicab",
    ]

    def build(self, data, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using Annoy approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from annoy import AnnoyIndex

        N = data.shape[0]

        annoy_metric = self.metric
        annoy_aliases = {
            "cosine": "angular",
            "l1": "manhattan",
            "l2": "euclidean",
            "taxicab": "manhattan",
        }
        if annoy_metric in annoy_aliases:
            annoy_metric = annoy_aliases[annoy_metric]

        self.index = AnnoyIndex(data.shape[1], annoy_metric)

        if self.random_state:
            self.index.set_seed(self.random_state)

        for i in range(N):
            self.index.add_item(i, data[i])

        # Number of trees. FIt-SNE uses 50 by default.
        self.index.build(50)

        # Return the nearest neighbors in the training set
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        def getnns(i):
            # Annoy returns the query point itself as the first element
            indices_i, distances_i = self.index.get_nns_by_item(
                i, k + 1, include_distances=True)
            indices[i] = indices_i[1:]
            distances[i] = distances_i[1:]

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs,
                     require="sharedmem")(delayed(getnns)(i) for i in range(N))

        timer.__exit__()

        return indices, distances

    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using Annoy "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        N = query.shape[0]
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        def getnns(i):
            indices[i], distances[i] = self.index.get_nns_by_vector(
                query[i], k, include_distances=True)

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs,
                     require="sharedmem")(delayed(getnns)(i) for i in range(N))

        timer.__exit__()

        return indices, distances