コード例 #1
0
def test_update_w_prepare_query_accuracy(nn_data, metric):
    """Check query accuracy after updating an already prepared index.

    The index is built on rows 200-799 of ``nn_data`` and prepared, then
    extended with rows 800 onwards via ``update`` and prepared again.  The
    first 200 rows are used as queries, and the returned neighbors are
    compared against an exact ``NearestNeighbors`` search over rows 200
    onwards; at least 95% of the true neighbors must be recovered.
    """
    k = 10  # number of neighbors requested per query point

    nnd = NNDescent(
        nn_data[200:800],
        metric=metric,
        n_neighbors=10,
        random_state=None,
        compressed=False,
    )
    nnd.prepare()

    nnd.update(xs_fresh=nn_data[800:])
    nnd.prepare()

    knn_indices, _ = nnd.query(nn_data[:200], k=k, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
    true_indices = true_nnd.kneighbors(nn_data[:200],
                                       k,
                                       return_distance=False)

    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    num_correct = 0.0
    for expected, found in zip(true_indices, knn_indices):
        num_correct += np.sum(np.isin(expected, found))

    percent_correct = num_correct / (true_indices.shape[0] * k)
    assert percent_correct >= 0.95, ("NN-descent query did not get 95% "
                                     "accuracy on nearest neighbors")
コード例 #2
0
def test_tree_no_split(small_data, sparse_small_data, metric):
    """Check query accuracy when ``leaf_size`` exceeds the data size.

    Runs once on the dense and once on the sparse fixture.  Each data set
    is split in half: the second half is indexed and the first half is
    used as queries.  ``leaf_size`` is set to one more than the total
    number of instances (presumably so the initialisation tree never has
    to split).  At least 95% of the exact nearest neighbors must be
    recovered.
    """
    k = 10
    datasets = (("dense", small_data), ("sparse", sparse_small_data))
    for data_type, data in datasets:
        n_instances = data.shape[0]
        leaf_size = n_instances + 1  # just to be safe
        half = n_instances // 2
        data_train = data[half:]
        data_test = data[:half]

        nnd = NNDescent(
            data_train,
            metric=metric,
            n_neighbors=data_train.shape[0] - 1,
            random_state=None,
            tree_init=True,
            leaf_size=leaf_size,
        )
        nnd.prepare()
        knn_indices, _ = nnd.query(data_test, k=k, epsilon=0.2)

        true_nnd = NearestNeighbors(metric=metric).fit(data_train)
        true_indices = true_nnd.kneighbors(data_test, k, return_distance=False)

        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
        num_correct = 0.0
        for expected, found in zip(true_indices, knn_indices):
            num_correct += np.sum(np.isin(expected, found))

        percent_correct = num_correct / (true_indices.shape[0] * k)
        assert (
            percent_correct >= 0.95
        ), "NN-descent query did not get 95% for accuracy on nearest neighbors on {} data".format(
            data_type)
コード例 #3
0
 def create(
     cls,
     index_vectors: np.ndarray,
     metric: Metric = DEFAULT_METRIC,
     epsilon: float = DEFAULT_EPSILON,
     neighbors: int = DEFAULT_NEIGHBORS,
     diversify_probability: float = DEFAULT_DIVERSIFY_PROBABILITY,
     pruning_degree_multiplier: float = DEFAULT_PRUNING_DEGREE_MULTIPLIER,
 ) -> "Descent":
     """Build an NNDescent index over ``index_vectors`` and wrap it.

     Args:
         index_vectors: Data handed to ``NNDescent`` as ``data``.
         metric: Key into ``_METRIC_NAMES``; the mapped value is passed to
             ``NNDescent`` as ``metric``.
         epsilon: Stored on the returned ``Descent`` (not used to build
             the index).
         neighbors: Forwarded to ``NNDescent`` as ``n_neighbors``.
         diversify_probability: Forwarded as ``diversify_prob``.
         pruning_degree_multiplier: Forwarded under the same name.

     Returns:
         A ``Descent`` holding the index, on which ``prepare()`` has
         already been called, together with ``epsilon``.
     """
     metric_name = _METRIC_NAMES[metric]
     nn_index = NNDescent(
         data=index_vectors,
         metric=metric_name,
         n_neighbors=neighbors,
         diversify_prob=diversify_probability,
         pruning_degree_multiplier=pruning_degree_multiplier,
     )
     # prepare() is called up front, before the index is handed over.
     nn_index.prepare()
     return Descent(index=nn_index, epsilon=epsilon)
コード例 #4
0
 def test_generate_triplets(self):
     """Check triplet ordering and the number of generated triplets.

     For every generated triplet, the squared distance between columns 0
     and 1, scaled by the product of the two points' ``sig`` values, must
     be smaller than the equally scaled distance between columns 0 and 2.
     The triplet array must have
     ``n_points * n_inliers * n_outliers + n_points * n_random`` rows and
     3 columns.
     """
     n_points = 1000
     n_inliers = 10
     n_outliers = 5
     n_random = 3
     n_extra = min(n_inliers + 50, n_points)
     key = random.PRNGKey(42)
     # Currently testing it only for 'euclidean' distance. The test for other
     # cases breaks due to issues with the knn search NNDescent package, but
     # it works fine when tested in a colab.
     for distance in ['euclidean']:
         inputs = np.random.normal(size=(n_points, 100))

         index = NNDescent(inputs, metric=distance)
         index.prepare()
         found = index.query(inputs, n_extra)[0]
         # Prepend each point's own index as the first neighbor column.
         self_column = np.arange(n_points).reshape([-1, 1])
         neighbors = np.concatenate((self_column, found), 1)

         distance_fn = trimap.get_distance_fn(distance)
         _, _, sig = trimap.find_scaled_neighbors(inputs, neighbors,
                                                  distance_fn)
         triplets, _ = trimap.generate_triplets(key,
                                                inputs,
                                                n_inliers=n_inliers,
                                                n_outliers=n_outliers,
                                                n_random=n_random,
                                                distance=distance)

         # Column 1 is compared as the similar point, column 2 as the
         # outlier; both are scaled by the sig values of the pair.
         scaled = []
         for column in (1, 2):
             pair_distances = distance_fn(inputs[triplets[:, 0]],
                                          inputs[triplets[:, column]])**2
             pair_distances /= (sig[triplets[:, 0]] *
                                sig[triplets[:, column]])
             scaled.append(pair_distances)
         similar_pairs_distances, outlier_pairs_distances = scaled

         npt.assert_array_less(similar_pairs_distances,
                               outlier_pairs_distances)

     n_rows = inputs.shape[0]
     n_knn_triplets = n_rows * n_inliers * n_outliers
     n_random_triplets = n_rows * n_random
     npt.assert_equal(triplets.shape,
                      [n_knn_triplets + n_random_triplets, 3])
コード例 #5
0
def test_one_dimensional_data(nn_data, metric):
    """Check query accuracy when the data has a single feature column.

    Only the first column of ``nn_data`` is used.  Rows 200 onwards are
    indexed (with ``tree_init=False``) and the first 200 rows are used as
    queries.  At least 95% of the exact nearest neighbors reported by
    ``NearestNeighbors`` must be recovered.
    """
    k = 10  # number of neighbors requested per query point
    train = nn_data[200:, :1]
    queries = nn_data[:200, :1]

    nnd = NNDescent(
        train,
        metric=metric,
        n_neighbors=20,
        random_state=None,
        tree_init=False,
    )
    nnd.prepare()

    knn_indices, _ = nnd.query(queries, k=k, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(train)
    true_indices = true_nnd.kneighbors(queries, k, return_distance=False)

    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    num_correct = 0.0
    for expected, found in zip(true_indices, knn_indices):
        num_correct += np.sum(np.isin(expected, found))

    percent_correct = num_correct / (true_indices.shape[0] * k)
    assert percent_correct >= 0.95, ("NN-descent query did not get 95% "
                                     "accuracy on nearest neighbors")
コード例 #6
0
                                                k,
                                                return_distance=False)
        p_correct = evaluate_predictions(neighbors_expected, neighbors, k)
        assert p_correct >= 0.95, ("NN-descent query did not get 95% "
                                   "accuracy on nearest neighbors")

    k = 10
    xs_orig, xs_fresh, xs_updated, indices_updated = update_data[case]
    queries1 = xs_orig

    # original
    index = NNDescent(xs_orig,
                      metric=metric,
                      n_neighbors=40,
                      random_state=1234)
    index.prepare()
    evaluate(index, xs_orig, queries1)
    # updated
    index.update(xs_fresh=xs_fresh,
                 xs_updated=xs_updated,
                 updated_indices=indices_updated)
    if xs_fresh is not None:
        xs = np.vstack((xs_orig, xs_fresh))
        queries2 = np.vstack((queries1, xs_fresh))
    else:
        xs = xs_orig
        queries2 = queries1
    if indices_updated is not None:
        xs[indices_updated] = xs_updated
    evaluate(index, xs, queries2)
    if indices_updated is not None:
コード例 #7
0
ファイル: core.py プロジェクト: kundajelab/tfmodisco
    def __call__(self, fwd_vecs, rev_vecs, initclusters, fwd_vecs2=None):
        """Find approximate cosine nearest neighbors using pynndescent.

        An NNDescent index is built over ``fwd_vecs2`` (or over
        ``fwd_vecs`` when ``fwd_vecs2`` is None) and queried with
        ``fwd_vecs`` and, if given, ``rev_vecs``.  When both are queried,
        the two result sets are merged per row, keeping the
        ``self.n_neighbors`` distinct neighbors with the smallest
        distances.

        Args:
            fwd_vecs: Forward vectors; normalised with
                ``magnitude_norm_sparsemat`` before use.
            rev_vecs: Reverse vectors, or None to search with the forward
                vectors only.
            initclusters: Must be None; not supported by this
                implementation.
            fwd_vecs2: Optional vectors to index instead of ``fwd_vecs``,
                for when more than self-similarities are wanted.

        Returns:
            ``(sims, neighbors)``.  Similarities are ``1 - distance``
            because pynndescent returns cosine distance.  When
            ``rev_vecs`` is None both are the arrays derived directly from
            the query; otherwise both are lists with one array per row.
        """

        from pynndescent import NNDescent

        assert initclusters is None, (
            "Currently I haven't built support" +
            " for initclusters; use SparseNumpyCosineSimFromFwdAndRevOneDVecs"
            + " instead")

        def log(message):
            # Timestamped progress output, only when verbose is set.
            if (self.verbose):
                print(datetime.now(), message)
                sys.stdout.flush()

        #normalize the vectors
        fwd_vecs = magnitude_norm_sparsemat(sparse_mat=fwd_vecs)
        if (rev_vecs is not None):
            rev_vecs = magnitude_norm_sparsemat(sparse_mat=rev_vecs)

        #fwd_vecs2 is used when you don't just want to compute
        # self-similarities
        if (fwd_vecs2 is None):
            fwd_vecs2 = fwd_vecs
        else:
            fwd_vecs2 = magnitude_norm_sparsemat(sparse_mat=fwd_vecs2)

        #build the index
        log("Building the index")
        index = NNDescent(fwd_vecs2, metric="cosine")

        log("Preparing the index")
        index.prepare()
        log("Index ready")

        log("Querying neighbors for fwd")
        fwd_neighbs, fwd_dists = index.query(fwd_vecs, k=self.n_neighbors)

        if (rev_vecs is None):
            #Need to subtract from 1 because pynndescent returns 1 - cosinesim
            return 1.0 - fwd_dists, fwd_neighbs

        log("Querying neighbors for rev")
        #The reverse search must use rev_vecs; querying fwd_vecs again
        # would only duplicate the forward results.
        rev_neighbs, rev_dists = index.query(rev_vecs, k=self.n_neighbors)

        log("Unifying fwd and rev")
        fwdrev_neighbs = np.concatenate([fwd_neighbs, rev_neighbs], axis=1)
        fwdrev_dists = np.concatenate([fwd_dists, rev_dists], axis=1)
        fwdrev_dists_argsort = np.argsort(fwdrev_dists, axis=1)

        #need to remove redundancy
        sims = []
        neighbors = []
        for row_neighbs, row_dists, row_order in zip(
                fwdrev_neighbs, fwdrev_dists, fwdrev_dists_argsort):
            sims_this_ex = []
            neighbors_this_ex = []
            neighbors_seen = set()
            #iterate in order of similarities in the fwd/rev sim search
            for j in row_order:
                neighbor = row_neighbs[j]
                #make sure it hasn't appeared before (this can happen if
                # a point is a neighbor according to both the fwd and
                # the rev search)
                if neighbor not in neighbors_seen:
                    neighbors_seen.add(neighbor)
                    neighbors_this_ex.append(neighbor)
                    #Need to subtract from 1 because pynndescent returns
                    # 1 - cosinesim
                    sims_this_ex.append(1 - row_dists[j])
                #leave once we have n_neighbors neighbors; since we
                # iterated over the distances in ascending order, these
                # should be the nearest neighbors
                if (len(sims_this_ex) == self.n_neighbors):
                    break
            assert len(neighbors_seen) == self.n_neighbors
            sims.append(np.array(sims_this_ex))
            #neighbors need to be converted to integers as they'll
            # be used later for indexing
            neighbors.append(np.array(neighbors_this_ex).astype("int"))

        return sims, neighbors
コード例 #8
0
    def compute_similarity_graph(self,
                                 X,
                                 knn=15,
                                 sigma=3.,
                                 zp_k=None,
                                 metric='euclidean',
                                 maxN=5000):
        """
        Computes a symmetrised k-nearest-neighbor similarity graph for X.

        Args:
            X: data with one row per point; only ``X.shape[0]`` is read
                here, the rest is handed to the neighbor search.
            knn: number of neighbors per point. If None, it is set to N
                when N < maxN and to 15 otherwise.
            sigma: divisor in the kernel ``exp(-d**2 / sigma)`` used for
                non-cosine metrics.
            zp_k: if not None and metric is 'euclidean', distances are
                rescaled Zelnik-Perona style using each point's distance
                in neighbor column ``zp_k``.
            metric: distance metric passed to the neighbor search.
                'cosine' yields weights ``1 - distance``; anything else
                uses the exponential kernel.
            maxN: for N < maxN scikit-learn's exact NearestNeighbors is
                used, otherwise pynndescent's approximate NNDescent.

        Returns:
            Sparse (N, N) matrix ``(W + W.T) / 2`` with the diagonal set
            to zero and explicit zeros removed.
        """
        N = X.shape[0]
        if knn is None:
            if N < maxN:
                knn = N
            else:
                print(
                    "Parameter knn was given None and N > maxN, so setting knn=15"
                )
                knn = 15
        if N < maxN:
            print("Calculating NN graph with SKLEARN NearestNeighbors...")

            # The metric must be passed through, otherwise the search is
            # always euclidean even when the weights below assume cosine
            # distance. Brute force is used for non-euclidean metrics
            # because ball_tree does not support all of them (e.g. cosine).
            if knn > N / 2 or metric != 'euclidean':
                algorithm = 'brute'
            else:
                algorithm = 'ball_tree'
            nn = NearestNeighbors(n_neighbors=knn,
                                  algorithm=algorithm,
                                  metric=metric).fit(X)

            # construct CSR matrix representation of the k-NN graph
            A_data, A_ind = nn.kneighbors(X, knn, return_distance=True)
        else:
            print(
                "Calculating NN graph with NNDescent package since N = {} >= {}"
                .format(N, maxN))
            from pynndescent import NNDescent
            index = NNDescent(X, metric=metric)
            index.prepare()
            A_ind, A_data = index.query(X, k=knn)

        # modify from the kneighbors_graph function from sklearn to
        # accommodate Zelnik-Perona scaling
        n_nonzero = N * knn
        A_indptr = np.arange(0, n_nonzero + 1, knn)
        if zp_k is not None and metric == 'euclidean':
            # Copy so that clamping the scale does not overwrite column
            # zp_k of the distances themselves (the slice is a view).
            k_dist = A_data[:, zp_k][:, np.newaxis].copy()
            k_dist[k_dist < 1e-4] = 1e-4
            A_data /= np.sqrt(k_dist * k_dist[A_ind, 0])

        A_data = np.ravel(A_data)
        if metric == 'cosine':
            print(np.max(A_data))
            # need to do 1.-A_data since the search returns cosine
            # DISTANCE (1. - cosine_similarity)
            W = sps.csr_matrix((1. - A_data, A_ind.ravel(), A_indptr),
                               shape=(N, N))
        else:
            W = sps.csr_matrix(
                (np.exp(-(A_data**2) / sigma), A_ind.ravel(), A_indptr),
                shape=(N, N))
        W = (W + W.T) / 2
        #W = max(W, W.T)
        W.setdiag(0)
        W.eliminate_zeros()

        return W