def testMarsKNN(self):
        """Check Mars ``NearestNeighbors`` against scikit-learn on a Mars cluster.

        A cluster is created through ``self.odps.create_mars_cluster``, a
        3-neighbour query is run with both implementations on the same random
        data, and the distances and indices are compared. The cluster is
        always stopped afterwards, even if an import or an assertion fails.
        """
        cluster_name = str(uuid.uuid4())
        client = self.odps.create_mars_cluster(
            1, 4, 8,
            name=cluster_name,
            scheduler_mem=12,
            scheduler_cpu=4,
        )

        try:
            import numpy as np
            import mars.tensor as mt
            from mars.learn.neighbors import NearestNeighbors
            from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors

            # Both arrays come from the same seeded generator, in this order.
            rng = np.random.RandomState(0)
            train_data = rng.rand(10, 5)
            query_data = rng.rand(8, 5)

            train_tensor = mt.tensor(train_data, chunk_size=7)
            query_tensor = mt.tensor(query_data, chunk_size=(5, 3))

            mars_model = NearestNeighbors(n_neighbors=3)
            mars_model.fit(train_tensor)
            pending = mars_model.kneighbors(query_tensor)

            sk_model = SkNearestNeighbors(n_neighbors=3)
            sk_model.fit(train_data)
            sk_distances, sk_indices = sk_model.kneighbors(query_data)

            # Fetch both outputs before comparing anything.
            distances, indices = [part.fetch() for part in pending]
            np.testing.assert_almost_equal(distances, sk_distances)
            np.testing.assert_almost_equal(indices, sk_indices)
        finally:
            client.stop_server()
Esempio n. 2
0
def test_faiss_query(setup, X, Y, metric):
    """Query a 'Flat' faiss index and compare with ``NearestNeighbors``.

    The reference model is fitted on ``x`` and queried with ``y``; these
    lowercase names are not defined in this function and are presumably
    module-level raw arrays backing ``X`` and ``Y`` (TODO confirm).
    A second, composite index is built on float64 data and only executed,
    without checking its results.
    """
    flat_index = build_faiss_index(
        X, 'Flat', None, metric=metric, random_state=0)
    dist_tensor, ind_tensor = faiss_query(flat_index, Y, 5, nprobe=10)
    distance, indices = fetch(*execute(dist_tensor, ind_tensor))

    reference = NearestNeighbors(metric=metric)
    reference.fit(x)
    expected_distance, expected_indices = reference.kneighbors(y, 5)

    np.testing.assert_array_equal(indices, expected_indices.fetch())
    np.testing.assert_almost_equal(
        distance, expected_distance.fetch(), decimal=4)

    # Composite index on float64 inputs: execution must succeed, results
    # are not compared against anything.
    X_f64 = X.astype(np.float64)
    y_f64 = y.astype(np.float64)
    composite_index = build_faiss_index(
        X_f64,
        'PCAR6,IVF8_HNSW32,SQ8',
        10,
        random_state=0,
        return_index_type='object',
    )
    dist_tensor, ind_tensor = faiss_query(
        composite_index, y_f64, 5, nprobe=10)
    execute(dist_tensor, ind_tensor)
Esempio n. 3
0
    def testLearnInLocalCluster(self, *_):
        """Run a 3-neighbour query inside a local cluster and compare with scikit-learn."""
        from mars.learn.neighbors import NearestNeighbors
        from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors

        cluster_options = dict(scheduler_n_process=2,
                               worker_n_process=3,
                               shared_memory='20M')
        with new_cluster(**cluster_options) as cluster:
            # Both arrays come from the same seeded generator, in this order.
            rng = np.random.RandomState(0)
            train_data = rng.rand(10, 5)
            query_data = rng.rand(8, 5)

            train_tensor = mt.tensor(train_data, chunk_size=7)
            query_tensor = mt.tensor(query_data, chunk_size=(5, 3))
            mars_model = NearestNeighbors(n_neighbors=3)
            mars_model.fit(train_tensor)

            # The query is submitted explicitly to the cluster's session.
            pending = mars_model.kneighbors(query_tensor,
                                            session=cluster.session)

            sk_model = SkNearestNeighbors(n_neighbors=3)
            sk_model.fit(train_data)
            sk_distances, sk_indices = sk_model.kneighbors(query_data)

            distances, indices = [part.fetch() for part in pending]
            np.testing.assert_almost_equal(distances, sk_distances)
            np.testing.assert_almost_equal(indices, sk_indices)
Esempio n. 4
0
    def testFaissQuery(self):
        """Compare 'Flat' faiss queries with ``NearestNeighbors`` for two metrics.

        Runs once with multi-chunk tensors and once with single-chunk tensors.
        """
        dim, n_train, n_query = 8, 50, 10
        x = np.random.RandomState(0).rand(n_train, dim).astype(np.float32)
        y = np.random.RandomState(1).rand(n_query, dim).astype(np.float32)

        multi_chunk = (mt.tensor(x, chunk_size=(20, 5)),
                       mt.tensor(y, chunk_size=5))
        single_chunk = (mt.tensor(x, chunk_size=50),
                        mt.tensor(y, chunk_size=10))
        metrics = ('l2', 'cosine')

        for X, Y in (multi_chunk, single_chunk):
            for metric in metrics:
                flat_index = build_faiss_index(
                    X, 'Flat', None,
                    metric=metric,
                    random_state=0,
                    return_index_type='object',
                )
                dist_tensor, ind_tensor = faiss_query(
                    flat_index, Y, 5, nprobe=10)
                distance, indices = self.executor.execute_tensors(
                    [dist_tensor, ind_tensor])

                # The reference is fitted on the raw arrays, not the tensors.
                reference = NearestNeighbors(metric=metric)
                reference.fit(x)
                expected_distance, expected_indices = reference.kneighbors(y, 5)

                np.testing.assert_array_equal(
                    indices, expected_indices.fetch())
                np.testing.assert_almost_equal(
                    distance, expected_distance.fetch())
Esempio n. 5
0
    def testLearnInLocalCluster(self, *_):
        """Exercise nearest-neighbours and KMeans inside a local cluster.

        Both results are compared with their scikit-learn counterparts.
        """
        from mars.learn.cluster import KMeans
        from mars.learn.neighbors import NearestNeighbors
        from sklearn.cluster import KMeans as SK_KMEANS
        from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors

        cluster_options = dict(scheduler_n_process=2,
                               worker_n_process=3,
                               shared_memory='20M')
        with new_cluster(**cluster_options) as cluster:
            # --- nearest neighbours ---
            rng = np.random.RandomState(0)
            train_data = rng.rand(10, 5)
            query_data = rng.rand(8, 5)

            train_tensor = mt.tensor(train_data, chunk_size=7)
            query_tensor = mt.tensor(query_data, chunk_size=(5, 3))
            mars_nn = NearestNeighbors(n_neighbors=3)
            mars_nn.fit(train_tensor)

            pending = mars_nn.kneighbors(query_tensor,
                                         session=cluster.session)

            sk_nn = SkNearestNeighbors(n_neighbors=3)
            sk_nn.fit(train_data)
            sk_distances, sk_indices = sk_nn.kneighbors(query_data)

            distances, indices = [part.fetch() for part in pending]
            np.testing.assert_almost_equal(distances, sk_distances)
            np.testing.assert_almost_equal(indices, sk_indices)

            # --- KMeans on six 2-D points ---
            points = np.array([[1, 2], [1, 4], [1, 0],
                               [10, 2], [10, 4], [10, 0]])
            kmeans_options = dict(n_clusters=2, random_state=0,
                                  init='k-means++')
            mars_kmeans = KMeans(**kmeans_options).fit(mt.array(points))
            sk_kmeans = SK_KMEANS(**kmeans_options).fit(points)
            np.testing.assert_allclose(mars_kmeans.cluster_centers_,
                                       sk_kmeans.cluster_centers_)
Esempio n. 6
0
 def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
     """Build a sparse RBF-style kernel restricted to each query's nearest neighbours.

     A ``NearestNeighbors`` model (euclidean metric) is fitted on ``X``; the
     distance-mode neighbour graph for ``Y`` is squared, scaled by ``-gamma``
     and exponentiated.

     Parameters
     ----------
     X : data the neighbour model is fitted on.
     Y : query data passed to ``kneighbors_graph``; ``None`` is forwarded as-is.
     n_neighbors : number of neighbours kept per query (default 10).
     gamma : scale applied to the squared distances (default 1e-5).

     Returns
     -------
     The transposed kernel ``W.T``; ``W`` is asserted to be sparse.
     """
     # Honour the caller's n_neighbors; it was previously hard-coded to 10,
     # so any non-default value was silently ignored.
     nn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean',
                           n_jobs=-1)
     nn.fit(X)
     W = -1 * mt.power(nn.kneighbors_graph(Y, mode='distance'), 2) * gamma
     W = mt.exp(W)
     # Internal invariant: the graph-based kernel must stay sparse.
     assert W.issparse()
     return W.T
Esempio n. 7
0
    def testLearnInLocalCluster(self, *_):
        """Exercise nearest-neighbours and ROC/AUC metrics inside a local cluster.

        Both results are compared with their scikit-learn counterparts.
        """
        from mars.learn.neighbors import NearestNeighbors
        from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors
        from mars.learn.metrics import roc_curve, auc
        from sklearn.metrics import roc_curve as sklearn_roc_curve, auc as sklearn_auc

        cluster_options = dict(scheduler_n_process=2,
                               worker_n_process=3,
                               shared_memory='20M')
        with new_cluster(**cluster_options) as cluster:
            # --- nearest neighbours ---
            rng = np.random.RandomState(0)
            train_data = rng.rand(10, 5)
            query_data = rng.rand(8, 5)

            train_tensor = mt.tensor(train_data, chunk_size=7)
            query_tensor = mt.tensor(query_data, chunk_size=(5, 3))
            mars_nn = NearestNeighbors(n_neighbors=3)
            mars_nn.fit(train_tensor)

            pending = mars_nn.kneighbors(query_tensor,
                                         session=cluster.session)

            sk_nn = SkNearestNeighbors(n_neighbors=3)
            sk_nn.fit(train_data)
            sk_distances, sk_indices = sk_nn.kneighbors(query_data)

            distances, indices = [part.fetch() for part in pending]
            np.testing.assert_almost_equal(distances, sk_distances)
            np.testing.assert_almost_equal(indices, sk_indices)

            # --- ROC curve / AUC ---
            # Fresh generator; column 'a' is drawn before column 'b'.
            rng = np.random.RandomState(0)
            raw = pd.DataFrame({
                'a': rng.randint(0, 10, (10, )),
                'b': rng.rand(10)
            })

            df = md.DataFrame(raw)
            labels = df['a'].to_tensor().astype('int')
            scores = df['b'].to_tensor().astype('float')
            fpr, tpr, _thresholds = roc_curve(labels, scores, pos_label=2)
            mars_auc = auc(fpr, tpr)

            sk_labels = raw['a'].to_numpy().astype('int')
            sk_scores = raw['b'].to_numpy().astype('float')
            sk_fpr, sk_tpr, _sk_thresholds = sklearn_roc_curve(
                sk_labels, sk_scores, pos_label=2)
            expected_auc = sklearn_auc(sk_fpr, sk_tpr)
            self.assertAlmostEqual(mars_auc.fetch(), expected_auc)
Esempio n. 8
0
    def testAutoIndex(self):
        """Build a faiss index without an explicit index name and check its query indices.

        Runs for a single-chunk tensor (chunk_size=50) and a multi-chunk one
        (chunk_size=20).
        """
        dim, n_train, n_query = 8, 50, 10
        x = np.random.RandomState(0).rand(n_train, dim).astype(np.float32)
        y = np.random.RandomState(1).rand(n_query, dim).astype(np.float32)

        for chunk_size in (50, 20):
            train_tensor = mt.tensor(x, chunk_size=chunk_size)

            auto_index = build_faiss_index(train_tensor,
                                           random_state=0,
                                           return_index_type='object')
            # Only the index tensor is executed; distances are discarded.
            _, ind_tensor = faiss_query(auto_index, y, 5, nprobe=10)
            indices = self.executor.execute_tensor(ind_tensor, concat=True)[0]

            reference = NearestNeighbors()
            reference.fit(x)
            expected_indices = reference.kneighbors(y, 5,
                                                    return_distance=False)

            np.testing.assert_array_equal(indices, expected_indices)
Esempio n. 9
0
    def testGPUFaissNearestNeighborsExecution(self):
        """Run the faiss-backed ``NearestNeighbors`` on GPU tensors and compare with scikit-learn."""
        rng = np.random.RandomState(0)

        train_data = rng.rand(10, 5)
        query_data = rng.rand(8, 5)

        # Inputs are moved to the GPU with ``to_gpu()`` before fitting/querying.
        gpu_train = mt.tensor(train_data, chunk_size=7).to_gpu()
        gpu_query = mt.tensor(query_data, chunk_size=8).to_gpu()

        mars_model = NearestNeighbors(n_neighbors=3, algorithm='faiss',
                                      metric='l2')
        mars_model.fit(gpu_train)

        pending = mars_model.kneighbors(gpu_query)

        sk_model = SkNearestNeighbors(n_neighbors=3, algorithm='auto',
                                      metric='l2')
        sk_model.fit(train_data)
        sk_distances, sk_indices = sk_model.kneighbors(query_data)

        distances, indices = [part.fetch() for part in pending]
        # ``.get()`` is called on each fetched result before the comparison.
        np.testing.assert_almost_equal(distances.get(), sk_distances,
                                       decimal=6)
        np.testing.assert_almost_equal(indices.get(), sk_indices)
Esempio n. 10
0
    def testFaissNearestNeighborsExecution(self):
        """Check the faiss-backed ``NearestNeighbors`` against scikit-learn.

        Covers three call forms: distances plus indices, indices only, and a
        query with no argument.
        """
        rng = np.random.RandomState(0)
        train_data = rng.rand(10, 5)
        query_data = rng.rand(8, 5)

        train_tensor = mt.tensor(train_data, chunk_size=7)
        query_tensor = mt.tensor(query_data, chunk_size=(5, 3))

        mars_model = NearestNeighbors(n_neighbors=3, algorithm='faiss',
                                      metric='l2')
        mars_model.fit(train_tensor)

        sk_model = SkNearestNeighbors(n_neighbors=3, algorithm='auto',
                                      metric='l2')
        sk_model.fit(train_data)

        # 1) distances and indices for an explicit query
        pending = mars_model.kneighbors(query_tensor)
        sk_distances, sk_indices = sk_model.kneighbors(query_data)

        distances, indices = [part.fetch() for part in pending]
        np.testing.assert_almost_equal(distances, sk_distances, decimal=6)
        np.testing.assert_almost_equal(indices, sk_indices)

        # 2) indices only
        indices_only = mars_model.kneighbors(query_tensor,
                                             return_distance=False).fetch()
        np.testing.assert_almost_equal(indices_only, sk_indices)

        # 3) no query argument; a looser tolerance is used for distances
        pending = mars_model.kneighbors()
        sk_distances, sk_indices = sk_model.kneighbors()

        distances, indices = [part.fetch() for part in pending]
        np.testing.assert_almost_equal(distances, sk_distances, decimal=5)
        np.testing.assert_almost_equal(indices, sk_indices)
Esempio n. 11
0
def test_manual_build_faiss_index(setup):
    """Build faiss indexes with explicit index names and check the results.

    Sections, in order: brute-force ('Flat') search over several chunks and
    over one chunk, trained IVF indexes with ``same_distribution`` True and
    False, a composite index type, a trained single-chunk index, and the
    ``ValueError`` cases for an unknown index name and an unknown metric.
    ``X``, ``index`` and ``faiss_index`` are rebound section by section, so
    the order of the sections matters.
    """
    d = 8
    n = 50
    n_test = 10
    # Note: both arrays are drawn from generators seeded with 0.
    x = np.random.RandomState(0).rand(n, d).astype(np.float32)
    y = np.random.RandomState(0).rand(n_test, d).astype(np.float32)

    # Reference indices that the brute-force sections below are compared to.
    nn = NearestNeighbors(algorithm='kd_tree')
    nn.fit(x)
    _, expected_indices = nn.kneighbors(y, 5)

    # test brute-force search
    X = mt.tensor(x, chunk_size=10)
    index = build_faiss_index(X,
                              'Flat',
                              None,
                              random_state=0,
                              same_distribution=True)
    faiss_index = index.execute().fetch()

    # Combine the fetched per-chunk indexes into one sharded index.
    index_shards = faiss.IndexShards(d)
    for ind in faiss_index:
        shard = _load_index(ind, -1)
        index_shards.add_shard(shard)
    faiss_index = index_shards

    # NOTE(review): ``nprob`` looks like a typo for ``nprobe``; presumably the
    # assignment has no effect on a 'Flat' index either way - confirm.
    faiss_index.nprob = 10
    _, indices = faiss_index.search(y, k=5)

    np.testing.assert_array_equal(indices, expected_indices.fetch())

    # test one chunk, brute force
    X = mt.tensor(x, chunk_size=50)
    index = build_faiss_index(X,
                              'Flat',
                              None,
                              random_state=0,
                              same_distribution=True)
    faiss_index = _load_index(index.execute().fetch(), -1)

    # NOTE(review): same ``nprob`` spelling as above - confirm intent.
    faiss_index.nprob = 10
    _, indices = faiss_index.search(y, k=5)

    np.testing.assert_array_equal(indices, expected_indices.fetch())

    # test train, same distribution
    X = mt.tensor(x, chunk_size=10)
    index = build_faiss_index(X,
                              'IVF30,Flat',
                              30,
                              random_state=0,
                              same_distribution=True)
    faiss_index = _load_index(index.execute().fetch(), -1)

    # With same_distribution=True the test expects one index holding all n
    # vectors, produced by a single chunk.
    assert isinstance(faiss_index, faiss.IndexIVFFlat)
    assert faiss_index.ntotal == n
    assert len(tile(index).chunks) == 1

    # test train, distributions are variant
    X = mt.tensor(x, chunk_size=10)
    index = build_faiss_index(X,
                              'IVF10,Flat',
                              None,
                              random_state=0,
                              same_distribution=False)
    faiss_index = index.execute().fetch()

    # With same_distribution=False the test expects five separate indexes of
    # ten vectors each.
    assert len(faiss_index) == 5
    for ind in faiss_index:
        ind = _load_index(ind, -1)
        assert isinstance(ind, faiss.IndexIVFFlat)
        assert ind.ntotal == 10

    # test more index type
    # Reuses the chunk_size=10 tensor ``X`` from the previous section.
    index = build_faiss_index(X, 'PCAR6,IVF8_HNSW32,SQ8', 10, random_state=0)
    faiss_index = index.execute().fetch()

    assert len(faiss_index) == 5
    for ind in faiss_index:
        ind = _load_index(ind, -1)
        assert isinstance(ind, faiss.IndexPreTransform)
        assert ind.ntotal == 10

    # test one chunk, train
    X = mt.tensor(x, chunk_size=50)
    index = build_faiss_index(X,
                              'IVF30,Flat',
                              30,
                              random_state=0,
                              same_distribution=True)
    faiss_index = _load_index(index.execute().fetch(), -1)

    assert isinstance(faiss_index, faiss.IndexIVFFlat)
    assert faiss_index.ntotal == n

    # test wrong index
    with pytest.raises(ValueError):
        build_faiss_index(X, 'unknown_index', None)

    # test unknown metric
    with pytest.raises(ValueError):
        build_faiss_index(X, 'Flat', None, metric='unknown_metric')
Esempio n. 12
0
    def testNearestNeighbors(self):
        """Check argument validation and fit-method selection of ``NearestNeighbors``.

        Nothing is compared against a reference implementation here; the test
        only asserts which exceptions and warnings are raised and which values
        ``effective_metric_`` and ``_fit_method`` take after fitting.
        """
        # Dense inputs first, then sparse inputs; all drawn from the same
        # seeded generator, so the order of these calls fixes the data.
        rs = np.random.RandomState(0)
        raw_X = rs.rand(10, 5)
        raw_Y = rs.rand(8, 5)

        X = mt.tensor(raw_X)
        Y = mt.tensor(raw_Y)

        raw_sparse_x = sps.random(10,
                                  5,
                                  density=0.5,
                                  format='csr',
                                  random_state=rs)
        raw_sparse_y = sps.random(8,
                                  5,
                                  density=0.4,
                                  format='csr',
                                  random_state=rs)

        X_sparse = mt.tensor(raw_sparse_x)
        Y_sparse = mt.tensor(raw_sparse_y)

        # Callable metric used to exercise the callable-metric code paths.
        metric_func = lambda u, v: np.sqrt(((u - v)**2).sum())

        # --- constructor validation ---
        # This combination is expected to construct without raising.
        _ = NearestNeighbors(algorithm='auto',
                             metric='precomputed',
                             metric_params={})

        with self.assertRaises(ValueError):
            _ = NearestNeighbors(algorithm='unknown')

        with self.assertRaises(ValueError):
            _ = NearestNeighbors(algorithm='kd_tree', metric=metric_func)

        with self.assertRaises(ValueError):
            _ = NearestNeighbors(algorithm='auto', metric='unknown')

        assert_warns(SyntaxWarning, NearestNeighbors, metric_params={'p': 1})

        with self.assertRaises(ValueError):
            _ = NearestNeighbors(metric='wminkowski', p=0)

        with self.assertRaises(ValueError):
            _ = NearestNeighbors(algorithm='auto', metric='minkowski', p=0)

        # --- minkowski ``p`` is mapped to a named effective metric ---
        nn = NearestNeighbors(algorithm='auto', metric='minkowski', p=1)
        nn.fit(X)
        self.assertEqual(nn.effective_metric_, 'manhattan')

        nn = NearestNeighbors(algorithm='auto', metric='minkowski', p=2)
        nn.fit(X)
        self.assertEqual(nn.effective_metric_, 'euclidean')

        nn = NearestNeighbors(algorithm='auto', metric='minkowski', p=np.inf)
        nn.fit(X)
        self.assertEqual(nn.effective_metric_, 'chebyshev')

        # --- fitting on other fitted objects ---
        # Fitting on an already fitted estimator keeps its fit method.
        nn2 = NearestNeighbors(algorithm='auto', metric='minkowski')
        nn2.fit(nn)
        self.assertEqual(nn2._fit_method, nn._fit_method)

        nn = NearestNeighbors(algorithm='auto', metric='minkowski')
        ball_tree = SkBallTree(raw_X)
        nn.fit(ball_tree)
        self.assertEqual(nn._fit_method, 'ball_tree')

        nn = NearestNeighbors(algorithm='auto', metric='minkowski')
        kd_tree = SkKDTree(raw_X)
        nn.fit(kd_tree)
        self.assertEqual(nn._fit_method, 'kd_tree')

        # --- fit-time validation ---
        # Input with zero rows is rejected.
        with self.assertRaises(ValueError):
            nn = NearestNeighbors()
            nn.fit(np.random.rand(0, 10))

        nn = NearestNeighbors(algorithm='ball_tree')
        assert_warns(UserWarning, nn.fit, X_sparse)

        nn = NearestNeighbors(metric='haversine')
        with self.assertRaises(ValueError):
            nn.fit(X_sparse)

        nn = NearestNeighbors(metric=metric_func, n_neighbors=1)
        nn.fit(X)
        self.assertEqual(nn._fit_method, 'ball_tree')

        nn = NearestNeighbors(metric='sqeuclidean', n_neighbors=1)
        nn.fit(X)
        self.assertEqual(nn._fit_method, 'brute')

        with self.assertRaises(ValueError):
            nn = NearestNeighbors(n_neighbors=-1)
            nn.fit(X)

        with self.assertRaises(TypeError):
            nn = NearestNeighbors(n_neighbors=1.3)
            nn.fit(X)

        # --- query-time validation ---
        nn = NearestNeighbors()
        nn.fit(X)
        with self.assertRaises(ValueError):
            nn.kneighbors(Y, n_neighbors=-1)
        with self.assertRaises(TypeError):
            nn.kneighbors(Y, n_neighbors=1.3)
        # 11 neighbours are requested from a model fitted on 10 samples.
        with self.assertRaises(ValueError):
            nn.kneighbors(Y, n_neighbors=11)

        # A sparse query against a ball_tree-fitted model is rejected.
        nn = NearestNeighbors(algorithm='ball_tree')
        nn.fit(X)
        with self.assertRaises(ValueError):
            nn.kneighbors(Y_sparse)
Esempio n. 13
0
    def testKNeighborsGraphExecution(self):
        """Compare ``kneighbors_graph`` output with scikit-learn.

        Both graph modes are checked, with an explicit query and with no
        query, followed by a small list-based input and the error raised for
        an unknown mode.
        """
        rng = np.random.RandomState(0)
        train_data = rng.rand(10, 5)
        query_data = rng.rand(8, 5)

        train_tensor = mt.tensor(train_data, chunk_size=7)
        query_tensor = mt.tensor(query_data, chunk_size=(5, 3))

        neigh = NearestNeighbors(n_neighbors=3)
        neigh.fit(train_tensor)
        sklearn_neigh = SkNearestNeighbors(n_neighbors=3)
        sklearn_neigh.fit(train_data)

        # (mars positional args, sklearn positional args): first an explicit
        # query, then no query at all.
        query_variants = (
            ((query_tensor,), (query_data,)),
            ((), ()),
        )
        for mode in ('connectivity', 'distance'):
            for mars_args, sk_args in query_variants:
                graph = neigh.kneighbors_graph(*mars_args, mode=mode)
                fetched = graph.fetch()

                self.assertIsInstance(fetched, SparseNDArray)
                self.assertGreater(len(get_tiled(graph).chunks), 1)

                reference = sklearn_neigh.kneighbors_graph(*sk_args,
                                                           mode=mode)

                np.testing.assert_array_equal(fetched.toarray(),
                                              reference.toarray())

        # Plain Python list as input for both implementations.
        samples = [[0], [3], [1]]

        neigh = NearestNeighbors(n_neighbors=2)
        sklearn_neigh = SkNearestNeighbors(n_neighbors=2)
        neigh.fit(samples)
        sklearn_neigh.fit(samples)

        mars_graph = neigh.kneighbors_graph(samples).fetch()
        sk_graph = sklearn_neigh.kneighbors_graph(samples)
        np.testing.assert_array_equal(mars_graph.toarray(),
                                      sk_graph.toarray())

        # An unknown mode must be rejected.
        with self.assertRaises(ValueError):
            _ = neigh.kneighbors_graph(mode='unknown')
Esempio n. 14
0
    def testNearestNeighborsExecution(self):
        """Execute ``NearestNeighbors`` queries and compare with scikit-learn.

        Sections, in order: every algorithm/metric combination (with and
        without distances, with and without an explicit query), callable
        metrics, sparse input, input filtered by a boolean mask, and fitting
        on an already built scikit-learn tree. Later sections reuse ``snn``,
        ``expected``, ``Y`` and ``ret`` from earlier ones, so the order of the
        sections matters.
        """
        # One seeded generator feeds the dense data here and the sparse data
        # further down.
        rs = np.random.RandomState(0)
        raw_X = rs.rand(10, 5)
        raw_Y = rs.rand(8, 5)

        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))

        for algo in ['brute', 'ball_tree', 'kd_tree', 'auto']:
            for metric in ['minkowski', 'manhattan']:
                nn = NearestNeighbors(n_neighbors=3,
                                      algorithm=algo,
                                      metric=metric)
                nn.fit(X)

                ret = nn.kneighbors(Y)

                snn = SkNearestNeighbors(n_neighbors=3,
                                         algorithm=algo,
                                         metric=metric)
                snn.fit(raw_X)
                expected = snn.kneighbors(raw_Y)

                result = [r.fetch() for r in ret]
                np.testing.assert_almost_equal(result[0], expected[0])
                np.testing.assert_almost_equal(result[1], expected[1])

                # When a tree was built, it must be of the same type as the
                # one scikit-learn built.
                if nn._tree is not None:
                    self.assertIsInstance(nn._tree.fetch(), type(snn._tree))

                # test return_distance=False
                ret = nn.kneighbors(Y, return_distance=False)

                result = ret.fetch()
                np.testing.assert_almost_equal(result, expected[1])

                # test y is x
                ret = nn.kneighbors()

                expected = snn.kneighbors()

                result = [r.fetch() for r in ret]
                np.testing.assert_almost_equal(result[0], expected[0])
                np.testing.assert_almost_equal(result[1], expected[1])

                # test y is x, and return_distance=False
                ret = nn.kneighbors(return_distance=False)

                result = ret.fetch()
                np.testing.assert_almost_equal(result, expected[1])

        # test callable metric
        metric = lambda u, v: np.sqrt(((u - v)**2).sum())
        for algo in ['brute', 'ball_tree']:
            nn = NearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric)
            nn.fit(X)

            ret = nn.kneighbors(Y)

            snn = SkNearestNeighbors(n_neighbors=3,
                                     algorithm=algo,
                                     metric=metric)
            snn.fit(raw_X)
            expected = snn.kneighbors(raw_Y)

            result = [r.fetch() for r in ret]
            np.testing.assert_almost_equal(result[0], expected[0])
            np.testing.assert_almost_equal(result[1], expected[1])

        # test sparse
        raw_sparse_x = sps.random(10,
                                  5,
                                  density=0.5,
                                  format='csr',
                                  random_state=rs)
        raw_sparse_y = sps.random(8,
                                  5,
                                  density=0.4,
                                  format='csr',
                                  random_state=rs)

        X = mt.tensor(raw_sparse_x, chunk_size=7)
        Y = mt.tensor(raw_sparse_y, chunk_size=5)

        nn = NearestNeighbors(n_neighbors=3)
        nn.fit(X)

        ret = nn.kneighbors(Y)

        snn = SkNearestNeighbors(n_neighbors=3)
        snn.fit(raw_sparse_x)
        expected = snn.kneighbors(raw_sparse_y)

        result = [r.fetch() for r in ret]
        np.testing.assert_almost_equal(result[0], expected[0])
        np.testing.assert_almost_equal(result[1], expected[1])

        # test input with unknown shape
        # Rows are selected with a boolean mask on the first column.
        X = mt.tensor(raw_X, chunk_size=7)
        X = X[X[:, 0] > 0.1]
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))
        Y = Y[Y[:, 0] > 0.1]

        nn = NearestNeighbors(n_neighbors=3)
        nn.fit(X)

        ret = nn.kneighbors(Y)

        # Apply the same mask to the raw arrays for the reference model.
        x2 = raw_X[raw_X[:, 0] > 0.1]
        y2 = raw_Y[raw_Y[:, 0] > 0.1]
        snn = SkNearestNeighbors(n_neighbors=3)
        snn.fit(x2)
        expected = snn.kneighbors(y2)

        # Here the whole return value is fetched in one call.
        result = ret.fetch()
        self.assertEqual(nn._fit_method, snn._fit_method)
        np.testing.assert_almost_equal(result[0], expected[0])
        np.testing.assert_almost_equal(result[1], expected[1])

        # test serialization
        # Round-trip the graph through pb and JSON and compare lengths.
        graph = ret[0].build_graph()
        self.assertEqual(len(graph.from_pb(graph.to_pb())), len(graph))
        self.assertEqual(len(graph.from_json(graph.to_json())), len(graph))

        # test fit a sklearn tree
        # Reuses ``snn``, ``Y`` and ``expected`` from the masked-input section.
        nn = NearestNeighbors(n_neighbors=3)
        nn.fit(snn._tree)

        ret = nn.kneighbors(Y)
        result = ret.fetch()
        self.assertEqual(nn._fit_method, snn._fit_method)
        np.testing.assert_almost_equal(result[0], expected[0])
        np.testing.assert_almost_equal(result[1], expected[1])

        # test serialization
        graph = ret[0].build_graph()
        self.assertEqual(len(graph.from_pb(graph.to_pb())), len(graph))
        self.assertEqual(len(graph.from_json(graph.to_json())), len(graph))
Esempio n. 15
0
    def testManualBuildFaissIndex(self):
        """Build faiss indexes with explicit index names via ``self.executor``.

        Sections, in order: brute-force ('Flat') search for each
        ``return_index_type``, single-chunk brute force, trained IVF indexes
        with ``same_distribution`` True and False, a trained single-chunk
        index, and the ``ValueError`` cases for an unknown index name and an
        unknown metric.
        """
        d = 8
        n = 50
        n_test = 10
        # Note: both arrays are drawn from generators seeded with 0.
        x = np.random.RandomState(0).rand(n, d).astype(np.float32)
        y = np.random.RandomState(0).rand(n_test, d).astype(np.float32)

        # Reference indices that the brute-force sections below are compared to.
        nn = NearestNeighbors(algorithm='kd_tree')
        nn.fit(x)
        _, expected_indices = nn.kneighbors(y, 5)

        for index_type in ['object', 'filename', 'bytes']:
            # test brute-force search
            X = mt.tensor(x, chunk_size=10)
            index = build_faiss_index(X, 'Flat', None, random_state=0,
                                      same_distribution=True, return_index_type=index_type)
            faiss_index = self.executor.execute_tileable(index)

            # Combine the per-chunk results into one sharded index.
            index_shards = faiss.IndexShards(d)
            for ind in faiss_index:
                shard = _load_index(None, index.op, ind, -1)
                index_shards.add_shard(shard)
            faiss_index = index_shards

            # NOTE(review): ``nprob`` looks like a typo for ``nprobe``;
            # presumably the assignment has no effect on a 'Flat' index
            # either way - confirm.
            faiss_index.nprob = 10
            _, indices = faiss_index.search(y, k=5)

            np.testing.assert_array_equal(indices, expected_indices.fetch())

        # test one chunk, brute force
        X = mt.tensor(x, chunk_size=50)
        index = build_faiss_index(X, 'Flat', None, random_state=0,
                                  same_distribution=True, return_index_type='object')
        # With return_index_type='object' the executed result is used
        # directly as a faiss index.
        faiss_index = self.executor.execute_tileable(index)[0]

        # NOTE(review): same ``nprob`` spelling as above - confirm intent.
        faiss_index.nprob = 10
        _, indices = faiss_index.search(y, k=5)

        np.testing.assert_array_equal(indices, expected_indices.fetch())

        # test train, same distribution
        X = mt.tensor(x, chunk_size=10)
        index = build_faiss_index(X, 'IVF30,Flat', 30, random_state=0,
                                  same_distribution=True, return_index_type='object')
        faiss_index = self.executor.execute_tileable(index)[0]

        # The test expects one index holding all n vectors, produced by a
        # single chunk.
        self.assertIsInstance(faiss_index, faiss.IndexIVFFlat)
        self.assertEqual(faiss_index.ntotal, n)
        self.assertEqual(len(get_tiled(index).chunks), 1)

        # test train, distributions are variant
        X = mt.tensor(x, chunk_size=10)
        index = build_faiss_index(X, 'IVF10,Flat', None, random_state=0,
                                  same_distribution=False, return_index_type='object')
        faiss_index = self.executor.execute_tileable(index)

        # The test expects five separate indexes of ten vectors each.
        self.assertEqual(len(faiss_index), 5)
        for ind in faiss_index:
            self.assertIsInstance(ind, faiss.IndexIVFFlat)
            self.assertEqual(ind.ntotal, 10)

        # test one chunk, train
        X = mt.tensor(x, chunk_size=50)
        index = build_faiss_index(X, 'IVF30,Flat', 30, random_state=0,
                                  same_distribution=True, return_index_type='object')
        faiss_index = self.executor.execute_tileable(index)[0]

        self.assertIsInstance(faiss_index, faiss.IndexIVFFlat)
        self.assertEqual(faiss_index.ntotal, n)

        # test wrong index
        with self.assertRaises(ValueError):
            build_faiss_index(X, 'unknown_index', None)

        # test unknown metric
        with self.assertRaises(ValueError):
            build_faiss_index(X, 'Flat', None, metric='unknown_metric')
Esempio n. 16
0
    def testNearestNeighborsExecution(self):
        """Execute ``NearestNeighbors`` queries and compare with scikit-learn.

        The sections visible here cover, in order: every algorithm/metric
        combination (with and without distances, with and without an explicit
        query), callable metrics, and sparse input.
        """
        # One seeded generator feeds the dense data here and the sparse data
        # further down.
        rs = np.random.RandomState(0)
        raw_X = rs.rand(10, 5)
        raw_Y = rs.rand(8, 5)

        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))

        for algo in ['brute', 'ball_tree', 'kd_tree', 'auto']:
            for metric in ['minkowski', 'manhattan']:
                nn = NearestNeighbors(n_neighbors=3,
                                      algorithm=algo,
                                      metric=metric)
                nn.fit(X)

                ret = nn.kneighbors(Y)

                snn = SkNearestNeighbors(n_neighbors=3,
                                         algorithm=algo,
                                         metric=metric)
                snn.fit(raw_X)
                expected = snn.kneighbors(raw_Y)

                result = [r.fetch() for r in ret]
                np.testing.assert_almost_equal(result[0], expected[0])
                np.testing.assert_almost_equal(result[1], expected[1])

                # test return_distance=False
                ret = nn.kneighbors(Y, return_distance=False)

                result = ret.fetch()
                np.testing.assert_almost_equal(result, expected[1])

                # test y is x
                ret = nn.kneighbors()

                expected = snn.kneighbors()

                result = [r.fetch() for r in ret]
                np.testing.assert_almost_equal(result[0], expected[0])
                np.testing.assert_almost_equal(result[1], expected[1])

                # test y is x, and return_distance=False
                # Compared with the indices of the no-argument reference
                # query computed just above.
                ret = nn.kneighbors(return_distance=False)

                result = ret.fetch()
                np.testing.assert_almost_equal(result, expected[1])

        # test callable metric
        metric = lambda u, v: np.sqrt(((u - v)**2).sum())
        for algo in ['brute', 'ball_tree']:
            nn = NearestNeighbors(n_neighbors=3, algorithm=algo, metric=metric)
            nn.fit(X)

            ret = nn.kneighbors(Y)

            snn = SkNearestNeighbors(n_neighbors=3,
                                     algorithm=algo,
                                     metric=metric)
            snn.fit(raw_X)
            expected = snn.kneighbors(raw_Y)

            result = [r.fetch() for r in ret]
            np.testing.assert_almost_equal(result[0], expected[0])
            np.testing.assert_almost_equal(result[1], expected[1])

        # test sparse
        # The sparse matrices continue to draw from the generator ``rs``.
        raw_sparse_x = sps.random(10,
                                  5,
                                  density=0.5,
                                  format='csr',
                                  random_state=rs)
        raw_sparse_y = sps.random(8,
                                  5,
                                  density=0.4,
                                  format='csr',
                                  random_state=rs)

        X = mt.tensor(raw_sparse_x, chunk_size=7)
        Y = mt.tensor(raw_sparse_y, chunk_size=5)

        nn = NearestNeighbors(n_neighbors=3)
        nn.fit(X)

        ret = nn.kneighbors(Y)

        snn = SkNearestNeighbors(n_neighbors=3)
        snn.fit(raw_sparse_x)
        expected = snn.kneighbors(raw_sparse_y)

        result = [r.fetch() for r in ret]
        np.testing.assert_almost_equal(result[0], expected[0])
        np.testing.assert_almost_equal(result[1], expected[1])