def test_pairwise_distances_topk_execution(setup):
    """Compare ``pairwise_distances_topk`` with a brute-force ``SkNearestNeighbors``.

    Four scenarios are exercised in turn: euclidean top-3 of ``x`` against
    ``y`` with indices, euclidean top-4 of ``x`` against itself without
    indices, cosine top-3 with an explicit ``working_memory`` value, and
    cosine top-3 computed with ``axis=0``.
    """
    rng = np.random.RandomState(0)
    raw_x = rng.rand(20, 5)
    raw_y = rng.rand(21, 5)

    # --- euclidean, x vs. y, distances and indices --------------------------
    x = mt.tensor(raw_x, chunk_size=11)
    y = mt.tensor(raw_y, chunk_size=12)
    dist, idx = pairwise_distances_topk(
        x, y, 3, metric='euclidean', return_index=True)
    fetched = fetch(*execute(dist, idx))

    reference = SkNearestNeighbors(
        n_neighbors=3, algorithm='brute', metric='euclidean')
    reference.fit(raw_y)
    want = reference.kneighbors(raw_x, return_distance=True)
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_array_equal(fetched[1], want[1])

    # --- euclidean, x vs. itself, distances only ----------------------------
    x = mt.tensor(raw_x, chunk_size=(11, 3))
    dist = pairwise_distances_topk(
        x, k=4, metric='euclidean', return_index=False)
    fetched = dist.execute().fetch()

    reference = SkNearestNeighbors(
        n_neighbors=3, algorithm='brute', metric='euclidean')
    reference.fit(raw_x)
    want = reference.kneighbors(return_distance=True)[0]
    # k=4 on our side vs. 3 neighbours on the reference side: the first
    # column is dropped before comparing (presumably the self-distance,
    # which kneighbors() without a query does not report).
    np.testing.assert_almost_equal(fetched[:, 1:], want)

    # --- cosine, x vs. y, explicit working_memory ---------------------------
    y = mt.tensor(raw_y, chunk_size=21)
    dist, idx = pairwise_distances_topk(
        x, y, 3, metric='cosine', return_index=True, working_memory='168')
    fetched = fetch(*execute(dist, idx))

    reference = SkNearestNeighbors(
        n_neighbors=3, algorithm='brute', metric='cosine')
    reference.fit(raw_y)
    want = reference.kneighbors(raw_x, return_distance=True)
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_array_equal(fetched[1], want[1])

    # --- cosine along axis=0: reference is fitted on x and queried with y ---
    dist = pairwise_distances_topk(
        x, y, 3, metric='cosine', axis=0, return_index=False)
    fetched = dist.execute().fetch()

    reference = SkNearestNeighbors(
        n_neighbors=3, algorithm='brute', metric='cosine')
    reference.fit(raw_x)
    want = reference.kneighbors(raw_y, return_distance=True)[0]
    np.testing.assert_almost_equal(fetched, want)
def testKNeighborsGraphExecution(self):
    """Check ``kneighbors_graph`` output against ``SkNearestNeighbors``.

    Both ``'connectivity'`` and ``'distance'`` modes are tried, with and
    without an explicit query; then a tiny list-based input is checked and
    an unknown mode is expected to raise ``ValueError``.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5)
    raw_Y = rng.rand(8, 5)

    X = mt.tensor(raw_X, chunk_size=7)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    estimator = NearestNeighbors(n_neighbors=3)
    estimator.fit(X)
    reference = SkNearestNeighbors(n_neighbors=3)
    reference.fit(raw_X)

    # (positional args for the estimator, positional args for the reference):
    # first with an explicit query, then with the fitted data as the query.
    query_variants = (((Y,), (raw_Y,)), ((), ()))

    for mode in ['connectivity', 'distance']:
        for own_args, ref_args in query_variants:
            graph = estimator.kneighbors_graph(*own_args, mode=mode)
            fetched = graph.fetch()

            self.assertIsInstance(fetched, SparseNDArray)
            # the tiled graph must be split into more than one chunk
            self.assertGreater(len(get_tiled(graph).chunks), 1)

            want = reference.kneighbors_graph(*ref_args, mode=mode)
            np.testing.assert_array_equal(fetched.toarray(), want.toarray())

    # plain nested-list input
    points = [[0], [3], [1]]
    estimator = NearestNeighbors(n_neighbors=2)
    reference = SkNearestNeighbors(n_neighbors=2)
    estimator.fit(points)
    reference.fit(points)

    adjacency = estimator.kneighbors_graph(points).fetch()
    want_adjacency = reference.kneighbors_graph(points)
    np.testing.assert_array_equal(
        adjacency.toarray(), want_adjacency.toarray())

    # test wrong mode
    with self.assertRaises(ValueError):
        _ = estimator.kneighbors_graph(mode='unknown')
def testLearnInLocalCluster(self, *_):
    """Run ``NearestNeighbors.kneighbors`` on a local cluster session.

    The fetched distances and indices are compared with what scikit-learn's
    ``NearestNeighbors`` returns for the same raw arrays.
    """
    from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors
    from mars.learn.neighbors import NearestNeighbors

    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M') as cluster:
        rng = np.random.RandomState(0)
        raw_X = rng.rand(10, 5)
        raw_Y = rng.rand(8, 5)

        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))

        estimator = NearestNeighbors(n_neighbors=3)
        estimator.fit(X)
        # only the query is explicitly bound to the cluster session
        ret = estimator.kneighbors(Y, session=cluster.session)

        reference = SkNearestNeighbors(n_neighbors=3)
        reference.fit(raw_X)
        want = reference.kneighbors(raw_Y)

        fetched = [part.fetch() for part in ret]
        np.testing.assert_almost_equal(fetched[0], want[0])
        np.testing.assert_almost_equal(fetched[1], want[1])
def testMarsKNN(self):
    """Create a Mars cluster via ``self.odps`` and run a k-neighbors query on it.

    The cluster client is always stopped in the ``finally`` clause, whether
    or not the comparison with scikit-learn succeeds.
    """
    cluster_name = str(uuid.uuid4())
    client = self.odps.create_mars_cluster(
        1, 4, 8, name=cluster_name, scheduler_mem=12, scheduler_cpu=4)
    try:
        import numpy as np
        import mars.tensor as mt
        from mars.learn.neighbors import NearestNeighbors
        from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors

        rng = np.random.RandomState(0)
        raw_X = rng.rand(10, 5)
        raw_Y = rng.rand(8, 5)

        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))

        estimator = NearestNeighbors(n_neighbors=3)
        estimator.fit(X)
        ret = estimator.kneighbors(Y)

        reference = SkNearestNeighbors(n_neighbors=3)
        reference.fit(raw_X)
        want = reference.kneighbors(raw_Y)

        fetched = [part.fetch() for part in ret]
        np.testing.assert_almost_equal(fetched[0], want[0])
        np.testing.assert_almost_equal(fetched[1], want[1])
    finally:
        client.stop_server()
def testLearnInLocalCluster(self, *_):
    """Exercise ``NearestNeighbors`` and ``KMeans`` inside a local cluster.

    Each estimator's output is compared with its scikit-learn counterpart
    fitted on the same raw data.
    """
    from sklearn.cluster import KMeans as SK_KMEANS
    from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors
    from mars.learn.cluster import KMeans
    from mars.learn.neighbors import NearestNeighbors

    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M') as cluster:
        # ---- nearest neighbours ----
        rng = np.random.RandomState(0)
        raw_X = rng.rand(10, 5)
        raw_Y = rng.rand(8, 5)

        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))

        estimator = NearestNeighbors(n_neighbors=3)
        estimator.fit(X)
        ret = estimator.kneighbors(Y, session=cluster.session)

        reference = SkNearestNeighbors(n_neighbors=3)
        reference.fit(raw_X)
        want = reference.kneighbors(raw_Y)

        fetched = [part.fetch() for part in ret]
        np.testing.assert_almost_equal(fetched[0], want[0])
        np.testing.assert_almost_equal(fetched[1], want[1])

        # ---- k-means on six 2-D points ----
        raw = np.array([[1, 2], [1, 4], [1, 0],
                        [10, 2], [10, 4], [10, 0]])
        X = mt.array(raw)
        fitted = KMeans(
            n_clusters=2, random_state=0, init='k-means++').fit(X)
        sk_fitted = SK_KMEANS(
            n_clusters=2, random_state=0, init='k-means++').fit(raw)
        np.testing.assert_allclose(
            fitted.cluster_centers_, sk_fitted.cluster_centers_)
def test_k_neighbors_graph_execution(setup):
    """Check ``kneighbors_graph`` against ``SkNearestNeighbors`` for both modes.

    Also covers a small nested-list input and asserts that an unknown
    ``mode`` raises ``ValueError``.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5)
    raw_Y = rng.rand(8, 5)

    X = mt.tensor(raw_X, chunk_size=7)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    estimator = NearestNeighbors(n_neighbors=3)
    estimator.fit(X)
    reference = SkNearestNeighbors(n_neighbors=3)
    reference.fit(raw_X)

    for mode in ["connectivity", "distance"]:
        # explicit query: result must be sparse and tiled into several chunks
        query_graph = estimator.kneighbors_graph(Y, mode=mode)
        fetched = query_graph.fetch()
        assert isinstance(fetched, SparseNDArray)
        assert len(tile(query_graph).chunks) > 1

        want = reference.kneighbors_graph(raw_Y, mode=mode)
        np.testing.assert_array_equal(fetched.toarray(), want.toarray())

        # no query: only the sparse type and the values are checked
        self_graph = estimator.kneighbors_graph(mode=mode)
        self_fetched = self_graph.fetch()
        assert isinstance(self_fetched, SparseNDArray)

        self_want = reference.kneighbors_graph(mode=mode)
        np.testing.assert_array_equal(
            self_fetched.toarray(), self_want.toarray())

    # plain nested-list input
    points = [[0], [3], [1]]
    estimator = NearestNeighbors(n_neighbors=2)
    reference = SkNearestNeighbors(n_neighbors=2)
    estimator.fit(points)
    reference.fit(points)

    adjacency = estimator.kneighbors_graph(points).fetch()
    want_adjacency = reference.kneighbors_graph(points)
    np.testing.assert_array_equal(
        adjacency.toarray(), want_adjacency.toarray())

    # test wrong mode
    with pytest.raises(ValueError):
        _ = estimator.kneighbors_graph(mode="unknown")
def testLearnInLocalCluster(self, *_):
    """Exercise ``NearestNeighbors`` and ``roc_curve``/``auc`` in a local cluster.

    Both results are compared with the scikit-learn functions of the same
    name applied to the underlying raw data.
    """
    from sklearn.metrics import roc_curve as sklearn_roc_curve, auc as sklearn_auc
    from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors
    from mars.learn.metrics import roc_curve, auc
    from mars.learn.neighbors import NearestNeighbors

    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M') as cluster:
        # ---- nearest neighbours ----
        rng = np.random.RandomState(0)
        raw_X = rng.rand(10, 5)
        raw_Y = rng.rand(8, 5)

        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))

        estimator = NearestNeighbors(n_neighbors=3)
        estimator.fit(X)
        ret = estimator.kneighbors(Y, session=cluster.session)

        reference = SkNearestNeighbors(n_neighbors=3)
        reference.fit(raw_X)
        want = reference.kneighbors(raw_Y)

        fetched = [part.fetch() for part in ret]
        np.testing.assert_almost_equal(fetched[0], want[0])
        np.testing.assert_almost_equal(fetched[1], want[1])

        # ---- ROC curve / AUC from DataFrame columns ----
        rng = np.random.RandomState(0)
        raw = pd.DataFrame({
            'a': rng.randint(0, 10, (10, )),
            'b': rng.rand(10)
        })
        df = md.DataFrame(raw)
        labels = df['a'].to_tensor().astype('int')
        scores = df['b'].to_tensor().astype('float')
        fpr, tpr, _ = roc_curve(labels, scores, pos_label=2)
        area = auc(fpr, tpr)

        sk_fpr, sk_tpr, _ = sklearn_roc_curve(
            raw['a'].to_numpy().astype('int'),
            raw['b'].to_numpy().astype('float'),
            pos_label=2)
        want_area = sklearn_auc(sk_fpr, sk_tpr)
        self.assertAlmostEqual(area.fetch(), want_area)
def testGPUFaissNearestNeighborsExecution(self):
    """Run the ``'faiss'`` algorithm on tensors moved with ``to_gpu()``.

    Fetched results are converted with ``.get()`` before being compared
    with ``SkNearestNeighbors`` output computed from the raw arrays.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5)
    raw_Y = rng.rand(8, 5)

    # faiss execution on GPU tensors
    X = mt.tensor(raw_X, chunk_size=7).to_gpu()
    Y = mt.tensor(raw_Y, chunk_size=8).to_gpu()

    estimator = NearestNeighbors(n_neighbors=3, algorithm='faiss', metric='l2')
    estimator.fit(X)
    ret = estimator.kneighbors(Y)

    reference = SkNearestNeighbors(n_neighbors=3, algorithm='auto', metric='l2')
    reference.fit(raw_X)
    want = reference.kneighbors(raw_Y)

    fetched = [part.fetch() for part in ret]
    # distances are compared to 6 decimals, indices with the default precision
    np.testing.assert_almost_equal(fetched[0].get(), want[0], decimal=6)
    np.testing.assert_almost_equal(fetched[1].get(), want[1])
def testFaissNearestNeighborsExecution(self):
    """Compare ``algorithm='faiss'`` k-neighbors results with ``SkNearestNeighbors``.

    Covers an explicit query, ``return_distance=False`` and a query-less call.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5)
    raw_Y = rng.rand(8, 5)

    # faiss execution with an explicit query
    X = mt.tensor(raw_X, chunk_size=7)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    estimator = NearestNeighbors(n_neighbors=3, algorithm='faiss', metric='l2')
    estimator.fit(X)
    ret = estimator.kneighbors(Y)

    reference = SkNearestNeighbors(n_neighbors=3, algorithm='auto', metric='l2')
    reference.fit(raw_X)
    want = reference.kneighbors(raw_Y)

    fetched = [part.fetch() for part in ret]
    np.testing.assert_almost_equal(fetched[0], want[0], decimal=6)
    np.testing.assert_almost_equal(fetched[1], want[1])

    # return_distance=False: a single result holding the indices
    ret = estimator.kneighbors(Y, return_distance=False)
    fetched = ret.fetch()
    np.testing.assert_almost_equal(fetched, want[1])

    # no query passed (y is x); distances are checked to 5 decimals here
    ret = estimator.kneighbors()
    want = reference.kneighbors()
    fetched = [part.fetch() for part in ret]
    np.testing.assert_almost_equal(fetched[0], want[0], decimal=5)
    np.testing.assert_almost_equal(fetched[1], want[1])
def test_proxima_nearest_neighbors_execution(setup):
    """Compare ``algorithm="proxima"`` k-neighbors results with ``SkNearestNeighbors``.

    The raw inputs are cast to ``float32``. Covers an explicit query,
    ``return_distance=False`` and a query-less call.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5).astype("float32")
    raw_Y = rng.rand(8, 5).astype("float32")

    # proxima execution with an explicit query
    X = mt.tensor(raw_X, chunk_size=6)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    estimator = NearestNeighbors(n_neighbors=3, algorithm="proxima", metric="l2")
    estimator.fit(X)
    ret = estimator.kneighbors(Y)

    reference = SkNearestNeighbors(n_neighbors=3, algorithm="auto", metric="l2")
    reference.fit(raw_X)
    want = reference.kneighbors(raw_Y)

    fetched = [part.fetch() for part in ret]
    np.testing.assert_almost_equal(fetched[0], want[0], decimal=6)
    np.testing.assert_almost_equal(fetched[1], want[1])

    # return_distance=False: a single result holding the indices
    ret = estimator.kneighbors(Y, return_distance=False)
    fetched = ret.fetch()
    np.testing.assert_almost_equal(fetched, want[1])

    # no query passed (y is x); distances are checked to 5 decimals here
    ret = estimator.kneighbors()
    want = reference.kneighbors()
    fetched = [part.fetch() for part in ret]
    np.testing.assert_almost_equal(fetched[0], want[0], decimal=5)
    np.testing.assert_almost_equal(fetched[1], want[1])
def testNearestNeighborsExecution(self):
    """End-to-end comparison of ``NearestNeighbors`` with ``SkNearestNeighbors``.

    Sections, in order: every algorithm/metric combination (including the
    query-less and ``return_distance=False`` variants), a callable metric,
    sparse CSR inputs, inputs produced by boolean filtering, and fitting
    directly on the reference estimator's ``_tree``. The last two sections
    also round-trip the built graph through its pb and JSON forms.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5)
    raw_Y = rng.rand(8, 5)

    X = mt.tensor(raw_X, chunk_size=7)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    for algo in ['brute', 'ball_tree', 'kd_tree', 'auto']:
        for metric in ['minkowski', 'manhattan']:
            params = dict(n_neighbors=3, algorithm=algo, metric=metric)

            nn = NearestNeighbors(**params)
            nn.fit(X)
            ret = nn.kneighbors(Y)

            snn = SkNearestNeighbors(**params)
            snn.fit(raw_X)
            want = snn.kneighbors(raw_Y)

            fetched = [part.fetch() for part in ret]
            np.testing.assert_almost_equal(fetched[0], want[0])
            np.testing.assert_almost_equal(fetched[1], want[1])

            # when a tree was built, it must be the same type as the reference's
            if nn._tree is not None:
                self.assertIsInstance(nn._tree.fetch(), type(snn._tree))

            # test return_distance=False
            ret = nn.kneighbors(Y, return_distance=False)
            fetched = ret.fetch()
            np.testing.assert_almost_equal(fetched, want[1])

            # test y is x
            ret = nn.kneighbors()
            want = snn.kneighbors()
            fetched = [part.fetch() for part in ret]
            np.testing.assert_almost_equal(fetched[0], want[0])
            np.testing.assert_almost_equal(fetched[1], want[1])

            # test y is x, and return_distance=False
            ret = nn.kneighbors(return_distance=False)
            fetched = ret.fetch()
            np.testing.assert_almost_equal(fetched, want[1])

    # test callable metric
    metric = lambda u, v: np.sqrt(((u - v)**2).sum())
    for algo in ['brute', 'ball_tree']:
        params = dict(n_neighbors=3, algorithm=algo, metric=metric)

        nn = NearestNeighbors(**params)
        nn.fit(X)
        ret = nn.kneighbors(Y)

        snn = SkNearestNeighbors(**params)
        snn.fit(raw_X)
        want = snn.kneighbors(raw_Y)

        fetched = [part.fetch() for part in ret]
        np.testing.assert_almost_equal(fetched[0], want[0])
        np.testing.assert_almost_equal(fetched[1], want[1])

    # test sparse
    raw_sparse_x = sps.random(10, 5, density=0.5, format='csr', random_state=rng)
    raw_sparse_y = sps.random(8, 5, density=0.4, format='csr', random_state=rng)

    X = mt.tensor(raw_sparse_x, chunk_size=7)
    Y = mt.tensor(raw_sparse_y, chunk_size=5)

    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(X)
    ret = nn.kneighbors(Y)

    snn = SkNearestNeighbors(n_neighbors=3)
    snn.fit(raw_sparse_x)
    want = snn.kneighbors(raw_sparse_y)

    fetched = [part.fetch() for part in ret]
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_almost_equal(fetched[1], want[1])

    # test input with unknown shape (rows selected by a boolean mask)
    dense_X = mt.tensor(raw_X, chunk_size=7)
    X = dense_X[dense_X[:, 0] > 0.1]
    dense_Y = mt.tensor(raw_Y, chunk_size=(5, 3))
    Y = dense_Y[dense_Y[:, 0] > 0.1]

    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(X)
    ret = nn.kneighbors(Y)

    x2 = raw_X[raw_X[:, 0] > 0.1]
    y2 = raw_Y[raw_Y[:, 0] > 0.1]
    snn = SkNearestNeighbors(n_neighbors=3)
    snn.fit(x2)
    want = snn.kneighbors(y2)

    fetched = ret.fetch()
    self.assertEqual(nn._fit_method, snn._fit_method)
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_almost_equal(fetched[1], want[1])

    # test serialization
    graph = ret[0].build_graph()
    from_pb = graph.from_pb(graph.to_pb())
    self.assertEqual(len(from_pb), len(graph))
    from_json = graph.from_json(graph.to_json())
    self.assertEqual(len(from_json), len(graph))

    # test fit a sklearn tree
    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(snn._tree)
    ret = nn.kneighbors(Y)

    fetched = ret.fetch()
    self.assertEqual(nn._fit_method, snn._fit_method)
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_almost_equal(fetched[1], want[1])

    # test serialization
    graph = ret[0].build_graph()
    from_pb = graph.from_pb(graph.to_pb())
    self.assertEqual(len(from_pb), len(graph))
    from_json = graph.from_json(graph.to_json())
    self.assertEqual(len(from_json), len(graph))
def test_nearest_neighbors_execution(setup):
    """End-to-end comparison of ``NearestNeighbors`` with ``SkNearestNeighbors``.

    Sections, in order: every algorithm/metric combination (including the
    query-less and ``return_distance=False`` variants), a callable metric,
    sparse CSR inputs, inputs produced by boolean filtering, and fitting
    directly on the reference estimator's ``_tree``.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5)
    raw_Y = rng.rand(8, 5)

    X = mt.tensor(raw_X, chunk_size=7)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    for algo in ["brute", "ball_tree", "kd_tree", "auto"]:
        for metric in ["minkowski", "manhattan"]:
            params = dict(n_neighbors=3, algorithm=algo, metric=metric)

            nn = NearestNeighbors(**params)
            nn.fit(X)
            ret = nn.kneighbors(Y)

            snn = SkNearestNeighbors(**params)
            snn.fit(raw_X)
            want = snn.kneighbors(raw_Y)

            fetched = [part.fetch() for part in ret]
            np.testing.assert_almost_equal(fetched[0], want[0])
            np.testing.assert_almost_equal(fetched[1], want[1])

            # when a tree was built, it must be the same type as the reference's
            if nn._tree is not None:
                assert isinstance(nn._tree.fetch(), type(snn._tree))

            # test return_distance=False
            ret = nn.kneighbors(Y, return_distance=False)
            fetched = ret.fetch()
            np.testing.assert_almost_equal(fetched, want[1])

            # test y is x
            ret = nn.kneighbors()
            want = snn.kneighbors()
            fetched = [part.fetch() for part in ret]
            np.testing.assert_almost_equal(fetched[0], want[0])
            np.testing.assert_almost_equal(fetched[1], want[1])

            # test y is x, and return_distance=False
            ret = nn.kneighbors(return_distance=False)
            fetched = ret.fetch()
            np.testing.assert_almost_equal(fetched, want[1])

    # test callable metric
    metric = lambda u, v: np.sqrt(((u - v)**2).sum())
    for algo in ["brute", "ball_tree"]:
        params = dict(n_neighbors=3, algorithm=algo, metric=metric)

        nn = NearestNeighbors(**params)
        nn.fit(X)
        ret = nn.kneighbors(Y)

        snn = SkNearestNeighbors(**params)
        snn.fit(raw_X)
        want = snn.kneighbors(raw_Y)

        fetched = [part.fetch() for part in ret]
        np.testing.assert_almost_equal(fetched[0], want[0])
        np.testing.assert_almost_equal(fetched[1], want[1])

    # test sparse
    raw_sparse_x = sps.random(10, 5, density=0.5, format="csr", random_state=rng)
    raw_sparse_y = sps.random(8, 5, density=0.4, format="csr", random_state=rng)

    X = mt.tensor(raw_sparse_x, chunk_size=7)
    Y = mt.tensor(raw_sparse_y, chunk_size=5)

    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(X)
    ret = nn.kneighbors(Y)

    snn = SkNearestNeighbors(n_neighbors=3)
    snn.fit(raw_sparse_x)
    want = snn.kneighbors(raw_sparse_y)

    fetched = [part.fetch() for part in ret]
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_almost_equal(fetched[1], want[1])

    # test input with unknown shape (rows selected by a boolean mask)
    dense_X = mt.tensor(raw_X, chunk_size=7)
    X = dense_X[dense_X[:, 0] > 0.1]
    dense_Y = mt.tensor(raw_Y, chunk_size=(5, 3))
    Y = dense_Y[dense_Y[:, 0] > 0.1]

    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(X)
    ret = nn.kneighbors(Y)

    x2 = raw_X[raw_X[:, 0] > 0.1]
    y2 = raw_Y[raw_Y[:, 0] > 0.1]
    snn = SkNearestNeighbors(n_neighbors=3)
    snn.fit(x2)
    want = snn.kneighbors(y2)

    fetched = ret.fetch()
    assert nn._fit_method == snn._fit_method
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_almost_equal(fetched[1], want[1])

    # test fit a sklearn tree
    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(snn._tree)
    ret = nn.kneighbors(Y)

    fetched = ret.fetch()
    assert nn._fit_method == snn._fit_method
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_almost_equal(fetched[1], want[1])
def testNearestNeighborsExecution(self):
    """Compare ``NearestNeighbors`` with ``SkNearestNeighbors`` on dense and sparse data.

    Sections, in order: every algorithm/metric combination (including the
    query-less and ``return_distance=False`` variants), a callable metric,
    and sparse CSR inputs.
    """
    rng = np.random.RandomState(0)
    raw_X = rng.rand(10, 5)
    raw_Y = rng.rand(8, 5)

    X = mt.tensor(raw_X, chunk_size=7)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    for algo in ['brute', 'ball_tree', 'kd_tree', 'auto']:
        for metric in ['minkowski', 'manhattan']:
            params = dict(n_neighbors=3, algorithm=algo, metric=metric)

            nn = NearestNeighbors(**params)
            nn.fit(X)
            ret = nn.kneighbors(Y)

            snn = SkNearestNeighbors(**params)
            snn.fit(raw_X)
            want = snn.kneighbors(raw_Y)

            fetched = [part.fetch() for part in ret]
            np.testing.assert_almost_equal(fetched[0], want[0])
            np.testing.assert_almost_equal(fetched[1], want[1])

            # test return_distance=False
            ret = nn.kneighbors(Y, return_distance=False)
            fetched = ret.fetch()
            np.testing.assert_almost_equal(fetched, want[1])

            # test y is x
            ret = nn.kneighbors()
            want = snn.kneighbors()
            fetched = [part.fetch() for part in ret]
            np.testing.assert_almost_equal(fetched[0], want[0])
            np.testing.assert_almost_equal(fetched[1], want[1])

            # test y is x, and return_distance=False
            ret = nn.kneighbors(return_distance=False)
            fetched = ret.fetch()
            np.testing.assert_almost_equal(fetched, want[1])

    # test callable metric
    metric = lambda u, v: np.sqrt(((u - v)**2).sum())
    for algo in ['brute', 'ball_tree']:
        params = dict(n_neighbors=3, algorithm=algo, metric=metric)

        nn = NearestNeighbors(**params)
        nn.fit(X)
        ret = nn.kneighbors(Y)

        snn = SkNearestNeighbors(**params)
        snn.fit(raw_X)
        want = snn.kneighbors(raw_Y)

        fetched = [part.fetch() for part in ret]
        np.testing.assert_almost_equal(fetched[0], want[0])
        np.testing.assert_almost_equal(fetched[1], want[1])

    # test sparse
    raw_sparse_x = sps.random(10, 5, density=0.5, format='csr', random_state=rng)
    raw_sparse_y = sps.random(8, 5, density=0.4, format='csr', random_state=rng)

    X = mt.tensor(raw_sparse_x, chunk_size=7)
    Y = mt.tensor(raw_sparse_y, chunk_size=5)

    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(X)
    ret = nn.kneighbors(Y)

    snn = SkNearestNeighbors(n_neighbors=3)
    snn.fit(raw_sparse_x)
    want = snn.kneighbors(raw_sparse_y)

    fetched = [part.fetch() for part in ret]
    np.testing.assert_almost_equal(fetched[0], want[0])
    np.testing.assert_almost_equal(fetched[1], want[1])