def test_local_run_script_with_data(setup_cluster):
    """Run ``script3`` with named inputs and check that bad ``data`` is rejected.

    The script is fed a tensor and a DataFrame through the ``data`` mapping
    and must report ``status == 'ok'``.  Passing a list instead of a mapping
    must raise ``TypeError``.
    """
    script = BytesIO(script3)
    inputs = {
        'tensor': mt.arange(10),
        'df': md.DataFrame({'s': mt.arange(9, 0, -1)}),
    }

    outcome = run_script(script, data=inputs, n_workers=1).fetch()
    assert outcome['status'] == 'ok'

    # ``data`` must be a mapping; a list is not accepted
    with pytest.raises(TypeError):
        run_script(script, data=[])
def test_store_tiledb_execution(setup):
    """Write tensors to TileDB with ``totiledb`` and verify what was stored.

    Four round trips are exercised, each in its own temporary directory:
    a chunked dense array, a dense array built from a 1-chunk tensor, a 2-d
    sparse array and a Fortran-ordered dense array (whose schema must report
    ``'col-major'`` cell order).
    """
    ctx = tiledb.Ctx()

    # TemporaryDirectory removes the directory on exit, also when an
    # assertion inside the block fails.
    with tempfile.TemporaryDirectory() as tempdir:
        # store TileDB dense array
        expected = np.random.rand(8, 4, 3)
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())

    with tempfile.TemporaryDirectory() as tempdir:
        # store tensor with 1 chunk to TileDB dense array
        a = arange(12)
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(np.arange(12), arr.read_direct())

    with tempfile.TemporaryDirectory() as tempdir:
        # store 2-d TileDB sparse array
        expected = sps.random(8, 7, density=0.1)
        a = tensor(expected, chunk_size=(3, 5))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr:
            data = arr[:, :]
            coords = data['coords']
            value = data[arr.attr(0).name]
            # one coordinate vector per dimension, in dimension order
            ij = tuple(coords[arr.domain.dim(k).name]
                       for k in range(arr.ndim))
            result = sps.coo_matrix((value, ij), shape=arr.shape)

            np.testing.assert_allclose(expected.toarray(), result.toarray())

    with tempfile.TemporaryDirectory() as tempdir:
        # store Fortran-ordered TileDB dense array
        expected = np.asfortranarray(np.random.rand(8, 4, 3))
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
            assert arr.schema.cell_order == 'col-major'
def test_arange():
    """Check tiling of ``arange`` and its argument validation."""
    # single positional argument: 10 elements split into chunks of 3
    plain = arange(10, chunk_size=3)
    assert plain.op.gpu is False

    tiled = tile(plain)
    assert tiled.shape == (10, )
    assert tiled.nsplits == ((3, 3, 3, 1), )
    second = tiled.chunks[1].op
    assert second.start == 3
    assert second.stop == 6

    # start/stop/step form: every chunk keeps the step of the whole tensor
    stepped = tile(arange(0, 10, 3, chunk_size=2))
    assert stepped.shape == (4, )
    assert stepped.nsplits == ((2, 2), )

    head = stepped.chunks[0].op
    assert head.start == 0
    assert head.stop == 6
    assert head.step == 3

    tail = stepped.chunks[1].op
    assert tail.start == 6
    assert tail.stop == 12
    assert tail.step == 3

    # invalid argument combinations
    with pytest.raises(TypeError):
        arange(10, start=0)
    with pytest.raises(TypeError):
        arange(0, 10, stop=0)
    with pytest.raises(TypeError):
        arange()
    with pytest.raises(ValueError):
        arange('1066-10-13', dtype=np.datetime64, chunks=3)
def testArange(self):
    """Check tiling of ``arange`` and its argument validation."""
    # single positional argument: 10 elements split into chunks of 3
    plain = arange(10, chunk_size=3)
    self.assertFalse(plain.op.gpu)

    tiled = plain.tiles()
    self.assertEqual(tiled.shape, (10, ))
    self.assertEqual(tiled.nsplits, ((3, 3, 3, 1), ))
    second = tiled.chunks[1].op
    self.assertEqual(second.start, 3)
    self.assertEqual(second.stop, 6)

    # start/stop/step form: every chunk keeps the step of the whole tensor
    stepped = arange(0, 10, 3, chunk_size=2).tiles()
    self.assertEqual(stepped.shape, (4, ))
    self.assertEqual(stepped.nsplits, ((2, 2), ))

    head = stepped.chunks[0].op
    self.assertEqual(head.start, 0)
    self.assertEqual(head.stop, 6)
    self.assertEqual(head.step, 3)

    tail = stepped.chunks[1].op
    self.assertEqual(tail.start, 6)
    self.assertEqual(tail.stop, 12)
    self.assertEqual(tail.step, 3)

    # invalid argument combinations
    with self.assertRaises(TypeError):
        arange(10, start=0)
    with self.assertRaises(TypeError):
        arange(0, 10, stop=0)
    with self.assertRaises(TypeError):
        arange()
    with self.assertRaises(ValueError):
        arange('1066-10-13', dtype=np.datetime64, chunks=3)
def transpose(row=100_000_000, col=10):
    """Time a matrix transpose with NumPy and with ``mt`` and print both.

    Args:
        row: Number of rows of the matrix that is transposed.
        col: Number of columns of the matrix that is transposed.

    Each timing is the elapsed nanosecond count divided by the module-level
    ``CN`` (presumably a unit-conversion factor; confirm at its definition).
    Only the transpose itself is timed, plus ``execute()`` for ``mt``;
    building the input is excluded.
    """
    a = np.arange(row * col)
    b = np.reshape(a, [row, col])
    # perf_counter_ns is monotonic, unlike the wall clock behind time_ns
    t1 = time.perf_counter_ns()
    d = b.T
    elapsed = time.perf_counter_ns() - t1
    print(
        f"Numpy Mat Transpose Time [{row}] x [{col}] => SHAPE {d.shape}, Time = {elapsed / CN}"
    )

    a = mt.arange(row * col)
    b = mt.reshape(a, [row, col])
    t1 = time.perf_counter_ns()
    d = b.T
    e = d.execute()
    elapsed = time.perf_counter_ns() - t1
    print(
        f"Mars Mat Transpose Time [{row}] x [{col}] => SHAPE {e.shape}, Time = {elapsed / CN}"
    )
def scalar_mul(row=100_000_000, col=2):
    """Time ``matrix * 2`` followed by a full sum with NumPy and with ``mt``.

    Args:
        row: Number of rows of the matrix.
        col: Number of columns of the matrix.

    Each timing is the elapsed nanosecond count divided by the module-level
    ``CN`` (presumably a unit-conversion factor; confirm at its definition).
    The multiplication and the reduction are timed (including ``execute()``
    for ``mt``); building the input is excluded.
    """
    a = np.arange(row * col)
    b = np.reshape(a, [row, col])
    # perf_counter_ns is monotonic, unlike the wall clock behind time_ns
    t1 = time.perf_counter_ns()
    d = b * 2
    total = d.sum()
    elapsed = time.perf_counter_ns() - t1
    print(
        f"Numpy Scalar Mul Time [{row}] x [{col}] => SUM {total}, Time = {elapsed / CN}"
    )

    a = mt.arange(row * col)
    b = mt.reshape(a, [row, col])
    t1 = time.perf_counter_ns()
    d = b * 2
    total = d.sum().execute()
    elapsed = time.perf_counter_ns() - t1
    print(
        f"Mars Scalar Mul Time [{row}] x [{col}] => SUM {total}, Time = {elapsed / CN}"
    )
def testDiffExecution(self):
    """Check that ``diff`` agrees with ``np.diff`` on 1-d, 2-d and datetime input."""

    def run(t):
        # run ``t`` through the test executor and take the concatenated result
        return self.executor.execute_tensor(t, concat=True)[0]

    # 1-d input: first and second order differences
    raw = np.array([1, 2, 4, 7, 0])
    x = tensor(raw, chunk_size=2)
    np.testing.assert_equal(run(diff(x)), np.diff(raw))
    np.testing.assert_equal(run(diff(x, n=2)), np.diff(raw, n=2))

    # 2-d input: default axis and axis=0
    raw = np.array([[1, 3, 6, 10], [0, 5, 6, 8]])
    x = tensor(raw, chunk_size=2)
    np.testing.assert_equal(run(diff(x)), np.diff(raw))
    np.testing.assert_equal(run(diff(x, axis=0)), np.diff(raw, axis=0))

    # datetime64 input
    x = mt.arange('1066-10-13', '1066-10-16', dtype=mt.datetime64)
    actual = run(diff(x))
    dates = np.arange('1066-10-13', '1066-10-16', dtype=np.datetime64)
    np.testing.assert_equal(actual, np.diff(dates))
def matmul(row=30_000, col=2):
    """Time a ``(row, col) @ (col, row)`` product plus sum with NumPy and ``mt``.

    Args:
        row: Number of rows of the left operand (and columns of the right).
        col: Number of columns of the left operand (and rows of the right).

    Each timing is the elapsed nanosecond count divided by the module-level
    ``CN`` (presumably a unit-conversion factor; confirm at its definition).
    The product and the reduction are timed (including ``execute()`` for
    ``mt``); building the operands is excluded.
    """
    a = np.arange(row * col)
    b = np.reshape(a, [row, col])
    c = np.reshape(a, [col, row])
    # perf_counter_ns is monotonic, unlike the wall clock behind time_ns
    t1 = time.perf_counter_ns()
    d = np.matmul(b, c)
    total = d.sum()
    elapsed = time.perf_counter_ns() - t1
    print(
        f"Numpy Mat Mul Time [{row}] x [{row}] => SUM {total}, Time = {elapsed / CN}"
    )

    a = mt.arange(row * col)
    b = mt.reshape(a, [row, col])
    c = mt.reshape(a, [col, row])
    t1 = time.perf_counter_ns()
    d = mt.matmul(b, c)
    total = d.sum().execute()
    elapsed = time.perf_counter_ns() - t1
    print(
        f"Mars Mat Mul Time [{row}] x [{row}] => SUM {total}, Time = {elapsed / CN}"
    )
def testDiag(self):
    """Check shapes, chunk splits and chunk keys produced by ``diag``."""
    square = np.arange(16).reshape(4, 4)
    rect = np.arange(24).reshape(4, 6)

    # test 2-d, shape[0] == shape[1], k == 0
    t = diag(tensor(square, chunk_size=2))
    self.assertEqual(t.shape, (4, ))
    self.assertFalse(t.op.gpu)
    t = t.tiles()
    self.assertEqual(t.nsplits, ((2, 2), ))

    t = diag(tensor(square, chunk_size=(2, 3)))
    self.assertEqual(t.shape, (4, ))
    t = t.tiles()
    self.assertEqual(t.nsplits, ((2, 1, 1), ))

    # test 1-d, k == 0
    t = diag(tensor(np.arange(3), chunk_size=2), sparse=True)
    self.assertEqual(t.shape, (3, 3))
    t = t.tiles()
    self.assertEqual(t.nsplits, ((2, 1), (2, 1)))
    diag_chunks = sum(
        1 for c in t.chunks if c.op.__class__.__name__ == 'TensorDiag')
    self.assertEqual(diag_chunks, 2)
    self.assertTrue(t.chunks[0].op.sparse)

    # test 2-d, shape[0] != shape[1]
    v = tensor(rect, chunk_size=2)
    t = diag(v)
    self.assertEqual(t.shape, np.diag(rect).shape)
    t = t.tiles()
    self.assertEqual(tuple(sum(s) for s in t.nsplits), t.shape)

    # off-diagonals above and below the main diagonal
    v = tensor(rect, chunk_size=2)
    for offset in (1, 2, -1, -2):
        t = diag(v, k=offset)
        self.assertEqual(t.shape, np.diag(rect, k=offset).shape)
        t = t.tiles()
        self.assertEqual(tuple(sum(s) for s in t.nsplits), t.shape)

    # test tiled zeros' keys
    t = diag(arange(5, chunk_size=2)).tiles()
    # chunks 1 and 2 have different shapes, so their op keys must differ
    self.assertNotEqual(t.chunks[1].op.key, t.chunks[2].op.key)
def testMainDataFrameWithoutEtcd(self):
    """Run a series of DataFrame workloads on a cluster started without etcd.

    Each section builds a pandas object, wraps it with ``md``, executes the
    operation through a session and compares the fetched result with the
    pandas result.  Covered: binary arithmetic with differing indices,
    ``sort_values``, plain series execution, ``reindex``, ``rebalance``,
    ``nunique``, re-executing a groupby/agg/sort chain and groupby sampling.
    """
    self.start_processes(
        etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    # test binary arithmetics with different indices
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = md.DataFrame(raw2, chunk_size=6)
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10),
                        index=np.arange(10),
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=(10, 5))
    raw2 = pd.DataFrame(np.random.rand(10, 10),
                        index=np.arange(11, 1, -1),
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=(10, 6))
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=6)
    r = df1 + df2
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1 + raw2)

    # test sort_values
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    raw1[0] = raw1[0].apply(str)
    raw1.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    df1 = md.DataFrame(raw1, chunk_size=5)
    r = df1.sort_values([('A', 'C')])
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, raw1.sort_values([('A', 'C')]))

    rs = np.random.RandomState(0)
    raw2 = pd.DataFrame({
        'a': rs.rand(10),
        'b': [f's{rs.randint(1000)}' for _ in range(10)]
    })
    raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
    mdf = md.DataFrame(raw2, chunk_size=4)
    filtered = mdf[mdf['a'] > 0.5]
    df2 = filtered.sort_values(by='b')
    result = df2.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = raw2[raw2['a'] > 0.5].sort_values(by='b')
    pd.testing.assert_frame_equal(result, expected)

    s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series1 = md.Series(s1, chunk_size=6)
    result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_series_equal(result, s1)

    # test reindex
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df3 = md.DataFrame(data, chunk_size=4)
    r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = data.reindex(index=np.arange(10, 1, -1))
    pd.testing.assert_frame_equal(result, expected)

    # test rebalance
    df4 = md.DataFrame(data)
    r = df4.rebalance()
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    pd.testing.assert_frame_equal(result, data)
    chunk_metas = sess.get_tileable_chunk_metas(r.key)
    # the rebalanced chunks are expected to be spread over two distinct workers
    workers = list(
        set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
    self.assertEqual(len(workers), 2)

    # test nunique
    data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df5 = md.DataFrame(data, chunk_size=4)
    r = df5.nunique()
    result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = data.nunique()
    pd.testing.assert_series_equal(result, expected)

    # test re-execute df.groupby().agg().sort_values()
    rs = np.random.RandomState(0)
    data = pd.DataFrame({
        'col1': rs.rand(100),
        'col2': rs.randint(10, size=100)
    })
    df6 = md.DataFrame(data, chunk_size=40)
    # first execute the aggregation alone, then build on the executed result
    grouped = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
        .execute(session=sess, timeout=self.timeout)
    r = grouped.sort_values(by='cnt').head().execute(session=sess,
                                                     timeout=self.timeout)
    result = r.fetch(session=sess)
    expected = data.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
        .sort_values(by='cnt').head()
    pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                  expected.reset_index(drop=True))

    # same chain executed in one go must give the same answer
    r2 = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}).sort_values(by='cnt').head() \
        .execute(session=sess, timeout=self.timeout)
    result = r2.fetch(session=sess)
    pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                  expected.reset_index(drop=True))

    # test groupby with sample
    src_data_list = []
    sample_count = 10
    for b in range(5):
        data_count = int(np.random.randint(40, 100))
        src_data_list.append(
            pd.DataFrame({
                'a': np.random.randint(0, 100, size=data_count),
                'b': np.array([b] * data_count),
                'c': np.random.randint(0, 100, size=data_count),
                'd': np.random.randint(0, 100, size=data_count),
            }))
    data = pd.concat(src_data_list)
    shuffle_idx = np.arange(len(data))
    np.random.shuffle(shuffle_idx)
    data = data.iloc[shuffle_idx].reset_index(drop=True)

    df7 = md.DataFrame(data, chunk_size=40)
    sampled = df7.groupby('b').sample(sample_count)
    r = sampled.execute(session=sess, timeout=self.timeout)
    result = r.fetch(session=sess)
    # every group must hold exactly ``sample_count`` rows in every column;
    # reduce over all columns instead of integer-indexing a label-indexed Series
    self.assertFalse(
        (result.groupby('b').count() - sample_count).any().any())
def testReindexExecution(self):
    """Compare ``reindex`` on ``md`` DataFrames and Series with the pandas result.

    Every case builds the same ``reindex`` call on the ``md`` object and on
    the backing pandas object, executes the former through
    ``self.executor.execute_dataframe`` and asserts that both results are
    equal.
    """
    data = pd.DataFrame(np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df = md.DataFrame(data, chunk_size=4)

    # index-only and columns-only reindex, for every ``enable_sparse`` setting
    for enable_sparse in [True, False, None]:
        r = df.reindex(index=mt.arange(10, 1, -1, chunk_size=3),
                       enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = data.reindex(index=np.arange(10, 1, -1))
        pd.testing.assert_frame_equal(result, expected)

        r = df.reindex(columns=['c5', 'c6', 'c2'],
                       enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = data.reindex(columns=['c5', 'c6', 'c2'])
        pd.testing.assert_frame_equal(result, expected)

    # NOTE(review): every case below passes ``enable_sparse=enable_sparse``,
    # so all of them are taken to belong to this loop body - confirm against
    # the original layout of this test.
    for enable_sparse in [True, False]:
        # index and columns given together
        r = df.reindex(index=[5, 11, 1], columns=['c5', 'c6', 'c2'],
                       enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = data.reindex(index=[5, 11, 1], columns=['c5', 'c6', 'c2'])
        pd.testing.assert_frame_equal(result, expected)

        # tensor index combined with a fill method
        r = df.reindex(index=mt.tensor([2, 4, 10]),
                       columns=['c2', 'c3', 'c5', 'c7'],
                       method='bfill',
                       enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = data.reindex(index=[2, 4, 10],
                                columns=['c2', 'c3', 'c5', 'c7'],
                                method='bfill')
        pd.testing.assert_frame_equal(result, expected)

        # fill_value given as a plain scalar and as a lazily computed value
        for fill_value, test_fill_value in \
                [(3, 3), (df.iloc[:, 0].max(), data.iloc[:, 0].max())]:
            r = df.reindex(index=mt.tensor([2, 4, 10]),
                           columns=['c2', 'c3', 'c5', 'c7'],
                           fill_value=fill_value,
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=[2, 4, 10],
                                    columns=['c2', 'c3', 'c5', 'c7'],
                                    fill_value=test_fill_value)
            pd.testing.assert_frame_equal(result, expected)

        # test date_range index
        data = pd.DataFrame(np.random.rand(10, 5),
                            index=pd.date_range('2020-1-1', periods=10))
        df = md.DataFrame(data, chunk_size=5)

        r = df.reindex(index=md.date_range('2020-1-6', periods=6),
                       method='ffill', enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = data.reindex(index=pd.date_range('2020-1-6', periods=6),
                                method='ffill')
        pd.testing.assert_frame_equal(result, expected)

        # test MultiIndex
        data = pd.DataFrame(np.random.rand(10, 5),
                            index=pd.MultiIndex.from_arrays(
                                [np.arange(10), np.arange(11, 1, -1)]))
        df = md.DataFrame(data, chunk_size=5)

        r = df.reindex([2, 4, 9, 12], level=1,
                       enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True, check_shape=False)[0]
        expected = data.reindex([2, 4, 9, 12], level=1)
        pd.testing.assert_frame_equal(result, expected)

        r = df.reindex(mt.tensor([2, 4, 9, 12], chunk_size=2), level=1,
                       enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True, check_shape=False)[0]
        expected = data.reindex([2, 4, 9, 12], level=1)
        pd.testing.assert_frame_equal(result, expected)

        # test duplicate index
        index = np.arange(10)
        index[-1] = 0  # label 0 now appears twice
        data = pd.DataFrame(np.random.rand(10, 5), index=index)
        df = md.DataFrame(data, chunk_size=5)

        with self.assertRaises(ValueError):
            r = df.reindex([0, 1], enable_sparse=enable_sparse)
            self.executor.execute_dataframe(r)

        # test one chunk
        data = pd.DataFrame(np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df = md.DataFrame(data, chunk_size=10)

        r = df.reindex(index=mt.arange(10, 1, -1, chunk_size=10),
                       fill_value=df['c1'].max(),
                       enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = data.reindex(index=np.arange(10, 1, -1),
                                fill_value=data['c1'].max())
        pd.testing.assert_frame_equal(result, expected)

        # test series
        s_data = pd.Series(np.random.rand(10),
                           index=[f'c{i + 1}' for i in range(10)])
        series = md.Series(s_data, chunk_size=6)

        r = series.reindex(['c2', 'c11', 'c4'], copy=False,
                           enable_sparse=enable_sparse)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_data.reindex(['c2', 'c11', 'c4'], copy=False)
        pd.testing.assert_series_equal(result, expected)
def testMainDataFrameWithoutEtcd(self):
    """Run DataFrame workloads on a cluster started without etcd.

    Covers binary arithmetic with differing indices, ``sort_values``,
    plain series execution and ``reindex``; every result is compared with
    the pandas result.
    """
    self.start_processes(
        etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    def compute(tileable):
        # execute in the session and bring the result back to the client
        executed = tileable.execute(session=sess, timeout=self.timeout)
        return executed.fetch(session=sess)

    # addition: default indices, different chunk sizes
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    left = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10))
    right = md.DataFrame(raw2, chunk_size=6)
    pd.testing.assert_frame_equal(compute(left + right), raw1 + raw2)

    # addition: differing row and column labels, split along columns only
    raw1 = pd.DataFrame(np.random.rand(10, 10),
                        index=np.arange(10),
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    left = md.DataFrame(raw1, chunk_size=(10, 5))
    raw2 = pd.DataFrame(np.random.rand(10, 10),
                        index=np.arange(11, 1, -1),
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    right = md.DataFrame(raw2, chunk_size=(10, 6))
    pd.testing.assert_frame_equal(compute(left + right), raw1 + raw2)

    # addition: unordered row labels, split along both axes
    raw1 = pd.DataFrame(np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    left = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    right = md.DataFrame(raw2, chunk_size=6)
    pd.testing.assert_frame_equal(compute(left + right), raw1 + raw2)

    # sort_values on a column holding strings
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    raw1[0] = raw1[0].apply(str)
    frame = md.DataFrame(raw1, chunk_size=5)
    pd.testing.assert_frame_equal(compute(frame.sort_values(0)),
                                  raw1.sort_values(0))

    # sort_values on an Arrow string column
    rs = np.random.RandomState(0)
    raw2 = pd.DataFrame({
        'a': rs.rand(10),
        'b': [f's{rs.randint(1000)}' for _ in range(10)]
    })
    raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
    frame = md.DataFrame(raw2, chunk_size=3)
    sorted_result = compute(frame.sort_values(by='b'))
    pd.testing.assert_frame_equal(sorted_result, raw2.sort_values(by='b'))

    # a series must come back unchanged
    s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    pd.testing.assert_series_equal(compute(md.Series(s1, chunk_size=6)), s1)

    # reindex with a tensor index
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    frame = md.DataFrame(data, chunk_size=4)
    reindexed = compute(
        frame.reindex(index=mt.arange(10, 1, -1, chunk_size=3)))
    pd.testing.assert_frame_equal(
        reindexed, data.reindex(index=np.arange(10, 1, -1)))
def testFromTensor(self):
    """Check DataFrame/Series construction from tensors and its validation.

    Verifies index/columns metadata before and after tiling for
    ``dataframe_from_tensor``, ``series_from_tensor`` and
    ``dataframe_from_1d_tensors``, then the errors raised for inputs of
    the wrong type or shape.
    """
    source = mt.random.rand(10, 10, chunk_size=5)
    df = dataframe_from_tensor(source)
    self.assertIsInstance(df.index_value._index_value, IndexValue.RangeIndex)
    self.assertEqual(
        df.op.dtypes[0], source.dtype,
        'DataFrame converted from tensor have the wrong dtype')

    df = df.tiles()
    self.assertEqual(len(df.chunks), 4)
    first_chunk = df.chunks[0]
    self.assertIsInstance(first_chunk.index_value._index_value,
                          IndexValue.RangeIndex)
    self.assertIsInstance(first_chunk.index_value, IndexValue)

    # test converted from 1-d tensor
    vector = mt.array([1, 2, 3])
    # in fact, column is (3,1)
    column = mt.array([vector]).T

    from_vector = dataframe_from_tensor(vector)
    from_column = dataframe_from_tensor(column)
    from_vector = from_vector.tiles()
    from_column = from_column.tiles()
    np.testing.assert_equal(from_vector.chunks[0].index, (0, 0))
    np.testing.assert_equal(from_column.chunks[0].index, (0, 0))

    # test converted from scalar
    scalar = mt.array(1)
    np.testing.assert_equal(scalar.ndim, 0)
    with self.assertRaises(TypeError):
        dataframe_from_tensor(scalar)

    # from tensor with given index
    df = dataframe_from_tensor(source, index=np.arange(0, 20, 2)).tiles()
    index_bounds = [(0, 10), (0, 10), (10, 20), (10, 20)]
    for pos, (start, stop) in enumerate(index_bounds):
        pd.testing.assert_index_equal(
            df.chunks[pos].index_value.to_pandas(),
            pd.Index(np.arange(start, stop, 2)))

    # from tensor with index that is a tensor as well
    df = dataframe_from_tensor(source, index=mt.arange(0, 20, 2)).tiles()
    self.assertEqual(len(df.chunks[0].inputs), 2)
    self.assertFalse(df.chunks[0].index_value.has_value())

    # from tensor with given columns
    df = dataframe_from_tensor(source, columns=list('abcdefghij')).tiles()
    pd.testing.assert_index_equal(df.dtypes.index,
                                  pd.Index(list('abcdefghij')))
    first_half = ['a', 'b', 'c', 'd', 'e']
    second_half = ['f', 'g', 'h', 'i', 'j']
    column_labels = [first_half, second_half, first_half, second_half]
    for pos, labels in enumerate(column_labels):
        chunk = df.chunks[pos]
        pd.testing.assert_index_equal(chunk.columns_value.to_pandas(),
                                      pd.Index(labels))
        pd.testing.assert_index_equal(chunk.dtypes.index, pd.Index(labels))

    # test series from tensor
    source = mt.random.rand(10, chunk_size=4)
    series = series_from_tensor(source, name='a')
    self.assertEqual(series.dtype, source.dtype)
    self.assertEqual(series.name, 'a')
    pd.testing.assert_index_equal(series.index_value.to_pandas(),
                                  pd.RangeIndex(10))

    series = series.tiles()
    self.assertEqual(len(series.chunks), 3)
    for pos, (start, stop) in enumerate([(0, 4), (4, 8), (8, 10)]):
        chunk = series.chunks[pos]
        pd.testing.assert_index_equal(chunk.index_value.to_pandas(),
                                      pd.RangeIndex(start, stop))
        self.assertEqual(chunk.name, 'a')

    # DataFrame from a list of 1-d tensors
    df = dataframe_from_1d_tensors(
        [mt.tensor(np.random.rand(4)),
         mt.tensor(np.random.rand(4))])
    pd.testing.assert_index_equal(df.columns_value.to_pandas(),
                                  pd.RangeIndex(2))

    df = df.tiles()
    pd.testing.assert_index_equal(df.chunks[0].index_value.to_pandas(),
                                  pd.RangeIndex(4))

    # series index: default, plain list and named pandas Index
    series = series_from_tensor(mt.random.rand(4))
    pd.testing.assert_index_equal(series.index_value.to_pandas(),
                                  pd.RangeIndex(4))

    series = series_from_tensor(mt.random.rand(4), index=[1, 2, 3])
    pd.testing.assert_index_equal(series.index_value.to_pandas(),
                                  pd.Index([1, 2, 3]))

    named_index = pd.Index([1, 2, 3], name='my_index')
    series = series_from_tensor(mt.random.rand(4), index=named_index)
    pd.testing.assert_index_equal(series.index_value.to_pandas(),
                                  pd.Index([1, 2, 3], name='my_index'))
    self.assertEqual(series.index_value.name, 'my_index')

    # a 2-d tensor cannot become a series
    with self.assertRaises(TypeError):
        series_from_tensor(mt.ones((10, 10)))

    # index has wrong shape
    with self.assertRaises(ValueError):
        dataframe_from_tensor(mt.random.rand(4, 3),
                              index=mt.random.rand(5))

    # columns have wrong shape
    with self.assertRaises(ValueError):
        dataframe_from_tensor(mt.random.rand(4, 3), columns=['a', 'b'])

    # index should be 1-d
    with self.assertRaises(ValueError):
        dataframe_from_tensor(mt.tensor(np.random.rand(3, 2)),
                              index=mt.tensor(np.random.rand(3, 2)))

    # 1-d tensors should have same shape
    with self.assertRaises(ValueError):
        dataframe_from_1d_tensors(
            [mt.tensor(np.random.rand(3)),
             mt.tensor(np.random.rand(2))])

    # index has wrong shape
    with self.assertRaises(ValueError):
        dataframe_from_1d_tensors([mt.tensor(np.random.rand(3))],
                                  index=mt.tensor(np.random.rand(2)))

    # columns have wrong shape
    with self.assertRaises(ValueError):
        dataframe_from_1d_tensors([mt.tensor(np.random.rand(3))],
                                  columns=['a', 'b'])

    # index should be 1-d
    with self.assertRaises(ValueError):
        series_from_tensor(mt.random.rand(4), index=mt.random.rand(4, 3))
def test_check_array(setup):
    """Exercise ``check_array`` on lists, tensors, sparse and array-like input.

    Covers rejection of sparse input, dimensionality checks, dtype/order/copy
    enforcement, conversion of non-tensor inputs, the ``dtype="numeric"``
    warnings for string and byte data, and rejection of NaN on execution.
    """
    # accept_sparse == False
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    pytest.raises(TypeError, check_array, X_csr)
    X_csr = mt.tensor(sp.csr_matrix(X))
    pytest.raises(TypeError, check_array, X_csr)

    # ensure_2d=False
    flat = check_array([0, 1, 2], ensure_2d=False)
    assert flat.ndim == 1

    # ensure_2d=True with 1d array
    msg_1d = 'Expected 2D array, got 1D array instead'
    assert_raise_message(ValueError, msg_1d,
                         check_array, [0, 1, 2], ensure_2d=True)
    assert_raise_message(ValueError, msg_1d,
                         check_array, mt.tensor([0, 1, 2]), ensure_2d=True)

    # ensure_2d=True with scalar array
    assert_raise_message(ValueError,
                         'Expected 2D array, got scalar array instead',
                         check_array, 10, ensure_2d=True)

    # a 3-d input is rejected unless allow_nd is set
    X_ndim = mt.arange(8).reshape(2, 2, 2)
    pytest.raises(ValueError, check_array, X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = mt.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(mt.int)
    X_float = X_C.astype(mt.float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [mt.int32, mt.int, mt.float, mt.float32, None, mt.bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy,
                                force_all_finite=False)

        # dtype=None means "keep the dtype of the input"
        wanted_dtype = X.dtype if dtype is None else dtype
        assert X_checked.dtype == wanted_dtype

        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']

        if copy:
            assert X is not X_checked
        elif (X.dtype == X_checked.dtype
              and X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS']
              and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']):
            # doesn't copy if it was already good
            assert X is X_checked

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, Tensor)

    # raise on too deep lists
    with pytest.raises(ValueError):
        check_array(X_ndim.to_numpy().tolist())
    check_array(X_ndim.to_numpy().tolist(), allow_nd=True)  # doesn't raise

    # convert weird stuff to arrays
    X_no_array = NotAnArray(X_dense.to_numpy())
    converted = check_array(X_no_array)
    assert isinstance(converted, Tensor)

    # deprecation warning if string-like array with dtype="numeric"
    expected_warn_regex = r"converted to decimal numbers if dtype='numeric'"
    X_str = [['11', '12'], ['13', 'xx']]
    string_inputs = [X_str,
                     mt.array(X_str, dtype='U'),
                     mt.array(X_str, dtype='S')]
    for X in string_inputs:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")

    # deprecation warning if byte-like array with dtype="numeric"
    X_bytes = [[b'a', b'b'], [b'c', b'd']]
    byte_inputs = [X_bytes, mt.array(X_bytes, dtype='V1')]
    for X in byte_inputs:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")

    # test finite
    X = [[1.0, np.nan], [2.0, 3.0]]
    with pytest.raises(ValueError):
        _ = check_array(X).execute()
def test_check_array(self):
    """Exercise ``check_array`` on lists, tensors, sparse and array-like input.

    Covers rejection of sparse input, dimensionality checks, dtype/order/copy
    enforcement, conversion of non-tensor inputs and the ``dtype="numeric"``
    warnings for string and byte data.
    """
    # accept_sparse == False
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    self.assertRaises(TypeError, check_array, X_csr)
    X_csr = mt.tensor(sp.csr_matrix(X))
    self.assertRaises(TypeError, check_array, X_csr)

    # ensure_2d=False
    flat = check_array([0, 1, 2], ensure_2d=False)
    self.assertEqual(flat.ndim, 1)

    # ensure_2d=True with 1d array
    msg_1d = 'Expected 2D array, got 1D array instead'
    assert_raise_message(ValueError, msg_1d,
                         check_array, [0, 1, 2], ensure_2d=True)
    assert_raise_message(ValueError, msg_1d,
                         check_array, mt.tensor([0, 1, 2]), ensure_2d=True)

    # ensure_2d=True with scalar array
    assert_raise_message(ValueError,
                         'Expected 2D array, got scalar array instead',
                         check_array, 10, ensure_2d=True)

    # a 3-d input is rejected unless allow_nd is set
    X_ndim = mt.arange(8).reshape(2, 2, 2)
    self.assertRaises(ValueError, check_array, X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = mt.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(mt.int)
    X_float = X_C.astype(mt.float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [
        mt.int32, mt.int, mt.float, mt.float32, None, mt.bool, object
    ]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)

        # dtype=None means "keep the dtype of the input"
        wanted_dtype = X.dtype if dtype is None else dtype
        self.assertEqual(X_checked.dtype, wanted_dtype)

        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']

        if copy:
            assert X is not X_checked
        elif (X.dtype == X_checked.dtype
              and X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS']
              and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']):
            # doesn't copy if it was already good
            assert X is X_checked

    # TODO: the "allowed sparse != None" checks are not exercised here.
    # The disabled version iterated csc/coo/dok inputs (plus int and float
    # casts) against accept_sparse values ['csr', 'coo'] and ['coo', 'dok'],
    # checking emitted warnings, the resulting dtype and format, and the
    # copy semantics.

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, Tensor)

    # raise on too deep lists
    with self.assertRaises(ValueError):
        check_array(X_ndim.execute().tolist())
    check_array(X_ndim.execute().tolist(), allow_nd=True)  # doesn't raise

    # convert weird stuff to arrays
    X_no_array = NotAnArray(X_dense.execute())
    converted = check_array(X_no_array)
    assert isinstance(converted, Tensor)

    # deprecation warning if string-like array with dtype="numeric"
    expected_warn_regex = r"converted to decimal numbers if dtype='numeric'"
    X_str = [['11', '12'], ['13', 'xx']]
    string_inputs = [
        X_str,
        mt.array(X_str, dtype='U'),
        mt.array(X_str, dtype='S'),
    ]
    for X in string_inputs:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")

    # deprecation warning if byte-like array with dtype="numeric"
    X_bytes = [[b'a', b'b'], [b'c', b'd']]
    byte_inputs = [X_bytes, mt.array(X_bytes, dtype='V1')]
    for X in byte_inputs:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")
def test_diag():
    """Check shapes, chunk splits and chunk keys of tensors built by ``diag``."""
    # 2-d square input, default diagonal, evenly chunked
    square = tensor(np.arange(16).reshape(4, 4), chunk_size=2)
    diagonal = diag(square)
    assert diagonal.shape == (4, )
    assert diagonal.op.gpu is False
    diagonal = tile(diagonal)
    assert diagonal.nsplits == ((2, 2), )

    # 2-d square input, default diagonal, unevenly chunked
    square = tensor(np.arange(16).reshape(4, 4), chunk_size=(2, 3))
    diagonal = diag(square)
    assert diagonal.shape == (4, )
    diagonal = tile(diagonal)
    assert diagonal.nsplits == ((2, 1, 1), )

    # 1-d input, default diagonal, sparse output
    vector = tensor(np.arange(3), chunk_size=2)
    built = diag(vector, sparse=True)
    assert built.shape == (3, 3)
    built = tile(built)
    assert built.nsplits == ((2, 1), (2, 1))
    diag_chunk_count = sum(
        1 for c in built.chunks if c.op.__class__.__name__ == 'TensorDiag')
    assert diag_chunk_count == 2
    assert built.chunks[0].op.sparse is True

    # 2-d non-square input, default diagonal: shape must agree with numpy
    rect = tensor(np.arange(24).reshape(4, 6), chunk_size=2)
    extracted = diag(rect)
    assert extracted.shape == np.diag(np.arange(24).reshape(4, 6)).shape
    extracted = tile(extracted)
    assert tuple(sum(s) for s in extracted.nsplits) == extracted.shape

    # 2-d non-square input with off-main diagonals on either side
    rect = tensor(np.arange(24).reshape(4, 6), chunk_size=2)
    for offset in (1, 2, -1, -2):
        extracted = diag(rect, k=offset)
        expected_shape = np.diag(np.arange(24).reshape(4, 6), k=offset).shape
        assert extracted.shape == expected_shape
        extracted = tile(extracted)
        assert tuple(sum(s) for s in extracted.nsplits) == extracted.shape

    # keys of the tiled fill chunks: chunks 1 and 2 have different shapes,
    # so their op keys must not collide
    source = arange(5, chunk_size=2)
    tiled = tile(diag(source))
    assert tiled.chunks[1].op.key != tiled.chunks[2].op.key
def testMainDataFrameWithoutEtcd(self):
    """Run several DataFrame/Series operations on a cluster started with ``etcd=False``.

    Each result is executed and fetched through one session and compared
    against the equivalent pandas computation.
    """
    self.start_processes(etcd=False,
                         scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    def run(tileable):
        # execute on the cluster and pull the concrete result back
        executed = tileable.execute(session=sess, timeout=self.timeout)
        return executed.fetch(session=sess)

    # test binary arithmetics with different indices
    arithmetic_cases = [
        ({}, 5, {}, 6),
        (
            {'index': np.arange(10),
             'columns': [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]},
            (10, 5),
            {'index': np.arange(11, 1, -1),
             'columns': [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]},
            (10, 6),
        ),
        (
            {'index': [0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
             'columns': [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]},
            5,
            {'index': [11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
             'columns': [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]},
            6,
        ),
    ]
    for left_kw, left_chunks, right_kw, right_chunks in arithmetic_cases:
        left_raw = pd.DataFrame(np.random.rand(10, 10), **left_kw)
        left = md.DataFrame(left_raw, chunk_size=left_chunks)
        right_raw = pd.DataFrame(np.random.rand(10, 10), **right_kw)
        right = md.DataFrame(right_raw, chunk_size=right_chunks)
        total = run(left + right)
        pd.testing.assert_frame_equal(total, left_raw + right_raw)

    # test sort_values on a multi-index column whose values are strings
    multi_raw = pd.DataFrame(np.random.rand(10, 10))
    multi_raw[0] = multi_raw[0].apply(str)
    multi_raw.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    multi_df = md.DataFrame(multi_raw, chunk_size=5)
    ordered = run(multi_df.sort_values([('A', 'C')]))
    pd.testing.assert_frame_equal(ordered, multi_raw.sort_values([('A', 'C')]))

    # test sort_values after filtering, on an Arrow string column
    rs = np.random.RandomState(0)
    col_a = rs.rand(10)
    col_b = [f's{rs.randint(1000)}' for _ in range(10)]
    arrow_raw = pd.DataFrame({'a': col_a, 'b': col_b})
    arrow_raw['b'] = arrow_raw['b'].astype(md.ArrowStringDtype())
    arrow_df = md.DataFrame(arrow_raw, chunk_size=4)
    kept = arrow_df[arrow_df['a'] > 0.5]
    ordered = run(kept.sort_values(by='b'))
    expected = arrow_raw[arrow_raw['a'] > 0.5].sort_values(by='b')
    pd.testing.assert_frame_equal(ordered, expected)

    # plain series round trip
    series_raw = pd.Series(np.random.rand(10),
                           index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    fetched = run(md.Series(series_raw, chunk_size=6))
    pd.testing.assert_series_equal(fetched, series_raw)

    # test reindex
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    reindexed = md.DataFrame(data, chunk_size=4).reindex(
        index=mt.arange(10, 1, -1, chunk_size=3))
    fetched = run(reindexed)
    expected = data.reindex(index=np.arange(10, 1, -1))
    pd.testing.assert_frame_equal(fetched, expected)

    # test rebalance
    rebalanced = md.DataFrame(data).rebalance()
    fetched = run(rebalanced)
    pd.testing.assert_frame_equal(fetched, data)
    chunk_metas = sess.get_tileable_chunk_metas(rebalanced.key)
    # distinct workers holding any chunk of the rebalanced frame
    workers = {w for meta in chunk_metas.values() for w in meta.workers}
    self.assertEqual(len(workers), 2)

    # test nunique
    data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    fetched = run(md.DataFrame(data, chunk_size=4).nunique())
    expected = data.nunique()
    pd.testing.assert_series_equal(fetched, expected)
def test_from_tensor():
    """Check metadata of DataFrames and Series built from tensors.

    Covers ``dataframe_from_tensor``, ``series_from_tensor`` and
    ``dataframe_from_1d_tileables``, plus the argument combinations each of
    them is expected to reject.
    """
    check_index = pd.testing.assert_index_equal

    source = mt.random.rand(10, 10, chunk_size=5)
    frame = dataframe_from_tensor(source)
    assert isinstance(frame.index_value._index_value, IndexValue.RangeIndex)
    assert frame.op.dtypes[0] == source.dtype

    frame = tile(frame)
    assert len(frame.chunks) == 4
    assert isinstance(frame.chunks[0].index_value._index_value,
                      IndexValue.RangeIndex)
    assert isinstance(frame.chunks[0].index_value, IndexValue)

    # test converted from 1-d tensor
    flat = mt.array([1, 2, 3])
    # in fact, `column` is (3,1)
    column = mt.array([flat]).T
    frames = [dataframe_from_tensor(t) for t in (flat, column)]
    frames = [tile(f) for f in frames]
    for tiled in frames:
        np.testing.assert_equal(tiled.chunks[0].index, (0, 0))

    # test converted from scalar
    scalar = mt.array(1)
    np.testing.assert_equal(scalar.ndim, 0)
    with pytest.raises(TypeError):
        dataframe_from_tensor(scalar)

    # from tensor with given index
    frame = tile(dataframe_from_tensor(source, index=np.arange(0, 20, 2)))
    for pos, start in enumerate((0, 0, 10, 10)):
        check_index(frame.chunks[pos].index_value.to_pandas(),
                    pd.Index(np.arange(start, start + 10, 2)))

    # from tensor with index that is a tensor as well
    frame = tile(dataframe_from_tensor(source, index=mt.arange(0, 20, 2)))
    assert len(frame.chunks[0].inputs) == 2
    assert frame.chunks[0].index_value.has_value() is False

    # from tensor with given columns
    frame = tile(dataframe_from_tensor(source, columns=list('abcdefghij')))
    check_index(frame.dtypes.index, pd.Index(list('abcdefghij')))
    column_halves = (pd.Index(['a', 'b', 'c', 'd', 'e']),
                     pd.Index(['f', 'g', 'h', 'i', 'j']))
    for pos in range(4):
        # even chunks hold the first five columns, odd chunks the last five
        wanted = column_halves[pos % 2]
        check_index(frame.chunks[pos].columns_value.to_pandas(), wanted)
        check_index(frame.chunks[pos].dtypes.index, wanted)

    # test series from tensor
    vec = mt.random.rand(10, chunk_size=4)
    ser = series_from_tensor(vec, name='a')
    assert ser.dtype == vec.dtype
    assert ser.name == 'a'
    check_index(ser.index_value.to_pandas(), pd.RangeIndex(10))

    ser = tile(ser)
    assert len(ser.chunks) == 3
    for pos, (lo, hi) in enumerate(((0, 4), (4, 8), (8, 10))):
        check_index(ser.chunks[pos].index_value.to_pandas(),
                    pd.RangeIndex(lo, hi))
        assert ser.chunks[pos].name == 'a'

    # dataframe assembled from a mapping of 1-d tensors
    columns_map = OrderedDict(
        (key, mt.tensor(np.random.rand(4))) for key in (0, 1))
    frame = dataframe_from_1d_tileables(columns_map)
    check_index(frame.columns_value.to_pandas(), pd.RangeIndex(2))

    frame = tile(frame)
    check_index(frame.chunks[0].index_value.to_pandas(), pd.RangeIndex(4))

    # series index: default, plain list, and named pandas index
    ser = series_from_tensor(mt.random.rand(4))
    check_index(ser.index_value.to_pandas(), pd.RangeIndex(4))

    ser = series_from_tensor(mt.random.rand(4), index=[1, 2, 3])
    check_index(ser.index_value.to_pandas(), pd.Index([1, 2, 3]))

    ser = series_from_tensor(mt.random.rand(4),
                             index=pd.Index([1, 2, 3], name='my_index'))
    check_index(ser.index_value.to_pandas(),
                pd.Index([1, 2, 3], name='my_index'))
    assert ser.index_value.name == 'my_index'

    # every call below must be rejected with the paired exception type;
    # arguments are built lazily so they are created inside the raises block
    rejected_calls = [
        # series source must not be 2-d
        (TypeError, lambda: series_from_tensor(mt.ones((10, 10)))),
        # index has wrong shape
        (ValueError, lambda: dataframe_from_tensor(
            mt.random.rand(4, 3), index=mt.random.rand(5))),
        # columns have wrong shape
        (ValueError, lambda: dataframe_from_tensor(
            mt.random.rand(4, 3), columns=['a', 'b'])),
        # index should be 1-d
        (ValueError, lambda: dataframe_from_tensor(
            mt.tensor(np.random.rand(3, 2)),
            index=mt.tensor(np.random.rand(3, 2)))),
        # 1-d tensors should have same shape
        (ValueError, lambda: dataframe_from_1d_tileables(
            OrderedDict([(0, mt.tensor(np.random.rand(3))),
                         (1, mt.tensor(np.random.rand(2)))]))),
        # index has wrong shape
        (ValueError, lambda: dataframe_from_1d_tileables(
            {0: mt.tensor(np.random.rand(3))},
            index=mt.tensor(np.random.rand(2)))),
        # columns have wrong shape
        (ValueError, lambda: dataframe_from_1d_tileables(
            {0: mt.tensor(np.random.rand(3))}, columns=['a', 'b'])),
        # index should be 1-d
        (ValueError, lambda: series_from_tensor(
            mt.random.rand(4), index=mt.random.rand(4, 3))),
    ]
    for exc_type, call in rejected_calls:
        with pytest.raises(exc_type):
            call()