def test_input_tileable(setup): def f(t, x): return (t * x).sum().to_numpy() rs = np.random.RandomState(0) raw = rs.rand(5, 4) t1 = mt.tensor(raw, chunk_size=3) t2 = t1.sum(axis=0) s = spawn(f, args=(t2, 3)) result = s.execute().fetch() expected = (raw.sum(axis=0) * 3).sum() assert pytest.approx(result) == expected df1 = md.DataFrame(raw, chunk_size=3) df1.execute() df2 = shuffle(df1) df2.execute() def f2(input_df): bonus = input_df.iloc[:, 0].fetch().sum() return input_df.sum().to_pandas() + bonus for df in [df1, df2]: s = spawn(f2, args=(df,)) result = s.execute().fetch() expected = pd.DataFrame(raw).sum() + raw[:, 0].sum() pd.testing.assert_series_equal(result, expected)
def testInputTileable(self): def f(t, x): return (t * x).sum().to_numpy() rs = np.random.RandomState(0) raw = rs.rand(5, 4) t1 = mt.tensor(raw, chunk_size=3) t2 = t1.sum(axis=0) s = spawn(f, args=(t2, 3)) sess = new_session() sess._sess._executor = ExecutorForTest('numpy', storage=sess._context) result = s.execute(session=sess).fetch(session=sess) expected = (raw.sum(axis=0) * 3).sum() self.assertAlmostEqual(result, expected) df1 = md.DataFrame(raw, chunk_size=3) df1.execute(session=sess) df2 = shuffle(df1) df2.execute(session=sess) def f2(input_df): bonus = input_df.iloc[:, 0].fetch().sum() return input_df.sum().to_pandas() + bonus for df in [df1, df2]: s = spawn(f2, args=(df, )) result = s.execute(session=sess).fetch(session=sess) expected = pd.DataFrame(raw).sum() + raw[:, 0].sum() pd.testing.assert_series_equal(result, expected)
def testShuffleExpr(self): a = mt.random.rand(10, 3, chunk_size=2) b = md.DataFrame(mt.random.rand(10, 5), chunk_size=2) new_a, new_b = shuffle(a, b, random_state=0) self.assertIs(new_a.op, new_b.op) self.assertIsInstance(new_a.op, LearnShuffle) self.assertEqual(new_a.shape, a.shape) self.assertEqual(new_b.shape, b.shape) self.assertNotEqual(b.index_value.key, new_b.index_value.key) new_a.tiles() self.assertEqual(len(new_a.chunks), 10) self.assertTrue(np.isnan(new_a.chunks[0].shape[0])) self.assertEqual(len(new_b.chunks), 15) self.assertTrue(np.isnan(new_b.chunks[0].shape[0])) self.assertNotEqual(new_b.chunks[0].index_value.key, new_b.chunks[1].index_value.key) self.assertEqual(new_a.chunks[0].op.seeds, new_b.chunks[0].op.seeds) c = mt.random.rand(10, 5, 3, chunk_size=2) d = md.DataFrame(mt.random.rand(10, 5), chunk_size=(2, 5)) new_c, new_d = shuffle(c, d, axes=(0, 1), random_state=0) self.assertIs(new_c.op, new_d.op) self.assertIsInstance(new_c.op, LearnShuffle) self.assertEqual(new_c.shape, c.shape) self.assertEqual(new_d.shape, d.shape) self.assertNotEqual(d.index_value.key, new_d.index_value.key) self.assertFalse( np.all(new_d.dtypes.index[:-1] < new_d.dtypes.index[1:])) pd.testing.assert_series_equal(d.dtypes, new_d.dtypes.sort_index()) new_c.tiles() self.assertEqual(len(new_c.chunks), 5 * 1 * 2) self.assertTrue(np.isnan(new_c.chunks[0].shape[0])) self.assertEqual(len(new_d.chunks), 5) self.assertTrue(np.isnan(new_d.chunks[0].shape[0])) self.assertEqual(new_d.chunks[0].shape[1], 5) self.assertNotEqual(new_d.chunks[0].index_value.key, new_d.chunks[1].index_value.key) pd.testing.assert_series_equal(new_d.chunks[0].dtypes.sort_index(), d.dtypes) self.assertEqual(new_c.chunks[0].op.seeds, new_d.chunks[0].op.seeds) self.assertEqual(len(new_c.chunks[0].op.seeds), 1) self.assertEqual(new_c.chunks[0].op.reduce_sizes, (5, )) with self.assertRaises(ValueError): a = mt.random.rand(10, 5) b = mt.random.rand(10, 4, 3) shuffle(a, b, axes=1) with self.assertRaises(TypeError): shuffle(a, b, unknown_param=True) self.assertIsInstance(shuffle(mt.random.rand(10, 5)), mt.Tensor)
def test_shuffle_expr(): a = mt.random.rand(10, 3, chunk_size=2) b = md.DataFrame(mt.random.rand(10, 5), chunk_size=2) new_a, new_b = shuffle(a, b, random_state=0) assert new_a.op is new_b.op assert isinstance(new_a.op, LearnShuffle) assert new_a.shape == a.shape assert new_b.shape == b.shape assert b.index_value.key != new_b.index_value.key new_a, new_b = tile(new_a, new_b) assert len(new_a.chunks) == 10 assert np.isnan(new_a.chunks[0].shape[0]) assert len(new_b.chunks) == 15 assert np.isnan(new_b.chunks[0].shape[0]) assert new_b.chunks[0].index_value.key != new_b.chunks[1].index_value.key assert new_a.chunks[0].op.seeds == new_b.chunks[0].op.seeds c = mt.random.rand(10, 5, 3, chunk_size=2) d = md.DataFrame(mt.random.rand(10, 5), chunk_size=(2, 5)) new_c, new_d = shuffle(c, d, axes=(0, 1), random_state=0) assert new_c.op is new_d.op assert isinstance(new_c.op, LearnShuffle) assert new_c.shape == c.shape assert new_d.shape == d.shape assert d.index_value.key != new_d.index_value.key assert not np.all(new_d.dtypes.index[:-1] < new_d.dtypes.index[1:]) pd.testing.assert_series_equal(d.dtypes, new_d.dtypes.sort_index()) new_c, new_d = tile(new_c, new_d) assert len(new_c.chunks) == 5 * 1 * 2 assert np.isnan(new_c.chunks[0].shape[0]) assert len(new_d.chunks) == 5 assert np.isnan(new_d.chunks[0].shape[0]) assert new_d.chunks[0].shape[1] == 5 assert new_d.chunks[0].index_value.key != new_d.chunks[1].index_value.key pd.testing.assert_series_equal(new_d.chunks[0].dtypes.sort_index(), d.dtypes) assert new_c.chunks[0].op.seeds == new_d.chunks[0].op.seeds assert len(new_c.chunks[0].op.seeds) == 1 assert new_c.chunks[0].op.reduce_sizes == (5, ) with pytest.raises(ValueError): a = mt.random.rand(10, 5) b = mt.random.rand(10, 4, 3) shuffle(a, b, axes=1) with pytest.raises(TypeError): shuffle(a, b, unknown_param=True) assert isinstance(shuffle(mt.random.rand(10, 5)), mt.Tensor)
def test_shuffle_execution(setup): # test consistency s1 = np.arange(9).reshape(3, 3) s2 = np.arange(1, 10).reshape(3, 3) ts1 = mt.array(s1, chunk_size=2) ts2 = mt.array(s2, chunk_size=2) ret = shuffle(ts1, ts2, axes=[0, 1], random_state=0) res1, res2 = ret.execute().fetch() # calc row index s1_col_0 = s1[:, 0].tolist() rs1_col_0 = [ res1[:, i] for i in range(3) if set(s1_col_0) == set(res1[:, i]) ][0] row_index = [s1_col_0.index(j) for j in rs1_col_0] # calc col index s1_row_0 = s1[0].tolist() rs1_row_0 = [res1[i] for i in range(3) if set(s1_row_0) == set(res1[i])][0] col_index = [s1_row_0.index(j) for j in rs1_row_0] np.testing.assert_array_equal(res2, s2[row_index][:, col_index]) # tensor + tensor raw1 = np.random.rand(10, 15, 20) t1 = mt.array(raw1, chunk_size=8) raw2 = np.random.rand(10, 15, 20) t2 = mt.array(raw2, chunk_size=5) for axes in [(0, ), (0, 1), (0, 2), (1, 2), (0, 1, 2)]: ret = shuffle(t1, t2, axes=axes, random_state=0) res1, res2 = ret.execute().fetch() assert res1.shape == raw1.shape assert res2.shape == raw2.shape np.testing.assert_array_equal(_sort(raw1, axes), _sort(res1, axes)) np.testing.assert_array_equal(_sort(raw2, axes), _sort(res2, axes)) # tensor + tensor(more dimension) raw3 = np.random.rand(10, 15) t3 = mt.array(raw3, chunk_size=(8, 15)) raw4 = np.random.rand(10, 15, 20) t4 = mt.array(raw4, chunk_size=(5, 15, 10)) for axes in [(1, ), (0, 1), (1, 2)]: ret = shuffle(t3, t4, axes=axes, random_state=0) res3, res4 = ret.execute().fetch() assert res3.shape == raw3.shape assert res4.shape == raw4.shape np.testing.assert_array_equal(_sort(raw3, axes), _sort(res3, axes)) np.testing.assert_array_equal(_sort(raw4, axes), _sort(res4, axes)) # tensor + dataframe + series raw5 = np.random.rand(10, 15, 20) t5 = mt.array(raw5, chunk_size=8) raw6 = pd.DataFrame(np.random.rand(10, 15)) df = md.DataFrame(raw6, chunk_size=(8, 15)) raw7 = pd.Series(np.random.rand(10)) series = md.Series(raw7, chunk_size=8) for axes in [(0, ), (1, ), (0, 1), (1, 2), [0, 1, 2]]: ret = shuffle(t5, df, series, axes=axes, random_state=0) # skip check nsplits because it's updated res5, res_df, res_series = ret.execute(extra_config={ 'check_nsplits': False }).fetch(extra_config={'check_nsplits': False}) assert res5.shape == raw5.shape assert res_df.shape == df.shape assert res_series.shape == series.shape