def testBuildSplitIdxToOriginIdx(self):
    """Map flattened split positions back to (origin chunk, inner split) pairs."""
    # Two origin chunks split into 2 + 1 pieces, origins stored ascending.
    ascending_splits = [[(1, False, 2, False), (2, True, 3, True)],
                        [(5, False, 6, True)]]
    mapping = build_split_idx_to_origin_idx(ascending_splits)
    self.assertEqual({0: (0, 0), 1: (0, 1), 2: (1, 0)}, mapping)

    # Same pieces but origins stored in descending order: with
    # increase=False the flattened positions walk the origin list backwards.
    descending_splits = [[(5, False, 6, True)],
                         [(1, False, 2, False), (2, True, 3, True)]]
    mapping = build_split_idx_to_origin_idx(descending_splits, increase=False)
    self.assertEqual({0: (1, 0), 1: (1, 1), 2: (0, 0)}, mapping)
def test_build_split_idx_to_origin_idx():
    """Flat split index maps to (origin chunk index, inner split index)."""
    cases = [
        # (splits, increase flag, expected mapping)
        ([[(1, False, 2, False), (2, True, 3, True)], [(5, False, 6, True)]],
         True, {0: (0, 0), 1: (0, 1), 2: (1, 0)}),
        # descending origins: mapping enumerates origin chunks backwards
        ([[(5, False, 6, True)], [(1, False, 2, False), (2, True, 3, True)]],
         False, {0: (1, 0), 1: (1, 1), 2: (0, 0)}),
    ]
    for splits, increase, expected in cases:
        assert build_split_idx_to_origin_idx(splits, increase=increase) == expected
def testAddWithoutShuffle(self):
    """Align-and-add when both axes of both frames are monotonic.

    With monotonic row indexes and columns, tiling aligns the two frames by
    pure range splitting: every result chunk consumes one
    ``DataFrameIndexAlignMap`` chunk from each side and no shuffle is built.
    """
    # all the axes are monotonic
    # data1 with index split into [0...4], [5...9],
    # columns [3...7], [8...12]
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=np.arange(3, 13))
    df1 = from_pandas(data1, chunk_size=5)
    # data2 with index split into [6...11], [2, 5],
    # columns [4...9], [10, 13]
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=np.arange(4, 14))
    df2 = from_pandas(data2, chunk_size=6)

    df3 = add(df1, df2)

    # test df3's index and columns
    pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
    self.assertTrue(df3.columns.should_be_monotonic)
    self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
    self.assertTrue(df3.index_value.should_be_monotonic)
    # index contents are unknown before execution; only the type is recorded
    pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
    self.assertNotEqual(df3.index_value.key, df1.index_value.key)
    self.assertNotEqual(df3.index_value.key, df2.index_value.key)
    self.assertEqual(df3.shape[1], 11)  # columns is recorded, so we can get it

    df3.tiles()

    # test df3's index and columns after tiling
    pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
    self.assertTrue(df3.columns.should_be_monotonic)
    self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
    self.assertTrue(df3.index_value.should_be_monotonic)
    pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
    self.assertNotEqual(df3.index_value.key, df1.index_value.key)
    self.assertNotEqual(df3.index_value.key, df2.index_value.key)
    self.assertEqual(df3.shape[1], 11)  # columns is recorded, so we can get it

    # expected min/max ranges of each chunk along both axes
    data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
    data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]]
    data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]
    data2_columns_min_max = [(4, True, 9, True), (10, True, 13, True)]

    left_index_splits, right_index_splits = split_monotonic_index_min_max(
        data1_index_min_max, True, data2_index_min_max, False)
    left_columns_splits, right_columns_splits = split_monotonic_index_min_max(
        data1_columns_min_max, True, data2_columns_min_max, True)

    left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits)
    right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits, False)
    left_columns_idx_to_original_idx = build_split_idx_to_origin_idx(left_columns_splits)
    right_columns_idx_to_original_idx = build_split_idx_to_origin_idx(right_columns_splits)

    self.assertEqual(df3.chunk_shape, (7, 7))
    for c in df3.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        # test shape
        idx = c.index

        # test the left side
        self.assertIsInstance(c.inputs[0].op, DataFrameIndexAlignMap)
        left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]]
        left_col_idx, left_col_inner_idx = left_columns_idx_to_original_idx[idx[1]]
        expect_df1_input = df1.cix[left_row_idx, left_col_idx].data
        self.assertIs(c.inputs[0].inputs[0], expect_df1_input)
        left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx]
        self.assertEqual(c.inputs[0].op.index_min, left_index_min_max[0])
        self.assertEqual(c.inputs[0].op.index_min_close, left_index_min_max[1])
        self.assertEqual(c.inputs[0].op.index_max, left_index_min_max[2])
        self.assertEqual(c.inputs[0].op.index_max_close, left_index_min_max[3])
        self.assertIsInstance(c.inputs[0].index_value.to_pandas(), type(data1.index))
        left_column_min_max = left_columns_splits[left_col_idx][left_col_inner_idx]
        self.assertEqual(c.inputs[0].op.column_min, left_column_min_max[0])
        self.assertEqual(c.inputs[0].op.column_min_close, left_column_min_max[1])
        self.assertEqual(c.inputs[0].op.column_max, left_column_min_max[2])
        self.assertEqual(c.inputs[0].op.column_max_close, left_column_min_max[3])
        expect_left_columns = filter_index_value(expect_df1_input.columns, left_column_min_max,
                                                 store_data=True)
        pd.testing.assert_index_equal(c.inputs[0].columns.to_pandas(),
                                      expect_left_columns.to_pandas())
        pd.testing.assert_index_equal(c.inputs[0].dtypes.index,
                                      expect_left_columns.to_pandas())

        # test the right side
        self.assertIsInstance(c.inputs[1].op, DataFrameIndexAlignMap)
        right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]]
        right_col_idx, right_col_inner_idx = right_columns_idx_to_original_idx[idx[1]]
        expect_df2_input = df2.cix[right_row_idx, right_col_idx].data
        self.assertIs(c.inputs[1].inputs[0], expect_df2_input)
        right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx]
        self.assertEqual(c.inputs[1].op.index_min, right_index_min_max[0])
        self.assertEqual(c.inputs[1].op.index_min_close, right_index_min_max[1])
        self.assertEqual(c.inputs[1].op.index_max, right_index_min_max[2])
        self.assertEqual(c.inputs[1].op.index_max_close, right_index_min_max[3])
        self.assertIsInstance(c.inputs[1].index_value.to_pandas(), type(data2.index))
        right_column_min_max = right_columns_splits[right_col_idx][right_col_inner_idx]
        self.assertEqual(c.inputs[1].op.column_min, right_column_min_max[0])
        self.assertEqual(c.inputs[1].op.column_min_close, right_column_min_max[1])
        self.assertEqual(c.inputs[1].op.column_max, right_column_min_max[2])
        self.assertEqual(c.inputs[1].op.column_max_close, right_column_min_max[3])
        # FIX: filter by the right side's own column range (was
        # left_column_min_max; the aligned ranges coincide per output chunk,
        # but the right split is the one this chunk was computed from).
        expect_right_columns = filter_index_value(expect_df2_input.columns, right_column_min_max,
                                                  store_data=True)
        pd.testing.assert_index_equal(c.inputs[1].columns.to_pandas(),
                                      expect_right_columns.to_pandas())
        pd.testing.assert_index_equal(c.inputs[1].dtypes.index,
                                      expect_right_columns.to_pandas())
def testWithoutShuffleAndWithOneChunk(self):
    """Align-and-add where each frame keeps all columns in a single chunk.

    Only the row index is monotonic; columns are unsorted but undivided
    (chunk_size covers all 10 columns), so tiling aligns rows through
    ``DataFrameIndexAlignMap`` chunks and passes each side's full column
    range through unchanged — no shuffle stage is created.
    """
    # only 1 axis is monotonic
    # data1 with index split into [0...4], [5...9],
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas(data1, chunk_size=(5, 10))
    # data2 with index split into [6...11], [2, 5],
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas(data2, chunk_size=(6, 10))

    df3 = add(df1, df2)

    # test df3's index and columns
    pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
    self.assertTrue(df3.columns.should_be_monotonic)
    self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
    self.assertTrue(df3.index_value.should_be_monotonic)
    # index contents are unknown before execution; only the type is recorded
    pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
    self.assertNotEqual(df3.index_value.key, df1.index_value.key)
    self.assertNotEqual(df3.index_value.key, df2.index_value.key)
    self.assertEqual(df3.shape[1], 12)  # columns is recorded, so we can get it

    df3.tiles()

    # expected row-range (min, min_close, max, max_close) of each chunk
    data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
    data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]

    left_index_splits, right_index_splits = split_monotonic_index_min_max(
        data1_index_min_max, True, data2_index_min_max, False)

    left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits)
    right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits, False)

    self.assertEqual(df3.chunk_shape, (7, 1))
    for c in df3.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        # test shape
        idx = c.index

        # test the left side
        self.assertIsInstance(c.inputs[0].op, DataFrameIndexAlignMap)
        left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]]
        expect_df1_input = df1.cix[left_row_idx, 0].data
        self.assertIs(c.inputs[0].inputs[0], expect_df1_input)
        left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx]
        self.assertEqual(c.inputs[0].op.index_min, left_index_min_max[0])
        self.assertEqual(c.inputs[0].op.index_min_close, left_index_min_max[1])
        self.assertEqual(c.inputs[0].op.index_max, left_index_min_max[2])
        self.assertEqual(c.inputs[0].op.index_max_close, left_index_min_max[3])
        self.assertIsInstance(c.inputs[0].index_value.to_pandas(), type(data1.index))
        # single column chunk: align map carries the chunk's full column range
        self.assertEqual(c.inputs[0].op.column_min, expect_df1_input.columns.min_val)
        self.assertEqual(c.inputs[0].op.column_min_close, expect_df1_input.columns.min_val_close)
        self.assertEqual(c.inputs[0].op.column_max, expect_df1_input.columns.max_val)
        self.assertEqual(c.inputs[0].op.column_max_close, expect_df1_input.columns.max_val_close)
        expect_left_columns = expect_df1_input.columns
        pd.testing.assert_index_equal(c.inputs[0].columns.to_pandas(),
                                      expect_left_columns.to_pandas())
        pd.testing.assert_index_equal(c.inputs[0].dtypes.index,
                                      expect_left_columns.to_pandas())

        # test the right side
        self.assertIsInstance(c.inputs[1].op, DataFrameIndexAlignMap)
        right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]]
        expect_df2_input = df2.cix[right_row_idx, 0].data
        self.assertIs(c.inputs[1].inputs[0], expect_df2_input)
        right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx]
        self.assertEqual(c.inputs[1].op.index_min, right_index_min_max[0])
        self.assertEqual(c.inputs[1].op.index_min_close, right_index_min_max[1])
        self.assertEqual(c.inputs[1].op.index_max, right_index_min_max[2])
        self.assertEqual(c.inputs[1].op.index_max_close, right_index_min_max[3])
        self.assertIsInstance(c.inputs[1].index_value.to_pandas(), type(data2.index))
        self.assertEqual(c.inputs[1].op.column_min, expect_df2_input.columns.min_val)
        self.assertEqual(c.inputs[1].op.column_min_close, expect_df2_input.columns.min_val_close)
        self.assertEqual(c.inputs[1].op.column_max, expect_df2_input.columns.max_val)
        self.assertEqual(c.inputs[1].op.column_max_close, expect_df2_input.columns.max_val_close)
        expect_right_columns = expect_df2_input.columns
        pd.testing.assert_index_equal(c.inputs[1].columns.to_pandas(),
                                      expect_right_columns.to_pandas())
        pd.testing.assert_index_equal(c.inputs[1].dtypes.index,
                                      expect_right_columns.to_pandas())
def testAddWithOneShuffle(self):
    """Align-and-add where only the row index is monotonic.

    Columns are unsorted and split across chunks, so column alignment needs
    a hash shuffle: each result chunk reads ``DataFrameIndexAlignReduce``
    inputs fed by a ``DataFrameShuffleProxy`` over ``DataFrameIndexAlignMap``
    chunks. Rows are still aligned by range splitting.
    """
    # only 1 axis is monotonic
    # data1 with index split into [0...4], [5...9],
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas(data1, chunk_size=5)
    # data2 with index split into [6...11], [2, 5],
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas(data2, chunk_size=6)

    df3 = add(df1, df2)

    # test df3's index and columns
    pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
    self.assertTrue(df3.columns.should_be_monotonic)
    self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
    self.assertTrue(df3.index_value.should_be_monotonic)
    pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
    self.assertNotEqual(df3.index_value.key, df1.index_value.key)
    self.assertNotEqual(df3.index_value.key, df2.index_value.key)
    self.assertEqual(df3.shape[1], 12)  # columns is recorded, so we can get it

    df3.tiles()

    # expected row-range splits of both frames' chunks
    data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
    data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]

    left_index_splits, right_index_splits = split_monotonic_index_min_max(
        data1_index_min_max, True, data2_index_min_max, False)

    left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits)
    right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits, False)

    self.assertEqual(df3.chunk_shape, (7, 2))
    for c in df3.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        idx = c.index

        # test the left side
        self.assertIsInstance(c.inputs[0].op, DataFrameIndexAlignReduce)
        expect_dtypes = pd.concat([hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]]
                                   for ic in c.inputs[0].inputs[0].inputs])
        pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes)
        pd.testing.assert_index_equal(c.inputs[0].columns.to_pandas(),
                                      c.inputs[0].dtypes.index)
        self.assertIsInstance(c.inputs[0].index_value.to_pandas(), type(data1.index))
        self.assertIsInstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy)
        left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]]
        left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx]
        ics = [ic for ic in df1.chunks if ic.index[0] == left_row_idx]
        for j, ci, ic in zip(itertools.count(0), c.inputs[0].inputs[0].inputs, ics):
            self.assertIsInstance(ci.op, DataFrameIndexAlignMap)
            self.assertEqual(ci.index, (idx[0], j))
            self.assertEqual(ci.op.index_min, left_index_min_max[0])
            self.assertEqual(ci.op.index_min_close, left_index_min_max[1])
            self.assertEqual(ci.op.index_max, left_index_min_max[2])
            self.assertEqual(ci.op.index_max_close, left_index_min_max[3])
            self.assertIsInstance(ci.index_value.to_pandas(), type(data1.index))
            # FIX: was assertTrue(ci.op.column_shuffle_size, 2) — assertTrue
            # treats 2 as the failure message and passes for any truthy value.
            self.assertEqual(ci.op.column_shuffle_size, 2)
            shuffle_segments = ci.op.column_shuffle_segments
            expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2)
            self.assertEqual(len(shuffle_segments), len(expected_shuffle_segments))
            for ss, ess in zip(shuffle_segments, expected_shuffle_segments):
                pd.testing.assert_series_equal(ss, ess)
            self.assertIs(ci.inputs[0], ic.data)

        # test the right side
        self.assertIsInstance(c.inputs[1].op, DataFrameIndexAlignReduce)
        expect_dtypes = pd.concat([hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]]
                                   for ic in c.inputs[1].inputs[0].inputs])
        pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes)
        pd.testing.assert_index_equal(c.inputs[1].columns.to_pandas(),
                                      c.inputs[1].dtypes.index)
        # FIX: checked type(data1.index); the right input comes from data2
        # (same Int64Index type, but the wrong frame was referenced).
        self.assertIsInstance(c.inputs[1].index_value.to_pandas(), type(data2.index))
        self.assertIsInstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy)
        right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]]
        right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx]
        ics = [ic for ic in df2.chunks if ic.index[0] == right_row_idx]
        for j, ci, ic in zip(itertools.count(0), c.inputs[1].inputs[0].inputs, ics):
            self.assertIsInstance(ci.op, DataFrameIndexAlignMap)
            self.assertEqual(ci.index, (idx[0], j))
            self.assertEqual(ci.op.index_min, right_index_min_max[0])
            self.assertEqual(ci.op.index_min_close, right_index_min_max[1])
            self.assertEqual(ci.op.index_max, right_index_min_max[2])
            self.assertEqual(ci.op.index_max_close, right_index_min_max[3])
            # FIX: same assertTrue-with-msg bug as on the left side.
            self.assertEqual(ci.op.column_shuffle_size, 2)
            shuffle_segments = ci.op.column_shuffle_segments
            expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2)
            self.assertEqual(len(shuffle_segments), len(expected_shuffle_segments))
            for ss, ess in zip(shuffle_segments, expected_shuffle_segments):
                pd.testing.assert_series_equal(ss, ess)
            self.assertIs(ci.inputs[0], ic.data)

    # make sure shuffle proxies' key are different
    proxy_keys = set()
    for i in range(df3.chunk_shape[0]):
        cs = [c for c in df3.chunks if c.index[0] == i]
        lps = {c.inputs[0].inputs[0].op.key for c in cs}
        self.assertEqual(len(lps), 1)
        proxy_keys.add(lps.pop())
        rps = {c.inputs[1].inputs[0].op.key for c in cs}
        self.assertEqual(len(rps), 1)
        proxy_keys.add(rps.pop())
    self.assertEqual(len(proxy_keys), 2 * df3.chunk_shape[0])