def test_value_counts_head(prepare_data, setup, chunk_size):
    """``value_counts().head(n)`` should be folded into the op's ``nrows``.

    The head node is optimized in place; the intermediate value_counts
    node itself must stay untouched by the optimizer.
    """
    _, pdf = prepare_data
    df = md.DataFrame(pdf, chunk_size=chunk_size)
    counts = df['a'].value_counts()
    top3 = counts.head(3)

    graph = TileableGraph([top3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    # the value_counts node is not rewritten, only its head consumer is
    assert records.get_optimization_result(counts.data) is None
    optimized = records.get_optimization_result(top3.data)
    assert optimized.op.nrows == 3
    assert len(graph) == 3
    assert optimized in graph.results

    result = top3.execute(
        extra_config={'operand_executors': _iloc_operand_executors}).fetch()
    expected = pdf['a'].value_counts().head(3)
    pd.testing.assert_series_equal(result, expected)
def testGroupByAggStrCat(self):
    """Group-by aggregation with a custom ``str.cat`` callable, on a
    dataframe grouped by column and a series grouped by a function."""
    agg_fun = lambda x: x.str.cat(sep='_', na_rep='NA')
    rs = np.random.RandomState(0)

    raw_df = pd.DataFrame({
        'a': rs.choice(['A', 'B', 'C'], size=(100,)),
        'b': rs.choice([None, 'alfa', 'bravo', 'charlie'], size=(100,)),
    })
    mdf = md.DataFrame(raw_df, chunk_size=13)
    result = mdf.groupby('a').agg(agg_fun)
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(result, concat=True)[0],
        raw_df.groupby('a').agg(agg_fun))

    raw_series = pd.Series(
        rs.choice([None, 'alfa', 'bravo', 'charlie'], size=(100,)))
    ms = md.Series(raw_series, chunk_size=13)
    result = ms.groupby(lambda x: x % 2).agg(agg_fun)
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(result, concat=True)[0],
        raw_series.groupby(lambda x: x % 2).agg(agg_fun))
def testRollingAgg(self):
    """``rolling(...).agg`` keeps shape/index/columns metadata on both the
    tileable result and each tiled chunk."""
    raw = pd.DataFrame(np.random.rand(4, 3), columns=list('abc'))
    mdf = md.DataFrame(raw, chunk_size=3)

    r = mdf.rolling(3).agg('max')
    expected = raw.rolling(3).agg('max')

    self.assertEqual(r.shape, raw.shape)
    self.assertIs(r.index_value, mdf.index_value)
    pd.testing.assert_index_equal(r.columns_value.to_pandas(),
                                  expected.columns)
    pd.testing.assert_series_equal(r.dtypes, mdf.dtypes)

    r = r.tiles()
    for c in r.chunks:
        # every chunk mirrors the metadata of the chunk it was built from
        self.assertEqual(c.shape, c.inputs[0].shape)
        self.assertIs(c.index_value, c.inputs[0].index_value)
        pd.testing.assert_index_equal(c.columns_value.to_pandas(),
                                      expected.columns)
        pd.testing.assert_series_equal(c.dtypes, expected.dtypes)
def testDataFrameIter(self):
    """Batched ``iterrows``/``itertuples`` must match plain pandas iteration
    element for element and cover every row."""
    raw_data = pd.DataFrame(np.random.randint(1000, size=(20, 10)))
    df = md.DataFrame(raw_data, chunk_size=5)

    count = 0
    for result_row, expect_row in zip(df.iterrows(batch_size=15),
                                      raw_data.iterrows()):
        self.assertEqual(result_row[0], expect_row[0])
        pd.testing.assert_series_equal(result_row[1], expect_row[1])
        count += 1
    self.assertEqual(count, len(raw_data))

    count = 0
    for result_tup, expect_tup in zip(df.itertuples(batch_size=10),
                                      raw_data.itertuples()):
        self.assertEqual(result_tup, expect_tup)
        count += 1
    self.assertEqual(count, len(raw_data))
def testDataFrameExecuteNotFetch(self):
    """``execute()`` returns the tileable itself without fetching; a later
    ``to_pandas`` observes executor-side modifications of chunk results."""
    data1 = pd.DataFrame(np.random.random((5, 4)), columns=list('abcd'))
    sess = Session.default_or_local()

    df1 = md.DataFrame(data1, chunk_size=2)

    # fetching before execution must fail
    with self.assertRaises(ValueError):
        sess.fetch(df1)

    self.assertIs(df1.execute(), df1)

    # modify the stored result of the first chunk directly in the executor
    executor = sess._sess._executor
    executor.chunk_result[get_tiled(df1).chunks[0].key] = \
        data1.iloc[:2, :2] * 3

    # BUG FIX: build the expectation on a copy — ``expected = data1``
    # aliased the source frame, so the in-place assignment below mutated
    # ``data1`` itself and only passed by evaluation-order accident.
    expected = data1.copy()
    expected.iloc[:2, :2] = data1.iloc[:2, :2] * 3
    pd.testing.assert_frame_equal(df1.to_pandas(), expected)
def test_dataframe_getitem():
    """Column selection by single label and by label list: result metadata
    before tiling and chunk layout after tiling."""
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df = md.DataFrame(data, chunk_size=2)

    # single column -> Series
    series = df['c3']
    assert isinstance(series, Series)
    assert series.shape == (10,)
    assert series.name == 'c3'
    assert series.dtype == data['c3'].dtype
    assert series.index_value == df.index_value

    series = tile(series)
    assert isinstance(series, SERIES_TYPE)
    assert all(not i.is_coarse() for i in series.inputs) is True
    assert series.nsplits == ((2, 2, 2, 2, 2),)
    assert len(series.chunks) == 5
    for i, c in enumerate(series.chunks):
        assert isinstance(c, SERIES_CHUNK_TYPE)
        assert c.index == (i,)
        assert c.shape == (2,)

    # list of columns -> DataFrame
    df1 = df[['c1', 'c2', 'c3']]
    assert isinstance(df1, DataFrame)
    assert df1.shape == (10, 3)
    assert df1.index_value == df.index_value
    pd.testing.assert_index_equal(df1.columns_value.to_pandas(),
                                  data[['c1', 'c2', 'c3']].columns)
    pd.testing.assert_series_equal(df1.dtypes,
                                   data[['c1', 'c2', 'c3']].dtypes)

    df1 = tile(df1)
    assert df1.nsplits == ((2, 2, 2, 2, 2), (2, 1))
    assert len(df1.chunks) == 10
    # first column-of-chunks has width 2, second has the leftover width 1
    for i, c in enumerate(df1.chunks[slice(0, 10, 2)]):
        assert isinstance(c, DATAFRAME_CHUNK_TYPE)
        assert c.index == (i, 0)
        assert c.shape == (2, 2)
    for i, c in enumerate(df1.chunks[slice(1, 10, 2)]):
        assert isinstance(c, DATAFRAME_CHUNK_TYPE)
        assert c.index == (i, 1)
        assert c.shape == (2, 1)
def testLocalClassifierFromToParquet(self):
    """Train an xgboost booster, predict over a parquet-backed mars frame,
    persist predictions to parquet and compare with direct prediction."""
    n_rows = 1000
    n_columns = 10
    rs = np.random.RandomState(0)
    X = rs.rand(n_rows, n_columns)
    y = rs.rand(n_rows)
    df = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])
    df['id'] = [f'i{i}' for i in range(n_rows)]

    booster = xgboost.train({}, xgboost.DMatrix(X, y), num_boost_round=2)

    with tempfile.TemporaryDirectory() as d:
        m_name = os.path.join(d, 'c.model')
        result_dir = os.path.join(d, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(d, 'data')
        os.mkdir(data_dir)

        booster.save_model(m_name)

        # split the raw frame into two parquet files
        df.iloc[:500].to_parquet(os.path.join(d, 'data', 'data1.parquet'))
        df.iloc[500:].to_parquet(os.path.join(d, 'data', 'data2.parquet'))

        df = md.read_parquet(data_dir).set_index('id')

        model = XGBClassifier()
        model.load_model(m_name)
        result = model.predict(df, run=False)
        r = md.DataFrame(result).to_parquet(result_dir)

        # tiles to ensure no iterative tiling exists
        g = r.build_graph(tiled=True)
        self.assertTrue(all(isinstance(n.op, Fuse) for n in g))
        self.assertEqual(len(g), 2)
        r.execute()

        ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy()
        model2 = xgboost.XGBClassifier()
        model2.load_model(m_name)
        expected = model2.predict(X)
        expected = np.stack([1 - expected, expected]).argmax(axis=0)
        np.testing.assert_array_equal(ret, expected)
def test_gpu_execution(setup, check_ref_counts):
    """GPU dataframe/series reductions and aggregations match CPU pandas."""
    df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
    df = to_gpu(md.DataFrame(df_raw, chunk_size=6))

    r = df.sum()
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

    r = df.kurt()
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

    r = df.agg(['sum', 'var'])
    res = r.execute().fetch()
    pd.testing.assert_frame_equal(res.to_pandas(), df_raw.agg(['sum', 'var']))

    s_raw = pd.Series(np.random.rand(30))
    s = to_gpu(md.Series(s_raw, chunk_size=6))

    r = s.sum()
    res = r.execute().fetch()
    assert pytest.approx(res) == s_raw.sum()

    r = s.kurt()
    res = r.execute().fetch()
    assert pytest.approx(res) == s_raw.kurt()

    r = s.agg(['sum', 'var'])
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), s_raw.agg(['sum', 'var']))

    s_raw = pd.Series(
        np.random.randint(0, 3, size=(30,)) *
        np.random.randint(0, 5, size=(30,)))
    s = to_gpu(md.Series(s_raw, chunk_size=6))

    r = s.unique()
    res = r.execute().fetch()
    # BUG FIX: ``ndarray.sort()`` sorts in place and returns None, so the
    # original assertion compared None with None and always passed.
    # ``np.sort`` returns sorted copies suitable for comparison.
    np.testing.assert_array_equal(np.sort(cp.asnumpy(res)),
                                  np.sort(s_raw.unique()))
def testArrowTunnelSinglePart(self):
    """Round trip through a single-partition ODPS table: read one partition
    into mars, then persist a mars dataframe back into a partition."""
    import pandas as pd
    import numpy as np
    import mars.dataframe as md

    mars_source_table_name = tn('mars_arrow_tunnel_datasource_spart')
    mars_des_table_name = tn('mars_arrow_tunnel_datastore_spart')
    self.odps.delete_table(mars_des_table_name, if_exists=True)
    self.odps.delete_table(mars_source_table_name, if_exists=True)

    table = self.odps.create_table(
        mars_source_table_name,
        schema=('col1 int, col2 string', 'pt string'), lifecycle=1)
    pt = table.create_partition('pt=test_part')
    with pt.open_writer() as writer:
        writer.write([[1, 'test1'], [2, 'test2']])

    # read the partition back through mars and compare with the direct read
    r = self.odps.to_mars_dataframe(
        mars_source_table_name, partition='pt=test_part') \
        .execute().to_pandas()
    expected = pt.to_df().to_pandas()
    pd.testing.assert_frame_equal(r, expected)

    # persist a chunked mars dataframe into a destination partition
    data = pd.DataFrame({
        'col1': np.random.rand(1000,),
        'col2': np.random.randint(0, 100, (1000,)),
        'col3': np.random.choice(['a', 'b', 'c'], size=(1000,)),
    })
    df = md.DataFrame(data, chunk_size=300)
    self.odps.persist_mars_dataframe(
        df, mars_des_table_name, partition='pt=test_part',
        unknown_as_string=True)
    expected = self.odps.get_table(mars_des_table_name) \
        .get_partition('pt=test_part').to_df().to_pandas()
    pd.testing.assert_frame_equal(
        expected.sort_values('col1').reset_index(drop=True),
        data.sort_values('col1').reset_index(drop=True))
def testExecutableTupleExecute(self):
    """``ExecutableTuple`` exposes namedtuple fields and preserves the
    namedtuple type and field values through execute/fetch."""
    raw_a = np.random.RandomState(0).rand(10, 20)
    a = mt.tensor(raw_a)
    raw_df = pd.DataFrame(raw_a)
    df = md.DataFrame(raw_df)

    tp = test_namedtuple_type(a, df)
    executable_tp = mt.ExecutableTuple(tp)

    self.assertIn('a', dir(executable_tp))
    self.assertIs(executable_tp.a, a)
    self.assertIn(test_namedtuple_type.__name__, repr(executable_tp))
    with self.assertRaises(AttributeError):
        getattr(executable_tp, 'c')

    res = mt.ExecutableTuple(tp).execute().fetch()
    self.assertIs(test_namedtuple_type, type(res))
    np.testing.assert_array_equal(raw_a, res.a)
    pd.testing.assert_frame_equal(raw_df, res.b)
async def test_iterative_tiling(create_cluster):
    """Chained ops that need iterative tiling execute correctly and retain
    index bounds metadata."""
    session = get_default_session()
    rng = np.random.RandomState(0)
    raw = rng.rand(30, 5)
    raw_df = pd.DataFrame(raw, index=np.arange(1, 31))

    df = md.DataFrame(raw_df, chunk_size=10)
    df = df[df[0] < .7]
    df2 = df.shift(2)

    info = await session.execute(df2)
    await info
    assert info.result() is None
    result = (await session.fetch(df2))[0]

    expected = raw_df[raw_df[0] < .7].shift(2)
    pd.testing.assert_frame_equal(result, expected)

    # test meta
    assert df2.index_value.min_val >= 1
    assert df2.index_value.max_val <= 30
def testSetIndex(self):
    """``set_index`` with drop=True/False: chunk column layout after tiling."""
    df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                       index=['a1', 'a2', 'a3'], columns=['x', 'y', 'z'])
    df2 = md.DataFrame(df1, chunk_size=2)

    # dropping 'y' leaves only 'x' and 'z' split across column chunks
    df3 = df2.set_index('y', drop=True)
    df3.tiles()
    self.assertEqual(df3.chunk_shape, (2, 2))
    pd.testing.assert_index_equal(df3.chunks[0].columns.to_pandas(),
                                  pd.Index(['x']))
    pd.testing.assert_index_equal(df3.chunks[1].columns.to_pandas(),
                                  pd.Index(['z']))

    # keeping 'y' leaves it in the first column chunk
    df4 = df2.set_index('y', drop=False)
    df4.tiles()
    self.assertEqual(df4.chunk_shape, (2, 2))
    pd.testing.assert_index_equal(df4.chunks[0].columns.to_pandas(),
                                  pd.Index(['x', 'y']))
    pd.testing.assert_index_equal(df4.chunks[1].columns.to_pandas(),
                                  pd.Index(['z']))
def test_set_index():
    """``set_index`` with drop=True/False: chunk column layout after tiling."""
    df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                       index=['a1', 'a2', 'a3'], columns=['x', 'y', 'z'])
    df2 = md.DataFrame(df1, chunk_size=2)

    # dropping 'y' leaves only 'x' and 'z' split across column chunks
    df3 = tile(df2.set_index('y', drop=True))
    assert df3.chunk_shape == (2, 2)
    pd.testing.assert_index_equal(df3.chunks[0].columns_value.to_pandas(),
                                  pd.Index(['x']))
    pd.testing.assert_index_equal(df3.chunks[1].columns_value.to_pandas(),
                                  pd.Index(['z']))

    # keeping 'y' leaves it in the first column chunk
    df4 = tile(df2.set_index('y', drop=False))
    assert df4.chunk_shape == (2, 2)
    pd.testing.assert_index_equal(df4.chunks[0].columns_value.to_pandas(),
                                  pd.Index(['x', 'y']))
    pd.testing.assert_index_equal(df4.chunks[1].columns_value.to_pandas(),
                                  pd.Index(['z']))
def test_executable_tuple_execute(setup):
    """``ExecutableTuple`` exposes namedtuple fields and preserves the
    namedtuple type and field values through execute/fetch."""
    raw_a = np.random.RandomState(0).rand(10, 20)
    a = mt.tensor(raw_a)
    raw_df = pd.DataFrame(raw_a)
    df = md.DataFrame(raw_df)

    tp = test_namedtuple_type(a, df)
    executable_tp = mt.ExecutableTuple(tp)

    assert 'a' in dir(executable_tp)
    assert executable_tp.a is a
    assert test_namedtuple_type.__name__ in repr(executable_tp)
    with pytest.raises(AttributeError):
        getattr(executable_tp, 'c')

    res = mt.ExecutableTuple(tp).execute().fetch()
    assert test_namedtuple_type is type(res)
    np.testing.assert_array_equal(raw_a, res.a)
    pd.testing.assert_frame_equal(raw_df, res.b)
def testRolling(self):
    """Rolling-window construction: repr parity with pandas and KeyError on
    selecting missing columns."""
    raw = pd.DataFrame(np.random.rand(4, 3), columns=list('abc'))
    mdf = md.DataFrame(raw)

    r = mdf.rolling(3, min_periods=1, center=True,
                    win_type='triang', closed='both')
    expected = raw.rolling(3, min_periods=1, center=True,
                           win_type='triang', closed='both')
    self.assertEqual(repr(r), repr(expected))

    with self.assertRaises(KeyError):
        _ = r['d']
    with self.assertRaises(KeyError):
        _ = r['a', 'd']
def testMixiedInputTypeTrainTestSplit(self):
    """``train_test_split`` must preserve the container type of each input
    for every dataframe/tensor combination.

    BUG FIX: the combination loop used ``itertools.product(range(1),
    range(1))``, which only yields ``(0, 0)`` — the tensor branches were
    never exercised.  With ``range(2)`` all four combinations run; the
    split is accordingly fed ``yy`` (not the original ``y``) so the type
    assertions below are meaningful when ``y`` is converted to a tensor.
    """
    rs = np.random.RandomState(0)
    df_raw = pd.DataFrame(rs.rand(10, 4))
    df = md.DataFrame(df_raw, chunk_size=5)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    for x_to_tensor, y_to_tensor in itertools.product(range(2), range(2)):
        x = X
        if x_to_tensor:
            x = mt.tensor(x)
        yy = y
        if y_to_tensor:
            yy = mt.tensor(yy)

        x_train, x_test, y_train, y_test = \
            train_test_split(x, yy, random_state=0)

        self.assertIsInstance(x_train, type(x))
        self.assertIsInstance(x_test, type(x))
        self.assertIsInstance(y_train, type(yy))
        self.assertIsInstance(y_test, type(yy))
def testDataFrameGetitem(self):
    """Column selection by single label and by label list: result metadata
    before tiling and chunk layout after tiling."""
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df = md.DataFrame(data, chunk_size=2)

    # single column -> Series
    series = df['c3']
    self.assertIsInstance(series, Series)
    self.assertEqual(series.shape, (10,))
    self.assertEqual(series.name, 'c3')
    self.assertEqual(series.dtype, data['c3'].dtype)
    self.assertEqual(series.index_value, df.index_value)

    series.tiles()
    self.assertEqual(series.nsplits, ((2, 2, 2, 2, 2),))
    self.assertEqual(len(series.chunks), 5)
    for i, c in enumerate(series.chunks):
        self.assertIsInstance(c, SERIES_CHUNK_TYPE)
        self.assertEqual(c.index, (i,))
        self.assertEqual(c.shape, (2,))

    # list of columns -> DataFrame
    df1 = df[['c1', 'c2', 'c3']]
    self.assertIsInstance(df1, DataFrame)
    self.assertEqual(df1.shape, (10, 3))
    self.assertEqual(df1.index_value, df.index_value)
    pd.testing.assert_index_equal(df1.columns.to_pandas(),
                                  data[['c1', 'c2', 'c3']].columns)
    pd.testing.assert_series_equal(df1.dtypes,
                                   data[['c1', 'c2', 'c3']].dtypes)

    df1.tiles()
    self.assertEqual(df1.nsplits, ((2, 2, 2, 2, 2), (2, 1)))
    self.assertEqual(len(df1.chunks), 10)
    # first column-of-chunks has width 2, second has the leftover width 1
    for i, c in enumerate(df1.chunks[slice(0, 10, 2)]):
        self.assertIsInstance(c, DATAFRAME_CHUNK_TYPE)
        self.assertEqual(c.index, (i, 0))
        self.assertEqual(c.shape, (2, 2))
    for i, c in enumerate(df1.chunks[slice(1, 10, 2)]):
        self.assertIsInstance(c, DATAFRAME_CHUNK_TYPE)
        self.assertEqual(c.index, (i, 1))
        self.assertEqual(c.shape, (2, 1))
def testGroupByCum(self):
    """Output types, shapes and chunk metadata of group-by cumulative ops
    on dataframes (both axes) and on series."""
    df1 = pd.DataFrame({'a': [3, 5, 2, 7, 1, 2, 4, 6, 2, 4],
                        'b': [8, 3, 4, 1, 8, 2, 2, 2, 2, 3],
                        'c': [1, 8, 8, 5, 3, 5, 0, 0, 5, 4]})
    mdf = md.DataFrame(df1, chunk_size=3)

    for fun in ['cummin', 'cummax', 'cumprod', 'cumsum']:
        # axis=0: the group key column 'b' is dropped from the output
        r = getattr(mdf.groupby('b'), fun)().tiles()
        self.assertEqual(r.op.output_types[0], OutputType.dataframe)
        self.assertEqual(len(r.chunks), 4)
        self.assertEqual(r.shape, (len(df1), 2))
        self.assertEqual(r.chunks[0].shape, (np.nan, 2))
        pd.testing.assert_index_equal(
            r.chunks[0].columns_value.to_pandas(), pd.Index(['a', 'c']))

        # axis=1: all columns are retained
        r = getattr(mdf.groupby('b'), fun)(axis=1).tiles()
        self.assertEqual(r.op.output_types[0], OutputType.dataframe)
        self.assertEqual(len(r.chunks), 4)
        self.assertEqual(r.shape, (len(df1), 3))
        self.assertEqual(r.chunks[0].shape, (np.nan, 3))
        pd.testing.assert_index_equal(
            r.chunks[0].columns_value.to_pandas(), df1.columns)

    # cumcount yields a series
    r = mdf.groupby('b').cumcount().tiles()
    self.assertEqual(r.op.output_types[0], OutputType.series)
    self.assertEqual(len(r.chunks), 4)
    self.assertEqual(r.shape, (len(df1),))
    self.assertEqual(r.chunks[0].shape, (np.nan,))

    series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6])
    ms1 = md.Series(series1, chunk_size=3)

    for fun in ['cummin', 'cummax', 'cumprod', 'cumsum', 'cumcount']:
        r = getattr(ms1.groupby(lambda x: x % 2), fun)().tiles()
        self.assertEqual(r.op.output_types[0], OutputType.series)
        self.assertEqual(len(r.chunks), 4)
        self.assertEqual(r.shape, (len(series1),))
        self.assertEqual(r.chunks[0].shape, (np.nan,))
def test_groupby_cum():
    """Output types, shapes and chunk metadata of group-by cumulative ops
    on dataframes (both axes) and on series."""
    df1 = pd.DataFrame({'a': [3, 5, 2, 7, 1, 2, 4, 6, 2, 4],
                        'b': [8, 3, 4, 1, 8, 2, 2, 2, 2, 3],
                        'c': [1, 8, 8, 5, 3, 5, 0, 0, 5, 4]})
    mdf = md.DataFrame(df1, chunk_size=3)

    for fun in ['cummin', 'cummax', 'cumprod', 'cumsum']:
        # axis=0: the group key column 'b' is dropped from the output
        r = tile(getattr(mdf.groupby('b'), fun)())
        assert r.op.output_types[0] == OutputType.dataframe
        assert len(r.chunks) == 4
        assert r.shape == (len(df1), 2)
        assert r.chunks[0].shape == (np.nan, 2)
        pd.testing.assert_index_equal(
            r.chunks[0].columns_value.to_pandas(), pd.Index(['a', 'c']))

        # axis=1: all columns are retained
        r = tile(getattr(mdf.groupby('b'), fun)(axis=1))
        assert r.op.output_types[0] == OutputType.dataframe
        assert len(r.chunks) == 4
        assert r.shape == (len(df1), 3)
        assert r.chunks[0].shape == (np.nan, 3)
        pd.testing.assert_index_equal(
            r.chunks[0].columns_value.to_pandas(), df1.columns)

    # cumcount yields a series
    r = tile(mdf.groupby('b').cumcount())
    assert r.op.output_types[0] == OutputType.series
    assert len(r.chunks) == 4
    assert r.shape == (len(df1),)
    assert r.chunks[0].shape == (np.nan,)

    series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6])
    ms1 = md.Series(series1, chunk_size=3)

    for fun in ['cummin', 'cummax', 'cumprod', 'cumsum', 'cumcount']:
        r = tile(getattr(ms1.groupby(lambda x: x % 2), fun)())
        assert r.op.output_types[0] == OutputType.series
        assert len(r.chunks) == 4
        assert r.shape == (len(series1),)
        assert r.chunks[0].shape == (np.nan,)
def testDataFrameGetitemBool(self):
    """Boolean-mask getitem under various index alignments: identical
    index, reversed index (alignment only), shuffled index, and a mask
    with an extra out-of-range element."""
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df = md.DataFrame(data, chunk_size=2)

    mask_data = data.c1 > 0.5
    mask = md.Series(mask_data, chunk_size=2)

    # getitem by mars series
    self.assertEqual(df[mask].execute().shape, data[mask_data].shape)
    pd.testing.assert_frame_equal(df[mask].execute(), data[mask_data])

    # getitem by pandas series
    pd.testing.assert_frame_equal(df[mask_data].execute(), data[mask_data])

    # getitem by mars series with alignment but no shuffle
    mask_data = pd.Series([True, True, True, False, False,
                           True, True, False, False, True],
                          index=range(9, -1, -1))
    mask = md.Series(mask_data, chunk_size=2)
    pd.testing.assert_frame_equal(df[mask].execute(), data[mask_data])

    # getitem by mars series with shuffle alignment
    mask_data = pd.Series([True, True, True, False, False,
                           True, True, False, False, True],
                          index=[0, 3, 6, 2, 9, 8, 5, 7, 1, 4])
    mask = md.Series(mask_data, chunk_size=2)
    pd.testing.assert_frame_equal(df[mask].execute().sort_index(),
                                  data[mask_data])

    # getitem by mars series with shuffle alignment and extra element
    mask_data = pd.Series([True, True, True, False, False,
                           True, True, False, False, True, False],
                          index=[0, 3, 6, 2, 9, 8, 5, 7, 1, 4, 10])
    mask = md.Series(mask_data, chunk_size=2)
    pd.testing.assert_frame_equal(df[mask].execute().sort_index(),
                                  data[mask_data])
def test_build_and_search_index_with_filesystem_download(setup):
    """Build a vector index persisted under a filesystem path, then run
    several batched searches against it."""
    with tempfile.TemporaryDirectory() as f:
        # params
        doc_count, query_count, dimension = 2000, 15, 10
        topk = 10
        doc_chunk, query_chunk = 1000, 5

        # data
        doc, query = gen_data(doc_count=doc_count,
                              query_count=query_count,
                              dimension=dimension)
        df = md.DataFrame(pd.DataFrame(doc),
                          chunk_size=(doc_chunk, dimension))
        q = mt.tensor(query, chunk_size=(query_chunk, dimension))

        index = build_index(tensor=df, index_path=f, column_number=2)
        # at least one index file must have been written to disk
        assert len(os.listdir(f)) > 0

        search_index(q[0:5], topk, index)
        search_index(q[5:10], topk, index)
        search_index(q[10:15], topk, index)
def testRocCurveAuc(self):
    """``roc_curve`` + ``auc`` executed through a web session must match
    scikit-learn on the same raw data."""
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        run_kwargs = {'timeout': timeout}

        rs = np.random.RandomState(0)
        raw = pd.DataFrame({'a': rs.randint(0, 10, (10,)),
                            'b': rs.rand(10)})
        df = md.DataFrame(raw)
        y = df['a'].to_tensor().astype('int')
        pred = df['b'].to_tensor().astype('float')

        fpr, tpr, thresholds = roc_curve(
            y, pred, pos_label=2, session=sess, run_kwargs=run_kwargs)
        m = auc(fpr, tpr, session=sess, run_kwargs=run_kwargs)

        sk_fpr, sk_tpr, sk_threshod = sklearn_roc_curve(
            raw['a'].to_numpy().astype('int'),
            raw['b'].to_numpy().astype('float'),
            pos_label=2)
        expect_m = sklearn_auc(sk_fpr, sk_tpr)
        self.assertAlmostEqual(m.fetch(session=sess), expect_m)
def testGroupByGetItem(self):
    """Selecting columns from a groupby object yields the appropriate
    GroupBy types; selecting from an already-selected groupby raises."""
    df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                        'c': list('aabaaddce')})
    mdf = md.DataFrame(df1, chunk_size=3)

    # list selection -> DataFrameGroupBy
    r = mdf.groupby('b')[['a', 'b']].tiles()
    self.assertIsInstance(r, DataFrameGroupBy)
    self.assertIsInstance(r.op, GroupByIndex)
    self.assertEqual(r.selection, ['a', 'b'])
    self.assertEqual(list(r.key_dtypes.index), ['b'])
    self.assertEqual(len(r.chunks), 3)

    # attribute selection -> SeriesGroupBy
    r = mdf.groupby('b').a.tiles()
    self.assertIsInstance(r, SeriesGroupBy)
    self.assertIsInstance(r.op, GroupByIndex)
    self.assertEqual(r.name, 'a')
    self.assertEqual(list(r.key_dtypes.index), ['b'])
    self.assertEqual(len(r.chunks), 3)

    with self.assertRaises(IndexError):
        getattr(mdf.groupby('b')[['a', 'b']], 'a')
def test_mars(ray_start_regular):
    """Conversions between mars dataframes and ray datasets, in both
    directions, plus the unsupported simple-dataset case."""
    import pandas as pd

    cluster = mars.new_cluster_in_ray(worker_num=2, worker_cpu=1)
    n = 10000
    pdf = pd.DataFrame({"a": list(range(n)), "b": list(range(n, 2 * n))})
    df = md.DataFrame(pdf)

    # Convert mars dataframe to ray dataset
    ds = ray.data.from_mars(df)
    pd.testing.assert_frame_equal(ds.to_pandas(), df.to_pandas())
    ds2 = ds.filter(lambda row: row["a"] % 2 == 0)
    assert ds2.take(5) == [{"a": 2 * i, "b": n + 2 * i} for i in range(5)]

    # Convert ray dataset to mars dataframe
    df2 = ds2.to_mars()
    pd.testing.assert_frame_equal(
        df2.head(5).to_pandas(),
        pd.DataFrame({"a": list(range(0, 10, 2)),
                      "b": list(range(n, n + 10, 2))}),
    )

    # Test Arrow Dataset
    pdf2 = pd.DataFrame({c: range(5) for c in "abc"})
    ds3 = ray.data.from_arrow([pa.Table.from_pandas(pdf2) for _ in range(3)])
    df3 = ds3.to_mars()
    pd.testing.assert_frame_equal(df3.head(5).to_pandas(), pdf2)

    # Test simple datasets
    with pytest.raises(NotImplementedError):
        ray.data.range(10).to_mars()

    cluster.stop()
def testDataFrameGetitem(self):
    """Execution of column selection matches pandas: single labels, label
    lists, reordered lists, duplicated labels, and scalar lookup."""
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df = md.DataFrame(data, chunk_size=2)

    series1 = df['c2']
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(series1, concat=True)[0],
        data['c2'])

    series2 = df['c5']
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(series2, concat=True)[0],
        data['c5'])

    df1 = df[['c1', 'c2', 'c3']]
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(df1, concat=True)[0],
        data[['c1', 'c2', 'c3']])

    # reordered selection
    df2 = df[['c3', 'c2', 'c1']]
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(df2, concat=True)[0],
        data[['c3', 'c2', 'c1']])

    # single-element list keeps the DataFrame shape
    df3 = df[['c1']]
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(df3, concat=True)[0],
        data[['c1']])

    # duplicated labels
    df4 = df[['c3', 'c1', 'c2', 'c1']]
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(df4, concat=True)[0],
        data[['c3', 'c1', 'c2', 'c1']])

    # scalar lookup from a selected column
    series3 = df['c1'][0]
    self.assertEqual(
        self.executor.execute_dataframe(series3, concat=True)[0],
        data['c1'][0])
def testAccuracyScore(self):
    """``accuracy_score`` executed through a web session must match
    scikit-learn on the same raw data."""
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        run_kwargs = {'timeout': timeout}

        rs = np.random.RandomState(0)
        raw = pd.DataFrame({'a': rs.randint(0, 10, (10,)),
                            'b': rs.randint(0, 10, (10,))})
        df = md.DataFrame(raw)
        y = df['a'].to_tensor().astype('int')
        pred = df['b'].astype('int')

        score = accuracy_score(y, pred, session=sess,
                               run_kwargs=run_kwargs)
        expect = sklearn_accuracy_score(raw['a'].to_numpy().astype('int'),
                                        raw['b'].to_numpy().astype('int'))
        self.assertAlmostEqual(score.fetch(session=sess), expect)
def test_aggregate_str_cat(setup, check_ref_counts):
    """Whole-frame and whole-series ``agg`` with a custom ``str.cat``
    callable matches pandas."""
    agg_fun = lambda x: x.str.cat(sep='_', na_rep='NA')
    rs = np.random.RandomState(0)

    raw_df = pd.DataFrame({
        'a': rs.choice(['A', 'B', 'C'], size=(100,)),
        'b': rs.choice([None, 'alfa', 'bravo', 'charlie'], size=(100,)),
    })
    mdf = md.DataFrame(raw_df, chunk_size=13)
    r = mdf.agg(agg_fun)
    pd.testing.assert_series_equal(r.execute().fetch(),
                                   raw_df.agg(agg_fun))

    raw_series = pd.Series(
        rs.choice([None, 'alfa', 'bravo', 'charlie'], size=(100,)))
    ms = md.Series(raw_series, chunk_size=13)
    r = ms.agg(agg_fun)
    # series agg with a scalar-producing callable yields a scalar
    assert r.execute().fetch() == raw_series.agg(agg_fun)
def testLocalClassifierFromToParquet(self):
    """Predict with a pre-trained LightGBM classifier over a parquet-backed
    mars frame, persist predictions to parquet and compare with the direct
    prediction on the raw data."""
    n_rows = 1000
    n_columns = 10
    rs = np.random.RandomState(0)
    X = rs.rand(n_rows, n_columns)
    y = (rs.rand(n_rows) > 0.5).astype(np.int32)
    df = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])

    # test with existing model
    classifier = lightgbm.LGBMClassifier(n_estimators=2)
    classifier.fit(X, y, verbose=True)

    with tempfile.TemporaryDirectory() as d:
        result_dir = os.path.join(d, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(d, 'data')
        os.mkdir(data_dir)

        # split the raw frame into two parquet files
        df.iloc[:500].to_parquet(os.path.join(d, 'data', 'data1.parquet'))
        df.iloc[500:].to_parquet(os.path.join(d, 'data', 'data2.parquet'))

        df = md.read_parquet(data_dir)
        model = LGBMClassifier()
        model.load_model(classifier)
        result = model.predict(df, run=False)
        r = md.DataFrame(result).to_parquet(result_dir)

        # tiles to ensure no iterative tiling exists
        g = r.build_graph(tiled=True)
        self.assertTrue(all(isinstance(n.op, Fuse) for n in g))
        self.assertEqual(len(g), 2)
        r.execute()

        ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy()
        expected = classifier.predict(X)
        expected = np.stack([1 - expected, expected]).argmax(axis=0)
        np.testing.assert_array_equal(ret, expected)
def testNamed(self):
    """Executing with a ``name`` registers the tileable so it can be
    re-opened by name in the same session — tensor, series and dataframe."""
    rs = np.random.RandomState(0)
    raw = rs.rand(10, 10)
    sess = Session.default_or_local()

    # test named tensor
    t = mt.tensor(raw, chunk_size=3)
    name = 't_name'
    r1 = t.execute(name=name, session=sess)
    np.testing.assert_array_equal(r1, raw)

    t2 = mt.named_tensor(name=name, session=sess)
    r2 = (t2 + 1).execute(session=sess).fetch()
    np.testing.assert_array_equal(r2, raw + 1)

    # test named series
    name = 's_name'
    raw = pd.Series([1, 2, 3])
    s = md.Series(raw)
    r1 = s.execute(name=name, session=sess).fetch()
    pd.testing.assert_series_equal(r1, raw)

    s2 = md.named_series(name=name, session=sess)
    r2 = s2.execute(session=sess).fetch()
    pd.testing.assert_series_equal(r2, raw)

    # test dataframe
    name = 'd_name'
    raw = pd.DataFrame(np.random.rand(10, 3))
    d = md.DataFrame(raw, chunk_size=4)
    r1 = d.execute(name=name, session=sess).fetch()
    pd.testing.assert_frame_equal(r1, raw)

    d2 = md.named_dataframe(name=name, session=sess)
    r2 = d2.execute(session=sess).fetch()
    pd.testing.assert_frame_equal(r2, raw)
def test_use_arrow_dtype_n_unique(setup, check_ref_counts):
    """``nunique`` over both axes with arrow dtypes enabled matches pandas."""
    with option_context({'dataframe.use_arrow_dtype': True,
                         'combine_size': 2}):
        rs = np.random.RandomState(0)
        data1 = pd.DataFrame({
            'a': rs.random(10),
            'b': [f's{i}' for i in rs.randint(100, size=10)],
        })
        # replicate the string column so several arrow-dtype columns exist
        data1['c'] = data1['b'].copy()
        data1['d'] = data1['b'].copy()
        data1['e'] = data1['b'].copy()

        df = md.DataFrame(data1, chunk_size=(3, 2))

        r = df.nunique(axis=0)
        result = r.execute().fetch()
        expected = data1.nunique(axis=0)
        pd.testing.assert_series_equal(result, expected)

        r = df.nunique(axis=1)
        result = r.execute().fetch()
        expected = data1.nunique(axis=1)
        pd.testing.assert_series_equal(result, expected)