def testCut(self):
    """Validate `cut` argument checking, tiling, serialization and dtype."""
    s = from_pandas_series(pd.Series([1., 2., 3., 4.]), chunk_size=2)

    # an integer bin count must be positive
    with self.assertRaises(ValueError):
        _ = cut(s, -1)
    # input must be one-dimensional
    with self.assertRaises(ValueError):
        _ = cut([[1, 2], [3, 4]], 3)
    # input must be non-empty
    with self.assertRaises(ValueError):
        _ = cut([], 3)

    # retbins=True returns the cut series plus the bin-edge tensor
    r, b = cut(s, [1.5, 2.5], retbins=True)
    self.assertIsInstance(r, SERIES_TYPE)
    self.assertIsInstance(b, TENSOR_TYPE)

    r = r.tiles()
    self.assertEqual(len(r.chunks), 2)
    for c in r.chunks:
        self.assertIsInstance(c, SERIES_CHUNK_TYPE)
        self.assertEqual(c.shape, (2, ))

    # cutting a tensor input yields a categorical result
    r = cut(s.to_tensor(), [1.5, 2.5])
    self.assertIsInstance(r, CATEGORICAL_TYPE)
    self.assertEqual(len(r), len(s))
    self.assertIn('Categorical', repr(r))

    r = r.tiles()
    self.assertEqual(len(r.chunks), 2)
    for c in r.chunks:
        self.assertIsInstance(c, CATEGORICAL_CHUNK_TYPE)
        self.assertEqual(c.shape, (2, ))
        self.assertEqual(c.ndim, 1)

    # test serialize: graph round-trips through protobuf then json
    g = r.build_graph(tiled=False)
    g2 = type(g).from_pb(g.to_pb())
    g2 = type(g).from_json(g2.to_json())
    r2 = next(n for n in g2 if isinstance(n, CATEGORICAL_TYPE))
    self.assertEqual(len(r2), len(r))

    # labels=False produces a plain tensor of bin indices, dtype as pandas
    r = cut([0, 1, 1, 2], bins=4, labels=False)
    self.assertIsInstance(r, TENSOR_TYPE)
    e = pd.cut([0, 1, 1, 2], bins=4, labels=False)
    self.assertEqual(r.dtype, e.dtype)
def testSeriesIsin(self):
    """Check tiling of Series.isin under different chunking combinations."""
    # one chunk in, multiple chunks for the values argument
    a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=10)
    b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=2)

    r = a.isin(b).tiles()
    for i, c in enumerate(r.chunks):
        self.assertEqual(c.index, (i, ))
        self.assertEqual(c.dtype, np.dtype('bool'))
        self.assertEqual(c.shape, (10, ))
        self.assertEqual(len(c.op.inputs), 2)
        self.assertEqual(c.op.object_type, ObjectType.series)
        self.assertEqual(c.op.inputs[0].index, (i, ))
        self.assertEqual(c.op.inputs[0].shape, (10, ))
        self.assertEqual(c.op.inputs[1].index, (0, ))
        self.assertEqual(c.op.inputs[1].shape, (4, ))  # has been rechunked

    # multiple chunks in, one chunk for the values argument
    a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=2)
    b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=4)

    r = a.isin(b).tiles()
    for i, c in enumerate(r.chunks):
        self.assertEqual(c.index, (i, ))
        self.assertEqual(c.dtype, np.dtype('bool'))
        self.assertEqual(c.shape, (2, ))
        self.assertEqual(len(c.op.inputs), 2)
        self.assertEqual(c.op.object_type, ObjectType.series)
        self.assertEqual(c.op.inputs[0].index, (i, ))
        self.assertEqual(c.op.inputs[0].shape, (2, ))
        self.assertEqual(c.op.inputs[1].index, (0, ))
        self.assertEqual(c.op.inputs[1].shape, (4, ))

    # multiple chunks on both sides
    a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), chunk_size=2)
    b = from_pandas_series(pd.Series([2, 1, 9, 3]), chunk_size=2)

    r = a.isin(b).tiles()
    for i, c in enumerate(r.chunks):
        self.assertEqual(c.index, (i, ))
        self.assertEqual(c.dtype, np.dtype('bool'))
        self.assertEqual(c.shape, (2, ))
        self.assertEqual(len(c.op.inputs), 2)
        self.assertEqual(c.op.object_type, ObjectType.series)
        self.assertEqual(c.op.inputs[0].index, (i, ))
        self.assertEqual(c.op.inputs[0].shape, (2, ))
        self.assertEqual(c.op.inputs[1].index, (0, ))
        self.assertEqual(c.op.inputs[1].shape, (4, ))  # has been rechunked

    # non-list-like scalar values are rejected
    with self.assertRaises(TypeError):
        _ = a.isin('sth')
def testUfunc(self):
    """Check mars ufuncs and raw numpy ufuncs agree with pandas results."""
    df_raw = pd.DataFrame(np.random.uniform(size=(10, 10)),
                          index=pd.RangeIndex(9, -1, -1))
    df = from_pandas(df_raw, chunk_size=5)

    series_raw = pd.Series(np.random.uniform(size=10),
                           index=pd.RangeIndex(9, -1, -1))
    series = from_pandas_series(series_raw, chunk_size=5)

    # pairs of (numpy ufunc, mars tensor equivalent)
    ufuncs = [[np.abs, mt.abs], [np.log, mt.log], [np.log2, mt.log2],
              [np.log10, mt.log10], [np.sin, mt.sin], [np.cos, mt.cos],
              [np.tan, mt.tan], [np.sinh, mt.sinh], [np.cosh, mt.cosh],
              [np.tanh, mt.tanh], [np.arcsin, mt.arcsin],
              [np.arccos, mt.arccos], [np.arctan, mt.arctan],
              [np.arcsinh, mt.arcsinh], [np.arccosh, mt.arccosh],
              [np.arctanh, mt.arctanh], [np.radians, mt.radians],
              [np.degrees, mt.degrees], [np.ceil, mt.ceil],
              [np.floor, mt.floor],
              [partial(np.around, decimals=2), partial(mt.around, decimals=2)],
              [np.exp, mt.exp], [np.exp2, mt.exp2], [np.expm1, mt.expm1],
              [np.sqrt, mt.sqrt]]

    for raw, data in [(df_raw, df), (series_raw, series)]:
        for npf, mtf in ufuncs:
            # mars-side ufunc
            r = mtf(data)
            result = self.executor.execute_tensor(r, concat=True)[0]
            expected = npf(raw)
            if isinstance(raw, pd.DataFrame):
                pd.testing.assert_frame_equal(result, expected)
            else:
                pd.testing.assert_series_equal(result, expected)

            # test numpy ufunc: applying npf to a mars object should
            # dispatch to the mars implementation via __array_ufunc__
            r = npf(data)
            result = self.executor.execute_tensor(r, concat=True)[0]
            if isinstance(raw, pd.DataFrame):
                pd.testing.assert_frame_equal(result, expected)
            else:
                pd.testing.assert_series_equal(result, expected)
def testFromPandasSeries(self):
    """Verify index metadata and chunking of from_pandas_series."""
    data = pd.Series(np.random.rand(10), name='a')
    series = from_pandas_series(data, chunk_size=4)

    self.assertEqual(series.name, data.name)
    self.assertIsInstance(series.index_value._index_value, IndexValue.RangeIndex)
    self.assertEqual(series.index_value._index_value._slice, slice(0, 10, 1))
    self.assertTrue(series.index_value.is_monotonic_increasing)
    self.assertFalse(series.index_value.is_monotonic_decreasing)
    self.assertTrue(series.index_value.is_unique)
    self.assertEqual(series.index_value.min_val, 0)
    self.assertEqual(series.index_value.max_val, 9)

    series = series.tiles()

    # 10 elements with chunk_size=4 -> chunks of 4, 4 and 2
    self.assertEqual(len(series.chunks), 3)
    pd.testing.assert_series_equal(series.chunks[0].op.data,
                                   series.op.data.iloc[:4])
    self.assertEqual(series.chunks[0].index_value._index_value._slice,
                     slice(0, 4, 1))
    self.assertTrue(
        series.chunks[0].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(
        series.chunks[0].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(series.chunks[0].index_value._index_value._is_unique)
    pd.testing.assert_series_equal(series.chunks[1].op.data,
                                   series.op.data.iloc[4:8])
    self.assertEqual(series.chunks[1].index_value._index_value._slice,
                     slice(4, 8, 1))
    self.assertTrue(
        series.chunks[1].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(
        series.chunks[1].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(series.chunks[1].index_value._index_value._is_unique)
    pd.testing.assert_series_equal(series.chunks[2].op.data,
                                   series.op.data.iloc[8:])
    self.assertEqual(series.chunks[2].index_value._index_value._slice,
                     slice(8, 10, 1))
    self.assertTrue(
        series.chunks[2].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(
        series.chunks[2].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(series.chunks[2].index_value._index_value._is_unique)
def testToCPUExecution(self):
    """Round-trip a dataframe and a series through to_gpu/to_cpu and execute."""
    raw_df = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
    mdf = from_pandas_df(raw_df, chunk_size=(13, 21))

    # gpu -> cpu round trip must reproduce the original dataframe
    gpu_df = to_gpu(mdf)
    cpu_df = to_cpu(gpu_df)

    out = self.executor.execute_dataframe(cpu_df, concat=True)[0]
    self.assertIsInstance(out, pd.DataFrame)
    pd.testing.assert_frame_equal(out, raw_df)

    # same round trip for a series taken from the first column
    raw_series = raw_df.iloc[:, 0]
    mseries = from_pandas_series(raw_series, chunk_size=(13, 21))

    gpu_series = to_gpu(mseries)
    cpu_series = to_cpu(gpu_series)

    out = self.executor.execute_dataframe(cpu_series, concat=True)[0]
    self.assertIsInstance(out, pd.Series)
    pd.testing.assert_series_equal(out, raw_series)
def testSeriesReductionSerialize(self):
    """Check pb and json serialization round-trips of a reduction chunk."""
    data = pd.Series(np.random.rand(10), name='a')
    # reductions that do not support skipna take no extra kwargs
    if self.has_skipna:
        kwargs = dict(axis='index', skipna=False)
    else:
        kwargs = dict()
    reduction_df = getattr(from_pandas_series(data), self.func_name)(**kwargs).tiles()

    # pb
    chunk = reduction_df.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]

    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    # op type code must match the operand number declared by the test class
    self.assertEqual(int(op.type.split('.', 1)[1]), self.op_num)

    chunk2 = self._pb_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    self.assertEqual(chunk.name, chunk2.name)
    self.assertEqual(chunk.op.skipna, chunk2.op.skipna)
    self.assertEqual(chunk.op.axis, chunk2.op.axis)
    pd.testing.assert_index_equal(chunk2.index_value.to_pandas(),
                                  chunk.index_value.to_pandas())

    # json
    chunk = reduction_df.chunks[0]
    serials = self._json_serial(chunk)

    chunk2 = self._json_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    self.assertEqual(chunk2.name, chunk.name)
    self.assertEqual(chunk.op.skipna, chunk2.op.skipna)
    self.assertEqual(chunk.op.axis, chunk2.op.axis)
    pd.testing.assert_index_equal(chunk2.index_value.to_pandas(),
                                  chunk.index_value.to_pandas())
def testToGPU(self):
    """to_gpu keeps metadata for dataframe and series and is idempotent."""
    # test dataframe
    data = pd.DataFrame(np.random.rand(10, 10),
                        index=np.random.randint(-100, 100, size=(10, )),
                        columns=[np.random.bytes(10) for _ in range(10)])
    df = from_pandas_df(data)
    cdf = to_gpu(df)

    self.assertEqual(df.index_value, cdf.index_value)
    self.assertEqual(df.columns_value, cdf.columns_value)
    self.assertTrue(cdf.op.gpu)
    pd.testing.assert_series_equal(df.dtypes, cdf.dtypes)

    cdf = cdf.tiles()
    df = get_tiled(df)

    self.assertEqual(df.nsplits, cdf.nsplits)
    self.assertEqual(df.chunks[0].index_value, cdf.chunks[0].index_value)
    self.assertEqual(df.chunks[0].columns_value, cdf.chunks[0].columns_value)
    self.assertTrue(cdf.chunks[0].op.gpu)
    pd.testing.assert_series_equal(df.chunks[0].dtypes, cdf.chunks[0].dtypes)

    # to_gpu on an already-gpu dataframe returns the same object
    self.assertIs(cdf, to_gpu(cdf))

    # test series
    sdata = data.iloc[:, 0]
    series = from_pandas_series(sdata)
    cseries = to_gpu(series)

    self.assertEqual(series.index_value, cseries.index_value)
    self.assertTrue(cseries.op.gpu)

    cseries = cseries.tiles()
    series = get_tiled(series)

    self.assertEqual(series.nsplits, cseries.nsplits)
    self.assertEqual(series.chunks[0].index_value, cseries.chunks[0].index_value)
    self.assertTrue(cseries.chunks[0].op.gpu)

    # idempotence for the series as well
    self.assertIs(cseries, to_gpu(cseries))
def test_to_numeric():
    """Argument validation and tiling of to_numeric."""
    raw = pd.DataFrame({"a": [1.0, 2, 3, -3]})
    df = from_pandas_df(raw, chunk_size=2)

    # dataframe input is rejected
    with pytest.raises(ValueError):
        _ = to_numeric(df)
    # multi-dimensional input is rejected
    with pytest.raises(ValueError):
        _ = to_numeric([['1.0', 1]])
    # empty input is rejected
    with pytest.raises(ValueError):
        _ = to_numeric([])

    # series input keeps its chunking and stays a series
    s = from_pandas_series(pd.Series(['1.0', '2.0', 1, -2]), chunk_size=2)
    r = tile(to_numeric(s))
    assert len(r.chunks) == 2
    assert isinstance(r, SERIES_TYPE)

    # list input produces a tensor
    r = tile(to_numeric(['1.0', '2.0', 1, -2]))
    assert isinstance(r, TENSOR_TYPE)
def testSeriesReductionSerialize(self):
    """Serialization round-trip for reductions built on DataFrameAggregate."""
    data = pd.Series(np.random.rand(10), name='a')
    # reductions that do not support skipna take no extra kwargs
    if self.has_skipna:
        kwargs = dict(axis='index', skipna=False)
    else:
        kwargs = dict()
    reduction_df = getattr(from_pandas_series(data, chunk_size=3),
                           self.func_name)(**kwargs).tiles()

    # pb
    chunk = reduction_df.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]

    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    # all reductions are lowered to the shared aggregate operand
    self.assertEqual(int(op.type.split('.', 1)[1]), DataFrameAggregate._op_type_)

    chunk2 = self._pb_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    self.assertEqual(chunk.op.agg_funcs[0].kwds.get('skipna'),
                     chunk2.op.agg_funcs[0].kwds.get('skipna'))
    self.assertEqual(chunk.op.axis, chunk2.op.axis)

    # json
    chunk = reduction_df.chunks[0]
    serials = self._json_serial(chunk)

    chunk2 = self._json_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    self.assertEqual(chunk.op.agg_funcs[0].kwds.get('skipna'),
                     chunk2.op.agg_funcs[0].kwds.get('skipna'))
    self.assertEqual(chunk.op.axis, chunk2.op.axis)
def test_cut():
    """Validate `cut` argument checking, tiling and dtype (pytest style)."""
    s = from_pandas_series(pd.Series([1., 2., 3., 4.]), chunk_size=2)

    # an integer bin count must be positive
    with pytest.raises(ValueError):
        _ = cut(s, -1)
    # input must be one-dimensional
    with pytest.raises(ValueError):
        _ = cut([[1, 2], [3, 4]], 3)
    # input must be non-empty
    with pytest.raises(ValueError):
        _ = cut([], 3)

    # retbins=True returns the cut series plus the bin-edge tensor
    r, b = cut(s, [1.5, 2.5], retbins=True)
    assert isinstance(r, SERIES_TYPE)
    assert isinstance(b, TENSOR_TYPE)

    r = tile(r)
    assert len(r.chunks) == 2
    for c in r.chunks:
        assert isinstance(c, SERIES_CHUNK_TYPE)
        assert c.shape == (2, )

    # cutting a tensor input yields a categorical result
    r = cut(s.to_tensor(), [1.5, 2.5])
    assert isinstance(r, CATEGORICAL_TYPE)
    assert len(r) == len(s)
    assert 'Categorical' in repr(r)

    r = tile(r)
    assert len(r.chunks) == 2
    for c in r.chunks:
        assert isinstance(c, CATEGORICAL_CHUNK_TYPE)
        assert c.shape == (2, )
        assert c.ndim == 1

    # labels=False produces a plain tensor of bin indices, dtype as pandas
    r = cut([0, 1, 1, 2], bins=4, labels=False)
    assert isinstance(r, TENSOR_TYPE)
    e = pd.cut([0, 1, 1, 2], bins=4, labels=False)
    assert r.dtype == e.dtype
def testSeriesApplyTransform(self):
    """Tiling behaviour of Series.apply and Series.transform."""
    idxes = [chr(ord('A') + i) for i in range(20)]
    s_raw = pd.Series([i**2 for i in range(20)], index=idxes)

    series = from_pandas_series(s_raw, chunk_size=5)

    # a string func naming an arithmetic op is rewritten into that operand
    r = series.apply('add', args=(1, )).tiles()
    self.assertEqual(r.op._op_type_, opcodes.ADD)

    r = series.apply(np.sqrt).tiles()
    # NOTE(review): assertTrue(a, b) treats `b` as the failure message, so
    # this dtype check (and the similar ones below) is vacuous; it was
    # probably meant to be assertEqual -- confirm the inferred dtypes
    # before tightening.
    self.assertTrue(np.dtype('float64'), r.dtype)
    self.assertEqual(r.shape, series.shape)
    self.assertEqual(r.op._op_type_, opcodes.SERIES_APPLY)
    self.assertEqual(r.op.object_type, ObjectType.series)
    self.assertEqual(r.chunks[0].shape, (5, ))
    self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))

    r = series.apply('sqrt').tiles()
    self.assertTrue(np.dtype('float64'), r.dtype)  # vacuous, see note above
    self.assertEqual(r.shape, series.shape)
    self.assertEqual(r.op._op_type_, opcodes.SERIES_APPLY)
    self.assertEqual(r.op.object_type, ObjectType.series)
    self.assertEqual(r.chunks[0].shape, (5, ))
    self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))

    r = series.transform(lambda x: x + 1).tiles()
    self.assertTrue(np.dtype('float64'), r.dtype)  # vacuous, see note above
    self.assertEqual(r.shape, series.shape)
    self.assertEqual(r.op._op_type_, opcodes.SERIES_TRANSFORM)
    self.assertEqual(r.op.object_type, ObjectType.series)
    self.assertEqual(r.chunks[0].shape, (5, ))
    self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))

    # convert_dtype=False keeps the raw object dtype
    r = series.apply(lambda x: [x, x + 1], convert_dtype=False).tiles()
    self.assertTrue(np.dtype('object'), r.dtype)  # vacuous, see note above
    self.assertEqual(r.shape, series.shape)
    self.assertEqual(r.op._op_type_, opcodes.SERIES_APPLY)
    self.assertEqual(r.op.object_type, ObjectType.series)
    self.assertEqual(r.chunks[0].shape, (5, ))
    self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))
def testSeriesSumSerialize(self):
    """Serialization round-trip (pb and json) of a series sum chunk."""
    data = pd.Series(np.random.rand(10), name='a')
    sum_df = from_pandas_series(data).sum(axis='index', skipna=False).tiles()

    # pb
    chunk = sum_df.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]

    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    self.assertEqual(int(op.type.split('.', 1)[1]), OperandDef.SUM)

    chunk2 = self._pb_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    self.assertEqual(chunk.name, chunk2.name)
    self.assertEqual(chunk.op.skipna, chunk2.op.skipna)
    self.assertEqual(chunk.op.axis, chunk2.op.axis)
    pd.testing.assert_index_equal(chunk2.index_value.to_pandas(),
                                  chunk.index_value.to_pandas())

    # json
    chunk = sum_df.chunks[0]
    serials = self._json_serial(chunk)

    chunk2 = self._json_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    self.assertEqual(chunk2.name, chunk.name)
    self.assertEqual(chunk.op.skipna, chunk2.op.skipna)
    self.assertEqual(chunk.op.axis, chunk2.op.axis)
    pd.testing.assert_index_equal(chunk2.index_value.to_pandas(),
                                  chunk.index_value.to_pandas())
def test_from_pandas_series():
    """Index metadata and chunking of from_pandas_series (pytest style)."""
    data = pd.Series(np.random.rand(10), name='a')
    series = from_pandas_series(data, chunk_size=4)

    assert series.name == data.name
    assert isinstance(series.index_value._index_value, IndexValue.RangeIndex)
    assert series.index_value._index_value._slice == slice(0, 10, 1)
    assert series.index_value.is_monotonic_increasing is True
    assert series.index_value.is_monotonic_decreasing is False
    assert series.index_value.is_unique is True
    assert series.index_value.min_val == 0
    assert series.index_value.max_val == 9

    series = tile(series)

    # 10 elements with chunk_size=4 -> chunks of 4, 4 and 2
    assert len(series.chunks) == 3
    pd.testing.assert_series_equal(series.chunks[0].op.data,
                                   series.op.data.iloc[:4])
    assert series.chunks[0].index_value._index_value._slice == slice(0, 4, 1)
    assert series.chunks[
        0].index_value._index_value._is_monotonic_increasing is True
    assert series.chunks[
        0].index_value._index_value._is_monotonic_decreasing is False
    assert series.chunks[0].index_value._index_value._is_unique is True
    pd.testing.assert_series_equal(series.chunks[1].op.data,
                                   series.op.data.iloc[4:8])
    assert series.chunks[1].index_value._index_value._slice == slice(4, 8, 1)
    assert series.chunks[
        1].index_value._index_value._is_monotonic_increasing is True
    assert series.chunks[
        1].index_value._index_value._is_monotonic_decreasing is False
    assert series.chunks[1].index_value._index_value._is_unique is True
    pd.testing.assert_series_equal(series.chunks[2].op.data,
                                   series.op.data.iloc[8:])
    assert series.chunks[2].index_value._index_value._slice == slice(8, 10, 1)
    assert series.chunks[
        2].index_value._index_value._is_monotonic_increasing is True
    assert series.chunks[
        2].index_value._index_value._is_monotonic_decreasing is False
    assert series.chunks[2].index_value._index_value._is_unique is True
def testCheckNA(self):
    """isna/notna on a dataframe and series with randomly placed values."""
    df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    # scatter some non-NaN values over the all-NaN frame
    for _ in range(20):
        df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99)

    df = from_pandas_df(df_raw, chunk_size=4)

    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(df.isna(), concat=True)[0],
        df_raw.isna())
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(df.notna(), concat=True)[0],
        df_raw.notna())

    series_raw = pd.Series(np.nan, index=range(20))
    # scatter some non-NaN values over the all-NaN series
    for _ in range(3):
        series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)

    series = from_pandas_series(series_raw, chunk_size=4)

    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(series.isna(), concat=True)[0],
        series_raw.isna())
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(series.notna(), concat=True)[0],
        series_raw.notna())
def test_to_gpu():
    """to_gpu keeps metadata and is idempotent (pytest style)."""
    # test dataframe
    data = pd.DataFrame(np.random.rand(10, 10),
                        index=np.random.randint(-100, 100, size=(10, )),
                        columns=[np.random.bytes(10) for _ in range(10)])
    df = from_pandas_df(data)
    cdf = to_gpu(df)

    assert df.index_value == cdf.index_value
    assert df.columns_value == cdf.columns_value
    assert cdf.op.gpu is True
    pd.testing.assert_series_equal(df.dtypes, cdf.dtypes)

    df, cdf = tile(df, cdf)

    assert df.nsplits == cdf.nsplits
    assert df.chunks[0].index_value == cdf.chunks[0].index_value
    assert df.chunks[0].columns_value == cdf.chunks[0].columns_value
    assert cdf.chunks[0].op.gpu is True
    pd.testing.assert_series_equal(df.chunks[0].dtypes, cdf.chunks[0].dtypes)

    # to_gpu on an already-gpu dataframe returns the same object
    assert cdf is to_gpu(cdf)

    # test series
    sdata = data.iloc[:, 0]
    series = from_pandas_series(sdata)
    cseries = to_gpu(series)

    assert series.index_value == cseries.index_value
    assert cseries.op.gpu is True

    series, cseries = tile(series, cseries)

    assert series.nsplits == cseries.nsplits
    assert series.chunks[0].index_value == cseries.chunks[0].index_value
    assert cseries.chunks[0].op.gpu is True

    # idempotence for the series as well
    assert cseries is to_gpu(cseries)
def testRechunkExecution(self):
    """Rechunk execution for dataframe, series and index objects."""
    data = pd.DataFrame(np.random.rand(8, 10))
    df = from_pandas_df(pd.DataFrame(data), chunk_size=3)
    df2 = df.rechunk((3, 4))
    res = self.executor.execute_dataframe(df2, concat=True)[0]
    pd.testing.assert_frame_equal(data, res)

    data = pd.DataFrame(np.random.rand(10, 10),
                        index=np.random.randint(-100, 100, size=(10, )),
                        columns=[np.random.bytes(10) for _ in range(10)])
    df = from_pandas_df(data)
    df2 = df.rechunk(5)
    res = self.executor.execute_dataframe(df2, concat=True)[0]
    pd.testing.assert_frame_equal(data, res)

    # test Series rechunk execution.
    data = pd.Series(np.random.rand(10, ))
    series = from_pandas_series(data)
    series2 = series.rechunk(3)
    res = self.executor.execute_dataframe(series2, concat=True)[0]
    pd.testing.assert_series_equal(data, res)

    series2 = series.rechunk(1)
    res = self.executor.execute_dataframe(series2, concat=True)[0]
    pd.testing.assert_series_equal(data, res)

    # test index rechunk execution
    data = pd.Index(np.random.rand(10, ))
    index = from_pandas_index(data)
    index2 = index.rechunk(3)
    res = self.executor.execute_dataframe(index2, concat=True)[0]
    pd.testing.assert_index_equal(data, res)

    index2 = index.rechunk(1)
    res = self.executor.execute_dataframe(index2, concat=True)[0]
    pd.testing.assert_index_equal(data, res)
def testSeriesCumReduction(self):
    """Run the cumulative reduction under test for several chunk sizes."""
    data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)],
                     name='a')

    # single chunk
    reduction_df1 = self.compute(from_pandas_series(data))
    pd.testing.assert_series_equal(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df1, concat=True)[0])

    reduction_df2 = self.compute(from_pandas_series(data, chunk_size=6))
    pd.testing.assert_series_equal(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df2, concat=True)[0])

    reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3))
    pd.testing.assert_series_equal(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df3, concat=True)[0])

    # explicit axis argument
    reduction_df4 = self.compute(from_pandas_series(data, chunk_size=4),
                                 axis='index')
    pd.testing.assert_series_equal(
        self.compute(data, axis='index'),
        self.executor.execute_dataframe(reduction_df4, concat=True)[0])

    data = pd.Series(np.random.rand(20), name='a')
    data[0] = 0.1  # make sure not all elements are NAN
    data[data > 0.5] = np.nan

    reduction_df1 = self.compute(from_pandas_series(data, chunk_size=3))
    pd.testing.assert_series_equal(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df1, concat=True)[0])

    # NaNs must propagate when skipna=False
    reduction_df2 = self.compute(from_pandas_series(data, chunk_size=3),
                                 skipna=False)
    pd.testing.assert_series_equal(
        self.compute(data, skipna=False),
        self.executor.execute_dataframe(reduction_df2, concat=True)[0])
def testFromPandasSeriesExecution(self):
    """Round-trip a multi-level-indexed pandas Series through the executor."""
    raw = pd.Series(np.random.rand(20),
                    index=[np.arange(20), np.arange(20, 0, -1)],
                    name='a')
    mars_series = from_pandas_series(raw, chunk_size=13)

    fetched = self.executor.execute_dataframe(mars_series, concat=True)[0]
    pd.testing.assert_series_equal(raw, fetched)
def test_series(setup, func_name, func_opts):
    """Execute the binary op under test on series pairs and against scalars."""
    # only one chunk
    s1 = pd.Series(np.arange(10) + 1)
    s1 = to_boolean_if_needed(func_opts.func_name, s1)
    s2 = pd.Series(np.arange(10) + 1)
    s2 = to_boolean_if_needed(func_opts.func_name, s2)
    r = func_opts.func(from_pandas_series(s1, chunk_size=10),
                       from_pandas_series(s2, chunk_size=10))
    result = r.execute().fetch()
    expected = func_opts.func(s1, s2)
    pd.testing.assert_series_equal(expected, result)

    # same index, different chunk sizes
    s1 = pd.Series(np.arange(10) + 1)
    s1 = to_boolean_if_needed(func_opts.func_name, s1)
    s2 = pd.Series(np.arange(10) + 1)
    s2 = to_boolean_if_needed(func_opts.func_name, s2)
    r = func_opts.func(from_pandas_series(s1, chunk_size=4),
                       from_pandas_series(s2, chunk_size=6))
    result = r.execute().fetch()
    expected = func_opts.func(s1, s2)
    pd.testing.assert_series_equal(expected, result)

    # no shuffle
    s1 = pd.Series(np.arange(10) + 1, index=range(10))
    s1 = to_boolean_if_needed(func_opts.func_name, s1)
    s2 = pd.Series(np.arange(10) + 1, index=range(10, 0, -1))
    s2 = to_boolean_if_needed(func_opts.func_name, s2)
    r = func_opts.func(from_pandas_series(s1, chunk_size=4),
                       from_pandas_series(s2, chunk_size=6))
    result = r.execute().fetch()
    expected = func_opts.func(s1, s2)
    pd.testing.assert_series_equal(expected, result)

    # shuffle: permuted indexes force an index alignment shuffle
    data = (np.arange(10) + 1).astype(np.int64, copy=False)
    s1 = pd.Series(data, index=np.random.permutation(range(10)))
    s1 = to_boolean_if_needed(func_opts.func_name, s1)
    s2 = pd.Series(data, index=np.random.permutation(range(10, 0, -1)))
    s2 = to_boolean_if_needed(func_opts.func_name, s2)
    r = func_opts.func(from_pandas_series(s1, chunk_size=4),
                       from_pandas_series(s2, chunk_size=6))
    result = r.execute().fetch()
    expected = func_opts.func(s1, s2)
    pd.testing.assert_series_equal(expected, result)

    if func_opts.func_name in ['__and__', '__or__', '__xor__']:
        # bitwise logical operators don't support floating point scalars
        return

    # operate with scalar
    s1 = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10)))
    s1 = to_boolean_if_needed(func_opts.func_name, s1)
    r = func_opts.func(from_pandas_series(s1, chunk_size=4), 4)
    result = r.execute().fetch()
    expected = func_opts.func(s1, 4)
    pd.testing.assert_series_equal(expected, result)

    # reverse with scalar
    s1 = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10)))
    s1 = to_boolean_if_needed(func_opts.func_name, s1)
    r = func_opts.func(4, from_pandas_series(s1, chunk_size=4))
    result = r.execute().fetch()
    expected = func_opts.func(4, s1)
    pd.testing.assert_series_equal(expected, result)
def test_dataframe_and_series(setup, func_name, func_opts):
    """Execute the binary op between dataframes and series along both axes."""
    if func_opts.func_name in ['__and__', '__or__', '__xor__']:
        # pandas fails to compute some expected values due to `na`.
        return

    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    data1 = to_boolean_if_needed(func_opts.func_name, data1)
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    data2 = to_boolean_if_needed(func_opts.func_name, data2)

    s1 = from_pandas_series(data2[1], chunk_size=(6,))

    # operate on single-column dataframe and series
    df1 = from_pandas(data1[[1]], chunk_size=(5, 5))
    r1 = getattr(df1, func_opts.func_name)(s1, axis='index')

    expected = getattr(data1[[1]], func_opts.func_name)(data2[1], axis='index')
    result = r1.execute().fetch()
    pd.testing.assert_frame_equal(expected, result)

    # operate on dataframe and series without shuffle
    df2 = from_pandas(data1, chunk_size=(5, 5))
    r2 = getattr(df2, func_opts.func_name)(s1, axis='index')

    expected = getattr(data1, func_opts.func_name)(data2[1], axis='index')
    result = r2.execute().fetch()
    pd.testing.assert_frame_equal(expected, result)

    # operate on dataframe and series with shuffle
    df3 = from_pandas(data1, chunk_size=(5, 5))
    r3 = getattr(df3, func_opts.func_name)(s1, axis='columns')

    expected = getattr(data1, func_opts.func_name)(data2[1], axis='columns')
    result = r3.execute().fetch()
    pd.testing.assert_frame_equal(expected, result)

    # test both one chunk, axis=0
    pdf = pd.DataFrame({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, index=[1, 2, 3])
    pdf = to_boolean_if_needed(func_opts.func_name, pdf)
    df = from_pandas(pdf)
    series = pd.Series([0, 1, 2], index=[1, 2, 3])
    mars_series = from_pandas_series(series)
    result = getattr(df, func_opts.func_name)(mars_series, axis=0).execute().fetch()
    expected = getattr(pdf, func_opts.func_name)(series, axis=0)
    pd.testing.assert_frame_equal(expected, result)

    # test different number of chunks, axis=0
    pdf = pd.DataFrame({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, index=[1, 2, 3])
    pdf = to_boolean_if_needed(func_opts.func_name, pdf)
    df = from_pandas(pdf, chunk_size=1)
    series = pd.Series([0, 1, 2], index=[1, 2, 3])
    mars_series = from_pandas_series(series)
    result = getattr(df, func_opts.func_name)(mars_series, axis=0).execute().fetch()
    expected = getattr(pdf, func_opts.func_name)(series, axis=0)
    pd.testing.assert_frame_equal(expected, result)

    # test with row shuffle, axis=0
    pdf = pd.DataFrame({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, index=[2, 1, 3])
    pdf = to_boolean_if_needed(func_opts.func_name, pdf)
    df = from_pandas(pdf, chunk_size=1)
    series = pd.Series([0, 1, 2], index=[3, 1, 2])
    mars_series = from_pandas_series(series)
    result = getattr(df, func_opts.func_name)(mars_series, axis=0).execute().fetch()
    expected = getattr(pdf, func_opts.func_name)(series, axis=0).reindex([3, 1, 2])
    # modify the order of rows
    result = result.reindex(index=[3, 1, 2])
    pd.testing.assert_frame_equal(expected, result)

    # test both one chunk, axis=1
    pdf = pd.DataFrame({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                       index=['ra', 'rb', 'rc'])
    pdf = to_boolean_if_needed(func_opts.func_name, pdf)
    df = from_pandas(pdf)
    series = pd.Series([0, 1, 2], index=[1, 2, 3])
    mars_series = from_pandas_series(series)
    result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch()
    expected = getattr(pdf, func_opts.func_name)(series, axis=1)
    pd.testing.assert_frame_equal(expected, result)

    # test different number of chunks, axis=1
    pdf = pd.DataFrame({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                       index=['ra', 'rb', 'rc'])
    pdf = to_boolean_if_needed(func_opts.func_name, pdf)
    df = from_pandas(pdf, chunk_size=1)
    series = pd.Series([0, 1, 2], index=[1, 2, 3])
    mars_series = from_pandas_series(series)
    result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch()
    expected = getattr(pdf, func_opts.func_name)(series, axis=1)
    pd.testing.assert_frame_equal(expected, result)

    # test with row shuffle, axis=1
    pdf = pd.DataFrame({1: [1, 3, 2], 3: [1, 2, 3], 2: [360, 180, 2]},
                       index=['ra', 'rb', 'rc'])
    pdf = to_boolean_if_needed(func_opts.func_name, pdf)
    df = from_pandas(pdf, chunk_size=1)
    series = pd.Series([0, 1, 2], index=[3, 1, 2])
    mars_series = from_pandas_series(series)
    result = getattr(df, func_opts.func_name)(mars_series, axis=1).execute().fetch()
    expected = getattr(pdf, func_opts.func_name)(series, axis=1)
    # modify the order of columns
    result = result[[1, 2, 3]]
    pd.testing.assert_frame_equal(expected, result)
def testSeriesReduction(self):
    """Run the scalar reduction under test for several chunk sizes,
    NaN layouts and min_count settings."""
    data = pd.Series(np.random.rand(20), index=[str(i) for i in range(20)],
                     name='a')

    reduction_df1 = self.compute(from_pandas_series(data))
    # fix: was assertEqual on a float scalar while every other chunking
    # below compares approximately -- use assertAlmostEqual consistently
    self.assertAlmostEqual(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df1, concat=True)[0])

    reduction_df2 = self.compute(from_pandas_series(data, chunk_size=6))
    self.assertAlmostEqual(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df2, concat=True)[0])

    reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3))
    self.assertAlmostEqual(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df3, concat=True)[0])

    # explicit axis argument
    reduction_df4 = self.compute(from_pandas_series(data, chunk_size=4),
                                 axis='index')
    self.assertAlmostEqual(
        self.compute(data, axis='index'),
        self.executor.execute_dataframe(reduction_df4, concat=True)[0])

    data = pd.Series(np.random.rand(20), name='a')
    data[0] = 0.1  # make sure not all elements are NAN
    data[data > 0.5] = np.nan

    reduction_df1 = self.compute(from_pandas_series(data, chunk_size=3))
    self.assertAlmostEqual(
        self.compute(data),
        self.executor.execute_dataframe(reduction_df1, concat=True)[0])

    # with skipna=False the NaNs must propagate into the result
    reduction_df2 = self.compute(from_pandas_series(data, chunk_size=3),
                                 skipna=False)
    self.assertTrue(
        np.isnan(
            self.executor.execute_dataframe(reduction_df2, concat=True)[0]))

    if self.has_min_count:
        # min_count combined with skipna=False still yields NaN
        reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3),
                                     skipna=False, min_count=2)
        self.assertTrue(
            np.isnan(
                self.executor.execute_dataframe(reduction_df3, concat=True)[0]))

        reduction_df4 = self.compute(from_pandas_series(data, chunk_size=3),
                                     min_count=1)
        self.assertAlmostEqual(
            self.compute(data, min_count=1),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # min_count larger than the series length produces NaN
        reduction_df5 = self.compute(from_pandas_series(data, chunk_size=3),
                                     min_count=21)
        self.assertTrue(
            np.isnan(
                self.executor.execute_dataframe(reduction_df5, concat=True)[0]))
def testMainDataFrameWithoutEtcd(self):
    """End-to-end DataFrame arithmetic on a local cluster without etcd.

    Spins up scheduler/worker processes, submits tileable graphs through
    the session actor, then compares fetched results against pandas for:
    aligned frames, frames with shuffled indexes/columns, and a plain
    Series round-trip.
    """
    import pandas as pd
    from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
    from mars.dataframe.datasource.series import from_pandas as from_pandas_series
    from mars.dataframe.arithmetic import add

    self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

    # Case 1: identical default indexes/columns, different chunk sizes.
    data1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = from_pandas_df(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = from_pandas_df(data2, chunk_size=6)
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    # Case 2: shuffled integer columns and mismatched indexes — exercises
    # the alignment (shuffle) path of the add operand.
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas_df(data1, chunk_size=(10, 5))
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas_df(data2, chunk_size=(10, 6))
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    # Case 3: both axes unsorted and only partially overlapping.
    data1 = pd.DataFrame(np.random.rand(10, 10),
                         index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas_df(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10),
                         index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas_df(data2, chunk_size=6)
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    # Case 4: plain Series datasource round-trip through the cluster.
    s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series1 = from_pandas_series(s1)

    graph = series1.build_graph()
    targets = [series1.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, series1.key)
    pd.testing.assert_series_equal(s1, loads(result))
def testStringMethod(self):
    """Tile-time metadata checks for Series ``.str`` accessor methods.

    Verifies dtype/name/index/shape propagation for contains, split,
    cat and extract (both ``expand`` modes), plus the error cases the
    accessor must reject. No execution — only tiling metadata.
    """
    s = pd.Series(['a', 'b', 'c'], name='s')
    series = from_pandas_series(s, chunk_size=2)

    # Unknown accessor attributes must raise, not silently build an op.
    with self.assertRaises(AttributeError):
        _ = series.str.non_exist

    # contains -> boolean Series, keeping name/index/shape.
    r = series.str.contains('c')
    self.assertEqual(r.dtype, np.bool_)
    self.assertEqual(r.name, s.name)
    pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index)
    self.assertEqual(r.shape, s.shape)

    r = r.tiles()
    for i, c in enumerate(r.chunks):
        self.assertEqual(c.index, (i, ))
        self.assertEqual(c.dtype, np.bool_)
        self.assertEqual(c.name, s.name)
        # each chunk covers a 2-row slice of the original index
        pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                      s.index[i * 2:(i + 1) * 2])
        self.assertEqual(c.shape, (2, ) if i == 0 else (1, ))

    # split with expand=True yields a DataFrame with n+1 columns.
    r = series.str.split(',', expand=True, n=1)
    self.assertEqual(r.op.object_type, ObjectType.dataframe)
    self.assertEqual(r.shape, (3, 2))
    pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index)
    pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(2))

    r = r.tiles()
    for i, c in enumerate(r.chunks):
        self.assertEqual(c.index, (i, 0))
        pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                      s.index[i * 2:(i + 1) * 2])
        pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(2))
        self.assertEqual(c.shape, (2, 2) if i == 0 else (1, 2))

    # cat argument validation: wrong nesting, wrong length, wrong types.
    with self.assertRaises(TypeError):
        _ = series.str.cat([['1', '2']])
    with self.assertRaises(ValueError):
        _ = series.str.cat(['1', '2'])
    with self.assertRaises(ValueError):
        _ = series.str.cat(',')
    with self.assertRaises(TypeError):
        _ = series.str.cat({'1', '2', '3'})

    # cat with only a separator collapses to a scalar.
    r = series.str.cat(sep=',')
    self.assertEqual(r.op.object_type, ObjectType.scalar)
    self.assertEqual(r.dtype, s.dtype)

    r = r.tiles()
    self.assertEqual(len(r.chunks), 1)
    self.assertEqual(r.chunks[0].op.object_type, ObjectType.scalar)
    self.assertEqual(r.chunks[0].dtype, s.dtype)

    # extract with expand=False keeps Series form.
    r = series.str.extract(r'[ab](\d)', expand=False)
    self.assertEqual(r.op.object_type, ObjectType.series)
    self.assertEqual(r.dtype, s.dtype)

    r = r.tiles()
    for i, c in enumerate(r.chunks):
        self.assertEqual(c.index, (i, ))
        self.assertEqual(c.dtype, s.dtype)
        self.assertEqual(c.name, s.name)
        pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                      s.index[i * 2:(i + 1) * 2])
        self.assertEqual(c.shape, (2, ) if i == 0 else (1, ))

    # extract with expand=True yields a one-column DataFrame.
    r = series.str.extract(r'[ab](\d)', expand=True)
    self.assertEqual(r.op.object_type, ObjectType.dataframe)
    self.assertEqual(r.shape, (3, 1))
    pd.testing.assert_index_equal(r.index_value.to_pandas(), s.index)
    pd.testing.assert_index_equal(r.columns_value.to_pandas(), pd.RangeIndex(1))

    r = r.tiles()
    for i, c in enumerate(r.chunks):
        self.assertEqual(c.index, (i, 0))
        pd.testing.assert_index_equal(c.index_value.to_pandas(),
                                      s.index[i * 2:(i + 1) * 2])
        pd.testing.assert_index_equal(c.columns_value.to_pandas(), pd.RangeIndex(1))
        self.assertEqual(c.shape, (2, 1) if i == 0 else (1, 1))

    # the accessor should expose pandas string methods for introspection
    self.assertIn('lstrip', dir(series.str))
def testTransform(self):
    """Tile-time metadata checks for DataFrame/Series ``transform``.

    Covers callable/list/dict transforms on both axes, the ``_call_agg``
    variants (which may change shape, hence ``np.nan`` dims), and a
    Series transform. ``chunk_store_limit`` is shrunk to 20 so tiling
    splits outputs and the CONCATENATE fusion path is exercised.
    """
    cols = [chr(ord('A') + i) for i in range(10)]
    df_raw = pd.DataFrame(dict((c, [i**2 for i in range(20)]) for c in cols))
    df = from_pandas_df(df_raw, chunk_size=5)

    idxes = [chr(ord('A') + i) for i in range(20)]
    s_raw = pd.Series([i**2 for i in range(20)], index=idxes)
    series = from_pandas_series(s_raw, chunk_size=5)

    def rename_fn(f, new_name):
        # give a lambda a distinct __name__ so it is a distinct agg entry
        f.__name__ = new_name
        return f

    old_chunk_store_limit = options.chunk_store_limit
    try:
        options.chunk_store_limit = 20

        # DATAFRAME CASES
        # test transform scenarios on data frames
        r = df.transform(lambda x: list(range(len(x)))).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, df.shape)
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (df.shape[0], 20 // df.shape[0]))
        self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        r = df.transform(lambda x: list(range(len(x))), axis=1).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, df.shape)
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (20 // df.shape[1], df.shape[1]))
        self.assertEqual(r.chunks[0].inputs[0].shape[1], df_raw.shape[1])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        # list of funcs triples the column count
        r = df.transform(['cumsum', 'cummax', lambda x: x + 1]).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, (df.shape[0], df.shape[1] * 3))
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (df.shape[0], 20 // df.shape[0] * 3))
        self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        # dict mapping column -> func(s): A(1) + D(2) + F(1) = 4 columns
        r = df.transform({'A': 'cumsum', 'D': ['cumsum', 'cummax'],
                          'F': lambda x: x + 1}).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, (df.shape[0], 4))
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (df.shape[0], 1))
        self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        # test agg scenarios on series: output length unknown -> np.nan dims
        r = df.transform(lambda x: x.iloc[:-1], _call_agg=True).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, (np.nan, df.shape[1]))
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (np.nan, 1))
        self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        r = df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, (df.shape[0], np.nan))
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (2, np.nan))
        self.assertEqual(r.chunks[0].inputs[0].shape[1], df_raw.shape[1])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        fn_list = [
            rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
            lambda x: x.iloc[:-1].reset_index(drop=True)
        ]
        r = df.transform(fn_list, _call_agg=True).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, (np.nan, df.shape[1] * 2))
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (np.nan, 2))
        self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        # scalar-per-column agg collapses to a Series
        r = df.transform(lambda x: x.sum(), _call_agg=True).tiles()
        self.assertEqual(r.dtype, np.dtype('int64'))
        self.assertEqual(r.shape, (df.shape[1], ))
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.series)
        self.assertEqual(r.chunks[0].shape, (20 // df.shape[0], ))
        self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        fn_dict = {
            'A': rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
            'D': [
                rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                lambda x: x.iloc[:-1].reset_index(drop=True)
            ],
            'F': lambda x: x.iloc[:-1].reset_index(drop=True),
        }
        r = df.transform(fn_dict, _call_agg=True).tiles()
        self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
        self.assertEqual(r.shape, (np.nan, 4))
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.dataframe)
        self.assertEqual(r.chunks[0].shape, (np.nan, 1))
        self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
        self.assertEqual(r.chunks[0].inputs[0].op._op_type_, opcodes.CONCATENATE)

        # SERIES CASES
        # test transform scenarios on series
        r = series.transform(lambda x: x + 1).tiles()
        # FIX: the original `assertTrue(np.dtype('float64'), r.dtype)` was a
        # no-op — the first argument is always truthy and the second was taken
        # as the failure message. Assert the dtype really inferred by pandas
        # for the same transform (int64 + 1 -> int64) instead.
        self.assertEqual(s_raw.transform(lambda x: x + 1).dtype, r.dtype)
        self.assertEqual(r.shape, series.shape)
        self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.series)
        self.assertEqual(r.chunks[0].shape, (5, ))
        self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))
    finally:
        # always restore the global option even if an assertion fails
        options.chunk_store_limit = old_chunk_store_limit
def testFillNA(self):
    """Tile-time checks for ``fillna``/``ffill``/``bfill``.

    Verifies argument validation, single-chunk vs multi-chunk tiling
    (single chunk needs no map/combine stage, so ``op.stage`` is None),
    directional fills that do require a combine stage, and filling with
    Mars DataFrame/Series values.
    """
    # Mostly-NaN frame with ~20 random finite entries.
    df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    for _ in range(20):
        df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99)
    value_df_raw = pd.DataFrame(np.random.randint(0, 100, (10, 7)).astype(np.float32),
                                columns=list('ABCDEFG'))
    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(3):
        series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
    value_series_raw = pd.Series(np.random.randint(0, 100, (10, )).astype(np.float32),
                                 index=list('ABCDEFGHIJ'))

    df = from_pandas_df(df_raw)
    series = from_pandas_series(series_raw)

    # when nothing supplied, raise
    with self.assertRaises(ValueError):
        df.fillna()
    # when both values and methods supplied, raises
    with self.assertRaises(ValueError):
        df.fillna(value=1, method='ffill')
    # when call on series, cannot supply DataFrames
    with self.assertRaises(ValueError):
        series.fillna(value=df)
    with self.assertRaises(ValueError):
        series.fillna(value=df_raw)
    # unsupported options must fail loudly
    with self.assertRaises(NotImplementedError):
        series.fillna(value=series_raw, downcast='infer')
    with self.assertRaises(NotImplementedError):
        series.ffill(limit=1)

    # Single chunk: no map/combine staging needed.
    df2 = df.fillna(value_series_raw).tiles()
    self.assertEqual(len(df2.chunks), 1)
    self.assertEqual(df2.chunks[0].shape, df2.shape)
    self.assertIsNone(df2.chunks[0].op.stage)

    series2 = series.fillna(value_series_raw).tiles()
    self.assertEqual(len(series2.chunks), 1)
    self.assertEqual(series2.chunks[0].shape, series2.shape)
    self.assertIsNone(series2.chunks[0].op.stage)

    # Multi-chunk value fill: still stageless (no directional dependency).
    df = from_pandas_df(df_raw, chunk_size=5)
    df2 = df.fillna(value_series_raw).tiles()
    self.assertEqual(len(df2.chunks), 8)
    self.assertEqual(df2.chunks[0].shape, (5, 5))
    self.assertIsNone(df2.chunks[0].op.stage)

    series = from_pandas_series(series_raw, chunk_size=5)
    series2 = series.fillna(value_series_raw).tiles()
    self.assertEqual(len(series2.chunks), 4)
    self.assertEqual(series2.chunks[0].shape, (5, ))
    self.assertIsNone(series2.chunks[0].op.stage)

    # Directional fills across chunks need a combine stage.
    df2 = df.ffill(axis='columns').tiles()
    self.assertEqual(len(df2.chunks), 8)
    self.assertEqual(df2.chunks[0].shape, (5, 5))
    self.assertEqual(df2.chunks[0].op.axis, 1)
    self.assertEqual(df2.chunks[0].op.stage, OperandStage.combine)
    self.assertEqual(df2.chunks[0].op.method, 'ffill')
    self.assertIsNone(df2.chunks[0].op.limit)

    series2 = series.bfill().tiles()
    self.assertEqual(len(series2.chunks), 4)
    self.assertEqual(series2.chunks[0].shape, (5, ))
    self.assertEqual(series2.chunks[0].op.stage, OperandStage.combine)
    self.assertEqual(series2.chunks[0].op.method, 'bfill')
    self.assertIsNone(series2.chunks[0].op.limit)

    # Filling with Mars objects as values.
    value_df = from_pandas_df(value_df_raw, chunk_size=7)
    value_series = from_pandas_series(value_series_raw, chunk_size=7)

    df2 = df.fillna(value_df).tiles()
    self.assertEqual(df2.shape, df.shape)
    self.assertIsNone(df2.chunks[0].op.stage)

    df2 = df.fillna(value_series).tiles()
    self.assertEqual(df2.shape, df.shape)
    self.assertIsNone(df2.chunks[0].op.stage)

    # Re-index the value series so it aligns with the target series' index.
    value_series_raw.index = list(range(10))
    value_series = from_pandas_series(value_series_raw)
    series2 = series.fillna(value_series).tiles()
    self.assertEqual(series2.shape, series.shape)
    self.assertIsNone(series2.chunks[0].op.stage)
def testRechunk(self):
    """Tile-time checks for DataFrame and Series ``rechunk``.

    Verifies chunk counts, per-chunk shapes and index/columns metadata
    after rechunking with RangeIndex, with arbitrary labels, and that a
    no-op rechunk preserves the existing chunk layout.
    """
    # 10x10 frame, 3 -> 4: ceil(10/4) = 3 splits per axis, 9 chunks.
    df = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=3)
    df2 = df.rechunk(4).tiles()

    self.assertEqual(df2.shape, (10, 10))
    self.assertEqual(len(df2.chunks), 9)

    self.assertEqual(df2.chunks[0].shape, (4, 4))
    pd.testing.assert_index_equal(df2.chunks[0].index_value.to_pandas(), pd.RangeIndex(4))
    pd.testing.assert_index_equal(df2.chunks[0].columns_value.to_pandas(), pd.RangeIndex(4))

    # last column block is the 2-wide remainder
    self.assertEqual(df2.chunks[2].shape, (4, 2))
    pd.testing.assert_index_equal(df2.chunks[2].index_value.to_pandas(), pd.RangeIndex(4))
    pd.testing.assert_index_equal(df2.chunks[2].columns_value.to_pandas(), pd.RangeIndex(8, 10))

    # bottom-right corner: remainder on both axes
    self.assertEqual(df2.chunks[-1].shape, (2, 2))
    pd.testing.assert_index_equal(df2.chunks[-1].index_value.to_pandas(), pd.RangeIndex(8, 10))
    pd.testing.assert_index_equal(df2.chunks[-1].columns_value.to_pandas(), pd.RangeIndex(8, 10))

    # Arbitrary (bytes) column labels and random integer index.
    columns = [np.random.bytes(10) for _ in range(10)]
    index = np.random.randint(-100, 100, size=(4, ))
    data = pd.DataFrame(np.random.rand(4, 10), index=index, columns=columns)
    df = from_pandas_df(data, chunk_size=3)
    df2 = df.rechunk(6).tiles()

    self.assertEqual(df2.shape, (4, 10))
    self.assertEqual(len(df2.chunks), 2)

    self.assertEqual(df2.chunks[0].shape, (4, 6))
    pd.testing.assert_index_equal(df2.chunks[0].index_value.to_pandas(),
                                  df.index_value.to_pandas())
    pd.testing.assert_index_equal(df2.chunks[0].columns_value.to_pandas(),
                                  pd.Index(columns[:6]))

    self.assertEqual(df2.chunks[1].shape, (4, 4))
    pd.testing.assert_index_equal(df2.chunks[1].index_value.to_pandas(),
                                  df.index_value.to_pandas())
    pd.testing.assert_index_equal(df2.chunks[1].columns_value.to_pandas(),
                                  pd.Index(columns[6:]))

    # test Series rechunk
    series = from_pandas_series(pd.Series(np.random.rand(10, )), chunk_size=3)
    series2 = series.rechunk(4).tiles()

    self.assertEqual(series2.shape, (10, ))
    self.assertEqual(len(series2.chunks), 3)
    pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10))

    self.assertEqual(series2.chunk_shape, (3, ))
    self.assertEqual(series2.nsplits, ((4, 4, 2), ))

    self.assertEqual(series2.chunks[0].shape, (4, ))
    pd.testing.assert_index_equal(series2.chunks[0].index_value.to_pandas(),
                                  pd.RangeIndex(4))
    self.assertEqual(series2.chunks[1].shape, (4, ))
    pd.testing.assert_index_equal(series2.chunks[1].index_value.to_pandas(),
                                  pd.RangeIndex(4, 8))
    self.assertEqual(series2.chunks[2].shape, (2, ))
    pd.testing.assert_index_equal(series2.chunks[2].index_value.to_pandas(),
                                  pd.RangeIndex(8, 10))

    # Degenerate rechunk to single-element chunks.
    series2 = series.rechunk(1).tiles()

    self.assertEqual(series2.shape, (10, ))
    self.assertEqual(len(series2.chunks), 10)
    pd.testing.assert_index_equal(series2.index_value.to_pandas(), pd.RangeIndex(10))

    self.assertEqual(series2.chunk_shape, (10, ))
    self.assertEqual(series2.nsplits, ((1, ) * 10, ))

    self.assertEqual(series2.chunks[0].shape, (1, ))
    pd.testing.assert_index_equal(series2.chunks[0].index_value.to_pandas(),
                                  pd.RangeIndex(1))

    # no need to rechunk
    series2 = series.rechunk(3).tiles()
    series = get_tiled(series)
    self.assertEqual(series2.chunk_shape, series.chunk_shape)
    self.assertEqual(series2.nsplits, series.nsplits)
def testStringMethodExecution(self):
    """Execution checks for Series ``.str`` methods against pandas.

    Each case runs the Mars expression and compares the concatenated
    result with the equivalent pandas ``.str`` call; data includes NaN
    to exercise missing-value handling.
    """
    s = pd.Series(['s1,s2', 'ef,', 'dd', np.nan])
    s2 = pd.concat([s, s, s])

    series = from_pandas_series(s, chunk_size=2)
    series2 = from_pandas_series(s2, chunk_size=2)

    # test getitem
    r = series.str[:3]
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str[:3]
    pd.testing.assert_series_equal(result, expected)

    # test split, expand=False
    r = series.str.split(',', n=2)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.split(',', n=2)
    pd.testing.assert_series_equal(result, expected)

    # test split, expand=True
    r = series.str.split(',', expand=True, n=1)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.split(',', expand=True, n=1)
    pd.testing.assert_frame_equal(result, expected)

    # test rsplit
    r = series.str.rsplit(',', expand=True, n=1)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.rsplit(',', expand=True, n=1)
    pd.testing.assert_frame_equal(result, expected)

    # test cat all data (scalar result, so plain equality)
    r = series2.str.cat(sep='/', na_rep='e')
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s2.str.cat(sep='/', na_rep='e')
    self.assertEqual(result, expected)

    # test cat list
    r = series.str.cat(['a', 'b', np.nan, 'c'])
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.cat(['a', 'b', np.nan, 'c'])
    pd.testing.assert_series_equal(result, expected)

    # test cat series
    r = series.str.cat(series.str.capitalize(), join='outer')
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.cat(s.str.capitalize(), join='outer')
    pd.testing.assert_series_equal(result, expected)

    # test extractall
    r = series.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
    pd.testing.assert_frame_equal(result, expected)

    # test extract, expand=False
    r = series.str.extract(r'[ab](\d)', expand=False)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.extract(r'[ab](\d)', expand=False)
    pd.testing.assert_series_equal(result, expected)

    # test extract, expand=True
    r = series.str.extract(r'[ab](\d)', expand=True)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = s.str.extract(r'[ab](\d)', expand=True)
    pd.testing.assert_frame_equal(result, expected)
def testTransformExecute(self):
    """Execution checks for ``transform`` against pandas.

    Mirrors ``testTransform`` but actually executes: callable, list and
    (ordered) dict transforms on frames, the ``_call_agg`` variants
    compared against ``DataFrame.agg``, and Series transforms. An
    OrderedDict is used so column order is deterministic across runs.
    """
    cols = [chr(ord('A') + i) for i in range(10)]
    df_raw = pd.DataFrame(dict((c, [i ** 2 for i in range(20)]) for c in cols))
    idx_vals = [chr(ord('A') + i) for i in range(20)]
    s_raw = pd.Series([i ** 2 for i in range(20)], index=idx_vals)

    def rename_fn(f, new_name):
        # give a lambda a distinct __name__ so it is a distinct agg entry
        f.__name__ = new_name
        return f

    old_chunk_store_limit = options.chunk_store_limit
    try:
        # small limit forces multi-chunk outputs
        options.chunk_store_limit = 20

        # DATAFRAME CASES
        df = from_pandas_df(df_raw, chunk_size=5)

        # test transform scenarios on data frames
        r = df.transform(lambda x: list(range(len(x))))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.transform(lambda x: list(range(len(x))))
        pd.testing.assert_frame_equal(result, expected)

        r = df.transform(lambda x: list(range(len(x))), axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.transform(lambda x: list(range(len(x))), axis=1)
        pd.testing.assert_frame_equal(result, expected)

        r = df.transform(['cumsum', 'cummax', lambda x: x + 1])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.transform(['cumsum', 'cummax', lambda x: x + 1])
        pd.testing.assert_frame_equal(result, expected)

        fn_dict = OrderedDict([
            ('A', 'cumsum'),
            ('D', ['cumsum', 'cummax']),
            ('F', lambda x: x + 1),
        ])
        r = df.transform(fn_dict)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.transform(fn_dict)
        pd.testing.assert_frame_equal(result, expected)

        # test agg scenarios on series: compare against pandas agg
        r = df.transform(lambda x: x.iloc[:-1], _call_agg=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.agg(lambda x: x.iloc[:-1])
        pd.testing.assert_frame_equal(result, expected)

        r = df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.agg(lambda x: x.iloc[:-1], axis=1)
        pd.testing.assert_frame_equal(result, expected)

        fn_list = [rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                   lambda x: x.iloc[:-1].reset_index(drop=True)]
        r = df.transform(fn_list, _call_agg=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.agg(fn_list)
        pd.testing.assert_frame_equal(result, expected)

        # scalar-per-column agg collapses to a Series
        r = df.transform(lambda x: x.sum(), _call_agg=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.agg(lambda x: x.sum())
        pd.testing.assert_series_equal(result, expected)

        fn_dict = OrderedDict([
            ('A', rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1')),
            ('D', [rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                   lambda x: x.iloc[:-1].reset_index(drop=True)]),
            ('F', lambda x: x.iloc[:-1].reset_index(drop=True)),
        ])
        r = df.transform(fn_dict, _call_agg=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.agg(fn_dict)
        pd.testing.assert_frame_equal(result, expected)

        # SERIES CASES
        series = from_pandas_series(s_raw, chunk_size=5)

        # test transform scenarios on series
        r = series.transform(lambda x: x + 1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.transform(lambda x: x + 1)
        pd.testing.assert_series_equal(result, expected)

        # list of funcs on a Series yields a DataFrame
        r = series.transform(['cumsum', lambda x: x + 1])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.transform(['cumsum', lambda x: x + 1])
        pd.testing.assert_frame_equal(result, expected)
    finally:
        # always restore the global option even if an assertion fails
        options.chunk_store_limit = old_chunk_store_limit
def testEagerMode(self, *_):
    """Eager-mode execution on a local cluster and over the web session.

    In eager mode tileables execute immediately, so results can be
    compared right after construction; ``fetch`` on a lazily-built
    tensor must still raise until it has been executed. The web-session
    parts additionally check the scheduler-side task count.
    """
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        self.assertIsInstance(Session.default_or_local()._sess,
                              LocalClusterSession)

        # --- eager mode through the local cluster session ---
        with option_context({'eager_mode': True}):
            a_data = np.random.rand(10, 10)
            a = mt.tensor(a_data, chunk_size=3)
            np.testing.assert_array_equal(a, a_data)

            r1 = a + 1
            expected1 = a_data + 1
            np.testing.assert_array_equal(r1, expected1)

            r2 = r1.dot(r1)
            expected2 = expected1.dot(expected1)
            np.testing.assert_array_almost_equal(r2, expected2)

        # outside eager mode, fetching an unexecuted tensor must fail
        a = mt.ones((10, 10), chunk_size=3)
        with self.assertRaises(ValueError):
            a.fetch()

        r = a.dot(a)
        np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

        # --- same checks through a web session ---
        with new_session('http://' + cluster._web_endpoint).as_default():
            self.assertIsInstance(Session.default_or_local()._sess, WebSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

                # one task per eager-executed tileable above
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 3)

            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()

            r = a.dot(a)
            np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

        # --- DataFrame/Series eager mode through a fresh web session ---
        with new_session('http://' + cluster._web_endpoint).as_default():
            from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
            from mars.dataframe.datasource.series import from_pandas as from_pandas_series
            from mars.dataframe.arithmetic import add

            self.assertIsInstance(Session.default_or_local()._sess, WebSession)

            with option_context({'eager_mode': True}):
                data1 = pd.DataFrame(
                    np.random.rand(10, 10),
                    index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                    columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                df1 = from_pandas_df(data1, chunk_size=5)
                pd.testing.assert_frame_equal(df1.fetch(), data1)

                data2 = pd.DataFrame(
                    np.random.rand(10, 10),
                    index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                    columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                df2 = from_pandas_df(data2, chunk_size=6)
                pd.testing.assert_frame_equal(df2.fetch(), data2)

                # unaligned add: eager result must match pandas alignment
                df3 = add(df1, df2)
                pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                s1 = pd.Series(np.random.rand(10),
                               index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                series1 = from_pandas_series(s1)
                pd.testing.assert_series_equal(series1.fetch(), s1)

                # four eager executions above -> four scheduler tasks
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)
def testCutExecution(self):
    """Execution checks for ``cut`` against ``pd.cut``.

    Covers Series/tensor/raw-array inputs, explicit bin edges, tensor
    bins, labels (list / False / tensor), IntervalIndex bins,
    ``retbins``, ``duplicates='drop'``, and — via a distributed-style
    context — integer bin counts, degenerate min==max data, and the
    error raised when the data contains infinity.
    """
    rs = np.random.RandomState(0)
    raw = rs.random(15) * 1000
    s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
    bins = [10, 100, 500]
    ii = pd.interval_range(10, 500, 3)
    labels = ['a', 'b']

    t = tensor(raw, chunk_size=4)
    series = from_pandas_series(s, chunk_size=4)
    iii = from_pandas_index(ii, chunk_size=2)

    # cut on Series
    r = cut(series, bins)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result, pd.cut(s, bins))

    r, b = cut(series, bins, retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    # cut on tensor
    r = cut(t, bins)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins)
    self.assertEqual(len(result), len(expected))
    for r, e in zip(result, expected):
        np.testing.assert_equal(r, e)

    # one chunk
    r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result,
                                   pd.cut(s, bins, right=False, include_lowest=True))

    # test labels
    r = cut(t, bins, labels=labels)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=labels)
    self.assertEqual(len(result), len(expected))
    for r, e in zip(result, expected):
        np.testing.assert_equal(r, e)

    r = cut(t, bins, labels=False)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=False)
    np.testing.assert_array_equal(result, expected)

    # test labels which is tensor
    labels_t = tensor(['a', 'b'], chunk_size=1)
    r = cut(raw, bins, labels=labels_t, include_lowest=True)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=labels, include_lowest=True)
    self.assertEqual(len(result), len(expected))
    for r, e in zip(result, expected):
        np.testing.assert_equal(r, e)

    # test labels=False
    r, b = cut(raw, ii, labels=False, retbins=True)
    # result and expected is array whose dtype is CategoricalDtype
    r_result = self.executor.execute_tileable(r, concat=True)[0]
    b_result = self.executor.execute_tileable(b, concat=True)[0]
    r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True)
    for r, e in zip(r_result, r_expected):
        np.testing.assert_equal(r, e)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test bins which is md.IntervalIndex
    r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_dataframe(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test duplicates
    bins2 = [0, 2, 4, 6, 10, 10]
    r, b = cut(s, bins2, labels=False, retbins=True,
               right=False, duplicates='drop')
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins2, labels=False, retbins=True,
                                    right=False, duplicates='drop')
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    # Integer bin counts need global min/max, so run under a test context.
    ctx, executor = self._create_test_context(self.executor)
    with ctx:
        # test integer bins
        r = cut(series, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s, 3))

        r, b = cut(series, 3, right=False, retbins=True)
        r_result, b_result = executor.execute_dataframes([r, b])
        r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # test min max same
        s2 = pd.Series([1.1] * 15)
        r = cut(s2, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s2, 3))

        # test inf exist
        s3 = s2.copy()
        s3[-1] = np.inf
        with self.assertRaises(ValueError):
            executor.execute_dataframes([cut(s3, 3)])