def testGroupBy(self):
    """Creating groupby objects on DataFrames and Series.

    Covers the default RangeIndex, a string index, a MultiIndex grouped
    by level, and Series grouped through a callable key.
    """
    def _exec(obj):
        # run the mars object and concatenate chunk results into one pandas object
        return self.executor.execute_dataframe(obj, concat=True)[0]

    values = {'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
              'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
              'c': list('aabaaddce')}

    # plain RangeIndex
    raw = pd.DataFrame(values)
    assert_groupby_equal(
        _exec(md.DataFrame(raw, chunk_size=3).groupby('b')),
        raw.groupby('b'))

    # string index
    raw = pd.DataFrame(values, index=['i' + str(i) for i in range(9)])
    assert_groupby_equal(
        _exec(md.DataFrame(raw, chunk_size=3).groupby('b')),
        raw.groupby('b'))

    # MultiIndex, grouped on the first index level
    raw = pd.DataFrame(values, index=pd.MultiIndex.from_tuples(
        [(i % 3, 'i' + str(i)) for i in range(9)]))
    assert_groupby_equal(
        _exec(md.DataFrame(raw, chunk_size=3).groupby(level=0)),
        raw.groupby(level=0))

    # Series grouped by a callable over its positional index
    raw_series = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
    assert_groupby_equal(
        _exec(md.Series(raw_series, chunk_size=3).groupby(lambda x: x % 3)),
        raw_series.groupby(lambda x: x % 3))

    # Series with a string index; the callable parses the numeric suffix
    raw_series = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3],
                           index=['i' + str(i) for i in range(9)])
    assert_groupby_equal(
        _exec(md.Series(raw_series, chunk_size=3).groupby(
            lambda x: int(x[1:]) % 3)),
        raw_series.groupby(lambda x: int(x[1:]) % 3))
def testDataSerialize(self):
    """Round-trip tests for ``dataserializer``.

    Covers dense ndarrays (C- and non-C-contiguous, scalars, structured
    dtypes), file-object based load/dump, sparse matrices/vectors,
    wrapped groupby objects, Categoricals and IntervalIndex — each with
    the default serialization plus LZ4 and GZIP compression.

    The original body repeated the same dumps/loads assertion once per
    compression type; the variants are now driven by loops/helpers.
    """
    compress_types = (dataserializer.CompressType.LZ4,
                      dataserializer.CompressType.GZIP)

    def _check_roundtrip(value):
        # default serialization, then every explicit compression type
        assert_array_equal(value,
                           dataserializer.loads(dataserializer.dumps(value)))
        for compress in compress_types:
            assert_array_equal(value, dataserializer.loads(
                dataserializer.dumps(value, compress=compress)))

    _check_roundtrip(np.random.rand(1000, 100))

    # file-object based deserialization via ``load``
    array = np.random.rand(1000, 100)
    assert_array_equal(
        array, dataserializer.load(BytesIO(dataserializer.dumps(array))))
    for compress in compress_types:
        assert_array_equal(array, dataserializer.load(
            BytesIO(dataserializer.dumps(array, compress=compress))))

    _check_roundtrip(np.random.rand(1000, 100).T)  # test non c-contiguous
    _check_roundtrip(np.float64(0.2345))

    # structured arrays must keep their record dtype
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = dataserializer.loads(dataserializer.dumps(array))
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    # dump/load against a real file on disk
    fn = os.path.join(tempfile.gettempdir(),
                      'test_dump_file_%d.bin' % id(self))
    try:
        array = np.random.rand(1000, 100).T  # test non c-contiguous
        for compress in (None,) + compress_types:
            with open(fn, 'wb') as dump_file:
                if compress is None:
                    dataserializer.dump(array, dump_file)
                else:
                    dataserializer.dump(array, dump_file, compress=compress)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # test sparse (only when scipy is available)
    if sps:
        def _check_sparse(sp_obj):
            # compare nonzero structure after default and compressed trips
            restored = dataserializer.loads(dataserializer.dumps(sp_obj))
            self.assertTrue((sp_obj.spmatrix != restored.spmatrix).nnz == 0)
            for compress in compress_types:
                restored = dataserializer.loads(
                    dataserializer.dumps(sp_obj, compress=compress))
                self.assertTrue(
                    (sp_obj.spmatrix != restored.spmatrix).nnz == 0)

        _check_sparse(
            sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr')))
        _check_sparse(
            sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                shape=(2,)))

    # test groupby: wrapped groupby objects survive serialization
    df1 = pd.DataFrame({
        'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
        'c': list('aabaaddce')
    })

    def _check_groupby(grouped, touch_indices=False):
        if touch_indices:
            # populate the lazily-computed ``indices`` cache before dumping
            getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

    _check_groupby(wrapped_groupby(df1, 'b'))
    _check_groupby(wrapped_groupby(df1, 'b').c)
    _check_groupby(wrapped_groupby(df1, 'b'), touch_indices=True)
    _check_groupby(wrapped_groupby(df1.b, lambda x: x % 2))
    _check_groupby(wrapped_groupby(df1.b, lambda x: x % 2),
                   touch_indices=True)

    # test categorical
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = dataserializer.loads(dataserializer.dumps(cat))
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # test IntervalIndex
    s = pd.interval_range(10, 100, 3)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    pd.testing.assert_index_equal(s, dest_s)
def testGroupByWrapper(self):
    """GroupByWrapper round-trips through ``to_tuple``/``from_tuple``.

    Exercises frame and series groupby variants keyed by index level,
    column name, column list and callable — with and without tuple
    truncation and function pickling.  The original body repeated the
    same four assertions thirteen times; they are folded into ``_check``.
    """
    df = pd.DataFrame(
        {
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        },
        index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]))

    def _check(tup, expected, shape, is_frame, with_selection=False):
        # rebuild the wrapper from its tuple form and compare with pandas
        grouped = GroupByWrapper.from_tuple(tup)
        assert_groupby_equal(grouped, expected, with_selection=with_selection)
        self.assertEqual(grouped.shape, shape)
        self.assertEqual(grouped.is_frame, is_frame)
        return grouped

    grouped = _check(wrapped_groupby(df, level=0).to_tuple(),
                     df.groupby(level=0), (8, 4), True)
    # the wrapper carries extra state, so it must report a larger size
    self.assertGreater(sys.getsizeof(grouped),
                       sys.getsizeof(grouped.groupby_obj))
    self.assertGreater(calc_data_size(grouped),
                       sys.getsizeof(grouped.groupby_obj))

    _check(wrapped_groupby(df, level=0).C.to_tuple(),
           df.groupby(level=0).C, (8, ), False)
    _check(wrapped_groupby(df, 'B').to_tuple(),
           df.groupby('B'), (8, 4), True)
    _check(wrapped_groupby(df, 'B').C.to_tuple(truncate=True),
           df.groupby('B').C, (8, ), False, with_selection=True)
    _check(wrapped_groupby(df, 'B')[['C', 'D']].to_tuple(truncate=True),
           df.groupby('B')[['C', 'D']], (8, 2), True, with_selection=True)
    _check(wrapped_groupby(df, ['B', 'C']).to_tuple(truncate=True),
           df.groupby(['B', 'C']), (8, 4), True)
    _check(wrapped_groupby(df, ['B', 'C']).C.to_tuple(truncate=True),
           df.groupby(['B', 'C']).C, (8, ), False, with_selection=True)
    _check(wrapped_groupby(df, ['B', 'C'])[['A', 'D']].to_tuple(truncate=True),
           df.groupby(['B', 'C'])[['A', 'D']], (8, 2), True,
           with_selection=True)
    _check(wrapped_groupby(df, ['B', 'C'])[['C', 'D']].to_tuple(truncate=True),
           df.groupby(['B', 'C'])[['C', 'D']], (8, 2), True,
           with_selection=True)
    # callable keys need the function pickled into the tuple form
    _check(wrapped_groupby(
               df, lambda x: x[-1] % 2).to_tuple(pickle_function=True),
           df.groupby(lambda x: x[-1] % 2), (8, 4), True,
           with_selection=True)
    _check(wrapped_groupby(
               df, lambda x: x[-1] % 2).C.to_tuple(pickle_function=True),
           df.groupby(lambda x: x[-1] % 2).C, (8, ), False,
           with_selection=True)
    _check(wrapped_groupby(
               df, lambda x: x[-1] % 2)[['C', 'D']].to_tuple(
                   pickle_function=True),
           df.groupby(lambda x: x[-1] % 2)[['C', 'D']], (8, 2), True,
           with_selection=True)
    _check(wrapped_groupby(df.B, lambda x: x[-1] % 2).to_tuple(),
           df.B.groupby(lambda x: x[-1] % 2), (8, ), False,
           with_selection=True)
def test_groupby(setup):
    """Check groupby construction against pandas for frames and series."""
    rand = np.random.RandomState(0)
    n_rows = 100
    raw_data = {'a': rand.randint(0, 10, size=(n_rows,)),
                'b': rand.randint(0, 10, size=(n_rows,)),
                'c': rand.choice(list('abcd'), size=(n_rows,))}

    # DataFrame with the default RangeIndex
    raw_frame = pd.DataFrame(raw_data)
    mframe = md.DataFrame(raw_frame, chunk_size=13)
    assert_groupby_equal(mframe.groupby('b').execute().fetch(),
                         raw_frame.groupby('b'))

    # string index containing duplications
    dup_frame = pd.DataFrame(
        raw_data, index=['i' + str(i % 3) for i in range(n_rows)])
    mframe = md.DataFrame(dup_frame, chunk_size=13)
    assert_groupby_equal(mframe.groupby('b').execute().fetch(),
                         dup_frame.groupby('b'))

    # group a DataFrame by one of its own series
    assert_groupby_equal(mframe.groupby(mframe['b']).execute().fetch(),
                         dup_frame.groupby(dup_frame['b']))

    # group a DataFrame by several series at once
    assert_groupby_equal(
        mframe.groupby(by=[mframe['b'], mframe['c']]).execute().fetch(),
        dup_frame.groupby(by=[dup_frame['b'], dup_frame['c']]))

    # MultiIndex frame grouped by index level
    mi_frame = pd.DataFrame(raw_data, index=pd.MultiIndex.from_tuples(
        [(i % 3, 'i' + str(i)) for i in range(n_rows)]))
    mframe = md.DataFrame(mi_frame, chunk_size=13)
    assert_groupby_equal(mframe.groupby(level=0).execute().fetch(),
                         mi_frame.groupby(level=0))

    # frame with integer column labels
    int_frame = pd.DataFrame(list(raw_data.values())).T
    mframe = md.DataFrame(int_frame, chunk_size=13)
    assert_groupby_equal(mframe.groupby(0).execute().fetch(),
                         int_frame.groupby(0))

    # series grouped by a callable on its positional index
    raw_series = pd.Series(raw_data['a'])
    mseries = md.Series(raw_series, chunk_size=13)
    assert_groupby_equal(mseries.groupby(lambda x: x % 3).execute().fetch(),
                         raw_series.groupby(lambda x: x % 3))

    # series grouped by itself
    assert_groupby_equal(mseries.groupby(mseries).execute().fetch(),
                         raw_series.groupby(raw_series))

    # string-indexed series grouped by a callable over the labels
    idx_series = pd.Series(raw_data['a'],
                           index=['i' + str(i) for i in range(n_rows)])
    mseries = md.Series(idx_series, chunk_size=13)
    assert_groupby_equal(
        mseries.groupby(lambda x: int(x[1:]) % 3).execute().fetch(),
        idx_series.groupby(lambda x: int(x[1:]) % 3))
def test_groupby_getitem(setup):
    """Selection (``[...]`` / attribute access) on groupby objects,
    followed by aggregation, apply, transform and cumulative operations,
    compared against the equivalent pandas results.
    """
    rs = np.random.RandomState(0)
    data_size = 100
    raw = pd.DataFrame({'a': rs.randint(0, 10, size=(data_size,)),
                        'b': rs.randint(0, 10, size=(data_size,)),
                        'c': rs.choice(list('abcd'), size=(data_size,))},
                       index=pd.MultiIndex.from_tuples(
                           [(i % 3, 'i' + str(i))
                            for i in range(data_size)]))
    mdf = md.DataFrame(raw, chunk_size=13)

    # multi-column selection on a level-keyed groupby
    r = mdf.groupby(level=0)[['a', 'b']]
    assert_groupby_equal(r.execute().fetch(),
                         raw.groupby(level=0)[['a', 'b']],
                         with_selection=True)

    # aggregation with both distributed execution strategies
    for method in ('tree', 'shuffle'):
        r = mdf.groupby(level=0)[['a', 'b']].sum(method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_index(),
            raw.groupby(level=0)[['a', 'b']].sum().sort_index())

    r = mdf.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1)
    pd.testing.assert_frame_equal(
        r.execute().fetch().sort_index(),
        raw.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1).sort_index())

    # multi-column selection on a column-keyed groupby; the second case
    # selects a column set that excludes the grouping key
    r = mdf.groupby('b')[['a', 'b']]
    assert_groupby_equal(r.execute().fetch(),
                         raw.groupby('b')[['a', 'b']],
                         with_selection=True)

    r = mdf.groupby('b')[['a', 'c']]
    assert_groupby_equal(r.execute().fetch(),
                         raw.groupby('b')[['a', 'c']],
                         with_selection=True)

    for method in ('tree', 'shuffle'):
        r = mdf.groupby('b')[['a', 'b']].sum(method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_index(),
            raw.groupby('b')[['a', 'b']].sum().sort_index())

        r = mdf.groupby('b')[['a', 'b']].agg(['sum', 'count'], method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_index(),
            raw.groupby('b')[['a', 'b']].agg(['sum', 'count']).sort_index())

        r = mdf.groupby('b')[['a', 'c']].agg(['sum', 'count'], method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_index(),
            raw.groupby('b')[['a', 'c']].agg(['sum', 'count']).sort_index())

    r = mdf.groupby('b')[['a', 'b']].apply(lambda x: x + 1)
    pd.testing.assert_frame_equal(
        r.execute().fetch().sort_index(),
        raw.groupby('b')[['a', 'b']].apply(lambda x: x + 1).sort_index())

    r = mdf.groupby('b')[['a', 'b']].transform(lambda x: x + 1)
    pd.testing.assert_frame_equal(
        r.execute().fetch().sort_index(),
        raw.groupby('b')[['a', 'b']].transform(lambda x: x + 1).sort_index())

    r = mdf.groupby('b')[['a', 'b']].cumsum()
    pd.testing.assert_frame_equal(
        r.execute().fetch().sort_index(),
        raw.groupby('b')[['a', 'b']].cumsum().sort_index())

    # attribute-style single-column selection
    r = mdf.groupby('b').a
    assert_groupby_equal(r.execute().fetch(),
                         raw.groupby('b').a,
                         with_selection=True)

    for method in ('shuffle', 'tree'):
        r = mdf.groupby('b').a.sum(method=method)
        pd.testing.assert_series_equal(
            r.execute().fetch().sort_index(),
            raw.groupby('b').a.sum().sort_index())

        r = mdf.groupby('b').a.agg(['sum', 'mean', 'var'], method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_index(),
            raw.groupby('b').a.agg(['sum', 'mean', 'var']).sort_index())

        # with as_index=False the key becomes a plain column, so results
        # are compared after sorting by that column
        r = mdf.groupby('b', as_index=False).a.sum(method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_values('b', ignore_index=True),
            raw.groupby('b', as_index=False).a.sum().sort_values(
                'b', ignore_index=True))

        r = mdf.groupby('b', as_index=False).b.count(method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_values('b', ignore_index=True),
            raw.groupby('b', as_index=False).b.count().sort_values(
                'b', ignore_index=True))

        r = mdf.groupby('b', as_index=False).b.agg({'cnt': 'count'},
                                                   method=method)
        pd.testing.assert_frame_equal(
            r.execute().fetch().sort_values('b', ignore_index=True),
            raw.groupby('b', as_index=False).b.agg(
                {'cnt': 'count'}).sort_values('b', ignore_index=True))

    r = mdf.groupby('b').a.apply(lambda x: x + 1)
    pd.testing.assert_series_equal(
        r.execute().fetch().sort_index(),
        raw.groupby('b').a.apply(lambda x: x + 1).sort_index())

    r = mdf.groupby('b').a.transform(lambda x: x + 1)
    pd.testing.assert_series_equal(
        r.execute().fetch().sort_index(),
        raw.groupby('b').a.transform(lambda x: x + 1).sort_index())

    r = mdf.groupby('b').a.cumsum()
    pd.testing.assert_series_equal(
        r.execute().fetch().sort_index(),
        raw.groupby('b').a.cumsum().sort_index())

    # special test for selection key == 0
    raw = pd.DataFrame(rs.rand(data_size, 10))
    raw[0] = 0
    mdf = md.DataFrame(raw, chunk_size=13)
    r = mdf.groupby(0, as_index=False)[0].agg({'cnt': 'count'}, method='tree')
    pd.testing.assert_frame_equal(
        r.execute().fetch().sort_index(),
        raw.groupby(0, as_index=False)[0].agg({'cnt': 'count'}))
def testGroupByGetItem(self):
    """Selection (``[...]`` / attribute access) on groupby objects,
    followed by aggregation, apply, transform and cumulative operations,
    compared against the equivalent pandas results.
    """
    df1 = pd.DataFrame(
        {
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        },
        index=pd.MultiIndex.from_tuples([(i % 3, 'i' + str(i))
                                         for i in range(9)]))
    mdf = md.DataFrame(df1, chunk_size=3)

    # multi-column selection on a level-keyed groupby
    r = mdf.groupby(level=0)[['a', 'b']]
    assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                         df1.groupby(level=0)[['a', 'b']],
                         with_selection=True)

    r = mdf.groupby(level=0)[['a', 'b']].sum(method='tree')
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0],
        df1.groupby(level=0)[['a', 'b']].sum())

    r = mdf.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1)
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
        df1.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1).sort_index())

    # multi-column selection on a column-keyed groupby; the second case
    # selects a column set that excludes the grouping key
    r = mdf.groupby('b')[['a', 'b']]
    assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                         df1.groupby('b')[['a', 'b']],
                         with_selection=True)

    r = mdf.groupby('b')[['a', 'c']]
    assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                         df1.groupby('b')[['a', 'c']],
                         with_selection=True)

    r = mdf.groupby('b')[['a', 'b']].sum(method='tree')
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0],
        df1.groupby('b')[['a', 'b']].sum())

    r = mdf.groupby('b')[['a', 'b']].agg(['sum', 'count'], method='tree')
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0],
        df1.groupby('b')[['a', 'b']].agg(['sum', 'count']))

    r = mdf.groupby('b')[['a', 'c']].agg(['sum', 'count'], method='tree')
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0],
        df1.groupby('b')[['a', 'c']].agg(['sum', 'count']))

    r = mdf.groupby('b')[['a', 'b']].apply(lambda x: x + 1)
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
        df1.groupby('b')[['a', 'b']].apply(lambda x: x + 1).sort_index())

    r = mdf.groupby('b')[['a', 'b']].transform(lambda x: x + 1)
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
        df1.groupby('b')[['a', 'b']].transform(lambda x: x + 1).sort_index())

    r = mdf.groupby('b')[['a', 'b']].cumsum()
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
        df1.groupby('b')[['a', 'b']].cumsum().sort_index())

    # attribute-style single-column selection
    r = mdf.groupby('b').a
    assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                         df1.groupby('b').a,
                         with_selection=True)

    r = mdf.groupby('b').a.sum(method='tree')
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(r, concat=True)[0],
        df1.groupby('b').a.sum())

    r = mdf.groupby('b').a.agg(['sum', 'mean', 'var'], method='tree')
    pd.testing.assert_frame_equal(
        self.executor.execute_dataframe(r, concat=True)[0],
        df1.groupby('b').a.agg(['sum', 'mean', 'var']))

    r = mdf.groupby('b').a.apply(lambda x: x + 1)
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
        df1.groupby('b').a.apply(lambda x: x + 1).sort_index())

    r = mdf.groupby('b').a.transform(lambda x: x + 1)
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
        df1.groupby('b').a.transform(lambda x: x + 1).sort_index())

    r = mdf.groupby('b').a.cumsum()
    pd.testing.assert_series_equal(
        self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
        df1.groupby('b').a.cumsum().sort_index())
def testDataSerialize(self):
    """Round-trip ``dataserializer`` over every supported payload type.

    Covers dense ndarrays under every serial-type/compression combination,
    non-serializable objects, structured dtypes, on-disk dump/load, sparse
    data, wrapped groupby objects, Categoricals, IntervalIndex, complex
    scalars, pickle-fallback objects, negative-stride arrays, pyarrow
    extension dtypes and SparseDtype frames.

    Remaining per-compression duplication (dump-file, sparse, complex
    sections) is folded into loops/helpers; redundant double parentheses
    around ``dumps`` calls are removed.
    """
    # every serial type and compression combination, including defaults
    for type_, compress in itertools.product(
            (None,) + tuple(dataserializer.SerialType.__members__.values()),
            (None,) + tuple(dataserializer.CompressType.__members__.values())):
        for array in (np.random.rand(1000, 100),
                      np.random.rand(1000, 100).T,  # test non c-contiguous
                      np.float64(0.2345)):
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_,
                                     compress=compress)))

        # file-like deserialization path
        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.load(
            BytesIO(dataserializer.dumps(array, serial_type=type_,
                                         compress=compress))))

    # test non-serializable object: must raise explicitly, not crash later
    if pyarrow:
        non_serial = type('non_serial', (object,), dict(nbytes=10))
        with self.assertRaises(SerializationFailed):
            dataserializer.dumps(non_serial())

    # structured arrays must keep their record dtype
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = dataserializer.loads(dataserializer.dumps(array))
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    # dump/load against a real file on disk
    fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
    try:
        array = np.random.rand(1000, 100).T  # test non c-contiguous
        for compress in (None,
                         dataserializer.CompressType.LZ4,
                         dataserializer.CompressType.GZIP):
            with open(fn, 'wb') as dump_file:
                if compress is None:
                    dataserializer.dump(array, dump_file)
                else:
                    dataserializer.dump(array, dump_file, compress=compress)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # test sparse (only when scipy is available)
    if sps:
        def _check_sparse(sp_obj):
            # compare nonzero structure after default and compressed trips
            restored = dataserializer.loads(dataserializer.dumps(sp_obj))
            self.assertTrue((sp_obj.spmatrix != restored.spmatrix).nnz == 0)
            for compress in (dataserializer.CompressType.LZ4,
                             dataserializer.CompressType.GZIP):
                restored = dataserializer.loads(
                    dataserializer.dumps(sp_obj, compress=compress))
                self.assertTrue(
                    (sp_obj.spmatrix != restored.spmatrix).nnz == 0)

        _check_sparse(
            sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr')))
        _check_sparse(
            sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                shape=(2,)))

    # test groupby: wrapped groupby objects survive serialization
    df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                        'c': list('aabaaddce')})

    def _check_groupby(grouped, touch_indices=False):
        if touch_indices:
            # populate the lazily-computed ``indices`` cache before dumping
            getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

    _check_groupby(wrapped_groupby(df1, 'b'))
    _check_groupby(wrapped_groupby(df1, 'b').c)
    _check_groupby(wrapped_groupby(df1, 'b'), touch_indices=True)
    _check_groupby(wrapped_groupby(df1.b, lambda x: x % 2))
    _check_groupby(wrapped_groupby(df1.b, lambda x: x % 2),
                   touch_indices=True)

    # test categorical
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = dataserializer.loads(dataserializer.dumps(cat))
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # test IntervalIndex
    s = pd.interval_range(10, 100, 3)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    pd.testing.assert_index_equal(s, dest_s)

    # test complex scalars, both builtin and numpy flavors
    for s in (complex(10 + 5j), np.complex64(10 + 5j)):
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

    # test pickle fallback for plain Python objects
    d = ClassToPickle(dict(a=1, b='uvw'))
    dest_d = dataserializer.loads(dataserializer.dumps(d))
    self.assertIs(type(d), type(dest_d))
    self.assertEqual(d.a, dest_d.a)

    # test ndarray with negative strides
    arr = np.zeros((5, 6, 3))
    arr2 = arr[:, :, ::-1]
    dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
    np.testing.assert_array_equal(arr2, dest_arr2)

    # test ArrowArray: frames backed by pyarrow extension dtypes
    df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                       'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
    df['a'] = df['a'].astype(ArrowStringDtype())
    df['b'] = df['b'].astype(ArrowListDtype(str))
    dest_df = dataserializer.loads(dataserializer.dumps(df))
    self.assertIs(type(df), type(dest_df))
    pd.testing.assert_frame_equal(df, dest_df)

    # test Series / DataFrame with SparseDtype
    s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
        pd.SparseDtype(np.dtype(np.float64), np.nan))
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    pd.testing.assert_series_equal(s, dest_s)
    df = pd.DataFrame({'s': s})
    dest_df = dataserializer.loads(dataserializer.dumps(df))
    pd.testing.assert_frame_equal(df, dest_df)
def testGroupBy(self):
    """Creating groupby objects on DataFrames and Series.

    Covers the default RangeIndex, a string index, grouping by one or
    several series, a MultiIndex grouped by level, integer column labels,
    and Series grouped by a callable or by another series.
    """
    rs = np.random.RandomState(0)
    data_size = 100
    data_dict = {
        'a': rs.randint(0, 10, size=(data_size, )),
        'b': rs.randint(0, 10, size=(data_size, )),
        'c': rs.choice(list('abcd'), size=(data_size, ))
    }

    # DataFrame with the default RangeIndex
    df1 = pd.DataFrame(data_dict)
    mdf = md.DataFrame(df1, chunk_size=13)
    grouped = mdf.groupby('b')
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        df1.groupby('b'))

    # DataFrame with a string index
    df2 = pd.DataFrame(data_dict,
                       index=['i' + str(i) for i in range(data_size)])
    mdf = md.DataFrame(df2, chunk_size=13)
    grouped = mdf.groupby('b')
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        df2.groupby('b'))

    # test groupby series
    grouped = mdf.groupby(mdf['b'])
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        df2.groupby(df2['b']))

    # test groupby multiple series
    grouped = mdf.groupby(by=[mdf['b'], mdf['c']])
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        df2.groupby(by=[df2['b'], df2['c']]))

    # MultiIndex DataFrame grouped by index level
    df3 = pd.DataFrame(data_dict,
                       index=pd.MultiIndex.from_tuples([
                           (i % 3, 'i' + str(i)) for i in range(data_size)
                       ]))
    mdf = md.DataFrame(df3, chunk_size=13)
    grouped = mdf.groupby(level=0)
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        df3.groupby(level=0))

    # test groupby with integer columns
    df4 = pd.DataFrame(list(data_dict.values())).T
    mdf = md.DataFrame(df4, chunk_size=13)
    grouped = mdf.groupby(0)
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        df4.groupby(0))

    # series grouped by a callable over its positional index
    series1 = pd.Series(data_dict['a'])
    ms1 = md.Series(series1, chunk_size=13)
    grouped = ms1.groupby(lambda x: x % 3)
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        series1.groupby(lambda x: x % 3))

    # test groupby series
    grouped = ms1.groupby(ms1)
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        series1.groupby(series1))

    # string-indexed series grouped by a callable over the labels
    series2 = pd.Series(data_dict['a'],
                        index=['i' + str(i) for i in range(data_size)])
    ms2 = md.Series(series2, chunk_size=13)
    grouped = ms2.groupby(lambda x: int(x[1:]) % 3)
    assert_groupby_equal(
        self.executor.execute_dataframe(grouped, concat=True)[0],
        series2.groupby(lambda x: int(x[1:]) % 3))
def test_groupby_wrapper():
    """Wrapped groupby objects survive a pickle round-trip.

    Each variant is pickled/unpickled and compared with the equivalent
    pandas groupby, also checking ``shape`` and ``is_frame``.  Fixes the
    PEP 8 E731 violation (lambda bound to ``conv_func``) and folds the
    thirteen-fold repeated assertion pattern into ``_check``.
    """
    df = pd.DataFrame(
        {
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        },
        index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]))

    def _pickle_roundtrip(obj):
        # PEP 8: a def instead of a name-bound lambda
        return pickle.loads(pickle.dumps(obj))

    def _check(wrapped, expected, shape, is_frame, with_selection=False):
        grouped = _pickle_roundtrip(wrapped)
        assert_groupby_equal(grouped, expected, with_selection=with_selection)
        assert grouped.shape == shape
        assert grouped.is_frame is is_frame
        return grouped

    grouped = _check(wrapped_groupby(df, level=0),
                     df.groupby(level=0), (8, 4), True)
    # the wrapper carries extra state, so it must report a larger size
    assert sys.getsizeof(grouped) > sys.getsizeof(grouped.groupby_obj)
    assert calc_data_size(grouped) > sys.getsizeof(grouped.groupby_obj)

    _check(wrapped_groupby(df, level=0).C,
           df.groupby(level=0).C, (8, ), False)
    _check(wrapped_groupby(df, 'B'),
           df.groupby('B'), (8, 4), True)
    _check(wrapped_groupby(df, 'B').C,
           df.groupby('B').C, (8, ), False, with_selection=True)
    _check(wrapped_groupby(df, 'B')[['C', 'D']],
           df.groupby('B')[['C', 'D']], (8, 2), True, with_selection=True)
    _check(wrapped_groupby(df, ['B', 'C']),
           df.groupby(['B', 'C']), (8, 4), True)
    _check(wrapped_groupby(df, ['B', 'C']).C,
           df.groupby(['B', 'C']).C, (8, ), False, with_selection=True)
    _check(wrapped_groupby(df, ['B', 'C'])[['A', 'D']],
           df.groupby(['B', 'C'])[['A', 'D']], (8, 2), True,
           with_selection=True)
    _check(wrapped_groupby(df, ['B', 'C'])[['C', 'D']],
           df.groupby(['B', 'C'])[['C', 'D']], (8, 2), True,
           with_selection=True)
    _check(wrapped_groupby(df, lambda x: x[-1] % 2),
           df.groupby(lambda x: x[-1] % 2), (8, 4), True,
           with_selection=True)
    _check(wrapped_groupby(df, lambda x: x[-1] % 2).C,
           df.groupby(lambda x: x[-1] % 2).C, (8, ), False,
           with_selection=True)
    _check(wrapped_groupby(df, lambda x: x[-1] % 2)[['C', 'D']],
           df.groupby(lambda x: x[-1] % 2)[['C', 'D']], (8, 2), True,
           with_selection=True)
    _check(wrapped_groupby(df.B, lambda x: x[-1] % 2),
           df.B.groupby(lambda x: x[-1] % 2), (8, ), False,
           with_selection=True)