def test_groupby_params():
    """Assign chunk-derived params to tiled DataFrame and Series groupby objects.

    For each case the first chunk's params are computed from an equivalent
    local groupby (built with ``wrapped_groupby``), copied, stripped of the
    ``index`` entry, and assigned to the tiled groupby.  Only the Series case
    additionally calls ``refresh_params()``.
    """

    def _copy_chunk_params(tiled, raw, **groupby_kwargs):
        """Set params on the first chunk of ``tiled`` and mirror them onto ``tiled``."""
        first_chunk = tiled.chunks[0]
        first_chunk.params = first_chunk.get_params_from_data(
            wrapped_groupby(raw, **groupby_kwargs))
        # Read the params back from the chunk rather than reusing the value
        # just assigned, then drop 'index' if it is present.
        tileable_params = first_chunk.params.copy()
        tileable_params.pop('index', None)
        tiled.params = tileable_params

    # DataFrame grouped by column 'a'.
    frame_data = pd.DataFrame({'a': [1, 2, 3]})
    tiled_frame_groupby = tile(DataFrame(frame_data).groupby('a'))
    _copy_chunk_params(tiled_frame_groupby, frame_data, by='a')

    # Series grouped by its index level.
    series_data = pd.Series([1, 2, 3], name='a')
    tiled_series_groupby = tile(Series(series_data).groupby(level=0))
    _copy_chunk_params(tiled_series_groupby, series_data, level=0)
    tiled_series_groupby.refresh_params()
def testDataSerialize(self):
    """Round-trip assorted objects through ``dataserializer`` and compare.

    Covers dense arrays (C-contiguous, transposed, NumPy scalar, structured),
    file-based dump/load, sparse matrices/vectors (when ``sps`` is available),
    wrapped groupby objects, a categorical and an ``IntervalIndex``.  Array
    and sparse checks run with default options and with LZ4 and GZIP
    compression.
    """
    compress_types = dataserializer.CompressType
    # Keyword sets for each dump call: no explicit ``compress`` argument
    # first, then explicit LZ4 and GZIP.
    dump_options = (
        {},
        {'compress': compress_types.LZ4},
        {'compress': compress_types.GZIP},
    )

    def roundtrip(obj, **dump_kwargs):
        """Serialize ``obj`` to bytes and load it back with ``loads``."""
        return dataserializer.loads(dataserializer.dumps(obj, **dump_kwargs))

    def roundtrip_stream(obj, **dump_kwargs):
        """Serialize ``obj`` to bytes and load it back from a file-like object."""
        return dataserializer.load(
            BytesIO(dataserializer.dumps(obj, **dump_kwargs)))

    def check_groupby(grouped, touch_indices=False):
        """Round-trip a wrapped groupby and compare with the restored object."""
        if touch_indices:
            # Access ``indices`` before dumping (presumably to populate
            # lazily computed state -- confirm against the wrapper class).
            _ = grouped.indices
        restored = roundtrip(grouped)
        assert_groupby_equal(grouped, restored.groupby_obj)

    # Dense array via loads/dumps.
    array = np.random.rand(1000, 100)
    for options in dump_options:
        assert_array_equal(array, roundtrip(array, **options))

    # Dense array loaded from an in-memory stream.
    array = np.random.rand(1000, 100)
    for options in dump_options:
        assert_array_equal(array, roundtrip_stream(array, **options))

    # Non C-contiguous array.
    array = np.random.rand(1000, 100).T
    for options in dump_options:
        assert_array_equal(array, roundtrip(array, **options))

    # NumPy scalar.
    array = np.float64(0.2345)
    for options in dump_options:
        assert_array_equal(array, roundtrip(array, **options))

    # Structured arrays.
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = roundtrip(array)
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    # Dump to / load from a real file, removing it afterwards.
    fn = os.path.join(tempfile.gettempdir(),
                      'test_dump_file_%d.bin' % id(self))
    try:
        array = np.random.rand(1000, 100).T  # non C-contiguous
        for options in dump_options:
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file, **options)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # Sparse matrix and vector.
    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        for options in dump_options:
            des_mat = roundtrip(mat, **options)
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(
            sps.csr_matrix(np.random.rand(2)), shape=(2,))
        for options in dump_options:
            des_vector = roundtrip(vector, **options)
            self.assertTrue(
                (vector.spmatrix != des_vector.spmatrix).nnz == 0)

    # Wrapped groupby objects.
    df1 = pd.DataFrame({
        'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
        'c': list('aabaaddce'),
    })
    check_groupby(wrapped_groupby(df1, 'b'))
    check_groupby(wrapped_groupby(df1, 'b').c)
    check_groupby(wrapped_groupby(df1, 'b'), touch_indices=True)
    check_groupby(wrapped_groupby(df1.b, lambda x: x % 2))
    check_groupby(wrapped_groupby(df1.b, lambda x: x % 2),
                  touch_indices=True)

    # Categorical.
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = roundtrip(cat)
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # IntervalIndex.
    s = pd.interval_range(10, 100, 3)
    dest_s = roundtrip(s)
    pd.testing.assert_index_equal(s, dest_s)
def testDataSerialize(self):
    """Round-trip assorted objects through ``dataserializer`` and compare.

    Dense arrays are checked for every combination of serial type and
    compress type (each also tried as ``None``).  The remaining sections
    cover a non-serializable object (when ``pyarrow`` is available),
    structured arrays, file-based dump/load, sparse data (when ``sps`` is
    available), wrapped groupby objects, categoricals, ``IntervalIndex``,
    complex scalars, a ``ClassToPickle`` instance, negatively strided
    arrays, Arrow-typed frames and ``SparseDtype`` data.
    """

    def roundtrip(obj, **dump_kwargs):
        """Serialize ``obj`` to bytes and load it back with ``loads``."""
        return dataserializer.loads(dataserializer.dumps(obj, **dump_kwargs))

    def check_groupby(grouped, touch_indices=False):
        """Round-trip a wrapped groupby and compare with the restored object."""
        if touch_indices:
            # Access ``indices`` before dumping (presumably to populate
            # lazily computed state -- confirm against the wrapper class).
            _ = grouped.indices
        restored = roundtrip(grouped)
        assert_groupby_equal(grouped, restored.groupby_obj)

    serial_choices = (None,) + tuple(
        dataserializer.SerialType.__members__.values())
    compress_choices = (None,) + tuple(
        dataserializer.CompressType.__members__.values())

    for type_, compress in itertools.product(serial_choices,
                                             compress_choices):
        options = dict(serial_type=type_, compress=compress)

        # Dense array via loads/dumps.
        array = np.random.rand(1000, 100)
        assert_array_equal(array, roundtrip(array, **options))

        # Dense array loaded from an in-memory stream.
        array = np.random.rand(1000, 100)
        stream = BytesIO(dataserializer.dumps(array, **options))
        assert_array_equal(array, dataserializer.load(stream))

        # Non C-contiguous array.
        array = np.random.rand(1000, 100).T
        assert_array_equal(array, roundtrip(array, **options))

        # NumPy scalar.
        array = np.float64(0.2345)
        assert_array_equal(array, roundtrip(array, **options))

    # Non-serializable object: dumping must raise SerializationFailed.
    if pyarrow:
        non_serial = type('non_serial', (object,), dict(nbytes=10))
        with self.assertRaises(SerializationFailed):
            dataserializer.dumps(non_serial())

    # Structured arrays.
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = roundtrip(array)
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    # Keyword sets for the remaining compression checks: no explicit
    # ``compress`` argument first, then explicit LZ4 and GZIP.
    dump_options = (
        {},
        {'compress': dataserializer.CompressType.LZ4},
        {'compress': dataserializer.CompressType.GZIP},
    )

    # Dump to / load from a real file, removing it afterwards.
    fn = os.path.join(tempfile.gettempdir(),
                      f'test_dump_file_{id(self)}.bin')
    try:
        array = np.random.rand(1000, 100).T  # non C-contiguous
        for file_options in dump_options:
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file, **file_options)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # Sparse matrix and vector.
    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        for sparse_options in dump_options:
            des_mat = roundtrip(mat, **sparse_options)
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(
            sps.csr_matrix(np.random.rand(2)), shape=(2,))
        for sparse_options in dump_options:
            des_vector = roundtrip(vector, **sparse_options)
            self.assertTrue(
                (vector.spmatrix != des_vector.spmatrix).nnz == 0)

    # Wrapped groupby objects.
    df1 = pd.DataFrame({
        'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
        'c': list('aabaaddce'),
    })
    check_groupby(wrapped_groupby(df1, 'b'))
    check_groupby(wrapped_groupby(df1, 'b').c)
    check_groupby(wrapped_groupby(df1, 'b'), touch_indices=True)
    check_groupby(wrapped_groupby(df1.b, lambda x: x % 2))
    check_groupby(wrapped_groupby(df1.b, lambda x: x % 2),
                  touch_indices=True)

    # Categorical.
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = roundtrip(cat)
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # IntervalIndex.
    s = pd.interval_range(10, 100, 3)
    dest_s = roundtrip(s)
    pd.testing.assert_index_equal(s, dest_s)

    # Complex scalars: both type and value must survive.
    for s in (complex(10 + 5j), np.complex64(10 + 5j)):
        dest_s = roundtrip(s)
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

    # ClassToPickle instance: same type and same ``a`` attribute.
    d = ClassToPickle(dict(a=1, b='uvw'))
    dest_d = roundtrip(d)
    self.assertIs(type(d), type(dest_d))
    self.assertEqual(d.a, dest_d.a)

    # ndarray with negative strides.
    arr = np.zeros((5, 6, 3))
    arr2 = arr[:, :, ::-1]
    dest_arr2 = roundtrip(arr2)
    np.testing.assert_array_equal(arr2, dest_arr2)

    # DataFrame with Arrow string / list columns.
    df = pd.DataFrame({
        'a': ['s1', 's2', 's3'],
        'b': [['s1', 's2'], ['s3'], ['s4', 's5']],
    })
    df['a'] = df['a'].astype(ArrowStringDtype())
    df['b'] = df['b'].astype(ArrowListDtype(str))
    dest_df = roundtrip(df)
    self.assertIs(type(df), type(dest_df))
    pd.testing.assert_frame_equal(df, dest_df)

    # Series and DataFrame with SparseDtype.
    sparse_dtype = pd.SparseDtype(np.dtype(np.float64), np.nan)
    s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(sparse_dtype)
    dest_s = roundtrip(s)
    pd.testing.assert_series_equal(s, dest_s)

    df = pd.DataFrame({'s': s})
    dest_df = roundtrip(df)
    pd.testing.assert_frame_equal(df, dest_df)
def testGroupByWrapper(self):
    """Rebuild wrapped groupbys via ``to_tuple``/``from_tuple`` and compare.

    Each case converts a wrapped groupby to a tuple, rebuilds it with
    ``GroupByWrapper.from_tuple`` and checks it against the equivalent pandas
    groupby, along with the expected ``shape`` and ``is_frame`` values.
    """
    df = pd.DataFrame(
        {
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8),
        },
        index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]))

    def rebuild(wrapped, **to_tuple_kwargs):
        """Pass ``wrapped`` through ``to_tuple`` and ``from_tuple``."""
        return GroupByWrapper.from_tuple(wrapped.to_tuple(**to_tuple_kwargs))

    def verify(grouped, expected, shape, is_frame, **equal_kwargs):
        """Compare ``grouped`` with ``expected`` and check its shape/kind."""
        assert_groupby_equal(grouped, expected, **equal_kwargs)
        self.assertEqual(grouped.shape, shape)
        if is_frame:
            self.assertTrue(grouped.is_frame)
        else:
            self.assertFalse(grouped.is_frame)

    # Group by index level.
    grouped = rebuild(wrapped_groupby(df, level=0))
    verify(grouped, df.groupby(level=0), (8, 4), True)
    # The wrapper should report a larger size than the bare groupby object.
    self.assertGreater(sys.getsizeof(grouped),
                       sys.getsizeof(grouped.groupby_obj))
    self.assertGreater(calc_data_size(grouped),
                       sys.getsizeof(grouped.groupby_obj))

    verify(rebuild(wrapped_groupby(df, level=0).C),
           df.groupby(level=0).C, (8,), False)

    # Group by a single column.
    verify(rebuild(wrapped_groupby(df, 'B')),
           df.groupby('B'), (8, 4), True)
    verify(rebuild(wrapped_groupby(df, 'B').C, truncate=True),
           df.groupby('B').C, (8,), False, with_selection=True)
    verify(rebuild(wrapped_groupby(df, 'B')[['C', 'D']], truncate=True),
           df.groupby('B')[['C', 'D']], (8, 2), True, with_selection=True)

    # Group by several columns.
    verify(rebuild(wrapped_groupby(df, ['B', 'C']), truncate=True),
           df.groupby(['B', 'C']), (8, 4), True)
    verify(rebuild(wrapped_groupby(df, ['B', 'C']).C, truncate=True),
           df.groupby(['B', 'C']).C, (8,), False, with_selection=True)
    verify(rebuild(wrapped_groupby(df, ['B', 'C'])[['A', 'D']],
                   truncate=True),
           df.groupby(['B', 'C'])[['A', 'D']], (8, 2), True,
           with_selection=True)
    verify(rebuild(wrapped_groupby(df, ['B', 'C'])[['C', 'D']],
                   truncate=True),
           df.groupby(['B', 'C'])[['C', 'D']], (8, 2), True,
           with_selection=True)

    # Group by a function of the index, pickling the function.
    verify(rebuild(wrapped_groupby(df, lambda x: x[-1] % 2),
                   pickle_function=True),
           df.groupby(lambda x: x[-1] % 2), (8, 4), True,
           with_selection=True)
    verify(rebuild(wrapped_groupby(df, lambda x: x[-1] % 2).C,
                   pickle_function=True),
           df.groupby(lambda x: x[-1] % 2).C, (8,), False,
           with_selection=True)
    verify(rebuild(wrapped_groupby(df, lambda x: x[-1] % 2)[['C', 'D']],
                   pickle_function=True),
           df.groupby(lambda x: x[-1] % 2)[['C', 'D']], (8, 2), True,
           with_selection=True)

    # Series grouped by a function, default ``to_tuple`` options.
    verify(rebuild(wrapped_groupby(df.B, lambda x: x[-1] % 2)),
           df.B.groupby(lambda x: x[-1] % 2), (8,), False,
           with_selection=True)
def test_groupby_wrapper():
    """Pickle round-trip wrapped groupbys and compare with pandas groupbys.

    Each case pickles and unpickles a wrapped groupby, then checks it against
    the equivalent pandas groupby along with the expected ``shape`` and
    ``is_frame`` values.
    """
    df = pd.DataFrame(
        {
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8),
        },
        index=pd.MultiIndex.from_tuples([(i // 4, i) for i in range(8)]))

    def pickle_roundtrip(obj):
        """Return ``obj`` after a ``pickle.dumps``/``pickle.loads`` cycle."""
        return pickle.loads(pickle.dumps(obj))

    def verify(grouped, expected, shape, is_frame, **equal_kwargs):
        """Compare ``grouped`` with ``expected`` and check its shape/kind."""
        assert_groupby_equal(grouped, expected, **equal_kwargs)
        assert grouped.shape == shape
        assert grouped.is_frame is is_frame

    # Group by index level.
    grouped = pickle_roundtrip(wrapped_groupby(df, level=0))
    verify(grouped, df.groupby(level=0), (8, 4), True)
    # The wrapper should report a larger size than the bare groupby object.
    assert sys.getsizeof(grouped) > sys.getsizeof(grouped.groupby_obj)
    assert calc_data_size(grouped) > sys.getsizeof(grouped.groupby_obj)

    verify(pickle_roundtrip(wrapped_groupby(df, level=0).C),
           df.groupby(level=0).C, (8,), False)

    # Group by a single column.
    verify(pickle_roundtrip(wrapped_groupby(df, 'B')),
           df.groupby('B'), (8, 4), True)
    verify(pickle_roundtrip(wrapped_groupby(df, 'B').C),
           df.groupby('B').C, (8,), False, with_selection=True)
    verify(pickle_roundtrip(wrapped_groupby(df, 'B')[['C', 'D']]),
           df.groupby('B')[['C', 'D']], (8, 2), True, with_selection=True)

    # Group by several columns.
    verify(pickle_roundtrip(wrapped_groupby(df, ['B', 'C'])),
           df.groupby(['B', 'C']), (8, 4), True)
    verify(pickle_roundtrip(wrapped_groupby(df, ['B', 'C']).C),
           df.groupby(['B', 'C']).C, (8,), False, with_selection=True)
    verify(pickle_roundtrip(wrapped_groupby(df, ['B', 'C'])[['A', 'D']]),
           df.groupby(['B', 'C'])[['A', 'D']], (8, 2), True,
           with_selection=True)
    verify(pickle_roundtrip(wrapped_groupby(df, ['B', 'C'])[['C', 'D']]),
           df.groupby(['B', 'C'])[['C', 'D']], (8, 2), True,
           with_selection=True)

    # Group by a function of the index.
    verify(pickle_roundtrip(wrapped_groupby(df, lambda x: x[-1] % 2)),
           df.groupby(lambda x: x[-1] % 2), (8, 4), True,
           with_selection=True)
    verify(pickle_roundtrip(wrapped_groupby(df, lambda x: x[-1] % 2).C),
           df.groupby(lambda x: x[-1] % 2).C, (8,), False,
           with_selection=True)
    verify(pickle_roundtrip(
               wrapped_groupby(df, lambda x: x[-1] % 2)[['C', 'D']]),
           df.groupby(lambda x: x[-1] % 2)[['C', 'D']], (8, 2), True,
           with_selection=True)

    # Series grouped by a function.
    verify(pickle_roundtrip(wrapped_groupby(df.B, lambda x: x[-1] % 2)),
           df.B.groupby(lambda x: x[-1] % 2), (8,), False,
           with_selection=True)