Example #1
0
def test_groupby_params():
    """Smoke-test chunk/tileable params handling for tiled groupby objects.

    For a DataFrame groupby and a Series groupby, the params of the first
    chunk are derived from an equivalent wrapped pandas groupby, and a copy
    without the ``'index'`` entry is assigned to the tiled object. There are
    no explicit assertions: the test passes when none of the calls raise.
    """
    # DataFrame grouped by a column.
    frame_data = pd.DataFrame({'a': [1, 2, 3]})
    tiled = tile(DataFrame(frame_data).groupby('a'))
    first_chunk = tiled.chunks[0]

    first_chunk.params = first_chunk.get_params_from_data(
        wrapped_groupby(frame_data, by='a'))
    tileable_params = first_chunk.params.copy()
    tileable_params.pop('index', None)
    tiled.params = tileable_params

    # Series grouped by its index level.
    series_data = pd.Series([1, 2, 3], name='a')
    tiled = tile(Series(series_data).groupby(level=0))
    first_chunk = tiled.chunks[0]

    first_chunk.params = first_chunk.get_params_from_data(
        wrapped_groupby(series_data, level=0))
    tileable_params = first_chunk.params.copy()
    tileable_params.pop('index', None)
    tiled.params = tileable_params
    # Only the Series case additionally calls refresh_params().
    tiled.refresh_params()
Example #2
0
    def testDataSerialize(self):
        """Round-trip a range of objects through ``dataserializer``.

        Covers dense ndarrays (freshly generated, transposed, and a NumPy
        float scalar), a structured array, file-based ``dump``/``load``,
        sparse matrices and vectors (only when ``sps`` is truthy), wrapped
        groupby objects, a ``pd.Categorical`` and a ``pd.IntervalIndex``.
        Arrays, files and sparse objects are each checked with the
        serializer's default settings, with LZ4 and with GZIP.
        """
        # Keyword sets passed to ``dumps``/``dump``. The first one is empty so
        # that the serializer's own default is exercised instead of a value
        # chosen here.
        compress_options = (
            {},
            {'compress': dataserializer.CompressType.LZ4},
            {'compress': dataserializer.CompressType.GZIP},
        )

        array = np.random.rand(1000, 100)
        for options in compress_options:
            assert_array_equal(
                array,
                dataserializer.loads(dataserializer.dumps(array, **options)))

        # Same payload shape, but read back through the file-like ``load``.
        array = np.random.rand(1000, 100)
        for options in compress_options:
            assert_array_equal(
                array,
                dataserializer.load(
                    BytesIO(dataserializer.dumps(array, **options))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        for options in compress_options:
            assert_array_equal(
                array,
                dataserializer.loads(dataserializer.dumps(array, **options)))

        array = np.float64(0.2345)
        for options in compress_options:
            assert_array_equal(
                array,
                dataserializer.loads(dataserializer.dumps(array, **options)))

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100, ), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        # mkstemp creates a uniquely named file atomically. A name derived
        # from id(self) is predictable and can collide when several test
        # processes share the same temp directory.
        fd, fn = tempfile.mkstemp(prefix='test_dump_file_', suffix='.bin')
        os.close(fd)  # only the path is needed; the file is reopened below
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            for options in compress_options:
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file, **options)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            for options in compress_options:
                des_mat = dataserializer.loads(
                    dataserializer.dumps(mat, **options))
                # An element-wise ``!=`` with no stored entries means the
                # two sparse matrices hold identical values.
                self.assertEqual((mat.spmatrix != des_mat.spmatrix).nnz, 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                         shape=(2, ))
            for options in compress_options:
                des_vector = dataserializer.loads(
                    dataserializer.dumps(vector, **options))
                self.assertEqual(
                    (vector.spmatrix != des_vector.spmatrix).nnz, 0)

        # test groupby
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        })
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        # Access ``indices`` before serializing -- presumably to cover the
        # case where the groupby has already computed state (TODO confirm).
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_index_equal(s, dest_s)
Example #3
0
    def testDataSerialize(self):
        """Round-trip a range of objects through ``dataserializer``.

        Dense arrays are checked for every combination of serial type and
        compress type (including ``None`` for each). The remaining sections
        cover a non-serializable object (only when ``pyarrow`` is truthy),
        a structured array, file-based ``dump``/``load``, sparse matrices and
        vectors (only when ``sps`` is truthy), wrapped groupby objects,
        ``pd.Categorical``, ``pd.IntervalIndex``, complex scalars, a pickled
        ``ClassToPickle`` instance, a negatively strided ndarray, Arrow-typed
        DataFrame columns and pandas objects with ``SparseDtype``.
        """
        serial_types = \
            (None,) + tuple(dataserializer.SerialType.__members__.values())
        compress_types = \
            (None,) + tuple(dataserializer.CompressType.__members__.values())
        for type_, compress in itertools.product(serial_types, compress_types):
            options = dict(serial_type=type_, compress=compress)

            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, **options)))

            # Same kind of payload, read back through the file-like ``load``.
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.load(
                BytesIO(dataserializer.dumps(array, **options))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, **options)))

            array = np.float64(0.2345)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, **options)))

        # test non-serializable object
        if pyarrow:
            non_serial = type('non_serial', (object,), dict(nbytes=10))
            with self.assertRaises(SerializationFailed):
                dataserializer.dumps(non_serial())

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        # Keyword sets for the file and sparse sections below. The first one
        # is empty so the serializer's own default is exercised.
        compress_options = (
            {},
            {'compress': dataserializer.CompressType.LZ4},
            {'compress': dataserializer.CompressType.GZIP},
        )

        # mkstemp creates a uniquely named file atomically. A name derived
        # from id(self) is predictable and can collide when several test
        # processes share the same temp directory.
        fd, fn = tempfile.mkstemp(prefix='test_dump_file_', suffix='.bin')
        os.close(fd)  # only the path is needed; the file is reopened below
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            for options in compress_options:
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file, **options)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            for options in compress_options:
                des_mat = dataserializer.loads(
                    dataserializer.dumps(mat, **options))
                # An element-wise ``!=`` with no stored entries means the
                # two sparse matrices hold identical values.
                self.assertEqual((mat.spmatrix != des_mat.spmatrix).nnz, 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
            for options in compress_options:
                des_vector = dataserializer.loads(
                    dataserializer.dumps(vector, **options))
                self.assertEqual(
                    (vector.spmatrix != des_vector.spmatrix).nnz, 0)

        # test groupby
        df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce')})
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        # Access ``indices`` before serializing -- presumably to cover the
        # case where the groupby has already computed state (TODO confirm).
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_index_equal(s, dest_s)

        # test complex: both the builtin and the NumPy scalar must come back
        # with the same type and value.
        for s in (complex(10 + 5j), np.complex64(10 + 5j)):
            dest_s = dataserializer.loads(dataserializer.dumps(s))
            self.assertIs(type(s), type(dest_s))
            self.assertEqual(s, dest_s)

        # test pickle
        d = ClassToPickle(dict(a=1, b='uvw'))
        dest_d = dataserializer.loads(dataserializer.dumps(d))
        self.assertIs(type(d), type(dest_d))
        self.assertEqual(d.a, dest_d.a)

        # test ndarray with negative strides
        arr = np.zeros((5, 6, 3))
        arr2 = arr[:, :, ::-1]
        dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
        np.testing.assert_array_equal(arr2, dest_arr2)

        # test ArrowArray
        df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                           'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
        df['a'] = df['a'].astype(ArrowStringDtype())
        df['b'] = df['b'].astype(ArrowListDtype(str))
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        self.assertIs(type(df), type(dest_df))
        pd.testing.assert_frame_equal(df, dest_df)

        # test DataFrame with SparseDtype
        s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
            pd.SparseDtype(np.dtype(np.float64), np.nan))
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_series_equal(s, dest_s)
        df = pd.DataFrame({'s': s})
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        pd.testing.assert_frame_equal(df, dest_df)
Example #4
0
    def testGroupByWrapper(self):
        """Check ``GroupByWrapper`` across a ``to_tuple``/``from_tuple`` round trip.

        Each case wraps a groupby over the same frame, rebuilds it from its
        tuple form, compares it with the plain pandas groupby and checks the
        reported ``shape`` and ``is_frame``. The first case also checks that
        the wrapper's reported sizes exceed that of the inner groupby object.
        """
        frame = pd.DataFrame(
            {
                'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                'B':
                ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                'C': np.random.randn(8),
                'D': np.random.randn(8)
            },
            index=pd.MultiIndex.from_tuples(
                [(pos // 4, pos) for pos in range(8)]))

        def rebuild(wrapped, **tuple_options):
            # Serialize to the tuple form and construct a new wrapper from it.
            return GroupByWrapper.from_tuple(wrapped.to_tuple(**tuple_options))

        def verify(restored, expected, shape, is_frame, **equal_options):
            # Compare against the pandas groupby, then check the metadata.
            assert_groupby_equal(restored, expected, **equal_options)
            self.assertEqual(restored.shape, shape)
            if is_frame:
                self.assertTrue(restored.is_frame)
            else:
                self.assertFalse(restored.is_frame)

        # group by index level
        restored = rebuild(wrapped_groupby(frame, level=0))
        verify(restored, frame.groupby(level=0), (8, 4), True)
        self.assertGreater(sys.getsizeof(restored),
                           sys.getsizeof(restored.groupby_obj))
        self.assertGreater(calc_data_size(restored),
                           sys.getsizeof(restored.groupby_obj))

        restored = rebuild(wrapped_groupby(frame, level=0).C)
        verify(restored, frame.groupby(level=0).C, (8, ), False)

        # group by a single column
        restored = rebuild(wrapped_groupby(frame, 'B'))
        verify(restored, frame.groupby('B'), (8, 4), True)

        restored = rebuild(wrapped_groupby(frame, 'B').C, truncate=True)
        verify(restored, frame.groupby('B').C, (8, ), False,
               with_selection=True)

        restored = rebuild(wrapped_groupby(frame, 'B')[['C', 'D']],
                           truncate=True)
        verify(restored, frame.groupby('B')[['C', 'D']], (8, 2), True,
               with_selection=True)

        # group by several columns
        restored = rebuild(wrapped_groupby(frame, ['B', 'C']), truncate=True)
        verify(restored, frame.groupby(['B', 'C']), (8, 4), True)

        restored = rebuild(wrapped_groupby(frame, ['B', 'C']).C,
                           truncate=True)
        verify(restored, frame.groupby(['B', 'C']).C, (8, ), False,
               with_selection=True)

        restored = rebuild(wrapped_groupby(frame, ['B', 'C'])[['A', 'D']],
                           truncate=True)
        verify(restored, frame.groupby(['B', 'C'])[['A', 'D']], (8, 2), True,
               with_selection=True)

        restored = rebuild(wrapped_groupby(frame, ['B', 'C'])[['C', 'D']],
                           truncate=True)
        verify(restored, frame.groupby(['B', 'C'])[['C', 'D']], (8, 2), True,
               with_selection=True)

        # group by a function of the index key
        restored = rebuild(wrapped_groupby(frame, lambda x: x[-1] % 2),
                           pickle_function=True)
        verify(restored, frame.groupby(lambda x: x[-1] % 2), (8, 4), True,
               with_selection=True)

        restored = rebuild(wrapped_groupby(frame, lambda x: x[-1] % 2).C,
                           pickle_function=True)
        verify(restored, frame.groupby(lambda x: x[-1] % 2).C, (8, ), False,
               with_selection=True)

        restored = rebuild(
            wrapped_groupby(frame, lambda x: x[-1] % 2)[['C', 'D']],
            pickle_function=True)
        verify(restored, frame.groupby(lambda x: x[-1] % 2)[['C', 'D']],
               (8, 2), True, with_selection=True)

        # Series grouped by a function; to_tuple is called with no options.
        restored = rebuild(wrapped_groupby(frame.B, lambda x: x[-1] % 2))
        verify(restored, frame.B.groupby(lambda x: x[-1] % 2), (8, ), False,
               with_selection=True)
Example #5
0
def test_groupby_wrapper():
    """Check that wrapped groupby objects survive a pickle round trip.

    Each case pickles and unpickles a wrapped groupby over the same frame,
    compares it with the plain pandas groupby and checks the reported
    ``shape`` and ``is_frame``. The first case also checks that the wrapper's
    reported sizes exceed that of the inner groupby object.
    """
    frame = pd.DataFrame(
        {
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        },
        index=pd.MultiIndex.from_tuples(
            [(pos // 4, pos) for pos in range(8)]))

    def roundtrip(obj):
        # Serialize with pickle and load the result straight back.
        return pickle.loads(pickle.dumps(obj))

    def verify(restored, expected, shape, is_frame, **equal_options):
        # Compare against the pandas groupby, then check the metadata.
        assert_groupby_equal(restored, expected, **equal_options)
        assert restored.shape == shape
        # Identity check: is_frame must be exactly the given bool.
        assert restored.is_frame is is_frame

    # group by index level
    restored = roundtrip(wrapped_groupby(frame, level=0))
    verify(restored, frame.groupby(level=0), (8, 4), True)
    assert sys.getsizeof(restored) > sys.getsizeof(restored.groupby_obj)
    assert calc_data_size(restored) > sys.getsizeof(restored.groupby_obj)

    restored = roundtrip(wrapped_groupby(frame, level=0).C)
    verify(restored, frame.groupby(level=0).C, (8, ), False)

    # group by a single column
    restored = roundtrip(wrapped_groupby(frame, 'B'))
    verify(restored, frame.groupby('B'), (8, 4), True)

    restored = roundtrip(wrapped_groupby(frame, 'B').C)
    verify(restored, frame.groupby('B').C, (8, ), False, with_selection=True)

    restored = roundtrip(wrapped_groupby(frame, 'B')[['C', 'D']])
    verify(restored, frame.groupby('B')[['C', 'D']], (8, 2), True,
           with_selection=True)

    # group by several columns
    restored = roundtrip(wrapped_groupby(frame, ['B', 'C']))
    verify(restored, frame.groupby(['B', 'C']), (8, 4), True)

    restored = roundtrip(wrapped_groupby(frame, ['B', 'C']).C)
    verify(restored, frame.groupby(['B', 'C']).C, (8, ), False,
           with_selection=True)

    restored = roundtrip(wrapped_groupby(frame, ['B', 'C'])[['A', 'D']])
    verify(restored, frame.groupby(['B', 'C'])[['A', 'D']], (8, 2), True,
           with_selection=True)

    restored = roundtrip(wrapped_groupby(frame, ['B', 'C'])[['C', 'D']])
    verify(restored, frame.groupby(['B', 'C'])[['C', 'D']], (8, 2), True,
           with_selection=True)

    # group by a function of the index key
    restored = roundtrip(wrapped_groupby(frame, lambda x: x[-1] % 2))
    verify(restored, frame.groupby(lambda x: x[-1] % 2), (8, 4), True,
           with_selection=True)

    restored = roundtrip(wrapped_groupby(frame, lambda x: x[-1] % 2).C)
    verify(restored, frame.groupby(lambda x: x[-1] % 2).C, (8, ), False,
           with_selection=True)

    restored = roundtrip(
        wrapped_groupby(frame, lambda x: x[-1] % 2)[['C', 'D']])
    verify(restored, frame.groupby(lambda x: x[-1] % 2)[['C', 'D']], (8, 2),
           True, with_selection=True)

    # Series grouped by a function
    restored = roundtrip(wrapped_groupby(frame.B, lambda x: x[-1] % 2))
    verify(restored, frame.B.groupby(lambda x: x[-1] % 2), (8, ), False,
           with_selection=True)