Beispiel #1
0
    def testCut(self):
        # Exercises `cut`: argument validation, result types, chunk layout
        # after tiling, graph serialization round trip, and dtype parity with
        # pandas when labels are disabled.
        s = from_pandas_series(pd.Series([1., 2., 3., 4.]), chunk_size=2)

        # Rejected inputs: a negative bin count, nested (2-d) data, empty data.
        for bad_input, bad_bins in ((s, -1), ([[1, 2], [3, 4]], 3), ([], 3)):
            with self.assertRaises(ValueError):
                cut(bad_input, bad_bins)

        # Series input with retbins=True yields a series plus the bins tensor.
        binned, edges = cut(s, [1.5, 2.5], retbins=True)
        self.assertIsInstance(binned, SERIES_TYPE)
        self.assertIsInstance(edges, TENSOR_TYPE)

        binned = binned.tiles()
        self.assertEqual(len(binned.chunks), 2)
        for chunk in binned.chunks:
            self.assertIsInstance(chunk, SERIES_CHUNK_TYPE)
            self.assertEqual(chunk.shape, (2, ))

        # Tensor input yields a categorical.
        categorical = cut(s.to_tensor(), [1.5, 2.5])
        self.assertIsInstance(categorical, CATEGORICAL_TYPE)
        self.assertEqual(len(categorical), len(s))
        self.assertIn('Categorical', repr(categorical))

        categorical = categorical.tiles()
        self.assertEqual(len(categorical.chunks), 2)
        for chunk in categorical.chunks:
            self.assertIsInstance(chunk, CATEGORICAL_CHUNK_TYPE)
            self.assertEqual(chunk.shape, (2, ))
            self.assertEqual(chunk.ndim, 1)

        # Serialization: protobuf round trip followed by a JSON round trip.
        graph = categorical.build_graph(tiled=False)
        graph_cls = type(graph)
        via_pb = graph_cls.from_pb(graph.to_pb())
        via_json = graph_cls.from_json(via_pb.to_json())
        restored = next(
            node for node in via_json if isinstance(node, CATEGORICAL_TYPE))
        self.assertEqual(len(restored), len(categorical))

        # labels=False yields a tensor whose dtype matches pandas' result.
        codes = cut([0, 1, 1, 2], bins=4, labels=False)
        self.assertIsInstance(codes, TENSOR_TYPE)
        pandas_codes = pd.cut([0, 1, 1, 2], bins=4, labels=False)
        self.assertEqual(codes.dtype, pandas_codes.dtype)
Beispiel #2
0
    def testSeriesIsin(self):
        # Checks the chunk structure produced by `Series.isin` for several
        # combinations of chunk sizes on the left and right operands.
        layouts = (
            (10, 2),  # one chunk in multiple chunks
            (2, 4),   # multiple chunks in one chunk
            (2, 2),   # multiple chunks in multiple chunks
        )

        for left_size, right_size in layouts:
            a = from_pandas_series(pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                   chunk_size=left_size)
            b = from_pandas_series(pd.Series([2, 1, 9, 3]),
                                   chunk_size=right_size)
            left_shape = (left_size, )

            tiled = a.isin(b).tiles()
            for pos, chunk in enumerate(tiled.chunks):
                left_in, right_in = chunk.op.inputs[0], chunk.op.inputs[1]

                self.assertEqual(chunk.index, (pos, ))
                self.assertEqual(chunk.dtype, np.dtype('bool'))
                self.assertEqual(chunk.shape, left_shape)
                self.assertEqual(len(chunk.op.inputs), 2)
                self.assertEqual(chunk.op.object_type, ObjectType.series)
                self.assertEqual(left_in.index, (pos, ))
                self.assertEqual(left_in.shape, left_shape)
                # The right operand is always seen as a single 4-element chunk
                # (rechunked when it started out split).
                self.assertEqual(right_in.index, (0, ))
                self.assertEqual(right_in.shape, (4, ))

        # A plain string is rejected as the `isin` argument.
        with self.assertRaises(TypeError):
            a.isin('sth')
    def testUfunc(self):
        # Runs unary ufuncs on a DataFrame and a Series, both through the `mt`
        # functions and through the numpy functions applied to the lazy
        # objects, and compares each result with numpy applied to pandas data.
        df_raw = pd.DataFrame(np.random.uniform(size=(10, 10)),
                              index=pd.RangeIndex(9, -1, -1))
        df = from_pandas(df_raw, chunk_size=5)

        series_raw = pd.Series(np.random.uniform(size=10),
                               index=pd.RangeIndex(9, -1, -1))
        series = from_pandas_series(series_raw, chunk_size=5)

        leading = ('abs', 'log', 'log2', 'log10', 'sin', 'cos', 'tan', 'sinh',
                   'cosh', 'tanh', 'arcsin', 'arccos', 'arctan', 'arcsinh',
                   'arccosh', 'arctanh', 'radians', 'degrees', 'ceil', 'floor')
        trailing = ('exp', 'exp2', 'expm1', 'sqrt')

        ufuncs = [[getattr(np, name), getattr(mt, name)] for name in leading]
        # `around` needs its `decimals` argument bound up front.
        ufuncs.append([partial(np.around, decimals=2),
                       partial(mt.around, decimals=2)])
        ufuncs.extend([getattr(np, name), getattr(mt, name)]
                      for name in trailing)

        for raw, data in ((df_raw, df), (series_raw, series)):
            if isinstance(raw, pd.DataFrame):
                assert_same = pd.testing.assert_frame_equal
            else:
                assert_same = pd.testing.assert_series_equal

            for np_func, mars_func in ufuncs:
                lazy = mars_func(data)
                result = self.executor.execute_tensor(lazy, concat=True)[0]
                expected = np_func(raw)
                assert_same(result, expected)

                # test numpy ufunc
                lazy = np_func(data)
                result = self.executor.execute_tensor(lazy, concat=True)[0]
                assert_same(result, expected)
Beispiel #4
0
    def testFromPandasSeries(self):
        # Checks the index metadata of a Series created from pandas, first on
        # the whole object and then on each chunk after tiling.
        data = pd.Series(np.random.rand(10), name='a')
        series = from_pandas_series(data, chunk_size=4)

        self.assertEqual(series.name, data.name)
        self.assertIsInstance(series.index_value._index_value,
                              IndexValue.RangeIndex)
        self.assertEqual(series.index_value._index_value._slice,
                         slice(0, 10, 1))
        self.assertTrue(series.index_value.is_monotonic_increasing)
        self.assertFalse(series.index_value.is_monotonic_decreasing)
        self.assertTrue(series.index_value.is_unique)
        self.assertEqual(series.index_value.min_val, 0)
        self.assertEqual(series.index_value.max_val, 9)

        series = series.tiles()

        # 10 rows with chunk_size=4 are split into rows 0-4, 4-8 and 8-10.
        self.assertEqual(len(series.chunks), 3)
        for position, (start, stop) in enumerate(((0, 4), (4, 8), (8, 10))):
            chunk = series.chunks[position]
            pd.testing.assert_series_equal(chunk.op.data,
                                           series.op.data.iloc[start:stop])

            inner = chunk.index_value._index_value
            self.assertEqual(inner._slice, slice(start, stop, 1))
            self.assertTrue(inner._is_monotonic_increasing)
            self.assertFalse(inner._is_monotonic_decreasing)
            self.assertTrue(inner._is_unique)
Beispiel #5
0
    def testToCPUExecution(self):
        # Sends a DataFrame and a Series through to_gpu followed by to_cpu and
        # checks that executing the result gives back the pandas input.
        def fetch(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
        frame_on_gpu = to_gpu(from_pandas_df(pdf, chunk_size=(13, 21)))
        frame_back = to_cpu(frame_on_gpu)

        fetched = fetch(frame_back)
        self.assertIsInstance(fetched, pd.DataFrame)
        pd.testing.assert_frame_equal(fetched, pdf)

        # Same round trip for the first column taken as a Series.
        pseries = pdf.iloc[:, 0]
        series_on_gpu = to_gpu(from_pandas_series(pseries, chunk_size=(13, 21)))
        series_back = to_cpu(series_on_gpu)

        fetched = fetch(series_back)
        self.assertIsInstance(fetched, pd.Series)
        pd.testing.assert_series_equal(fetched, pseries)
Beispiel #6
0
    def testSeriesReductionSerialize(self):
        # Round-trips the first chunk of a Series reduction through the
        # protobuf and JSON serializers and compares it with the original.
        data = pd.Series(np.random.rand(10), name='a')
        kwargs = dict(axis='index', skipna=False) if self.has_skipna else dict()
        reduce_method = getattr(from_pandas_series(data), self.func_name)
        reduction_df = reduce_method(**kwargs).tiles()

        def assert_round_trip(original, restored):
            self.assertEqual(original.index, restored.index)
            self.assertEqual(original.key, restored.key)
            self.assertEqual(original.shape, restored.shape)
            self.assertEqual(original.name, restored.name)
            self.assertEqual(original.op.skipna, restored.op.skipna)
            self.assertEqual(original.op.axis, restored.op.axis)
            pd.testing.assert_index_equal(restored.index_value.to_pandas(),
                                          original.index_value.to_pandas())

        # pb
        chunk = reduction_df.chunks[0]
        serials = self._pb_serial(chunk)
        op, pb = serials[chunk.op, chunk.data]

        self.assertEqual(tuple(pb.index), chunk.index)
        self.assertEqual(pb.key, chunk.key)
        self.assertEqual(tuple(pb.shape), chunk.shape)
        # The operand type string carries the numeric op code after the dot.
        op_code = int(op.type.split('.', 1)[1])
        self.assertEqual(op_code, self.op_num)

        assert_round_trip(chunk, self._pb_deserial(serials)[chunk.data])

        # json
        chunk = reduction_df.chunks[0]
        serials = self._json_serial(chunk)
        assert_round_trip(chunk, self._json_deserial(serials)[chunk.data])
Beispiel #7
0
    def testToGPU(self):
        # Checks that to_gpu keeps index/column/dtype metadata, sets the gpu
        # flag on the op, and returns its argument unchanged when that
        # argument already came from to_gpu.

        # test dataframe
        row_labels = np.random.randint(-100, 100, size=(10, ))
        column_labels = [np.random.bytes(10) for _ in range(10)]
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=row_labels,
                            columns=column_labels)
        df = from_pandas_df(data)
        cdf = to_gpu(df)

        self.assertEqual(df.index_value, cdf.index_value)
        self.assertEqual(df.columns_value, cdf.columns_value)
        self.assertTrue(cdf.op.gpu)
        pd.testing.assert_series_equal(df.dtypes, cdf.dtypes)

        cdf = cdf.tiles()
        df = get_tiled(df)
        cpu_chunk, gpu_chunk = df.chunks[0], cdf.chunks[0]

        self.assertEqual(df.nsplits, cdf.nsplits)
        self.assertEqual(cpu_chunk.index_value, gpu_chunk.index_value)
        self.assertEqual(cpu_chunk.columns_value, gpu_chunk.columns_value)
        self.assertTrue(gpu_chunk.op.gpu)
        pd.testing.assert_series_equal(cpu_chunk.dtypes, gpu_chunk.dtypes)

        self.assertIs(cdf, to_gpu(cdf))

        # test series
        sdata = data.iloc[:, 0]
        series = from_pandas_series(sdata)
        cseries = to_gpu(series)

        self.assertEqual(series.index_value, cseries.index_value)
        self.assertTrue(cseries.op.gpu)

        cseries = cseries.tiles()
        series = get_tiled(series)
        cpu_chunk, gpu_chunk = series.chunks[0], cseries.chunks[0]

        self.assertEqual(series.nsplits, cseries.nsplits)
        self.assertEqual(cpu_chunk.index_value, gpu_chunk.index_value)
        self.assertTrue(gpu_chunk.op.gpu)

        self.assertIs(cseries, to_gpu(cseries))
Beispiel #8
0
def test_to_numeric():
    # Checks `to_numeric` input validation and the result type for series
    # and list inputs.
    raw = pd.DataFrame({"a": [1.0, 2, 3, -3]})
    df = from_pandas_df(raw, chunk_size=2)

    # Rejected inputs: a DataFrame, a nested list and an empty list.
    for invalid in (df, [['1.0', 1]], []):
        with pytest.raises(ValueError):
            to_numeric(invalid)

    mixed = pd.Series(['1.0', '2.0', 1, -2])
    converted = tile(to_numeric(from_pandas_series(mixed, chunk_size=2)))
    assert len(converted.chunks) == 2
    assert isinstance(converted, SERIES_TYPE)

    converted = tile(to_numeric(['1.0', '2.0', 1, -2]))
    assert isinstance(converted, TENSOR_TYPE)
Beispiel #9
0
    def testSeriesReductionSerialize(self):
        # Round-trips the first chunk of a chunked Series reduction through
        # the protobuf and JSON serializers; the operand is expected to carry
        # the DataFrameAggregate op type.
        data = pd.Series(np.random.rand(10), name='a')
        kwargs = dict(axis='index', skipna=False) if self.has_skipna else dict()
        reduce_method = getattr(from_pandas_series(data, chunk_size=3),
                                self.func_name)
        reduction_df = reduce_method(**kwargs).tiles()

        def first_skipna(candidate):
            return candidate.op.agg_funcs[0].kwds.get('skipna')

        def assert_round_trip(original, restored):
            self.assertEqual(original.index, restored.index)
            self.assertEqual(original.key, restored.key)
            self.assertEqual(original.shape, restored.shape)
            self.assertEqual(first_skipna(original), first_skipna(restored))
            self.assertEqual(original.op.axis, restored.op.axis)

        # pb
        chunk = reduction_df.chunks[0]
        serials = self._pb_serial(chunk)
        op, pb = serials[chunk.op, chunk.data]

        self.assertEqual(tuple(pb.index), chunk.index)
        self.assertEqual(pb.key, chunk.key)
        self.assertEqual(tuple(pb.shape), chunk.shape)
        # The operand type string carries the numeric op code after the dot.
        op_code = int(op.type.split('.', 1)[1])
        self.assertEqual(op_code, DataFrameAggregate._op_type_)

        assert_round_trip(chunk, self._pb_deserial(serials)[chunk.data])

        # json
        chunk = reduction_df.chunks[0]
        serials = self._json_serial(chunk)
        assert_round_trip(chunk, self._json_deserial(serials)[chunk.data])
Beispiel #10
0
def test_cut():
    # Exercises `cut`: argument validation, result types, chunk layout after
    # tiling, and dtype parity with pandas when labels are disabled.
    s = from_pandas_series(pd.Series([1., 2., 3., 4.]), chunk_size=2)

    # Rejected inputs: a negative bin count, nested (2-d) data, empty data.
    for bad_input, bad_bins in ((s, -1), ([[1, 2], [3, 4]], 3), ([], 3)):
        with pytest.raises(ValueError):
            cut(bad_input, bad_bins)

    # Series input with retbins=True yields a series plus the bins tensor.
    binned, edges = cut(s, [1.5, 2.5], retbins=True)
    assert isinstance(binned, SERIES_TYPE)
    assert isinstance(edges, TENSOR_TYPE)

    binned = tile(binned)
    assert len(binned.chunks) == 2
    for chunk in binned.chunks:
        assert isinstance(chunk, SERIES_CHUNK_TYPE)
        assert chunk.shape == (2, )

    # Tensor input yields a categorical.
    categorical = cut(s.to_tensor(), [1.5, 2.5])
    assert isinstance(categorical, CATEGORICAL_TYPE)
    assert len(categorical) == len(s)
    assert 'Categorical' in repr(categorical)

    categorical = tile(categorical)
    assert len(categorical.chunks) == 2
    for chunk in categorical.chunks:
        assert isinstance(chunk, CATEGORICAL_CHUNK_TYPE)
        assert chunk.shape == (2, )
        assert chunk.ndim == 1

    # labels=False yields a tensor whose dtype matches pandas' result.
    codes = cut([0, 1, 1, 2], bins=4, labels=False)
    assert isinstance(codes, TENSOR_TYPE)
    pandas_codes = pd.cut([0, 1, 1, 2], bins=4, labels=False)
    assert codes.dtype == pandas_codes.dtype
Beispiel #11
0
    def testSeriesApplyTransform(self):
        # Checks op type, dtype, shape and chunk layout for Series.apply and
        # Series.transform on a 20-element integer series split into chunks
        # of 5.
        #
        # NOTE: dtype checks use assertEqual. The previous
        # `assertTrue(np.dtype(...), r.dtype)` passed `r.dtype` as the failure
        # *message* and only tested that the expected dtype object is truthy,
        # so it could never fail.
        idxes = [chr(ord('A') + i) for i in range(20)]
        s_raw = pd.Series([i**2 for i in range(20)], index=idxes)

        series = from_pandas_series(s_raw, chunk_size=5)

        # A string naming an arithmetic method maps to the ADD operand.
        r = series.apply('add', args=(1, )).tiles()
        self.assertEqual(r.op._op_type_, opcodes.ADD)

        r = series.apply(np.sqrt).tiles()
        self.assertEqual(r.dtype, np.dtype('float64'))
        self.assertEqual(r.shape, series.shape)
        self.assertEqual(r.op._op_type_, opcodes.SERIES_APPLY)
        self.assertEqual(r.op.object_type, ObjectType.series)
        self.assertEqual(r.chunks[0].shape, (5, ))
        self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))

        r = series.apply('sqrt').tiles()
        self.assertEqual(r.dtype, np.dtype('float64'))
        self.assertEqual(r.shape, series.shape)
        self.assertEqual(r.op._op_type_, opcodes.SERIES_APPLY)
        self.assertEqual(r.op.object_type, ObjectType.series)
        self.assertEqual(r.chunks[0].shape, (5, ))
        self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))

        r = series.transform(lambda x: x + 1).tiles()
        # Adding 1 to an integer series does not produce floats, so the
        # expected dtype is taken from pandas rather than hard-coded (the old
        # never-evaluated expectation said float64).
        expected_dtype = s_raw.transform(lambda x: x + 1).dtype
        self.assertEqual(r.dtype, expected_dtype)
        self.assertEqual(r.shape, series.shape)
        self.assertEqual(r.op._op_type_, opcodes.SERIES_TRANSFORM)
        self.assertEqual(r.op.object_type, ObjectType.series)
        self.assertEqual(r.chunks[0].shape, (5, ))
        self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))

        # Returning a list per element with convert_dtype=False gives object
        # dtype.
        r = series.apply(lambda x: [x, x + 1], convert_dtype=False).tiles()
        self.assertEqual(r.dtype, np.dtype('object'))
        self.assertEqual(r.shape, series.shape)
        self.assertEqual(r.op._op_type_, opcodes.SERIES_APPLY)
        self.assertEqual(r.op.object_type, ObjectType.series)
        self.assertEqual(r.chunks[0].shape, (5, ))
        self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))
Beispiel #12
0
    def testSeriesSumSerialize(self):
        # Round-trips the first chunk of `Series.sum` through the protobuf and
        # JSON serializers and compares it with the original chunk.
        data = pd.Series(np.random.rand(10), name='a')
        lazy_sum = from_pandas_series(data).sum(axis='index', skipna=False)
        sum_df = lazy_sum.tiles()

        def assert_round_trip(original, restored):
            self.assertEqual(original.index, restored.index)
            self.assertEqual(original.key, restored.key)
            self.assertEqual(original.shape, restored.shape)
            self.assertEqual(original.name, restored.name)
            self.assertEqual(original.op.skipna, restored.op.skipna)
            self.assertEqual(original.op.axis, restored.op.axis)
            pd.testing.assert_index_equal(restored.index_value.to_pandas(),
                                          original.index_value.to_pandas())

        # pb
        chunk = sum_df.chunks[0]
        serials = self._pb_serial(chunk)
        op, pb = serials[chunk.op, chunk.data]

        self.assertEqual(tuple(pb.index), chunk.index)
        self.assertEqual(pb.key, chunk.key)
        self.assertEqual(tuple(pb.shape), chunk.shape)
        # The operand type string carries the numeric op code after the dot.
        op_code = int(op.type.split('.', 1)[1])
        self.assertEqual(op_code, OperandDef.SUM)

        assert_round_trip(chunk, self._pb_deserial(serials)[chunk.data])

        # json
        chunk = sum_df.chunks[0]
        serials = self._json_serial(chunk)
        assert_round_trip(chunk, self._json_deserial(serials)[chunk.data])
Beispiel #13
0
def test_from_pandas_series():
    # Checks the index metadata of a Series created from pandas, first on the
    # whole object and then on each chunk after tiling.
    data = pd.Series(np.random.rand(10), name='a')
    series = from_pandas_series(data, chunk_size=4)

    assert series.name == data.name
    assert isinstance(series.index_value._index_value, IndexValue.RangeIndex)
    assert series.index_value._index_value._slice == slice(0, 10, 1)
    assert series.index_value.is_monotonic_increasing is True
    assert series.index_value.is_monotonic_decreasing is False
    assert series.index_value.is_unique is True
    assert series.index_value.min_val == 0
    assert series.index_value.max_val == 9

    series = tile(series)

    # 10 rows with chunk_size=4 are split into rows 0-4, 4-8 and 8-10.
    assert len(series.chunks) == 3
    for position, (start, stop) in enumerate(((0, 4), (4, 8), (8, 10))):
        chunk = series.chunks[position]
        pd.testing.assert_series_equal(chunk.op.data,
                                       series.op.data.iloc[start:stop])

        inner = chunk.index_value._index_value
        assert inner._slice == slice(start, stop, 1)
        assert inner._is_monotonic_increasing is True
        assert inner._is_monotonic_decreasing is False
        assert inner._is_unique is True
Beispiel #14
0
    def testCheckNA(self):
        # Compares isna / notna on chunked DataFrame and Series objects with
        # pandas, using mostly-NaN data with a few random values filled in.
        def fetch(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        df_raw = pd.DataFrame(np.nan, index=range(0, 20),
                              columns=list('ABCDEFGHIJ'))
        for _ in range(20):
            # Draw order (value, row, column) follows the evaluation order of
            # the equivalent single assignment statement.
            value = random.randint(0, 99)
            row = random.randint(0, 19)
            column = random.randint(0, 9)
            df_raw.iloc[row, column] = value

        df = from_pandas_df(df_raw, chunk_size=4)

        pd.testing.assert_frame_equal(fetch(df.isna()), df_raw.isna())
        pd.testing.assert_frame_equal(fetch(df.notna()), df_raw.notna())

        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            value = random.randint(0, 99)
            position = random.randint(0, 19)
            series_raw.iloc[position] = value

        series = from_pandas_series(series_raw, chunk_size=4)

        pd.testing.assert_series_equal(fetch(series.isna()),
                                       series_raw.isna())
        pd.testing.assert_series_equal(fetch(series.notna()),
                                       series_raw.notna())
Beispiel #15
0
def test_to_gpu():
    # Checks that to_gpu keeps index/column/dtype metadata, sets the gpu flag
    # on the op, and returns its argument unchanged when that argument already
    # came from to_gpu.

    # test dataframe
    row_labels = np.random.randint(-100, 100, size=(10, ))
    column_labels = [np.random.bytes(10) for _ in range(10)]
    data = pd.DataFrame(np.random.rand(10, 10),
                        index=row_labels,
                        columns=column_labels)
    df = from_pandas_df(data)
    cdf = to_gpu(df)

    assert df.index_value == cdf.index_value
    assert df.columns_value == cdf.columns_value
    assert cdf.op.gpu is True
    pd.testing.assert_series_equal(df.dtypes, cdf.dtypes)

    df, cdf = tile(df, cdf)
    cpu_chunk, gpu_chunk = df.chunks[0], cdf.chunks[0]

    assert df.nsplits == cdf.nsplits
    assert cpu_chunk.index_value == gpu_chunk.index_value
    assert cpu_chunk.columns_value == gpu_chunk.columns_value
    assert gpu_chunk.op.gpu is True
    pd.testing.assert_series_equal(cpu_chunk.dtypes, gpu_chunk.dtypes)

    assert cdf is to_gpu(cdf)

    # test series
    sdata = data.iloc[:, 0]
    series = from_pandas_series(sdata)
    cseries = to_gpu(series)

    assert series.index_value == cseries.index_value
    assert cseries.op.gpu is True

    series, cseries = tile(series, cseries)
    cpu_chunk, gpu_chunk = series.chunks[0], cseries.chunks[0]

    assert series.nsplits == cseries.nsplits
    assert cpu_chunk.index_value == gpu_chunk.index_value
    assert gpu_chunk.op.gpu is True

    assert cseries is to_gpu(cseries)
    def testRechunkExecution(self):
        # Rechunks DataFrame, Series and Index objects and checks that the
        # executed result still equals the pandas input.
        def fetch(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        data = pd.DataFrame(np.random.rand(8, 10))
        df = from_pandas_df(pd.DataFrame(data), chunk_size=3)
        rechunked = df.rechunk((3, 4))
        pd.testing.assert_frame_equal(data, fetch(rechunked))

        row_labels = np.random.randint(-100, 100, size=(10, ))
        column_labels = [np.random.bytes(10) for _ in range(10)]
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=row_labels,
                            columns=column_labels)
        df = from_pandas_df(data)
        rechunked = df.rechunk(5)
        pd.testing.assert_frame_equal(data, fetch(rechunked))

        # test Series rechunk execution.
        data = pd.Series(np.random.rand(10, ))
        series = from_pandas_series(data)
        for size in (3, 1):
            rechunked = series.rechunk(size)
            pd.testing.assert_series_equal(data, fetch(rechunked))

        # test index rechunk execution
        data = pd.Index(np.random.rand(10, ))
        index = from_pandas_index(data)
        for size in (3, 1):
            rechunked = index.rechunk(size)
            pd.testing.assert_index_equal(data, fetch(rechunked))
Beispiel #17
0
    def testSeriesCumReduction(self):
        # Applies `self.compute` to chunked series built with various chunk
        # sizes and keyword arguments, and compares the executed result with
        # `self.compute` applied to the pandas data.
        def verify(raw, source_kwargs, compute_kwargs):
            lazy = self.compute(from_pandas_series(raw, **source_kwargs),
                                **compute_kwargs)
            pd.testing.assert_series_equal(
                self.compute(raw, **compute_kwargs),
                self.executor.execute_dataframe(lazy, concat=True)[0])

        data = pd.Series(np.random.rand(20),
                         index=[str(i) for i in range(20)],
                         name='a')
        dense_cases = (
            ({}, {}),
            ({'chunk_size': 6}, {}),
            ({'chunk_size': 3}, {}),
            ({'chunk_size': 4}, {'axis': 'index'}),
        )
        for source_kwargs, compute_kwargs in dense_cases:
            verify(data, source_kwargs, compute_kwargs)

        data = pd.Series(np.random.rand(20), name='a')
        data[0] = 0.1  # make sure not all elements are NAN
        data[data > 0.5] = np.nan
        sparse_cases = (
            ({'chunk_size': 3}, {}),
            ({'chunk_size': 3}, {'skipna': False}),
        )
        for source_kwargs, compute_kwargs in sparse_cases:
            verify(data, source_kwargs, compute_kwargs)
    def testFromPandasSeriesExecution(self):
        # Executes a chunked Series whose index is built from two arrays and
        # checks the result equals the pandas input.
        values = np.random.rand(20)
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        ps = pd.Series(values, index=index_levels, name='a')

        series = from_pandas_series(ps, chunk_size=13)
        fetched = self.executor.execute_dataframe(series, concat=True)[0]

        pd.testing.assert_series_equal(ps, fetched)
def test_series(setup, func_name, func_opts):
    # Applies the binary function under test to pairs of chunked series
    # (and to a series with a scalar) and compares with pandas.
    def prepare(raw):
        return to_boolean_if_needed(func_opts.func_name, raw)

    def check(lazy, expected):
        pd.testing.assert_series_equal(expected, lazy.execute().fetch())

    def check_pair(left, right, left_chunk, right_chunk):
        lazy = func_opts.func(from_pandas_series(left, chunk_size=left_chunk),
                              from_pandas_series(right, chunk_size=right_chunk))
        result = lazy.execute().fetch()
        expected = func_opts.func(left, right)
        pd.testing.assert_series_equal(expected, result)

    # only one chunk
    left = prepare(pd.Series(np.arange(10) + 1))
    right = prepare(pd.Series(np.arange(10) + 1))
    check_pair(left, right, 10, 10)

    # same index
    left = prepare(pd.Series(np.arange(10) + 1))
    right = prepare(pd.Series(np.arange(10) + 1))
    check_pair(left, right, 4, 6)

    # no shuffle
    left = prepare(pd.Series(np.arange(10) + 1, index=range(10)))
    right = prepare(pd.Series(np.arange(10) + 1, index=range(10, 0, -1)))
    check_pair(left, right, 4, 6)

    # shuffle
    data = (np.arange(10) + 1).astype(np.int64, copy=False)
    left = prepare(pd.Series(data, index=np.random.permutation(range(10))))
    right = prepare(
        pd.Series(data, index=np.random.permutation(range(10, 0, -1))))
    check_pair(left, right, 4, 6)

    # bitwise logical operators don't support floating point scalars
    if func_opts.func_name in ['__and__', '__or__', '__xor__']:
        return

    # operate with scalar
    operand = prepare(
        pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10))))
    lazy = func_opts.func(from_pandas_series(operand, chunk_size=4), 4)
    result = lazy.execute().fetch()
    expected = func_opts.func(operand, 4)
    pd.testing.assert_series_equal(expected, result)

    # reverse with scalar
    operand = prepare(
        pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10))))
    lazy = func_opts.func(4, from_pandas_series(operand, chunk_size=4))
    result = lazy.execute().fetch()
    expected = func_opts.func(4, operand)
    pd.testing.assert_series_equal(expected, result)
def test_dataframe_and_series(setup, func_name, func_opts):
    """Exercise one binary operator between a Mars DataFrame and a Mars Series.

    The operator is looked up by ``func_opts.func_name`` on both the Mars and
    the pandas objects; every scenario executes the Mars expression and
    compares the fetched frame with the pandas result.  ``setup`` and
    ``func_name`` are not used in the body (presumably pytest fixtures or
    parametrization ids -- confirm against the module's decorators).
    """
    op_name = func_opts.func_name
    if op_name in ('__and__', '__or__', '__xor__'):
        # pandas fails to compute some expected values due to `na`.
        return

    def apply(target, other, axis):
        # Invoke the operator under test as a method of ``target``.
        return getattr(target, op_name)(other, axis=axis)

    left_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    left_raw = to_boolean_if_needed(op_name, left_raw)
    right_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    right_raw = to_boolean_if_needed(op_name, right_raw)

    mars_operand = from_pandas_series(right_raw[1], chunk_size=(6,))

    large_cases = (
        (left_raw[[1]], 'index'),    # single-column dataframe and series
        (left_raw, 'index'),         # dataframe and series without shuffle
        (left_raw, 'columns'),       # dataframe and series with shuffle
    )
    for frame_raw, axis in large_cases:
        lazy = apply(from_pandas(frame_raw, chunk_size=(5, 5)), mars_operand, axis)
        wanted = apply(frame_raw, right_raw[1], axis)
        got = lazy.execute().fetch()
        pd.testing.assert_frame_equal(wanted, got)

    def check_small(columns, row_labels, series_labels, axis, mars_kwargs,
                    fix_wanted=None, fix_got=None):
        # Build a 3-row frame and a 3-item series, run the operator on the
        # Mars versions, and compare with pandas.  The optional ``fix_*``
        # callables reorder the pandas / Mars outputs before comparison.
        frame_raw = pd.DataFrame(columns, index=row_labels)
        frame_raw = to_boolean_if_needed(op_name, frame_raw)
        mars_frame = from_pandas(frame_raw, **mars_kwargs)
        series_raw = pd.Series([0, 1, 2], index=series_labels)
        mars_series = from_pandas_series(series_raw)
        got = apply(mars_frame, mars_series, axis).execute().fetch()
        wanted = apply(frame_raw, series_raw, axis)
        if fix_wanted is not None:
            wanted = fix_wanted(wanted)
        if fix_got is not None:
            got = fix_got(got)
        pd.testing.assert_frame_equal(wanted, got)

    # both one chunk, axis=0
    check_small({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, [1, 2, 3], [1, 2, 3],
                0, {})

    # different number of chunks, axis=0
    check_small({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, [1, 2, 3], [1, 2, 3],
                0, {'chunk_size': 1})

    # row labels in different order on both sides, axis=0;
    # both outputs are brought to the same row order before comparing
    check_small({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, [2, 1, 3], [3, 1, 2],
                0, {'chunk_size': 1},
                fix_wanted=lambda frame: frame.reindex([3, 1, 2]),
                fix_got=lambda frame: frame.reindex(index=[3, 1, 2]))

    # both one chunk, axis=1
    check_small({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                ['ra', 'rb', 'rc'], [1, 2, 3], 1, {})

    # different number of chunks, axis=1
    check_small({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                ['ra', 'rb', 'rc'], [1, 2, 3], 1, {'chunk_size': 1})

    # column labels in different order on both sides, axis=1;
    # the Mars output columns are reordered before comparing
    check_small({1: [1, 3, 2], 3: [1, 2, 3], 2: [360, 180, 2]},
                ['ra', 'rb', 'rc'], [3, 1, 2], 1, {'chunk_size': 1},
                fix_got=lambda frame: frame[[1, 2, 3]])
Beispiel #21
0
    def testSeriesReduction(self):
        """Check ``self.compute`` on Mars series against the same call on pandas.

        ``self.compute`` is applied to both the raw pandas series and the Mars
        series; the Mars side is executed through ``self.executor`` and the
        two scalars are compared for several chunk sizes, with NaN values,
        and (when ``self.has_min_count`` is set) with ``min_count``.
        """
        def run(tileable):
            # Execute the lazy reduction and return the single concatenated value.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.Series(np.random.rand(20),
                        index=[str(i) for i in range(20)],
                        name='a')

        # default chunking: the original asserts exact equality here
        whole = self.compute(from_pandas_series(raw))
        self.assertEqual(self.compute(raw), run(whole))

        # several chunks: compare approximately
        for size in (6, 3):
            chunked = self.compute(from_pandas_series(raw, chunk_size=size))
            self.assertAlmostEqual(self.compute(raw), run(chunked))

        by_index = self.compute(from_pandas_series(raw, chunk_size=4),
                                axis='index')
        self.assertAlmostEqual(self.compute(raw, axis='index'), run(by_index))

        # series containing NaN values
        raw = pd.Series(np.random.rand(20), name='a')
        raw[0] = 0.1  # make sure not all elements are NAN
        raw[raw > 0.5] = np.nan

        def reduce_in_threes(**kwargs):
            # Lazy reduction over the NaN series split into chunks of 3.
            return self.compute(from_pandas_series(raw, chunk_size=3), **kwargs)

        skipping = reduce_in_threes()
        self.assertAlmostEqual(self.compute(raw), run(skipping))

        keeping = reduce_in_threes(skipna=False)
        self.assertTrue(np.isnan(run(keeping)))

        if self.has_min_count:
            strict = reduce_in_threes(skipna=False, min_count=2)
            self.assertTrue(np.isnan(run(strict)))

            at_least_one = reduce_in_threes(min_count=1)
            self.assertAlmostEqual(self.compute(raw, min_count=1),
                                   run(at_least_one))

            # more required values than the series has elements
            too_many = reduce_in_threes(min_count=21)
            self.assertTrue(np.isnan(run(too_many)))
    def testMainDataFrameWithoutEtcd(self):
        """Submit DataFrame/Series graphs to a session started with ``etcd=False``.

        Three ``add`` graphs over differently indexed and chunked frames, and
        one plain series graph, are submitted through the session actor; each
        must reach ``GraphState.SUCCEEDED`` and its fetched result must equal
        the pandas computation.
        """
        import pandas as pd
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.arithmetic import add

        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        def run_and_fetch(tileable):
            # Submit the tileable's graph, wait for success, return the loaded result.
            graph = tileable.build_graph()
            targets = [tileable.key]
            graph_key = uuid.uuid1()
            session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                              graph_key, target_tileables=targets)
            state = self.wait_for_termination(actor_client, session_ref, graph_key)
            self.assertEqual(state, GraphState.SUCCEEDED)
            return loads(session_ref.fetch_result(graph_key, tileable.key))

        left_columns = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        right_columns = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        # (left frame kwargs, left chunk_size, right frame kwargs, right chunk_size)
        scenarios = [
            # default index and columns
            ({}, 5, {}, 6),
            # differing labels, split along columns only
            (dict(index=np.arange(10), columns=left_columns), (10, 5),
             dict(index=np.arange(11, 1, -1), columns=right_columns), (10, 6)),
            # differing, unordered labels split along both axes
            (dict(index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9], columns=left_columns), 5,
             dict(index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3], columns=right_columns), 6),
        ]
        for left_kwargs, left_chunks, right_kwargs, right_chunks in scenarios:
            left_raw = pd.DataFrame(np.random.rand(10, 10), **left_kwargs)
            left = from_pandas_df(left_raw, chunk_size=left_chunks)
            right_raw = pd.DataFrame(np.random.rand(10, 10), **right_kwargs)
            right = from_pandas_df(right_raw, chunk_size=right_chunks)

            fetched = run_and_fetch(add(left, right))
            pd.testing.assert_frame_equal(left_raw + right_raw, fetched)

        # a series data source on its own
        series_raw = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        fetched = run_and_fetch(from_pandas_series(series_raw))
        pd.testing.assert_series_equal(series_raw, fetched)
Beispiel #23
0
    def testStringMethod(self):
        """Check metadata of ``Series.str`` operations before and after tiling.

        Covers ``contains``, ``split``, ``cat`` (including invalid arguments)
        and ``extract`` on a three-element series split into chunks of two.
        Only dtypes, shapes, index/column values and object types are
        asserted; nothing is executed here.
        """
        s = pd.Series(['a', 'b', 'c'], name='s')
        series = from_pandas_series(s, chunk_size=2)
        index_equal = pd.testing.assert_index_equal

        def rows_of(pos):
            # Index labels expected for the chunk at position ``pos`` (chunk size 2).
            return s.index[pos * 2:(pos + 1) * 2]

        def check_series_chunks(tiled, dtype):
            for pos, chunk in enumerate(tiled.chunks):
                self.assertEqual(chunk.index, (pos, ))
                self.assertEqual(chunk.dtype, dtype)
                self.assertEqual(chunk.name, s.name)
                index_equal(chunk.index_value.to_pandas(), rows_of(pos))
                self.assertEqual(chunk.shape, (2, ) if pos == 0 else (1, ))

        def check_frame(lazy, width):
            # Metadata of a dataframe-valued result with ``width`` columns,
            # first untiled, then chunk by chunk.
            self.assertEqual(lazy.op.object_type, ObjectType.dataframe)
            self.assertEqual(lazy.shape, (3, width))
            index_equal(lazy.index_value.to_pandas(), s.index)
            index_equal(lazy.columns_value.to_pandas(), pd.RangeIndex(width))

            tiled = lazy.tiles()
            for pos, chunk in enumerate(tiled.chunks):
                self.assertEqual(chunk.index, (pos, 0))
                index_equal(chunk.index_value.to_pandas(), rows_of(pos))
                index_equal(chunk.columns_value.to_pandas(),
                            pd.RangeIndex(width))
                self.assertEqual(chunk.shape,
                                 (2, width) if pos == 0 else (1, width))

        with self.assertRaises(AttributeError):
            _ = series.str.non_exist

        # contains -> boolean series
        found = series.str.contains('c')
        self.assertEqual(found.dtype, np.bool_)
        self.assertEqual(found.name, s.name)
        index_equal(found.index_value.to_pandas(), s.index)
        self.assertEqual(found.shape, s.shape)
        check_series_chunks(found.tiles(), np.bool_)

        # split with expand=True -> dataframe with two columns
        check_frame(series.str.split(',', expand=True, n=1), 2)

        # invalid ``others`` arguments for cat
        rejected = (
            (TypeError, [['1', '2']]),
            (ValueError, ['1', '2']),
            (ValueError, ','),
            (TypeError, {'1', '2', '3'}),
        )
        for error, others in rejected:
            with self.assertRaises(error):
                _ = series.str.cat(others)

        # cat without others -> scalar
        joined = series.str.cat(sep=',')
        self.assertEqual(joined.op.object_type, ObjectType.scalar)
        self.assertEqual(joined.dtype, s.dtype)

        joined = joined.tiles()
        self.assertEqual(len(joined.chunks), 1)
        only_chunk = joined.chunks[0]
        self.assertEqual(only_chunk.op.object_type, ObjectType.scalar)
        self.assertEqual(only_chunk.dtype, s.dtype)

        # extract with expand=False -> series
        extracted = series.str.extract(r'[ab](\d)', expand=False)
        self.assertEqual(extracted.op.object_type, ObjectType.series)
        self.assertEqual(extracted.dtype, s.dtype)
        check_series_chunks(extracted.tiles(), s.dtype)

        # extract with expand=True -> dataframe with one column
        check_frame(series.str.extract(r'[ab](\d)', expand=True), 1)

        self.assertIn('lstrip', dir(series.str))
Beispiel #24
0
    def testTransform(self):
        """Check metadata produced by tiling ``transform`` on DataFrame and Series.

        ``options.chunk_store_limit`` is lowered to 20 for the duration of the
        test and restored afterwards.  For each call the test asserts dtypes,
        shapes, operand type, object type, and -- for the DataFrame cases --
        that the first chunk's input is a CONCATENATE over the whole axis.
        """
        cols = [chr(ord('A') + i) for i in range(10)]
        df_raw = pd.DataFrame({c: [i**2 for i in range(20)] for c in cols})
        df = from_pandas_df(df_raw, chunk_size=5)

        idxes = [chr(ord('A') + i) for i in range(20)]
        s_raw = pd.Series([i**2 for i in range(20)], index=idxes)
        series = from_pandas_series(s_raw, chunk_size=5)

        def rename_fn(f, new_name):
            # Give a lambda a distinct ``__name__``; used where several
            # functions are passed together in a list or dict.
            f.__name__ = new_name
            return f

        old_chunk_store_limit = options.chunk_store_limit
        try:
            options.chunk_store_limit = 20

            # DATAFRAME CASES
            # test transform scenarios on data frames
            r = df.transform(lambda x: list(range(len(x)))).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, df.shape)
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape,
                             (df.shape[0], 20 // df.shape[0]))
            self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            r = df.transform(lambda x: list(range(len(x))), axis=1).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, df.shape)
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape,
                             (20 // df.shape[1], df.shape[1]))
            self.assertEqual(r.chunks[0].inputs[0].shape[1], df_raw.shape[1])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            r = df.transform(['cumsum', 'cummax', lambda x: x + 1]).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, (df.shape[0], df.shape[1] * 3))
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape,
                             (df.shape[0], 20 // df.shape[0] * 3))
            self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            r = df.transform({
                'A': 'cumsum',
                'D': ['cumsum', 'cummax'],
                'F': lambda x: x + 1
            }).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, (df.shape[0], 4))
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape, (df.shape[0], 1))
            self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            # test agg scenarios on data frames (``_call_agg=True``); output
            # lengths along the transformed axis are unknown, hence ``np.nan``
            r = df.transform(lambda x: x.iloc[:-1], _call_agg=True).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, (np.nan, df.shape[1]))
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape, (np.nan, 1))
            self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            r = df.transform(lambda x: x.iloc[:-1], axis=1,
                             _call_agg=True).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, (df.shape[0], np.nan))
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape, (2, np.nan))
            self.assertEqual(r.chunks[0].inputs[0].shape[1], df_raw.shape[1])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            fn_list = [
                rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                lambda x: x.iloc[:-1].reset_index(drop=True)
            ]
            r = df.transform(fn_list, _call_agg=True).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, (np.nan, df.shape[1] * 2))
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape, (np.nan, 2))
            self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            # a reducing function turns the result into a series
            r = df.transform(lambda x: x.sum(), _call_agg=True).tiles()
            self.assertEqual(r.dtype, np.dtype('int64'))
            self.assertEqual(r.shape, (df.shape[1], ))
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.series)
            self.assertEqual(r.chunks[0].shape, (20 // df.shape[0], ))
            self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            fn_dict = {
                'A':
                rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                'D': [
                    rename_fn(lambda x: x.iloc[1:].reset_index(drop=True),
                              'f1'),
                    lambda x: x.iloc[:-1].reset_index(drop=True)
                ],
                'F':
                lambda x: x.iloc[:-1].reset_index(drop=True),
            }
            r = df.transform(fn_dict, _call_agg=True).tiles()
            self.assertTrue(all(v == np.dtype('int64') for v in r.dtypes))
            self.assertEqual(r.shape, (np.nan, 4))
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.dataframe)
            self.assertEqual(r.chunks[0].shape, (np.nan, 1))
            self.assertEqual(r.chunks[0].inputs[0].shape[0], df_raw.shape[0])
            self.assertEqual(r.chunks[0].inputs[0].op._op_type_,
                             opcodes.CONCATENATE)

            # SERIES CASES
            # test transform scenarios on series
            r = series.transform(lambda x: x + 1).tiles()
            # The previous ``assertTrue(np.dtype('float64'), r.dtype)`` could
            # never fail: its second argument is the failure message.  The
            # source series holds integers, so ``x + 1`` keeps int64.
            self.assertEqual(r.dtype, np.dtype('int64'))
            self.assertEqual(r.shape, series.shape)
            self.assertEqual(r.op._op_type_, opcodes.TRANSFORM)
            self.assertEqual(r.op.object_type, ObjectType.series)
            self.assertEqual(r.chunks[0].shape, (5, ))
            self.assertEqual(r.chunks[0].inputs[0].shape, (5, ))
        finally:
            options.chunk_store_limit = old_chunk_store_limit
Beispiel #25
0
    def testFillNA(self):
        """Check argument validation and tiling metadata of ``fillna``/``ffill``/``bfill``.

        Builds mostly-NaN frames and series with a few random values, verifies
        that invalid argument combinations raise, and inspects chunk counts,
        chunk shapes and operand stages of the tiled results.  Nothing is
        executed.
        """
        df_raw = pd.DataFrame(np.nan,
                              index=range(0, 20),
                              columns=list('ABCDEFGHIJ'))
        for _ in range(20):
            # kept as one assignment so the order of the random draws is unchanged
            df_raw.iloc[random.randint(0, 19),
                        random.randint(0, 9)] = random.randint(0, 99)
        value_df_raw = pd.DataFrame(
            np.random.randint(0, 100, (10, 7)).astype(np.float32),
            columns=list('ABCDEFG'))
        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
        value_series_raw = pd.Series(
            np.random.randint(0, 100, (10, )).astype(np.float32),
            index=list('ABCDEFGHIJ'))

        df = from_pandas_df(df_raw)
        series = from_pandas_series(series_raw)

        # when nothing supplied, raise
        self.assertRaises(ValueError, df.fillna)
        # when both values and methods supplied, raises
        self.assertRaises(ValueError, df.fillna, value=1, method='ffill')
        # when call on series, cannot supply DataFrames
        self.assertRaises(ValueError, series.fillna, value=df)
        self.assertRaises(ValueError, series.fillna, value=df_raw)
        # arguments rejected with NotImplementedError
        self.assertRaises(NotImplementedError, series.fillna,
                          value=series_raw, downcast='infer')
        self.assertRaises(NotImplementedError, series.ffill, limit=1)

        def check_unstaged(tiled, n_chunks, first_shape):
            # Chunk count, shape of the first chunk, and no operand stage set.
            self.assertEqual(len(tiled.chunks), n_chunks)
            self.assertEqual(tiled.chunks[0].shape, first_shape)
            self.assertIsNone(tiled.chunks[0].op.stage)

        # un-chunked inputs filled with a raw pandas series
        filled = df.fillna(value_series_raw).tiles()
        check_unstaged(filled, 1, filled.shape)

        filled = series.fillna(value_series_raw).tiles()
        check_unstaged(filled, 1, filled.shape)

        # chunked inputs filled with a raw pandas series
        df = from_pandas_df(df_raw, chunk_size=5)
        check_unstaged(df.fillna(value_series_raw).tiles(), 8, (5, 5))

        series = from_pandas_series(series_raw, chunk_size=5)
        check_unstaged(series.fillna(value_series_raw).tiles(), 4, (5, ))

        # method-based filling on chunked inputs
        filled = df.ffill(axis='columns').tiles()
        self.assertEqual(len(filled.chunks), 8)
        head = filled.chunks[0]
        self.assertEqual(head.shape, (5, 5))
        self.assertEqual(head.op.axis, 1)
        self.assertEqual(head.op.stage, OperandStage.combine)
        self.assertEqual(head.op.method, 'ffill')
        self.assertIsNone(head.op.limit)

        filled = series.bfill().tiles()
        self.assertEqual(len(filled.chunks), 4)
        head = filled.chunks[0]
        self.assertEqual(head.shape, (5, ))
        self.assertEqual(head.op.stage, OperandStage.combine)
        self.assertEqual(head.op.method, 'bfill')
        self.assertIsNone(head.op.limit)

        # filling with Mars dataframe / series values
        value_df = from_pandas_df(value_df_raw, chunk_size=7)
        value_series = from_pandas_series(value_series_raw, chunk_size=7)

        for filler in (value_df, value_series):
            filled = df.fillna(filler).tiles()
            self.assertEqual(filled.shape, df.shape)
            self.assertIsNone(filled.chunks[0].op.stage)

        value_series_raw.index = list(range(10))
        value_series = from_pandas_series(value_series_raw)
        filled = series.fillna(value_series).tiles()
        self.assertEqual(filled.shape, series.shape)
        self.assertIsNone(filled.chunks[0].op.stage)
Beispiel #26
0
    def testRechunk(self):
        """Check shapes and index metadata of rechunked DataFrames and Series.

        Covers a default-indexed 10x10 frame, a frame with random byte-string
        columns and integer index, and a series rechunked to larger, smaller
        and identical chunk sizes.
        """
        index_equal = pd.testing.assert_index_equal

        def check_chunk(chunk, shape, rows, cols=None):
            # Shape and row index of one chunk; column index when ``cols`` is given.
            self.assertEqual(chunk.shape, shape)
            index_equal(chunk.index_value.to_pandas(), rows)
            if cols is not None:
                index_equal(chunk.columns_value.to_pandas(), cols)

        # 10x10 frame: chunks of 3 -> chunks of 4
        df = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=3)
        rechunked = df.rechunk(4).tiles()

        self.assertEqual(rechunked.shape, (10, 10))
        self.assertEqual(len(rechunked.chunks), 9)

        check_chunk(rechunked.chunks[0], (4, 4),
                    pd.RangeIndex(4), pd.RangeIndex(4))
        check_chunk(rechunked.chunks[2], (4, 2),
                    pd.RangeIndex(4), pd.RangeIndex(8, 10))
        check_chunk(rechunked.chunks[-1], (2, 2),
                    pd.RangeIndex(8, 10), pd.RangeIndex(8, 10))

        # 4x10 frame with random labels: chunks of 3 -> chunks of 6
        columns = [np.random.bytes(10) for _ in range(10)]
        index = np.random.randint(-100, 100, size=(4, ))
        data = pd.DataFrame(np.random.rand(4, 10),
                            index=index,
                            columns=columns)
        df = from_pandas_df(data, chunk_size=3)
        rechunked = df.rechunk(6).tiles()

        self.assertEqual(rechunked.shape, (4, 10))
        self.assertEqual(len(rechunked.chunks), 2)

        check_chunk(rechunked.chunks[0], (4, 6),
                    df.index_value.to_pandas(), pd.Index(columns[:6]))
        check_chunk(rechunked.chunks[1], (4, 4),
                    df.index_value.to_pandas(), pd.Index(columns[6:]))

        # test Series rechunk
        series = from_pandas_series(pd.Series(np.random.rand(10, )),
                                    chunk_size=3)
        rechunked = series.rechunk(4).tiles()

        self.assertEqual(rechunked.shape, (10, ))
        self.assertEqual(len(rechunked.chunks), 3)
        index_equal(rechunked.index_value.to_pandas(), pd.RangeIndex(10))

        self.assertEqual(rechunked.chunk_shape, (3, ))
        self.assertEqual(rechunked.nsplits, ((4, 4, 2), ))
        check_chunk(rechunked.chunks[0], (4, ), pd.RangeIndex(4))
        check_chunk(rechunked.chunks[1], (4, ), pd.RangeIndex(4, 8))
        check_chunk(rechunked.chunks[2], (2, ), pd.RangeIndex(8, 10))

        # one element per chunk
        rechunked = series.rechunk(1).tiles()

        self.assertEqual(rechunked.shape, (10, ))
        self.assertEqual(len(rechunked.chunks), 10)
        index_equal(rechunked.index_value.to_pandas(), pd.RangeIndex(10))

        self.assertEqual(rechunked.chunk_shape, (10, ))
        self.assertEqual(rechunked.nsplits, ((1, ) * 10, ))
        check_chunk(rechunked.chunks[0], (1, ), pd.RangeIndex(1))

        # no need to rechunk
        rechunked = series.rechunk(3).tiles()
        series = get_tiled(series)
        self.assertEqual(rechunked.chunk_shape, series.chunk_shape)
        self.assertEqual(rechunked.nsplits, series.nsplits)
Beispiel #27
0
    def testStringMethodExecution(self):
        """Verify ``Series.str`` operations against plain pandas.

        Every case applies the same string operation to a chunked series and
        to the pandas series it was built from, executes the chunked variant
        through ``self.executor`` and compares both outcomes.
        """
        raw = pd.Series(['s1,s2', 'ef,', 'dd', np.nan])
        raw_x3 = pd.concat([raw, raw, raw])

        chunked = from_pandas_series(raw, chunk_size=2)
        chunked_x3 = from_pandas_series(raw_x3, chunk_size=2)

        series_eq = pd.testing.assert_series_equal
        frame_eq = pd.testing.assert_frame_equal

        def check(op, compare, lazy=chunked, eager=raw):
            # The chunked expression is built and executed first; only then
            # is the pandas reference computed and the two compared.
            actual = self.executor.execute_dataframe(op(lazy), concat=True)[0]
            compare(actual, op(eager))

        # slicing through the accessor
        check(lambda x: x.str[:3], series_eq)

        # split without expansion yields a series
        check(lambda x: x.str.split(',', n=2), series_eq)

        # split with expansion yields a frame
        check(lambda x: x.str.split(',', expand=True, n=1), frame_eq)

        # rsplit with expansion
        check(lambda x: x.str.rsplit(',', expand=True, n=1), frame_eq)

        # cat over the whole (tripled) series collapses to a single value
        check(lambda x: x.str.cat(sep='/', na_rep='e'), self.assertEqual,
              lazy=chunked_x3, eager=raw_x3)

        # cat with a plain list
        check(lambda x: x.str.cat(['a', 'b', np.nan, 'c']), series_eq)

        # cat with another series derived from the same input
        check(lambda x: x.str.cat(x.str.capitalize(), join='outer'), series_eq)

        # extractall with named groups
        check(lambda x: x.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)"),
              frame_eq)

        # extract without expansion yields a series
        check(lambda x: x.str.extract(r'[ab](\d)', expand=False), series_eq)

        # extract with expansion yields a frame
        check(lambda x: x.str.extract(r'[ab](\d)', expand=True), frame_eq)
Beispiel #28
0
    def testTransformExecute(self):
        """Compare ``transform`` on chunked frames/series with pandas.

        Covers plain transform calls (callable, list and dict arguments,
        both axes) and the ``_call_agg=True`` mode, whose reference result
        is pandas ``agg``.  ``options.chunk_store_limit`` is set to 20 for
        the duration of the test and restored afterwards.
        """
        column_names = [chr(ord('A') + offset) for offset in range(10)]
        pdf = pd.DataFrame(
            {name: [v ** 2 for v in range(20)] for name in column_names})

        index_labels = [chr(ord('A') + offset) for offset in range(20)]
        pser = pd.Series([v ** 2 for v in range(20)], index=index_labels)

        def named(func, name):
            # give an anonymous function an explicit ``__name__``
            func.__name__ = name
            return func

        def execute(tileable):
            # run the expression and hand back the single concatenated result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        frame_eq = pd.testing.assert_frame_equal
        series_eq = pd.testing.assert_series_equal

        saved_limit = options.chunk_store_limit
        try:
            options.chunk_store_limit = 20

            # ---------------- DataFrame input ----------------
            mdf = from_pandas_df(pdf, chunk_size=5)

            # transform with a callable, default axis
            actual = execute(mdf.transform(lambda x: list(range(len(x)))))
            frame_eq(actual, pdf.transform(lambda x: list(range(len(x)))))

            # transform with a callable along axis=1
            actual = execute(
                mdf.transform(lambda x: list(range(len(x))), axis=1))
            frame_eq(actual,
                     pdf.transform(lambda x: list(range(len(x))), axis=1))

            # transform with a list mixing names and a callable
            actual = execute(
                mdf.transform(['cumsum', 'cummax', lambda x: x + 1]))
            frame_eq(actual,
                     pdf.transform(['cumsum', 'cummax', lambda x: x + 1]))

            # transform with a per-column mapping
            per_column = OrderedDict()
            per_column['A'] = 'cumsum'
            per_column['D'] = ['cumsum', 'cummax']
            per_column['F'] = lambda x: x + 1
            actual = execute(mdf.transform(per_column))
            frame_eq(actual, pdf.transform(per_column))

            # agg mode on data frames: callable, default axis
            actual = execute(
                mdf.transform(lambda x: x.iloc[:-1], _call_agg=True))
            frame_eq(actual, pdf.agg(lambda x: x.iloc[:-1]))

            # agg mode on data frames: callable along axis=1
            actual = execute(
                mdf.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True))
            frame_eq(actual, pdf.agg(lambda x: x.iloc[:-1], axis=1))

            # agg mode with a list of callables (the first one is renamed)
            drop_first = named(
                lambda x: x.iloc[1:].reset_index(drop=True), 'f1')
            drop_last = lambda x: x.iloc[:-1].reset_index(drop=True)
            callables = [drop_first, drop_last]
            actual = execute(mdf.transform(callables, _call_agg=True))
            frame_eq(actual, pdf.agg(callables))

            # agg mode with a reducing callable gives a series
            actual = execute(
                mdf.transform(lambda x: x.sum(), _call_agg=True))
            series_eq(actual, pdf.agg(lambda x: x.sum()))

            # agg mode with a per-column mapping of callables
            per_column = OrderedDict()
            per_column['A'] = named(
                lambda x: x.iloc[1:].reset_index(drop=True), 'f1')
            per_column['D'] = [
                named(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                lambda x: x.iloc[:-1].reset_index(drop=True),
            ]
            per_column['F'] = lambda x: x.iloc[:-1].reset_index(drop=True)
            actual = execute(mdf.transform(per_column, _call_agg=True))
            frame_eq(actual, pdf.agg(per_column))

            # ---------------- Series input ----------------
            mser = from_pandas_series(pser, chunk_size=5)

            # transform with a callable keeps a series
            actual = execute(mser.transform(lambda x: x + 1))
            series_eq(actual, pser.transform(lambda x: x + 1))

            # transform with a list produces a frame
            actual = execute(mser.transform(['cumsum', lambda x: x + 1]))
            frame_eq(actual, pser.transform(['cumsum', lambda x: x + 1]))
        finally:
            options.chunk_store_limit = saved_limit
Beispiel #29
0
    def testEagerMode(self, *_):
        """Exercise the ``eager_mode`` option on a local cluster and its web API.

        Three phases run against one cluster started with ``web=True``:
        tensors on the default local-cluster session, tensors on a web
        session, and DataFrame/Series objects on a second web session.
        Extra positional arguments are accepted and ignored.
        """
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:

            # Inside the cluster context the default session is the local cluster one.
            self.assertIsInstance(Session.default_or_local()._sess,
                                  LocalClusterSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                # No explicit execute() is issued in this block; the tensors are
                # compared with numpy data directly.
                # NOTE(review): presumably eager mode runs each tensor on creation - confirm.
                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

            # Outside the eager context: fetching a tensor that was never
            # executed is expected to raise ValueError.
            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()

            # An explicit execute() still works outside eager mode.
            r = a.dot(a)
            np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

            # Same tensor scenario, this time through a session bound to the
            # cluster's web endpoint.
            with new_session('http://' + cluster._web_endpoint).as_default():
                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    a_data = np.random.rand(10, 10)

                    a = mt.tensor(a_data, chunk_size=3)
                    np.testing.assert_array_equal(a, a_data)

                    r1 = a + 1
                    expected1 = a_data + 1
                    np.testing.assert_array_equal(r1, expected1)

                    r2 = r1.dot(r1)
                    expected2 = expected1.dot(expected1)
                    np.testing.assert_array_almost_equal(r2, expected2)

                    # NOTE(review): 3 presumably corresponds to a, r1 and r2,
                    # one submitted task each - confirm against WebSession.
                    web_session = Session.default_or_local()._sess
                    self.assertEqual(web_session.get_task_count(), 3)

                a = mt.ones((10, 10), chunk_size=3)
                with self.assertRaises(ValueError):
                    a.fetch()

                r = a.dot(a)
                np.testing.assert_array_equal(r.execute(),
                                              np.ones((10, 10)) * 10)

            # DataFrame / Series scenario on a fresh web session.
            with new_session('http://' + cluster._web_endpoint).as_default():
                from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
                from mars.dataframe.datasource.series import from_pandas as from_pandas_series
                from mars.dataframe.arithmetic import add

                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    # Index and column labels are deliberately unordered and only
                    # partially overlap between data1 and data2.
                    data1 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                    df1 = from_pandas_df(data1, chunk_size=5)
                    pd.testing.assert_frame_equal(df1.fetch(), data1)

                    data2 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                    df2 = from_pandas_df(data2, chunk_size=6)
                    pd.testing.assert_frame_equal(df2.fetch(), data2)

                    df3 = add(df1, df2)
                    pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                    s1 = pd.Series(np.random.rand(10),
                                   index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                    series1 = from_pandas_series(s1)
                    pd.testing.assert_series_equal(series1.fetch(), s1)

                # NOTE(review): 4 presumably corresponds to df1, df2, df3 and
                # series1 - confirm against WebSession.
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)
Beispiel #30
0
    def testCutExecution(self):
        """Execute ``cut`` on several input kinds and compare with ``pd.cut``.

        Inputs cover a chunked series, a chunked tensor, a raw numpy array and
        raw pandas series; bins are given as a list, a tensor, a pandas
        IntervalIndex, a chunked index and an integer count.  Data comes from
        a RandomState seeded with 0.
        """
        rs = np.random.RandomState(0)
        raw = rs.random(15) * 1000
        s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
        bins = [10, 100, 500]
        ii = pd.interval_range(10, 500, 3)
        labels = ['a', 'b']

        # chunked counterparts of raw, s and ii
        t = tensor(raw, chunk_size=4)
        series = from_pandas_series(s, chunk_size=4)
        iii = from_pandas_index(ii, chunk_size=2)

        # cut on Series
        r = cut(series, bins)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(result, pd.cut(s, bins))

        # cut on Series, additionally returning the bins
        r, b = cut(series, bins, retbins=True)
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_tensor(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, bins, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # cut on tensor
        r = cut(t, bins)
        # result and expected are categorical arrays, so compare them
        # element by element (note: the loop below rebinds ``r``)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins)
        self.assertEqual(len(result), len(expected))
        for r, e in zip(result, expected):
            np.testing.assert_equal(r, e)

        # raw pandas Series as input, bins given as a chunked tensor
        # NOTE(review): original comment said "one chunk" - presumably the raw
        # input ends up as a single chunk; confirm.
        r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(result, pd.cut(s, bins, right=False, include_lowest=True))

        # test labels
        r = cut(t, bins, labels=labels)
        # result and expected are categorical arrays, compared element-wise
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=labels)
        self.assertEqual(len(result), len(expected))
        for r, e in zip(result, expected):
            np.testing.assert_equal(r, e)

        r = cut(t, bins, labels=False)
        # with labels=False the result is executed as a tensor and compared
        # as a plain array
        result = self.executor.execute_tensor(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=False)
        np.testing.assert_array_equal(result, expected)

        # test labels which is tensor
        labels_t = tensor(['a', 'b'], chunk_size=1)
        r = cut(raw, bins, labels=labels_t, include_lowest=True)
        # result and expected are categorical arrays, compared element-wise
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=labels, include_lowest=True)
        self.assertEqual(len(result), len(expected))
        for r, e in zip(result, expected):
            np.testing.assert_equal(r, e)

        # test labels=False with bins given as a pandas IntervalIndex
        r, b = cut(raw, ii, labels=False, retbins=True)
        # values are compared element-wise; the returned bins are compared
        # as an index
        r_result = self.executor.execute_tileable(r, concat=True)[0]
        b_result = self.executor.execute_tileable(b, concat=True)[0]
        r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True)
        for r, e in zip(r_result, r_expected):
            np.testing.assert_equal(r, e)
        pd.testing.assert_index_equal(b_result, b_expected)

        # test bins which is md.IntervalIndex
        r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True)
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_dataframe(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        pd.testing.assert_index_equal(b_result, b_expected)

        # test duplicates: the last two bin edges are equal and are dropped
        # via duplicates='drop'
        bins2 = [0, 2, 4, 6, 10, 10]
        r, b = cut(s, bins2, labels=False, retbins=True,
                   right=False, duplicates='drop')
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_tensor(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, bins2, labels=False, retbins=True,
                                        right=False, duplicates='drop')
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # the remaining cases run inside a dedicated test context with its
        # own executor
        ctx, executor = self._create_test_context(self.executor)
        with ctx:
            # test integer bins
            r = cut(series, 3)
            result = executor.execute_dataframes([r])[0]
            pd.testing.assert_series_equal(result, pd.cut(s, 3))

            r, b = cut(series, 3, right=False, retbins=True)
            r_result, b_result = executor.execute_dataframes([r, b])
            r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True)
            pd.testing.assert_series_equal(r_result, r_expected)
            np.testing.assert_array_equal(b_result, b_expected)

            # test min max same
            s2 = pd.Series([1.1] * 15)
            r = cut(s2, 3)
            result = executor.execute_dataframes([r])[0]
            pd.testing.assert_series_equal(result, pd.cut(s2, 3))

            # test inf exist
            s3 = s2.copy()
            # NOTE(review): s3 has the default integer index, so -1 is treated
            # as a label here; this presumably appends a new entry rather than
            # overwriting the last one. Either way the series contains inf.
            s3[-1] = np.inf
            with self.assertRaises(ValueError):
                executor.execute_dataframes([cut(s3, 3)])