Exemple #1
0
def test_value_counts_head(prepare_data, setup, chunk_size):
    """Check optimization and execution of ``value_counts().head(3)``.

    The tileable graph is built and optimized first. The optimizer must keep
    no record for the ``value_counts`` node, and must record a result for the
    ``head`` node whose operand carries ``nrows == 3``. The expression is then
    executed and compared with the pandas result.
    """
    _, raw = prepare_data
    mdf = md.DataFrame(raw, chunk_size=chunk_size)

    counts = mdf['a'].value_counts()
    top3 = counts.head(3)

    graph = TileableGraph([top3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    # no optimization record is expected for the value_counts node itself
    assert records.get_optimization_result(counts.data) is None
    optimized_head = records.get_optimization_result(top3.data)
    assert optimized_head.op.nrows == 3
    assert len(graph) == 3
    assert optimized_head in graph.results

    executed = top3.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    result = executed.fetch()
    expected = raw['a'].value_counts().head(3)
    pd.testing.assert_series_equal(result, expected)
Exemple #2
0
    def testGroupByAggStrCat(self):
        """Group-by aggregation with a ``str.cat`` callable must match pandas.

        Covers a DataFrame grouped by column ``'a'`` and a Series grouped by
        the callable ``lambda x: x % 2``.
        """
        agg_fun = lambda x: x.str.cat(sep='_', na_rep='NA')

        rs = np.random.RandomState(0)
        words = [None, 'alfa', 'bravo', 'charlie']
        raw_df = pd.DataFrame({'a': rs.choice(['A', 'B', 'C'], size=(100,)),
                               'b': rs.choice(words, size=(100,))})
        mdf = md.DataFrame(raw_df, chunk_size=13)

        # DataFrame case
        grouped_frame = mdf.groupby('a').agg(agg_fun)
        frame_result = self.executor.execute_dataframe(
            grouped_frame, concat=True)[0]
        frame_expected = raw_df.groupby('a').agg(agg_fun)
        pd.testing.assert_frame_equal(frame_result, frame_expected)

        # Series case
        raw_series = pd.Series(rs.choice(words, size=(100,)))
        ms = md.Series(raw_series, chunk_size=13)

        grouped_series = ms.groupby(lambda x: x % 2).agg(agg_fun)
        series_result = self.executor.execute_dataframe(
            grouped_series, concat=True)[0]
        series_expected = raw_series.groupby(lambda x: x % 2).agg(agg_fun)
        pd.testing.assert_series_equal(series_result, series_expected)
Exemple #3
0
    def testRollingAgg(self):
        """Metadata of ``rolling(3).agg('max')`` before and after tiling."""
        raw = pd.DataFrame(np.random.rand(4, 3), columns=list('abc'))
        mdf = md.DataFrame(raw, chunk_size=3)

        rolled = mdf.rolling(3).agg('max')
        expected = raw.rolling(3).agg('max')

        # tileable-level metadata
        self.assertEqual(rolled.shape, raw.shape)
        self.assertIs(rolled.index_value, mdf.index_value)
        pd.testing.assert_index_equal(rolled.columns_value.to_pandas(),
                                      expected.columns)
        pd.testing.assert_series_equal(rolled.dtypes, mdf.dtypes)

        # chunk-level metadata: every chunk mirrors its first input
        rolled = rolled.tiles()
        for chunk in rolled.chunks:
            source = chunk.inputs[0]
            self.assertEqual(chunk.shape, source.shape)
            self.assertIs(chunk.index_value, source.index_value)
            pd.testing.assert_index_equal(chunk.columns_value.to_pandas(),
                                          expected.columns)
            pd.testing.assert_series_equal(chunk.dtypes, expected.dtypes)
Exemple #4
0
    def testDataFrameIter(self):
        """``iterrows`` / ``itertuples`` yield the same items as pandas."""
        raw_data = pd.DataFrame(np.random.randint(1000, size=(20, 10)))
        df = md.DataFrame(raw_data, chunk_size=5)

        # iterrows: compare (label, row) pairs one by one and count them
        seen = 0
        row_pairs = zip(df.iterrows(batch_size=15), raw_data.iterrows())
        for seen, (result_row, expect_row) in enumerate(row_pairs, 1):
            result_label, result_values = result_row
            expect_label, expect_values = expect_row
            self.assertEqual(result_label, expect_label)
            pd.testing.assert_series_equal(result_values, expect_values)
        self.assertEqual(seen, len(raw_data))

        # itertuples: tuples must compare equal and be equally many
        seen = 0
        tuple_pairs = zip(df.itertuples(batch_size=10), raw_data.itertuples())
        for seen, (result_tup, expect_tup) in enumerate(tuple_pairs, 1):
            self.assertEqual(result_tup, expect_tup)
        self.assertEqual(seen, len(raw_data))
Exemple #5
0
    def testDataFrameExecuteNotFetch(self):
        """``execute()`` returns the tileable itself; data is fetched lazily.

        Fetching before execution must raise ``ValueError``. After execution,
        the stored result of the first chunk is overwritten in the executor,
        and ``to_pandas()`` must reflect that overwritten value.
        """
        data1 = pd.DataFrame(np.random.random((5, 4)), columns=list('abcd'))
        sess = Session.default_or_local()

        df1 = md.DataFrame(data1, chunk_size=2)

        # nothing has been executed yet, so there is nothing to fetch
        with self.assertRaises(ValueError):
            sess.fetch(df1)

        self.assertIs(df1.execute(), df1)

        # modify result
        executor = sess._sess._executor
        executor.chunk_result[get_tiled(
            df1).chunks[0].key] = data1.iloc[:2, :2] * 3

        # Build the expectation on a copy: assigning through an alias of
        # ``data1`` would mutate the source frame that ``df1`` was built from.
        expected = data1.copy()
        expected.iloc[:2, :2] = data1.iloc[:2, :2] * 3

        pd.testing.assert_frame_equal(df1.to_pandas(), expected)
Exemple #6
0
def test_dataframe_getitem():
    """Metadata and tiling of column selection on a chunked DataFrame.

    Covers selecting a single column (yielding a Series) and selecting a
    list of columns (yielding a DataFrame).
    """
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df = md.DataFrame(data, chunk_size=2)

    # --- single column ---
    series = df['c3']
    assert isinstance(series, Series)
    assert series.shape == (10, )
    assert series.name == 'c3'
    assert series.dtype == data['c3'].dtype
    assert series.index_value == df.index_value

    series = tile(series)
    assert isinstance(series, SERIES_TYPE)
    assert not any(inp.is_coarse() for inp in series.inputs)
    assert series.nsplits == ((2, 2, 2, 2, 2), )
    assert len(series.chunks) == 5
    for pos, chunk in enumerate(series.chunks):
        assert isinstance(chunk, SERIES_CHUNK_TYPE)
        assert chunk.index == (pos, )
        assert chunk.shape == (2, )

    # --- list of columns ---
    picked = data[['c1', 'c2', 'c3']]
    df1 = df[['c1', 'c2', 'c3']]
    assert isinstance(df1, DataFrame)
    assert df1.shape == (10, 3)
    assert df1.index_value == df.index_value
    pd.testing.assert_index_equal(df1.columns_value.to_pandas(),
                                  picked.columns)
    pd.testing.assert_series_equal(df1.dtypes, picked.dtypes)

    df1 = tile(df1)
    assert df1.nsplits == ((2, 2, 2, 2, 2), (2, 1))
    assert len(df1.chunks) == 10
    # even positions are the first column-chunk (2 columns wide) ...
    for row, chunk in enumerate(df1.chunks[0:10:2]):
        assert isinstance(chunk, DATAFRAME_CHUNK_TYPE)
        assert chunk.index == (row, 0)
        assert chunk.shape == (2, 2)
    # ... odd positions are the second column-chunk (1 column wide)
    for row, chunk in enumerate(df1.chunks[1:10:2]):
        assert isinstance(chunk, DATAFRAME_CHUNK_TYPE)
        assert chunk.index == (row, 1)
        assert chunk.shape == (2, 1)
Exemple #7
0
    def testLocalClassifierFromToParquet(self):
        """Predict with a loaded XGBoost model from parquet input to parquet output.

        A booster is trained locally and saved to disk. The input frame is
        written as two parquet files, read back with ``md.read_parquet``, fed
        to ``XGBClassifier.predict`` and stored with ``to_parquet``. The tiled
        graph must consist of two ``Fuse`` nodes, and the stored predictions
        must match those of a plain ``xgboost.XGBClassifier``.
        """
        n_rows = 1000
        n_columns = 10
        rs = np.random.RandomState(0)
        X = rs.rand(n_rows, n_columns)
        y = rs.rand(n_rows)
        raw_df = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])
        raw_df['id'] = [f'i{i}' for i in range(n_rows)]

        booster = xgboost.train({}, xgboost.DMatrix(X, y), num_boost_round=2)

        with tempfile.TemporaryDirectory() as d:
            m_name = os.path.join(d, 'c.model')
            result_dir = os.path.join(d, 'result')
            os.mkdir(result_dir)
            data_dir = os.path.join(d, 'data')
            os.mkdir(data_dir)

            booster.save_model(m_name)

            # split the input into two parquet files inside the data dir
            raw_df.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
            raw_df.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

            mdf = md.read_parquet(data_dir).set_index('id')
            model = XGBClassifier()
            model.load_model(m_name)
            prediction = model.predict(mdf, run=False)
            r = md.DataFrame(prediction).to_parquet(result_dir)

            # tiles to ensure no iterative tiling exists
            g = r.build_graph(tiled=True)
            self.assertTrue(all(isinstance(node.op, Fuse) for node in g))
            self.assertEqual(len(g), 2)
            r.execute()

            stored = md.read_parquet(result_dir).to_pandas()
            ret = stored.iloc[:, 0].to_numpy()

            reference = xgboost.XGBClassifier()
            reference.load_model(m_name)
            expected = reference.predict(X)
            expected = np.stack([1 - expected, expected]).argmax(axis=0)
            np.testing.assert_array_equal(ret, expected)
Exemple #8
0
def test_gpu_execution(setup, check_ref_counts):
    """Run reductions, aggregations and ``unique`` on GPU-backed data.

    Each result is fetched and compared against the pandas result computed
    from the same raw data.
    """
    df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
    df = to_gpu(md.DataFrame(df_raw, chunk_size=6))

    # DataFrame reductions and aggregation
    r = df.sum()
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

    r = df.kurt()
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

    r = df.agg(['sum', 'var'])
    res = r.execute().fetch()
    pd.testing.assert_frame_equal(res.to_pandas(), df_raw.agg(['sum', 'var']))

    # Series reductions and aggregation
    s_raw = pd.Series(np.random.rand(30))
    s = to_gpu(md.Series(s_raw, chunk_size=6))

    r = s.sum()
    res = r.execute().fetch()
    assert pytest.approx(res) == s_raw.sum()

    r = s.kurt()
    res = r.execute().fetch()
    assert pytest.approx(res) == s_raw.kurt()

    r = s.agg(['sum', 'var'])
    res = r.execute().fetch()
    pd.testing.assert_series_equal(res.to_pandas(), s_raw.agg(['sum', 'var']))

    # Series.unique on integer data with repeated values
    s_raw = pd.Series(
        np.random.randint(0, 3, size=(30, )) *
        np.random.randint(0, 5, size=(30, )))
    s = to_gpu(md.Series(s_raw, chunk_size=6))

    r = s.unique()
    res = r.execute().fetch()
    # ``ndarray.sort()`` sorts in place and returns None, so comparing the
    # return values would always pass; ``np.sort`` returns sorted copies so
    # the unique values are compared regardless of their order.
    np.testing.assert_array_equal(
        np.sort(cp.asnumpy(res)),
        np.sort(s_raw.unique()))
    def testArrowTunnelSinglePart(self):
        """Read from and persist to a single ODPS table partition.

        First a small partitioned table is written and read back through
        ``to_mars_dataframe``; then a random DataFrame is persisted with
        ``persist_mars_dataframe`` and the stored partition is compared to it.
        """
        import pandas as pd
        import numpy as np
        import mars.dataframe as md

        partition_spec = 'pt=test_part'
        mars_source_table_name = tn('mars_arrow_tunnel_datasource_spart')
        mars_des_table_name = tn('mars_arrow_tunnel_datastore_spart')
        self.odps.delete_table(mars_des_table_name, if_exists=True)
        self.odps.delete_table(mars_source_table_name, if_exists=True)

        # --- read path ---
        table = self.odps.create_table(
            mars_source_table_name,
            schema=('col1 int, col2 string', 'pt string'),
            lifecycle=1)
        pt = table.create_partition(partition_spec)
        with pt.open_writer() as writer:
            writer.write([[1, 'test1'], [2, 'test2']])

        source_df = self.odps.to_mars_dataframe(
            mars_source_table_name, partition=partition_spec)
        fetched = source_df.execute().to_pandas()
        expected = pt.to_df().to_pandas()
        pd.testing.assert_frame_equal(fetched, expected)

        # --- write path ---
        data = pd.DataFrame({
            'col1': np.random.rand(1000, ),
            'col2': np.random.randint(0, 100, (1000, )),
            'col3': np.random.choice(['a', 'b', 'c'], size=(1000, )),
        })

        df = md.DataFrame(data, chunk_size=300)
        self.odps.persist_mars_dataframe(df,
                                         mars_des_table_name,
                                         partition=partition_spec,
                                         unknown_as_string=True)
        stored_partition = self.odps.get_table(
            mars_des_table_name).get_partition(partition_spec)
        stored = stored_partition.to_df().to_pandas()
        # sort both sides by col1 and drop the index before comparing
        pd.testing.assert_frame_equal(
            stored.sort_values('col1').reset_index(drop=True),
            data.sort_values('col1').reset_index(drop=True))
Exemple #10
0
    def testExecutableTupleExecute(self):
        """``ExecutableTuple`` built from a namedtuple keeps its fields and type."""
        raw_a = np.random.RandomState(0).rand(10, 20)
        raw_df = pd.DataFrame(raw_a)

        a = mt.tensor(raw_a)
        df = md.DataFrame(raw_df)

        tp = test_namedtuple_type(a, df)
        executable_tp = mt.ExecutableTuple(tp)

        # field names are exposed as attributes; unknown names raise
        self.assertIn('a', dir(executable_tp))
        self.assertIs(executable_tp.a, a)
        self.assertIn(test_namedtuple_type.__name__, repr(executable_tp))
        with self.assertRaises(AttributeError):
            getattr(executable_tp, 'c')

        # the fetched result is an instance of the original namedtuple type
        fetched = mt.ExecutableTuple(tp).execute().fetch()
        self.assertIs(test_namedtuple_type, type(fetched))

        np.testing.assert_array_equal(raw_a, fetched.a)
        pd.testing.assert_frame_equal(raw_df, fetched.b)
Exemple #11
0
async def test_iterative_tiling(create_cluster):
    """Filter a chunked DataFrame by a boolean mask, shift it, and compare.

    The executed result must equal the pandas result, and the index bounds
    recorded on the result must stay within the original index range 1..30.
    """
    session = get_default_session()

    values = np.random.RandomState(0).rand(30, 5)
    raw_df = pd.DataFrame(values, index=np.arange(1, 31))

    filtered = md.DataFrame(raw_df, chunk_size=10)
    filtered = filtered[filtered[0] < .7]
    shifted = filtered.shift(2)

    info = await session.execute(shifted)
    await info
    assert info.result() is None
    fetched = await session.fetch(shifted)
    result = fetched[0]

    expected = raw_df[raw_df[0] < .7].shift(2)
    pd.testing.assert_frame_equal(result, expected)

    # test meta
    index_value = shifted.index_value
    assert index_value.min_val >= 1
    assert index_value.max_val <= 30
Exemple #12
0
    def testSetIndex(self):
        """Chunk columns after ``set_index('y')`` with ``drop`` on and off."""
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                           index=['a1', 'a2', 'a3'],
                           columns=['x', 'y', 'z'])
        df2 = md.DataFrame(df1, chunk_size=2)

        # drop=True removes 'y' from the first chunk, drop=False keeps it;
        # the second chunk holds only 'z' in both cases
        cases = (
            (True, ['x']),
            (False, ['x', 'y']),
        )
        for drop, first_chunk_columns in cases:
            indexed = df2.set_index('y', drop=drop)
            indexed.tiles()
            self.assertEqual(indexed.chunk_shape, (2, 2))
            pd.testing.assert_index_equal(
                indexed.chunks[0].columns.to_pandas(),
                pd.Index(first_chunk_columns))
            pd.testing.assert_index_equal(
                indexed.chunks[1].columns.to_pandas(),
                pd.Index(['z']))
Exemple #13
0
def test_set_index():
    """Chunk columns after ``set_index('y')`` with ``drop`` on and off."""
    df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                       index=['a1', 'a2', 'a3'],
                       columns=['x', 'y', 'z'])
    df2 = md.DataFrame(df1, chunk_size=2)

    # drop=True removes 'y' from the first chunk, drop=False keeps it;
    # the second chunk holds only 'z' in both cases
    cases = (
        (True, ['x']),
        (False, ['x', 'y']),
    )
    for drop, first_chunk_columns in cases:
        indexed = tile(df2.set_index('y', drop=drop))
        assert indexed.chunk_shape == (2, 2)
        pd.testing.assert_index_equal(
            indexed.chunks[0].columns_value.to_pandas(),
            pd.Index(first_chunk_columns))
        pd.testing.assert_index_equal(
            indexed.chunks[1].columns_value.to_pandas(),
            pd.Index(['z']))
Exemple #14
0
def test_executable_tuple_execute(setup):
    """``ExecutableTuple`` built from a namedtuple keeps its fields and type."""
    raw_a = np.random.RandomState(0).rand(10, 20)
    raw_df = pd.DataFrame(raw_a)

    a = mt.tensor(raw_a)
    df = md.DataFrame(raw_df)

    tp = test_namedtuple_type(a, df)
    executable_tp = mt.ExecutableTuple(tp)

    # field names are exposed as attributes; unknown names raise
    assert 'a' in dir(executable_tp)
    assert executable_tp.a is a
    assert test_namedtuple_type.__name__ in repr(executable_tp)
    with pytest.raises(AttributeError):
        getattr(executable_tp, 'c')

    # the fetched result is an instance of the original namedtuple type
    fetched = mt.ExecutableTuple(tp).execute().fetch()
    assert test_namedtuple_type is type(fetched)

    np.testing.assert_array_equal(raw_a, fetched.a)
    pd.testing.assert_frame_equal(raw_df, fetched.b)
Exemple #15
0
    def testRolling(self):
        """The repr of a Mars rolling object matches pandas; bad keys raise."""
        raw = pd.DataFrame(np.random.rand(4, 3), columns=list('abc'))
        mdf = md.DataFrame(raw)

        options = dict(min_periods=1,
                       center=True,
                       win_type='triang',
                       closed='both')
        r = mdf.rolling(3, **options)
        expected = raw.rolling(3, **options)
        self.assertEqual(repr(r), repr(expected))

        # selecting a non-existent column raises KeyError, both for a single
        # key and for a tuple of keys
        for missing in ('d', ('a', 'd')):
            with self.assertRaises(KeyError):
                _ = r[missing]
Exemple #16
0
    def testMixiedInputTypeTrainTestSplit(self):
        """``train_test_split`` preserves the type of each input.

        Every combination of DataFrame/tensor features and Series/tensor
        labels is split, and each output must be an instance of the type of
        the input it was derived from.
        """
        rs = np.random.RandomState(0)
        df_raw = pd.DataFrame(rs.rand(10, 4))
        df = md.DataFrame(df_raw, chunk_size=5)
        X, y = df.iloc[:, :-1], df.iloc[:, -1]

        # Iterate over all four on/off combinations. The previous
        # ``product(range(1), range(1))`` only produced (0, 0), so the tensor
        # conversions below were never exercised.
        for x_to_tensor, y_to_tensor in itertools.product((False, True),
                                                          repeat=2):
            x = mt.tensor(X) if x_to_tensor else X
            yy = mt.tensor(y) if y_to_tensor else y

            # pass the (possibly converted) labels ``yy``, not the raw ``y``
            x_train, x_test, y_train, y_test = train_test_split(x,
                                                                yy,
                                                                random_state=0)
            self.assertIsInstance(x_train, type(x))
            self.assertIsInstance(x_test, type(x))
            self.assertIsInstance(y_train, type(yy))
            self.assertIsInstance(y_test, type(yy))
Exemple #17
0
    def testDataFrameGetitem(self):
        """Metadata and tiling of column selection on a chunked DataFrame.

        Covers selecting a single column (yielding a Series) and selecting a
        list of columns (yielding a DataFrame).
        """
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df = md.DataFrame(data, chunk_size=2)

        # --- single column ---
        series = df['c3']
        self.assertIsInstance(series, Series)
        self.assertEqual(series.shape, (10, ))
        self.assertEqual(series.name, 'c3')
        self.assertEqual(series.dtype, data['c3'].dtype)
        self.assertEqual(series.index_value, df.index_value)

        series.tiles()
        self.assertEqual(series.nsplits, ((2, 2, 2, 2, 2), ))
        self.assertEqual(len(series.chunks), 5)
        for pos, chunk in enumerate(series.chunks):
            self.assertIsInstance(chunk, SERIES_CHUNK_TYPE)
            self.assertEqual(chunk.index, (pos, ))
            self.assertEqual(chunk.shape, (2, ))

        # --- list of columns ---
        picked = data[['c1', 'c2', 'c3']]
        df1 = df[['c1', 'c2', 'c3']]
        self.assertIsInstance(df1, DataFrame)
        self.assertEqual(df1.shape, (10, 3))
        self.assertEqual(df1.index_value, df.index_value)
        pd.testing.assert_index_equal(df1.columns.to_pandas(), picked.columns)
        pd.testing.assert_series_equal(df1.dtypes, picked.dtypes)

        df1.tiles()
        self.assertEqual(df1.nsplits, ((2, 2, 2, 2, 2), (2, 1)))
        self.assertEqual(len(df1.chunks), 10)
        # even positions are the first column-chunk (2 columns wide) ...
        for row, chunk in enumerate(df1.chunks[0:10:2]):
            self.assertIsInstance(chunk, DATAFRAME_CHUNK_TYPE)
            self.assertEqual(chunk.index, (row, 0))
            self.assertEqual(chunk.shape, (2, 2))
        # ... odd positions are the second column-chunk (1 column wide)
        for row, chunk in enumerate(df1.chunks[1:10:2]):
            self.assertIsInstance(chunk, DATAFRAME_CHUNK_TYPE)
            self.assertEqual(chunk.index, (row, 1))
            self.assertEqual(chunk.shape, (2, 1))
Exemple #18
0
    def testGroupByCum(self):
        """Tiling metadata of cumulative group-by operations.

        Checks output type, chunk count, overall shape and first-chunk shape
        for DataFrame and Series group-bys.
        """
        df1 = pd.DataFrame({
            'a': [3, 5, 2, 7, 1, 2, 4, 6, 2, 4],
            'b': [8, 3, 4, 1, 8, 2, 2, 2, 2, 3],
            'c': [1, 8, 8, 5, 3, 5, 0, 0, 5, 4]
        })
        mdf = md.DataFrame(df1, chunk_size=3)

        # default axis: the grouping column 'b' is absent from the output;
        # axis=1: all original columns are kept
        frame_cases = (
            ({}, pd.Index(['a', 'c'])),
            ({'axis': 1}, df1.columns),
        )
        for fun in ['cummin', 'cummax', 'cumprod', 'cumsum']:
            for kwargs, expected_columns in frame_cases:
                width = len(expected_columns)
                r = getattr(mdf.groupby('b'), fun)(**kwargs).tiles()
                self.assertEqual(r.op.output_types[0], OutputType.dataframe)
                self.assertEqual(len(r.chunks), 4)
                self.assertEqual(r.shape, (len(df1), width))
                self.assertEqual(r.chunks[0].shape, (np.nan, width))
                pd.testing.assert_index_equal(
                    r.chunks[0].columns_value.to_pandas(), expected_columns)

        r = mdf.groupby('b').cumcount().tiles()
        self.assertEqual(r.op.output_types[0], OutputType.series)
        self.assertEqual(len(r.chunks), 4)
        self.assertEqual(r.shape, (len(df1), ))
        self.assertEqual(r.chunks[0].shape, (np.nan, ))

        series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6])
        ms1 = md.Series(series1, chunk_size=3)

        for fun in ['cummin', 'cummax', 'cumprod', 'cumsum', 'cumcount']:
            grouped = ms1.groupby(lambda x: x % 2)
            r = getattr(grouped, fun)().tiles()
            self.assertEqual(r.op.output_types[0], OutputType.series)
            self.assertEqual(len(r.chunks), 4)
            self.assertEqual(r.shape, (len(series1), ))
            self.assertEqual(r.chunks[0].shape, (np.nan, ))
Exemple #19
0
def test_groupby_cum():
    """Tiling metadata of cumulative group-by operations.

    Checks output type, chunk count, overall shape and first-chunk shape for
    DataFrame and Series group-bys.
    """
    df1 = pd.DataFrame({
        'a': [3, 5, 2, 7, 1, 2, 4, 6, 2, 4],
        'b': [8, 3, 4, 1, 8, 2, 2, 2, 2, 3],
        'c': [1, 8, 8, 5, 3, 5, 0, 0, 5, 4]
    })
    mdf = md.DataFrame(df1, chunk_size=3)

    # default axis: the grouping column 'b' is absent from the output;
    # axis=1: all original columns are kept
    frame_cases = (
        ({}, pd.Index(['a', 'c'])),
        ({'axis': 1}, df1.columns),
    )
    for fun in ['cummin', 'cummax', 'cumprod', 'cumsum']:
        for kwargs, expected_columns in frame_cases:
            width = len(expected_columns)
            r = tile(getattr(mdf.groupby('b'), fun)(**kwargs))
            assert r.op.output_types[0] == OutputType.dataframe
            assert len(r.chunks) == 4
            assert r.shape == (len(df1), width)
            assert r.chunks[0].shape == (np.nan, width)
            pd.testing.assert_index_equal(
                r.chunks[0].columns_value.to_pandas(), expected_columns)

    r = tile(mdf.groupby('b').cumcount())
    assert r.op.output_types[0] == OutputType.series
    assert len(r.chunks) == 4
    assert r.shape == (len(df1), )
    assert r.chunks[0].shape == (np.nan, )

    series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6])
    ms1 = md.Series(series1, chunk_size=3)

    for fun in ['cummin', 'cummax', 'cumprod', 'cumsum', 'cumcount']:
        grouped = ms1.groupby(lambda x: x % 2)
        r = tile(getattr(grouped, fun)())
        assert r.op.output_types[0] == OutputType.series
        assert len(r.chunks) == 4
        assert r.shape == (len(series1), )
        assert r.chunks[0].shape == (np.nan, )
Exemple #20
0
    def testDataFrameGetitemBool(self):
        """Boolean-mask selection on a DataFrame must match pandas.

        Masks are given as Mars series and as pandas series, with the same
        index as the frame, a reversed index, a permuted index, and a
        permuted index carrying one extra label.
        """
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df = md.DataFrame(data, chunk_size=2)

        mask_data = data.c1 > 0.5
        mask = md.Series(mask_data, chunk_size=2)

        # getitem by mars series
        self.assertEqual(df[mask].execute().shape, data[mask_data].shape)
        pd.testing.assert_frame_equal(df[mask].execute(), data[mask_data])

        # getitem by pandas series
        pd.testing.assert_frame_equal(df[mask_data].execute(), data[mask_data])

        flags = [True, True, True, False, False, True, True, False, False, True]
        permuted_labels = [0, 3, 6, 2, 9, 8, 5, 7, 1, 4]

        # getitem by mars series with alignment but no shuffle
        mask_data = pd.Series(flags, index=range(9, -1, -1))
        mask = md.Series(mask_data, chunk_size=2)
        pd.testing.assert_frame_equal(df[mask].execute(), data[mask_data])

        # getitem by mars series with shuffle alignment
        mask_data = pd.Series(flags, index=permuted_labels)
        mask = md.Series(mask_data, chunk_size=2)
        pd.testing.assert_frame_equal(df[mask].execute().sort_index(),
                                      data[mask_data])

        # getitem by mars series with shuffle alignment and extra element
        mask_data = pd.Series(flags + [False], index=permuted_labels + [10])
        mask = md.Series(mask_data, chunk_size=2)
        pd.testing.assert_frame_equal(df[mask].execute().sort_index(),
                                      data[mask_data])
Exemple #21
0
def test_build_and_search_index_with_filesystem_download(setup):
    """Build an index into a temporary directory and search it in batches."""
    with tempfile.TemporaryDirectory() as f:
        # params
        doc_count, query_count, dimension = 2000, 15, 10
        topk = 10
        doc_chunk, query_chunk = 1000, 5

        # data
        doc, query = gen_data(doc_count=doc_count,
                              query_count=query_count,
                              dimension=dimension)

        df = md.DataFrame(pd.DataFrame(doc), chunk_size=(doc_chunk, dimension))
        q = mt.tensor(query, chunk_size=(query_chunk, dimension))

        index = build_index(tensor=df, index_path=f, column_number=2)

        # building the index must have written something to the directory
        assert len(os.listdir(f)) > 0

        # search the queries in consecutive slices: [0:5], [5:10], [10:15]
        for start in range(0, query_count, query_chunk):
            search_index(q[start:start + query_chunk], topk, index)
Exemple #22
0
    def testRocCurveAuc(self):
        """``roc_curve`` + ``auc`` over a web session match scikit-learn."""
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            run_kwargs = {'timeout': timeout}

            rs = np.random.RandomState(0)
            raw = pd.DataFrame({'a': rs.randint(0, 10, (10,)),
                                'b': rs.rand(10)})

            df = md.DataFrame(raw)
            y = df['a'].to_tensor().astype('int')
            pred = df['b'].to_tensor().astype('float')
            fpr, tpr, thresholds = roc_curve(
                y, pred, pos_label=2, session=sess, run_kwargs=run_kwargs)
            m = auc(fpr, tpr, session=sess, run_kwargs=run_kwargs)

            # reference values computed by scikit-learn on the raw data
            y_true = raw['a'].to_numpy().astype('int')
            y_score = raw['b'].to_numpy().astype('float')
            sk_fpr, sk_tpr, _ = sklearn_roc_curve(y_true, y_score, pos_label=2)
            expect_m = sklearn_auc(sk_fpr, sk_tpr)
            self.assertAlmostEqual(m.fetch(session=sess), expect_m)
Exemple #23
0
    def testGroupByGetItem(self):
        """Column selection on a group-by object and its tiling metadata."""
        df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce')})
        mdf = md.DataFrame(df1, chunk_size=3)

        # selecting a list of columns keeps a DataFrame group-by
        frame_groupby = mdf.groupby('b')[['a', 'b']].tiles()
        self.assertIsInstance(frame_groupby, DataFrameGroupBy)
        self.assertIsInstance(frame_groupby.op, GroupByIndex)
        self.assertEqual(frame_groupby.selection, ['a', 'b'])
        self.assertEqual(list(frame_groupby.key_dtypes.index), ['b'])
        self.assertEqual(len(frame_groupby.chunks), 3)

        # attribute access to one column yields a Series group-by
        series_groupby = mdf.groupby('b').a.tiles()
        self.assertIsInstance(series_groupby, SeriesGroupBy)
        self.assertIsInstance(series_groupby.op, GroupByIndex)
        self.assertEqual(series_groupby.name, 'a')
        self.assertEqual(list(series_groupby.key_dtypes.index), ['b'])
        self.assertEqual(len(series_groupby.chunks), 3)

        # attribute access after an explicit selection is rejected
        already_selected = mdf.groupby('b')[['a', 'b']]
        with self.assertRaises(IndexError):
            getattr(already_selected, 'a')
Exemple #24
0
def test_mars(ray_start_regular):
    """Round-trip data between Mars DataFrames and Ray Datasets.

    Covers Mars -> Ray conversion, Ray -> Mars conversion (from a filtered
    dataset and from an Arrow-backed dataset), and the rejection of simple
    datasets with ``NotImplementedError``.
    """
    import pandas as pd

    cluster = mars.new_cluster_in_ray(worker_num=2, worker_cpu=1)
    # Stop the cluster even when an assertion below fails, so a failing
    # test does not leave the cluster running.
    try:
        n = 10000
        pdf = pd.DataFrame({"a": list(range(n)), "b": list(range(n, 2 * n))})
        df = md.DataFrame(pdf)

        # Convert mars dataframe to ray dataset
        ds = ray.data.from_mars(df)
        pd.testing.assert_frame_equal(ds.to_pandas(), df.to_pandas())
        ds2 = ds.filter(lambda row: row["a"] % 2 == 0)
        assert ds2.take(5) == [{"a": 2 * i, "b": n + 2 * i} for i in range(5)]

        # Convert ray dataset to mars dataframe
        df2 = ds2.to_mars()
        pd.testing.assert_frame_equal(
            df2.head(5).to_pandas(),
            pd.DataFrame({
                "a": list(range(0, 10, 2)),
                "b": list(range(n, n + 10, 2))
            }),
        )

        # Test Arrow Dataset
        pdf2 = pd.DataFrame({c: range(5) for c in "abc"})
        ds3 = ray.data.from_arrow(
            [pa.Table.from_pandas(pdf2) for _ in range(3)])
        df3 = ds3.to_mars()
        pd.testing.assert_frame_equal(
            df3.head(5).to_pandas(),
            pdf2,
        )

        # Test simple datasets
        with pytest.raises(NotImplementedError):
            ray.data.range(10).to_mars()
    finally:
        cluster.stop()
Exemple #25
0
    def testDataFrameGetitem(self):
        """Executed column selections must equal the pandas selections.

        Covers single columns, column lists (ordered, reordered, single
        element, with a repeated column) and a scalar picked from a column.
        """
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df = md.DataFrame(data, chunk_size=2)

        def run(tileable):
            # execute and return the single concatenated result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # single column -> Series
        for column in ('c2', 'c5'):
            pd.testing.assert_series_equal(run(df[column]), data[column])

        # list of columns -> DataFrame
        selections = (
            ['c1', 'c2', 'c3'],
            ['c3', 'c2', 'c1'],
            ['c1'],
            ['c3', 'c1', 'c2', 'c1'],
        )
        for columns in selections:
            pd.testing.assert_frame_equal(run(df[columns]), data[columns])

        # scalar element of a column
        element = df['c1'][0]
        self.assertEqual(run(element), data['c1'][0])
Exemple #26
0
    def testAccuracyScore(self):
        """``accuracy_score`` over a web session matches scikit-learn."""
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            run_kwargs = {'timeout': timeout}

            rs = np.random.RandomState(0)
            raw = pd.DataFrame({
                'a': rs.randint(0, 10, (10, )),
                'b': rs.randint(0, 10, (10, ))
            })

            df = md.DataFrame(raw)
            # the labels are passed as a tensor, the predictions as a series
            y = df['a'].to_tensor().astype('int')
            pred = df['b'].astype('int')

            score = accuracy_score(
                y, pred, session=sess, run_kwargs=run_kwargs)

            y_true = raw['a'].to_numpy().astype('int')
            y_pred = raw['b'].to_numpy().astype('int')
            expect = sklearn_accuracy_score(y_true, y_pred)
            self.assertAlmostEqual(score.fetch(session=sess), expect)
Exemple #27
0
def test_aggregate_str_cat(setup, check_ref_counts):
    """Aggregating with a ``str.cat`` callable must match pandas.

    Covers ``DataFrame.agg`` (compared as a series) and ``Series.agg``
    (compared with ``==``).
    """
    agg_fun = lambda x: x.str.cat(sep='_', na_rep='NA')

    rs = np.random.RandomState(0)
    words = [None, 'alfa', 'bravo', 'charlie']
    raw_df = pd.DataFrame({
        'a': rs.choice(['A', 'B', 'C'], size=(100, )),
        'b': rs.choice(words, size=(100, )),
    })
    mdf = md.DataFrame(raw_df, chunk_size=13)

    frame_agg = mdf.agg(agg_fun)
    pd.testing.assert_series_equal(frame_agg.execute().fetch(),
                                   raw_df.agg(agg_fun))

    raw_series = pd.Series(rs.choice(words, size=(100, )))
    ms = md.Series(raw_series, chunk_size=13)

    series_agg = ms.agg(agg_fun)
    assert series_agg.execute().fetch() == raw_series.agg(agg_fun)
Exemple #28
0
    def testLocalClassifierFromToParquet(self):
        """Predict with a loaded LightGBM model from parquet input to parquet output.

        A ``lightgbm.LGBMClassifier`` is fitted locally. The input frame is
        written as two parquet files, read back with ``md.read_parquet``, fed
        to ``LGBMClassifier.predict`` and stored with ``to_parquet``. The
        tiled graph must consist of two ``Fuse`` nodes, and the stored
        predictions must match those of the locally fitted classifier.
        """
        n_rows = 1000
        n_columns = 10
        rs = np.random.RandomState(0)
        X = rs.rand(n_rows, n_columns)
        y = (rs.rand(n_rows) > 0.5).astype(np.int32)
        raw_df = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])

        # test with existing model
        classifier = lightgbm.LGBMClassifier(n_estimators=2)
        classifier.fit(X, y, verbose=True)

        with tempfile.TemporaryDirectory() as d:
            result_dir = os.path.join(d, 'result')
            os.mkdir(result_dir)
            data_dir = os.path.join(d, 'data')
            os.mkdir(data_dir)

            # split the input into two parquet files inside the data dir
            raw_df.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
            raw_df.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

            mdf = md.read_parquet(data_dir)
            model = LGBMClassifier()
            model.load_model(classifier)
            prediction = model.predict(mdf, run=False)
            r = md.DataFrame(prediction).to_parquet(result_dir)

            # tiles to ensure no iterative tiling exists
            g = r.build_graph(tiled=True)
            self.assertTrue(all(isinstance(node.op, Fuse) for node in g))
            self.assertEqual(len(g), 2)
            r.execute()

            stored = md.read_parquet(result_dir).to_pandas()
            ret = stored.iloc[:, 0].to_numpy()
            expected = classifier.predict(X)
            expected = np.stack([1 - expected, expected]).argmax(axis=0)
            np.testing.assert_array_equal(ret, expected)
Exemple #29
0
    def testNamed(self):
        """Objects executed under a name can be looked up by that name.

        Exercised for a tensor, a series and a dataframe within one session.
        """
        rs = np.random.RandomState(0)
        raw_tensor = rs.rand(10, 10)

        sess = Session.default_or_local()

        # test named tensor
        tensor_name = 't_name'
        t = mt.tensor(raw_tensor, chunk_size=3)
        executed = t.execute(name=tensor_name, session=sess)
        np.testing.assert_array_equal(executed, raw_tensor)

        named_t = mt.named_tensor(name=tensor_name, session=sess)
        plus_one = (named_t + 1).execute(session=sess).fetch()
        np.testing.assert_array_equal(plus_one, raw_tensor + 1)

        # test named series
        series_name = 's_name'
        raw_series = pd.Series([1, 2, 3])
        s = md.Series(raw_series)
        fetched = s.execute(name=series_name, session=sess).fetch()
        pd.testing.assert_series_equal(fetched, raw_series)

        named_s = md.named_series(name=series_name, session=sess)
        fetched = named_s.execute(session=sess).fetch()
        pd.testing.assert_series_equal(fetched, raw_series)

        # test dataframe
        frame_name = 'd_name'
        raw_frame = pd.DataFrame(np.random.rand(10, 3))
        d = md.DataFrame(raw_frame, chunk_size=4)
        fetched = d.execute(name=frame_name, session=sess).fetch()
        pd.testing.assert_frame_equal(fetched, raw_frame)

        named_d = md.named_dataframe(name=frame_name, session=sess)
        fetched = named_d.execute(session=sess).fetch()
        pd.testing.assert_frame_equal(fetched, raw_frame)
Exemple #30
0
def test_use_arrow_dtype_n_unique(setup, check_ref_counts):
    """``nunique`` along both axes matches pandas with arrow dtype enabled."""
    options = {
        'dataframe.use_arrow_dtype': True,
        'combine_size': 2
    }
    with option_context(options):
        rs = np.random.RandomState(0)
        data1 = pd.DataFrame({
            'a': rs.random(10),
            'b': [f's{i}' for i in rs.randint(100, size=10)]
        })
        # three more string columns, each a copy of 'b'
        for extra in ('c', 'd', 'e'):
            data1[extra] = data1['b'].copy()

        df = md.DataFrame(data1, chunk_size=(3, 2))
        for axis in (0, 1):
            r = df.nunique(axis=axis)
            result = r.execute().fetch()
            expected = data1.nunique(axis=axis)
            pd.testing.assert_series_equal(result, expected)