def testWithGivenSession(session):
            """Round-trip a chunked DataFrame through vineyard using ``session``.

            The frame is written with ``to_vineyard()``, read back through
            ``md.from_vineyard`` and both frames are executed on ``session``
            and compared after their indexes are reset.
            """
            vineyard_opts = {'vineyard.socket': '/tmp/vineyard.sock'}
            with option_context(vineyard_opts):
                raw = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
                source = DataFrame(raw, chunk_size=2)
                stored_id = source.to_vineyard().execute(session=session)
                restored = md.from_vineyard(stored_id)

                source_value = source.execute(session=session)
                restored_value = restored.execute(session=session)
                # Indexes are dropped on both sides before comparing.
                pd.testing.assert_frame_equal(
                    source_value.reset_index(drop=True),
                    restored_value.reset_index(drop=True))
        def testWithGivenSession(session):
            """Round-trip a chunked DataFrame through vineyard using ``session``.

            The socket path is read from ``VINEYARD_IPC_SOCKET`` and falls back
            to ``/tmp/vineyard/vineyard.sock`` when the variable is not set.
            """
            socket_path = os.environ.get('VINEYARD_IPC_SOCKET', '/tmp/vineyard/vineyard.sock')
            with option_context({'vineyard.socket': socket_path}):
                raw = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
                source = DataFrame(raw, chunk_size=2)
                stored_id = source.to_vineyard().execute(session=session).fetch()
                restored = md.from_vineyard(stored_id)

                source_value = source.execute(session=session).fetch()
                restored_value = restored.execute(session=session).fetch()
                # Indexes are dropped on both sides before comparing.
                pd.testing.assert_frame_equal(
                    source_value.reset_index(drop=True),
                    restored_value.reset_index(drop=True))
Exemple #3
0
def test_sort_values_execution(setup):
    """Compare ``sort_values`` on the DataFrame/Series under test with pandas.

    The whole scenario list runs once per ``PSRS_DISTINCT_COL`` mode
    (``'0'`` only on Windows, ``'0'`` and ``'1'`` elsewhere).  The
    environment variable is put back to its previous state afterwards so
    the setting does not leak into other tests in the same process.

    Parameters
    ----------
    setup
        Not referenced in the body; presumably a pytest fixture that
        prepares the execution environment (confirm against conftest).
    """
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
        '0', '1'
    ]
    previous = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            _check_sort_values_execution()
    finally:
        # Restore the caller's environment even when an assertion fails.
        if previous is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous


def _check_sort_values_execution():
    """Run every ``sort_values`` scenario once under the current env setting."""
    df = pd.DataFrame(np.random.rand(100, 10),
                      columns=['a' + str(i) for i in range(10)])

    # test one chunk
    mdf = DataFrame(df)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a6', 'a7'], ascending=False).execute().fetch()
    expected = df.sort_values(['a6', 'a7'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    # test psrs
    mdf = DataFrame(df, chunk_size=10)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a3', 'a4']).execute().fetch()
    expected = df.sort_values(['a3', 'a4'])

    pd.testing.assert_frame_equal(result, expected)

    # test ascending=False
    result = mdf.sort_values(['a0', 'a1'], ascending=False).execute().fetch()
    expected = df.sort_values(['a0', 'a1'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a7'], ascending=False).execute().fetch()
    expected = df.sort_values(['a7'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    # test multiindex
    df2 = df.copy(deep=True)
    df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    mdf = DataFrame(df2, chunk_size=10)

    result = mdf.sort_values([('A', 'C')]).execute().fetch()
    expected = df2.sort_values([('A', 'C')])

    pd.testing.assert_frame_equal(result, expected)

    # test rechunk
    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')

    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a3', 'a4']).execute().fetch()
    expected = df.sort_values(['a3', 'a4'])

    pd.testing.assert_frame_equal(result, expected)

    # test other types
    raw = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        }, )
    mdf = DataFrame(raw, chunk_size=3)

    for label in raw.columns:
        result = mdf.sort_values(label).execute().fetch()
        expected = raw.sort_values(label)
        pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a', 'b', 'e'],
                             ascending=False).execute().fetch()
    expected = raw.sort_values(['a', 'b', 'e'], ascending=False)

    pd.testing.assert_frame_equal(result, expected)

    # test nan
    df = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    mdf = DataFrame(df)
    result = mdf.sort_values(['col2']).execute().fetch()
    expected = df.sort_values(['col2'])

    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values(['col2']).execute().fetch()
    expected = df.sort_values(['col2'])

    pd.testing.assert_frame_equal(result, expected)

    # test None (issue #1885)
    df = pd.DataFrame(np.random.rand(1000, 10))

    # Build the 'A'/None column in a single assignment.  The previous
    # chained form (df[0][mask] = ...) does not update df when pandas
    # Copy-on-Write is active and wrote strings into a float column.
    df[0] = np.where(df[0].values < 0.5, 'A', None)

    mdf = DataFrame(df)
    result = mdf.sort_values([0, 1]).execute().fetch()
    expected = df.sort_values([0, 1])

    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(df, chunk_size=100)
    result = mdf.sort_values([0, 1]).execute().fetch()
    expected = df.sort_values([0, 1])

    pd.testing.assert_frame_equal(result, expected)

    # test ignore_index
    df = pd.DataFrame(np.random.rand(10, 3),
                      columns=['a' + str(i) for i in range(3)])

    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values(['a0', 'a1'],
                             ignore_index=True).execute().fetch()
    try:  # for python3.5
        expected = df.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        # pandas without the ignore_index keyword: emulate it by hand
        expected = df.sort_values(['a0', 'a1'])
        expected.index = pd.RangeIndex(len(expected))

    pd.testing.assert_frame_equal(result, expected)

    # test inplace
    mdf = DataFrame(df)
    mdf.sort_values('a0', inplace=True)
    result = mdf.execute().fetch()
    df.sort_values('a0', inplace=True)

    pd.testing.assert_frame_equal(result, df)

    # test unknown shape
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['a'] > 2]
    result = filtered.sort_values(by='b').execute().fetch()

    pd.testing.assert_frame_equal(result,
                                  df[df['a'] > 2].sort_values(by='b'))

    # test empty dataframe
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['b'] > 100]
    result = filtered.sort_values(by='b').execute().fetch()

    pd.testing.assert_frame_equal(result,
                                  df[df['b'] > 100].sort_values(by='b'))

    # test chunks with zero length
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    df.iloc[4:8, 1] = 0

    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['b'] != 0]
    result = filtered.sort_values(by='b').execute().fetch()

    pd.testing.assert_frame_equal(result,
                                  df[df['b'] != 0].sort_values(by='b'))

    # test Series.sort_values
    raw = pd.Series(np.random.rand(10))
    series = Series(raw)
    result = series.sort_values().execute().fetch()
    expected = raw.sort_values()

    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=3)
    result = series.sort_values().execute().fetch()
    expected = raw.sort_values()

    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=2)
    result = series.sort_values(ascending=False).execute().fetch()
    expected = raw.sort_values(ascending=False)

    pd.testing.assert_series_equal(result, expected)

    # test empty series
    series = pd.Series(list(range(10)), name='a')
    mseries = Series(series, chunk_size=4)
    filtered = mseries[mseries > 100]
    result = filtered.sort_values().execute().fetch()

    pd.testing.assert_series_equal(result,
                                   series[series > 100].sort_values())

    # test series with None
    # First 500 entries are 'A', the remaining 500 are None.  Built
    # directly as an object Series instead of writing strings into an
    # int64 Series, which relies on deprecated silent upcasting.
    series = pd.Series(np.where(np.arange(1000) < 500, 'A', None))

    mseries = Series(series, chunk_size=100)
    result = mseries.sort_values().execute().fetch()
    expected = series.sort_values()
    pd.testing.assert_series_equal(result.reset_index(drop=True),
                                   expected.reset_index(drop=True))
Exemple #4
0
def test_sort_index_execution(setup):
    """Compare ``sort_index`` on the DataFrame/Series under test with pandas.

    Covers row-wise and column-wise (``axis=1``) sorting of frames at
    several chunk sizes, descending order, ``ignore_index`` and in-place
    sorting, followed by the same checks on a Series.

    ``setup`` is not referenced in the body; presumably a pytest fixture
    that prepares the execution environment (confirm against conftest).
    """
    pdf = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

    # single chunk
    actual = DataFrame(pdf).sort_index().execute().fetch()
    pd.testing.assert_frame_equal(actual, pdf.sort_index())

    # in-place sort on the wrapper
    wrapped = DataFrame(pdf)
    wrapped.sort_index(inplace=True)
    actual = wrapped.execute().fetch()
    pd.testing.assert_frame_equal(actual, pdf.sort_index())

    # several chunks
    wrapped = DataFrame(pdf, chunk_size=30)
    actual = wrapped.sort_index().execute().fetch()
    pd.testing.assert_frame_equal(actual, pdf.sort_index())

    wrapped = DataFrame(pdf, chunk_size=20)
    actual = wrapped.sort_index(ascending=False).execute().fetch()
    pd.testing.assert_frame_equal(actual, pdf.sort_index(ascending=False))

    wrapped = DataFrame(pdf, chunk_size=10)
    actual = wrapped.sort_index(ignore_index=True).execute().fetch()
    try:  # for python3.5
        wanted = pdf.sort_index(ignore_index=True)
    except TypeError:
        # pandas without the ignore_index keyword: emulate it by hand
        wanted = pdf.sort_index()
        wanted.index = pd.RangeIndex(len(wanted))
    pd.testing.assert_frame_equal(actual, wanted)

    # test axis=1
    pdf = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

    actual = DataFrame(pdf).sort_index(axis=1).execute().fetch()
    pd.testing.assert_frame_equal(actual, pdf.sort_index(axis=1))

    wrapped = DataFrame(pdf, chunk_size=3)
    actual = wrapped.sort_index(axis=1).execute().fetch()
    pd.testing.assert_frame_equal(actual, pdf.sort_index(axis=1))

    wrapped = DataFrame(pdf, chunk_size=4)
    actual = wrapped.sort_index(axis=1, ascending=False).execute().fetch()
    pd.testing.assert_frame_equal(actual,
                                  pdf.sort_index(axis=1, ascending=False))

    wrapped = DataFrame(pdf, chunk_size=4)
    actual = wrapped.sort_index(axis=1, ignore_index=True).execute().fetch()
    try:  # for python3.5
        wanted = pdf.sort_index(axis=1, ignore_index=True)
    except TypeError:
        wanted = pdf.sort_index(axis=1)
        wanted.index = pd.RangeIndex(len(wanted))
    pd.testing.assert_frame_equal(actual, wanted)

    # test series
    pser = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

    actual = Series(pser).sort_index().execute().fetch()
    pd.testing.assert_series_equal(actual, pser.sort_index())

    wrapped_series = Series(pser, chunk_size=2)
    actual = wrapped_series.sort_index().execute().fetch()
    pd.testing.assert_series_equal(actual, pser.sort_index())

    wrapped_series = Series(pser, chunk_size=3)
    actual = wrapped_series.sort_index(ascending=False).execute().fetch()
    pd.testing.assert_series_equal(actual, pser.sort_index(ascending=False))