Beispiel #1
0
    def testDataFrameShuffle(self, *_):
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.merge.merge import merge
        from mars.dataframe.utils import sort_dataframe_inplace

        with new_cluster(scheduler_n_process=2, worker_n_process=2,
                         shared_memory='20M', web=True) as cluster:
            session = cluster.session

            data1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1, columns=['a', 'b', 'c', 'd', 'e'])
            data2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=['a', 'b', 'x', 'y'])

            df1 = from_pandas_df(data1, chunk_size=2)
            df2 = from_pandas_df(data2, chunk_size=2)

            r1 = data1.merge(data2)
            r2 = session.run(merge(df1, df2), timeout=_exec_timeout)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(r1, 0), sort_dataframe_inplace(r2, 0))

            r1 = data1.merge(data2, how='inner', on=['a', 'b'])
            r2 = session.run(merge(df1, df2, how='inner', on=['a', 'b']), timeout=_exec_timeout)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(r1, 0), sort_dataframe_inplace(r2, 0))

            web_session = new_session('http://' + cluster._web_endpoint)

            r1 = data1.merge(data2)
            r2 = web_session.run(merge(df1, df2), timeout=_exec_timeout)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(r1, 0), sort_dataframe_inplace(r2, 0))

            r1 = data1.merge(data2, how='inner', on=['a', 'b'])
            r2 = web_session.run(merge(df1, df2, how='inner', on=['a', 'b']), timeout=_exec_timeout)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(r1, 0), sort_dataframe_inplace(r2, 0))
    def testMerge(self):
        df1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1, columns=['a', 'b', 'c', 'd', 'e'])
        df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=['a', 'b', 'x', 'y'])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # Note [Index of Merge]
        #
        # When `left_index` and `right_index` of `merge` is both false, pandas will generate an RangeIndex to
        # the final result dataframe.
        #
        # We chunked the `left` and `right` dataframe, thus every result chunk will have its own RangeIndex.
        # When they are contenated we don't generate a new RangeIndex for the result, thus we cannot obtain the
        # same index value with pandas. But we guarantee that the content of dataframe is correct.

        # merge on index
        expected0 = df1.merge(df2)
        jdf0 = mdf1.merge(mdf2)
        result0 = self.executor.execute_dataframe(jdf0, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0), sort_dataframe_inplace(result0, 0))

        # merge on left index and `right_on`
        expected1 = df1.merge(df2, how='left', right_on='x', left_index=True)
        jdf1 = mdf1.merge(mdf2, how='left', right_on='x', left_index=True)
        result1 = self.executor.execute_dataframe(jdf1, concat=True)[0]
        expected1.set_index('a_x', inplace=True)
        result1.set_index('a_x', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected1, 0), sort_dataframe_inplace(result1, 0))

        # merge on `left_on` and right index
        expected2 = df1.merge(df2, how='right', left_on='a', right_index=True)
        jdf2 = mdf1.merge(mdf2, how='right', left_on='a', right_index=True)
        result2 = self.executor.execute_dataframe(jdf2, concat=True)[0]
        expected2.set_index('a', inplace=True)
        result2.set_index('a', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected2, 0), sort_dataframe_inplace(result2, 0))

        # merge on `left_on` and `right_on`
        expected3 = df1.merge(df2, how='left', left_on='a', right_on='x')
        jdf3 = mdf1.merge(mdf2, how='left', left_on='a', right_on='x')
        result3 = self.executor.execute_dataframe(jdf3, concat=True)[0]
        expected3.set_index('a_x', inplace=True)
        result3.set_index('a_x', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected3, 0), sort_dataframe_inplace(result3, 0))

        # merge on `on`
        expected4 = df1.merge(df2, how='right', on='a')
        jdf4 = mdf1.merge(mdf2, how='right', on='a')
        result4 = self.executor.execute_dataframe(jdf4, concat=True)[0]
        expected4.set_index('a', inplace=True)
        result4.set_index('a', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0))

        # merge on multiple columns
        expected5 = df1.merge(df2, how='inner', on=['a', 'b'])
        jdf5 = mdf1.merge(mdf2, how='inner', on=['a', 'b'])
        result5 = self.executor.execute_dataframe(jdf5, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected5, 0), sort_dataframe_inplace(result5, 0))
    def testJoinOn(self):
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], columns=['a1', 'a2', 'a3'])
        df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], columns=['a1', 'b2', 'b3']) + 1
        df2 = pd.concat([df2, df2 + 1])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        expected0 = df1.join(df2, on=None, lsuffix='_l', rsuffix='_r')
        jdf0 = mdf1.join(mdf2, on=None, lsuffix='_l', rsuffix='_r')
        result0 = self.executor.execute_dataframe(jdf0, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0), sort_dataframe_inplace(result0, 0))

        expected1 = df1.join(df2, how='left', on='a1', lsuffix='_l', rsuffix='_r')
        jdf1 = mdf1.join(mdf2, how='left', on='a1', lsuffix='_l', rsuffix='_r')
        result1 = self.executor.execute_dataframe(jdf1, concat=True)[0]

        # Note [Columns of Left Join]
        #
        # I believe we have no chance to obtain the entirely same result with pandas here:
        #
        # Look at the following example:
        #
        # >>> df1
        #     a1  a2  a3
        # 0   1   3   3
        # >>> df2
        #     a1  b2  b3
        # 1   2   6   7
        # >>> df3
        #     a1  b2  b3
        # 1   2   6   7
        # 1   2   6   7
        #
        # >>> df1.merge(df2, how='left', left_on='a1', left_index=False, right_index=True)
        #     a1_x  a2  a3  a1_y  b2  b3
        # 0   1   3   3     2   6   7
        # >>> df1.merge(df3, how='left', left_on='a1', left_index=False, right_index=True)
        #     a1  a1_x  a2  a3  a1_y  b2  b3
        # 0   1     1   3   3     2   6   7
        # 0   1     1   3   3     2   6   7
        #
        # Note that the result of `df1.merge(df3)` has an extra column `a` compared to `df1.merge(df2)`.
        # The value of column `a` is the same of `a1_x`, just because `1` occurs twice in index of `df3`.
        # I haven't invistagated why pandas has such behaviour...
        #
        # We cannot yeild the same result with pandas, because, the `df3` is chunked, then some of the
        # result chunk has 6 columns, others may have 7 columns, when concatenated into one DataFrame
        # some cells of column `a` will have value `NaN`, which is different from the result of pandas.
        #
        # But we can guarantee that other effective columns have absolutely same value with pandas.

        columns_to_compare = jdf1.columns_value.to_pandas()

        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected1[columns_to_compare], 0, 1),
                                      sort_dataframe_inplace(result1[columns_to_compare], 0, 1))

        # Note [Index of Join on EmptyDataFrame]
        #
        # It is tricky that it is non-trivial to get the same `index` result with pandas.
        #
        # Look at the following example:
        #
        # >>> df1
        #    a1  a2  a3
        # 1   4   2   6
        # >>> df2
        #    a1  b2  b3
        # 1   2   6   7
        # 2   8   9  10
        # >>> df3
        # Empty DataFrame
        # Columns: [a1, a2, a3]
        # Index: []
        # >>> df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #       a1_l  a2   a3  a1_r  b2  b3
        # 1.0   4.0   2  6.0     8   9  10
        # NaN   NaN   1  NaN     2   6   7
        # >>> df3.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #     a1_l  a2  a3  a1_r  b2  b3
        # 1   NaN   1 NaN     2   6   7
        # 2   NaN   2 NaN     8   9  10
        #
        # When the `left` dataframe is not empty, the mismatched rows in `right` will have index value `NaN`,
        # and the matched rows have index value from `right`. When the `left` dataframe is empty, the mismatched
        # rows have index value from `right`.
        #
        # Since we chunked the `left` dataframe, it is uneasy to obtain the same index value with pandas in the
        # final result dataframe, but we guaranteed that the dataframe content is correctly.

        expected2 = df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        jdf2 = mdf1.join(mdf2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        result2 = self.executor.execute_dataframe(jdf2, concat=True)[0]

        expected2.set_index('a2', inplace=True)
        result2.set_index('a2', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected2, 0), sort_dataframe_inplace(result2, 0))

        expected3 = df1.join(df2, how='inner', on='a2', lsuffix='_l', rsuffix='_r')
        jdf3 = mdf1.join(mdf2, how='inner', on='a2', lsuffix='_l', rsuffix='_r')
        result3 = self.executor.execute_dataframe(jdf3, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected3, 0), sort_dataframe_inplace(result3, 0))

        expected4 = df1.join(df2, how='outer', on='a2', lsuffix='_l', rsuffix='_r')
        jdf4 = mdf1.join(mdf2, how='outer', on='a2', lsuffix='_l', rsuffix='_r')
        result4 = self.executor.execute_dataframe(jdf4, concat=True)[0]

        expected4.set_index('a2', inplace=True)
        result4.set_index('a2', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0))
Beispiel #4
0
def test_merge(setup):
    df1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1,
                       columns=['a', 'b', 'c', 'd', 'e'])
    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1,
                       columns=['a', 'b', 'x', 'y'])
    df3 = df1.copy()
    df3.index = pd.RangeIndex(2, 6, name='index')
    df4 = df1.copy()
    df4.index = pd.MultiIndex.from_tuples([(i, i + 1) for i in range(4)],
                                          names=['i1', 'i2'])

    mdf1 = from_pandas(df1, chunk_size=2)
    mdf2 = from_pandas(df2, chunk_size=2)
    mdf3 = from_pandas(df3, chunk_size=3)
    mdf4 = from_pandas(df4, chunk_size=2)

    # Note [Index of Merge]
    #
    # When `left_index` and `right_index` of `merge` is both false, pandas will generate an RangeIndex to
    # the final result dataframe.
    #
    # We chunked the `left` and `right` dataframe, thus every result chunk will have its own RangeIndex.
    # When they are contenated we don't generate a new RangeIndex for the result, thus we cannot obtain the
    # same index value with pandas. But we guarantee that the content of dataframe is correct.

    # merge on index
    expected0 = df1.merge(df2)
    jdf0 = mdf1.merge(mdf2)
    result0 = jdf0.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0),
                                  sort_dataframe_inplace(result0, 0))

    # merge on left index and `right_on`
    expected1 = df1.merge(df2, how='left', right_on='x', left_index=True)
    jdf1 = mdf1.merge(mdf2, how='left', right_on='x', left_index=True)
    result1 = jdf1.execute().fetch()
    expected1.set_index('a_x', inplace=True)
    result1.set_index('a_x', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected1, 0),
                                  sort_dataframe_inplace(result1, 0))

    # merge on `left_on` and right index
    expected2 = df1.merge(df2, how='right', left_on='a', right_index=True)
    jdf2 = mdf1.merge(mdf2, how='right', left_on='a', right_index=True)
    result2 = jdf2.execute().fetch()
    expected2.set_index('a', inplace=True)
    result2.set_index('a', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected2, 0),
                                  sort_dataframe_inplace(result2, 0))

    # merge on `left_on` and `right_on`
    expected3 = df1.merge(df2, how='left', left_on='a', right_on='x')
    jdf3 = mdf1.merge(mdf2, how='left', left_on='a', right_on='x')
    result3 = jdf3.execute().fetch()
    expected3.set_index('a_x', inplace=True)
    result3.set_index('a_x', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected3, 0),
                                  sort_dataframe_inplace(result3, 0))

    # merge on `on`
    expected4 = df1.merge(df2, how='right', on='a')
    jdf4 = mdf1.merge(mdf2, how='right', on='a')
    result4 = jdf4.execute().fetch()
    expected4.set_index('a', inplace=True)
    result4.set_index('a', inplace=True)
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected4, 0),
                                  sort_dataframe_inplace(result4, 0))

    # merge on multiple columns
    expected5 = df1.merge(df2, how='inner', on=['a', 'b'])
    jdf5 = mdf1.merge(mdf2, how='inner', on=['a', 'b'])
    result5 = jdf5.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected5, 0),
                                  sort_dataframe_inplace(result5, 0))

    # merge when some on is index
    expected6 = df3.merge(df2, how='inner', left_on='index', right_on='a')
    jdf6 = mdf3.merge(mdf2, how='inner', left_on='index', right_on='a')
    result6 = jdf6.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected6, 0),
                                  sort_dataframe_inplace(result6, 0))

    # merge when on is in MultiIndex
    expected7 = df4.merge(df2, how='inner', left_on='i1', right_on='a')
    jdf7 = mdf4.merge(mdf2, how='inner', left_on='i1', right_on='a')
    result7 = jdf7.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected7, 0),
                                  sort_dataframe_inplace(result7, 0))

    # merge when on is in MultiIndex, and on not in index
    expected8 = df4.merge(df2, how='inner', on=['a', 'b'])
    jdf8 = mdf4.merge(mdf2, how='inner', on=['a', 'b'])
    result8 = jdf8.execute().fetch()
    pd.testing.assert_frame_equal(sort_dataframe_inplace(expected8, 0),
                                  sort_dataframe_inplace(result8, 0))