Exemple #1
0
    def testJoin(self):
        # Tile ``join`` for several ``how`` settings and inspect the resulting chunk graph:
        # every output chunk must be a shuffle merge whose two inputs are align-reduce chunks.
        left_raw = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                                index=['a1', 'a2', 'a3'])
        right_half = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                                  index=['a1', 'b2', 'b3']) + 1
        # stack the frame on top of a shifted copy of itself (6 rows in total)
        right_raw = pd.concat([right_half, right_half + 1])

        mleft = from_pandas(left_raw, chunk_size=2)
        mright = from_pandas(right_raw, chunk_size=2)

        suffixes = {'lsuffix': 'l_', 'rsuffix': 'r_'}
        # the first entry leaves ``how`` unset; note that 'left' appears twice in the table
        parameters = [dict(suffixes)]
        parameters.extend(dict(suffixes, how=how)
                          for how in ('left', 'right', 'inner', 'left'))

        for kwargs in parameters:
            joined = mleft.join(mright, **kwargs).tiles()

            self.assertEqual(joined.chunk_shape, (3, 1))
            for merged in joined.chunks:
                self.assertIsInstance(merged.op, DataFrameShuffleMerge)
                # when ``how`` is not passed, the op is expected to report 'left'
                self.assertEqual(merged.op.how, kwargs.get('how', 'left'))
                left, right = merged.op.inputs
                self.assertIsInstance(left.op, DataFrameMergeAlignReduce)
                self.assertIsInstance(right.op, DataFrameMergeAlignReduce)
                # number of map chunks feeding each reducer: 2 on the left, 3 on the right
                left_maps = left.inputs[0].inputs
                self.assertEqual(len(left_maps), 2)
                right_maps = right.inputs[0].inputs
                self.assertEqual(len(right_maps), 3)
                # the same expectations hold for the map chunks of both sides
                for map_chunks in (left_maps, right_maps):
                    for map_chunk in map_chunks:
                        self.assertIsInstance(map_chunk.op, DataFrameMergeAlignMap)
                        self.assertEqual(map_chunk.op.index_shuffle_size, 3)
                        self.assertEqual(map_chunk.op.shuffle_on, None)
                pd.testing.assert_index_equal(merged.columns_value.to_pandas(),
                                              joined.columns_value.to_pandas())
    def testWithMultiForms(self):
        # The same operation is reachable in multiple forms,
        # such as self+other, self.add(other), add(self,other); all must agree with pandas.
        raw1 = self.to_boolean_if_needed(pd.DataFrame(np.random.rand(10, 10)))
        df1 = from_pandas(raw1, chunk_size=5)
        raw2 = self.to_boolean_if_needed(pd.DataFrame(np.random.rand(10, 10)))
        df2 = from_pandas(raw2, chunk_size=6)

        def run(tileable):
            # execute and concatenate all chunks into a single pandas object
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        expected = self.func(raw1, raw2)

        # the functional form is built and executed twice
        for _ in range(2):
            pd.testing.assert_frame_equal(expected, run(self.func(df1, df2)))

        # method form
        pd.testing.assert_frame_equal(expected, run(getattr(df1, self.func_name)(df2)))

        # reflected method form: equivalent to swapping the operands in pandas
        reflected = run(getattr(df1, self.rfunc_name)(df2))
        pd.testing.assert_frame_equal(self.func(raw2, raw1), reflected)
Exemple #3
0
    def testWithOneShuffleExecution(self):
        # Execute the operation when only one axis of the operands is monotonic,
        # first with a monotonic index, then with monotonic columns.
        labels1 = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        labels2 = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

        def check(raw_left, raw_right, mars_left, mars_right):
            # build the Mars expression first, then compare its result with pandas
            combined = self.func(mars_left, mars_right)
            expected = self.func(raw_left, raw_right)
            result = self.executor.execute_dataframe(combined, concat=True)[0]
            pd.testing.assert_frame_equal(expected, result)

        # monotonic index, shuffled columns
        # raw1 with index split into [0...4], [5...9],
        raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=labels1)
        df1 = from_pandas(raw1, chunk_size=5)
        # raw2 with index split into [6...11], [2, 5],
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                            columns=labels2)
        df2 = from_pandas(raw2, chunk_size=6)
        check(raw1, raw2, df1, df2)

        # shuffled index, monotonic columns
        # raw1 with columns split into [0...4], [5...9],
        raw1 = pd.DataFrame(np.random.rand(10, 10), index=labels1,
                            columns=np.arange(10))
        df1 = from_pandas(raw1, chunk_size=5)
        # raw2 with columns split into [6...11], [2, 5],
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=labels2,
                            columns=np.arange(11, 1, -1))
        df2 = from_pandas(raw2, chunk_size=6)
        check(raw1, raw2, df1, df2)
Exemple #4
0
    def testBothOneChunk(self):
        # Neither axis is monotonic, but each operand consists of one chunk on every axis.
        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(raw1, chunk_size=10)
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(raw2, chunk_size=10)

        total = add(df1, df2)

        # metadata of the untiled result: columns match pandas ...
        actual_columns = total.columns.to_pandas()
        pd.testing.assert_index_equal(actual_columns, (raw1 + raw2).columns)
        self.assertTrue(total.columns.should_be_monotonic)
        # ... while the index value is an empty Int64Index with a key of its own
        self.assertIsInstance(total.index_value.value, IndexValue.Int64Index)
        self.assertTrue(total.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(total.index_value.to_pandas(), pd.Int64Index([]))
        for operand in (df1, df2):
            self.assertNotEqual(total.index_value.key, operand.index_value.key)
        # columns is recorded, so the column count is available
        self.assertEqual(total.shape[1], 12)

        total.tiles()

        self.assertEqual(total.chunk_shape, (1, 1))
        for out_chunk in total.chunks:
            self.assertIsInstance(out_chunk.op, DataFrameAdd)
            self.assertEqual(len(out_chunk.inputs), 2)
            lhs, rhs = out_chunk.inputs[0], out_chunk.inputs[1]
            # the single chunk of each operand is used directly as input
            self.assertIs(lhs, df1.chunks[0].data)
            self.assertIs(rhs, df2.chunks[0].data)
    def testWithShuffleOnStringIndex(self):
        # Execute the operation on operands whose axes are not monotonic
        # and whose index values are strings.
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        index1 = list(map(str, [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]))
        raw1 = pd.DataFrame(np.random.rand(10, 10), index=index1,
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        raw1 = self.to_boolean_if_needed(raw1)
        df1 = from_pandas(raw1, chunk_size=5)

        index2 = list(map(str, [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]))
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=index2,
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        raw2 = self.to_boolean_if_needed(raw2)
        df2 = from_pandas(raw2, chunk_size=6)

        combined = self.func(df1, df2)

        expected = self.func(raw1, raw2)
        result = self.executor.execute_dataframe(combined, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)
Exemple #6
0
    def testAppend(self):
        # Append one chunked dataframe to another, with and without ``ignore_index``,
        # and verify the metadata before and after tiling.
        raw1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        raw2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        # both inputs are cut into 3/3/3/1 rows and 3/1 columns
        expected_nsplits = ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1))

        # default append: the result index is reported as an Int64Index
        appended = from_pandas(raw1, chunk_size=3).append(from_pandas(raw2, chunk_size=3))

        self.assertEqual(appended.shape, (20, 4))
        self.assertIsInstance(appended.index_value.value, IndexValue.Int64Index)

        tiled = appended.tiles()
        self.assertEqual(tiled.nsplits, expected_nsplits)
        self.assertEqual(tiled.chunk_shape, (8, 2))
        # chunks are expected in row-major order over the (8, 2) grid
        for position, chunk in enumerate(tiled.chunks):
            self.assertEqual(chunk.index, divmod(position, 2))

        # ignore_index=True: the result index is a fresh RangeIndex over all 20 rows
        appended = from_pandas(raw1, chunk_size=3).append(from_pandas(raw2, chunk_size=3),
                                                          ignore_index=True)

        self.assertEqual(appended.shape, (20, 4))
        self.assertIsInstance(appended.index_value.value, IndexValue.RangeIndex)
        pd.testing.assert_index_equal(appended.index_value.to_pandas(), pd.RangeIndex(20))

        tiled = appended.tiles()
        self.assertEqual(tiled.nsplits, expected_nsplits)
        self.assertEqual(tiled.chunk_shape, (8, 2))
        self.assertIsInstance(tiled.chunks[0].op, ChunkStandardizeRangeIndex)
Exemple #7
0
def test_append():
    # Append one chunked dataframe to another, with and without ``ignore_index``,
    # and verify the metadata before and after tiling.
    raw1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    raw2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    # both inputs are cut into 3/3/3/1 rows and 3/1 columns
    expected_nsplits = ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1))

    # default append: the result index is reported as an Int64Index
    appended = from_pandas(raw1, chunk_size=3).append(from_pandas(raw2, chunk_size=3))

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.Int64Index)

    tiled = tile(appended)
    assert tiled.nsplits == expected_nsplits
    assert tiled.chunk_shape == (8, 2)
    # chunks are expected in row-major order over the (8, 2) grid
    for position, chunk in enumerate(tiled.chunks):
        assert chunk.index == divmod(position, 2)

    # ignore_index=True: the result index is a fresh RangeIndex over all 20 rows
    appended = from_pandas(raw1, chunk_size=3).append(from_pandas(raw2, chunk_size=3),
                                                      ignore_index=True)

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.RangeIndex)
    pd.testing.assert_index_equal(appended.index_value.to_pandas(), pd.RangeIndex(20))

    tiled = tile(appended)
    assert tiled.nsplits == expected_nsplits
    assert tiled.chunk_shape == (8, 2)
    assert isinstance(tiled.chunks[0].op, ChunkStandardizeRangeIndex)
def test_without_shuffle_execution(setup, func_name, func_opts):
    # Execute the operation on operands whose axes are all monotonic and compare with pandas.
    name = func_opts.func_name
    if name in ('__and__', '__or__', '__xor__'):
        # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
        return

    # left operand: index split into [0...4], [5...9],
    # columns [3...7], [8...12]
    raw_left = to_boolean_if_needed(
        name,
        pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                     columns=np.arange(3, 13)))
    left = from_pandas(raw_left, chunk_size=5)

    # right operand: index split into [6...11], [2, 5],
    # columns [4...9], [10, 13]
    raw_right = to_boolean_if_needed(
        name,
        pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                     columns=np.arange(4, 14)))
    right = from_pandas(raw_right, chunk_size=6)

    combined = func_opts.func(left, right)

    expected = func_opts.func(raw_left, raw_right)
    pd.testing.assert_frame_equal(expected, combined.execute().fetch())
    def testMerge(self):
        # Execute ``merge`` with different key specifications and compare the content with pandas.
        df1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1, columns=['a', 'b', 'c', 'd', 'e'])
        df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=['a', 'b', 'x', 'y'])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # Note [Index of Merge]
        #
        # When `left_index` and `right_index` of `merge` are both false, pandas will generate a RangeIndex
        # for the final result dataframe.
        #
        # We chunked the `left` and `right` dataframes, thus every result chunk will have its own RangeIndex.
        # When they are concatenated we don't generate a new RangeIndex for the result, thus we cannot obtain
        # the same index value as pandas. But we guarantee that the content of the dataframe is correct.

        # Each case is (merge keyword arguments, column moved into the index before comparing or None).
        cases = [
            # default merge, no keys given
            ({}, None),
            # merge on left index and `right_on`
            ({'how': 'left', 'right_on': 'x', 'left_index': True}, 'a_x'),
            # merge on `left_on` and right index
            ({'how': 'right', 'left_on': 'a', 'right_index': True}, 'a'),
            # merge on `left_on` and `right_on`
            ({'how': 'left', 'left_on': 'a', 'right_on': 'x'}, 'a_x'),
            # merge on `on`
            ({'how': 'right', 'on': 'a'}, 'a'),
            # merge on multiple columns
            ({'how': 'inner', 'on': ['a', 'b']}, None),
        ]

        for merge_kwargs, index_column in cases:
            expected = df1.merge(df2, **merge_kwargs)
            merged = mdf1.merge(mdf2, **merge_kwargs)
            result = self.executor.execute_dataframe(merged, concat=True)[0]
            if index_column is not None:
                # replace the (incomparable) positional index by a data column on both sides
                expected.set_index(index_column, inplace=True)
                result.set_index(index_column, inplace=True)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                          sort_dataframe_inplace(result, 0))
def test_with_plain_value(setup, func_name, func_opts):
    # Apply the operation between Mars dataframes/series and plain Python or NumPy values,
    # comparing every result with the pandas equivalent.
    name = func_opts.func_name
    if name in ('__and__', '__or__', '__xor__'):
        # skip tests for bitwise logical operators on plain value.
        return

    def ten_array():
        # a fresh array for every use
        return np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

    def check_frame(mars_df, raw_df, mars_other, raw_other):
        result = getattr(mars_df, name)(mars_other, axis=0).execute().fetch()
        expected = getattr(raw_df, name)(raw_other, axis=0)
        pd.testing.assert_frame_equal(expected, result)

    def check_series(mars_series, raw_series, mars_other, raw_other):
        result = getattr(mars_series, name)(mars_other).execute().fetch()
        expected = getattr(raw_series, name)(raw_other)
        pd.testing.assert_series_equal(expected, result)

    raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                       columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    raw = to_boolean_if_needed(name, raw)
    df = from_pandas(raw, chunk_size=6)
    series = df[2]

    # plain list and tuple operands, dataframe first and series afterwards
    for container in (list, tuple):
        check_frame(df, raw, container(range(1, 11)), container(range(1, 11)))
    for container in (list, tuple):
        check_series(series, raw[2], container(range(1, 11)), container(range(1, 11)))

    # specify index, not the default range index
    raw = pd.DataFrame(np.random.rand(10, 7), index=np.arange(5, 15),
                       columns=[4, 1, 3, 2, 5, 6, 7])
    raw = to_boolean_if_needed(name, raw)
    df = from_pandas(raw, chunk_size=6)
    series = df[2]

    # NumPy array operands, passed as-is and wrapped through ``from_array``;
    # pandas always receives the bare array
    wrappers = (lambda array: array, from_array)
    for wrap in wrappers:
        check_frame(df, raw, wrap(ten_array()), ten_array())
    for wrap in wrappers:
        check_series(series, raw[2], wrap(ten_array()), ten_array())
    def testWithPlainValue(self):
        # Apply the operation between Mars dataframes/series and plain Python or NumPy values,
        # comparing every result with the pandas equivalent.
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # skip tests for bitwise logical operators on plain value.
            return

        def ten_array():
            # a fresh array for every use
            return np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def check_frame(mars_df, raw_df, mars_other, raw_other):
            result = run(getattr(mars_df, self.func_name)(mars_other, axis=0))
            expected = getattr(raw_df, self.func_name)(raw_other, axis=0)
            pd.testing.assert_frame_equal(expected, result)

        def check_series(mars_series, raw_series, mars_other, raw_other):
            result = run(getattr(mars_series, self.func_name)(mars_other))
            expected = getattr(raw_series, self.func_name)(raw_other)
            pd.testing.assert_series_equal(expected, result)

        raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                           columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        raw = self.to_boolean_if_needed(raw)
        df = from_pandas(raw, chunk_size=6)
        series = df[2]

        # plain list and tuple operands, dataframe first and series afterwards
        for container in (list, tuple):
            check_frame(df, raw, container(range(1, 11)), container(range(1, 11)))
        for container in (list, tuple):
            check_series(series, raw[2], container(range(1, 11)), container(range(1, 11)))

        # specify index, not the default range index
        raw = pd.DataFrame(np.random.rand(10, 7), index=np.arange(5, 15),
                           columns=[4, 1, 3, 2, 5, 6, 7])
        raw = self.to_boolean_if_needed(raw)
        df = from_pandas(raw, chunk_size=6)
        series = df[2]

        # NumPy array operands, passed as-is and wrapped through ``from_array``;
        # pandas always receives the bare array
        wrappers = (lambda array: array, from_array)
        for wrap in wrappers:
            check_frame(df, raw, wrap(ten_array()), ten_array())
        for wrap in wrappers:
            check_series(series, raw[2], wrap(ten_array()), ten_array())
    def testWithPlainValue(self):
        # Apply the operation between Mars dataframes/series and plain Python or NumPy values,
        # comparing every result with the pandas equivalent.
        op_name = self.func_name
        execute = self.executor.execute_dataframe

        raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                           columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df = from_pandas(raw, chunk_size=6)
        series = df[2]

        # dataframe against a list, then against a tuple
        for container in (list, tuple):
            expr = getattr(df, op_name)(container(range(1, 11)), axis=0)
            result = execute(expr, concat=True)[0]
            expected = getattr(raw, op_name)(container(range(1, 11)), axis=0)
            pd.testing.assert_frame_equal(expected, result)

        # series against a list, then against a tuple
        for container in (list, tuple):
            expr = getattr(series, op_name)(container(range(1, 11)))
            result = execute(expr, concat=True)[0]
            expected = getattr(raw[2], op_name)(container(range(1, 11)))
            pd.testing.assert_series_equal(expected, result)

        # specify index, not the default range index
        raw = pd.DataFrame(np.random.rand(10, 7), index=np.arange(5, 15),
                           columns=[4, 1, 3, 2, 5, 6, 7])
        df = from_pandas(raw, chunk_size=6)
        series = df[2]

        # the Mars side receives the array as-is and wrapped through ``from_array``;
        # pandas always receives the bare array
        wrappers = (lambda array: array, from_array)

        for wrap in wrappers:
            operand = wrap(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
            result = execute(getattr(df, op_name)(operand, axis=0), concat=True)[0]
            expected = getattr(raw, op_name)(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0)
            pd.testing.assert_frame_equal(expected, result)

        for wrap in wrappers:
            operand = wrap(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
            result = execute(getattr(series, op_name)(operand), concat=True)[0]
            expected = getattr(raw[2], op_name)(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
            pd.testing.assert_series_equal(expected, result)
Exemple #13
0
    def testDataFrame(self):
        # With eager mode switched on, ``fetch`` is called directly on freshly built dataframes
        # and on the result of ``add``.
        with option_context({'eager_mode': True}):
            from mars.dataframe.arithmetic import add

            raws, frames = [], []
            for chunk_size in (5, 6):
                raw = pd.DataFrame(np.random.rand(10, 10))
                frame = from_pandas(raw, chunk_size=chunk_size)
                pd.testing.assert_frame_equal(frame.fetch(), raw)
                raws.append(raw)
                frames.append(frame)

            total = add(frames[0], frames[1])
            pd.testing.assert_frame_equal(total.fetch(), raws[0] + raws[1])
Exemple #14
0
    def testWithAllShuffleExecution(self):
        # Execute the operation on operands where no axis is monotonic and compare with pandas.
        raw_left = pd.DataFrame(np.random.rand(10, 10),
                                index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left = from_pandas(raw_left, chunk_size=5)
        raw_right = pd.DataFrame(np.random.rand(10, 10),
                                 index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right = from_pandas(raw_right, chunk_size=6)

        combined = self.func(left, right)

        expected = self.func(raw_left, raw_right)
        actual = self.executor.execute_dataframe(combined, concat=True)[0]
        pd.testing.assert_frame_equal(expected, actual)
Exemple #15
0
    def testWithShuffleOnStringIndex(self):
        # Execute the operation on operands whose axes are not monotonic
        # and whose index values are strings.
        index_left = list(map(str, [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]))
        raw_left = pd.DataFrame(np.random.rand(10, 10), index=index_left,
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left = from_pandas(raw_left, chunk_size=5)

        index_right = list(map(str, [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]))
        raw_right = pd.DataFrame(np.random.rand(10, 10), index=index_right,
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right = from_pandas(raw_right, chunk_size=6)

        combined = self.func(left, right)

        expected = self.func(raw_left, raw_right)
        actual = self.executor.execute_dataframe(combined, concat=True)[0]
        pd.testing.assert_frame_equal(expected, actual)
def test_dataframe_and_scalar(setup, func_name, func_opts):
    # Combine a dataframe with the scalar 1 in functional, method and reflected forms.
    if func_opts.func_name in ('__and__', '__or__', '__xor__'):
        # FIXME bitwise logical operators don't support floating point scalars
        return

    op = func_opts.func

    # test dataframe and scalar
    raw = to_boolean_if_needed(func_opts.func_name, pd.DataFrame(np.random.rand(10, 10)))
    df = from_pandas(raw, chunk_size=2)
    expected = op(raw, 1)

    # the functional form is built and executed twice
    for _ in range(2):
        pd.testing.assert_frame_equal(expected, op(df, 1).execute().fetch())

    via_method = getattr(df, func_opts.func_name)(1).execute().fetch()
    pd.testing.assert_frame_equal(expected, via_method)

    # test scalar and dataframe
    pd.testing.assert_frame_equal(expected, op(df, 1).execute().fetch())

    expected_reflected = op(1, raw)
    pd.testing.assert_frame_equal(expected_reflected, op(1, df).execute().fetch())

    via_rmethod = getattr(df, func_opts.rfunc_name)(1).execute().fetch()
    pd.testing.assert_frame_equal(expected_reflected, via_rmethod)
def test_same_index(setup, func_name, func_opts):
    # Combine a dataframe with itself and with series taken from it;
    # the index is drawn from {0, 1}, so it contains duplicates.
    op = func_opts.func
    name = func_opts.func_name

    raw = pd.DataFrame(np.random.rand(10, 10), index=np.random.randint(0, 2, size=(10,)),
                       columns=['c' + str(i) for i in range(10)])
    raw = to_boolean_if_needed(name, raw)
    df = from_pandas(raw, chunk_size=3)

    # dataframe with itself
    with_self = op(df, df)
    expected = op(raw, raw)
    pd.testing.assert_frame_equal(expected, with_self.execute().fetch())

    # dataframe with its first row as a series
    row_series = from_pandas_series(raw.iloc[0], chunk_size=3)
    with_row = op(df, row_series)
    expected = op(raw, raw.iloc[0])
    pd.testing.assert_frame_equal(expected, with_row.execute().fetch())

    # dataframe with its first column as a series, along axis 0;
    # the expression is always built, but only executed for non-bitwise operators
    column_series = from_pandas_series(raw.iloc[:, 0], chunk_size=3)
    with_column = getattr(df, name)(column_series, axis=0)

    if name in ('__and__', '__or__', '__xor__'):
        return
    expected = getattr(raw, name)(raw.iloc[:, 0], axis=0)
    pd.testing.assert_frame_equal(expected, with_column.execute().fetch())
    def testDataframeAndScalar(self):
        # Combine a dataframe with the scalar 1 in functional, method and reflected forms.
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators don't support floating point scalars
            return

        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # test dataframe and scalar
        raw = self.to_boolean_if_needed(pd.DataFrame(np.random.rand(10, 10)))
        df = from_pandas(raw, chunk_size=2)
        expected = self.func(raw, 1)

        # the functional form is built and executed twice
        for _ in range(2):
            pd.testing.assert_frame_equal(expected, run(self.func(df, 1)))

        via_method = run(getattr(df, self.func_name)(1))
        pd.testing.assert_frame_equal(expected, via_method)

        # test scalar and dataframe
        pd.testing.assert_frame_equal(expected, run(self.func(df, 1)))

        expected_reflected = self.func(1, raw)
        pd.testing.assert_frame_equal(expected_reflected, run(self.func(1, df)))

        via_rmethod = run(getattr(df, self.rfunc_name)(1))
        pd.testing.assert_frame_equal(expected_reflected, via_rmethod)
Exemple #19
0
    def testAddSelf(self):
        # Add a dataframe to itself: the result keeps the operand's index/columns keys and
        # every output chunk takes the same source chunk as both inputs.
        raw = pd.DataFrame(np.random.rand(10, 10),
                           index=np.random.randint(-100, 100, size=(10,)),
                           columns=[np.random.bytes(10) for _ in range(10)])
        df = from_pandas(raw, chunk_size=3)
        doubled = add(df, df)

        # metadata of the untiled result
        actual_columns = doubled.columns.to_pandas()
        pd.testing.assert_index_equal(actual_columns, (raw + raw).columns)
        self.assertTrue(doubled.columns.should_be_monotonic)
        self.assertIsInstance(doubled.index_value.value, IndexValue.Int64Index)
        self.assertTrue(doubled.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(doubled.index_value.to_pandas(), pd.Int64Index([]))
        # keys are shared with the operand on both axes
        self.assertEqual(doubled.index_value.key, df.index_value.key)
        self.assertEqual(doubled.columns.key, df.columns.key)
        self.assertEqual(doubled.shape[1], 10)

        doubled.tiles()

        self.assertEqual(doubled.chunk_shape, df.chunk_shape)
        for out_chunk in doubled.chunks:
            self.assertIsInstance(out_chunk.op, DataFrameAdd)
            self.assertEqual(len(out_chunk.inputs), 2)
            # left and right inputs, in that order, are the operand chunk at the same position
            for side in (0, 1):
                self.assertIs(out_chunk.inputs[side], df.cix[out_chunk.index].data)
    def testSameIndex(self):
        # Combine a dataframe with itself and with series taken from it;
        # the index is drawn from {0, 1}, so it contains duplicates.
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.DataFrame(np.random.rand(10, 10),
                           index=np.random.randint(0, 2, size=(10, )),
                           columns=['c' + str(i) for i in range(10)])
        raw = self.to_boolean_if_needed(raw)
        df = from_pandas(raw, chunk_size=3)

        # dataframe with itself
        with_self = self.func(df, df)
        expected = self.func(raw, raw)
        pd.testing.assert_frame_equal(expected, run(with_self))

        # dataframe with its first row as a series
        row_series = from_pandas_series(raw.iloc[0], chunk_size=3)
        with_row = self.func(df, row_series)
        expected = self.func(raw, raw.iloc[0])
        pd.testing.assert_frame_equal(expected, run(with_row))

        # dataframe with its first column as a series, along axis 0
        column_series = from_pandas_series(raw.iloc[:, 0], chunk_size=3)
        with_column = getattr(df, self.func_name)(column_series, axis=0)
        expected = getattr(raw, self.func_name)(raw.iloc[:, 0], axis=0)
        pd.testing.assert_frame_equal(expected, run(with_column))
Exemple #21
0
    def testAddScalar(self):
        # Adding a scalar must preserve the columns and report an Int64Index,
        # whichever spelling of the addition is used.
        raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                           columns=np.arange(3, 13))
        df = from_pandas(raw, chunk_size=5)

        scalar_sums = [
            add(df, 1),   # functional add
            df.add(1),    # method add
            df.radd(1),   # reflected method
            df + 1,       # operator
            1 + df,       # reflected operator
        ]
        for scalar_sum in scalar_sums:
            pd.testing.assert_index_equal(scalar_sum.columns.to_pandas(), raw.columns)
            self.assertIsInstance(scalar_sum.index_value.value, IndexValue.Int64Index)

        # test NotImplemented, use other's radd instead
        class RaddOnly:
            def __radd__(self, other):
                return 1

        delegated = df + RaddOnly()
        self.assertEqual(delegated, 1)
Exemple #22
0
def test_merge_on_duplicate_columns(setup):
    """Merge when the left frame carries two columns with the same label.

    The left frame has two ``value`` columns and the right frame has one;
    the executed merge must equal what pandas produces for the same inputs.
    """
    left_rows = [['foo', 1, 'bar'],
                 ['bar', 2, 'foo'],
                 ['baz', 3, 'foo']]
    left_raw = pd.DataFrame(left_rows,
                            columns=['lkey', 'value', 'value'],
                            index=['a1', 'a2', 'a3'])
    right_raw = pd.DataFrame(
        {'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]},
        index=['a1', 'a2', 'a3', 'a4'])

    # Same join keys for the pandas reference and the code under test.
    merge_kwargs = {'left_on': 'lkey', 'right_on': 'rkey'}
    expected = left_raw.merge(right_raw, **merge_kwargs)

    left = from_pandas(left_raw, chunk_size=2)
    right = from_pandas(right_raw, chunk_size=3)
    result = left.merge(right, **merge_kwargs).execute().fetch()

    pd.testing.assert_frame_equal(expected, result)
def test_negative(setup):
    """Unary minus on a DataFrame built with ``from_pandas`` must match pandas.

    The negation is applied to ``df1`` *before* executing, so the operator
    under test is the one defined on ``df1`` rather than pandas' own.
    """
    data1 = pd.DataFrame(np.random.randint(low=0, high=100, size=(10, 10)))
    df1 = from_pandas(data1, chunk_size=5)

    # Parentheses are required: ``-df1.execute().fetch()`` parses as
    # ``-(df1.execute().fetch())`` because attribute access and calls bind
    # tighter than unary minus, which would negate the fetched result and
    # leave df1's negation untested.
    result = (-df1).execute().fetch()
    expected = -data1
    pd.testing.assert_frame_equal(expected, result)
    def testDataframeAndScalar(self):
        """Execute ``self.func`` between a DataFrame and the scalar 1.

        Both operand orders are run, through ``self.func`` and through the
        methods named by ``self.func_name`` / ``self.rfunc_name``, and every
        executed result is compared with the pandas result.
        """
        def run(tileable):
            # Execute with concat=True and return the first result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.DataFrame(np.random.rand(10, 10))
        mdf = from_pandas(raw, chunk_size=2)

        # test dataframe and scalar (the functional form is executed three
        # times in total, interleaved with the method form)
        forward_expected = self.func(raw, 1)
        forward_builders = (
            lambda: self.func(mdf, 1),
            lambda: self.func(mdf, 1),
            lambda: getattr(mdf, self.func_name)(1),
            lambda: self.func(mdf, 1),
        )
        for build in forward_builders:
            pd.testing.assert_frame_equal(forward_expected, run(build()))

        # test scalar and dataframe
        reflected_expected = self.func(1, raw)
        reflected_builders = (
            lambda: self.func(1, mdf),
            lambda: getattr(mdf, self.rfunc_name)(1),
        )
        for build in reflected_builders:
            pd.testing.assert_frame_equal(reflected_expected, run(build()))
    def testNegative(self):
        """Unary minus run through ``self.executor`` must match pandas."""
        raw = pd.DataFrame(np.random.randint(low=0, high=100, size=(10, 10)))
        # Negate first, then execute the negated DataFrame.
        negated = -from_pandas(raw, chunk_size=5)

        outcome = self.executor.execute_dataframe(negated, concat=True)[0]
        pd.testing.assert_frame_equal(-raw, outcome)
def test_not(setup):
    """Unary invert (``~``) on a boolean DataFrame must match pandas.

    The inversion is applied to ``df1`` *before* executing, so the operator
    under test is the one defined on ``df1`` rather than pandas' own.
    """
    # Boolean input: True where the uniform sample is positive.
    data1 = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)) > 0)
    df1 = from_pandas(data1, chunk_size=5)

    # Parentheses are required: ``~df1.execute().fetch()`` parses as
    # ``~(df1.execute().fetch())`` because attribute access and calls bind
    # tighter than unary ``~``, which would invert the fetched result and
    # leave df1's inversion untested.
    result = (~df1).execute().fetch()
    expected = ~data1
    pd.testing.assert_frame_equal(expected, result)
Exemple #27
0
    def testRfunc(self):
        """Check the reflected method named by ``self.rfunc_name``.

        Calling the reflected method on ``a`` with ``b`` must equal
        ``self.func(b, a)``, both for a DataFrame operand and for the
        scalar 1.
        """
        def run(tileable):
            # Execute with concat=True and return the first result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # DataFrame operand; the two inputs use chunk sizes 5 and 6.
        left_raw = pd.DataFrame(np.random.rand(10, 10))
        left = from_pandas(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10))
        right = from_pandas(right_raw, chunk_size=6)
        reflected = getattr(left, self.rfunc_name)
        outcome = run(reflected(right))
        # Reflected call: operands are swapped in the pandas reference.
        pd.testing.assert_frame_equal(self.func(right_raw, left_raw), outcome)

        # Scalar operand.
        scalar_raw = pd.DataFrame(np.random.rand(10, 10))
        scalar_mdf = from_pandas(scalar_raw, chunk_size=5)
        outcome = run(getattr(scalar_mdf, self.rfunc_name)(1))
        pd.testing.assert_frame_equal(self.func(1, scalar_raw), outcome)
Exemple #28
0
    def testAbs(self):
        """``abs()`` on a DataFrame must match ``pandas.DataFrame.abs``."""
        # Values are drawn from [-1, 1) so both signs are present.
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        absolute = abs(from_pandas(raw, chunk_size=5))

        outcome = self.executor.execute_dataframe(absolute, concat=True)[0]
        pd.testing.assert_frame_equal(raw.abs(), outcome)
    def testRadd(self):
        """``radd`` with a DataFrame and with the scalar 1 must match pandas."""
        def run(tileable):
            # Execute with concat=True and return the first result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # DataFrame operand; the two inputs use chunk sizes 5 and 6.
        left_raw = pd.DataFrame(np.random.rand(10, 10))
        left = from_pandas(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10))
        right = from_pandas(right_raw, chunk_size=6)
        outcome = run(left.radd(right))
        pd.testing.assert_frame_equal(left_raw + right_raw, outcome)

        # Scalar operand.
        scalar_raw = pd.DataFrame(np.random.rand(10, 10))
        scalar_mdf = from_pandas(scalar_raw, chunk_size=5)
        outcome = run(scalar_mdf.radd(1))
        pd.testing.assert_frame_equal(scalar_raw + 1, outcome)
    def testWithoutShuffleAndWithOneChunk(self):
        """Apply ``self.func`` to two frames that are monotonic on one axis only.

        In each scenario one axis carries a monotonic label range and is split
        into two chunks, while the other axis has unordered labels and is kept
        in a single chunk (chunk size 10 on a 10-wide axis).  The executed
        result must equal ``self.func`` applied to the pandas frames.
        The first scenario splits the index; the second splits the columns.
        """
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # only 1 axis is monotonic
        # data1 with index 0..9 (ascending), split into [0...4], [5...9];
        # columns are unordered and stay in one chunk
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        # NOTE(review): presumably converts to bool for logical operators -
        # helper is defined outside this block, confirm there
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=(5, 10))
        # data2 with index 11..2 (descending), split into [11...6], [5...2];
        # columns are unordered and stay in one chunk
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=(6, 10))

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]

        pd.testing.assert_frame_equal(expected, result)

        # only 1 axis is monotonic
        # data1 with columns 0..9 (ascending), split into [0...4], [5...9];
        # index is unordered and stays in one chunk
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
                             columns=np.arange(10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=(10, 5))
        # data2 with columns 11..2 (descending), split into [11...6], [5...2];
        # index is unordered and stays in one chunk
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
                             columns=np.arange(11, 1, -1))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=(10, 6))

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]

        pd.testing.assert_frame_equal(expected, result)