Exemple #1
0
    def testResetIndex(self):
        """Check the tiled metadata of reset_index for DataFrame and Series inputs.

        Only shapes, dtypes and chunk index values are inspected; nothing is
        executed by this test.
        """
        raw = pd.DataFrame(
            [('bird', 389.0), ('bird', 24.0), ('mammal', 80.5), ('mammal', np.nan)],
            index=['falcon', 'parrot', 'lion', 'monkey'],
            columns=('class', 'max_speed'))

        # Default reset_index: the result has one more column than the input.
        tileable = df_reset_index(from_pandas_df(raw, chunk_size=2))
        expected = raw.reset_index()

        self.assertEqual(tileable.shape, (4, 3))
        pd.testing.assert_series_equal(tileable.dtypes, expected.dtypes)

        tiled = tileable.tiles()

        self.assertEqual(len(tiled.chunks), 2)
        for pos, chunk in enumerate(tiled.chunks):
            # Each chunk holds two rows and carries its own slice of the
            # new RangeIndex.
            first_row = 2 * pos
            self.assertEqual(chunk.shape, (2, 3))
            pd.testing.assert_index_equal(
                chunk.index_value.to_pandas(),
                pd.RangeIndex(first_row, first_row + 2))
            pd.testing.assert_series_equal(chunk.dtypes, expected.dtypes)

        # drop=True: the old index is discarded, so the column count stays at 2.
        tileable = df_reset_index(from_pandas_df(raw, chunk_size=1), drop=True)
        expected = raw.reset_index(drop=True)

        self.assertEqual(tileable.shape, (4, 2))
        pd.testing.assert_series_equal(tileable.dtypes, expected.dtypes)

        tiled = tileable.tiles()

        self.assertEqual(len(tiled.chunks), 8)

        for chunk in tiled.chunks:
            self.assertEqual(chunk.shape, (1, 1))
            row_pos = chunk.index[0]
            pd.testing.assert_index_equal(
                chunk.index_value.to_pandas(),
                pd.RangeIndex(row_pos, row_pos + 1))
            col_pos = chunk.index[1]
            pd.testing.assert_series_equal(
                chunk.dtypes, expected.dtypes[col_pos:col_pos + 1])

        # test Series
        raw_series = pd.Series(
            [1, 2, 3, 4],
            name='foo',
            index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
        tileable = series_reset_index(from_pandas_series(raw_series, chunk_size=2))
        expected = raw_series.reset_index()

        self.assertEqual(tileable.shape, (4, 2))
        pd.testing.assert_series_equal(tileable.dtypes, expected.dtypes)

        tiled = tileable.tiles()
        self.assertEqual(len(tiled.chunks), 2)
        for pos, chunk in enumerate(tiled.chunks):
            first_row = 2 * pos
            self.assertEqual(chunk.shape, (2, 2))
            pd.testing.assert_index_equal(
                chunk.index_value.to_pandas(),
                pd.RangeIndex(first_row, first_row + 2))
Exemple #2
0
    def testResetIndexExecution(self):
        """Execute reset_index on DataFrame and Series inputs and compare with pandas.

        Covers plain and chunked inputs, ``drop=True``, MultiIndex rows with
        ``level``, MultiIndex columns with ``col_level``/``col_fill``, the
        Series ``name`` and ``drop`` options, and inputs whose shape is
        unknown until execution.
        """
        def materialize(tileable):
            # Run the tileable on the test executor and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        rows = [('bird', 389.0),
                ('bird', 24.0),
                ('mammal', 80.5),
                ('mammal', np.nan)]
        raw = pd.DataFrame(rows,
                           index=['falcon', 'parrot', 'lion', 'monkey'],
                           columns=('class', 'max_speed'))

        # (from_pandas_df kwargs, reset_index kwargs) for the basic cases.
        basic_cases = (
            ({}, {}),
            ({'chunk_size': 2}, {}),
            ({'chunk_size': 1}, {'drop': True}),
        )
        for source_kwargs, reset_kwargs in basic_cases:
            source = from_pandas_df(raw, **source_kwargs)
            tileable = df_reset_index(source, **reset_kwargs)
            actual = materialize(tileable)
            wanted = raw.reset_index(**reset_kwargs)
            pd.testing.assert_frame_equal(actual, wanted)

        # MultiIndex rows: only the 'class' level is moved into the columns.
        row_labels = pd.MultiIndex.from_tuples(
            [('bird', 'falcon'),
             ('bird', 'parrot'),
             ('mammal', 'lion'),
             ('mammal', 'monkey')],
            names=['class', 'name'])
        raw = pd.DataFrame(rows, index=row_labels, columns=('type', 'max_speed'))
        tileable = df_reset_index(from_pandas_df(raw, chunk_size=1), level='class')
        actual = materialize(tileable)
        wanted = raw.reset_index(level='class')
        pd.testing.assert_frame_equal(actual, wanted)

        # MultiIndex columns: exercise col_level and col_fill.
        raw.columns = pd.MultiIndex.from_tuples([('speed', 'max'), ('species', 'type')])
        reset_kwargs = dict(level='class', col_level=1, col_fill='species')
        tileable = df_reset_index(from_pandas_df(raw, chunk_size=2), **reset_kwargs)
        actual = materialize(tileable)
        wanted = raw.reset_index(**reset_kwargs)
        pd.testing.assert_frame_equal(actual, wanted)

        # Test Series

        raw_series = pd.Series([1, 2, 3, 4], name='foo',
                               index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))

        tileable = series_reset_index(from_pandas_series(raw_series), name='bar')
        actual = materialize(tileable)
        wanted = raw_series.reset_index(name='bar')
        pd.testing.assert_frame_equal(actual, wanted)

        # With drop=True the result is compared as a Series, not a DataFrame.
        tileable = series_reset_index(
            from_pandas_series(raw_series, chunk_size=2), drop=True)
        actual = materialize(tileable)
        wanted = raw_series.reset_index(drop=True)
        pd.testing.assert_series_equal(actual, wanted)

        # Test Unknown shape
        sess = new_session()
        left_labels = [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]
        right_labels = [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]
        # (pandas constructor, tileable factory, random data dims, chunk sizes)
        unknown_shape_cases = (
            (pd.DataFrame, from_pandas_df, (10, 3), (5, 6)),
            (pd.Series, from_pandas_series, (10,), (3, 3)),
        )
        for build, to_tileable, dims, (left_chunk, right_chunk) in unknown_shape_cases:
            left_raw = build(np.random.rand(*dims), index=left_labels)
            left = to_tileable(left_raw, chunk_size=left_chunk)
            right_raw = build(np.random.rand(*dims), index=right_labels)
            right = to_tileable(right_raw, chunk_size=right_chunk)

            fetched = sess.run((left + right).reset_index())
            pd.testing.assert_index_equal(fetched.index, pd.RangeIndex(12))
            # Inconsistent with pandas when the input's shape is unknown, so
            # sort by the first column before comparing the values.
            fetched = fetched.sort_values(by=fetched.columns[0])
            wanted = (left_raw + right_raw).reset_index()
            np.testing.assert_array_equal(fetched.to_numpy(), wanted.to_numpy())