def testGPUExecution(self):
    """Check GPU-backed ``sort_values`` / ``sort_index`` against pandas.

    Inputs are moved with ``to_gpu()`` and every result is converted back
    with ``to_pandas()`` before it is compared with the pandas answer.

    The ``sort_values`` checks run once per ``PSRS_DISTINCT_COL`` setting
    (only ``'0'`` on Windows, ``'0'`` and ``'1'`` elsewhere).  The variable
    is restored on exit so this test does not leak process-wide state into
    tests that run afterwards.
    """

    def fetch(tileable):
        # Run through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # test sort_values
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
    previous = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct

            # test dataframe
            raw = pd.DataFrame(np.random.rand(100, 10),
                               columns=['a' + str(i) for i in range(10)])
            mdf = DataFrame(raw, chunk_size=30).to_gpu()

            result = fetch(mdf.sort_values(by='a0'))
            expected = raw.sort_values(by='a0')
            pd.testing.assert_frame_equal(result.to_pandas(), expected)

            # test series
            raw = pd.Series(np.random.rand(10))
            series = Series(raw).to_gpu()

            result = fetch(series.sort_values())
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result.to_pandas(), expected)
    finally:
        # Put the environment back exactly as it was found.
        if previous is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous

    # NOTE(review): the source layout was lost; the sort_index checks are
    # assumed to sit outside the PSRS_DISTINCT_COL loop -- confirm.

    # test DataFrame.sort_index
    raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))
    mdf = DataFrame(raw).to_gpu()

    result = fetch(mdf.sort_index())
    expected = raw.sort_index()
    pd.testing.assert_frame_equal(result.to_pandas(), expected)

    # test Series.sort_index
    raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))
    series = Series(raw).to_gpu()

    result = fetch(series.sort_index())
    expected = raw.sort_index()
    pd.testing.assert_series_equal(result.to_pandas(), expected)
def testSortValuesExecution(self):
    """Compare ``sort_values`` on tiled frames/series with plain pandas.

    Every scenario is run once per ``PSRS_DISTINCT_COL`` setting (only
    ``'0'`` on Windows, ``'0'`` and ``'1'`` elsewhere).  The variable is
    restored on exit so the test does not leak process-wide state into
    tests that run afterwards.

    NOTE(review): the source layout was lost; all scenarios are assumed to
    live inside the ``PSRS_DISTINCT_COL`` loop -- confirm.
    """

    def fetch(tileable, executor=None):
        # Execute and return the single concatenated result; defaults to
        # the executor attached to the test case.
        executor = self.executor if executor is None else executor
        return executor.execute_dataframe(tileable, concat=True)[0]

    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
    previous = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct

            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = fetch(mdf.sort_values('a0'))
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = fetch(mdf.sort_values(['a6', 'a7'], ascending=False))
            expected = df.sort_values(['a6', 'a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = fetch(mdf.sort_values('a0'))
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = fetch(mdf.sort_values(['a3', 'a4']))
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = fetch(mdf.sort_values(['a0', 'a1'], ascending=False))
            expected = df.sort_values(['a0', 'a1'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            result = fetch(mdf.sort_values(['a7'], ascending=False))
            expected = df.sort_values(['a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)

            result = fetch(mdf.sort_values([('A', 'C')]))
            expected = df2.sort_values([('A', 'C')])
            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = fetch(mdf.sort_values('a0'))
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = fetch(mdf.sort_values(['a3', 'a4']))
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test other types
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                },
            )
            mdf = DataFrame(raw, chunk_size=3)

            for label in raw.columns:
                result = fetch(mdf.sort_values(label))
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = fetch(mdf.sort_values(['a', 'b', 'e'], ascending=False))
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = fetch(mdf.sort_values(['col2']))
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = fetch(mdf.sort_values(['col2']))
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index (runs on a dedicated session-backed executor)
            executor = ExecutorForTest(storage=new_session().context)

            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])

            mdf = DataFrame(df, chunk_size=3)
            result = fetch(mdf.sort_values(['a0', 'a1'], ignore_index=True),
                           executor=executor)
            try:  # for python3.5
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                # pandas without ``ignore_index``: rebuild the index by hand
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))
            pd.testing.assert_frame_equal(result, expected)

            # test inplace
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = fetch(mdf)
            df.sort_values('a0', inplace=True)
            pd.testing.assert_frame_equal(result, df)

            # test unknown shape
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['a'] > 2]
            result = fetch(filtered.sort_values(by='b'))
            pd.testing.assert_frame_equal(
                result, df[df['a'] > 2].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = fetch(series.sort_values())
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = fetch(series.sort_values())
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = fetch(series.sort_values(ascending=False))
            expected = raw.sort_values(ascending=False)
            pd.testing.assert_series_equal(result, expected)
    finally:
        # Put the environment back exactly as it was found.
        if previous is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous
def test_sort_values_execution(setup):
    """Compare ``sort_values(...).execute().fetch()`` with plain pandas.

    Every scenario is run once per ``PSRS_DISTINCT_COL`` setting (only
    ``'0'`` on Windows, ``'0'`` and ``'1'`` elsewhere).  The variable is
    restored on exit so the test does not leak process-wide state into
    tests that run afterwards.

    ``setup`` is not referenced in the body; it is requested only as a
    pytest fixture.

    NOTE(review): the source layout was lost; all scenarios are assumed to
    live inside the ``PSRS_DISTINCT_COL`` loop -- confirm.
    """
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
    previous = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct

            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = mdf.sort_values('a0').execute().fetch()
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = mdf.sort_values(['a6', 'a7'], ascending=False).execute().fetch()
            expected = df.sort_values(['a6', 'a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = mdf.sort_values('a0').execute().fetch()
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = mdf.sort_values(['a3', 'a4']).execute().fetch()
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = mdf.sort_values(['a0', 'a1'], ascending=False).execute().fetch()
            expected = df.sort_values(['a0', 'a1'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            result = mdf.sort_values(['a7'], ascending=False).execute().fetch()
            expected = df.sort_values(['a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)

            result = mdf.sort_values([('A', 'C')]).execute().fetch()
            expected = df2.sort_values([('A', 'C')])
            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = mdf.sort_values('a0').execute().fetch()
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = mdf.sort_values(['a3', 'a4']).execute().fetch()
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test other types
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                },
            )
            mdf = DataFrame(raw, chunk_size=3)

            for label in raw.columns:
                result = mdf.sort_values(label).execute().fetch()
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = mdf.sort_values(['a', 'b', 'e'], ascending=False).execute().fetch()
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = mdf.sort_values(['col2']).execute().fetch()
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = mdf.sort_values(['col2']).execute().fetch()
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            # test None (issue #1885)
            df = pd.DataFrame(np.random.rand(1000, 10))
            # Build the 'A'/None object column in one step.  The previous
            # chained assignment (``df[0][mask] = ...``) is unreliable and
            # leaves ``df`` untouched under pandas copy-on-write.
            df[0] = np.where(df[0] < 0.5, 'A', None)
            mdf = DataFrame(df)
            result = mdf.sort_values([0, 1]).execute().fetch()
            expected = df.sort_values([0, 1])
            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=100)
            result = mdf.sort_values([0, 1]).execute().fetch()
            expected = df.sort_values([0, 1])
            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index
            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])

            mdf = DataFrame(df, chunk_size=3)
            result = mdf.sort_values(['a0', 'a1'], ignore_index=True).execute().fetch()
            try:  # for python3.5
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                # pandas without ``ignore_index``: rebuild the index by hand
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))
            pd.testing.assert_frame_equal(result, expected)

            # test inplace
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = mdf.execute().fetch()
            df.sort_values('a0', inplace=True)
            pd.testing.assert_frame_equal(result, df)

            # test unknown shape
            df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['a'] > 2]
            result = filtered.sort_values(by='b').execute().fetch()
            pd.testing.assert_frame_equal(result, df[df['a'] > 2].sort_values(by='b'))

            # test empty dataframe
            df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['b'] > 100]
            result = filtered.sort_values(by='b').execute().fetch()
            pd.testing.assert_frame_equal(result, df[df['b'] > 100].sort_values(by='b'))

            # test chunks with zero length
            df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
            df.iloc[4:8, 1] = 0
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['b'] != 0]
            result = filtered.sort_values(by='b').execute().fetch()
            pd.testing.assert_frame_equal(result, df[df['b'] != 0].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = series.sort_values().execute().fetch()
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = series.sort_values().execute().fetch()
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = series.sort_values(ascending=False).execute().fetch()
            expected = raw.sort_values(ascending=False)
            pd.testing.assert_series_equal(result, expected)

            # test empty series
            series = pd.Series(list(range(10)), name='a')
            mseries = Series(series, chunk_size=4)
            filtered = mseries[mseries > 100]
            result = filtered.sort_values().execute().fetch()
            pd.testing.assert_series_equal(result, series[series > 100].sort_values())

            # test series with None
            # First 500 entries are 'A', the rest None.  Built directly as an
            # object Series instead of relying on setitem upcasting an int64
            # Series to object (deprecated in recent pandas).
            series = pd.Series(np.where(np.arange(1000, ) < 500, 'A', None))
            mseries = Series(series, chunk_size=100)
            result = mseries.sort_values().execute().fetch()
            expected = series.sort_values()
            # Only values are compared: the index is dropped on both sides.
            pd.testing.assert_series_equal(result.reset_index(drop=True),
                                           expected.reset_index(drop=True))
    finally:
        # Put the environment back exactly as it was found.
        if previous is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous
def testSortValuesExecution(self):
    """Compare ``sort_values`` on tiled frames/series with plain pandas.

    Scenarios, in order: single chunk, chunked frames (sizes 10 and 3),
    descending sorts, mixed column types, NaN values, ``ignore_index``,
    in-place sorting, a filtered frame, and ``Series.sort_values``.
    """

    def fetch(tileable, runner=None):
        # Execute and return the single concatenated result; defaults to
        # the executor attached to the test case.
        runner = self.executor if runner is None else runner
        return runner.execute_dataframe(tileable, concat=True)[0]

    def check_frame(tiled, local, by, **kwargs):
        # Sort both sides the same way and require identical frames.
        got = fetch(tiled.sort_values(by, **kwargs))
        want = local.sort_values(by, **kwargs)
        pd.testing.assert_frame_equal(got, want)

    def check_series(tiled, local, **kwargs):
        # Sort both sides the same way and require identical series.
        got = fetch(tiled.sort_values(**kwargs))
        want = local.sort_values(**kwargs)
        pd.testing.assert_series_equal(got, want)

    column_names = ['a' + str(i) for i in range(10)]
    frame = pd.DataFrame(np.random.rand(100, 10), columns=column_names)

    # test one chunk
    tiled = DataFrame(frame)
    check_frame(tiled, frame, 'a0')
    check_frame(tiled, frame, ['a6', 'a7'], ascending=False)

    # test psrs
    tiled = DataFrame(frame, chunk_size=10)
    check_frame(tiled, frame, 'a0')
    check_frame(tiled, frame, ['a3', 'a4'])

    # test ascending=False
    check_frame(tiled, frame, ['a0', 'a1'], ascending=False)
    check_frame(tiled, frame, ['a7'], ascending=False)

    # test rechunk
    tiled = DataFrame(frame, chunk_size=3)
    check_frame(tiled, frame, 'a0')
    check_frame(tiled, frame, ['a3', 'a4'])

    # test other types
    mixed = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
        },
    )
    tiled = DataFrame(mixed, chunk_size=3)
    for label in mixed.columns:
        check_frame(tiled, mixed, label)
    check_frame(tiled, mixed, ['a', 'b', 'e'], ascending=False)

    # test nan
    with_nan = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    check_frame(DataFrame(with_nan), with_nan, ['col2'])
    check_frame(DataFrame(with_nan, chunk_size=3), with_nan, ['col2'])

    # test ignore_index (runs on a dedicated session-backed executor)
    session_executor = ExecutorForTest(storage=new_session().context)
    small = pd.DataFrame(np.random.rand(10, 3),
                         columns=['a' + str(i) for i in range(3)])
    tiled = DataFrame(small, chunk_size=3)
    got = fetch(tiled.sort_values(['a0', 'a1'], ignore_index=True),
                runner=session_executor)
    try:  # for python3.5
        want = small.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        # pandas without ``ignore_index``: rebuild the index by hand
        want = small.sort_values(['a0', 'a1'])
        want.index = pd.RangeIndex(len(want))
    pd.testing.assert_frame_equal(got, want)

    # test inplace
    tiled = DataFrame(small)
    tiled.sort_values('a0', inplace=True)
    got = fetch(tiled)
    small.sort_values('a0', inplace=True)
    pd.testing.assert_frame_equal(got, small)

    # test unknown shape
    pairs = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    tiled = DataFrame(pairs, chunk_size=4)
    kept = tiled[tiled['a'] > 2]
    got = fetch(kept.sort_values(by='b'))
    pd.testing.assert_frame_equal(got, pairs[pairs['a'] > 2].sort_values(by='b'))

    # test Series.sort_values
    values = pd.Series(np.random.rand(10))
    check_series(Series(values), values)
    check_series(Series(values, chunk_size=3), values)
    check_series(Series(values, chunk_size=2), values, ascending=False)