def test_series_corr(setup):
    """Compare ``Series.corr`` and ``Series.autocorr`` with pandas.

    Both operands hold 20 seeded random values in which every entry that
    is not greater than 0.4 has been replaced by NaN.  Series built
    without ``chunk_size`` are compared exactly; series built with a
    ``chunk_size`` are compared through ``pytest.approx`` and are expected
    to raise for ``method='kendall'``.
    """
    rng = np.random.RandomState(0)
    left_values = rng.rand(20)
    left_pd = pd.Series(np.where(left_values > 0.4, left_values, np.nan))
    right_values = rng.rand(20)
    right_pd = pd.Series(np.where(right_values > 0.4, right_values, np.nan))

    # Operands created without an explicit chunk_size: exact comparison.
    left = Series(left_pd)
    right = Series(right_pd)

    pearson = left.corr(right)
    assert pearson.execute().fetch() == left_pd.corr(right_pd)

    kendall = left.corr(right, method='kendall')
    assert kendall.execute().fetch() == left_pd.corr(right_pd, method='kendall')

    lagged = left.autocorr(2)
    assert lagged.execute().fetch() == left_pd.autocorr(2)

    # Operands with different chunk sizes.
    left = Series(left_pd, chunk_size=6)
    right = Series(right_pd, chunk_size=4)

    with pytest.raises(Exception):
        left.corr(right, method='kendall').execute()

    for corr_kwargs in ({}, {'min_periods': 7}):
        got = left.corr(right, **corr_kwargs).execute().fetch()
        assert pytest.approx(got) == left_pd.corr(right_pd, **corr_kwargs)

    got = left.autocorr(2).execute().fetch()
    assert pytest.approx(got) == left_pd.autocorr(2)
def test_dataframe_corr_with(setup):
    """Compare ``DataFrame.corrwith`` with pandas for frame and series operands.

    All fixtures come from one seeded ``RandomState``; entries that are not
    greater than 0.4 are replaced by NaN.  Results for operands built with
    a ``chunk_size`` are compared after ``sort_index()`` on both sides.
    """
    rng = np.random.RandomState(0)

    def mask_small(values):
        # Keep entries greater than 0.4, turn everything else into NaN.
        return np.where(values > 0.4, values, np.nan)

    left_pd = pd.DataFrame(mask_small(rng.rand(20, 10)),
                           columns=list('ABCDEFGHIJ'))
    right_pd = pd.DataFrame(mask_small(rng.rand(20, 10)),
                            columns=list('ACDEGHIJKL'))
    row_series_pd = pd.Series(mask_small(rng.rand(20)))
    col_series_pd = pd.Series(mask_small(rng.rand(10)), index=right_pd.columns)

    # Frames created without an explicit chunk_size.
    left = DataFrame(left_pd)
    right = DataFrame(right_pd)

    for corr_kwargs in ({}, {'axis': 1}, {'method': 'kendall'}):
        got = left.corrwith(right, **corr_kwargs).execute().fetch()
        pd.testing.assert_series_equal(got, left_pd.corrwith(right_pd, **corr_kwargs))

    # Operands with explicit (and differing) chunk sizes.
    left = DataFrame(left_pd, chunk_size=4)
    right = DataFrame(right_pd, chunk_size=6)
    row_series = Series(row_series_pd, chunk_size=5)
    col_series = Series(col_series_pd, chunk_size=5)

    with pytest.raises(Exception):
        left.corrwith(right, method='kendall').execute()

    chunked_cases = (
        (right, right_pd, {}),
        (right, right_pd, {'axis': 1}),
        (row_series, row_series_pd, {}),
        (col_series, col_series_pd, {'axis': 1}),
    )
    for other, other_pd, corr_kwargs in chunked_cases:
        got = left.corrwith(other, **corr_kwargs).execute().fetch()
        want = left_pd.corrwith(other_pd, **corr_kwargs)
        pd.testing.assert_series_equal(got.sort_index(), want.sort_index())
def testSeriesQuantileExecution(self):
    """Compare ``Series.quantile`` with pandas for scalar, list and tensor ``q``."""
    source = pd.Series(np.random.rand(10), name='a')
    chunked = Series(source, chunk_size=3)

    def run(obj):
        # Execute through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(obj, concat=True)[0]

    # default q, scalar result
    got = run(chunked.quantile())
    self.assertEqual(got, source.quantile())

    # q is a list
    got = run(chunked.quantile([0.3, 0.7]))
    pd.testing.assert_series_equal(got, source.quantile([0.3, 0.7]))

    # test interpolation
    got = run(chunked.quantile([0.3, 0.7], interpolation='midpoint'))
    pd.testing.assert_series_equal(
        got, source.quantile([0.3, 0.7], interpolation='midpoint'))

    ctx, executor = self._create_test_context(self.executor)
    with ctx:
        # q is a tensor
        q_tensor = tensor([0.3, 0.7])
        lazy = chunked.quantile(q_tensor)
        got = executor.execute_dataframes([lazy])[0]
        pd.testing.assert_series_equal(got, source.quantile([0.3, 0.7]))
def test_series_quantile_execution(setup):
    """Compare ``Series.quantile`` with pandas for scalar, list and tensor ``q``."""
    source = pd.Series(np.random.rand(10), name='a')
    chunked = Series(source, chunk_size=3)

    # default q, scalar result
    got = chunked.quantile().execute().fetch()
    assert got == source.quantile()

    # q is a list, first with the default interpolation, then 'midpoint'
    for quantile_kwargs in ({}, {'interpolation': 'midpoint'}):
        got = chunked.quantile([0.3, 0.7], **quantile_kwargs).execute().fetch()
        want = source.quantile([0.3, 0.7], **quantile_kwargs)
        pd.testing.assert_series_equal(got, want)

    # q is a tensor
    q_tensor = tensor([0.3, 0.7])
    got = chunked.quantile(q_tensor).execute().fetch()
    pd.testing.assert_series_equal(got, source.quantile([0.3, 0.7]))
def testSeriesCorr(self):
    """Compare ``Series.corr`` and ``Series.autocorr`` with pandas.

    Series built without ``chunk_size`` are compared with ``assertEqual``;
    series built with a ``chunk_size`` use ``assertAlmostEqual`` and are
    expected to raise for ``method='kendall'``.
    """
    rng = np.random.RandomState(0)
    left_values = rng.rand(20)
    left_pd = pd.Series(np.where(left_values > 0.4, left_values, np.nan))
    right_values = rng.rand(20)
    right_pd = pd.Series(np.where(right_values > 0.4, right_values, np.nan))

    def run(obj):
        # Execute through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(obj, concat=True)[0]

    left = Series(left_pd)
    right = Series(right_pd)

    self.assertEqual(run(left.corr(right)), left_pd.corr(right_pd))
    self.assertEqual(run(left.corr(right, method='kendall')),
                     left_pd.corr(right_pd, method='kendall'))
    self.assertEqual(run(left.autocorr(2)), left_pd.autocorr(2))

    left = Series(left_pd, chunk_size=6)
    right = Series(right_pd, chunk_size=4)

    with self.assertRaises(Exception):
        # Called directly (no indexing of the result), as the error is the point.
        self.executor.execute_dataframe(
            left.corr(right, method='kendall'), concat=True)

    self.assertAlmostEqual(run(left.corr(right)), left_pd.corr(right_pd))
    self.assertAlmostEqual(run(left.corr(right, min_periods=7)),
                           left_pd.corr(right_pd, min_periods=7))
    self.assertAlmostEqual(run(left.autocorr(2)), left_pd.autocorr(2))
def testGPUExecution(self):
    """Compare sorting of ``to_gpu()`` frames and series with pandas.

    ``sort_values`` is checked once per ``PSRS_DISTINCT_COL`` setting
    (only ``'0'`` on Windows, ``'0'`` and ``'1'`` elsewhere); ``sort_index``
    is checked afterwards.  The environment variable is restored to its
    previous state when the test finishes so the setting does not leak
    into tests that run later in the same process.
    """
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
    previous_distinct = os.environ.get('PSRS_DISTINCT_COL')
    try:
        # test sort_values
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct

            # test dataframe
            raw = pd.DataFrame(np.random.rand(100, 10),
                               columns=['a' + str(i) for i in range(10)])
            mdf = DataFrame(raw, chunk_size=30).to_gpu()

            result = self.executor.execute_dataframe(
                mdf.sort_values(by='a0'), concat=True)[0]
            expected = raw.sort_values(by='a0')
            pd.testing.assert_frame_equal(result.to_pandas(), expected)

            # test series
            raw = pd.Series(np.random.rand(10))
            series = Series(raw).to_gpu()

            result = self.executor.execute_dataframe(
                series.sort_values(), concat=True)[0]
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result.to_pandas(), expected)

        # test DataFrame.sort_index
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))
        mdf = DataFrame(raw).to_gpu()

        result = self.executor.execute_dataframe(mdf.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result.to_pandas(), expected)

        # test Series.sort_index
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))
        series = Series(raw).to_gpu()

        result = self.executor.execute_dataframe(series.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result.to_pandas(), expected)
    finally:
        # Put the environment back exactly as it was before this test.
        if previous_distinct is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous_distinct
def testSeriesQuantileExecution(self):
    """Compare ``Series.quantile`` with pandas for scalar, list and tensor ``q``.

    The tensor case runs inside a ``LocalContext`` built around a minimal
    session object that only exposes this test's executor.
    """
    source = pd.Series(np.random.rand(10), name='a')
    chunked = Series(source, chunk_size=3)

    def run(obj):
        # Execute through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(obj, concat=True)[0]

    # default q, scalar result
    got = run(chunked.quantile())
    self.assertEqual(got, source.quantile())

    # q is a list
    got = run(chunked.quantile([0.3, 0.7]))
    pd.testing.assert_series_equal(got, source.quantile([0.3, 0.7]))

    # test interpolation
    got = run(chunked.quantile([0.3, 0.7], interpolation='midpoint'))
    pd.testing.assert_series_equal(
        got, source.quantile([0.3, 0.7], interpolation='midpoint'))

    test_case = self

    class MockSession:
        # Stand-in session: carries nothing but the test case's executor.
        def __init__(self):
            self.executor = test_case.executor

    ctx = LocalContext(MockSession())
    executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        # q is a tensor
        q_tensor = tensor([0.3, 0.7])
        lazy = chunked.quantile(q_tensor)
        got = executor.execute_dataframes([lazy])[0]
        pd.testing.assert_series_equal(got, source.quantile([0.3, 0.7]))
def testToDatetimeExecution(self):
    """Compare ``to_datetime`` with ``pd.to_datetime`` for several input kinds.

    Covers a scalar, a tensor of strings, a series, a year/month/day
    frame, an index, ``errors='ignore'``, ``unit`` and ``origin``.
    """
    def run(obj):
        # Execute through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(obj, concat=True)[0]

    # scalar
    got = run(to_datetime(1490195805, unit='s'))
    want = pd.to_datetime(1490195805, unit='s')
    self.assertEqual(pd.to_datetime(got.item()), want)

    # test list like
    date_strings = ['3/11/2000', '3/12/2000', '3/13/2000']
    string_tensor = tensor(date_strings, chunk_size=2)
    got = run(to_datetime(string_tensor, infer_datetime_format=True))
    want = pd.to_datetime(date_strings, infer_datetime_format=True)
    pd.testing.assert_index_equal(got, want)

    # test series
    strings_pd = pd.Series(date_strings)
    got = run(to_datetime(Series(strings_pd, chunk_size=2)))
    pd.testing.assert_series_equal(got, pd.to_datetime(strings_pd))

    # test DataFrame
    parts_pd = pd.DataFrame({
        'year': [2015, 2016],
        'month': [2, 3],
        'day': [4, 5]
    })
    got = run(to_datetime(DataFrame(parts_pd, chunk_size=(1, 2))))
    pd.testing.assert_series_equal(got, pd.to_datetime(parts_pd))

    # test Index
    index_pd = pd.Index([1, 2, 3])
    got = run(to_datetime(Index(index_pd, chunk_size=2)))
    pd.testing.assert_index_equal(got, pd.to_datetime(index_pd))

    # test raises == 'ignore'
    unparsable = ['13000101']
    got = run(to_datetime(unparsable, format='%Y%m%d', errors='ignore'))
    want = pd.to_datetime(unparsable, format='%Y%m%d', errors='ignore')
    pd.testing.assert_index_equal(got, want)

    # test unit
    got = run(to_datetime([1490195805], unit='s'))
    pd.testing.assert_index_equal(got, pd.to_datetime([1490195805], unit='s'))

    # test origin
    got = run(to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')))
    want = pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
    pd.testing.assert_index_equal(got, want)
def testDotExecution(self):
    """Compare ``dot`` (and ``@``) with pandas for frame, series and ndarray operands."""
    left_pd = pd.DataFrame(np.random.rand(4, 7))
    right_pd = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
    vec_a_pd = pd.Series(np.random.rand(7))
    vec_b_pd = pd.Series(np.random.rand(7))

    def run(obj):
        # Execute through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(obj, concat=True)[0]

    left = DataFrame(left_pd, chunk_size=(3, 2))
    right = DataFrame(right_pd, chunk_size=(3, 4))

    # df.dot(df)
    got = run(left.dot(right))
    pd.testing.assert_frame_equal(got, left_pd.dot(right_pd))

    # test @
    got = run(left @ right)
    pd.testing.assert_frame_equal(got, left_pd @ right_pd)

    vec_a = Series(vec_a_pd, chunk_size=5)

    # df.dot(series)
    got = run(left.dot(vec_a))
    pd.testing.assert_series_equal(got, left_pd.dot(vec_a_pd))

    # df.dot(2d_array)
    got = run(left.dot(right_pd.to_numpy()))
    pd.testing.assert_frame_equal(got, left_pd.dot(right_pd.to_numpy()))

    # df.dot(1d_array)
    got = run(left.dot(vec_a_pd.to_numpy()))
    pd.testing.assert_series_equal(got, left_pd.dot(vec_a_pd.to_numpy()))

    vec_b = Series(vec_b_pd, chunk_size=4)

    # series.dot(series)
    got = run(vec_a.dot(vec_b))
    self.assertAlmostEqual(got, vec_a_pd.dot(vec_b_pd))

    # series.dot(df)
    got = run(vec_a.dot(right))
    pd.testing.assert_series_equal(got, vec_a_pd.dot(right_pd))

    # series.dot(2d_array)
    got = run(vec_a.dot(right_pd.to_numpy()))
    np.testing.assert_almost_equal(got, vec_a_pd.dot(right_pd.to_numpy()))

    # series.dot(1d_array)
    got = run(vec_a.dot(vec_b_pd.to_numpy()))
    self.assertAlmostEqual(got, vec_a_pd.dot(vec_b_pd.to_numpy()))
def test_sort_values_execution(setup):
    """Compare ``sort_values`` with pandas under each ``PSRS_DISTINCT_COL`` setting.

    The full set of comparisons runs once per setting (only ``'0'`` on
    Windows, ``'0'`` and ``'1'`` elsewhere).  The environment variable is
    restored to its previous state afterwards so the setting does not
    leak into tests that run later in the same process.
    """
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
    previous_distinct = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            _check_sort_values_against_pandas()
    finally:
        # Put the environment back exactly as it was before this test.
        if previous_distinct is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous_distinct


def _check_sort_values_against_pandas():
    """Run every ``sort_values`` comparison against pandas once."""
    df = pd.DataFrame(np.random.rand(100, 10),
                      columns=['a' + str(i) for i in range(10)])

    # test one chunk
    mdf = DataFrame(df)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')
    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a6', 'a7'], ascending=False).execute().fetch()
    expected = df.sort_values(['a6', 'a7'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    # test psrs
    mdf = DataFrame(df, chunk_size=10)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')
    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a3', 'a4']).execute().fetch()
    expected = df.sort_values(['a3', 'a4'])
    pd.testing.assert_frame_equal(result, expected)

    # test ascending=False
    result = mdf.sort_values(['a0', 'a1'], ascending=False).execute().fetch()
    expected = df.sort_values(['a0', 'a1'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a7'], ascending=False).execute().fetch()
    expected = df.sort_values(['a7'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    # test multiindex
    df2 = df.copy(deep=True)
    df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    mdf = DataFrame(df2, chunk_size=10)

    result = mdf.sort_values([('A', 'C')]).execute().fetch()
    expected = df2.sort_values([('A', 'C')])
    pd.testing.assert_frame_equal(result, expected)

    # test rechunk
    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values('a0').execute().fetch()
    expected = df.sort_values('a0')
    pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a3', 'a4']).execute().fetch()
    expected = df.sort_values(['a3', 'a4'])
    pd.testing.assert_frame_equal(result, expected)

    # test other types
    raw = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        },
    )
    mdf = DataFrame(raw, chunk_size=3)

    for label in raw.columns:
        result = mdf.sort_values(label).execute().fetch()
        expected = raw.sort_values(label)
        pd.testing.assert_frame_equal(result, expected)

    result = mdf.sort_values(['a', 'b', 'e'], ascending=False).execute().fetch()
    expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
    pd.testing.assert_frame_equal(result, expected)

    # test nan
    df = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    mdf = DataFrame(df)
    result = mdf.sort_values(['col2']).execute().fetch()
    expected = df.sort_values(['col2'])
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values(['col2']).execute().fetch()
    expected = df.sort_values(['col2'])
    pd.testing.assert_frame_equal(result, expected)

    # test None (issue #1885)
    df = pd.DataFrame(np.random.rand(1000, 10))
    # Build the 'A'/None column in one explicit assignment.  The previous
    # chained form ``df[0][mask] = ...`` writes to a temporary under pandas
    # Copy-on-Write and would leave ``df`` without any None values.
    df[0] = pd.Series(np.where(df[0] < 0.5, 'A', None),
                      index=df.index, dtype=object)
    mdf = DataFrame(df)
    result = mdf.sort_values([0, 1]).execute().fetch()
    expected = df.sort_values([0, 1])
    pd.testing.assert_frame_equal(result, expected)

    mdf = DataFrame(df, chunk_size=100)
    result = mdf.sort_values([0, 1]).execute().fetch()
    expected = df.sort_values([0, 1])
    pd.testing.assert_frame_equal(result, expected)

    # test ignore_index
    df = pd.DataFrame(np.random.rand(10, 3),
                      columns=['a' + str(i) for i in range(3)])
    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values(['a0', 'a1'], ignore_index=True).execute().fetch()
    try:  # for python3.5
        expected = df.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        # ``ignore_index`` was rejected: emulate it by resetting the index.
        expected = df.sort_values(['a0', 'a1'])
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # test inplace
    mdf = DataFrame(df)
    mdf.sort_values('a0', inplace=True)
    result = mdf.execute().fetch()
    df.sort_values('a0', inplace=True)
    pd.testing.assert_frame_equal(result, df)

    # test unknown shape
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['a'] > 2]
    result = filtered.sort_values(by='b').execute().fetch()
    pd.testing.assert_frame_equal(result, df[df['a'] > 2].sort_values(by='b'))

    # test empty dataframe
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['b'] > 100]
    result = filtered.sort_values(by='b').execute().fetch()
    pd.testing.assert_frame_equal(result, df[df['b'] > 100].sort_values(by='b'))

    # test chunks with zero length
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    df.iloc[4:8, 1] = 0
    mdf = DataFrame(df, chunk_size=4)
    filtered = mdf[mdf['b'] != 0]
    result = filtered.sort_values(by='b').execute().fetch()
    pd.testing.assert_frame_equal(result, df[df['b'] != 0].sort_values(by='b'))

    # test Series.sort_values
    raw = pd.Series(np.random.rand(10))
    series = Series(raw)
    result = series.sort_values().execute().fetch()
    expected = raw.sort_values()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=3)
    result = series.sort_values().execute().fetch()
    expected = raw.sort_values()
    pd.testing.assert_series_equal(result, expected)

    series = Series(raw, chunk_size=2)
    result = series.sort_values(ascending=False).execute().fetch()
    expected = raw.sort_values(ascending=False)
    pd.testing.assert_series_equal(result, expected)

    # test empty series
    series = pd.Series(list(range(10)), name='a')
    mseries = Series(series, chunk_size=4)
    filtered = mseries[mseries > 100]
    result = filtered.sort_values().execute().fetch()
    pd.testing.assert_series_equal(result, series[series > 100].sort_values())

    # test series with None
    # First 500 entries are 'A', the rest None; built directly as object
    # dtype instead of assigning strings into an integer series.
    series = pd.Series(np.where(np.arange(1000, ) < 500, 'A', None), dtype=object)
    mseries = Series(series, chunk_size=100)
    result = mseries.sort_values().execute().fetch()
    expected = series.sort_values()
    pd.testing.assert_series_equal(result.reset_index(drop=True),
                                   expected.reset_index(drop=True))
def test_sort_index_execution(setup):
    """Compare ``sort_index`` with pandas for frames (both axes) and series."""
    def fetch(obj):
        return obj.execute().fetch()

    frame = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

    got = fetch(DataFrame(frame).sort_index())
    pd.testing.assert_frame_equal(got, frame.sort_index())

    # inplace variant
    inplace_df = DataFrame(frame)
    inplace_df.sort_index(inplace=True)
    got = fetch(inplace_df)
    pd.testing.assert_frame_equal(got, frame.sort_index())

    got = fetch(DataFrame(frame, chunk_size=30).sort_index())
    pd.testing.assert_frame_equal(got, frame.sort_index())

    got = fetch(DataFrame(frame, chunk_size=20).sort_index(ascending=False))
    pd.testing.assert_frame_equal(got, frame.sort_index(ascending=False))

    got = fetch(DataFrame(frame, chunk_size=10).sort_index(ignore_index=True))
    try:  # for python3.5
        want = frame.sort_index(ignore_index=True)
    except TypeError:
        # ``ignore_index`` was rejected: emulate it by resetting the index.
        want = frame.sort_index()
        want.index = pd.RangeIndex(len(want))
    pd.testing.assert_frame_equal(got, want)

    # test axis=1
    frame = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

    got = fetch(DataFrame(frame).sort_index(axis=1))
    pd.testing.assert_frame_equal(got, frame.sort_index(axis=1))

    got = fetch(DataFrame(frame, chunk_size=3).sort_index(axis=1))
    pd.testing.assert_frame_equal(got, frame.sort_index(axis=1))

    got = fetch(DataFrame(frame, chunk_size=4).sort_index(axis=1, ascending=False))
    pd.testing.assert_frame_equal(got, frame.sort_index(axis=1, ascending=False))

    got = fetch(DataFrame(frame, chunk_size=4).sort_index(axis=1, ignore_index=True))
    try:  # for python3.5
        want = frame.sort_index(axis=1, ignore_index=True)
    except TypeError:
        want = frame.sort_index(axis=1)
        want.index = pd.RangeIndex(len(want))
    pd.testing.assert_frame_equal(got, want)

    # test series
    values = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

    got = fetch(Series(values).sort_index())
    pd.testing.assert_series_equal(got, values.sort_index())

    got = fetch(Series(values, chunk_size=2).sort_index())
    pd.testing.assert_series_equal(got, values.sort_index())

    got = fetch(Series(values, chunk_size=3).sort_index(ascending=False))
    pd.testing.assert_series_equal(got, values.sort_index(ascending=False))
def testSortValuesExecution(self):
    """Compare ``sort_values`` with pandas for frames and series.

    Covers single- and multi-chunk frames, descending order, mixed column
    types, NaN values, ``ignore_index``, ``inplace``, a filtered frame and
    plain series.
    """
    def run(obj):
        # Execute through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(obj, concat=True)[0]

    def check_frame(mars_df, pandas_df, *args, **kwargs):
        # Same sort_values call on both sides, results must match.
        got = run(mars_df.sort_values(*args, **kwargs))
        pd.testing.assert_frame_equal(got, pandas_df.sort_values(*args, **kwargs))

    frame = pd.DataFrame(np.random.rand(100, 10),
                         columns=['a' + str(i) for i in range(10)])

    # test one chunk
    single = DataFrame(frame)
    check_frame(single, frame, 'a0')
    check_frame(single, frame, ['a6', 'a7'], ascending=False)

    # test psrs
    by_ten = DataFrame(frame, chunk_size=10)
    check_frame(by_ten, frame, 'a0')
    check_frame(by_ten, frame, ['a3', 'a4'])

    # test ascending=False
    check_frame(by_ten, frame, ['a0', 'a1'], ascending=False)
    check_frame(by_ten, frame, ['a7'], ascending=False)

    # test rechunk
    by_three = DataFrame(frame, chunk_size=3)
    check_frame(by_three, frame, 'a0')
    check_frame(by_three, frame, ['a3', 'a4'])

    # test other types
    mixed = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
        },
    )
    mixed_mars = DataFrame(mixed, chunk_size=3)
    for label in mixed.columns:
        check_frame(mixed_mars, mixed, label)
    check_frame(mixed_mars, mixed, ['a', 'b', 'e'], ascending=False)

    # test nan
    with_nan = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    check_frame(DataFrame(with_nan), with_nan, ['col2'])
    check_frame(DataFrame(with_nan, chunk_size=3), with_nan, ['col2'])

    # test ignore_index
    executor = ExecutorForTest(storage=new_session().context)

    small = pd.DataFrame(np.random.rand(10, 3),
                         columns=['a' + str(i) for i in range(3)])
    small_mars = DataFrame(small, chunk_size=3)
    got = executor.execute_dataframe(
        small_mars.sort_values(['a0', 'a1'], ignore_index=True), concat=True)[0]
    try:  # for python3.5
        want = small.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        # ``ignore_index`` was rejected: emulate it by resetting the index.
        want = small.sort_values(['a0', 'a1'])
        want.index = pd.RangeIndex(len(want))
    pd.testing.assert_frame_equal(got, want)

    # test inplace
    inplace_mars = DataFrame(small)
    inplace_mars.sort_values('a0', inplace=True)
    got = run(inplace_mars)
    small.sort_values('a0', inplace=True)
    pd.testing.assert_frame_equal(got, small)

    # test unknown shape
    shaped = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    shaped_mars = DataFrame(shaped, chunk_size=4)
    filtered = shaped_mars[shaped_mars['a'] > 2]
    got = run(filtered.sort_values(by='b'))
    pd.testing.assert_frame_equal(got, shaped[shaped['a'] > 2].sort_values(by='b'))

    # test Series.sort_values
    values = pd.Series(np.random.rand(10))

    got = run(Series(values).sort_values())
    pd.testing.assert_series_equal(got, values.sort_values())

    got = run(Series(values, chunk_size=3).sort_values())
    pd.testing.assert_series_equal(got, values.sort_values())

    got = run(Series(values, chunk_size=2).sort_values(ascending=False))
    pd.testing.assert_series_equal(got, values.sort_values(ascending=False))
def test_dot_execution(setup):
    """Compare ``dot`` (and ``@``) with pandas for frame, series and ndarray operands."""
    def fetch(obj):
        return obj.execute().fetch()

    left_pd = pd.DataFrame(np.random.rand(4, 7))
    right_pd = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
    vec_a_pd = pd.Series(np.random.rand(7))
    vec_b_pd = pd.Series(np.random.rand(7))

    left = DataFrame(left_pd, chunk_size=(3, 2))
    right = DataFrame(right_pd, chunk_size=(3, 4))

    # df.dot(df)
    got = fetch(left.dot(right))
    pd.testing.assert_frame_equal(got, left_pd.dot(right_pd))

    # test @
    got = fetch(left @ right)
    pd.testing.assert_frame_equal(got, left_pd @ right_pd)

    vec_a = Series(vec_a_pd, chunk_size=5)

    # df.dot(series)
    got = fetch(left.dot(vec_a))
    pd.testing.assert_series_equal(got, left_pd.dot(vec_a_pd))

    # df.dot(2d_array)
    got = fetch(left.dot(right_pd.to_numpy()))
    pd.testing.assert_frame_equal(got, left_pd.dot(right_pd.to_numpy()))

    # df.dot(1d_array)
    got = fetch(left.dot(vec_a_pd.to_numpy()))
    pd.testing.assert_series_equal(got, left_pd.dot(vec_a_pd.to_numpy()))

    vec_b = Series(vec_b_pd, chunk_size=4)

    # series.dot(series)
    got = fetch(vec_a.dot(vec_b))
    want = vec_a_pd.dot(vec_b_pd)
    assert pytest.approx(got) == want

    # series.dot(df)
    got = fetch(vec_a.dot(right))
    pd.testing.assert_series_equal(got, vec_a_pd.dot(right_pd))

    # series.dot(2d_array)
    got = fetch(vec_a.dot(right_pd.to_numpy()))
    np.testing.assert_almost_equal(got, vec_a_pd.dot(right_pd.to_numpy()))

    # series.dot(1d_array)
    got = fetch(vec_a.dot(vec_b_pd.to_numpy()))
    want = vec_a_pd.dot(vec_b_pd.to_numpy())
    assert pytest.approx(got) == want
def testSortIndexExecution(self):
    """Compare ``sort_index`` with pandas for frames (both axes) and series.

    The ``ignore_index`` cases run on a separate ``ExecutorForTest`` built
    from a new session's context; all others use ``self.executor``.
    """
    def run(obj):
        # Execute through the test executor and return the concatenated result.
        return self.executor.execute_dataframe(obj, concat=True)[0]

    frame = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

    got = run(DataFrame(frame).sort_index())
    pd.testing.assert_frame_equal(got, frame.sort_index())

    # inplace variant
    inplace_df = DataFrame(frame)
    inplace_df.sort_index(inplace=True)
    got = run(inplace_df)
    pd.testing.assert_frame_equal(got, frame.sort_index())

    got = run(DataFrame(frame, chunk_size=30).sort_index())
    pd.testing.assert_frame_equal(got, frame.sort_index())

    got = run(DataFrame(frame, chunk_size=20).sort_index(ascending=False))
    pd.testing.assert_frame_equal(got, frame.sort_index(ascending=False))

    executor = ExecutorForTest(storage=new_session().context)

    by_ten = DataFrame(frame, chunk_size=10)
    got = executor.execute_dataframe(
        by_ten.sort_index(ignore_index=True), concat=True)[0]
    try:  # for python3.5
        want = frame.sort_index(ignore_index=True)
    except TypeError:
        # ``ignore_index`` was rejected: emulate it by resetting the index.
        want = frame.sort_index()
        want.index = pd.RangeIndex(len(want))
    pd.testing.assert_frame_equal(got, want)

    # test axis=1
    frame = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

    got = run(DataFrame(frame).sort_index(axis=1))
    pd.testing.assert_frame_equal(got, frame.sort_index(axis=1))

    got = run(DataFrame(frame, chunk_size=3).sort_index(axis=1))
    pd.testing.assert_frame_equal(got, frame.sort_index(axis=1))

    got = run(DataFrame(frame, chunk_size=4).sort_index(axis=1, ascending=False))
    pd.testing.assert_frame_equal(got, frame.sort_index(axis=1, ascending=False))

    by_four = DataFrame(frame, chunk_size=4)

    executor = ExecutorForTest(storage=new_session().context)

    got = executor.execute_dataframe(
        by_four.sort_index(axis=1, ignore_index=True), concat=True)[0]
    try:  # for python3.5
        want = frame.sort_index(axis=1, ignore_index=True)
    except TypeError:
        want = frame.sort_index(axis=1)
        want.index = pd.RangeIndex(len(want))
    pd.testing.assert_frame_equal(got, want)

    # test series
    values = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

    got = run(Series(values).sort_index())
    pd.testing.assert_series_equal(got, values.sort_index())

    got = run(Series(values, chunk_size=2).sort_index())
    pd.testing.assert_series_equal(got, values.sort_index())

    got = run(Series(values, chunk_size=3).sort_index(ascending=False))
    pd.testing.assert_series_equal(got, values.sort_index(ascending=False))
def test_to_datetime_execution(setup):
    """Compare ``to_datetime`` with ``pd.to_datetime`` for several input kinds.

    Covers a scalar, a tensor of strings, a series, a year/month/day
    frame, an index, ``errors='ignore'``, ``unit`` and ``origin``.
    """
    def fetch(obj):
        return obj.execute().fetch()

    # scalar
    lazy = to_datetime(1490195805, unit='s')
    got = lazy.execute().fetch(extra_config={
        'check_dtypes': False,
        'check_shape': False
    })
    want = pd.to_datetime(1490195805, unit='s')
    assert pd.to_datetime(got) == want

    # test list like
    date_strings = ['3/11/2000', '3/12/2000', '3/13/2000']
    string_tensor = tensor(date_strings, chunk_size=2)
    got = fetch(to_datetime(string_tensor, infer_datetime_format=True))
    want = pd.to_datetime(date_strings, infer_datetime_format=True)
    pd.testing.assert_index_equal(got, want)

    # test series
    strings_pd = pd.Series(date_strings)
    got = fetch(to_datetime(Series(strings_pd, chunk_size=2)))
    pd.testing.assert_series_equal(got, pd.to_datetime(strings_pd))

    # test DataFrame
    parts_pd = pd.DataFrame({
        'year': [2015, 2016],
        'month': [2, 3],
        'day': [4, 5]
    })
    got = fetch(to_datetime(DataFrame(parts_pd, chunk_size=(1, 2))))
    pd.testing.assert_series_equal(got, pd.to_datetime(parts_pd))

    # test Index
    index_pd = pd.Index([1, 2, 3])
    got = fetch(to_datetime(Index(index_pd, chunk_size=2)))
    pd.testing.assert_index_equal(got, pd.to_datetime(index_pd))

    # test raises == 'ignore'
    unparsable = ['13000101']
    got = fetch(to_datetime(unparsable, format='%Y%m%d', errors='ignore'))
    want = pd.to_datetime(unparsable, format='%Y%m%d', errors='ignore')
    pd.testing.assert_index_equal(got, want)

    # test unit
    got = fetch(to_datetime([1490195805], unit='s'))
    pd.testing.assert_index_equal(got, pd.to_datetime([1490195805], unit='s'))

    # test origin
    got = fetch(to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')))
    want = pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
    pd.testing.assert_index_equal(got, want)
def testSortValuesExecution(self):
    """Compare executed ``sort_values`` results with pandas.

    Every scenario runs once per ``PSRS_DISTINCT_COL`` setting: only ``'0'``
    when ``sys.platform`` starts with ``win``, otherwise ``'0'`` and ``'1'``.
    The environment variable is put back to its previous state (or removed
    if it was unset) when the test finishes, so the setting does not leak
    into whatever runs next in the same process.
    """
    env_key = 'PSRS_DISTINCT_COL'
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
    # None means the variable was not set before this test started.
    previous_value = os.environ.get(env_key)
    try:
        for add_distinct in distinct_opts:
            os.environ[env_key] = add_distinct
            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a6', 'a7'], ascending=False), concat=True)[0]
            expected = df.sort_values(['a6', 'a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']), concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ascending=False), concat=True)[0]
            expected = df.sort_values(['a0', 'a1'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a7'], ascending=False), concat=True)[0]
            expected = df.sort_values(['a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)
            result = self.executor.execute_dataframe(
                mdf.sort_values([('A', 'C')]), concat=True)[0]
            expected = df2.sort_values([('A', 'C')])
            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']), concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test other types: float, int, bytes, Timestamp and Timedelta columns
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                },
            )
            mdf = DataFrame(raw, chunk_size=3)
            for label in raw.columns:
                result = self.executor.execute_dataframe(
                    mdf.sort_values(label), concat=True)[0]
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a', 'b', 'e'], ascending=False), concat=True)[0]
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(
                mdf.sort_values(['col2']), concat=True)[0]
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(
                mdf.sort_values(['col2']), concat=True)[0]
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index (this section runs on its own executor)
            executor = ExecutorForTest(storage=new_session().context)
            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])
            mdf = DataFrame(df, chunk_size=3)
            result = executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ignore_index=True), concat=True)[0]
            try:  # for python3.5
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                # sort_values rejected ignore_index; rebuild the index by hand.
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))
            pd.testing.assert_frame_equal(result, expected)

            # test inplace
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = self.executor.execute_dataframe(mdf, concat=True)[0]
            df.sort_values('a0', inplace=True)
            pd.testing.assert_frame_equal(result, df)

            # test unknown shape: sort the output of a boolean filter
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['a'] > 2]
            result = self.executor.execute_dataframe(
                filtered.sort_values(by='b'), concat=True)[0]
            pd.testing.assert_frame_equal(
                result, df[df['a'] > 2].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = self.executor.execute_dataframe(
                series.sort_values(ascending=False), concat=True)[0]
            expected = raw.sort_values(ascending=False)
            pd.testing.assert_series_equal(result, expected)
    finally:
        # Undo the override even when an assertion above fails.
        if previous_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_value
def testDataFrameCorrWith(self):
    """Compare ``DataFrame.corrwith`` run through the executor with pandas.

    Single-chunk inputs are checked for the default call, ``axis=1`` and
    ``method='kendall'``.  Inputs built with ``chunk_size`` are expected to
    raise for ``method='kendall'``; the remaining chunked cases are compared
    after ``sort_index()`` on both sides.
    """
    def with_gaps(values):
        # Keep entries above 0.4 and replace the rest with NaN.
        return np.where(values > 0.4, values, np.nan)

    def run(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # The draw order on the shared RandomState fixes the generated data.
    rng = np.random.RandomState(0)
    left_pd = pd.DataFrame(with_gaps(rng.rand(20, 10)),
                           columns=list('ABCDEFGHIJ'))
    right_pd = pd.DataFrame(with_gaps(rng.rand(20, 10)),
                            columns=list('ACDEGHIJKL'))
    by_row_pd = pd.Series(with_gaps(rng.rand(20)))
    by_col_pd = pd.Series(with_gaps(rng.rand(10)), index=right_pd.columns)

    # single-chunk inputs
    left = DataFrame(left_pd)
    right = DataFrame(right_pd)
    for options in ({}, {'axis': 1}, {'method': 'kendall'}):
        pd.testing.assert_series_equal(
            run(left.corrwith(right, **options)),
            left_pd.corrwith(right_pd, **options))

    # inputs built with chunk_size
    left = DataFrame(left_pd, chunk_size=4)
    right = DataFrame(right_pd, chunk_size=6)
    by_row = Series(by_row_pd, chunk_size=5)
    by_col = Series(by_col_pd, chunk_size=5)

    with self.assertRaises(Exception):
        self.executor.execute_dataframe(
            left.corrwith(right, method='kendall'), concat=True)

    chunked_cases = [
        (right, right_pd, {}),
        (right, right_pd, {'axis': 1}),
        (by_row, by_row_pd, {}),
        (by_col, by_col_pd, {'axis': 1}),
    ]
    for other, other_pd, options in chunked_cases:
        pd.testing.assert_series_equal(
            run(left.corrwith(other, **options)).sort_index(),
            left_pd.corrwith(other_pd, **options).sort_index())