def testWithGivenSession(session):
    """Round-trip a chunked DataFrame through vineyard using ``session``.

    The frame is stored with ``to_vineyard``, loaded again with
    ``md.from_vineyard``, and both are compared after their indexes are
    reset.
    """
    vineyard_options = {'vineyard.socket': '/tmp/vineyard/vineyard.sock'}
    with option_context(vineyard_options):
        source = pd.DataFrame(np.arange(12).reshape(3, 4),
                              columns=['a', 'b', 'c', 'd'])
        written = DataFrame(source, chunk_size=2)

        # the fetched value of to_vineyard() is what from_vineyard() consumes
        stored_id = written.to_vineyard().execute(session=session).fetch()
        restored = md.from_vineyard(stored_id)

        written_value = written.execute(session=session).fetch()
        restored_value = restored.execute(session=session).fetch()
        pd.testing.assert_frame_equal(
            written_value.reset_index(drop=True),
            restored_value.reset_index(drop=True))
def testArrowStringSortValues(self):
    """Sort a chunked frame by an ``ArrowStringDtype`` column and compare to pandas."""
    rng = np.random.RandomState(0)
    pdf = pd.DataFrame({
        'a': rng.rand(10),
        'b': [f's{rng.randint(1000)}' for _ in range(10)]
    })
    # convert the string column before wrapping the frame
    pdf['b'] = pdf['b'].astype(ArrowStringDtype())

    sorted_mdf = DataFrame(pdf, chunk_size=3).sort_values(by='b')
    actual = self.executor.execute_dataframe(sorted_mdf, concat=True)[0]

    pd.testing.assert_frame_equal(actual, pdf.sort_values(by='b'))
def testToParquetFastParquetExecution(self):
    """Execute ``to_parquet`` with the fastparquet engine and gzip compression.

    Only the write is executed; the produced files are not read back.
    """
    columns = {
        'col1': np.random.rand(100),
        'col2': np.arange(100),
        'col3': np.random.choice(['a', 'b', 'c'], (100,)),
    }
    mdf = DataFrame(pd.DataFrame(columns), chunk_size=33)

    with tempfile.TemporaryDirectory() as out_dir:
        # test fastparquet
        # NOTE(review): the '*' is presumably expanded per chunk -- confirm
        pattern = os.path.join(out_dir, 'out-fastparquet-*.parquet')
        write_op = mdf.to_parquet(pattern, engine='fastparquet',
                                  compression='gzip')
        self.executor.execute_dataframe(write_op)
def run_with_given_session(session, **kw):
    """Round-trip a chunked DataFrame through vineyard using ``session``.

    The socket path is read from ``VINEYARD_IPC_SOCKET`` and falls back to
    ``/tmp/vineyard/vineyard.sock``.  Extra keyword arguments are forwarded
    to every ``execute`` call.
    """
    socket_path = os.environ.get('VINEYARD_IPC_SOCKET',
                                 '/tmp/vineyard/vineyard.sock')

    def materialize(tileable):
        # execute and fetch against the caller-supplied session
        return tileable.execute(session=session, **kw).fetch(session=session)

    with option_context({'vineyard.socket': socket_path}):
        source = pd.DataFrame(np.arange(12).reshape(3, 4),
                              columns=['a', 'b', 'c', 'd'])
        written = DataFrame(source, chunk_size=2)

        stored_id = materialize(written.to_vineyard())
        restored = md.from_vineyard(stored_id)

        written_value = materialize(written)
        restored_value = materialize(restored)
        pd.testing.assert_frame_equal(
            written_value.reset_index(drop=True),
            restored_value.reset_index(drop=True))
def test_to_csv():
    """Check the tiled chunk graph of ``to_csv`` for wildcard and single-file paths."""
    mdf = DataFrame(pd.DataFrame(np.random.rand(10, 5)), chunk_size=4)

    # wildcard path: every output chunk consumes its source chunk directly
    tiled = tile(mdf.to_csv('*.csv'))
    assert tiled.chunk_shape[1] == 1
    for pos, chunk in enumerate(tiled.chunks):
        assert type(chunk.op).__name__ == 'DataFrameToCSV'
        assert chunk.inputs[0] is tiled.inputs[0].chunks[pos].data

    # test one file
    # each output chunk has two inputs; the second one comes from a
    # DataFrameToCSVStat operand
    tiled = tile(mdf.to_csv('out.csv'))
    assert tiled.chunk_shape[1] == 1
    for pos, chunk in enumerate(tiled.chunks):
        assert len(chunk.inputs) == 2
        upstream = chunk.inputs[0]
        assert upstream.inputs[0] is tiled.inputs[0].chunks[pos].data
        assert type(chunk.inputs[1].op).__name__ == 'DataFrameToCSVStat'
def testToParquetArrowExecution(self):
    """Exercise ``to_parquet`` / ``read_parquet`` round trips.

    Covers a wildcard output path, re-writing a frame obtained from
    ``read_parquet``, and output partitioned by ``col3``.
    """
    pdf = pd.DataFrame({
        'col1': np.random.rand(100),
        'col2': np.arange(100),
        'col3': np.random.choice(['a', 'b', 'c'], (100, )),
    })
    mdf = DataFrame(pdf, chunk_size=33)

    def run(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    with tempfile.TemporaryDirectory() as out_dir:
        # DATAFRAME TESTS
        pattern = os.path.join(out_dir, 'out-*.parquet')
        self.executor.execute_dataframe(mdf.to_parquet(pattern))

        # the written files are read back and verified twice in a row
        for _ in range(2):
            loaded = run(md.read_parquet(pattern)).sort_index()
            pd.testing.assert_frame_equal(loaded, pdf)

        # test read_parquet then to_parquet
        rewrite_op = md.read_parquet(pattern).to_parquet(pattern)
        self.executor.execute_dataframes([rewrite_op])

        # test partition_cols
        partitioned_path = os.path.join(out_dir, 'out-partitioned')
        self.executor.execute_dataframe(
            mdf.to_parquet(partitioned_path, partition_cols=['col3']))

        loaded = run(md.read_parquet(partitioned_path))
        # cast the partition column back to object before comparing
        loaded['col3'] = loaded['col3'].astype('object')
        pd.testing.assert_frame_equal(
            loaded.sort_values('col1').reset_index(drop=True),
            pdf.sort_values('col1').reset_index(drop=True))
def testToSQL(self):
    """Write a DataFrame and a Series to SQLite with ``to_sql`` and read them back.

    Also checks that writing to an already existing table raises
    ``ValueError``.  The engine is disposed before the temporary directory
    is removed so no pooled connection keeps the database file open.
    """
    index = pd.RangeIndex(100, 0, -1, name='index')
    raw = pd.DataFrame(
        {
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100, )),
            'col3': np.arange(100).astype('int64'),
        },
        index=index)

    with tempfile.TemporaryDirectory() as d:
        table_name1 = 'test_table'
        table_name2 = 'test_table2'
        uri = 'sqlite:///' + os.path.join(d, 'test.db')

        engine = sqlalchemy.create_engine(uri)
        try:
            # test write dataframe
            df = DataFrame(raw, chunk_size=33)
            r = df.to_sql(table_name1, con=engine)
            self.executor.execute_dataframe(r)

            # the index counts down, so sort descending to match ``raw``
            written = pd.read_sql(table_name1, con=engine, index_col='index') \
                .sort_index(ascending=False)
            pd.testing.assert_frame_equal(raw, written)

            # test write with existing table
            with self.assertRaises(ValueError):
                df.to_sql(table_name1, con=uri).execute()

            # test write series
            series = md.Series(raw.col1, chunk_size=33)
            with engine.connect() as conn:
                r = series.to_sql(table_name2, con=conn)
                self.executor.execute_dataframe(r)

            written = pd.read_sql(table_name2, con=engine, index_col='index') \
                .sort_index(ascending=False)
            pd.testing.assert_frame_equal(raw.col1.to_frame(), written)
        finally:
            # close pooled connections before the directory cleanup runs
            engine.dispose()
def testToCSVExecution(self):
    """Write a chunked frame to CSV (one file, then one per chunk) and verify it."""
    pdf = pd.DataFrame(
        {
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100, )),
            'col3': np.arange(100)
        },
        index=pd.RangeIndex(100, 0, -1, name='index'))
    mdf = DataFrame(pdf, chunk_size=33)

    with tempfile.TemporaryDirectory() as out_dir:

        def load(file_name):
            # read one CSV back using the source frame's column dtypes
            return pd.read_csv(os.path.join(out_dir, file_name),
                               dtype=pdf.dtypes.to_dict())

        # test one file
        self.executor.execute_dataframe(
            mdf.to_csv(os.path.join(out_dir, 'out.csv')))
        single = load('out.csv').set_index('index')
        pd.testing.assert_frame_equal(single, pdf)

        # test multi files
        self.executor.execute_dataframe(
            mdf.to_csv(os.path.join(out_dir, 'out-*.csv')))
        # 100 rows at chunk_size=33 are read back from four numbered files
        parts = [load('out-{}.csv'.format(i)) for i in range(4)]
        combined = pd.concat(parts, axis=0).set_index('index')
        pd.testing.assert_frame_equal(combined, pdf)
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      pdf.iloc[33:66])
def testGPUExecution(self):
    """Run ``sort_values`` / ``sort_index`` on GPU-backed frames and series.

    Results are converted with ``to_pandas()`` and compared with pandas.
    ``PSRS_DISTINCT_COL`` is set to each tested option in turn and put back
    to its previous state when the test finishes, so the setting does not
    leak into other tests.
    """
    # test sort_values
    # only '0' is exercised on Windows
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
        '0', '1'
    ]

    def run(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    previous = os.environ.get('PSRS_DISTINCT_COL')
    try:
        # NOTE(review): every check below runs once per option; confirm
        # whether the sort_index checks were meant to sit outside the loop.
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct

            # test dataframe
            raw = pd.DataFrame(np.random.rand(100, 10),
                               columns=['a' + str(i) for i in range(10)])
            mdf = DataFrame(raw, chunk_size=30).to_gpu()

            result = run(mdf.sort_values(by='a0'))
            expected = raw.sort_values(by='a0')
            pd.testing.assert_frame_equal(result.to_pandas(), expected)

            # test series
            raw = pd.Series(np.random.rand(10))
            series = Series(raw).to_gpu()

            result = run(series.sort_values())
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result.to_pandas(), expected)

            # test DataFrame.sort_index
            raw = pd.DataFrame(np.random.rand(10, 10),
                               columns=np.random.rand(10))
            mdf = DataFrame(raw).to_gpu()

            result = run(mdf.sort_index())
            expected = raw.sort_index()
            pd.testing.assert_frame_equal(result.to_pandas(), expected)

            # test Series.sort_index
            raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))
            series = Series(raw).to_gpu()

            result = run(series.sort_index())
            expected = raw.sort_index()
            pd.testing.assert_series_equal(result.to_pandas(), expected)
    finally:
        # restore the variable exactly as it was found
        if previous is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous
def testToDatetimeExecution(self):
    """Compare ``to_datetime`` with ``pd.to_datetime`` for several input kinds."""

    def run(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # scalar
    actual = run(to_datetime(1490195805, unit='s'))
    wanted = pd.to_datetime(1490195805, unit='s')
    self.assertEqual(pd.to_datetime(actual.item()), wanted)

    # test list like
    date_strings = ['3/11/2000', '3/12/2000', '3/13/2000']
    chunked = tensor(date_strings, chunk_size=2)
    actual = run(to_datetime(chunked, infer_datetime_format=True))
    wanted = pd.to_datetime(date_strings, infer_datetime_format=True)
    pd.testing.assert_index_equal(actual, wanted)

    # test series
    pd_series = pd.Series(date_strings)
    actual = run(to_datetime(Series(pd_series, chunk_size=2)))
    pd.testing.assert_series_equal(actual, pd.to_datetime(pd_series))

    # test DataFrame
    parts = pd.DataFrame({
        'year': [2015, 2016],
        'month': [2, 3],
        'day': [4, 5]
    })
    actual = run(to_datetime(DataFrame(parts, chunk_size=(1, 2))))
    pd.testing.assert_series_equal(actual, pd.to_datetime(parts))

    # test Index
    pd_index = pd.Index([1, 2, 3])
    actual = run(to_datetime(Index(pd_index, chunk_size=2)))
    pd.testing.assert_index_equal(actual, pd.to_datetime(pd_index))

    # test raises == 'ignore'
    unparsable = ['13000101']
    actual = run(to_datetime(unparsable, format='%Y%m%d', errors='ignore'))
    wanted = pd.to_datetime(unparsable, format='%Y%m%d', errors='ignore')
    pd.testing.assert_index_equal(actual, wanted)

    # test unit
    actual = run(to_datetime([1490195805], unit='s'))
    wanted = pd.to_datetime([1490195805], unit='s')
    pd.testing.assert_index_equal(actual, wanted)

    # test origin
    actual = run(to_datetime([1, 2, 3], unit='D',
                             origin=pd.Timestamp('1960-01-01')))
    wanted = pd.to_datetime([1, 2, 3], unit='D',
                            origin=pd.Timestamp('1960-01-01'))
    pd.testing.assert_index_equal(actual, wanted)
def testDotExecution(self):
    """Compare ``dot`` / ``@`` on chunked frames and series with pandas."""
    left_pdf = pd.DataFrame(np.random.rand(4, 7))
    right_pdf = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
    vec1 = pd.Series(np.random.rand(7))
    vec2 = pd.Series(np.random.rand(7))

    left = DataFrame(left_pdf, chunk_size=(3, 2))
    right = DataFrame(right_pdf, chunk_size=(3, 4))

    def run(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # df.dot(df)
    actual = run(left.dot(right))
    pd.testing.assert_frame_equal(actual, left_pdf.dot(right_pdf))

    # test @
    actual = run(left @ right)
    pd.testing.assert_frame_equal(actual, left_pdf @ right_pdf)

    mvec1 = Series(vec1, chunk_size=5)

    # df.dot(series)
    actual = run(left.dot(mvec1))
    pd.testing.assert_series_equal(actual, left_pdf.dot(vec1))

    # df.dot(2d_array)
    actual = run(left.dot(right_pdf.to_numpy()))
    pd.testing.assert_frame_equal(actual, left_pdf.dot(right_pdf.to_numpy()))

    # df.dot(1d_array)
    actual = run(left.dot(vec1.to_numpy()))
    pd.testing.assert_series_equal(actual, left_pdf.dot(vec1.to_numpy()))

    mvec2 = Series(vec2, chunk_size=4)

    # series.dot(series)
    actual = run(mvec1.dot(mvec2))
    self.assertAlmostEqual(actual, vec1.dot(vec2))

    # series.dot(df)
    actual = run(mvec1.dot(right))
    pd.testing.assert_series_equal(actual, vec1.dot(right_pdf))

    # series.dot(2d_array)
    actual = run(mvec1.dot(right_pdf.to_numpy()))
    np.testing.assert_almost_equal(actual, vec1.dot(right_pdf.to_numpy()))

    # series.dot(1d_array)
    actual = run(mvec1.dot(vec2.to_numpy()))
    self.assertAlmostEqual(actual, vec1.dot(vec2.to_numpy()))
def _assert_frame_sort_values(mdf, pdf, *args, **kwargs):
    """Assert that ``mdf.sort_values`` matches ``pdf.sort_values`` for the same arguments."""
    result = mdf.sort_values(*args, **kwargs).execute().fetch()
    expected = pdf.sort_values(*args, **kwargs)
    pd.testing.assert_frame_equal(result, expected)


def _assert_series_sort_values(mseries, pseries, **kwargs):
    """Assert that ``mseries.sort_values`` matches ``pseries.sort_values``."""
    result = mseries.sort_values(**kwargs).execute().fetch()
    expected = pseries.sort_values(**kwargs)
    pd.testing.assert_series_equal(result, expected)


def _check_dataframe_sort_values():
    """Run every DataFrame.sort_values comparison against pandas."""
    df = pd.DataFrame(np.random.rand(100, 10),
                      columns=['a' + str(i) for i in range(10)])

    # test one chunk
    mdf = DataFrame(df)
    _assert_frame_sort_values(mdf, df, 'a0')
    _assert_frame_sort_values(mdf, df, ['a6', 'a7'], ascending=False)

    # test psrs
    mdf = DataFrame(df, chunk_size=10)
    _assert_frame_sort_values(mdf, df, 'a0')
    _assert_frame_sort_values(mdf, df, ['a3', 'a4'])

    # test ascending=False
    _assert_frame_sort_values(mdf, df, ['a0', 'a1'], ascending=False)
    _assert_frame_sort_values(mdf, df, ['a7'], ascending=False)

    # test multiindex
    df2 = df.copy(deep=True)
    df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    mdf = DataFrame(df2, chunk_size=10)
    _assert_frame_sort_values(mdf, df2, [('A', 'C')])

    # test rechunk
    mdf = DataFrame(df, chunk_size=3)
    _assert_frame_sort_values(mdf, df, 'a0')
    _assert_frame_sort_values(mdf, df, ['a3', 'a4'])

    # test other types
    raw = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        },
    )
    mdf = DataFrame(raw, chunk_size=3)
    for label in raw.columns:
        _assert_frame_sort_values(mdf, raw, label)
    _assert_frame_sort_values(mdf, raw, ['a', 'b', 'e'], ascending=False)

    # test nan
    df = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    _assert_frame_sort_values(DataFrame(df), df, ['col2'])
    _assert_frame_sort_values(DataFrame(df, chunk_size=3), df, ['col2'])

    # test None (issue #1885)
    df = pd.DataFrame(np.random.rand(1000, 10))
    # NOTE(review): chained assignment kept as written; confirm it still
    # modifies ``df`` under the pandas version in use.
    df[0][df[0] < 0.5] = 'A'
    df[0][df[0] != 'A'] = None
    _assert_frame_sort_values(DataFrame(df), df, [0, 1])
    _assert_frame_sort_values(DataFrame(df, chunk_size=100), df, [0, 1])

    # test ignore_index
    df = pd.DataFrame(np.random.rand(10, 3),
                      columns=['a' + str(i) for i in range(3)])
    mdf = DataFrame(df, chunk_size=3)
    result = mdf.sort_values(['a0', 'a1'], ignore_index=True).execute().fetch()
    try:
        expected = df.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        # fallback when pandas rejects ``ignore_index``: renumber by hand
        expected = df.sort_values(['a0', 'a1'])
        expected.index = pd.RangeIndex(len(expected))
    pd.testing.assert_frame_equal(result, expected)

    # test inplace
    mdf = DataFrame(df)
    mdf.sort_values('a0', inplace=True)
    result = mdf.execute().fetch()
    df.sort_values('a0', inplace=True)
    pd.testing.assert_frame_equal(result, df)

    # test unknown shape
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    _assert_frame_sort_values(mdf[mdf['a'] > 2], df[df['a'] > 2], by='b')

    # test empty dataframe
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(df, chunk_size=4)
    _assert_frame_sort_values(mdf[mdf['b'] > 100], df[df['b'] > 100], by='b')

    # test chunks with zero length
    df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    # rows 4..7 are zeroed, so the middle chunk is emptied by the filter
    df.iloc[4:8, 1] = 0
    mdf = DataFrame(df, chunk_size=4)
    _assert_frame_sort_values(mdf[mdf['b'] != 0], df[df['b'] != 0], by='b')


def _check_series_sort_values():
    """Run every Series.sort_values comparison against pandas."""
    # test Series.sort_values
    raw = pd.Series(np.random.rand(10))
    _assert_series_sort_values(Series(raw), raw)
    _assert_series_sort_values(Series(raw, chunk_size=3), raw)
    _assert_series_sort_values(Series(raw, chunk_size=2), raw, ascending=False)

    # test empty series
    series = pd.Series(list(range(10)), name='a')
    mseries = Series(series, chunk_size=4)
    _assert_series_sort_values(mseries[mseries > 100], series[series > 100])

    # test series with None
    series = pd.Series(np.arange(1000, ))
    series[series < 500] = 'A'
    series[series != 'A'] = None
    mseries = Series(series, chunk_size=100)
    result = mseries.sort_values().execute().fetch()
    expected = series.sort_values()
    # only the value order is compared; both indexes are dropped
    pd.testing.assert_series_equal(result.reset_index(drop=True),
                                   expected.reset_index(drop=True))


def test_sort_values_execution(setup):
    """Compare ``sort_values`` on frames and series with pandas.

    All checks run once per ``PSRS_DISTINCT_COL`` option (only ``'0'`` on
    Windows).  The variable is put back to its previous state afterwards so
    the setting does not leak into other tests.
    """
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
        '0', '1'
    ]
    previous = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            _check_dataframe_sort_values()
            _check_series_sort_values()
    finally:
        # restore the variable exactly as it was found
        if previous is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = previous
def test_sort_index_execution(setup):
    """Compare ``sort_index`` on chunked frames and series with pandas."""

    def fetch(tileable):
        return tileable.execute().fetch()

    pdf = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

    # single chunk
    actual = fetch(DataFrame(pdf).sort_index())
    pd.testing.assert_frame_equal(actual, pdf.sort_index())

    # in-place sort on a single chunk
    mdf = DataFrame(pdf)
    mdf.sort_index(inplace=True)
    actual = fetch(mdf)
    pd.testing.assert_frame_equal(actual, pdf.sort_index())

    # several chunks
    actual = fetch(DataFrame(pdf, chunk_size=30).sort_index())
    pd.testing.assert_frame_equal(actual, pdf.sort_index())

    actual = fetch(DataFrame(pdf, chunk_size=20).sort_index(ascending=False))
    pd.testing.assert_frame_equal(actual, pdf.sort_index(ascending=False))

    actual = fetch(DataFrame(pdf, chunk_size=10).sort_index(ignore_index=True))
    try:
        wanted = pdf.sort_index(ignore_index=True)
    except TypeError:
        # fallback when pandas rejects ``ignore_index``: renumber by hand
        wanted = pdf.sort_index()
        wanted.index = pd.RangeIndex(len(wanted))
    pd.testing.assert_frame_equal(actual, wanted)

    # test axis=1
    pdf = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

    actual = fetch(DataFrame(pdf).sort_index(axis=1))
    pd.testing.assert_frame_equal(actual, pdf.sort_index(axis=1))

    actual = fetch(DataFrame(pdf, chunk_size=3).sort_index(axis=1))
    pd.testing.assert_frame_equal(actual, pdf.sort_index(axis=1))

    actual = fetch(
        DataFrame(pdf, chunk_size=4).sort_index(axis=1, ascending=False))
    pd.testing.assert_frame_equal(actual,
                                  pdf.sort_index(axis=1, ascending=False))

    actual = fetch(
        DataFrame(pdf, chunk_size=4).sort_index(axis=1, ignore_index=True))
    try:
        wanted = pdf.sort_index(axis=1, ignore_index=True)
    except TypeError:
        # same fallback as above
        wanted = pdf.sort_index(axis=1)
        wanted.index = pd.RangeIndex(len(wanted))
    pd.testing.assert_frame_equal(actual, wanted)

    # test series
    pseries = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

    actual = fetch(Series(pseries).sort_index())
    pd.testing.assert_series_equal(actual, pseries.sort_index())

    actual = fetch(Series(pseries, chunk_size=2).sort_index())
    pd.testing.assert_series_equal(actual, pseries.sort_index())

    actual = fetch(Series(pseries, chunk_size=3).sort_index(ascending=False))
    pd.testing.assert_series_equal(actual, pseries.sort_index(ascending=False))
def testDataFrameQuantileExecution(self):
    """Compare ``DataFrame.quantile`` on a chunked frame with pandas.

    Covers scalar and list ``q`` on both axes, ``interpolation``, a tensor
    ``q`` run through a separate executor inside a ``LocalContext``, and
    ``numeric_only=False``.
    """
    pdf = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
        },
        index=pd.RangeIndex(1, 11))
    mdf = DataFrame(pdf, chunk_size=3)

    def run(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # q = 0.5, axis = 0, series
    actual = run(mdf.quantile())
    pd.testing.assert_series_equal(actual, pdf.quantile())

    # q = 0.5, axis = 1, series
    actual = run(mdf.quantile(axis=1))
    pd.testing.assert_series_equal(actual, pdf.quantile(axis=1))

    # q is a list, axis = 0, dataframe
    actual = run(mdf.quantile([0.3, 0.7]))
    pd.testing.assert_frame_equal(actual, pdf.quantile([0.3, 0.7]))

    # q is a list, axis = 1, dataframe
    actual = run(mdf.quantile([0.3, 0.7], axis=1))
    pd.testing.assert_frame_equal(actual, pdf.quantile([0.3, 0.7], axis=1))

    # test interpolation
    actual = run(mdf.quantile([0.3, 0.7], interpolation='midpoint'))
    pd.testing.assert_frame_equal(
        actual, pdf.quantile([0.3, 0.7], interpolation='midpoint'))

    test_case = self

    class MockSession:
        # minimal stand-in that only exposes the test's executor
        def __init__(self):
            self.executor = test_case.executor

    local_ctx = LocalContext(MockSession())
    ctx_executor = ExecutorForTest('numpy', storage=local_ctx)
    with local_ctx:
        q_tensor = tensor([0.3, 0.7])

        # q is a tensor
        actual = ctx_executor.execute_dataframes([mdf.quantile(q_tensor)])[0]
        pd.testing.assert_frame_equal(actual, pdf.quantile([0.3, 0.7]))

    # test numeric_only
    pdf2 = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
        },
        index=pd.RangeIndex(1, 11))
    mdf2 = DataFrame(pdf2, chunk_size=3)

    actual = run(mdf2.quantile([0.3, 0.7], numeric_only=False))
    pd.testing.assert_frame_equal(
        actual, pdf2.quantile([0.3, 0.7], numeric_only=False))

    actual = run(mdf2.quantile(numeric_only=False))
    pd.testing.assert_series_equal(actual, pdf2.quantile(numeric_only=False))
def test_dataframe_corr_with(setup):
    """Compare ``DataFrame.corrwith`` with pandas for frame and series operands."""
    rng = np.random.RandomState(0)

    def with_gaps(values):
        # values not above 0.4 become NaN
        return np.where(values > 0.4, values, np.nan)

    left_pdf = pd.DataFrame(with_gaps(rng.rand(20, 10)),
                            columns=list('ABCDEFGHIJ'))
    right_pdf = pd.DataFrame(with_gaps(rng.rand(20, 10)),
                             columns=list('ACDEGHIJKL'))
    row_series = pd.Series(with_gaps(rng.rand(20)))
    col_series = pd.Series(with_gaps(rng.rand(10)), index=right_pdf.columns)

    def fetch(tileable):
        return tileable.execute().fetch()

    # single-chunk operands
    left = DataFrame(left_pdf)
    right = DataFrame(right_pdf)

    pd.testing.assert_series_equal(fetch(left.corrwith(right)),
                                   left_pdf.corrwith(right_pdf))

    pd.testing.assert_series_equal(fetch(left.corrwith(right, axis=1)),
                                   left_pdf.corrwith(right_pdf, axis=1))

    pd.testing.assert_series_equal(
        fetch(left.corrwith(right, method='kendall')),
        left_pdf.corrwith(right_pdf, method='kendall'))

    # multi-chunk operands
    left = DataFrame(left_pdf, chunk_size=4)
    right = DataFrame(right_pdf, chunk_size=6)
    mrow_series = Series(row_series, chunk_size=5)
    mcol_series = Series(col_series, chunk_size=5)

    # method='kendall' is expected to fail on chunked inputs
    with pytest.raises(Exception):
        left.corrwith(right, method='kendall').execute()

    # chunked results are compared after sorting both sides by index
    pd.testing.assert_series_equal(
        fetch(left.corrwith(right)).sort_index(),
        left_pdf.corrwith(right_pdf).sort_index())

    pd.testing.assert_series_equal(
        fetch(left.corrwith(right, axis=1)).sort_index(),
        left_pdf.corrwith(right_pdf, axis=1).sort_index())

    pd.testing.assert_series_equal(
        fetch(left.corrwith(mrow_series)).sort_index(),
        left_pdf.corrwith(row_series).sort_index())

    pd.testing.assert_series_equal(
        fetch(left.corrwith(mcol_series, axis=1)).sort_index(),
        left_pdf.corrwith(col_series, axis=1).sort_index())
def testDataFrameCorrWith(self):
    """Compare ``DataFrame.corrwith`` with pandas for frame and series operands."""
    rng = np.random.RandomState(0)

    def with_gaps(values):
        # values not above 0.4 become NaN
        return np.where(values > 0.4, values, np.nan)

    def run(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    left_pdf = pd.DataFrame(with_gaps(rng.rand(20, 10)),
                            columns=list('ABCDEFGHIJ'))
    right_pdf = pd.DataFrame(with_gaps(rng.rand(20, 10)),
                             columns=list('ACDEGHIJKL'))
    row_series = pd.Series(with_gaps(rng.rand(20)))
    col_series = pd.Series(with_gaps(rng.rand(10)), index=right_pdf.columns)

    # single-chunk operands
    left = DataFrame(left_pdf)
    right = DataFrame(right_pdf)

    pd.testing.assert_series_equal(run(left.corrwith(right)),
                                   left_pdf.corrwith(right_pdf))

    pd.testing.assert_series_equal(run(left.corrwith(right, axis=1)),
                                   left_pdf.corrwith(right_pdf, axis=1))

    pd.testing.assert_series_equal(
        run(left.corrwith(right, method='kendall')),
        left_pdf.corrwith(right_pdf, method='kendall'))

    # multi-chunk operands
    left = DataFrame(left_pdf, chunk_size=4)
    right = DataFrame(right_pdf, chunk_size=6)
    mrow_series = Series(row_series, chunk_size=5)
    mcol_series = Series(col_series, chunk_size=5)

    # method='kendall' is expected to fail on chunked inputs
    with self.assertRaises(Exception):
        self.executor.execute_dataframe(
            left.corrwith(right, method='kendall'), concat=True)

    # chunked results are compared after sorting both sides by index
    pd.testing.assert_series_equal(
        run(left.corrwith(right)).sort_index(),
        left_pdf.corrwith(right_pdf).sort_index())

    pd.testing.assert_series_equal(
        run(left.corrwith(right, axis=1)).sort_index(),
        left_pdf.corrwith(right_pdf, axis=1).sort_index())

    pd.testing.assert_series_equal(
        run(left.corrwith(mrow_series)).sort_index(),
        left_pdf.corrwith(row_series).sort_index())

    pd.testing.assert_series_equal(
        run(left.corrwith(mcol_series, axis=1)).sort_index(),
        left_pdf.corrwith(col_series, axis=1).sort_index())
def testSortValuesExecution(self):
    """Compare ``sort_values`` on chunked frames and series with pandas."""

    def run(tileable, executor=None):
        # execute with the test's executor unless another one is supplied
        chosen = self.executor if executor is None else executor
        return chosen.execute_dataframe(tileable, concat=True)[0]

    def check_frame(mars_frame, pandas_frame, *args, **kwargs):
        actual = run(mars_frame.sort_values(*args, **kwargs))
        wanted = pandas_frame.sort_values(*args, **kwargs)
        pd.testing.assert_frame_equal(actual, wanted)

    def check_series(mars_series, pandas_series, **kwargs):
        actual = run(mars_series.sort_values(**kwargs))
        wanted = pandas_series.sort_values(**kwargs)
        pd.testing.assert_series_equal(actual, wanted)

    pdf = pd.DataFrame(np.random.rand(100, 10),
                       columns=['a' + str(i) for i in range(10)])

    # test one chunk
    mdf = DataFrame(pdf)
    check_frame(mdf, pdf, 'a0')
    check_frame(mdf, pdf, ['a6', 'a7'], ascending=False)

    # test psrs
    mdf = DataFrame(pdf, chunk_size=10)
    check_frame(mdf, pdf, 'a0')
    check_frame(mdf, pdf, ['a3', 'a4'])

    # test ascending=False
    check_frame(mdf, pdf, ['a0', 'a1'], ascending=False)
    check_frame(mdf, pdf, ['a7'], ascending=False)

    # test rechunk
    mdf = DataFrame(pdf, chunk_size=3)
    check_frame(mdf, pdf, 'a0')
    check_frame(mdf, pdf, ['a3', 'a4'])

    # test other types
    typed = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
        },
    )
    mdf = DataFrame(typed, chunk_size=3)
    for column in typed.columns:
        check_frame(mdf, typed, column)
    check_frame(mdf, typed, ['a', 'b', 'e'], ascending=False)

    # test nan
    pdf = pd.DataFrame({
        'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
        'col2': [2, 1, 9, np.nan, 7, 4],
        'col3': [0, 1, 9, 4, 2, 3],
    })
    check_frame(DataFrame(pdf), pdf, ['col2'])
    check_frame(DataFrame(pdf, chunk_size=3), pdf, ['col2'])

    # test ignore_index
    # this case runs on a separate executor backed by a new session's context
    session_executor = ExecutorForTest(storage=new_session().context)

    pdf = pd.DataFrame(np.random.rand(10, 3),
                       columns=['a' + str(i) for i in range(3)])
    mdf = DataFrame(pdf, chunk_size=3)
    actual = run(mdf.sort_values(['a0', 'a1'], ignore_index=True),
                 executor=session_executor)
    try:
        wanted = pdf.sort_values(['a0', 'a1'], ignore_index=True)
    except TypeError:
        # fallback when pandas rejects ``ignore_index``: renumber by hand
        wanted = pdf.sort_values(['a0', 'a1'])
        wanted.index = pd.RangeIndex(len(wanted))
    pd.testing.assert_frame_equal(actual, wanted)

    # test inplace
    mdf = DataFrame(pdf)
    mdf.sort_values('a0', inplace=True)
    actual = run(mdf)
    pdf.sort_values('a0', inplace=True)
    pd.testing.assert_frame_equal(actual, pdf)

    # test unknown shape
    pdf = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
    mdf = DataFrame(pdf, chunk_size=4)
    check_frame(mdf[mdf['a'] > 2], pdf[pdf['a'] > 2], by='b')

    # test Series.sort_values
    pseries = pd.Series(np.random.rand(10))
    check_series(Series(pseries), pseries)
    check_series(Series(pseries, chunk_size=3), pseries)
    check_series(Series(pseries, chunk_size=2), pseries, ascending=False)
def test_to_datetime_execution(setup):
    """Compare ``to_datetime`` with ``pd.to_datetime`` for several input kinds."""

    def fetch(tileable):
        return tileable.execute().fetch()

    # scalar
    # dtype and shape checks are switched off for the scalar fetch
    scalar_config = {'check_dtypes': False, 'check_shape': False}
    actual = to_datetime(1490195805, unit='s').execute().fetch(
        extra_config=scalar_config)
    wanted = pd.to_datetime(1490195805, unit='s')
    assert pd.to_datetime(actual) == wanted

    # test list like
    date_strings = ['3/11/2000', '3/12/2000', '3/13/2000']
    chunked = tensor(date_strings, chunk_size=2)
    actual = fetch(to_datetime(chunked, infer_datetime_format=True))
    wanted = pd.to_datetime(date_strings, infer_datetime_format=True)
    pd.testing.assert_index_equal(actual, wanted)

    # test series
    pd_series = pd.Series(date_strings)
    actual = fetch(to_datetime(Series(pd_series, chunk_size=2)))
    pd.testing.assert_series_equal(actual, pd.to_datetime(pd_series))

    # test DataFrame
    parts = pd.DataFrame({
        'year': [2015, 2016],
        'month': [2, 3],
        'day': [4, 5]
    })
    actual = fetch(to_datetime(DataFrame(parts, chunk_size=(1, 2))))
    pd.testing.assert_series_equal(actual, pd.to_datetime(parts))

    # test Index
    pd_index = pd.Index([1, 2, 3])
    actual = fetch(to_datetime(Index(pd_index, chunk_size=2)))
    pd.testing.assert_index_equal(actual, pd.to_datetime(pd_index))

    # test raises == 'ignore'
    unparsable = ['13000101']
    actual = fetch(to_datetime(unparsable, format='%Y%m%d', errors='ignore'))
    wanted = pd.to_datetime(unparsable, format='%Y%m%d', errors='ignore')
    pd.testing.assert_index_equal(actual, wanted)

    # test unit
    actual = fetch(to_datetime([1490195805], unit='s'))
    wanted = pd.to_datetime([1490195805], unit='s')
    pd.testing.assert_index_equal(actual, wanted)

    # test origin
    actual = fetch(to_datetime([1, 2, 3], unit='D',
                               origin=pd.Timestamp('1960-01-01')))
    wanted = pd.to_datetime([1, 2, 3], unit='D',
                            origin=pd.Timestamp('1960-01-01'))
    pd.testing.assert_index_equal(actual, wanted)
def test_dot_execution(setup):
    """Check ``dot`` and ``@`` for DataFrame, Series and ndarray operands against pandas."""
    left_pd = pd.DataFrame(np.random.rand(4, 7))
    right_pd = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
    vec_a_pd = pd.Series(np.random.rand(7))
    vec_b_pd = pd.Series(np.random.rand(7))

    left = DataFrame(left_pd, chunk_size=(3, 2))
    right = DataFrame(right_pd, chunk_size=(3, 4))

    def run(lazy):
        # execute a lazy object and bring the result back
        return lazy.execute().fetch()

    # df.dot(df)
    pd.testing.assert_frame_equal(run(left.dot(right)),
                                  left_pd.dot(right_pd))

    # the @ operator
    pd.testing.assert_frame_equal(run(left @ right), left_pd @ right_pd)

    vec_a = Series(vec_a_pd, chunk_size=5)

    # df.dot(series)
    pd.testing.assert_series_equal(run(left.dot(vec_a)),
                                   left_pd.dot(vec_a_pd))

    # df.dot(2d_array)
    pd.testing.assert_frame_equal(run(left.dot(right_pd.to_numpy())),
                                  left_pd.dot(right_pd.to_numpy()))

    # df.dot(1d_array)
    pd.testing.assert_series_equal(run(left.dot(vec_a_pd.to_numpy())),
                                   left_pd.dot(vec_a_pd.to_numpy()))

    vec_b = Series(vec_b_pd, chunk_size=4)

    # series.dot(series)
    assert pytest.approx(run(vec_a.dot(vec_b))) == vec_a_pd.dot(vec_b_pd)

    # series.dot(df)
    pd.testing.assert_series_equal(run(vec_a.dot(right)),
                                   vec_a_pd.dot(right_pd))

    # series.dot(2d_array)
    np.testing.assert_almost_equal(run(vec_a.dot(right_pd.to_numpy())),
                                   vec_a_pd.dot(right_pd.to_numpy()))

    # series.dot(1d_array)
    assert pytest.approx(run(vec_a.dot(vec_b_pd.to_numpy()))) == \
        vec_a_pd.dot(vec_b_pd.to_numpy())
def test_dataframe_quantile_execution(setup):
    """Check ``DataFrame.quantile`` against pandas for default, list and tensor ``q``."""
    pd_frame = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        },
        index=pd.RangeIndex(1, 11))
    lazy_frame = DataFrame(pd_frame, chunk_size=3)

    def run(lazy):
        # execute a lazy object and bring the result back
        return lazy.execute().fetch()

    # q = 0.5 (default) gives a series, for axis = 0 and then axis = 1
    for kwargs in ({}, {'axis': 1}):
        pd.testing.assert_series_equal(
            run(lazy_frame.quantile(**kwargs)),
            pd_frame.quantile(**kwargs))

    # q is a list and gives a dataframe: axis = 0, axis = 1, then a
    # non-default interpolation
    for kwargs in ({}, {'axis': 1}, {'interpolation': 'midpoint'}):
        pd.testing.assert_frame_equal(
            run(lazy_frame.quantile([0.3, 0.7], **kwargs)),
            pd_frame.quantile([0.3, 0.7], **kwargs))

    # q is a tensor
    q_tensor = tensor([0.3, 0.7])
    pd.testing.assert_frame_equal(run(lazy_frame.quantile(q_tensor)),
                                  pd_frame.quantile([0.3, 0.7]))

    # numeric_only=False on a frame holding floats, ints and timestamps
    mixed_pd = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [pd.Timestamp(f'201{i}') for i in range(10)],
        },
        index=pd.RangeIndex(1, 11))
    mixed = DataFrame(mixed_pd, chunk_size=3)

    pd.testing.assert_frame_equal(
        run(mixed.quantile([0.3, 0.7], numeric_only=False)),
        mixed_pd.quantile([0.3, 0.7], numeric_only=False))

    pd.testing.assert_series_equal(
        run(mixed.quantile(numeric_only=False)),
        mixed_pd.quantile(numeric_only=False))
def testSortIndexExecution(self):
    """Check ``sort_index`` on DataFrame (both axes) and Series against pandas."""

    def run(tileable, executor=None):
        # evaluate on the shared executor unless a dedicated one is given
        runner = self.executor if executor is None else executor
        return runner.execute_dataframe(tileable, concat=True)[0]

    pd_frame = pd.DataFrame(np.random.rand(100, 20),
                            index=np.random.rand(100))

    # default chunking
    lazy = DataFrame(pd_frame)
    pd.testing.assert_frame_equal(run(lazy.sort_index()),
                                  pd_frame.sort_index())

    # inplace=True
    lazy = DataFrame(pd_frame)
    lazy.sort_index(inplace=True)
    pd.testing.assert_frame_equal(run(lazy), pd_frame.sort_index())

    # several chunks, ascending then descending
    lazy = DataFrame(pd_frame, chunk_size=30)
    pd.testing.assert_frame_equal(run(lazy.sort_index()),
                                  pd_frame.sort_index())

    lazy = DataFrame(pd_frame, chunk_size=20)
    pd.testing.assert_frame_equal(run(lazy.sort_index(ascending=False)),
                                  pd_frame.sort_index(ascending=False))

    # ignore_index=True, executed on an executor backed by a new session
    session_executor = ExecutorForTest(storage=new_session().context)
    lazy = DataFrame(pd_frame, chunk_size=10)
    actual = run(lazy.sort_index(ignore_index=True), session_executor)
    try:
        wanted = pd_frame.sort_index(ignore_index=True)
    except TypeError:
        # original note: "for python3.5" -- presumably a pandas whose
        # sort_index has no ignore_index keyword; emulate it by hand
        wanted = pd_frame.sort_index()
        wanted.index = pd.RangeIndex(len(wanted))
    pd.testing.assert_frame_equal(actual, wanted)

    # test axis=1
    pd_frame = pd.DataFrame(np.random.rand(10, 10),
                            columns=np.random.rand(10))
    axis1_cases = (
        ({}, {'axis': 1}),
        ({'chunk_size': 3}, {'axis': 1}),
        ({'chunk_size': 4}, {'axis': 1, 'ascending': False}),
    )
    for frame_kwargs, sort_kwargs in axis1_cases:
        lazy = DataFrame(pd_frame, **frame_kwargs)
        pd.testing.assert_frame_equal(run(lazy.sort_index(**sort_kwargs)),
                                      pd_frame.sort_index(**sort_kwargs))

    # axis=1 together with ignore_index=True
    lazy = DataFrame(pd_frame, chunk_size=4)
    session_executor = ExecutorForTest(storage=new_session().context)
    actual = run(lazy.sort_index(axis=1, ignore_index=True), session_executor)
    try:
        wanted = pd_frame.sort_index(axis=1, ignore_index=True)
    except TypeError:
        # same fallback as above; it assigns a RangeIndex to the row index
        wanted = pd_frame.sort_index(axis=1)
        wanted.index = pd.RangeIndex(len(wanted))
    pd.testing.assert_frame_equal(actual, wanted)

    # test series
    pd_series = pd.Series(np.random.rand(10, ), index=np.random.rand(10))
    series_cases = (
        ({}, {}),
        ({'chunk_size': 2}, {}),
        ({'chunk_size': 3}, {'ascending': False}),
    )
    for series_kwargs, sort_kwargs in series_cases:
        lazy_series = Series(pd_series, **series_kwargs)
        pd.testing.assert_series_equal(
            run(lazy_series.sort_index(**sort_kwargs)),
            pd_series.sort_index(**sort_kwargs))
def test_to_csv_execution(setup):
    """Write a DataFrame and a Series to CSV and compare what pandas reads back.

    Both a single output file and a ``*`` wildcard path (read back as
    ``out-0.csv`` .. ``out-3.csv``) are exercised inside a temporary directory.
    """
    row_labels = pd.RangeIndex(100, 0, -1, name='index')
    pd_frame = pd.DataFrame(
        {
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100, )),
            'col3': np.arange(100)
        },
        index=row_labels)
    lazy_frame = DataFrame(pd_frame, chunk_size=33)

    def load(csv_path):
        # parse a written file using the source frame's column dtypes
        return pd.read_csv(csv_path, dtype=pd_frame.dtypes.to_dict())

    with tempfile.TemporaryDirectory() as base_path:
        single_path = os.path.join(base_path, 'out.csv')
        wildcard_path = os.path.join(base_path, 'out-*.csv')
        part_paths = [
            os.path.join(base_path, f'out-{i}.csv') for i in range(4)
        ]

        # DATAFRAME TESTS
        # one file
        lazy_frame.to_csv(single_path).execute()
        pd.testing.assert_frame_equal(load(single_path).set_index('index'),
                                      pd_frame)

        # multiple files
        lazy_frame.to_csv(wildcard_path).execute()
        parts = [load(part) for part in part_paths]
        pd.testing.assert_frame_equal(
            pd.concat(parts, axis=0).set_index('index'), pd_frame)
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      pd_frame.iloc[33:66])

        # dataframe with unknown shape (a boolean filter); the result is
        # compared with the full frame, so every row is expected to survive
        filtered = DataFrame(pd_frame, chunk_size=(50, 2))
        filtered = filtered[filtered['col1'] < 1]
        filtered_path = os.path.join(base_path, 'out2.csv')
        filtered.to_csv(filtered_path).execute()
        pd.testing.assert_frame_equal(load(filtered_path).set_index('index'),
                                      pd_frame)

        # SERIES TESTS
        lazy_series = md.Series(pd_frame.col1, chunk_size=33)

        # one file (reuses and overwrites the dataframe output path)
        lazy_series.to_csv(single_path).execute()
        pd.testing.assert_frame_equal(load(single_path).set_index('index'),
                                      pd_frame.col1.to_frame())

        # multiple files
        lazy_series.to_csv(wildcard_path).execute()
        parts = [load(part) for part in part_paths]
        pd.testing.assert_frame_equal(
            pd.concat(parts, axis=0).set_index('index'),
            pd_frame.col1.to_frame())
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      pd_frame.col1.to_frame().iloc[33:66])
def testSortValuesExecution(self):
    """Check ``sort_values`` on DataFrame and Series against pandas.

    Every scenario runs once per value of the ``PSRS_DISTINCT_COL``
    environment variable listed in ``distinct_opts`` (only ``'0'`` when
    ``sys.platform`` starts with ``win``).  The variable's previous state is
    restored when the test finishes, whether it passes or fails, so the
    setting does not leak into other tests running in the same process.
    """

    def run(tileable, executor=None):
        # evaluate on the shared executor unless a dedicated one is given
        runner = self.executor if executor is None else executor
        return runner.execute_dataframe(tileable, concat=True)[0]

    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
        '0', '1'
    ]
    # remember the caller's setting so it can be put back afterwards
    saved_distinct = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = run(mdf.sort_values('a0'))
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = run(mdf.sort_values(['a6', 'a7'], ascending=False))
            expected = df.sort_values(['a6', 'a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = run(mdf.sort_values('a0'))
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = run(mdf.sort_values(['a3', 'a4']))
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = run(mdf.sort_values(['a0', 'a1'], ascending=False))
            expected = df.sort_values(['a0', 'a1'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            result = run(mdf.sort_values(['a7'], ascending=False))
            expected = df.sort_values(['a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)

            result = run(mdf.sort_values([('A', 'C')]))
            expected = df2.sort_values([('A', 'C')])
            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = run(mdf.sort_values('a0'))
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = run(mdf.sort_values(['a3', 'a4']))
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test other types
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                }, )
            mdf = DataFrame(raw, chunk_size=3)

            for label in raw.columns:
                result = run(mdf.sort_values(label))
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = run(mdf.sort_values(['a', 'b', 'e'], ascending=False))
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = run(mdf.sort_values(['col2']))
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = run(mdf.sort_values(['col2']))
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index, on an executor backed by a new session
            executor = ExecutorForTest(storage=new_session().context)

            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])
            mdf = DataFrame(df, chunk_size=3)
            result = run(mdf.sort_values(['a0', 'a1'], ignore_index=True),
                         executor)
            try:
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                # original note: "for python3.5" -- presumably a pandas whose
                # sort_values has no ignore_index keyword; emulate it by hand
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))
            pd.testing.assert_frame_equal(result, expected)

            # test inplace; the lazy frame is executed before the pandas
            # frame itself is sorted in place
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = run(mdf)
            df.sort_values('a0', inplace=True)
            pd.testing.assert_frame_equal(result, df)

            # test unknown shape
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['a'] > 2]
            result = run(filtered.sort_values(by='b'))
            pd.testing.assert_frame_equal(
                result, df[df['a'] > 2].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = run(series.sort_values())
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = run(series.sort_values())
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = run(series.sort_values(ascending=False))
            expected = raw.sort_values(ascending=False)
            pd.testing.assert_series_equal(result, expected)
    finally:
        # undo the environment change made by the loop above
        if saved_distinct is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = saved_distinct
def testDataFrameQuantileExecution(self):
    """Check ``DataFrame.quantile`` against pandas for default, list and tensor ``q``."""

    def run(tileable):
        # evaluate on the shared executor and return the concatenated result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    pd_frame = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [np.random.bytes(10) for _ in range(10)],
            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
        },
        index=pd.RangeIndex(1, 11))
    lazy_frame = DataFrame(pd_frame, chunk_size=3)

    # q = 0.5 (default) gives a series, for axis = 0 and then axis = 1
    for kwargs in ({}, {'axis': 1}):
        pd.testing.assert_series_equal(run(lazy_frame.quantile(**kwargs)),
                                       pd_frame.quantile(**kwargs))

    # q is a list and gives a dataframe: axis = 0, axis = 1, then a
    # non-default interpolation
    for kwargs in ({}, {'axis': 1}, {'interpolation': 'midpoint'}):
        pd.testing.assert_frame_equal(
            run(lazy_frame.quantile([0.3, 0.7], **kwargs)),
            pd_frame.quantile([0.3, 0.7], **kwargs))

    # q is a tensor; executed through the context/executor pair returned by
    # _create_test_context
    # NOTE(review): the extent of this ``with`` block was reconstructed from
    # source whose indentation was lost -- confirm
    ctx, ctx_executor = self._create_test_context(self.executor)
    with ctx:
        q_tensor = tensor([0.3, 0.7])
        lazy_result = lazy_frame.quantile(q_tensor)
        actual = ctx_executor.execute_dataframes([lazy_result])[0]
        pd.testing.assert_frame_equal(actual, pd_frame.quantile([0.3, 0.7]))

    # numeric_only=False on a frame holding floats, ints and timestamps
    mixed_pd = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [pd.Timestamp(f'201{i}') for i in range(10)],
        },
        index=pd.RangeIndex(1, 11))
    mixed = DataFrame(mixed_pd, chunk_size=3)

    pd.testing.assert_frame_equal(
        run(mixed.quantile([0.3, 0.7], numeric_only=False)),
        mixed_pd.quantile([0.3, 0.7], numeric_only=False))

    pd.testing.assert_series_equal(
        run(mixed.quantile(numeric_only=False)),
        mixed_pd.quantile(numeric_only=False))