def testToCSVExecution(self):
    """Round-trip a chunked DataFrame through ``to_csv`` using ``self.executor``.

    A 100-row frame with a descending, named index is wrapped with
    ``chunk_size=33`` and written twice: once to a single file and once to
    a wildcard path. Each output is read back with pandas and compared
    against the source frame.
    """
    expected = pd.DataFrame({
        'col1': np.random.rand(100),
        'col2': np.random.choice(['a', 'b', 'c'], (100,)),
        'col3': np.arange(100)
    }, index=pd.RangeIndex(100, 0, -1, name='index'))
    df = DataFrame(expected, chunk_size=33)

    with tempfile.TemporaryDirectory() as base_path:
        # test one file
        single_path = os.path.join(base_path, 'out.csv')
        self.executor.execute_dataframe(df.to_csv(single_path))

        # The index was written as a regular column; restore it before comparing.
        loaded = pd.read_csv(single_path, dtype=expected.dtypes.to_dict())
        pd.testing.assert_frame_equal(loaded.set_index('index'), expected)

        # test multi files
        pattern = os.path.join(base_path, 'out-*.csv')
        self.executor.execute_dataframe(df.to_csv(pattern))

        # Four numbered files (out-0.csv .. out-3.csv) are expected on disk.
        parts = []
        for idx in range(4):
            part_path = os.path.join(base_path, 'out-{}.csv'.format(idx))
            parts.append(pd.read_csv(part_path, dtype=expected.dtypes.to_dict()))

        combined = pd.concat(parts, axis=0).set_index('index')
        pd.testing.assert_frame_equal(combined, expected)
        # The second file must hold exactly rows 33..65 of the source frame.
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      expected.iloc[33:66])
def test_to_csv():
    """Check the tiled chunk graph produced by ``DataFrame.to_csv``.

    Covers a wildcard path, where each output chunk takes the matching
    source chunk directly as input, and a single-file path, where each
    output chunk has two inputs and the second comes from a
    ``DataFrameToCSVStat`` op.
    """
    frame = DataFrame(pd.DataFrame(np.random.rand(10, 5)), chunk_size=4)

    # wildcard path
    tiled = tile(frame.to_csv('*.csv'))
    assert tiled.chunk_shape[1] == 1
    for idx, chunk in enumerate(tiled.chunks):
        assert type(chunk.op).__name__ == 'DataFrameToCSV'
        source_chunk = tiled.inputs[0].chunks[idx].data
        assert chunk.inputs[0] is source_chunk

    # test one file
    tiled = tile(frame.to_csv('out.csv'))
    assert tiled.chunk_shape[1] == 1
    for idx, chunk in enumerate(tiled.chunks):
        assert len(chunk.inputs) == 2
        # The source chunk sits one level deeper here, behind the first input.
        source_chunk = tiled.inputs[0].chunks[idx].data
        assert chunk.inputs[0].inputs[0] is source_chunk
        assert type(chunk.inputs[1].op).__name__ == 'DataFrameToCSVStat'
def test_to_csv_execution(setup):
    """Execute ``to_csv`` for DataFrame and Series inputs and verify the files.

    Each scenario writes into a temporary directory, reads the output back
    with pandas, restores the ``index`` column as the index, and compares
    against the source data.

    ``setup`` is not referenced in the body.
    NOTE(review): presumably a pytest fixture that prepares the execution
    session -- confirm against the fixture definition.
    """
    raw = pd.DataFrame(
        {
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100, )),
            'col3': np.arange(100)
        },
        index=pd.RangeIndex(100, 0, -1, name='index'))
    df = DataFrame(raw, chunk_size=33)

    with tempfile.TemporaryDirectory() as base_path:
        single_path = os.path.join(base_path, 'out.csv')
        multi_pattern = os.path.join(base_path, 'out-*.csv')

        # DATAFRAME TESTS
        # test one file with dataframe
        df.to_csv(single_path).execute()
        loaded = pd.read_csv(single_path, dtype=raw.dtypes.to_dict())
        pd.testing.assert_frame_equal(loaded.set_index('index'), raw)

        # test multi files with dataframe
        df.to_csv(multi_pattern).execute()
        parts = []
        for i in range(4):
            part_path = os.path.join(base_path, f'out-{i}.csv')
            parts.append(pd.read_csv(part_path, dtype=raw.dtypes.to_dict()))
        combined = pd.concat(parts, axis=0).set_index('index')
        pd.testing.assert_frame_equal(combined, raw)
        # The second file must hold exactly rows 33..65 of the source frame.
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      raw.iloc[33:66])

        # test df with unknown shape
        # np.random.rand draws from [0, 1), so the filter keeps every row
        # and the full frame is expected back.
        filtered = DataFrame(raw, chunk_size=(50, 2))
        filtered = filtered[filtered['col1'] < 1]
        filtered_path = os.path.join(base_path, 'out2.csv')
        filtered.to_csv(filtered_path).execute()
        loaded = pd.read_csv(filtered_path, dtype=raw.dtypes.to_dict())
        pd.testing.assert_frame_equal(loaded.set_index('index'), raw)

        # SERIES TESTS
        series = md.Series(raw.col1, chunk_size=33)
        expected_col1 = raw.col1.to_frame()

        # test one file with series
        # Reuses the DataFrame scenario's file name inside the same directory.
        series.to_csv(single_path).execute()
        loaded = pd.read_csv(single_path, dtype=raw.dtypes.to_dict())
        pd.testing.assert_frame_equal(loaded.set_index('index'), expected_col1)

        # test multi files with series
        series.to_csv(multi_pattern).execute()
        parts = []
        for i in range(4):
            part_path = os.path.join(base_path, f'out-{i}.csv')
            parts.append(pd.read_csv(part_path, dtype=raw.dtypes.to_dict()))
        combined = pd.concat(parts, axis=0).set_index('index')
        pd.testing.assert_frame_equal(combined, expected_col1)
        pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                      expected_col1.iloc[33:66])