def load_datasets(self, outofcore):
    """Read the configured input file and store the result on ``self.df``.

    The file name comes from ``self.config['file']``; a relative name is
    resolved against the directory of ``self.config_path``.

    * ``.csv`` files are read with pandas, restricted to the non-``None``
      values of ``self.fields`` plus items 1 and 2 of every ``self.axes``
      value; each column in ``self.categorical_fields`` is then converted
      to the ``category`` dtype.
    * ``.castra`` files are opened through ``dask.dataframe``; unless
      ``outofcore`` is truthy the frame is additionally passed through
      ``cache(cache=dict)``.

    Raises:
        IOError: if the resolved path does not exist or has an extension
            other than the two above.
    """
    source = self.config['file']
    print('Loading Data from {}...'.format(source))

    # A relative file name lives next to the configuration file.
    if not path.isabs(source):
        source = path.join(path.split(self.config_path)[0], source)

    if not path.exists(source):
        raise IOError('Unable to find input dataset: "{}"'.format(source))

    axis_columns = [
        column
        for axis in self.axes.values()
        for column in (axis[1], axis[2])
    ]
    columns = [name for name in self.fields.values() if name is not None]
    columns += axis_columns

    if source.endswith(".csv"):
        frame = self.df = pd.read_csv(source, usecols=columns)
        # Categorical columns arrive as plain objects; convert them in place.
        for name in self.categorical_fields:
            frame[name] = frame[name].astype('category')
        return

    if source.endswith(".castra"):
        import dask.dataframe as dd

        self.df = dd.from_castra(source)
        if not outofcore:
            self.df = self.df.cache(cache=dict)
        return

    raise IOError("Unknown data file type; .csv and .castra currently supported")
def test_categorical_index_with_dask_dataframe():
    """A castra with a categorical index loads correctly via dask.dataframe.

    Two frames sharing the index name ``foo`` are appended to a castra
    created with ``categories=['foo']``.  The dask dataframe built from it
    must report divisions ``('a', 'c', 'd')``, compute to the concatenation
    of both frames with an ordered ``CategoricalIndex``, and support
    ``.loc`` lookups by label.
    """
    pytest.importorskip('dask.dataframe')
    import importlib

    import dask.dataframe as dd

    # `async` is a reserved keyword from Python 3.7 on, so the attribute
    # spelling `dask.async.get_sync` cannot even be parsed there.  Newer dask
    # releases expose the synchronous scheduler as `dask.local`; older ones
    # only have the `dask.async` module, which is imported by name instead.
    try:
        from dask.local import get_sync
    except ImportError:
        get_sync = importlib.import_module('dask.async').get_sync

    A = pd.DataFrame({'x': [1, 2, 3, 4]},
                     index=pd.Index(['a', 'a', 'b', 'b'], name='foo'))
    B = pd.DataFrame({'x': [4, 5, 6]},
                     index=pd.Index(['c', 'd', 'd'], name='foo'))

    path = tempfile.mkdtemp(prefix='castra-')
    try:
        with Castra(path=path, template=A, categories=['foo']) as c:
            c.extend(A)
            c.extend(B)

            df = dd.from_castra(path)
            assert df.divisions == ('a', 'c', 'd')

            result = df.compute(get=get_sync)

            expected = pd.concat([A, B])
            expected.index = pd.CategoricalIndex(expected.index,
                                                 name=expected.index.name,
                                                 ordered=True)
            tm.assert_frame_equal(result, expected)

            tm.assert_frame_equal(df.loc['a'].compute(), expected.loc['a'])
            tm.assert_frame_equal(df.loc['b'].compute(get=get_sync),
                                  expected.loc['b'])
    finally:
        # Always remove the temporary castra directory, even on failure.
        shutil.rmtree(path)
def test_from_castra():
    """Round-trip a small frame through castra and read it back three ways.

    The data is loaded from the castra object itself, from its ``path``
    attribute, and with only column ``x`` selected; every result must equal
    the original pandas data.
    """
    pytest.importorskip("castra")

    index = pd.Index([1.0, 2.0, 3.0, 4.0], name="ind")
    expected = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [2, 3, 4, 5]},
        index=index,
    )

    partitioned = dd.from_pandas(expected, 2)
    store = partitioned.to_castra()

    from_store = dd.from_castra(store)
    from_path = dd.from_castra(store.path)
    only_x = dd.from_castra(store, "x")

    try:
        tm.assert_frame_equal(expected, from_store.compute())
        tm.assert_frame_equal(expected, from_path.compute())
        tm.assert_series_equal(expected.x, only_x.compute())
    finally:
        # An explicit store.drop() would race with the drop triggered by
        # `from_path.__del__`; releasing both references by hand avoids it.
        del from_path, store
def test_from_castra():
    """Check ``dd.from_castra`` against the frame that was written.

    Covers three call forms: a castra object, the castra's ``path``, and a
    castra object plus the column name ``'x'`` (which yields a series).
    """
    pytest.importorskip('castra')

    data = {'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]}
    source = pd.DataFrame(data, index=pd.Index([1., 2., 3., 4.], name='ind'))

    ddf = dd.from_pandas(source, 2)
    castra_obj = ddf.to_castra()

    via_object = dd.from_castra(castra_obj)
    via_path = dd.from_castra(castra_obj.path)
    column_x = dd.from_castra(castra_obj, 'x')

    try:
        tm.assert_frame_equal(source, via_object.compute())
        tm.assert_frame_equal(source, via_path.compute())
        tm.assert_series_equal(source.x, column_x.compute())
    finally:
        # Do not call castra_obj.drop(): it races with the drop performed by
        # `via_path.__del__`.  Deleting the names manually sidesteps that.
        del via_path, castra_obj
def test_from_castra_with_selection():
    """Row selection followed by column access works on a castra-backed frame.

    Optimizations fuse getitems with load_partitions, and getitem used to
    serve both column access and selections, so the combination is checked
    against plain pandas.
    """
    pytest.importorskip("castra")

    index = pd.Index([1.0, 2.0, 3.0, 4.0], name="ind")
    expected = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [2, 3, 4, 5]},
        index=index,
    )

    partitioned = dd.from_pandas(expected, 2)
    loaded = dd.from_castra(partitioned.to_castra())

    lazy_x = loaded[loaded.y > 3].x
    eager_x = expected[expected.y > 3].x
    assert eq(lazy_x, eager_x)
def test_from_castra_with_selection():
    """Filter rows, then pick a column, on a frame read from castra.

    Getitem calls get fused with load_partitions by the optimizer, and
    getitem was once used for both selections and column access; the
    filtered column must still match the pandas result.
    """
    pytest.importorskip('castra')

    columns = {'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]}
    pdf = pd.DataFrame(columns, index=pd.Index([1., 2., 3., 4.], name='ind'))

    ddf = dd.from_pandas(pdf, 2)
    from_store = dd.from_castra(ddf.to_castra())

    selected = from_store[from_store.y > 3]
    reference = pdf[pdf.y > 3]
    assert eq(selected.x, reference.x)
def test_from_castra_with_selection():
    """Boolean selection plus column access on a castra-loaded frame.

    The optimizer fuses getitems with load_partitions; because getitem was
    formerly shared by column access and selections, the two are exercised
    together and compared with the in-memory pandas answer.
    """
    pytest.importorskip('castra')

    ind = pd.Index([1., 2., 3., 4.], name='ind')
    original = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                             'y': [2, 3, 4, 5]}, index=ind)

    split = dd.from_pandas(original, 2)
    restored = dd.from_castra(split.to_castra())

    got = restored[restored.y > 3].x
    want = original[original.y > 3].x
    assert eq(got, want)
def load_datasets(self, outofcore):
    """Load the dataset named in the configuration into ``self.df``.

    ``self.config['file']`` gives the input file; a relative name is
    resolved against the directory containing ``self.config_path``.
    Supported file types:

    * ``.csv`` -- read with pandas, limited to the non-``None`` values of
      ``self.fields`` plus the ``'xaxis'`` and ``'yaxis'`` entries of every
      value in ``self.axes``.
    * ``.h5`` -- read with ``pd.read_hdf``.  The object key is
      ``self.config['objpath']`` when set, otherwise the file's base name
      without its extension.
    * ``.castra`` -- opened through ``dask.dataframe``; unless ``outofcore``
      is truthy the frame is also passed through ``cache(cache=dict)``.

    For the two pandas-backed types every column listed in
    ``self.categorical_fields`` is converted to the ``category`` dtype.

    Args:
        outofcore: only consulted for ``.castra`` input; when falsy the
            dask frame is cached via ``cache(cache=dict)``.

    Raises:
        IOError: if the resolved path does not exist, or its extension is
            not one of the supported types.
    """
    data_path = self.config['file']
    objpath = self.config.get('objpath', None)
    print('Loading Data from {}...'.format(data_path))

    # Relative data paths are interpreted relative to the config file.
    if not path.isabs(data_path):
        config_dir = path.split(self.config_path)[0]
        data_path = path.join(config_dir, data_path)

    if not path.exists(data_path):
        raise IOError('Unable to find input dataset: "{}"'.format(data_path))

    axes_fields = []
    for f in self.axes.values():
        axes_fields += [f['xaxis'], f['yaxis']]
    load_fields = [f for f in self.fields.values() if f is not None] + axes_fields

    if data_path.endswith(".castra"):
        import dask.dataframe as dd

        self.df = dd.from_castra(data_path)
        if not outofcore:
            self.df = self.df.cache(cache=dict)
        # No categorical conversion is applied to castra input.
        return

    if data_path.endswith(".csv"):
        self.df = pd.read_csv(data_path, usecols=load_fields)
    elif data_path.endswith(".h5"):
        if not objpath:
            # Default the HDF5 key to the file name minus its extension.
            objpath = path.splitext(path.basename(data_path))[0]
        self.df = pd.read_hdf(data_path, objpath)
    else:
        raise IOError(
            "Unknown data file type; .csv, .h5 and .castra currently supported")

    # Both pandas-backed loaders need their categorical columns converted.
    for f in self.categorical_fields:
        self.df[f] = self.df[f].astype('category')
def test_from_castra_with_selection():
    """Selection and column access combine correctly on castra-backed data.

    Optimizations fuse getitems with load_partitions, and getitem used to
    handle both column access and selections.  The test is skipped when
    blosc is exactly 1.3.0 or castra is older than 0.1.8.
    """
    castra = pytest.importorskip('castra')
    blosc = pytest.importorskip('blosc')

    # pytest.skip() raises, so two guards behave like the combined `or`.
    if LooseVersion(blosc.__version__) == '1.3.0':
        pytest.skip()
    if LooseVersion(castra.__version__) < '0.1.8':
        pytest.skip()

    index = pd.Index([1., 2., 3., 4.], name='ind')
    expected = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]},
                            index=index)

    partitioned = dd.from_pandas(expected, 2)
    loaded = dd.from_castra(partitioned.to_castra())

    lazy_x = loaded[loaded.y > 3].x
    eager_x = expected[expected.y > 3].x
    assert_eq(lazy_x, eager_x)
def test_from_castra_with_selection():
    """Filter then project a frame that was round-tripped through castra.

    Getitems are fused with load_partitions by the optimizer; getitem once
    served both selections and column access, hence the combined check.
    Skipped for blosc 1.3.0 and for castra versions below 0.1.8.
    """
    castra = pytest.importorskip('castra')
    blosc = pytest.importorskip('blosc')

    unsupported = (LooseVersion(blosc.__version__) == '1.3.0'
                   or LooseVersion(castra.__version__) < '0.1.8')
    if unsupported:
        pytest.skip()

    columns = {'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]}
    pdf = pd.DataFrame(columns, index=pd.Index([1., 2., 3., 4.], name='ind'))

    ddf = dd.from_pandas(pdf, 2)
    from_store = dd.from_castra(ddf.to_castra())

    selected = from_store[from_store.y > 3]
    reference = pdf[pdf.y > 3]
    assert_eq(selected.x, reference.x)
# Optional positional overrides from the command line:
#   argv[4] -> p.x, argv[5] -> p.y, argv[6:] -> p.categories (a list).
if len(sys.argv)>4: p.x = sys.argv[4]
if len(sys.argv)>5: p.y = sys.argv[5]
if len(sys.argv)>6: p.categories = sys.argv[6:]

# Build a dask Cache from p.cachesize and register it.
# NOTE(review): presumably p.cachesize is a size limit -- confirm the unit
# against the dask.cache documentation.
from dask.cache import Cache
Cache(p.cachesize).register()

# NOTE(review): judging by the name, these are the file types that persist
# categorical columns themselves -- confirm where this set is consulted.
filetypes_storing_categories = {'parq','castra'}

# Reader table: read[filetype][backend] is a callable taking (filepath, p).
# Only the combinations assigned below exist; e.g. no "dask" reader is set
# for "feather" and no "pandas" reader for "castra" or "bcolz" here.
read = odict([(f,odict()) for f in ["parq","bcolz","feather","castra","h5","csv"]])

# dask-backed readers.  The bcolz reader uses a fixed chunksize of 1000000
# rather than p.chunksize.
read["csv"] ["dask"] = lambda filepath,p: dd.read_csv(filepath, usecols=p.columns)
read["h5"] ["dask"] = lambda filepath,p: dd.read_hdf(filepath, p.base, chunksize=p.chunksize, columns=p.columns)
read["castra"] ["dask"] = lambda filepath,p: dd.from_castra(filepath)
read["bcolz"] ["dask"] = lambda filepath,p: dd.from_bcolz(filepath, chunksize=1000000)
read["parq"] ["dask"] = lambda filepath,p: dd.io.parquet.read_parquet(filepath,index=False, categories=p.categories, columns=p.columns)

# pandas-backed readers.  The feather and parq variants do not use p, so
# p.columns is not applied to them.
read["csv"] ["pandas"] = lambda filepath,p: pd.read_csv(filepath, usecols=p.columns)
read["h5"] ["pandas"] = lambda filepath,p: pd.read_hdf(filepath, p.base, columns=p.columns)
read["feather"] ["pandas"] = lambda filepath,p: feather.read_dataframe(filepath)
read["parq"] ["pandas"] = lambda filepath,p: fp.ParquetFile(filepath).to_pandas()

# Writer table: write[filetype][backend] is a callable taking
# (df, filepath, p).  In the lines visible here only "dask" writers for
# csv, h5, castra and parq are assigned.
write = odict([(f,odict()) for f in ["parq","snappy.parq","gz.parq","bcolz","feather","castra","h5","csv"]])

# The csv writer turns ".csv" into "*.csv" in the target path.
# NOTE(review): presumably so dask writes one file per partition -- confirm
# against dask's to_csv docs.  str.replace rewrites every ".csv" occurrence
# in the path, not just the extension.
write["csv"] ["dask"] = lambda df,filepath,p: df.to_csv(filepath.replace(".csv","*.csv"))
write["h5"] ["dask"] = lambda df,filepath,p: df.to_hdf(filepath, p.base)
write["castra"] ["dask"] = lambda df,filepath,p: df.to_castra(filepath,categories=p.categories)
# NOTE(review): the trailing "## **p.parq_opts" looks like a disabled
# pass-through of extra parquet options; none are forwarded at present.
write["parq"] ["dask"] = lambda df,filepath,p: dd.io.parquet.to_parquet(filepath, df) ## **p.parq_opts