def load_datasets(self, outofcore):
    """Populate ``self.df`` from the data file named in the configuration.

    ``self.config['file']`` is resolved against the directory of
    ``self.config_path`` unless it is already absolute.  A ``.csv`` file is
    read with pandas, limited to the columns named by ``self.fields`` (``None``
    entries skipped) plus elements 1 and 2 of every ``self.axes`` value; each
    column in ``self.categorical_fields`` is then cast to ``category``.  A
    ``.castra`` path is opened as a dask dataframe and, when ``outofcore`` is
    falsy, passed through ``.cache(cache=dict)``.

    Raises:
        IOError: if the resolved path does not exist, or its extension is
            neither ``.csv`` nor ``.castra``.
    """
    data_path = self.config['file']
    print('Loading Data from {}...'.format(data_path))

    # Relative paths are interpreted next to the configuration file.
    if not path.isabs(data_path):
        data_path = path.join(path.dirname(self.config_path), data_path)

    if not path.exists(data_path):
        raise IOError('Unable to find input dataset: "{}"'.format(data_path))

    # Column names required by the axes, then by the configured fields.
    axis_columns = [axis[pos] for axis in self.axes.values() for pos in (1, 2)]
    field_columns = [name for name in self.fields.values() if name is not None]
    wanted_columns = field_columns + axis_columns

    if data_path.endswith(".csv"):
        self.df = pd.read_csv(data_path, usecols=wanted_columns)

        # parse categorical fields
        for column in self.categorical_fields:
            self.df[column] = self.df[column].astype('category')
        return

    if data_path.endswith(".castra"):
        import dask.dataframe as dd
        self.df = dd.from_castra(data_path)
        if not outofcore:
            self.df = self.df.cache(cache=dict)
        return

    raise IOError("Unknown data file type; .csv and .castra currently supported")
Beispiel #2
0
def test_categorical_index_with_dask_dataframe():
    """Check that a castra with a categorical index loads correctly in dask.

    Two frames sharing the index name ``foo`` are appended to a castra whose
    ``foo`` index is declared categorical.  The dask dataframe built from it
    must report divisions ``('a', 'c', 'd')``, compute to the concatenation of
    both frames with an ordered ``CategoricalIndex``, and support ``.loc``
    lookups on index labels.
    """
    pytest.importorskip('dask.dataframe')
    import dask.dataframe as dd

    # The synchronous scheduler used to be spelled ``dask.async.get_sync``,
    # but ``async`` is a reserved word from Python 3.7 on, so that attribute
    # access no longer parses.  Prefer ``dask.local`` when the installed dask
    # provides it, otherwise load the legacy module by its string name.
    try:
        from dask.local import get_sync
    except ImportError:
        import importlib
        get_sync = importlib.import_module('dask.async').get_sync

    A = pd.DataFrame({'x': [1, 2, 3, 4]},
                     index=pd.Index(['a', 'a', 'b', 'b'], name='foo'))
    B = pd.DataFrame({'x': [4, 5, 6]},
                     index=pd.Index(['c', 'd', 'd'], name='foo'))

    store_dir = tempfile.mkdtemp(prefix='castra-')
    try:
        with Castra(path=store_dir, template=A, categories=['foo']) as c:
            c.extend(A)
            c.extend(B)

            df = dd.from_castra(store_dir)
            assert df.divisions == ('a', 'c', 'd')

            result = df.compute(get=get_sync)

            expected = pd.concat([A, B])
            expected.index = pd.CategoricalIndex(expected.index,
                                                 name=expected.index.name,
                                                 ordered=True)

            tm.assert_frame_equal(result, expected)

            tm.assert_frame_equal(df.loc['a'].compute(), expected.loc['a'])
            tm.assert_frame_equal(df.loc['b'].compute(get=get_sync),
                                  expected.loc['b'])
    finally:
        # Always remove the temporary castra directory, even on failure.
        shutil.rmtree(store_dir)
Beispiel #3
0
def test_from_castra():
    """Round-trip a frame through castra and read it back three ways.

    The castra is loaded from the castra object itself, from its path, and
    with a single column selected; each result must match the source data.
    """
    pytest.importorskip("castra")
    index = pd.Index([1.0, 2.0, 3.0, 4.0], name="ind")
    columns = {"x": ["a", "b", "c", "d"], "y": [2, 3, 4, 5]}
    df = pd.DataFrame(columns, index=index)
    source = dd.from_pandas(df, 2)

    c = source.to_castra()
    from_object = dd.from_castra(c)
    from_path = dd.from_castra(c.path)
    x_only = dd.from_castra(c, "x")
    try:
        tm.assert_frame_equal(df, from_object.compute())
        tm.assert_frame_equal(df, from_path.compute())
        tm.assert_series_equal(df.x, x_only.compute())
    finally:
        # An explicit c.drop() would race with the drop performed by
        # `from_path.__del__`; deleting both names by hand avoids that.
        del from_path, c
Beispiel #4
0
def test_from_castra():
    """Verify ``dd.from_castra`` for object, path and column-selected loads."""
    pytest.importorskip('castra')
    df = pd.DataFrame(
        {'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]},
        index=pd.Index([1., 2., 3., 4.], name='ind'),
    )
    ddf = dd.from_pandas(df, 2)

    c = ddf.to_castra()
    loaded_from_castra = dd.from_castra(c)
    loaded_from_path = dd.from_castra(c.path)
    loaded_column = dd.from_castra(c, 'x')
    try:
        whole_frame = loaded_from_castra.compute()
        tm.assert_frame_equal(df, whole_frame)

        frame_via_path = loaded_from_path.compute()
        tm.assert_frame_equal(df, frame_via_path)

        x_series = loaded_column.compute()
        tm.assert_series_equal(df.x, x_series)
    finally:
        # Do not call c.drop() here: it races with the drop issued from
        # `loaded_from_path.__del__`.  Manual `del` sidesteps the race.
        del loaded_from_path, c
Beispiel #5
0
def test_from_castra_with_selection():
    """Boolean selection on a castra-backed frame must match pandas.

    Optimizations fuse getitems with load_partitions, and getitem used to
    serve both column access and selections, so the two are exercised
    together here.
    """
    pytest.importorskip("castra")
    ind = pd.Index([1.0, 2.0, 3.0, 4.0], name="ind")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [2, 3, 4, 5]}, index=ind)
    a = dd.from_pandas(df, 2)

    # Round-trip through castra so the selection runs on a castra-backed graph.
    castra_store = a.to_castra()
    b = dd.from_castra(castra_store)

    selected = b[b.y > 3].x
    expected = df[df.y > 3].x
    assert eq(selected, expected)
Beispiel #6
0
def test_from_castra_with_selection():
    """Compare a row filter plus column pick on castra data with pandas.

    Regression check for graph optimization: getitems get fused with
    load_partitions, and getitem was once used for both column access and
    row selection.
    """
    pytest.importorskip('castra')
    data = {'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]}
    df = pd.DataFrame(data, index=pd.Index([1., 2., 3., 4.], name='ind'))
    partitioned = dd.from_pandas(df, 2)

    b = dd.from_castra(partitioned.to_castra())

    dask_rows = b[b.y > 3]
    pandas_rows = df[df.y > 3]
    assert eq(dask_rows.x, pandas_rows.x)
Beispiel #7
0
def test_from_castra_with_selection():
    """Selection followed by column access works on a castra-backed frame.

    The optimizer fuses getitems with load_partitions; because getitem
    previously handled both column access and selections, this test runs a
    selection and a column access in one expression.
    """
    pytest.importorskip('castra')

    row_labels = pd.Index([1., 2., 3., 4.], name='ind')
    df = pd.DataFrame(
        {
            'x': ['a', 'b', 'c', 'd'],
            'y': [2, 3, 4, 5],
        },
        index=row_labels,
    )

    a = dd.from_pandas(df, 2)
    stored = a.to_castra()
    b = dd.from_castra(stored)

    lazy_mask = b.y > 3
    eager_mask = df.y > 3
    assert eq(b[lazy_mask].x, df[eager_mask].x)
Beispiel #8
0
    def load_datasets(self, outofcore):
        """Load the configured dataset into ``self.df``.

        The file name comes from ``self.config['file']`` and is resolved
        against the directory of ``self.config_path`` when it is relative.
        Supported inputs, chosen by file extension:

        * ``.csv``    -- read with pandas, restricted to the columns named by
          ``self.fields`` (``None`` entries skipped) and by the ``'xaxis'`` /
          ``'yaxis'`` entries of every ``self.axes`` value.
        * ``.h5``     -- read with ``pd.read_hdf``; the key is
          ``self.config['objpath']`` or, when that is missing or empty, the
          file's base name without its extension.
        * ``.castra`` -- opened as a dask dataframe; when ``outofcore`` is
          falsy it is passed through ``.cache(cache=dict)``.

        For the two pandas formats every column listed in
        ``self.categorical_fields`` is cast to the ``category`` dtype.

        Args:
            outofcore: only consulted for ``.castra`` input; a truthy value
                skips the ``.cache(cache=dict)`` step.

        Raises:
            IOError: if the resolved path does not exist or has an
                unsupported extension.
        """
        data_path = self.config['file']
        objpath = self.config.get('objpath', None)
        print('Loading Data from {}...'.format(data_path))

        # Relative paths are interpreted next to the configuration file.
        if not path.isabs(data_path):
            data_path = path.join(path.dirname(self.config_path), data_path)

        if not path.exists(data_path):
            raise IOError(
                'Unable to find input dataset: "{}"'.format(data_path))

        axes_fields = [axis[key]
                       for axis in self.axes.values()
                       for key in ('xaxis', 'yaxis')]
        load_fields = [f for f in self.fields.values()
                       if f is not None] + axes_fields

        if data_path.endswith(".csv"):
            self.df = pd.read_csv(data_path, usecols=load_fields)

        elif data_path.endswith(".h5"):
            if not objpath:
                # Default the HDF key to the file name minus its extension.
                objpath = path.splitext(path.basename(data_path))[0]
            self.df = pd.read_hdf(data_path, objpath)

        elif data_path.endswith(".castra"):
            import dask.dataframe as dd
            self.df = dd.from_castra(data_path)
            if not outofcore:
                self.df = self.df.cache(cache=dict)
            # No categorical conversion is applied to castra input.
            return

        else:
            # The message lists every extension handled above, including .h5.
            raise IOError(
                "Unknown data file type; "
                ".csv, .h5 and .castra currently supported")

        # parse categorical fields (shared by the csv and h5 branches)
        for f in self.categorical_fields:
            self.df[f] = self.df[f].astype('category')
Beispiel #9
0
def test_from_castra_with_selection():
    """Selection plus column access on castra data must agree with pandas.

    Optimizations fuse getitems with load_partitions; getitem used to be
    used for both column access and selections.  The test is skipped for
    blosc 1.3.0 and for castra releases older than 0.1.8.
    """
    castra = pytest.importorskip('castra')
    blosc = pytest.importorskip('blosc')
    # Two guards instead of one `or`: pytest.skip() raises, so the castra
    # version is still only inspected when the blosc check did not skip.
    if LooseVersion(blosc.__version__) == '1.3.0':
        pytest.skip()
    if LooseVersion(castra.__version__) < '0.1.8':
        pytest.skip()

    frame_index = pd.Index([1., 2., 3., 4.], name='ind')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]},
                      index=frame_index)
    a = dd.from_pandas(df, 2)

    b = dd.from_castra(a.to_castra())

    selected = b[b.y > 3].x
    expected = df[df.y > 3].x
    assert_eq(selected, expected)
Beispiel #10
0
def test_from_castra_with_selection():
    """Filter rows then pick a column on a frame loaded from castra.

    Guards the optimization that fuses getitems with load_partitions
    (getitem once covered both column access and selections).  Skipped when
    blosc is exactly 1.3.0 or castra is older than 0.1.8.
    """
    castra = pytest.importorskip('castra')
    blosc = pytest.importorskip('blosc')
    unsupported_versions = (
        LooseVersion(blosc.__version__) == '1.3.0'
        or LooseVersion(castra.__version__) < '0.1.8'
    )
    if unsupported_versions:
        pytest.skip()

    df = pd.DataFrame(
        {'x': ['a', 'b', 'c', 'd'], 'y': [2, 3, 4, 5]},
        index=pd.Index([1., 2., 3., 4.], name='ind'),
    )
    partitioned = dd.from_pandas(df, 2)
    stored = partitioned.to_castra()
    b = dd.from_castra(stored)

    dask_rows = b[b.y > 3]
    pandas_rows = df[df.y > 3]
    assert_eq(dask_rows.x, pandas_rows.x)
Beispiel #11
0
    if len(sys.argv)>4: p.x           = sys.argv[4]
    if len(sys.argv)>5: p.y           = sys.argv[5]
    if len(sys.argv)>6: p.categories  = sys.argv[6:]

from dask.cache import Cache
# Build a dask Cache sized by p.cachesize and register it with dask.
# NOTE(review): `p` is defined outside this excerpt -- presumably the parsed
# command-line parameters; confirm against the code above.
Cache(p.cachesize).register()


# File types that, going by the name, store categorical information
# themselves.  Not referenced within this excerpt.
filetypes_storing_categories = {'parq','castra'}


# read[filetype][backend] -> callable(filepath, p) returning a dataframe.
# Each file type starts with an empty ordered mapping; a backend is only
# present where a reader is assigned below (e.g. no dask reader for feather,
# no pandas reader for castra or bcolz).
read = odict([(f,odict()) for f in ["parq","bcolz","feather","castra","h5","csv"]])
               
# dask readers
read["csv"]     ["dask"]   = lambda filepath,p:  dd.read_csv(filepath, usecols=p.columns)
read["h5"]      ["dask"]   = lambda filepath,p:  dd.read_hdf(filepath, p.base, chunksize=p.chunksize, columns=p.columns)
read["castra"]  ["dask"]   = lambda filepath,p:  dd.from_castra(filepath)
# bcolz uses a fixed chunksize of 1,000,000 rather than p.chunksize.
read["bcolz"]   ["dask"]   = lambda filepath,p:  dd.from_bcolz(filepath, chunksize=1000000)
read["parq"]    ["dask"]   = lambda filepath,p:  dd.io.parquet.read_parquet(filepath,index=False, categories=p.categories, columns=p.columns)

# pandas readers (feather and parquet go through the `feather` and `fp`
# modules, which are imported outside this excerpt)
read["csv"]     ["pandas"] = lambda filepath,p:  pd.read_csv(filepath, usecols=p.columns)
read["h5"]      ["pandas"] = lambda filepath,p:  pd.read_hdf(filepath, p.base, columns=p.columns)
read["feather"] ["pandas"] = lambda filepath,p:  feather.read_dataframe(filepath)
read["parq"]    ["pandas"] = lambda filepath,p:  fp.ParquetFile(filepath).to_pandas()


# write[filetype][backend] -> callable(df, filepath, p) that stores `df`.
# NOTE(review): the excerpt ends after the first parquet writer; further
# entries may follow in the full file.
write = odict([(f,odict()) for f in ["parq","snappy.parq","gz.parq","bcolz","feather","castra","h5","csv"]])

# The CSV target name gets a "*" inserted before ".csv" -- presumably so dask
# can write one file per partition; confirm against dask's to_csv docs.
write["csv"]          ["dask"]   = lambda df,filepath,p:  df.to_csv(filepath.replace(".csv","*.csv"))
write["h5"]           ["dask"]   = lambda df,filepath,p:  df.to_hdf(filepath, p.base)
write["castra"]       ["dask"]   = lambda df,filepath,p:  df.to_castra(filepath,categories=p.categories)
write["parq"]         ["dask"]   = lambda df,filepath,p:  dd.io.parquet.to_parquet(filepath, df) ## **p.parq_opts