Example #1
def _reload(data, path, **kwargs):
    # Round-trip helper: write the frame to path with blocks, then read it back.
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(data, d, **kwargs)
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    return df
Example #2
def test_compression_pickle(randomdata, datadir_local):
    path = os.path.join(datadir_local, "tmp.pkl.gz")
    # write compressed with pandas
    randomdata.to_pickle(path, compression="gzip")

    # read compressed with blocks
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()
Example #3
def merged(rgroup):
    # Nested helper: merge the frames for one rgroup across all cgroups,
    # using grouped/read_args/cgroup_args/merge from the enclosing scope.
    frames = []
    for cgroup in grouped:
        p = next(p for p in grouped[cgroup]
                 if os.path.basename(p) == rgroup)
        args = read_args.copy()
        if cgroup in cgroup_args:
            args.update(cgroup_args[cgroup])
        frames.append(read_df(p, **args))
    return _merge_all(frames, merge=merge)
Example #4
def merged(rgroup):
    # Nested helper: merge the frames for one rgroup across all cgroups,
    # using grouped/read_args/cgroup_args/merge from the enclosing scope.
    frames = []
    for cgroup in grouped:
        datafile = next(d for d in grouped[cgroup]
                        if os.path.basename(d.path) == rgroup)
        args = read_args.copy()
        if cgroup in cgroup_args:
            args.update(cgroup_args[cgroup])
        frames.append(read_df(datafile, **args))
    return _merge_all(frames, merge=merge)
Example #5
def assemble(path, cgroups=None, rgroups=None,
             read_args={}, cgroup_args={}, merge='inner', filesystem=GCSFileSystem()):
    """ Assemble multiple dataframe blocks into a single frame

    All files included in the path (or subdirs of that path) are combined into
    a single dataframe by first concatenating over row groups and then merging
    over cgroups. The merges are performed in the order of the listed cgroups if
    provided, otherwise in alphabetical order. Files are opened by a method
    inferred from their extension.

    Parameters
    ----------
    path : str
        The glob-able path to all datafiles to assemble into a frame
        e.g. gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/*
        See the README for a more detailed explanation
    cgroups : list of str, optional
        The list of cgroups (folder names) to include from the glob path
    rgroups : list of str, optional
        The list of rgroups (file names) to include from the glob path
    read_args : dict, optional
        Any additional keyword args to pass to the read function
    cgroup_args : {cgroup: kwargs}, optional
        Any cgroup specific read arguments, where each key is the name
        of the cgroup and each value is a dictionary of keyword args
    merge : one of 'left', 'right', 'outer', 'inner', default 'inner'
        The merge strategy to pass to pandas.merge
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API

    Returns
    -------
    data : pd.DataFrame
        The combined dataframe from all the blocks

    """
    grouped = _collect(path, cgroups, rgroups, filesystem)

    # ----------------------------------------
    # Concatenate all rgroups
    # ----------------------------------------
    frames = []

    for group in grouped:
        datafiles = grouped[group]
        args = read_args.copy()
        if group in cgroup_args:
            args.update(cgroup_args[group])
        frames.append(pd.concat(read_df(d, **args) for d in datafiles))

    # ----------------------------------------
    # Merge all cgroups
    # ----------------------------------------
    return _merge_all(frames, merge=merge)
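
A hedged usage sketch for assemble: the bucket layout, group names, and the columns read argument below are illustrative assumptions, not taken from the library's documentation.

# Minimal usage sketch (illustrative paths and group names), assuming a layout of
#   gs://example/features/part.0.pq   gs://example/features/part.1.pq
#   gs://example/labels/part.0.pq     gs://example/labels/part.1.pq
df = assemble(
    "gs://example/*/*",
    cgroups=["features", "labels"],              # merge order, left to right
    rgroups=["part.0.pq", "part.1.pq"],          # restrict to these files
    cgroup_args={"labels": {"columns": ["y"]}},  # extra read args for one cgroup
    merge="inner",
)
# df holds the two row groups stacked vertically, with the features and labels
# blocks merged column-wise using an inner merge.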
Example #6
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip("pyarrow")
    pytest.importorskip("pandas", minversion="0.22.0")
    path = os.path.join(datadir_local, "tmp.parquet.gz")
    # write compressed with pandas
    randomdata.to_parquet(path, compression="gzip")

    # read compressed with blocks
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    d = LocalDataFile(path, path)
    io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert np.isclose(df, randomdata).all().all()
Example #7
def test_compression_pickle(randomdata, datadir_local):
    path = os.path.join(datadir_local, 'tmp.pkl.gz')
    # write compressed with pandas
    randomdata.to_pickle(path, compression='gzip')

    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert (np.isclose(df, randomdata).all().all())

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_pickle(path, compression='gzip')
    assert (np.isclose(df, randomdata).all().all())
Example #8
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip('pyarrow')
    pytest.importorskip('pandas', minversion='0.22.0')
    path = os.path.join(datadir_local, 'tmp.parquet.gz')
    # write compressed with pandas
    randomdata.to_parquet(path, compression='gzip')

    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert (np.isclose(df, randomdata).all().all())

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert (np.isclose(df, randomdata).all().all())
Example #9
def iterate(
    path: str,
    axis: int = -1,
    cgroups: Optional[Sequence[cgroup]] = None,
    rgroups: Optional[Sequence[rgroup]] = None,
    read_args: Any = {},
    cgroup_args: Dict[cgroup, Any] = {},
    merge: str = "inner",
    filesystem: FileSystem = FileSystem(),
    tmpdir: Optional[str] = None,
) -> Union[
    Iterator[Tuple[cgroup, rgroup, pd.DataFrame]],
    Iterator[Tuple[str, pd.DataFrame]],
]:
    """Iterate over dataframe blocks

    Each file included in the path (or subdirs of that path) is opened as a
    dataframe and yielded from a generator of (cname, rname, dataframe).
    Files are opened by a method inferred from their extension.

    Parameters
    ----------
    path : str
        The glob-able path to all files to assemble into a frame
        e.g. gs://example/*/*, gs://example/*/part.0.pq, gs://example/c[1-2]/*
        See the README for a more detailed explanation
    axis : int, default -1
        The axis to iterate along
        If -1 (the default), iterate over both columns and rows
        If 0, iterate over the rgroups, combining any cgroups
        If 1, iterate over the cgroups, combining any rgroups
    cgroups : list of str, or {str: args}, optional
        The list of cgroups (folder names) to include from the glob path
    rgroups : list of str, optional
        The list of rgroups (file names) to include from the glob path
    read_args : dict, optional
        Any additional keyword args to pass to the read function
    cgroup_args : {cgroup: kwargs}, optional
        Any cgroup specific read arguments, where each key is the name
        of the cgroup and each value is a dictionary of keyword args
    merge : one of 'left', 'right', 'outer', 'inner', default 'inner'
        The merge strategy to pass to pandas.merge, only used when axis=0
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    tmpdir : str, optional
        The temporary directory to use when collecting the files

    Returns
    -------
    data : generator
        A generator of (cname, rname, dataframe) for each collected path
        If axis=0, yields (rname, dataframe)
        If axis=1, yields (cname, dataframe)

    """
    grouped = _collect(path, cgroups, rgroups, filesystem, tmpdir)

    if axis == -1:
        for cgroup in grouped:
            args = read_args.copy()
            if cgroup in cgroup_args:
                args.update(cgroup_args[cgroup])
            for path in grouped[cgroup]:
                yield _cname(path), _rname(path), read_df(path, **args)

    elif axis == 0:
        # find the shared files among all subfolders
        rgroups = _shared_rgroups(grouped)

        for rgroup in sorted(rgroups):
            frames = []
            for cgroup in grouped:
                path = next(d for d in grouped[cgroup] if _rname(d) == rgroup)

                args = read_args.copy()
                if cgroup in cgroup_args:
                    args.update(cgroup_args[cgroup])
                frames.append(read_df(path, **args))
            yield rgroup, _merge_all(frames, merge=merge)

    elif axis == 1:
        for cgroup in grouped:
            files = grouped[cgroup]
            args = read_args.copy()
            if cgroup in cgroup_args:
                args.update(cgroup_args[cgroup])
            yield cgroup, pd.concat(read_df(path, **args) for path in files)

    else:
        raise ValueError("Invalid choice for axis, options are -1, 0, 1")
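
Since iterate is a generator, the axis argument only changes what each yielded tuple contains. A hedged sketch of the three modes follows; the path and the handle function are placeholders.

# axis=-1 (default): one frame per file, keyed by (cgroup name, rgroup name)
for cname, rname, frame in iterate("gs://example/*/*"):
    handle(cname, rname, frame)

# axis=0: cgroups merged column-wise, one frame per shared rgroup
for rname, frame in iterate("gs://example/*/*", axis=0, merge="inner"):
    handle(rname, frame)

# axis=1: rgroups concatenated row-wise, one frame per cgroup
for cname, frame in iterate("gs://example/*/*", axis=1):
    handle(cname, frame)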
Example #10
def _reload(data, path, **kwargs):
    # Round-trip helper: write the frame to path with blocks, then read it back.
    d = LocalDataFile(path, path)
    io.write_df(data, d, **kwargs)
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    return df