Example 1
import os
import tempfile
from typing import Optional

import pandas as pd

from blocks.filesystem import FileSystem  # per the docstring below
from blocks.io import write_df  # assumed import location for write_df


def place(
    df: pd.DataFrame,
    path: str,
    filesystem: FileSystem = FileSystem(),
    tmpdir: Optional[str] = None,
    **write_args,
) -> None:
    """Place a dataframe block onto the filesystem at the specified path

    Parameters
    ----------
    df : pd.DataFrame
        The data to place
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    tmpdir : str, optional
        Local temporary directory used to stage the file before the copy
    write_args : dict
        Any additional args to pass to the write function

    """
    # Fall back to a fresh local staging directory if none was supplied
    if tmpdir is None:
        tmpdir = tempfile.mkdtemp()

    # Stage the dataframe locally, then copy the staged file to the
    # destination filesystem
    fname = os.path.basename(path)
    tmp = os.path.join(tmpdir, fname)
    write_df(df, tmp, **write_args)
    filesystem.copy(tmp, path)
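For context, a minimal usage sketch (hypothetical destination path; assumes the default FileSystem can write to it and that write_df infers the format from the .pq extension):

import tempfile

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "x": [0.1, 0.2, 0.3]})
with tempfile.TemporaryDirectory() as tmpdir:
    place(df, "/data/out/block.pq", tmpdir=tmpdir)  # hypothetical path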
Example 2
def _reload(data, path, **kwargs):
    # Round-trip helper: write the dataframe through blocks.io, then read it back
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(data, d, **kwargs)
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    return df
Example 3
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip("pyarrow")
    pytest.importorskip("pandas", minversion="0.22.0")
    path = os.path.join(datadir_local, "tmp.parquet.gz")
    # write compressed with pandas
    randomdata.to_parquet(path, compression="gzip")

    # read compressed with blocks
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    d = LocalDataFile(path, path)
    io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert np.isclose(df, randomdata).all().all()
Example 4
def place(df, path, filesystem=GCSFileSystem(), **write_args):
    """ Place a dataframe block onto the filesystem at the specified path

    Parameters
    ----------
    df : pd.DataFrame
        The data to place
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    write_args : dict
        Any additional args to pass to the write function

    """
    # Split the target path into its parent directory and file name, then
    # write through the filesystem's staging context manager
    bucket, fname = os.path.dirname(path), os.path.basename(path)
    with filesystem.store(bucket, [fname]) as datafiles:
        write_df(df, datafiles[0], **write_args)
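Here filesystem.store acts as a context manager that yields writable staging targets and persists them to the bucket when the block exits. As a rough mental model only, not the blocks implementation, a local-only version of that pattern could look like this:

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def store(bucket, filenames):
    # Yield local staging paths; copy each one into `bucket` when the
    # caller's with-block exits (toy local-only sketch)
    tmpdir = tempfile.mkdtemp()
    try:
        paths = [os.path.join(tmpdir, name) for name in filenames]
        yield paths
        os.makedirs(bucket, exist_ok=True)
        for name, src in zip(filenames, paths):
            shutil.copy(src, os.path.join(bucket, name))
    finally:
        shutil.rmtree(tmpdir)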
Example 5
def test_compression_pickle(randomdata, datadir_local):
    path = os.path.join(datadir_local, 'tmp.pkl.gz')
    # write compressed with pandas
    randomdata.to_pickle(path, compression='gzip')

    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_pickle(path, compression='gzip')
    assert np.isclose(df, randomdata).all().all()
Example 6
def test_compression_parquet(randomdata, datadir_local):
    pytest.importorskip('pyarrow')
    pytest.importorskip('pandas', minversion='0.22.0')
    path = os.path.join(datadir_local, 'tmp.parquet.gz')
    # write compressed with pandas
    randomdata.to_parquet(path, compression='gzip')

    # read compressed with blocks
    with open(path, 'rb') as f:
        d = DataFile(path, f)
        df = io.read_df(d)
    assert np.isclose(df, randomdata).all().all()

    # write compressed with blocks
    with open(path, 'wb') as f:
        d = DataFile(path, f)
        io.write_df(randomdata, d)
    # read compressed with pandas
    df = pd.read_parquet(path)
    assert np.isclose(df, randomdata).all().all()
Example 7
import os
import tempfile
from typing import Dict, Optional, Sequence

import numpy as np
import pandas as pd

from blocks.filesystem import FileSystem  # per the docstring below
from blocks.io import write_df  # assumed import location for write_df


def divide(
    df: pd.DataFrame,
    path: str,
    n_rgroup: int = 1,
    rgroup_offset: int = 0,
    cgroup_columns: Optional[Dict[Optional[str], Sequence[str]]] = None,
    extension: str = ".pq",
    convert: bool = False,
    filesystem: FileSystem = FileSystem(),
    prefix: Optional[str] = None,
    tmpdir: Optional[str] = None,
    **write_args,
) -> None:
    """Split a dataframe into rgroups/cgroups and save to disk

    Note that this splitting does not preserve the original index, so make sure
    to have another column to track values

    Parameters
    ----------
    df : pd.DataFrame
        The data to divide
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    n_rgroup : int, default 1
        The number of row groups to partition the data into
        The rgroups will have approximately equal sizes
    rgroup_offset : int, default 0
        The index to start from in the name of file parts
        e.g. If rgroup_offset=10 then the first file will be `part_00010.pq`
    cgroup_columns : {cgroup: list of column names}
        The column lists to form cgroups; if None, do not make cgroups
        Each key is the name of the cgroup, and each value is the list of columns to include
        To reassemble later make sure to include join keys for each cgroup
    extension : str, default .pq
        The file extension for the dataframe (the file type is inferred from this extension)
    convert : bool, default False
        If true attempt to coerce types to numeric. This can avoid issues with ambiguous
        object columns but requires additional time
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    prefix : str, optional
        Prefix to add to written filenames
    tmpdir : str, optional
        Local temporary directory used to stage the files before the copy
    write_args : dict
        Any additional args to pass to the write function

    """
    # Fall back to a fresh local staging directory if none was supplied
    if tmpdir is None:
        tmpdir = tempfile.mkdtemp()

    # Use a single dummy cgroup if None wanted
    if cgroup_columns is None:
        cgroup_columns = {None: df.columns}

    # Add leading dot if not in extension
    if extension[0] != ".":
        extension = "." + extension

    if convert:
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="ignore")

    files = []
    for cname, columns in cgroup_columns.items():
        cgroup = df[columns]

        tmp_cgroup = os.path.join(tmpdir, cname) if cname else tmpdir

        if not filesystem.isdir(tmp_cgroup):
            filesystem.mkdir(tmp_cgroup)

        rnames = [
            "part_{:05d}{}".format(i + rgroup_offset, extension)
            for i in range(n_rgroup)
        ]
        if prefix is not None:
            rnames = [prefix + "_" + rn for rn in rnames]

        for rgroup, rname in zip(np.array_split(cgroup, n_rgroup), rnames):
            tmp = os.path.join(tmp_cgroup, rname)
            write_df(rgroup.reset_index(drop=True), tmp, **write_args)
            files.append((cname, rname) if cname else (rname, ))

    filesystem.copy(
        [os.path.join(tmpdir, *f) for f in files],
        [os.path.join(path, *f) for f in files],
    )
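A hedged usage sketch for divide (hypothetical column names and output directory), splitting into two row groups and two cgroups that share an id join key:

import tempfile

import pandas as pd

df = pd.DataFrame({"id": range(6), "a": range(6), "b": range(6)})
cgroups = {"left": ["id", "a"], "right": ["id", "b"]}  # "id" is the shared join key
with tempfile.TemporaryDirectory() as tmpdir:
    divide(df, "/data/out", n_rgroup=2, cgroup_columns=cgroups, tmpdir=tmpdir)
# Expected layout, following the part naming scheme above:
#   /data/out/left/part_00000.pq   /data/out/left/part_00001.pq
#   /data/out/right/part_00000.pq  /data/out/right/part_00001.pq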
Example 8
def _reload(data, path, **kwargs):
    # Round-trip helper using local data files: write via blocks.io, then read back
    d = LocalDataFile(path, path)
    io.write_df(data, d, **kwargs)
    d = LocalDataFile(path, path)
    df = io.read_df(d)
    return df
Example 9
def divide(
        df, path,
        n_rgroup=1,
        rgroup_offset=0,
        cgroup_columns=None,
        extension='.pq',
        convert=False,
        filesystem=GCSFileSystem(),
        **write_args
):
    """ Split a dataframe into rgroups/cgroups and save to disk

    Note that this splitting does not preserve the original index, so make sure
    to have another column to track values

    Parameters
    ----------
    df : pd.DataFrame
        The data to divide
    path : str
        Path to the directory (possibly on GCS) in which to place the columns
    n_rgroup : int, default 1
        The number of row groups to partition the data into
        The rgroups will have approximately equal sizes
    rgroup_offset : int, default 0
        The index to start from in the name of file parts
        e.g. If rgroup_offset=10 then the first file will be `part_00010.pq`
    cgroup_columns : {cgroup: list of column names}
        The column lists to form cgroups; if None, do not make cgroups
        Each key is the name of the cgroup, and each value is the list of columns to include
        To reassemble later make sure to include join keys for each cgroup
    extension : str, default .pq
        The file extension for the dataframe (the file type is inferred from this extension)
    convert : bool, default False
        If true attempt to coerce types to numeric. This can avoid issues with ambiguous
        object columns but requires additional time
    filesystem : blocks.filesystem.FileSystem or similar
        A filesystem object that implements the blocks.FileSystem API
    write_args : dict
        Any additional args to pass to the write function

    """
    # Use a single dummy cgroup if None wanted
    if cgroup_columns is None:
        cgroup_columns = {None: df.columns}

    # Add leading dot if not in extension
    if extension[0] != '.':
        extension = '.'+extension

    if convert:
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='ignore')

    for cname, columns in cgroup_columns.items():
        cgroup = df[columns]

        bucket = os.path.join(path, cname) if cname else path
        rnames = ['part_{:05d}{}'.format(i+rgroup_offset, extension) for i in range(n_rgroup)]
        with filesystem.store(bucket, rnames) as datafiles:
            for rgroup, d in zip(np.array_split(cgroup, n_rgroup), datafiles):
                write_df(rgroup.reset_index(drop=True), d, **write_args)
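Because divide drops the original index, reassembling cgroups later depends on the join keys included in each cgroup, as the docstring notes. A hypothetical pandas-only reassembly sketch (the library itself may provide a dedicated helper for this):

import glob
import os

import pandas as pd

def reassemble(path, cgroup_names, key="id"):
    # Read each cgroup's row-group parts back and merge them on the join key
    frames = []
    for cname in cgroup_names:
        parts = sorted(glob.glob(os.path.join(path, cname, "part_*.pq")))
        frames.append(pd.concat([pd.read_parquet(p) for p in parts], ignore_index=True))
    out = frames[0]
    for frame in frames[1:]:
        out = out.merge(frame, on=key)
    return out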