Example #1
def ls(path: str,
       full_path: bool = False,
       recursive: bool = False,
       **kwargs) -> List[str]:
    """ List the contents of a local/s3 directory

    Parameters
    ----------
    path : str
        Local or S3 Path

    full_path : bool
        Include the full path, or just the path relative to `path`

    recursive : bool
        Recursively list within the given path

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be specified.
        These are only forwarded to s3.ls; they are not passed on for local
        paths.

    Returns
    -------
    List[str]
        The listing produced by s3.ls or local.ls for `path`
    """
    # Dispatch on the path type: S3 paths go to the s3 backend (which accepts
    # the extra kwargs), everything else goes to the local backend.
    if s3.is_s3path(path):
        return s3.ls(path, full_path, recursive, **kwargs)
    return local.ls(path, full_path, recursive)
Example #2
def save_parquet_fp(df: pd.DataFrame, path: str, **kwargs) -> None:
    """ Helper function to save a DataFrame to a parquet DataSet

    See the [fastparquet Docs](https://fastparquet.readthedocs.io/en/latest/api.html) for more information

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to export to parquet

    path : str
        The root path to save the DataFrame to, this can either be S3 or local

    Additional Parameters
    ---------------------
    The following parameters are optional and can tweak how the DataFrame gets
    converted to parquet.

    fs : s3fs.S3FileSystem
        This will be used to save the data to S3 if applicable

    file_scheme: "simple"|"hive" (default "hive")
        If simple: all goes in a single file
        If hive: each row group is in a separate file, and a separate file
        (called "_metadata") contains the metadata.

    write_index: bool
        Whether or not to write the index to a separate column.  By default we
        write the index *if* it is not 0, 1, ..., n.

    partition_on: List[str]
        Passed to groupby in order to split data within each row-group,
        producing a structured directory tree. Note: as with pandas, null
        values will be dropped. Ignored if file_scheme is simple.

    Any other keyword arguments are forwarded unchanged to fastparquet.write.
    See [fastparquet.write](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write)
    documentation for full details.
    """
    # Imported lazily so fastparquet is only required when this helper is used
    import fastparquet as fp

    # Pull out the arguments handled here; the rest go straight to fp.write
    fs = kwargs.pop("fs", None)
    file_scheme = kwargs.pop("file_scheme", "hive")

    # Pick the file opener matching the destination: the S3 filesystem's open
    # for S3 paths, the builtin open otherwise.
    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    logger.info("Writing Dataframe to Parquet using fastparquet")

    fp.write(path, df, file_scheme=file_scheme, open_with=myopen, **kwargs)

Example #3
def cp(from_path: str,
       to_path: str,
       overwrite: bool = True,
       include_folder_name: bool = True,
       **kwargs) -> None:
    """ Copy a file or directory of files from local/s3 to local/s3

    Parameters
    ----------
    from_path : str
        Directory/file path to copy

    to_path : str
        Path to copy file(s) to.

    overwrite : bool (default True)
        Should the to_path be overwritten if it already exists?

    include_folder_name : bool (default True)
        If copying a directory, add the directory name automatically to the
        to_path.  i.e. if True, the entire folder will be copied to the
        to_path. If False, the *contents* of the directory will be copied to
        the to_path

    kwargs : Dict
        Extra arguments forwarded to s3.cp when either path is an S3 path.
        They are not passed to local.cp.
    """
    # If either end of the copy lives in S3 the s3 backend handles it;
    # only a purely local copy goes through the local backend.
    if s3.is_s3path(from_path) or s3.is_s3path(to_path):
        s3.cp(from_path, to_path, overwrite, include_folder_name, **kwargs)
    else:
        local.cp(from_path, to_path, overwrite, include_folder_name)
Example #4
def already_exists(path: str, **kwargs) -> bool:
    """ Check if a file/directory already exists

    Parameters
    ----------
    path : str
        File / Directory path

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be optionally
        specified. Only forwarded to s3.already_exists.

    Returns
    -------
    bool
        Result of s3.already_exists or local.already_exists for `path`
    """
    if s3.is_s3path(path):
        return s3.already_exists(path, **kwargs)
    return local.already_exists(path)
Example #5
def load_parquet_fp(path: str, **kwargs) -> pd.DataFrame:
    """ Helper function to load a parquet Dataset as a Pandas DataFrame using
    fastparquet

    First creates a [ParquetFile](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile)
    and then converts the ParquetFile to a DataFrame using .to_pandas.
    Refer to the fastparquet documentation for accepted arguments

    Parameters
    ----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    kwargs : Dict
        fs : s3fs.S3FileSystem
            Used to open the dataset when path is an s3 path
        Remaining arguments are split between ParquetFile.to_pandas (those
        selected by parse_args) and the ParquetFile constructor (the rest).

    Returns
    -------
    pd.DataFrame
    """
    # Imported lazily so fastparquet is only required when this helper is used
    import fastparquet as fp

    logger.info(
        f"Reading in Parquet dataset to ParquetFile. kwargs passed {kwargs!r}")

    fs = kwargs.pop("fs", None)

    # Pull out arguments that should be directed to to_pandas
    to_pandas_args = parse_args(fp, ["ParquetFile", "to_pandas"], **kwargs)
    # Remove these args from kwargs; what is left goes to ParquetFile.
    # The key set is built once rather than once per item.
    to_pandas_keys = set(to_pandas_args)
    kwargs = {k: v for k, v in kwargs.items() if k not in to_pandas_keys}

    # Pick the file opener matching the source location
    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    pf = fp.ParquetFile(path, open_with=myopen, **kwargs)

    df = pf.to_pandas(**to_pandas_args)
    return df
Example #6
def get_size(path: str, **kwargs) -> int:
    """ Return size of file/directory in bytes

    Parameters
    ----------
    path : str
        File / Directory path

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be optionally
        specified. Any other keyword arguments are ignored.

    Returns
    -------
    int
        Result of s3.get_size or local.get_size for `path`
    """
    # fs is passed positionally to the s3 backend (None when not supplied)
    fs = kwargs.pop("fs", None)
    if s3.is_s3path(path):
        return s3.get_size(path, fs)
    return local.get_size(path)
Example #7
def rm(path: str, dry_run: bool = False, **kwargs) -> None:
    """ Deletes a file or directory

    Parameters
    ----------
    path : str
        File path to delete

    dry_run : bool
        Print out number of files to be deleted and exit. If False, number of
        files to be deleted will be logged and files will be removed

    kwargs : Dict
        If path is an s3 path, fs: s3fs.S3FileSystem can be specified.
        Only forwarded to s3.rm.
    """
    # Exactly one backend must run: without the else, an S3 delete would be
    # followed by a local delete of the same path string.
    if s3.is_s3path(path):
        s3.rm(path, dry_run, **kwargs)
    else:
        local.rm(path, dry_run)
Example #8
def test_is_s3path(tmpdir):
    """s3.is_s3path is truthy for an s3:// URI and falsy for `tmpdir`."""
    s3_uri = "s3://airdna-data/scratch/ewellinger"
    assert s3.is_s3path(s3_uri)

    # tmpdir is presumably pytest's temporary-directory fixture (a local path)
    local_is_s3 = s3.is_s3path(tmpdir)
    assert not local_is_s3
Example #9
def save_parquet_pa(df: pd.DataFrame, path: str, **kwargs) -> None:
    """ Helper function to save a DataFrame to a parquet DataSet

    See the [PyArrow Docs](https://arrow.apache.org/docs/python/index.html) for more information

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to export to parquet

    path : str
        The root path to save the DataFrame to, this can either be S3 or local

    Additional Parameters
    ---------------------
    The following parameters are optional and can tweak how the DataFrame gets
    converted to parquet.

    fs : s3fs.S3FileSystem
        This will be used to save the data to S3 if applicable

    schema : pyarrow.Schema
        Passed to pyarrow.Table.from_pandas()
        The expected schema of the Arrow Table. This can be used to indicate
        the type of columns if we cannot infer it automatically.

    preserve_index : bool (default False)
        Passed to pyarrow.Table.from_pandas()
        Whether to store the index as an additional column in the resulting Table

    nthreads : int
        Passed to pyarrow.Table.from_pandas()
        If greater than 1, convert columns to Arrow in parallel using indicated
        number of threads

    columns : List[str]
        Passed to pyarrow.Table.from_pandas()
        List of columns to be converted. Uses all columns by default

    partition_cols : List[str]
        Passed to pyarrow.parquet.write_to_dataset()
        Column names by which to partition the dataset
        Columns are partitioned in the order that they are given
    """
    # Imported lazily so pyarrow is only required when this helper is used
    import pyarrow as pa
    import pyarrow.parquet as pq

    logger.info(
        f"Converting dataframe to PyArrow Table. kwargs passed {kwargs!r}")

    # Split the keyword arguments between the two pyarrow calls
    fs = kwargs.pop("fs", None)
    schema = kwargs.pop("schema", None)
    preserve_index = kwargs.pop("preserve_index", False)
    nthreads = kwargs.pop("nthreads", None)
    columns = kwargs.pop("columns", None)
    partition_cols = kwargs.pop("partition_cols", None)

    # Convert the dataframe into a pyArrow Table object
    table = pa.Table.from_pandas(df,
                                 schema=schema,
                                 preserve_index=preserve_index,
                                 nthreads=nthreads,
                                 columns=columns)

    # A filesystem object is only meaningful for S3 paths: drop any supplied
    # fs for local paths, and create a default one for S3 when none was given.
    if not s3.is_s3path(path):
        fs = None
    elif fs is None:
        fs = s3fs.S3FileSystem()

    logger.info("Writing Arrow Table to Parquet Dataset")

    # NOTE(review): the write call was missing from the visible source and is
    # reconstructed from the docstring (partition_cols -> write_to_dataset)
    # and the prepared `fs`; forwarding the leftover kwargs is an assumption
    # to be confirmed against the original module.
    pq.write_to_dataset(table,
                        path,
                        partition_cols=partition_cols,
                        filesystem=fs,
                        **kwargs)


Example #10
def load_parquet_pa(path: str, **kwargs) -> pd.DataFrame:
    """ Helper function to load a parquet Dataset as a Pandas DataFrame

    Parameters
    ----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    Additional Parameters
    ---------------------
    The following parameters are optional and can tweak how the Dataset gets
    converted back to a DataFrame

    fs : s3fs.S3FileSystem
        Used to read the dataset when path is an s3 path; ignored for local
        paths

    split_row_groups : bool (default False)
        Passed to pyarrow.parquet.ParquetDataset()
        Divide files into pieces for each row group in the file

    filters : List[Tuple]
        Passed to pyarrow.parquet.ParquetDataset()
        List of filters to apply, like `[('x', '=', 0), ...]`. This implements
        partition-level (hive) filtering only, i.e., to prevent the loading of
        some files of the dataset.

    columns : List[str]
        Passed to pyarrow.parquet.ParquetDataset().read()
        Names of columns to read from the dataset

    Any additional kwargs are passed to pyarrow.Table.to_pandas().
    See [documentation](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html?highlight=table#pyarrow.Table.to_pandas) for more information

    Returns
    -------
    pd.DataFrame
    """
    # Imported lazily so pyarrow is only required when this helper is used
    import pyarrow.parquet as pq

    logger.info(
        f"Reading in Parquet dataset to PyArrow Table. kwargs passed {kwargs!r}"
    )

    # Arguments consumed here; whatever remains goes to Table.to_pandas
    fs = kwargs.pop("fs", None)
    split_row_groups = kwargs.pop("split_row_groups", False)
    filters = kwargs.pop("filters", None)
    columns = kwargs.pop("columns", None)

    # A filesystem object is only meaningful for S3 paths: drop any supplied
    # fs for local paths, and create a default one for S3 when none was given.
    if not s3.is_s3path(path):
        fs = None
    elif fs is None:
        fs = s3fs.S3FileSystem()

    # NOTE(review): the argument list of this call was truncated in the
    # visible source; it is reconstructed from the values prepared above and
    # the docstring — confirm against the original module.
    dataset = pq.ParquetDataset(path,
                                filesystem=fs,
                                split_row_groups=split_row_groups,
                                filters=filters)

    table = dataset.read(columns=columns)

    logger.info(
        f"Converting PyArrow Table to Pandas DataFrame. kwargs passed {kwargs!r}"
    )

    return table.to_pandas(**kwargs)
Example #11
def save_object(obj: object,
                path: str,
                file_type: Optional[str] = None,
                overwrite: bool = True,
                protocol: int = pickle.HIGHEST_PROTOCOL,
                **kwargs) -> None:
    """ Save an object in memory to a file

    Parameters
    ----------
    obj : object
        Python object in memory

    path : str
        Local or S3 path to save file. If file_type is not specified, an
        attempt will be made to infer the file_type based on the extension.

    file_type : str
        Type of file to save.
        Supported options are currently:
            "pickle"
                Additional kwargs are passed to pickle.dumps
            "raw"
                obj is written as-is, without any serialization
            "csv"
                Save a pandas DataFrame as a CSV file.  Additional kwargs are
                passed to obj.to_csv
                NOTE: A TypeError will be thrown if "csv" is specified and obj
                is not a pandas DataFrame
            "json"
                Additional kwargs are passed to json.dumps
            "parquet"
                Save a pandas DataFrame to a parquet dataset. Additional kwargs
                are passed to _parquet.save_parquet.
                NOTE: This functionality is still in beta and currently only works with a pandas dataframe as input.

    overwrite : bool
        Should the file be overwritten if it already exists?

    protocol : int
        Used when calling pickle

    kwargs : Dict
        The following extra parameters can be passed:
            fs : s3fs.S3FileSystem
                Used when the path is an s3 path
            acl : str
                Used to set the Access Control List settings when writing to S3

    Raises
    ------
    ValueError
        If overwrite is False and path already exists, or file_type is not
        one of the supported options
    TypeError
        If file_type is "csv" or "parquet" and obj is not a pandas DataFrame
    """
    fs = kwargs.pop("fs", None)
    acl = kwargs.pop("acl", "bucket-owner-full-control")

    # Check to see if path already exists
    if not overwrite and already_exists(path, fs=fs):
        raise ValueError(f"overwrite set to False and {path!r} already exists")

    if file_type is None:
        file_type = _file_type_helper(path)

    # Serialize obj into bytes/str according to file_type. The parquet case
    # writes the dataset itself and returns early.
    if file_type == "pickle":
        logger.info(f"Saving obj as a pickle file. kwargs passed {kwargs!r}")
        obj = pickle.dumps(obj, protocol=protocol, **kwargs)
    elif file_type == "raw":
        logger.info("Saving obj as a raw file.")
    elif file_type == "csv":
        logger.info(f"Saving obj as a CSV file. kwargs passed {kwargs!r}")
        if not isinstance(obj, pd.DataFrame):
            raise TypeError(
                f"obj must be a pandas DataFrame when file_type='csv'. {type(obj)!r} passed"
            )
        # path_or_buf=None makes to_csv return the CSV text instead of writing
        obj = obj.to_csv(path_or_buf=None, **kwargs)
    elif file_type == "json":
        logger.info(f"Saving obj as a json file. kwargs passed {kwargs!r}")
        obj = json.dumps(obj, **kwargs)
    elif file_type == "parquet":
        if not isinstance(obj, pd.DataFrame):
            raise TypeError(
                f"Saving to parquet currently only supports a pandas DataFrame. {type(obj)!r} passed"
            )
        from ._parquet import save_parquet
        return save_parquet(obj, path, fs=fs, **kwargs)
    else:
        raise ValueError(f"file_type={file_type!r} is not supported")

    # Save file to appropriate system
    if s3.is_s3path(path):
        logger.info("Saving object to S3")
        s3.save_object(obj, path, overwrite, fs, acl)
    else:
        logger.info("Saving object to local")
        path = local._norm_path(path)
        # Binary payloads (e.g. pickle output) need binary mode; text
        # payloads (csv/json strings) need text mode.
        mode = "wb" if isinstance(obj, (bytes, bytearray)) else "w"

        with open(path, mode) as f:
            f.write(obj)
Example #12
def load_object(path: str, file_type: Optional[str] = None, **kwargs) -> Any:
    """ Load a file into memory

    Parameters
    ----------
    path : str
        Path to the file. If file_type is not specified, an attempt will be
        made to infer the file_type based on the extension.

    file_type : str
        Type of file to load.  Supported options are currently:
            "pickle"
                kwargs are passed to pickle.loads
            "raw"
                The file contents are returned as read
            "csv"
                Load a CSV file into a pandas DataFrame. Additional kwargs are
                passed to pd.read_csv
            "json"
                kwargs are passed to json.load
            "parquet"
                Load a parquet dataset in as a pandas DataFrame. Additional
                kwargs are passed to _parquet.load_parquet(). See that function
                for more information
                NOTE: This functionality is still in beta

    kwargs : Dict
        fs : s3fs.S3FileSystem
            Will be passed to s3.load_object if path is an s3path

    Returns
    -------
    Any : Depends on the file_type specified

    Raises
    ------
    ValueError
        If file_type is not one of the supported options
    """
    # Pop fs from kwargs
    fs = kwargs.pop("fs", None)

    if file_type is None:
        file_type = _file_type_helper(path)

    # Parquet datasets are read by their own helper, not via a file handle
    if file_type == "parquet":
        from ._parquet import load_parquet
        return load_parquet(path, fs=fs, **kwargs)

    if s3.is_s3path(path):
        logger.info(f"Loading {path!r} from S3")
        data_file = s3.load_object(path, fs)
    else:
        path = local._norm_path(path)
        logger.info(f"Loading {path!r} from local directory")
        data_file = open(path, "rb")

    # try/finally guarantees the handle is released on every path, including
    # a failed parse or an unsupported file_type.
    try:
        if file_type == "pickle":
            logger.info(
                f"Loading file as a 'pickle' object. kwargs passed {kwargs!r}")
            data_read = data_file.read()
            # NOTE: unpickling can execute arbitrary code; only load pickle
            # files that come from a trusted source.
            obj = pickle.loads(data_read, **kwargs)
        elif file_type == "raw":
            logger.info("Loading file as a 'raw' object")
            obj = data_file.read()
        elif file_type == "csv":
            logger.info("Loading file as a 'csv' object")
            import pandas as pd
            obj = pd.read_csv(data_file, **kwargs)
        elif file_type == "json":
            logger.info(
                f"loading file as a 'json' object. kwargs passed {kwargs!r}")
            obj = json.load(data_file, **kwargs)
        else:
            raise ValueError(f"File type {file_type!r} is not supported")
    finally:
        # The object returned by s3.load_object may not expose close()
        if hasattr(data_file, "close"):
            logger.info(f"Closing data_file {data_file!r}")
            data_file.close()

    return obj