Example #1
0
def _chunked_to_csv(df_iter, csvfn, mode, apply, open_kwargs=None, **kwargs):
    """Write an iterable of DataFrame chunks to a single csv file.

    Args:
        df_iter: iterable of pandas DataFrames (e.g. a chunked csv reader)
        csvfn (str): target file name/URI, opened via ``open_file``
        mode (str): file open mode passed to ``open_file``
        apply (callable): optional per-chunk hook; receives each chunk and
            may return a replacement DataFrame. Returning None keeps the
            original chunk unchanged.
        open_kwargs (dict): additional kwargs passed to ``open_file``
        **kwargs: additional kwargs passed to ``DataFrame.to_csv``
    """
    open_kwargs = open_kwargs or {}
    # Honor an explicit header= kwarg if the caller passed one; otherwise
    # write the header only for the first chunk so the output is one valid
    # csv instead of N concatenated csvs with repeated header rows.
    header = kwargs.pop('header', None)
    with open_file(csvfn, mode, **open_kwargs) as fout:
        # wrap the iterable itself in tqdm (not the enumerate object) so the
        # progress bar tracks chunks directly
        for i, chunkdf in enumerate(tqdm(df_iter)):
            if apply:
                result = apply(chunkdf)
                chunkdf = chunkdf if result is None else result
            chunkdf.to_csv(fout,
                           header=(i == 0) if header is None else header,
                           **kwargs)
Example #2
0
    def read_csv(self,
                 csvfn,
                 name,
                 chunksize=10000,
                 append=False,
                 apply=None,
                 mode='r',
                 open_kwargs=None,
                 **kwargs):
        """
        read large files from s3, hdfs, http/s, sftp, scp, ssh, write to om.datasets

        Usage:
            om.datasets.read_csv('/path/filename.csv', 'dataset-name')

            Optionally define a process function that receives each dataframe chunk for processing:

            def process(df):
                # apply any processing to df
                return df

            om.datasets.read_csv(...., apply=process)
        """
        store = self
        with open_file(csvfn, mode=mode, **(open_kwargs or {})) as fin:
            chunks = pd.read_csv(fin, chunksize=chunksize, iterator=True, **kwargs)
            pbar = tqdm(chunks)
            try:
                # first chunk honors the caller's append flag; every chunk
                # after that must append to keep earlier writes
                do_append = append
                for chunkdf in pbar:
                    if apply:
                        processed = apply(chunkdf)
                        if processed is not None:
                            chunkdf = processed
                    store.put(chunkdf, name, append=do_append)
                    do_append = True
            finally:
                # ensure the progress bar is released even on error
                pbar.close()
        return store.getl(name)
Example #3
0
    def read_csv(self,
                 csvfn,
                 name,
                 chunksize=10000,
                 append=False,
                 apply=None,
                 mode='r',
                 open_kwargs=None,
                 **kwargs):
        """
        read large files from s3, hdfs, http/s, sftp, scp, ssh, write to om.datasets

        Usage:

            To insert a local csv file into a dataset::

                om.datasets.read_csv('/path/filename.csv', 'dataset-name')

            To insert a file stored in any of the supported locations specify
            its fully qualified location and filename. The specific format must
            be specified according to the `smart_open`_ library::

                om.datasets.read_csv('https://...', 'dataset-name')
                om.datasets.read_csv('s3://...', 'dataset-name')
                om.datasets.read_csv('hdfs://...', 'dataset-name')
                om.datasets.read_csv('sftp://...', 'dataset-name')
                om.datasets.read_csv('scp://...', 'dataset-name')
                om.datasets.read_csv('ssh://...', 'dataset-name')

            Optionally define a function to receives each chunk as a dataframe
            and apply further processing (e.g. transformations, filtering)::

                def process(df):
                    # apply any processing to df
                    return df

                om.datasets.read_csv(...., apply=process)

        Args:
            csvfn (str): the fully qualified path and name of the csv file,
               according to the `smart_open`_ library
            chunksize (int): the size of each chunk processed before writing
                to the dataset
            append (bool): if True, appends to the dataset. defaults to False
            apply (callable): if specified, each chunk is forwarded as a
               DataFrame and the returned result is inserted to the dataset.
               Use this for transformations or filtering
            mode (str): file open mode, defaults to r
            open_kwargs (dict): additional kwargs to `smart_open`_
            **kwargs: additional kwargs are passed to ``pandas.read_csv``

        Returns:
            MDataFrame

        See Also:

            * `smart_open` https://github.com/RaRe-Technologies/smart_open
            * `pandas.read_csv`

        .. _smart_open: https://github.com/RaRe-Technologies/smart_open
        """
        open_kwargs = open_kwargs or {}
        with open_file(csvfn, mode=mode, **open_kwargs) as source:
            reader = pd.read_csv(source, chunksize=chunksize, iterator=True, **kwargs)
            # tqdm as a context manager closes the progress bar on exit,
            # including on error
            with tqdm(reader) as progress:
                for i, chunkdf in enumerate(progress):
                    if apply:
                        outcome = apply(chunkdf)
                        chunkdf = outcome if outcome is not None else chunkdf
                    # chunks after the first always append so earlier
                    # writes are kept
                    self.put(chunkdf, name, append=append if i == 0 else True)
        return self.getl(name)
Example #4
0
def download_and_unpack_package(
    pkg_uri: str,
    base_directory: str,
    logger: Optional[logging.Logger] = default_logger,
) -> str:
    """Download the package corresponding to this URI and unpack it if zipped.

    Will be written to a file or directory named {base_directory}/{uri}.
    Returns the path to this file or directory.
    """
    pkg_file = Path(_get_local_path(base_directory, pkg_uri))
    # Serialize concurrent downloads/unpacks of the same package across
    # processes sharing base_directory.
    with FileLock(str(pkg_file) + ".lock"):
        if logger is None:
            logger = default_logger

        logger.debug(f"Fetching package for URI: {pkg_uri}")

        local_dir = get_local_dir_from_uri(pkg_uri, base_directory)
        assert local_dir != pkg_file, "Invalid pkg_file!"
        if local_dir.exists():
            # Another worker already downloaded and unpacked it; reuse.
            assert local_dir.is_dir(), f"{local_dir} is not a directory"
        else:
            protocol, pkg_name = parse_uri(pkg_uri)
            if protocol == Protocol.GCS:
                # Download package from the GCS.
                code = _internal_kv_get(pkg_uri)
                if code is None:
                    raise IOError(f"Failed to fetch URI {pkg_uri} from GCS.")
                pkg_file.write_bytes(code)

                if is_zip_uri(pkg_uri):
                    unzip_package(
                        package_path=pkg_file,
                        target_dir=local_dir,
                        remove_top_level_directory=False,
                        unlink_zip=True,
                        logger=logger,
                    )
                else:
                    # Non-zip GCS packages are plain files, not directories.
                    return str(pkg_file)
            elif protocol in Protocol.remote_protocols():
                # Download package from remote URI
                tp = None

                if protocol == Protocol.S3:
                    try:
                        from smart_open import open as open_file
                        import boto3
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install boto3` to fetch URIs in s3 "
                            "bucket.")
                    tp = {"client": boto3.client("s3")}
                elif protocol == Protocol.GS:
                    try:
                        from smart_open import open as open_file
                        # imported for its smart_open side effect only
                        from google.cloud import storage  # noqa: F401
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install google-cloud-storage` "
                            "to fetch URIs in Google Cloud Storage bucket.")
                elif protocol == Protocol.FILE:
                    pkg_uri = pkg_uri[len("file://"):]

                    # Local files need no smart_open; mimic its signature.
                    def open_file(uri, mode, *, transport_params=None):
                        return open(uri, mode)

                else:
                    try:
                        from smart_open import open as open_file
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` "
                            f"to fetch {protocol.value.upper()} URIs.")

                # Copy the remote package to the local pkg_file.
                with open_file(pkg_uri, "rb",
                               transport_params=tp) as package_zip:
                    with open_file(pkg_file, "wb") as fout:
                        fout.write(package_zip.read())

                unzip_package(
                    package_path=pkg_file,
                    target_dir=local_dir,
                    remove_top_level_directory=True,
                    unlink_zip=True,
                    logger=logger,
                )
            else:
                raise NotImplementedError(
                    f"Protocol {protocol} is not supported")

        return str(local_dir)