import pandas as pd
from smart_open import open as open_file
from tqdm import tqdm


def _chunked_to_csv(df_iter, csvfn, mode, apply, open_kwargs=None, **kwargs):
    # write an iterator of dataframe chunks to a single csv file,
    # optionally transforming each chunk via apply()
    open_kwargs = open_kwargs or {}
    with open_file(csvfn, mode, **open_kwargs) as fout:
        for i, chunkdf in tqdm(enumerate(df_iter)):
            if apply:
                # apply() may return a new dataframe, or None to indicate
                # the chunk was modified in place
                result = apply(chunkdf)
                chunkdf = chunkdf if result is None else result
            # note: pass header=False in kwargs to avoid repeating the
            # column header for every chunk after the first
            chunkdf.to_csv(fout, **kwargs)
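
# A minimal usage sketch (assumed, not part of the library): stream a large
# local csv through _chunked_to_csv with a filter callback. The file names
# and the dropna() filter are illustrative placeholders.
def _drop_incomplete(df):
    # returning a dataframe replaces the chunk in the output
    return df.dropna()

df_iter = pd.read_csv('large-input.csv', chunksize=10000, iterator=True)
_chunked_to_csv(df_iter, 'filtered-output.csv', 'w', _drop_incomplete,
                index=False, header=False)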
def read_csv(self, csvfn, name, chunksize=10000, append=False, apply=None,
             mode='r', open_kwargs=None, **kwargs):
    """ read large files from s3, hdfs, http/s, sftp, scp, ssh, write to om.datasets

    Usage:

        To insert a local csv file into a dataset::

            om.datasets.read_csv('/path/filename.csv', 'dataset-name')

        To insert a file stored in any of the supported locations, specify
        its fully qualified location and filename. The specific format must
        be specified according to the `smart_open`_ library::

            om.datasets.read_csv('https://...', 'dataset-name')
            om.datasets.read_csv('s3://...', 'dataset-name')
            om.datasets.read_csv('hdfs://...', 'dataset-name')
            om.datasets.read_csv('sftp://...', 'dataset-name')
            om.datasets.read_csv('scp://...', 'dataset-name')
            om.datasets.read_csv('ssh://...', 'dataset-name')

        Optionally define a function that receives each chunk as a dataframe
        and applies further processing (e.g. transformations, filtering)::

            def process(df):
                # apply any processing to df
                return df

            om.datasets.read_csv(..., apply=process)

    Args:
        csvfn (str): the fully qualified path and name of the csv file,
           according to the `smart_open`_ library
        name (str): the name of the dataset to write to
        chunksize (int): the size of each chunk processed before writing
           to the dataset
        append (bool): if True, appends to the dataset. defaults to False
        apply (callable): if specified, each chunk is forwarded as a
           DataFrame and the returned result is inserted into the dataset.
           Use this for transformations or filtering
        mode (str): file open mode, defaults to 'r'
        open_kwargs (dict): additional kwargs to `smart_open`_
        **kwargs: additional kwargs are passed to ``pandas.read_csv``

    Returns:
        MDataFrame

    See Also:

        * `smart_open` https://github.com/RaRe-Technologies/smart_open
        * ``pandas.read_csv``

    .. _smart_open: https://github.com/RaRe-Technologies/smart_open
    """
    store = self
    open_kwargs = open_kwargs or {}
    with open_file(csvfn, mode=mode, **open_kwargs) as fin:
        # stream the csv in chunks so arbitrarily large files fit in memory
        it = pd.read_csv(fin, chunksize=chunksize, iterator=True, **kwargs)
        pbar = tqdm(it)
        try:
            for i, chunkdf in enumerate(pbar):
                if apply:
                    result = apply(chunkdf)
                    chunkdf = chunkdf if result is None else result
                # append from the second chunk onwards so a fresh call
                # replaces the dataset unless append=True is requested
                store.put(chunkdf, name, append=(i > 0) or append)
        finally:
            pbar.close()
    return store.getl(name)
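
# A minimal usage sketch (assumed, not part of the library): stream a csv
# from s3 into a dataset, filtering each chunk with an apply callback.
# 'om', the bucket path and the 'status' column are placeholders.
def keep_active(df):
    # only rows with status == 'active' are written to the dataset
    return df[df['status'] == 'active']

mdf = om.datasets.read_csv('s3://bucket/users.csv', 'users-active',
                           chunksize=50000, apply=keep_active)
# mdf is a lazy MDataFrame reference to the stored dataset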
import logging
from pathlib import Path
from typing import Optional

from filelock import FileLock

# parse_uri, Protocol, is_zip_uri, unzip_package, get_local_dir_from_uri,
# _get_local_path, _internal_kv_get and default_logger are defined
# elsewhere in Ray's runtime_env packaging code.


def download_and_unpack_package(
    pkg_uri: str,
    base_directory: str,
    logger: Optional[logging.Logger] = default_logger,
) -> str:
    """Download the package corresponding to this URI and unpack it if zipped.

    Will be written to a file or directory named {base_directory}/{uri}.
    Returns the path to this file or directory.
    """
    pkg_file = Path(_get_local_path(base_directory, pkg_uri))
    with FileLock(str(pkg_file) + ".lock"):
        if logger is None:
            logger = default_logger

        logger.debug(f"Fetching package for URI: {pkg_uri}")

        local_dir = get_local_dir_from_uri(pkg_uri, base_directory)
        assert local_dir != pkg_file, "Invalid pkg_file!"
        if local_dir.exists():
            assert local_dir.is_dir(), f"{local_dir} is not a directory"
        else:
            protocol, pkg_name = parse_uri(pkg_uri)
            if protocol == Protocol.GCS:
                # Download package from the GCS (Ray's internal key-value
                # store, not Google Cloud Storage).
                code = _internal_kv_get(pkg_uri)
                if code is None:
                    raise IOError(f"Failed to fetch URI {pkg_uri} from GCS.")
                code = code or b""
                pkg_file.write_bytes(code)

                if is_zip_uri(pkg_uri):
                    unzip_package(
                        package_path=pkg_file,
                        target_dir=local_dir,
                        remove_top_level_directory=False,
                        unlink_zip=True,
                        logger=logger,
                    )
                else:
                    return str(pkg_file)
            elif protocol in Protocol.remote_protocols():
                # Download package from a remote URI.
                tp = None

                if protocol == Protocol.S3:
                    try:
                        from smart_open import open as open_file
                        import boto3
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install boto3` to fetch URIs in s3 "
                            "bucket.")
                    tp = {"client": boto3.client("s3")}
                elif protocol == Protocol.GS:
                    try:
                        from smart_open import open as open_file
                        from google.cloud import storage  # noqa: F401
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install google-cloud-storage` "
                            "to fetch URIs in Google Cloud Storage bucket.")
                elif protocol == Protocol.FILE:
                    pkg_uri = pkg_uri[len("file://"):]

                    # local files need no transport; shadow open_file with
                    # a plain open() wrapper with a compatible signature
                    def open_file(uri, mode, *, transport_params=None):
                        return open(uri, mode)
                else:
                    try:
                        from smart_open import open as open_file
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` "
                            f"to fetch {protocol.value.upper()} URIs.")

                with open_file(pkg_uri, "rb", transport_params=tp) as package_zip:
                    # smart_open expects a str path, not a Path object
                    with open_file(str(pkg_file), "wb") as fout:
                        fout.write(package_zip.read())

                unzip_package(
                    package_path=pkg_file,
                    target_dir=local_dir,
                    remove_top_level_directory=True,
                    unlink_zip=True,
                    logger=logger,
                )
            else:
                raise NotImplementedError(
                    f"Protocol {protocol} is not supported")

        return str(local_dir)
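
# A standalone sketch (assumed, not from Ray) of the smart_open
# transport_params pattern used above for S3 downloads. The bucket, key
# and target path are placeholders; requires `pip install smart_open boto3`.
import boto3
from smart_open import open as open_file

tp = {"client": boto3.client("s3")}
with open_file("s3://my-bucket/pkg.zip", "rb", transport_params=tp) as src:
    with open("/tmp/pkg.zip", "wb") as dst:
        dst.write(src.read())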