import os
from glob import glob
from warnings import warn

import pandas as pd

import dask.dataframe as dd
from dask.base import tokenize
from dask.dataframe.io.csv import make_reader
from dask.utils import apply, parse_bytes

import cudf


def open(self):
    # Lazily construct the connection on first use; only read mode is supported.
    if self.conn is None:
        if self.mode == "r":
            self.conn = make_reader(pd.read_csv, "read_csv", "CSV")(self.url)
        else:
            raise NotImplementedError
        self.attrs = {}
    return self
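
# Illustrative only (not part of the original module): a minimal sketch of
# how the ``make_reader`` factory used in ``open`` behaves on its own.
# ``make_reader(reader, reader_name, file_type)`` wraps a pandas-like reader
# into a function that builds a lazy dask DataFrame from paths or globs.
# The names below (``dask_read_csv``, the "data/*.csv" glob) are assumptions
# for demonstration, not identifiers from this module.
def _example_make_reader_usage():
    dask_read_csv = make_reader(pd.read_csv, "read_csv", "CSV")
    ddf = dask_read_csv("data/*.csv")  # lazy dask DataFrame, one partition per block
    return ddf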
def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
    if isinstance(chunksize, str):
        chunksize = parse_bytes(chunksize)

    if isinstance(path, list):
        filenames = path
    elif isinstance(path, str):
        filenames = sorted(glob(path))
    elif hasattr(path, "__fspath__"):
        filenames = sorted(glob(path.__fspath__()))
    else:
        raise TypeError("Path type not understood: {}".format(type(path)))

    if not filenames:
        raise FileNotFoundError(f"No files found matching path: {path}")

    name = "read-csv-" + tokenize(path, **kwargs)  # TODO: get last modified time

    compression = kwargs.get("compression", False)

    if compression and chunksize:
        # Compressed files cannot be split with byte ranges; each file
        # must be read whole in a single task.
        kwargs.pop("byte_range", None)
        warn(
            "Warning %s compression does not support breaking apart files\n"
            "Please ensure that each individual file can fit in memory and\n"
            "use the keyword ``chunksize=None`` to remove this message\n"
            "Setting ``chunksize=(size of file)``" % compression
        )
        chunksize = None

    if chunksize is None:
        return read_csv_without_chunksize(path, **kwargs)

    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
    meta = dask_reader(filenames[0], **kwargs)._meta

    dsk = {}
    i = 0
    dtypes = meta.dtypes.values

    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, chunksize):
            kwargs2 = kwargs.copy()
            # Specify which byte range of the file this task should parse.
            kwargs2["byte_range"] = (start, chunksize)
            if start != 0:
                # Chunks past the first have no header row; reuse the
                # column names discovered from the metadata read.
                kwargs2["names"] = meta.columns
                kwargs2["header"] = None
            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)
            i += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
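
# ``_internal_read_csv`` above refers to two helpers that are not shown in
# this section: ``_read_csv`` (one task per byte range) and
# ``read_csv_without_chunksize`` (one partition per file). The definitions
# below are minimal sketches inferred from the call sites; the real
# dask_cudf implementations may differ in detail.


def _read_csv(fn, dtypes=None, **kwargs):
    # Parse one byte range of one file on the GPU. ``dtypes`` is accepted
    # because the task graph passes it positionally alongside ``fn``.
    return cudf.read_csv(fn, **kwargs)


def read_csv_without_chunksize(path, **kwargs):
    # Read each file whole (no byte_range splitting), e.g. for compressed
    # inputs where byte ranges are unsupported.
    filenames = sorted(glob(path)) if isinstance(path, str) else list(path)
    name = "read-csv-" + tokenize(path, **kwargs)
    meta = cudf.read_csv(filenames[0], **kwargs)
    graph = {
        (name, i): (apply, cudf.read_csv, [fn], kwargs)
        for i, fn in enumerate(filenames)
    }
    divisions = [None] * (len(filenames) + 1)
    return dd.core.new_dd_object(graph, name, meta, divisions)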
def read_csv(path, chunksize="256 MiB", **kwargs):
    """Read CSV files into a dask_cudf.DataFrame

    This API parallelizes the ``cudf.read_csv`` function in the
    following ways:

    It supports loading many files at once using globstrings:

    >>> import dask_cudf
    >>> df = dask_cudf.read_csv("myfiles.*.csv")

    In some cases it can break up large files:

    >>> df = dask_cudf.read_csv("largefile.csv", chunksize="256 MiB")

    It can read CSV files from external resources (e.g. S3, HTTP, FTP):

    >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv")
    >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv")

    Internally ``dask_cudf.read_csv`` uses ``cudf.read_csv`` and
    supports many of the same keyword arguments with the same
    performance guarantees. See the docstring for ``cudf.read_csv()``
    for more information on available keyword arguments.

    Parameters
    ----------
    path : str, path object, or file-like object
        Either a path to a file (a str, pathlib.Path, or
        py._path.local.LocalPath), a URL (including http, ftp, and S3
        locations), or any object with a ``read()`` method (such as a
        file handle from the builtin ``open()`` function, or a
        ``StringIO``).
    chunksize : int or str, default "256 MiB"
        The target task partition size. If ``None``, a single block is
        used for each file.
    **kwargs : dict
        Passthrough keyword arguments that are sent to ``cudf.read_csv``.

    Examples
    --------
    >>> import dask_cudf
    >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"])
    >>> ddf.compute()
       a      b
    0  1     hi
    1  2  hello
    2  3     ai
    """
    if "://" in str(path):
        # Remote URLs go through the fsspec-backed dask reader.
        func = make_reader(cudf.read_csv, "read_csv", "CSV")
        return func(path, blocksize=chunksize, **kwargs)
    else:
        return _internal_read_csv(path=path, chunksize=chunksize, **kwargs)
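
# Illustrative only: how ``chunksize`` maps to partitions. For a
# hypothetical 1 GiB uncompressed "largefile.csv", a "256 MiB" chunksize
# parses to 268435456 bytes, so the byte-range loop emits four tasks,
# i.e. four partitions.
def _example_chunksize_partitions():
    ddf = read_csv("largefile.csv", chunksize="256 MiB")  # hypothetical file
    return ddf.npartitions  # ceil(file_size / parse_bytes("256 MiB")) == 4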