def text_blocks_to_pandas(
    reader,
    block_lists,
    header,
    head,
    kwargs,
    enforce=False,
    specified_dtypes=None,
    path=None,
    blocksize=None,
    urlpath=None,
):
    """Convert blocks of bytes to a dask.dataframe

    This accepts a list of lists of values of bytes where each list corresponds
    to one file, and the values of bytes concatenate to comprise the entire
    file, in order.

    Parameters
    ----------
    reader : callable
        ``pd.read_csv`` or ``pd.read_table``.
    block_lists : list of lists of delayed values of bytes
        The lists of bytestrings where each list corresponds to one logical file
    header : bytestring
        The header, found at the front of the first file, to be prepended to
        all blocks
    head : pd.DataFrame
        An example Pandas DataFrame to be used for metadata.
    kwargs : dict
        Keyword arguments to pass down to ``reader``
    path : tuple, optional
        A tuple containing column name for path and the path_converter if provided

    Returns
    -------
    A dask.dataframe
    """
    dtypes = head.dtypes.to_dict()
    # dtypes contains only instances of CategoricalDtype, which causes issues
    # in coerce_dtypes for non-uniform categories across partitions.
    # We will modify `dtypes` (which holds the inferred dtypes) to
    # 1. contain instances of CategoricalDtype for user-provided types
    # 2. contain 'category' for data-inferred types
    categoricals = head.select_dtypes(include=["category"]).columns

    if isinstance(specified_dtypes, Mapping):
        known_categoricals = [
            k
            for k in categoricals
            if isinstance(specified_dtypes.get(k), CategoricalDtype)
            and specified_dtypes.get(k).categories is not None
        ]
        unknown_categoricals = categoricals.difference(known_categoricals)
    else:
        unknown_categoricals = categoricals

    # Fix up the dtypes
    for k in unknown_categoricals:
        dtypes[k] = "category"

    columns = list(head.columns)

    blocks = tuple(flatten(block_lists))
    # Create masks of first and last blocks from nested block_lists
    is_first = tuple(block_mask(block_lists))
    is_last = tuple(block_mask_last(block_lists))

    if path:
        colname, path_converter = path
        paths = [b[1].path for b in blocks]
        if path_converter:
            paths = [path_converter(p) for p in paths]
        head = head.assign(
            **{
                colname: pd.Categorical.from_codes(
                    np.zeros(len(head), dtype=int), set(paths)
                )
            }
        )
        path = (colname, paths)

    if len(unknown_categoricals):
        head = clear_known_categories(head, cols=unknown_categoricals)

    # Define parts
    parts = []
    colname, paths = path or (None, None)

    for i in range(len(blocks)):
        parts.append(
            [blocks[i], paths[i] if paths else None, is_first[i], is_last[i]]
        )

    # Construct the output collection with from_map
    return from_map(
        CSVFunctionWrapper(
            columns,
            None,
            colname,
            head,
            header,
            reader,
            dtypes,
            enforce,
            kwargs,
        ),
        parts,
        meta=head,
        label="read-csv",
        token=tokenize(reader, urlpath, columns, enforce, head, blocksize),
        enforce_metadata=False,
        produces_tasks=True,
    )
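def _example_text_blocks_to_pandas():
    # Hedged sketch, not part of the module: ``text_blocks_to_pandas`` is an
    # internal helper that users normally reach through ``dd.read_csv`` /
    # ``dd.read_table``, which build ``block_lists`` from byte ranges of the
    # input files. The glob path below is an assumption for illustration.
    import dask.dataframe as dd

    # Each ~64 MB byte block becomes one entry in ``block_lists`` and hence
    # one output partition; ``include_path_column=True`` is the public knob
    # that feeds the ``path`` argument handled above.
    return dd.read_csv("data/*.csv", blocksize="64MB", include_path_column=True)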
def read_hdf(
    pattern,
    key,
    start=0,
    stop=None,
    columns=None,
    chunksize=1000000,
    sorted_index=False,
    lock=True,
    mode="r",
):
    """
    Read HDF files into a Dask DataFrame

    Read hdf files into a dask dataframe. This function is like
    ``pandas.read_hdf``, except it can read from a single large file, or from
    multiple files, or from multiple keys from the same file.

    Parameters
    ----------
    pattern : string, pathlib.Path, list
        File pattern (string), pathlib.Path, buffer to read from, or list of
        file paths. Can contain wildcards.
    key : group identifier in the store. Can contain wildcards
    start : optional, integer (defaults to 0), row number to start at
    stop : optional, integer (defaults to None, the last row), row number to
        stop at
    columns : list of columns, optional
        A list of columns that if not None, will limit the return
        columns (default is None)
    chunksize : positive integer, optional
        Maximal number of rows per partition (default is 1000000).
    sorted_index : boolean, optional
        Option to specify whether or not the input hdf files have a sorted
        index (default is False).
    lock : boolean, optional
        Option to use a lock to prevent concurrency issues (default is True).
    mode : {'a', 'r', 'r+'}, default 'r'. Mode to use when opening file(s).
        'r'
            Read-only; no data can be modified.
        'a'
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        'r+'
            It is similar to 'a', but the file must already exist.

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_hdf('myfile.1.hdf5', '/x')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_hdf('myfile.*.hdf5', '/x')  # doctest: +SKIP

    >>> dd.read_hdf(['myfile.1.hdf5', 'myfile.2.hdf5'], '/x')  # doctest: +SKIP

    Load multiple datasets

    >>> dd.read_hdf('myfile.1.hdf5', '/*')  # doctest: +SKIP
    """
    if lock is True:
        lock = get_scheduler_lock()

    key = key if key.startswith("/") else "/" + key
    # Convert path-like objects to a string
    pattern = stringify_path(pattern)

    if isinstance(pattern, str):
        paths = sorted(glob(pattern))
    else:
        paths = pattern

    if not isinstance(pattern, str) and len(paths) == 0:
        raise ValueError("No files provided")
    if not paths or len(paths) == 0:
        raise OSError(f"File(s) not found: {pattern}")
    for path in paths:
        try:
            exists = os.path.exists(path)
        except (ValueError, TypeError):
            exists = False
        if not exists:
            raise OSError(f"File not found or insufficient permissions: {path}")
    if (start != 0 or stop is not None) and len(paths) > 1:
        raise NotImplementedError(read_hdf_error_msg)
    if chunksize <= 0:
        raise ValueError("Chunksize must be a positive integer")
    if (start != 0 or stop is not None) and sorted_index:
        raise ValueError(
            "When assuming pre-partitioned data, data must be "
            "read in its entirety using the same chunksizes"
        )

    # Build metadata
    with pd.HDFStore(paths[0], mode=mode) as hdf:
        meta_key = _expand_key(key, hdf)[0]
    try:
        meta = pd.read_hdf(paths[0], meta_key, mode=mode, stop=0)
    except IndexError:  # if file is empty, don't set stop
        meta = pd.read_hdf(paths[0], meta_key, mode=mode)
    if columns is not None:
        meta = meta[columns]

    # Common kwargs
    if meta.ndim == 1:
        common_kwargs = {"name": meta.name, "mode": mode}
    else:
        common_kwargs = {"mode": mode}

    # Build parts
    parts, divisions = _build_parts(
        paths, key, start, stop, chunksize, sorted_index, mode
    )

    # Construct the output collection with from_map
    return from_map(
        HDFFunctionWrapper(columns, meta.ndim, lock, common_kwargs),
        parts,
        meta=meta,
        divisions=divisions,
        label="read-hdf",
        token=tokenize(paths, key, start, stop, sorted_index, chunksize, mode),
        enforce_metadata=False,
    )
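def _example_read_hdf_round_trip():
    # Hedged sketch, not part of the module: a write/read round trip,
    # assuming PyTables is installed. ``to_hdf`` with a ``*`` pattern writes
    # one file per partition; ``read_hdf`` then globs them back into a single
    # collection, with ``chunksize`` bounding the rows per partition.
    import dask.dataframe as dd
    from dask.datasets import timeseries

    df = timeseries()
    df.to_hdf("ts-*.hdf5", "/data")
    return dd.read_hdf("ts-*.hdf5", "/data", chunksize=100_000)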
def make_timeseries(
    start="2000-01-01",
    end="2000-12-31",
    dtypes={"name": str, "id": int, "x": float, "y": float},
    freq="10s",
    partition_freq="1M",
    seed=None,
    **kwargs,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    dtypes : dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed : int (optional)
        Randomstate seed
    kwargs :
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Examples
    --------
    >>> import dask.dataframe as dd
    >>> df = dd.demo.make_timeseries('2000', '2010',
    ...                              {'value': float, 'name': str, 'id': int},
    ...                              freq='2H', partition_freq='1D', seed=1)
    >>> df.head()  # doctest: +SKIP
                           id      name     value
    2000-01-01 00:00:00   969     Jerry -0.309014
    2000-01-01 02:00:00  1010       Ray -0.760675
    2000-01-01 04:00:00  1016  Patricia -0.063261
    2000-01-01 06:00:00   960   Charlie  0.788245
    2000-01-01 08:00:00  1031     Kevin  0.466002
    """
    divisions = list(pd.date_range(start=start, end=end, freq=partition_freq))
    npartitions = len(divisions) - 1
    if seed is None:
        # Get random integer seed for each partition. We can
        # call `random_state_data` in `MakeTimeseriesPart`
        state_data = np.random.randint(2e9, size=npartitions)
    else:
        state_data = random_state_data(npartitions, seed)

    # Build parts
    parts = []
    for i in range(len(divisions) - 1):
        parts.append((divisions[i : i + 2], state_data[i]))

    # Construct the output collection with from_map
    return from_map(
        MakeTimeseriesPart(dtypes, freq, kwargs),
        parts,
        meta=make_timeseries_part("2000", "2000", dtypes, "1H", state_data[0], kwargs),
        divisions=divisions,
        label="make-timeseries",
        token=tokenize(start, end, dtypes, freq, partition_freq, state_data),
        enforce_metadata=False,
    )
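def _example_make_timeseries():
    # Hedged sketch, not part of the module: demonstrates the column-kwarg
    # convention from the docstring, where keywords prefixed with
    # "<column>_" are forwarded to that column's generator. ``id_lam``
    # (assumed here from the demo generators) controls the Poisson lambda
    # used for the integer ``id`` column.
    import dask.dataframe as dd

    return dd.demo.make_timeseries(
        "2000-01-01",
        "2000-03-31",
        {"id": int, "x": float},
        freq="1H",
        partition_freq="1M",
        seed=42,
        id_lam=1000,  # column-specific keyword: "id" + "_" + "lam"
    )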
def read_orc(
    path,
    engine="pyarrow",
    columns=None,
    index=None,
    split_stripes=1,
    aggregate_files=None,
    storage_options=None,
):
    """Read dataframe from ORC file(s)

    Parameters
    ----------
    path : str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    engine : 'pyarrow' or ORCEngine
        Backend ORC engine to use for IO. Default is "pyarrow".
    columns : None or list(str)
        Columns to load. If None, loads all.
    index : str
        Column name to set as index.
    split_stripes : int or False
        Maximum number of ORC stripes to include in each output-DataFrame
        partition. Use False to specify a 1-to-1 mapping between files
        and partitions. Default is 1.
    aggregate_files : bool, default False
        Whether distinct file paths may be aggregated into the same output
        partition. A setting of True means that any two file paths may be
        aggregated into the same output partition, while False means that
        inter-file aggregation is prohibited.
    storage_options : None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    Dask.DataFrame (even if there is only one column)

    Examples
    --------
    >>> df = dd.read_orc('https://github.com/apache/orc/raw/'
    ...                  'master/examples/demo-11-zlib.orc')  # doctest: +SKIP
    """
    # Get engine
    engine = _get_engine(engine)

    # Process file path(s)
    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )

    # Let backend engine generate a list of parts
    # from the ORC metadata. The backend should also
    # return the schema and DataFrame-collection metadata
    parts, schema, meta = engine.read_metadata(
        fs,
        paths,
        columns,
        index,
        split_stripes,
        aggregate_files,
    )

    # Construct the output collection with from_map
    return from_map(
        ORCFunctionWrapper(fs, columns, schema, engine, index),
        parts,
        meta=meta,
        divisions=[None] * (len(parts) + 1),
        label="read-orc",
        token=tokenize(fs_token, path, columns),
        enforce_metadata=False,
    )
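def _example_read_orc():
    # Hedged sketch, not part of the module: read a glob of ORC files
    # (the path and column names below are assumptions for illustration),
    # selecting a subset of columns and mapping up to two ORC stripes to
    # each output partition. Requires pyarrow to be installed.
    import dask.dataframe as dd

    return dd.read_orc("data/*.orc", columns=["a", "b"], split_stripes=2)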