Example #1
def text_blocks_to_pandas(
    reader,
    block_lists,
    header,
    head,
    kwargs,
    enforce=False,
    specified_dtypes=None,
    path=None,
    blocksize=None,
    urlpath=None,
):
    """Convert blocks of bytes to a dask.dataframe

    This accepts a list of lists of delayed byte values, where each inner list
    corresponds to one file and its byte values concatenate, in order, to
    comprise the entire file.

    Parameters
    ----------
    reader : callable
        ``pd.read_csv`` or ``pd.read_table``.
    block_lists : list of lists of delayed values of bytes
        The lists of bytestrings where each list corresponds to one logical file
    header : bytestring
        The header, found at the front of the first file, to be prepended to
        all blocks
    head : pd.DataFrame
        An example Pandas DataFrame to be used for metadata.
    kwargs : dict
        Keyword arguments to pass down to ``reader``
    path : tuple, optional
        A tuple containing the column name for the file path and the
        path_converter, if provided

    Returns
    -------
    A dask.dataframe
    """
    dtypes = head.dtypes.to_dict()
    # `dtypes` may contain instances of CategoricalDtype, which cause issues
    # in coerce_dtypes for non-uniform categories across partitions.
    # We modify the inferred `dtypes` so that they
    # 1. contain instances of CategoricalDtype for user-provided types
    # 2. contain 'category' for inferred categorical types
    categoricals = head.select_dtypes(include=["category"]).columns

    if isinstance(specified_dtypes, Mapping):
        known_categoricals = [
            k for k in categoricals
            if isinstance(specified_dtypes.get(k), CategoricalDtype)
            and specified_dtypes.get(k).categories is not None
        ]
        unknown_categoricals = categoricals.difference(known_categoricals)
    else:
        unknown_categoricals = categoricals

    # Fixup the dtypes
    for k in unknown_categoricals:
        dtypes[k] = "category"

    columns = list(head.columns)

    blocks = tuple(flatten(block_lists))
    # Create mask of first blocks from nested block_lists
    is_first = tuple(block_mask(block_lists))
    is_last = tuple(block_mask_last(block_lists))

    if path:
        colname, path_converter = path
        paths = [b[1].path for b in blocks]
        if path_converter:
            paths = [path_converter(p) for p in paths]
        head = head.assign(
            **{
                colname: pd.Categorical.from_codes(
                    np.zeros(len(head), dtype=int), set(paths)
                )
            }
        )
        path = (colname, paths)

    if len(unknown_categoricals):
        head = clear_known_categories(head, cols=unknown_categoricals)

    # Define parts
    parts = []
    colname, paths = path or (None, None)
    for i in range(len(blocks)):
        parts.append(
            [blocks[i], paths[i] if paths else None, is_first[i], is_last[i]])

    # Construct the output collection with from_map
    return from_map(
        CSVFunctionWrapper(
            columns,
            None,
            colname,
            head,
            header,
            reader,
            dtypes,
            enforce,
            kwargs,
        ),
        parts,
        meta=head,
        label="read-csv",
        token=tokenize(reader, urlpath, columns, enforce, head, blocksize),
        enforce_metadata=False,
        produces_tasks=True,
    )
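
A minimal sketch of the same from_map pattern using the public dask.dataframe.from_map API, assuming only dask and pandas are installed; load_part is a hypothetical per-partition loader standing in for CSVFunctionWrapper:

import pandas as pd
import dask.dataframe as dd

def load_part(n):
    # Hypothetical per-partition loader: builds a small frame of n rows.
    return pd.DataFrame({"x": range(n)})

# One output partition per element of the iterable, mirroring `parts` above;
# `meta` plays the same role as `head` does in the reader.
ddf = dd.from_map(
    load_part, [3, 5, 2], meta=pd.DataFrame({"x": pd.Series(dtype="int64")})
)
print(ddf.npartitions)       # 3
print(len(ddf.compute()))    # 10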
Example #2
def read_hdf(
    pattern,
    key,
    start=0,
    stop=None,
    columns=None,
    chunksize=1000000,
    sorted_index=False,
    lock=True,
    mode="r",
):
    """
    Read HDF files into a Dask DataFrame

    Read hdf files into a dask dataframe. This function is like
    ``pandas.read_hdf``, except it can read from a single large file, or from
    multiple files, or from multiple keys from the same file.

    Parameters
    ----------
    pattern : string, pathlib.Path, list
        File pattern (string), pathlib.Path, buffer to read from, or list of
        file paths. Can contain wildcards.
    key : group identifier in the store. Can contain wildcards
    start : optional, integer (defaults to 0), row number to start at
    stop : optional, integer (defaults to None, the last row), row number to
        stop at
    columns : list of columns, optional
        A list of columns that if not None, will limit the return
        columns (default is None)
    chunksize : positive integer, optional
        Maximal number of rows per partition (default is 1000000).
    sorted_index : boolean, optional
        Option to specify whether or not the input hdf files have a sorted
        index (default is False).
    lock : boolean, optional
        Option to use a lock to prevent concurrency issues (default is True).
    mode : {'a', 'r', 'r+'}, default 'r'
        Mode to use when opening file(s).

        'r'
            Read-only; no data can be modified.
        'a'
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        'r+'
            Similar to 'a', but the file must already exist.

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_hdf('myfile.1.hdf5', '/x')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_hdf('myfile.*.hdf5', '/x')  # doctest: +SKIP

    >>> dd.read_hdf(['myfile.1.hdf5', 'myfile.2.hdf5'], '/x')  # doctest: +SKIP

    Load multiple datasets

    >>> dd.read_hdf('myfile.1.hdf5', '/*')  # doctest: +SKIP
    """
    if lock is True:
        lock = get_scheduler_lock()

    key = key if key.startswith("/") else "/" + key
    # Convert path-like objects to a string
    pattern = stringify_path(pattern)

    if isinstance(pattern, str):
        paths = sorted(glob(pattern))
    else:
        paths = pattern

    if not isinstance(pattern, str) and len(paths) == 0:
        raise ValueError("No files provided")
    if not paths:
        raise OSError(f"File(s) not found: {pattern}")
    for path in paths:
        try:
            exists = os.path.exists(path)
        except (ValueError, TypeError):
            exists = False
        if not exists:
            raise OSError(f"File not found or insufficient permissions: {path}")
    if (start != 0 or stop is not None) and len(paths) > 1:
        raise NotImplementedError(read_hdf_error_msg)
    if chunksize <= 0:
        raise ValueError("Chunksize must be a positive integer")
    if (start != 0 or stop is not None) and sorted_index:
        raise ValueError(
            "When assuming pre-partitioned data, data must be "
            "read in its entirety using the same chunksizes"
        )

    # Build metadata
    with pd.HDFStore(paths[0], mode=mode) as hdf:
        meta_key = _expand_key(key, hdf)[0]
    try:
        meta = pd.read_hdf(paths[0], meta_key, mode=mode, stop=0)
    except IndexError:  # if file is empty, don't set stop
        meta = pd.read_hdf(paths[0], meta_key, mode=mode)
    if columns is not None:
        meta = meta[columns]

    # Common kwargs
    if meta.ndim == 1:
        common_kwargs = {"name": meta.name, "mode": mode}
    else:
        common_kwargs = {"mode": mode}

    # Build parts
    parts, divisions = _build_parts(
        paths, key, start, stop, chunksize, sorted_index, mode
    )

    # Construct the output collection with from_map
    return from_map(
        HDFFunctionWrapper(columns, meta.ndim, lock, common_kwargs),
        parts,
        meta=meta,
        divisions=divisions,
        label="read-hdf",
        token=tokenize(paths, key, start, stop, sorted_index, chunksize, mode),
        enforce_metadata=False,
    )
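
A hypothetical round trip for read_hdf, assuming PyTables is installed; the file name and key below are illustrative:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(6), "y": list("abcdef")})
# format="table" lets the reader slice rows with start/stop per chunk.
pdf.to_hdf("example.h5", key="/data", format="table")

ddf = dd.read_hdf("example.h5", "/data", chunksize=2)
print(ddf.npartitions)           # 6 rows at chunksize=2 -> 3 partitions
print(ddf["x"].sum().compute())  # 15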
Example #3
def make_timeseries(
    start="2000-01-01",
    end="2000-12-31",
    dtypes={
        "name": str,
        "id": int,
        "x": float,
        "y": float
    },
    freq="10s",
    partition_freq="1M",
    seed=None,
    **kwargs,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start: datetime (or datetime-like string)
        Start of time series
    end: datetime (or datetime-like string)
        End of time series
    dtypes: dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    freq: string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq: string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed: int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Examples
    --------
    >>> import dask.dataframe as dd
    >>> df = dd.demo.make_timeseries('2000', '2010',
    ...                              {'value': float, 'name': str, 'id': int},
    ...                              freq='2H', partition_freq='1D', seed=1)
    >>> df.head()  # doctest: +SKIP
                           id      name     value
    2000-01-01 00:00:00   969     Jerry -0.309014
    2000-01-01 02:00:00  1010       Ray -0.760675
    2000-01-01 04:00:00  1016  Patricia -0.063261
    2000-01-01 06:00:00   960   Charlie  0.788245
    2000-01-01 08:00:00  1031     Kevin  0.466002
    """
    divisions = list(pd.date_range(start=start, end=end, freq=partition_freq))
    npartitions = len(divisions) - 1
    if seed is None:
        # Get a random integer seed for each partition; `random_state_data`
        # is then called with it inside `MakeTimeseriesPart`.
        state_data = np.random.randint(2e9, size=npartitions)
    else:
        state_data = random_state_data(npartitions, seed)

    # Build parts
    parts = []
    for i in range(len(divisions) - 1):
        parts.append((divisions[i:i + 2], state_data[i]))

    # Construct the output collection with from_map
    return from_map(
        MakeTimeseriesPart(dtypes, freq, kwargs),
        parts,
        meta=make_timeseries_part(
            "2000", "2000", dtypes, "1H", state_data[0], kwargs
        ),
        divisions=divisions,
        label="make-timeseries",
        token=tokenize(start, end, dtypes, freq, partition_freq, state_data),
        enforce_metadata=False,
    )
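
The divisions logic above can be previewed with pandas alone; a small sketch using the defaults from the signature (on recent pandas the month-end alias may be spelled 'ME' rather than '1M'):

import pandas as pd

# Month-end boundaries between start and end; one partition per adjacent pair.
divisions = list(pd.date_range(start="2000-01-01", end="2000-12-31", freq="1M"))
parts = [divisions[i:i + 2] for i in range(len(divisions) - 1)]
print(len(parts), parts[0])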
Example #4
def read_orc(
    path,
    engine="pyarrow",
    columns=None,
    index=None,
    split_stripes=1,
    aggregate_files=None,
    storage_options=None,
):
    """Read dataframe from ORC file(s)

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol
        specifier, and may include glob characters if given as a single string.
    engine: 'pyarrow' or ORCEngine
        Backend ORC engine to use for IO. Default is "pyarrow".
    columns: None or list(str)
        Columns to load. If None, loads all.
    index: str
        Column name to set as index.
    split_stripes: int or False
        Maximum number of ORC stripes to include in each output-DataFrame
        partition. Use False to specify a 1-to-1 mapping between files
        and partitions. Default is 1.
    aggregate_files : bool, default False
        Whether distinct file paths may be aggregated into the same output
        partition. A setting of True means that any two file paths may be
        aggregated into the same output partition, while False means that
        inter-file aggregation is prohibited.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    Dask.DataFrame (even if there is only one column)

    Examples
    --------
    >>> df = dd.read_orc('https://github.com/apache/orc/raw/'
    ...                  'master/examples/demo-11-zlib.orc')  # doctest: +SKIP
    """

    # Get engine
    engine = _get_engine(engine)

    # Process file path(s)
    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )

    # Let backend engine generate a list of parts
    # from the ORC metadata.  The backend should also
    # return the schema and DataFrame-collection metadata
    parts, schema, meta = engine.read_metadata(
        fs,
        paths,
        columns,
        index,
        split_stripes,
        aggregate_files,
    )

    # Construct the output collection with from_map
    return from_map(
        ORCFunctionWrapper(fs, columns, schema, engine, index),
        parts,
        meta=meta,
        divisions=[None] * (len(parts) + 1),
        label="read-orc",
        token=tokenize(fs_token, path, columns),
        enforce_metadata=False,
    )
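
A hypothetical round trip for read_orc, assuming pyarrow is installed and a pandas version with DataFrame.to_orc (1.5+); the file name is illustrative:

import pandas as pd
import dask.dataframe as dd

pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).to_orc("example.orc")

# Column pruning via the `columns` argument documented above.
ddf = dd.read_orc("example.orc", columns=["a"])
print(ddf.compute())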