Example #1
    def random_frame(self, seed: int, dc: DataContainer,
                     **kwargs) -> dd.Series:
        """This function - in contrast to others in this module - will only ever be called on data frames"""

        random_state = np.random.RandomState(seed=seed)

        # Idea taken from dask.DataFrame.sample:
        # initialize a random state for each of the partitions
        # separately and then create a random series
        # for each partition
        df = dc.df
        name = "sample-" + tokenize(df, random_state)

        state_data = random_state_data(df.npartitions, random_state)
        dsk = {(name, i): (
            self.random_function,
            (df._name, i),
            np.random.RandomState(state),
            kwargs,
        )
               for i, state in enumerate(state_data)}

        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
        random_series = Series(graph, name, ("random", "float64"),
                               df.divisions)

        # This step looks redundant, but it lets us do a very simple
        # task without reaching into the (private) internals of Dask:
        # copy all meta information from the original input dataframe.
        # This is important so that the returned series looks
        # exactly as if it came from the input dataframe
        return_df = df.assign(random=random_series)["random"]
        return return_df
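The per-partition trick above (one independent random state per partition) can also be reproduced with only public Dask APIs. The following is a minimal sketch of the same idea, assuming a small dask.dataframe built with from_pandas; the helper name add_random and the seed values are made up for illustration.

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.utils import random_state_data

pdf = pd.DataFrame({"x": np.arange(10)})
df = dd.from_pandas(pdf, npartitions=2)

# Derive one independent seed block per partition, as in the snippet above
states = random_state_data(df.npartitions, np.random.RandomState(42))

def add_random(part, partition_info=None):
    # partition_info["number"] identifies the partition being processed
    i = partition_info["number"] if partition_info else 0
    rng = np.random.RandomState(states[i])
    return part.assign(random=rng.random_sample(len(part)))

out = df.map_partitions(add_random, meta=pdf.assign(random=0.0))
print(out.compute())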
Example #2
def _compute_partition_stats(column: Series,
                             allow_overlap: bool = False,
                             **kwargs) -> Tuple[List, List, List[int]]:
    """For a given column, compute the min, max, and len of each partition.

    And make sure that the partitions are sorted relative to each other.
    NOTE: this does not guarantee that every partition is internally sorted.
    """
    mins = column.map_partitions(M.min, meta=column)
    maxes = column.map_partitions(M.max, meta=column)
    lens = column.map_partitions(len, meta=column)
    mins, maxes, lens = compute(mins, maxes, lens, **kwargs)
    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    non_empty_mins = [m for m, length in zip(mins, lens) if length != 0]
    non_empty_maxes = [m for m, length in zip(maxes, lens) if length != 0]
    if (sorted(non_empty_mins) != non_empty_mins
            or sorted(non_empty_maxes) != non_empty_maxes):
        raise ValueError(
            f"Partitions are not sorted ascending by {column.name or 'the index'}. "
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are: {list(zip(mins, maxes, lens))}"
        )
    if not allow_overlap and any(
            a <= b for a, b in zip(non_empty_mins[1:], non_empty_maxes[:-1])):
        warnings.warn(
            "Partitions have overlapping values, so divisions are non-unique. "
            "Use `set_index(sorted=True)` with no `divisions` to allow dask to fix the overlap. "
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are: {list(zip(mins, maxes, lens))}",
            UserWarning,
        )
    lens = methods.tolist(lens)
    if not allow_overlap:
        return (mins, maxes, lens)
    else:
        return (non_empty_mins, non_empty_maxes, lens)
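The sortedness and overlap conditions checked above can be illustrated in isolation with plain Python lists. The values below are made up; the sketch simply mirrors the two tests.

mins = [1, 5, 9]
maxes = [4, 8, 12]

# Partitions sorted relative to each other: both lists must be ascending
assert sorted(mins) == mins and sorted(maxes) == maxes

# Partition i overlaps partition i-1 if its min is <= the previous max
print(any(a <= b for a, b in zip(mins[1:], maxes[:-1])))                 # False -> divisions are unique

overlapping_maxes = [6, 8, 12]
print(any(a <= b for a, b in zip(mins[1:], overlapping_maxes[:-1])))     # True -> 5 <= 6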
Example #3
    def _agg(self,
             how,
             meta=None,
             fill_value=np.nan,
             how_args=(),
             how_kwargs={}):
        """Aggregate using one or more operations

        Parameters
        ----------
        how : str
            Name of aggregation operation
        fill_value : scalar, optional
            Value to use for missing values, applied during upsampling.
            Default is NaN.
        how_args : optional
            Positional arguments for aggregation operation.
        how_kwargs : optional
            Keyword arguments for aggregation operation.

        Returns
        -------
        Dask DataFrame or Series
        """
        rule = self._rule
        kwargs = self._kwargs
        name = "resample-" + tokenize(self.obj, rule, kwargs, how, *how_args,
                                      **how_kwargs)

        # Compute the new partition boundaries (bins) and the output
        # divisions implied by the resampling rule and its closed/label conventions
        newdivs, outdivs = _resample_bin_and_out_divs(self.obj.divisions, rule,
                                                      **kwargs)

        # Repartition divs into bins. These won't match labels after mapping
        partitioned = self.obj.repartition(newdivs, force=True)

        keys = partitioned.__dask_keys__()
        dsk = {}

        args = zip(keys, outdivs, outdivs[1:],
                   ["left"] * (len(keys) - 1) + [None])
        for i, (k, s, e, c) in enumerate(args):
            dsk[(name, i)] = (
                _resample_series,
                k,
                s,
                e,
                c,
                rule,
                kwargs,
                how,
                fill_value,
                list(how_args),
                how_kwargs,
            )

        # Infer output metadata
        meta_r = self.obj._meta_nonempty.resample(self._rule, **self._kwargs)
        meta = getattr(meta_r, how)(*how_args, **how_kwargs)

        graph = HighLevelGraph.from_collections(name,
                                                dsk,
                                                dependencies=[partitioned])
        if isinstance(meta, pd.DataFrame):
            return DataFrame(graph, name, meta, outdivs)
        return Series(graph, name, meta, outdivs)
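At the usage level, dask.dataframe's Resampler methods such as mean() or sum() are thin wrappers that should route through _agg above with the method name. A minimal sketch on a synthetic hourly series (data and partition count are made up):

import numpy as np
import pandas as pd
import dask.dataframe as dd

idx = pd.date_range("2021-01-01", periods=96, freq="h")
pdf = pd.DataFrame({"value": np.arange(96.0)}, index=idx)
ddf = dd.from_pandas(pdf, npartitions=4)

# Daily mean: dask builds a graph like the one constructed in _agg above
daily = ddf.resample("1D").mean()
print(daily.compute())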
Example #4
def _approximate_quantile(df, q):
    """Approximate quantiles of DataFrame or Series.
    [NOTE: Same logic as dask.dataframe Series quantile]
    """
    # the current implementation needs q to be sorted, so sort it
    # if it is array-like; otherwise leave it alone
    q_ndarray = np.array(q)
    if q_ndarray.ndim > 0:
        q_ndarray.sort(kind="mergesort")
        q = q_ndarray

    # Let's assume we are dealing with a DataFrame throughout
    if isinstance(df, (Series, Index)):
        df = df.to_frame()
    assert isinstance(df, DataFrame)
    final_type = df._meta._constructor

    # Create metadata
    meta = df._meta_nonempty.quantiles(q=q)

    # Define final action (create df with quantiles as index)
    def finalize_tsk(tsk):
        return (final_type, tsk, q)

    return_type = df.__class__

    # pandas/cudf uses quantile in [0, 1]
    # numpy / cupy uses [0, 100]
    qs = np.asarray(q)
    token = tokenize(df, qs)

    if len(qs) == 0:
        name = "quantiles-" + token
        empty_index = gd.Index([], dtype=float)
        return Series(
            {
                (name, 0): final_type(
                    {col: [] for col in df.columns},
                    name=df.name,
                    index=empty_index,
                )
            },
            name,
            df._meta,
            [None, None],
        )
    else:
        new_divisions = [np.min(q), np.max(q)]

    name = "quantiles-1-" + token
    val_dsk = {
        (name, i): (_quantile, key, qs)
        for i, key in enumerate(df.__dask_keys__())
    }

    name2 = "quantiles-2-" + token
    merge_dsk = {
        (name2, 0): finalize_tsk(
            (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk))
        )
    }
    dsk = toolz.merge(val_dsk, merge_dsk)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df])
    return return_type(graph, name2, meta, new_divisions)
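For comparison, the same per-partition-quantiles-then-merge strategy is exposed through dask.dataframe's public quantile method. The CPU-only sketch below uses made-up data and should behave analogously to the quantiles-1/quantiles-2 layers built above (it is not the dask_cudf code path itself).

import numpy as np
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": np.random.RandomState(0).normal(size=1_000)})
ddf = dd.from_pandas(pdf, npartitions=4)

# Approximate quantiles: computed per partition, then merged
approx = ddf["x"].quantile([0.25, 0.5, 0.75]).compute()
print(approx)
print(pdf["x"].quantile([0.25, 0.5, 0.75]))  # exact pandas result for comparison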
Example #5
def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`
    """
    sizes = df.map_partitions(sizeof) if repartition else []
    divisions = partition_col._repartition_quantiles(npartitions,
                                                     upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of
        # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA.
        # If this happens, we hint to the user that nulls should be removed beforehand.
        if not is_numeric_dtype(partition_col.dtype):
            obj, suggested_method = (
                ("column",
                 f"`.dropna(subset=['{partition_col.name}'])`") if any(
                     partition_col._name == df[c]._name for c in df) else
                ("series", "`.loc[series[~series.isna()]]`"))
            raise NotImplementedError(
                f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
                f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
                f"We suggest you try with {suggested_method}.") from e
        # For numeric types nulls should not cause problems, so we re-raise this particular TypeError as-is
        else:
            raise e

    divisions = methods.tolist(divisions)
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if repartition or empty_dataframe_detected:
        total = sum(sizes)
        npartitions = max(math.ceil(total / partition_size), 1)
        npartitions = min(npartitions, df.npartitions)
        n = len(divisions)
        try:
            divisions = np.interp(
                x=np.linspace(0, n - 1, npartitions + 1),
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
            divisions = [divisions[i] for i in indexes]
    else:
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes
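The repartitioning branch above coarsens the fine division list from _repartition_quantiles down to npartitions + 1 boundaries by linear interpolation. A standalone sketch of just that step, with made-up numbers:

import numpy as np

divisions = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]  # as returned by the quantile step
npartitions = 4
n = len(divisions)

# Pick npartitions + 1 evenly spaced positions along the finer division list
coarse = np.interp(
    x=np.linspace(0, n - 1, npartitions + 1),
    xp=np.linspace(0, n - 1, n),
    fp=divisions,
).tolist()
print(coarse)  # [0.0, 25.0, 50.0, 75.0, 100.0]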