def random_frame(self, seed: int, dc: DataContainer, **kwargs) -> dd.Series:
    """This function - in contrast to others in this module - will only ever be called on data frames"""
    random_state = np.random.RandomState(seed=seed)

    # Idea taken from dask.DataFrame.sample:
    # initialize a random state for each of the partitions
    # separately and then create a random series
    # for each partition
    df = dc.df
    name = "sample-" + tokenize(df, random_state)
    state_data = random_state_data(df.npartitions, random_state)

    dsk = {
        (name, i): (
            self.random_function,
            (df._name, i),
            np.random.RandomState(state),
            kwargs,
        )
        for i, state in enumerate(state_data)
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
    random_series = Series(graph, name, ("random", "float64"), df.divisions)

    # This part seems to be stupid, but helps us do a very simple
    # task without going into the (private) internals of Dask:
    # copy all meta information from the original input dataframe.
    # This is important so that the returned series looks
    # exactly like coming from the input dataframe
    return_df = df.assign(random=random_series)["random"]
    return return_df
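
# --- Illustrative sketch, not part of the original module ---
# The same per-partition idea expressed through the public `map_partitions`
# API: each partition derives its own RandomState from the seed and the
# partition number, so the result is reproducible for a fixed seed.
# The names `_add_random` and `with_random_column` are hypothetical.
import numpy as np
import pandas as pd
import dask.dataframe as dd


def _add_random(part, seed=0, partition_info=None):
    # `partition_info` is supplied by dask and identifies the partition number
    number = partition_info["number"] if partition_info else 0
    state = np.random.RandomState(seed + number)
    return pd.Series(state.random_sample(len(part)), index=part.index, name="random")


def with_random_column(ddf: dd.DataFrame, seed: int) -> dd.Series:
    return ddf.map_partitions(_add_random, seed=seed, meta=("random", "float64"))


# Usage:
# ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
# with_random_column(ddf, seed=42).compute()
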
def _compute_partition_stats(
    column: Series, allow_overlap: bool = False, **kwargs
) -> Tuple[List, List, List[int]]:
    """For a given column, compute the min, max, and len of each partition.

    And make sure that the partitions are sorted relative to each other.
    NOTE: this does not guarantee that every partition is internally sorted.
    """
    mins = column.map_partitions(M.min, meta=column)
    maxes = column.map_partitions(M.max, meta=column)
    lens = column.map_partitions(len, meta=column)
    mins, maxes, lens = compute(mins, maxes, lens, **kwargs)

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    non_empty_mins = [m for m, length in zip(mins, lens) if length != 0]
    non_empty_maxes = [m for m, length in zip(maxes, lens) if length != 0]
    if (
        sorted(non_empty_mins) != non_empty_mins
        or sorted(non_empty_maxes) != non_empty_maxes
    ):
        raise ValueError(
            f"Partitions are not sorted ascending by {column.name or 'the index'}",
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are : {list(zip(mins, maxes, lens))}",
        )
    if not allow_overlap and any(
        a <= b for a, b in zip(non_empty_mins[1:], non_empty_maxes[:-1])
    ):
        warnings.warn(
            "Partitions have overlapping values, so divisions are non-unique. "
            "Use `set_index(sorted=True)` with no `divisions` to allow dask to fix the overlap. "
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are : {list(zip(mins, maxes, lens))}",
            UserWarning,
        )
    lens = methods.tolist(lens)
    if not allow_overlap:
        return (mins, maxes, lens)
    else:
        return (non_empty_mins, non_empty_maxes, lens)
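
# --- Illustrative sketch, not part of the original module ---
# Checking partition ordering by hand with public APIs; the `.partitions`
# accessor gives one lazy Series per partition. The example data is made up.
import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 5, 3, 4, 6]}), npartitions=3)
col = ddf["x"]
parts = [col.partitions[i] for i in range(col.npartitions)]
mins = dask.compute(*[p.min() for p in parts])
maxes = dask.compute(*[p.max() for p in parts])
lens = dask.compute(*[p.size for p in parts])

# Partitions are mutually sorted and non-overlapping iff every partition's
# max is <= the next partition's min.
print(list(zip(mins, maxes, lens)))
print(all(a <= b for a, b in zip(maxes[:-1], mins[1:])))
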
def _agg(self, how, meta=None, fill_value=np.nan, how_args=(), how_kwargs={}):
    """Aggregate using one or more operations

    Parameters
    ----------
    how : str
        Name of aggregation operation
    fill_value : scalar, optional
        Value to use for missing values, applied during upsampling.
        Default is NaN.
    how_args : optional
        Positional arguments for aggregation operation.
    how_kwargs : optional
        Keyword arguments for aggregation operation.

    Returns
    -------
    Dask DataFrame or Series
    """
    rule = self._rule
    kwargs = self._kwargs
    name = "resample-" + tokenize(
        self.obj, rule, kwargs, how, *how_args, **how_kwargs
    )

    # Create a grouper to determine closed and label conventions
    newdivs, outdivs = _resample_bin_and_out_divs(self.obj.divisions, rule, **kwargs)

    # Repartition divs into bins. These won't match labels after mapping
    partitioned = self.obj.repartition(newdivs, force=True)

    keys = partitioned.__dask_keys__()
    dsk = {}

    args = zip(keys, outdivs, outdivs[1:], ["left"] * (len(keys) - 1) + [None])
    for i, (k, s, e, c) in enumerate(args):
        dsk[(name, i)] = (
            _resample_series,
            k,
            s,
            e,
            c,
            rule,
            kwargs,
            how,
            fill_value,
            list(how_args),
            how_kwargs,
        )

    # Infer output metadata
    meta_r = self.obj._meta_nonempty.resample(self._rule, **self._kwargs)
    meta = getattr(meta_r, how)(*how_args, **how_kwargs)

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[partitioned])
    if isinstance(meta, pd.DataFrame):
        return DataFrame(graph, name, meta, outdivs)
    return Series(graph, name, meta, outdivs)
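
# --- Illustrative sketch, not part of the original module ---
# From user code, resample aggregations are reached through the public
# resampler object; a call like `.resample("D").mean()` is ultimately routed
# through `_agg` with how="mean". The data below is made up for illustration.
import pandas as pd
import dask.dataframe as dd

idx = pd.date_range("2024-01-01", periods=96, freq="h")
dser = dd.from_pandas(pd.Series(range(96), index=idx, name="value"), npartitions=4)
daily_mean = dser.resample("D").mean()
print(daily_mean.compute())
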
def _approximate_quantile(df, q):
    """Approximate quantiles of DataFrame or Series.

    [NOTE: Same logic as dask.dataframe Series quantile]
    """
    # current implementation needs q to be sorted so
    # sort if array-like, otherwise leave it alone
    q_ndarray = np.array(q)
    if q_ndarray.ndim > 0:
        q_ndarray.sort(kind="mergesort")
        q = q_ndarray

    # Let's assume we are dealing with a DataFrame throughout
    if isinstance(df, (Series, Index)):
        df = df.to_frame()
    assert isinstance(df, DataFrame)
    final_type = df._meta._constructor

    # Create metadata
    meta = df._meta_nonempty.quantiles(q=q)

    # Define final action (create df with quantiles as index)
    def finalize_tsk(tsk):
        return (final_type, tsk, q)

    return_type = df.__class__

    # pandas/cudf uses quantile in [0, 1]
    # numpy / cupy uses [0, 100]
    qs = np.asarray(q)
    token = tokenize(df, qs)

    if len(qs) == 0:
        name = "quantiles-" + token
        empty_index = gd.Index([], dtype=float)
        return Series(
            {
                (name, 0): final_type(
                    {col: [] for col in df.columns},
                    name=df.name,
                    index=empty_index,
                )
            },
            name,
            df._meta,
            [None, None],
        )
    else:
        new_divisions = [np.min(q), np.max(q)]

    name = "quantiles-1-" + token
    val_dsk = {
        (name, i): (_quantile, key, qs)
        for i, key in enumerate(df.__dask_keys__())
    }

    name2 = "quantiles-2-" + token
    merge_dsk = {
        (name2, 0): finalize_tsk(
            (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk))
        )
    }
    dsk = toolz.merge(val_dsk, merge_dsk)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df])
    return return_type(graph, name2, meta, new_divisions)
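
# --- Illustrative sketch, not part of the original module ---
# From user code the usual entry point is the collection's `quantile` method;
# the plain dask.dataframe equivalent of this per-partition-then-merge
# approximation looks like the following. The example data is made up.
import pandas as pd
import dask.dataframe as dd

ds = dd.from_pandas(pd.Series(range(1000), name="x"), npartitions=4)
approx = ds.quantile([0.25, 0.5, 0.75])  # approximate: per-partition quantiles, then merged
print(approx.compute())
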
def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`
    """
    sizes = df.map_partitions(sizeof) if repartition else []
    divisions = partition_col._repartition_quantiles(npartitions, upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is sometimes
        # raised as a result of 1) computing mins/maxes above, 2) every null being
        # switched to NaN, and 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special
        # nulls such as pd.NaT or pd.NA.
        # If this happens, we hint the user about eliminating nulls beforehand.
        if not is_numeric_dtype(partition_col.dtype):
            obj, suggested_method = (
                ("column", f"`.dropna(subset=['{partition_col.name}'])`")
                if any(partition_col._name == df[c]._name for c in df)
                else ("series", "`.loc[series[~series.isna()]]`")
            )
            raise NotImplementedError(
                f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
                f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
                f"We suggest you try with {suggested_method}."
            ) from e
        # For numeric types there shouldn't be problems with nulls, so we re-raise this particular TypeError as-is
        else:
            raise e

    divisions = methods.tolist(divisions)
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if repartition or empty_dataframe_detected:
        total = sum(sizes)
        npartitions = max(math.ceil(total / partition_size), 1)
        npartitions = min(npartitions, df.npartitions)
        n = len(divisions)
        try:
            divisions = np.interp(
                x=np.linspace(0, n - 1, npartitions + 1),
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
            divisions = [divisions[i] for i in indexes]
    else:
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes
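
# --- Illustrative sketch, not part of the original module ---
# Divisions like the ones computed above are what back `set_index` when Dask
# chooses partition boundaries from approximate quantiles of the key column.
# The example data is made up.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"key": [5, 3, 8, 1, 9, 2], "val": range(6)}), npartitions=3
)
reindexed = ddf.set_index("key")  # quantile-based divisions computed under the hood
print(reindexed.divisions)
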