def _compute_partition_stats(column: Series, allow_overlap: bool = False, **kwargs) -> Tuple[List, List, List[int]]: """For a given column, compute the min, max, and len of each partition. And make sure that the partitions are sorted relative to each other. NOTE: this does not guarantee that every partition is internally sorted. """ mins = column.map_partitions(M.min, meta=column) maxes = column.map_partitions(M.max, meta=column) lens = column.map_partitions(len, meta=column) mins, maxes, lens = compute(mins, maxes, lens, **kwargs) mins = remove_nans(mins) maxes = remove_nans(maxes) non_empty_mins = [m for m, length in zip(mins, lens) if length != 0] non_empty_maxes = [m for m, length in zip(maxes, lens) if length != 0] if (sorted(non_empty_mins) != non_empty_mins or sorted(non_empty_maxes) != non_empty_maxes): raise ValueError( f"Partitions are not sorted ascending by {column.name or 'the index'}", f"In your dataset the (min, max, len) values of {column.name or 'the index'} " f"for each partition are : {list(zip(mins, maxes, lens))}", ) if not allow_overlap and any( a <= b for a, b in zip(non_empty_mins[1:], non_empty_maxes[:-1])): warnings.warn( "Partitions have overlapping values, so divisions are non-unique." "Use `set_index(sorted=True)` with no `divisions` to allow dask to fix the overlap. " f"In your dataset the (min, max, len) values of {column.name or 'the index'} " f"for each partition are : {list(zip(mins, maxes, lens))}", UserWarning, ) lens = methods.tolist(lens) if not allow_overlap: return (mins, maxes, lens) else: return (non_empty_mins, non_empty_maxes, lens)
def _calculate_divisions( df: DataFrame, partition_col: Series, repartition: bool, npartitions: int, upsample: float = 1.0, partition_size: float = 128e6, ) -> Tuple[List, List, List]: """ Utility function to calculate divisions for calls to `map_partitions` """ sizes = df.map_partitions(sizeof) if repartition else [] divisions = partition_col._repartition_quantiles(npartitions, upsample=upsample) mins = partition_col.map_partitions(M.min) maxes = partition_col.map_partitions(M.max) try: divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes) except TypeError as e: # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float. # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA. # If this happens, we hint the user about eliminating nulls beforehand. if not is_numeric_dtype(partition_col.dtype): obj, suggested_method = ( ("column", f"`.dropna(subset=['{partition_col.name}'])`") if any( partition_col._name == df[c]._name for c in df) else ("series", "`.loc[series[~series.isna()]]`")) raise NotImplementedError( f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n" f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n" f"We suggest you try with {suggested_method}.") from e # For numeric types there shouldn't be problems with nulls, so we raise as-it-is this particular TypeError else: raise e divisions = methods.tolist(divisions) if type(sizes) is not list: sizes = methods.tolist(sizes) mins = methods.tolist(mins) maxes = methods.tolist(maxes) empty_dataframe_detected = pd.isna(divisions).all() if repartition or empty_dataframe_detected: total = sum(sizes) npartitions = max(math.ceil(total / partition_size), 1) npartitions = min(npartitions, df.npartitions) n = len(divisions) try: divisions = np.interp( x=np.linspace(0, n - 1, npartitions + 1), xp=np.linspace(0, n - 1, n), fp=divisions, ).tolist() except (TypeError, ValueError): # str type indexes = np.linspace(0, n - 1, npartitions + 1).astype(int) divisions = [divisions[i] for i in indexes] else: # Drop duplicate divisions returned by partition quantiles divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]] mins = remove_nans(mins) maxes = remove_nans(maxes) if pd.api.types.is_categorical_dtype(partition_col.dtype): dtype = partition_col.dtype mins = pd.Categorical(mins, dtype=dtype).codes.tolist() maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist() return divisions, mins, maxes