def histogram(
    arr: da.Array,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate a "histogram" for either a numerical or a categorical array.

    Parameters
    ----------
    arr : da.Array
        One-dimensional dask array to summarise.
    bins : Optional[int]
        Number of bins. Required when the array is treated as continuous;
        not used for nominal data.
    return_edges : bool
        For continuous data only: when True the bin edges are appended to
        the returned tuple.
    range : Optional[Tuple[int, int]]
        ``(minimum, maximum)`` of the histogram. When omitted, the (lazy)
        min and max of ``arr`` are used.
    dtype : Optional[DTypeDef]
        Type hint forwarded to ``detect_dtype`` together with ``arr``.

    Returns
    -------
    Tuple[da.Array, ...]
        * continuous: ``(counts, centers, edges)``, or ``(counts, centers)``
          when ``return_edges`` is False; ``centers`` are the midpoints of
          consecutive edges.
        * nominal: ``(counts, centers)`` taken from the values and the index
          of ``value_counts()`` on the array converted to a dask series.

    Raises
    ------
    ValueError
        If ``arr`` is not 1-d, if ``bins`` is None for continuous data, or
        if the detected dtype is neither continuous nor nominal.
    """
    if len(arr.shape) != 1:
        raise ValueError("Histogram only supports 1-d array.")

    # Detect the dtype once and reuse the result for both checks below,
    # instead of re-running the detection in the nominal branch.
    detected = detect_dtype(arr, dtype)

    if is_dtype(detected, Continuous()):
        if range is not None:
            minimum, maximum = range
        else:
            minimum, maximum = arr.min(axis=0), arr.max(axis=0)

        if bins is None:
            raise ValueError(
                "num_bins cannot be None if calculating numerical histograms.")

        counts, edges = da.histogram(arr, bins, range=[minimum, maximum])
        centers = (edges[:-1] + edges[1:]) / 2

        if not return_edges:
            return counts, centers
        return counts, centers, edges

    if is_dtype(detected, Nominal()):
        # Dask array's unique is way slower than value_counts on a Series,
        # see https://github.com/dask/dask/issues/2851, so this goes through
        # a dask series rather than da.unique(arr, return_counts=True).
        srs = dd.from_dask_array(arr)
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return (counts, centers)

    raise ValueError(f"Unsupported dtype {arr.dtype}")
def calc_hist_kde(
    data: da.Array, bins: int, bandwidth: float
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Calculate a density histogram and its corresponding kernel density
    estimate over a given series. The kernel is Gaussian.

    Parameters
    ----------
    data: da.Array
        one numerical column over which to compute the histogram and kde
    bins : int
        number of bins to use in the histogram
    bandwidth: float
        bandwidth for the kde (passed to ``gaussian_kde`` as ``bw_method``)

    Returns
    -------
    Tuple[pd.DataFrame, np.ndarray, np.ndarray]
        The histogram in a dataframe (columns ``intervals``, ``left``,
        ``right``, ``freq``), the 1000 evenly spaced points between the data
        minimum and maximum at which the kde is evaluated, and the kde
        calculated at those points
    """
    # The histogram range must be concrete, so min/max are computed up front
    # (in a single pass over the graph).
    minv, maxv = dask.compute(data.min(), data.max())

    lazy_hist, bins_arr = da.histogram(
        data, range=[minv, maxv], bins=bins, density=True
    )

    # Materialise the histogram and the raw values in one compute call so
    # tasks shared by both are evaluated once rather than once per call.
    hist_arr, values = dask.compute(lazy_hist, data)

    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
    })

    # Evaluate the Gaussian kde on a fixed grid spanning the data range.
    pts_rng = np.linspace(minv, maxv, 1000)
    pdf = gaussian_kde(values, bw_method=bandwidth)(pts_rng)
    return hist_df, pts_rng, pdf