def calc_hist(srs: dd.Series, bins: int, orig_df_len: int) -> Tuple[pd.DataFrame, float]:
    """
    Calculate a histogram over a given series.

    Parameters
    ----------
    srs : dd.Series
        one numerical column over which to compute the histogram
    bins : int
        number of bins to use in the histogram
    orig_df_len : int
        length of the original dataframe; used as the denominator of the
        "pct" column

    Returns
    -------
    Tuple[pd.DataFrame, float]:
        The histogram in a dataframe and the percent of missing values.
        The dataframe has the columns "intervals", "left", "right", "freq"
        and "pct", one row per bin. It has the same columns and no rows
        when every value in the column is missing.
    """
    columns = ("intervals", "left", "right", "freq", "pct")

    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    data = srs.dropna().values

    if len(data) == 0:
        # All values in the column are missing. Return an empty frame that
        # has the same columns as the regular result, so callers can read
        # any column without special-casing this path.
        return pd.DataFrame({name: [] for name in columns}), miss_pct

    # Bin edges span exactly the observed value range.
    minv, maxv = data.min(), data.max()
    hist_arr, bins_arr = np.histogram(data, range=[minv, maxv], bins=bins)
    intervals = _format_bin_intervals(bins_arr)

    hist_df = pd.DataFrame(
        {
            "intervals": intervals,
            # bins_arr holds bins + 1 edges: all but the last are left
            # edges, all but the first are right edges
            "left": bins_arr[:-1],
            "right": bins_arr[1:],
            "freq": hist_arr,
            # percentage relative to the original dataframe length
            "pct": hist_arr / orig_df_len * 100,
        }
    )
    return hist_df, miss_pct
def calc_bar_pie(
    srs: dd.Series, ngroups: int, largest: bool
) -> Tuple[pd.DataFrame, int, float]:
    """
    Calculates the group counts given a series.

    Parameters
    ----------
    srs
        One categorical column
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count

    Returns
    -------
    Tuple[pd.DataFrame, int, float]
        A dataframe of the group counts, the total count of groups,
        and the percent of missing values. The dataframe has the columns
        "col" (group label as a string), "cnt" (group size) and "pct"
        (group size as a percentage of the series length); its last row is
        labelled "Others" and holds the count of everything outside the
        selected groups.
    """
    nrows = len(srs)
    miss_pct = round(srs.isna().sum() / nrows * 100, 1)

    try:
        grp_srs = srs.groupby(srs).size()
    except TypeError:
        # Grouping on the raw values failed; group on their string form.
        srs = srs.astype(str)
        grp_srs = srs.groupby(srs).size()

    # select largest or smallest groups
    if largest:
        smp_srs = grp_srs.nlargest(n=ngroups)
    else:
        smp_srs = grp_srs.nsmallest(n=ngroups)

    # Name the label and count columns explicitly instead of deriving them
    # from srs.name: this also works when the series is unnamed or is
    # itself called "cnt".
    df = smp_srs.rename("cnt").rename_axis("col").reset_index()

    # Add a row containing the sum of the other groups.
    # NOTE: nrows counts every row, including rows with a missing value that
    # groupby left out of the groups, so those rows are counted here too.
    other_cnt = nrows - df["cnt"].sum()
    others = pd.DataFrame({"col": ["Others"], "cnt": [other_cnt]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement. ignore_index is deliberately left off so the resulting
    # index is the same as the one append used to produce.
    df = pd.concat([df, others])

    # add a column containing the percent of count in each group
    df["pct"] = df["cnt"] / nrows * 100

    # needed when numeric is cast as categorical
    df["col"] = df["col"].astype(str)

    return df, len(grp_srs), miss_pct