def _calc_box(srs: dd.Series, qntls: da.Array, cfg: Config) -> Dict[str, Any]:
    """
    Assemble the statistics needed to draw a box plot of one column.

    Nothing is explicitly computed here: every value placed in the result is
    whatever the corresponding dask operation returns.

    Parameters
    ----------
    srs
        one numerical column
    qntls
        quantiles of the column, looked up by label (0.25, 0.5, 0.75) via ``.loc``
        NOTE(review): annotated as ``da.Array`` but accessed through ``.loc``,
        so a quantile-indexed Series-like object is presumably expected --
        confirm against the caller.
    cfg
        configuration; ``cfg.insight.enable`` controls whether the outlier
        count is added to the result

    Returns
    -------
    Dict[str, Any]
        ``qrtl1``/``qrtl2``/``qrtl3`` (quartiles), ``lw``/``uw`` (lower and upper
        whiskers), ``otlrs`` (sampled outlier values) and, only when insights
        are enabled, ``notlrs`` (number of outliers).
    """
    first_q = qntls.loc[0.25].sum()
    median = qntls.loc[0.5].sum()
    third_q = qntls.loc[0.75].sum()

    # fences sit 1.5 inter-quartile ranges outside the first and third quartiles
    spread = third_q - first_q
    lower_fence = first_q - 1.5 * spread
    upper_fence = third_q + 1.5 * spread

    # one mask splits the column into in-fence values and outliers
    within = srs.between(lower_fence, upper_fence)
    inliers = srs[within]
    outliers = srs[~within]

    # keep the plot light: at most 100 outliers per partition, drawn without replacement
    sampled = outliers.map_partitions(
        lambda part: part.sample(min(100, part.shape[0])), meta=outliers
    )

    result = {
        "qrtl1": first_q,
        "qrtl2": median,
        "qrtl3": third_q,
        # whiskers are the extreme values that still fall inside the fences
        "lw": inliers.min(),
        "uw": inliers.max(),
        "otlrs": sampled.values,
    }
    if cfg.insight.enable:
        # the count covers all outliers, not just the sampled ones
        result["notlrs"] = outliers.shape[0]
    return result
def calc_box(srs: dd.Series, qntls: da.Array) -> Dict[str, Any]:
    """
    Box plot calculations

    Nothing is explicitly computed here: every value placed in the result is
    whatever the corresponding dask operation returns.

    Parameters
    ----------
    srs
        one numerical column
    qntls
        quantiles of the column, looked up by label (0.25, 0.5, 0.75) via ``.loc``
        NOTE(review): annotated as ``da.Array`` but accessed through ``.loc``,
        so a quantile-indexed Series-like object is presumably expected --
        confirm against the caller.

    Returns
    -------
    Dict[str, Any]
        ``qrtl1``/``qrtl2``/``qrtl3`` (quartiles), ``lw``/``uw`` (lower and upper
        whiskers), ``otlrs`` (sampled outlier values) and ``notlrs`` (number
        of outliers).
    """
    q_low, q_mid, q_high = (qntls.loc[level].sum() for level in (0.25, 0.5, 0.75))

    # fences sit 1.5 inter-quartile ranges outside the first and third quartiles
    iq_range = q_high - q_low
    fence_lo = q_low - 1.5 * iq_range
    fence_hi = q_high + 1.5 * iq_range

    # one mask splits the column into in-fence values and outliers
    inside = srs.between(fence_lo, fence_hi)
    kept = srs[inside]
    rejected = srs[~inside]

    # randomly sample at most 100 outliers from each partition without replacement
    rejected_sample = rejected.map_partitions(
        lambda chunk: chunk.sample(min(100, chunk.shape[0])), meta=rejected
    )

    return {
        "qrtl1": q_low,
        "qrtl2": q_mid,
        "qrtl3": q_high,
        # whiskers are the extreme values that still fall inside the fences
        "lw": kept.min(),
        "uw": kept.max(),
        "otlrs": rejected_sample.values,
        # always reported: this function takes no config to gate the count on
        "notlrs": rejected.shape[0],
    }