def _calc_box_stats(grp_srs: dd.Series, grp: str, dlyd: bool = False) -> pd.DataFrame:
    """
    Auxiliary function to calculate the Tukey box plot statistics.

    dlyd indicates that this function is called while dask is computing in
    parallel (dask.delayed), in which case no explicit compute() calls are needed.
    """
    stats: Dict[str, Any] = dict()

    try:  # workaround for when no data is passed to this function
        if dlyd:
            qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]), 3)
        else:
            qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]).compute(), 3)
        stats["q1"], stats["q2"], stats["q3"] = qntls[0.25], qntls[0.50], qntls[0.75]
    except ValueError:
        stats["q1"], stats["q2"], stats["q3"] = np.nan, np.nan, np.nan

    iqr = stats["q3"] - stats["q1"]
    stats["lw"] = grp_srs[grp_srs >= stats["q1"] - 1.5 * iqr].min()
    stats["uw"] = grp_srs[grp_srs <= stats["q3"] + 1.5 * iqr].max()
    if not dlyd:
        stats["lw"], stats["uw"] = dask.compute(stats["lw"], stats["uw"])

    otlrs = grp_srs[(grp_srs < stats["lw"]) | (grp_srs > stats["uw"])]
    if len(otlrs) > 100:  # sample at most 100 outliers
        otlrs = otlrs.sample(frac=100 / len(otlrs))
    stats["otlrs"] = list(otlrs) if dlyd else list(otlrs.compute())

    return pd.DataFrame({grp: stats})
def calc_box(srs: dd.Series, qntls: dd.Series) -> Dict[str, Any]:
    """
    Box plot calculations.

    Parameters
    ----------
    srs
        one numerical column
    qntls
        quantiles of the column
    """
    data: Dict[str, Any] = {}

    # quartiles
    data["qrtl1"] = qntls.loc[0.25].sum()
    data["qrtl2"] = qntls.loc[0.5].sum()
    data["qrtl3"] = qntls.loc[0.75].sum()
    iqr = data["qrtl3"] - data["qrtl1"]
    srs_iqr = srs[srs.between(data["qrtl1"] - 1.5 * iqr, data["qrtl3"] + 1.5 * iqr)]

    # outliers
    otlrs = srs[~srs.between(data["qrtl1"] - 1.5 * iqr, data["qrtl3"] + 1.5 * iqr)]
    # randomly sample at most 100 outliers from each partition without replacement
    smp_otlrs = otlrs.map_partitions(lambda x: x.sample(min(100, x.shape[0])), meta=otlrs)

    data["lw"] = srs_iqr.min()
    data["uw"] = srs_iqr.max()
    data["otlrs"] = smp_otlrs.values
    ## if cfg.insights_enable
    data["notlrs"] = otlrs.shape[0]

    return data
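# Usage sketch (not part of the original module): feed calc_box the three
# quartiles it expects and materialize the lazy results with a single
# dask.compute call. The column values and partition count are illustrative
# assumptions.
def _demo_calc_box() -> None:
    import dask
    import dask.dataframe as dd
    import pandas as pd

    srs = dd.from_pandas(pd.Series([1.0, 2.0, 3.0, 4.0, 100.0], name="x"), npartitions=2)
    qntls = srs.quantile([0.25, 0.5, 0.75])
    (data,) = dask.compute(calc_box(srs, qntls))
    print(data["qrtl1"], data["qrtl3"], data["lw"], data["uw"])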
def _nom_calcs(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a nominal column in plot(df)
    """
    # dictionary of data for the bar chart and related insights
    data: Dict[str, Any] = {}

    # value counts for the bar chart and the uniformity insight
    grps = srs.value_counts(sort=False)

    if cfg.bar.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )
        data["nuniq"] = grps.shape[0]

    if cfg.insight.enable:
        data["chisq"] = chisquare(grps.values)  # chi-squared test for uniformity
        data["nuniq"] = grps.shape[0]  # number of unique values
        data["npres"] = grps.sum()  # number of present (not null) values
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
        data["min_len"], data["max_len"] = srs.str.len().min(), srs.str.len().max()

    return data
def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a continuous column in plot(df)
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    if cfg.insight.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values

    # drop infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # histogram
    data["hist"] = da.histogram(srs, bins=cfg.hist.bins, range=(srs.min(), srs.max()))

    if cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
        data["norm"] = normaltest(data["hist"][0])
        data["skew"] = skewtest(data["hist"][0])
        data["nneg"] = (srs < 0).sum()  # number of negative values
        data["nuniq"] = srs.nunique_approx()  # approximate number of unique values
        data["nzero"] = (srs == 0).sum()  # number of zeros
        data["nreals"] = srs.shape[0]  # number of non-inf values

    return data
def uni_histogram(
    srs: dd.Series,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""
    if is_dtype(detect_dtype(srs, dtype), Continuous()):
        counts, edges = da.histogram(srs, bins, range=[srs.min(), srs.max()])
        centers = (edges[:-1] + edges[1:]) / 2
        return counts, centers, edges
    elif is_dtype(detect_dtype(srs, dtype), Nominal()):
        # Dask array's unique is way slower than value_counts on a Series.
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")
def _calc_box(srs: dd.Series, qntls: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Box plot calculations
    """
    # quartiles
    data = {f"qrtl{i + 1}": qntls.loc[qnt].sum() for i, qnt in enumerate((0.25, 0.5, 0.75))}

    # inter-quartile range
    iqr = data["qrtl3"] - data["qrtl1"]
    srs_iqr = srs[srs.between(data["qrtl1"] - 1.5 * iqr, data["qrtl3"] + 1.5 * iqr)]

    # lower and upper whiskers
    data["lw"], data["uw"] = srs_iqr.min(), srs_iqr.max()

    # outliers
    otlrs = srs[~srs.between(data["qrtl1"] - 1.5 * iqr, data["qrtl3"] + 1.5 * iqr)]
    # randomly sample at most 100 outliers from each partition without replacement
    smp_otlrs = otlrs.map_partitions(lambda x: x.sample(min(100, x.shape[0])), meta=otlrs)
    data["otlrs"] = smp_otlrs.values

    if cfg.insight.enable:
        data["notlrs"] = otlrs.shape[0]

    return data
def calc_hist(srs: dd.Series, bins: int, orig_df_len: int) -> Tuple[pd.DataFrame, float]:
    """
    Calculate a histogram over a given series.

    Parameters
    ----------
    srs : dd.Series
        one numerical column over which to compute the histogram
    bins : int
        number of bins to use in the histogram
    orig_df_len : int
        length of the original dataframe

    Returns
    -------
    Tuple[pd.DataFrame, float]
        The histogram in a dataframe and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)

    data = srs.dropna().values
    if len(data) == 0:  # all values in the column are missing
        return pd.DataFrame({"left": [], "right": [], "freq": []}), miss_pct

    minv, maxv = data.min(), data.max()
    hist_arr, bins_arr = np.histogram(data, range=[minv, maxv], bins=bins)
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame(
        {
            "intervals": intervals,
            "left": bins_arr[:-1],
            "right": bins_arr[1:],
            "freq": hist_arr,
            "pct": hist_arr / orig_df_len * 100,
        }
    )
    return hist_df, miss_pct
def uni_histogram(
    srs: dd.Series,
    srs_dtype: DType,
    cfg: Config,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""
    if isinstance(srs_dtype, Continuous):
        counts, edges = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        centers = (edges[:-1] + edges[1:]) / 2
        return counts, centers, edges
    elif isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        # Dask array's unique is way slower than value_counts on a Series.
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")
def calc_stats_dt(srs: dd.Series) -> Dict[str, Any]:
    """
    Calculate stats from a datetime column.

    Parameters
    ----------
    srs
        a datetime column

    Returns
    -------
    Dict[str, Any]
        Dictionary that contains the overview statistics
    """
    size = len(srs)  # include nan
    count = srs.count()  # exclude nan
    uniq_count = srs.nunique()
    overview_dict = {
        "Distinct Count": uniq_count,
        "Unique (%)": uniq_count / count,
        "Missing": size - count,
        "Missing (%)": 1 - (count / size),
        "Memory Size": srs.memory_usage(),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }
    return overview_dict
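# Usage sketch (not part of the original module): the returned dictionary holds
# lazy dask scalars, so materialize it with one dask.compute call. The date
# range below is an illustrative assumption.
def _demo_calc_stats_dt() -> None:
    import dask
    import dask.dataframe as dd
    import pandas as pd

    srs = dd.from_pandas(
        pd.Series(pd.date_range("2021-01-01", periods=10, freq="D"), name="ts"),
        npartitions=2,
    )
    (overview,) = dask.compute(calc_stats_dt(srs))
    print(overview["Distinct Count"], overview["Missing (%)"])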
def calc_cat_stats(
    srs: dd.Series,
    df: dd.DataFrame,
    bins: int,
    nrows: int,
    nuniq: Optional[dd.core.Scalar] = None,
) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column.

    Parameters
    ----------
    srs
        a categorical column
    df
        groupby-count on the categorical column as a dataframe
    bins
        number of bins for the category length frequency histogram
    nrows
        number of rows before dropping null values
    nuniq
        number of unique values in the column
    """
    # pylint: disable=too-many-locals
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,  # if cfg.bar_enable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }

    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }

    # letter stats
    # computed on the groupby-count dataframe:
    # compute the statistic for each group, then multiply by the count of the group
    grp, col = df.columns
    lc_cnt = (df[grp].str.count(r"[a-z]") * df[col]).sum()
    uc_cnt = (df[grp].str.count(r"[A-Z]") * df[col]).sum()
    letter = {
        "Count": lc_cnt + uc_cnt,
        "Lowercase Letter": lc_cnt,
        "Space Separator": (df[grp].str.count(r"[ ]") * df[col]).sum(),
        "Uppercase Letter": uc_cnt,
        "Dash Punctuation": (df[grp].str.count(r"[-]") * df[col]).sum(),
        "Decimal Number": (df[grp].str.count(r"[0-9]") * df[col]).sum(),
    }

    return {"stats": stats, "len_stats": leng, "letter_stats": letter, "len_hist": hist}
def calc_nom_col(srs: dd.Series, first_rows: pd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
    """
    Computations for a categorical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the barchart and insights
    first_rows
        first rows of the dataset read into memory
    ngroups
        number of groups to show in the barchart
    largest
        whether to show the largest or smallest groups
    """
    # dictionary of data for the bar chart and related insights
    data = {}

    ## if cfg.barchart_enable or cfg.insight.uniform_enable:
    grps = srs.value_counts(sort=False)

    ## if cfg.barchart_enable:
    ## nbars = cfg.barchart_nbars
    ## largest = cfg.barchart_largest
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)

    ## if cfg.insight.uniform_enable:
    # compute a chi-squared test on the frequency distribution
    data["chisq"] = chisquare(grps.values)

    ## if cfg.barchart_enable or cfg.insight.unique_enable:
    # total number of groups
    data["nuniq"] = grps.shape[0]

    ## if cfg.insight.missing_enable:
    # number of present (not null) values
    data["npres"] = grps.sum()

    ## if cfg.insight.unique_enable and not cfg.barchart_enable:
    ## data["nuniq"] = srs.nunique()
    ## if cfg.insight.missing_enable and not cfg.barchart_enable:
    ## data["npresent"] = srs.shape[0]

    ## if cfg.insight.constant_length_enable:
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(str)  # srs must be a string to compute the value lengths
    length = srs.str.len()
    data["min_len"], data["max_len"] = length.min(), length.max()

    return data
def calc_cat_stats(
    srs: dd.Series, bins: int, nrows: int, nuniq: Optional[dd.core.Scalar] = None
) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column.

    Parameters
    ----------
    srs
        a categorical column
    bins
        number of bins for the category length frequency histogram
    nrows
        number of rows before dropping null values
    nuniq
        number of unique values in the column
    """
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,  # if cfg.bar_enable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }

    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }

    # letter stats
    letter = {
        "Count": srs.str.count(r"[a-zA-Z]").sum(),
        "Lowercase Letter": srs.str.count(r"[a-z]").sum(),
        "Space Separator": srs.str.count(r"[ ]").sum(),
        "Uppercase Letter": srs.str.count(r"[A-Z]").sum(),
        "Dash Punctuation": srs.str.count(r"[-]").sum(),
        "Decimal Number": srs.str.count(r"[0-9]").sum(),
    }

    return {"stats": stats, "len_stats": leng, "letter_stats": letter, "len_hist": hist}
def _calc_box_stats(grp_srs: dd.Series, grp: str) -> pd.DataFrame:
    """
    Auxiliary function to calculate the Tukey box plot statistics.

    Parameters
    ----------
    grp_srs: dd.Series
        one numerical column
    grp: str
        name of the group of the corresponding series values

    Returns
    -------
    pd.DataFrame
        A dataframe containing box plot statistics
    """
    stats: Dict[str, Any] = dict()

    try:  # workaround for when no data is passed to this function
        qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]).compute(), 3)
        stats["q1"], stats["q2"], stats["q3"] = qntls[0.25], qntls[0.50], qntls[0.75]
    except ValueError:
        stats["q1"], stats["q2"], stats["q3"] = np.nan, np.nan, np.nan

    iqr = stats["q3"] - stats["q1"]
    stats["lw"] = grp_srs[grp_srs >= stats["q1"] - 1.5 * iqr].min().compute()
    stats["uw"] = grp_srs[grp_srs <= stats["q3"] + 1.5 * iqr].max().compute()

    otlrs = grp_srs[(grp_srs < stats["lw"]) | (grp_srs > stats["uw"])]
    if len(otlrs) > 100:  # sample at most 100 outliers
        otlrs = otlrs.sample(frac=100 / len(otlrs))
    stats["otlrs"] = list(otlrs.compute())

    return pd.DataFrame({grp: stats})
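# Usage sketch (not part of the original module): compute box plot stats for a
# single group of values. The group label "a" and the sample values are
# illustrative assumptions.
def _demo_calc_box_stats() -> None:
    import dask.dataframe as dd
    import pandas as pd

    grp_srs = dd.from_pandas(pd.Series([1.0, 2.0, 3.0, 4.0, 50.0]), npartitions=1)
    box_df = _calc_box_stats(grp_srs, "a")
    # Rows are q1/q2/q3, the lower/upper whiskers, and the sampled outliers.
    print(box_df)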
def is_geopoint(col: dd.Series) -> bool:
    """
    Given a column, return whether its type is a geopoint type
    """
    lat_long = pd.Series(col.compute()[:100], dtype="string")
    lat_long_ratio: float = np.sum(validate_lat_long(lat_long)) / lat_long.shape[0]
    return lat_long_ratio > 0.8
def is_geography(col: dd.Series) -> bool:
    """
    Given a column, return whether its type is a geography type
    """
    geo = col.compute()[:100]
    geo_ratio: float = np.sum(validate_country(geo)) / geo.shape[0]
    return geo_ratio > 0.8
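# Usage sketch (assumption, not part of the original module): both detectors
# sample the first 100 values and require more than 80% of them to validate.
# validate_country (and validate_lat_long above) are assumed to be in scope,
# as in the original module (dataprep.clean provides functions by these names).
def _demo_is_geography() -> None:
    import dask.dataframe as dd
    import pandas as pd

    col = dd.from_pandas(pd.Series(["Canada", "China", "Peru", "France"]), npartitions=1)
    print(is_geography(col))  # True when over 80% of sampled values validate as countries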
def detect_without_known(col: dd.Series, detect_small_distinct: bool) -> DType:
    # pylint: disable=too-many-return-statements
    """
    Detect the dtype of a column when the user didn't specify one.
    """
    if is_nominal(col.dtype):
        if is_geography(col):
            return GeoGraphy()
        if is_geopoint(col):
            return GeoPoint()
        return Nominal()
    elif is_continuous(col.dtype):
        if detect_small_distinct:
            # detect as categorical if the number of distinct values is small
            nuniques = col.nunique_approx().compute()
            if nuniques < 10:
                return Nominal()
        return Continuous()
    elif is_datetime(col.dtype):
        return DateTime()
    else:
        raise UnreachableError
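# Usage sketch (assumption, not part of the original module): a low-cardinality
# numeric column is detected as Nominal when detect_small_distinct is True.
# The helper predicates (is_nominal, is_continuous, is_datetime) are assumed
# to be in scope, as in the original module.
def _demo_detect_without_known() -> None:
    import dask.dataframe as dd
    import pandas as pd

    col = dd.from_pandas(pd.Series([0.0, 1.0, 0.0, 1.0] * 25), npartitions=2)
    print(detect_without_known(col, detect_small_distinct=True))   # Nominal()
    print(detect_without_known(col, detect_small_distinct=False))  # Continuous()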
def get_type(data: dd.Series) -> DataType:
    """
    Returns the type of the input data. Identified types
    follow the DataType enumeration.

    Parameters
    ----------
    data
        The data for which the type needs to be identified.

    Returns
    -------
    DataType
        The type of the data.
    """
    col_type = DataType.TYPE_UNSUP
    try:
        if pd.api.types.is_bool_dtype(data):
            col_type = DataType.TYPE_CAT
        elif (
            pd.api.types.is_numeric_dtype(data)
            and dask.compute(data.dropna().unique().size)[0] == 2
        ):
            col_type = DataType.TYPE_CAT
        elif pd.api.types.is_numeric_dtype(data):
            col_type = DataType.TYPE_NUM
        else:
            col_type = DataType.TYPE_CAT
    except NotImplementedError as error:  # TO-DO
        LOGGER.info("Type cannot be determined due to : %s", error)
    return col_type
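# Usage sketch (assumption, not part of the original module): a numeric column
# with exactly two distinct values is classified as categorical, other numeric
# columns as numerical. DataType and LOGGER are assumed to be defined in the
# original module.
def _demo_get_type() -> None:
    import dask.dataframe as dd
    import pandas as pd

    binary = dd.from_pandas(pd.Series([0, 1, 1, 0]), npartitions=1)
    numeric = dd.from_pandas(pd.Series([0.5, 1.2, 3.4]), npartitions=1)
    print(get_type(binary))   # DataType.TYPE_CAT
    print(get_type(numeric))  # DataType.TYPE_NUM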
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Computations for a numerical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the histogram and insights
    bins
        number of bins in the histogram
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    ## if cfg.insight.missing_enable:
    data["npres"] = srs.shape[0]

    ## if cfg.insight.infinity_enable:
    is_inf_srs = srs.isin({np.inf, -np.inf})
    data["ninf"] = is_inf_srs.sum()

    # remove infinite values
    srs = srs[~is_inf_srs]

    ## if cfg.hist_enable or cfg.insight.uniform_enable or cfg.insight.normal_enable:
    ## bins = cfg.hist_bins
    data["hist"] = da.histogram(srs, bins=bins, range=[srs.min(), srs.max()])

    ## if cfg.insight.uniform_enable:
    data["chisq"] = chisquare(data["hist"][0])

    ## if cfg.insight.normal_enable:
    data["norm"] = normaltest(data["hist"][0])

    ## if cfg.insight.negative_enable:
    data["nneg"] = (srs < 0).sum()

    ## if cfg.insight.skew_enabled:
    data["skew"] = skewtest(data["hist"][0])

    ## if cfg.insight.unique_enabled:
    data["nuniq"] = srs.nunique()

    ## if cfg.insight.zero_enabled:
    data["nzero"] = (srs == 0).sum()

    return data
def histogram(
    srs: dd.Series,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
    dtype: Optional[DTypeDef] = None,
) -> Union[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array, da.Array]]:
    """
    Calculate "histogram" for both numerical and categorical
    """
    if is_dtype(detect_dtype(srs, dtype), Continuous()):
        if range is not None:
            minimum, maximum = range
        else:
            minimum, maximum = srs.min(axis=0), srs.max(axis=0)
            minimum, maximum = dask.compute(minimum, maximum)

        assert bins is not None, "num_bins cannot be None if calculating numerical histograms"

        counts, edges = da.histogram(srs.to_dask_array(), bins, range=[minimum, maximum])
        centers = (edges[:-1] + edges[1:]) / 2

        if not return_edges:
            return counts, centers
        return counts, centers, edges
    elif is_dtype(detect_dtype(srs, dtype), Nominal()):
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()

        # Dask array doesn't understand pandas dtypes such as the categorical type.
        # We convert these types into str before calling `to_dask_array`.
        if is_pandas_categorical(value_counts.index.dtype):
            centers = value_counts.index.astype("str").to_dask_array()
        else:
            centers = value_counts.index.to_dask_array()

        return (counts, centers)
    else:
        raise UnreachableError()
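# Usage sketch (not part of the original module): the numerical branch of
# histogram() returns lazy dask arrays until computed. detect_dtype/is_dtype
# are assumed to be in scope, as in the original module; the sample values and
# bin count are illustrative assumptions.
def _demo_histogram() -> None:
    import dask
    import dask.dataframe as dd
    import pandas as pd

    srs = dd.from_pandas(pd.Series([1.0, 2.0, 2.5, 3.0, 4.0], name="x"), npartitions=1)
    counts, centers, edges = histogram(srs, bins=4)
    counts, centers, edges = dask.compute(counts, centers, edges)
    print(counts, edges)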
def calc_qqnorm(srs: dd.Series) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculate a QQ plot given a series.

    Parameters
    ----------
    srs
        One numerical column from which to compute the quantiles

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        A tuple of (actual quantiles, theoretical quantiles)
    """
    q_range = np.linspace(0.01, 0.99, 100)
    actual_qs, mean, std = dask.compute(srs.quantile(q_range), srs.mean(), srs.std())
    theory_qs = np.sort(np.asarray(norm.ppf(q_range, mean, std)))
    return actual_qs, theory_qs
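# Usage sketch (not part of the original module): exercise calc_qqnorm on a
# synthetic normal sample; for normal data the two quantile arrays should lie
# near the line y = x. The column name "x", seed, and sample size are
# illustrative assumptions; norm is assumed imported from scipy.stats as in
# the original module.
def _demo_calc_qqnorm() -> None:
    import dask.dataframe as dd
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    pdf = pd.DataFrame({"x": rng.normal(loc=0.0, scale=1.0, size=1_000)})
    ddf = dd.from_pandas(pdf, npartitions=2)
    actual, theory = calc_qqnorm(ddf["x"])
    print(np.asarray(actual)[:3], theory[:3])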
def cast_column_to_type(col: dd.Series, expected_type: str) -> Optional[dd.Series]:
    """Cast the given column to the expected type"""
    current_type = col.dtype
    if similar_type(current_type, expected_type):
        logger.debug("...not converting.")
        return None

    current_float = pd.api.types.is_float_dtype(current_type)
    expected_integer = pd.api.types.is_integer_dtype(expected_type)
    if current_float and expected_integer:
        logger.debug("...truncating...")
        # Currently "trunc" can not be applied to NA (the pandas missing value type),
        # because NA is a different type. It works with np.nan though.
        # For our use case that does not matter, as the conversion to integer later
        # will convert both NA and np.nan to NA.
        col = da.trunc(col.fillna(value=np.nan))

    logger.debug(f"Need to cast from {current_type} to {expected_type}")
    return col.astype(expected_type)
def _calc_nom_stats(
    srs: dd.Series,
    df: dd.DataFrame,
    nrows: int,
    nuniq: dd.core.Scalar,
) -> Dict[str, Any]:
    """
    Calculate statistics for a nominal column
    """
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }

    # length stats
    leng = {
        "Mean": srs.str.len().mean(),
        "Standard Deviation": srs.str.len().std(),
        "Median": srs.str.len().quantile(0.5),
        "Minimum": srs.str.len().min(),
        "Maximum": srs.str.len().max(),
    }

    # letter stats
    # computed on the groupby-count dataframe:
    # compute the statistic for each group, then multiply by the count of the group
    grp, col = df.columns
    lc_cnt = (df[grp].str.count(r"[a-z]") * df[col]).sum()
    uc_cnt = (df[grp].str.count(r"[A-Z]") * df[col]).sum()
    letter = {
        "Count": lc_cnt + uc_cnt,
        "Lowercase Letter": lc_cnt,
        "Space Separator": (df[grp].str.count(r"[ ]") * df[col]).sum(),
        "Uppercase Letter": uc_cnt,
        "Dash Punctuation": (df[grp].str.count(r"[-]") * df[col]).sum(),
        "Decimal Number": (df[grp].str.count(r"[0-9]") * df[col]).sum(),
    }

    return {"stats": stats, "len_stats": leng, "letter_stats": letter}
def calc_bar_pie(srs: dd.Series, ngroups: int, largest: bool) -> Tuple[pd.DataFrame, int, float]:
    """
    Calculates the group counts given a series.

    Parameters
    ----------
    srs
        One categorical column
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count

    Returns
    -------
    Tuple[pd.DataFrame, int, float]
        A dataframe of the group counts, the total count of groups,
        and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    try:
        grp_srs = srs.groupby(srs).size()
    except TypeError:
        srs = srs.astype(str)
        grp_srs = srs.groupby(srs).size()

    # select the largest or smallest groups
    smp_srs = grp_srs.nlargest(n=ngroups) if largest else grp_srs.nsmallest(n=ngroups)
    df = smp_srs.to_frame().rename(columns={srs.name: "cnt"}).reset_index()

    # add a row containing the sum of the other groups
    other_cnt = len(srs) - df["cnt"].sum()
    df = pd.concat([df, pd.DataFrame({srs.name: ["Others"], "cnt": [other_cnt]})])

    # add a column containing the percent of count in each group
    df["pct"] = df["cnt"] / len(srs) * 100
    df.columns = ["col", "cnt", "pct"]
    df["col"] = df["col"].astype(str)  # needed when a numeric column is cast as categorical

    return df, len(grp_srs), miss_pct
def calc_stats_dt(srs: dd.Series) -> Dict[str, Any]:
    """
    Calculate stats from a datetime column
    """
    size = srs.shape[0]  # include nan
    count = srs.count()  # exclude nan
    # nunique_approx() raises an error for datetime types, so fall back to nunique()
    try:
        uniq_count = srs.nunique_approx()
    except:  # pylint: disable=W0702
        uniq_count = srs.nunique()
    overview_dict = {
        "Distinct Count": uniq_count,
        "Approximate Unique (%)": uniq_count / count,
        "Missing": size - count,
        "Missing (%)": 1 - (count / size),
        "Memory Size": srs.memory_usage(deep=True),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }
    return overview_dict
def coerce_code(v: dd.Series, codes: List[int]) -> dd.Series:
    """Coerce a series to integer codes, mapping unexpected values to -1."""
    # Set non-ints and unexpected codes to missing (-1)
    v = dd.to_numeric(v, errors="coerce")
    v = v.where(v.isin(codes), np.nan)
    return v.fillna(-1).astype("int8")
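# Usage sketch (not part of the original module): strings that don't parse as
# numbers and codes outside the allowed set both collapse to -1.
def _demo_coerce_code() -> None:
    import dask.dataframe as dd
    import pandas as pd

    v = dd.from_pandas(pd.Series(["1", "2", "x", "9"]), npartitions=1)
    print(coerce_code(v, codes=[1, 2, 3]).compute().tolist())  # [1, 2, -1, -1]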
def round_series_up(s: dd.Series) -> dd.Series:
    """Apply the roundup function to all elements of `s`"""
    return s.apply(roundup, meta=pd.Series(data=[], dtype=np.float32))
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values

    if cfg.hist.enable or (cfg.qqnorm.enable and cfg.insight.enable):
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])

    # compute only the required quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])

    if cfg.stats.enable or (cfg.hist.enable and cfg.insight.enable):
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)

    # compute the density histogram
    if cfg.kde.enable:
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        if not math.isclose(dask.compute(data["min"])[0], dask.compute(data["max"])[0]):
            data["dens"] = da.histogram(srs, cfg.kde.bins, (srs.min(), srs.max()), density=True)
            # gaussian kernel density estimate
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
            )
        else:
            data["kde"] = None

    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))

    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = dict()

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(sort=False)  # counts of unique values in the series
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )

        if cfg.bar.bars == cfg.pie.slices and cfg.bar.sort_descending == cfg.pie.sort_descending:
            data["pie"] = data["bar"]
        else:
            data["pie"] = (
                grps.nlargest(cfg.pie.slices)
                if cfg.pie.sort_descending
                else grps.nsmallest(cfg.pie.slices)
            )

        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif cfg.pie.slices == cfg.value_table.ngroups and cfg.pie.sort_descending:
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

    if cfg.insight.enable:
        data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    if cfg.stats.enable or cfg.wordlen.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
    if cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            df[df.columns[0]] = df[df.columns[0]].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {"Minimum": srs.str.len().min(), "Maximum": srs.str.len().max()}

    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins, (lens.min(), lens.max()))

    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        if all(
            getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
            for att in ("top_words", "stopword", "stem", "lemmatize")
        ):
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            data["word_cnts_cloud"] = word_freqs["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]
        else:
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )
            data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
def nom_comps(
    srs: dd.Series,
    first_rows: pd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Nominal())

    Parameters
    ----------
    srs
        one categorical column
    first_rows
        first rows of the dataset read into memory
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        number of bins for the category length frequency histogram
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """
    # pylint: disable=too-many-arguments
    data: Dict[str, Any] = {}

    # total rows
    data["nrows"] = srs.shape[0]
    # drop null values
    srs = srs.dropna()

    ## if cfg.bar_enable or cfg.pie_enable
    # counts of unique values in the series
    grps = srs.value_counts(sort=False)
    # total number of groups
    data["nuniq"] = grps.shape[0]
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)
    ## if cfg.barchart_bars == cfg.piechart_slices:
    data["pie"] = data["bar"]
    ## else
    ## data["pie"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)

    ## if cfg.insights.evenness_enable
    data["chisq"] = chisquare(grps.values)

    ## if cfg.stats_enable
    df = grps.reset_index()
    ## if cfg.stats_enable or cfg.word_freq_enable
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(str)  # srs must be a string to compute the value lengths
        df[df.columns[0]] = df[df.columns[0]].astype(str)
    data.update(calc_cat_stats(srs, df, bins, data["nrows"], data["nuniq"]))
    ## if cfg.word_freq_enable
    data.update(calc_word_freq(df, top_words, stopword, lemmatize, stem))

    return data
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Continuous())

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram
    """
    data: Dict[str, Any] = {}

    ## if cfg.stats_enable or cfg.hist_enable or ...
    # calculate the total number of rows, then drop the missing values
    data["nrows"] = srs.shape[0]
    srs = srs.dropna()
    ## if cfg.stats_enable
    # number of not null (present) values
    data["npres"] = srs.shape[0]
    # remove infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # shared computations
    ## if cfg.stats_enable or cfg.hist_enable or (cfg.qqplot_enable and cfg.insights_enable):
    data["min"], data["max"] = srs.min(), srs.max()
    ## if cfg.hist_enable or (cfg.qqplot_enable and cfg.insights_enable):
    data["hist"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]])
    ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
    data["norm"] = normaltest(data["hist"][0])
    ## if cfg.qqplot_enable
    data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    ## elif cfg.stats_enable
    ## data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    ## elif cfg.boxplot_enable
    ## data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    ## if cfg.stats_enable or (cfg.hist_enable and cfg.insights_enable):
    data["skew"] = skew(srs)

    ## if cfg.stats_enable
    data["nuniq"] = srs.nunique()
    data["nreals"] = srs.shape[0]
    data["nzero"] = (srs == 0).sum()
    data["nneg"] = (srs < 0).sum()
    data["mean"] = srs.mean()
    data["std"] = srs.std()
    data["kurt"] = kurtosis(srs)
    data["mem_use"] = srs.memory_usage(deep=True)

    ## if cfg.hist_enable and cfg.insights_enable
    data["chisq"] = chisquare(data["hist"][0])

    # compute the density histogram
    data["dens"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]], density=True)
    # gaussian kernel density estimate
    data["kde"] = gaussian_kde(
        srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
    )

    ## if cfg.box_enable
    data.update(calc_box(srs, data["qntls"]))

    return data