# Shared imports assumed by the snippets below; `Config`, `similar_type`,
# `logger`, and the `_calc_*` / `calc_*` helpers come from the surrounding
# packages and are not reproduced here.
from typing import Any, Dict, Tuple

import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
from scipy.stats import chisquare


def _nom_calcs(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a nominal column in plot(df)
    """
    # dictionary of data for the bar chart and related insights
    data: Dict[str, Any] = {}

    # value counts for barchart and uniformity insight
    grps = srs.value_counts(sort=False)

    if cfg.bar.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )
        data["nuniq"] = grps.shape[0]

    if cfg.insight.enable:
        data["chisq"] = chisquare(grps.values)  # chi-squared test for uniformity
        data["nuniq"] = grps.shape[0]  # number of unique values
        data["npres"] = grps.sum()  # number of present (not null) values
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
        data["min_len"], data["max_len"] = srs.str.len().min(), srs.str.len().max()

    return data
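# --- Usage sketch (not part of the original module) -------------------------
# A minimal way to exercise _nom_calcs without the real Config class: a
# SimpleNamespace duck-types the only attributes the function reads
# (cfg.bar.enable / bars / sort_descending and cfg.insight.enable). The demo
# name, the sample column, and all parameter values are hypothetical.
def _demo_nom_calcs() -> None:
    from types import SimpleNamespace

    import dask

    pdf = pd.Series(["a", "b", "b", "c", "c", "c"], name="letters")
    srs = dd.from_pandas(pdf, npartitions=1)
    cfg = SimpleNamespace(
        bar=SimpleNamespace(enable=True, bars=2, sort_descending=True),
        insight=SimpleNamespace(enable=True),
    )
    data = _nom_calcs(srs, head=pdf.head(), cfg=cfg)
    (data,) = dask.compute(data)  # materialize the lazy dask results in one pass
    print(data["bar"])  # counts of the two largest groups: c=3, b=2
    print(data["nuniq"], data["npres"], data["min_len"], data["max_len"])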
def calc_nom_col(
    srs: dd.Series, first_rows: pd.Series, ngroups: int, largest: bool
) -> Dict[str, Any]:
    """
    Computations for a categorical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the barchart and insights
    first_rows
        first rows of the dataset read into memory
    ngroups
        number of groups to show in the barchart
    largest
        whether to show the largest or smallest groups
    """
    # dictionary of data for the bar chart and related insights
    data: Dict[str, Any] = {}

    ## if cfg.barchart_enable or cfg.insight.uniform_enable:
    grps = srs.value_counts(sort=False)

    ## if cfg.barchart_enable:
    ##     nbars = cfg.barchart_nbars
    ##     largest = cfg.barchart_largest
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)

    ## if cfg.insight.uniform_enable:
    # compute a chi-squared test on the frequency distribution
    data["chisq"] = chisquare(grps.values)

    ## if cfg.barchart_enable or cfg.insight.unique_enable:
    # total number of groups
    data["nuniq"] = grps.shape[0]

    ## if cfg.insight.missing_enable:
    # number of present (not null) values
    data["npres"] = grps.sum()

    ## if cfg.insight.unique_enable and not cfg.barchart_enable:
    ##     data["nuniq"] = srs.nunique()
    ## if cfg.insight.missing_enable and not cfg.barchart_enable:
    ##     data["npres"] = srs.shape[0]

    ## if cfg.insight.constant_length_enable:
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(str)  # srs must be a string to compute the value lengths
    length = srs.str.len()
    data["min_len"], data["max_len"] = length.min(), length.max()

    return data
def cast_column_to_type(col: dd.Series, expected_type: str):
    """Cast the given column to the expected type"""
    current_type = col.dtype
    if similar_type(current_type, expected_type):
        logger.debug("...not converting.")
        return None

    if pd.api.types.is_integer_dtype(expected_type):
        if pd.api.types.is_float_dtype(current_type):
            logger.debug("...truncating...")
            # Currently "trunc" can not be applied to NA (the pandas missing value type),
            # because NA is a different type. It works with np.NaN though.
            # For our use case, that does not matter, as the conversion to integer later
            # will convert both NA and np.NaN to NA.
            col = da.trunc(col.fillna(value=np.NaN))
        elif pd.api.types.is_timedelta64_dtype(current_type):
            logger.debug(f"Explicitly casting from {current_type} to np.int64")
            return col.astype(np.int64)

    logger.debug(f"Need to cast from {current_type} to {expected_type}")
    return col.astype(expected_type)
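# --- Usage sketch (not part of the original module) -------------------------
# Illustrates the float -> nullable-integer path above: values are truncated
# and missing entries survive the cast as <NA>. Runs only where the
# surrounding module supplies `similar_type` and `logger`; the demo name and
# sample data are hypothetical.
def _demo_cast_column_to_type() -> None:
    col = dd.from_pandas(pd.Series([1.7, 2.2, None]), npartitions=1)
    cast = cast_column_to_type(col, "Int64")
    # 1.7 -> 1, 2.2 -> 2, NaN -> <NA> after the truncate-then-cast sequence
    print(cast.compute())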
def calc_bar_pie(srs: dd.Series, ngroups: int, largest: bool) -> Tuple[pd.DataFrame, int, float]:
    """
    Calculates the group counts given a series.

    Parameters
    ----------
    srs
        One categorical column
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count

    Returns
    -------
    Tuple[pd.DataFrame, int, float]
        A dataframe of the group counts, the total count of groups,
        and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    try:
        grp_srs = srs.groupby(srs).size()
    except TypeError:
        srs = srs.astype(str)
        grp_srs = srs.groupby(srs).size()
    # select largest or smallest groups
    smp_srs = grp_srs.nlargest(n=ngroups) if largest else grp_srs.nsmallest(n=ngroups)
    df = smp_srs.to_frame().rename(columns={srs.name: "cnt"}).reset_index()
    # add a row containing the sum of the other groups
    other_cnt = len(srs) - df["cnt"].sum()
    df = df.append(pd.DataFrame({srs.name: ["Others"], "cnt": [other_cnt]}))
    # add a column containing the percent of count in each group
    df["pct"] = df["cnt"] / len(srs) * 100
    df.columns = ["col", "cnt", "pct"]
    df["col"] = df["col"].astype(str)  # needed when numeric is cast as categorical
    return df, len(grp_srs), miss_pct
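# --- Usage sketch (not part of the original module) -------------------------
# Hypothetical driver for calc_bar_pie. Although the signature is annotated
# with dd.Series, every operation used above also accepts a plain pandas
# Series, which keeps this example eager and simple. Note that
# DataFrame.append was removed in pandas 2.0, so this sketch assumes the
# older pandas versions the original code was written against.
def _demo_calc_bar_pie() -> None:
    srs = pd.Series(["x", "x", "x", "y", "z", None], name="grp")
    df, ngrps, miss_pct = calc_bar_pie(srs, ngroups=2, largest=True)
    print(df)        # the two largest groups plus an "Others" row, with counts and percents
    print(ngrps)     # 3 distinct groups in total
    print(miss_pct)  # 16.7: one of the six values is missing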
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(sort=False)  # counts of unique values in the series
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )

        if cfg.bar.bars == cfg.pie.slices and cfg.bar.sort_descending == cfg.pie.sort_descending:
            data["pie"] = data["bar"]
        else:  # different parameters for the bar and pie charts
            data["pie"] = (
                grps.nlargest(cfg.pie.slices)
                if cfg.pie.sort_descending
                else grps.nsmallest(cfg.pie.slices)
            )

        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif cfg.pie.slices == cfg.value_table.ngroups and cfg.pie.sort_descending:
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

        if cfg.insight.enable:
            data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    if cfg.stats.enable or cfg.wordlen.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
    if cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            df[df.columns[0]] = df[df.columns[0]].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {"Minimum": srs.str.len().min(), "Maximum": srs.str.len().max()}

    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins, (lens.min(), lens.max()))

    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        if all(
            getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
            for att in ("top_words", "stopword", "stem", "lemmatize")
        ):
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            data["word_cnts_cloud"] = word_freqs["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]
        else:
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )
            data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
def nom_comps(
    srs: dd.Series,
    first_rows: pd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Nominal())

    Parameters
    ----------
    srs
        one categorical column
    first_rows
        first rows of the dataset read into memory
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        number of bins for the category length frequency histogram
    top_words
        Number of highest frequency words to show in the wordcloud
        and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing the word frequencies, else don't
    stem
        If True, extract the stem of the words before computing the word frequencies, else don't
    """
    # pylint: disable=too-many-arguments
    data: Dict[str, Any] = {}

    # total rows
    data["nrows"] = srs.shape[0]
    # drop null values
    srs = srs.dropna()

    ## if cfg.bar_enable or cfg.pie_enable:
    # counts of unique values in the series
    grps = srs.value_counts(sort=False)
    # total number of groups
    data["nuniq"] = grps.shape[0]
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)
    ## if cfg.barchart_bars == cfg.piechart_slices:
    data["pie"] = data["bar"]
    ## else:
    ##     data["pie"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)

    ## if cfg.insights.evenness_enable:
    data["chisq"] = chisquare(grps.values)

    ## if cfg.stats_enable:
    df = grps.reset_index()

    ## if cfg.stats_enable or cfg.word_freq_enable:
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(str)  # srs must be a string to compute the value lengths
        df[df.columns[0]] = df[df.columns[0]].astype(str)

    data.update(calc_cat_stats(srs, df, bins, data["nrows"], data["nuniq"]))
    ## if cfg.word_freq_enable:
    data.update(calc_word_freq(df, top_words, stopword, lemmatize, stem))

    return data
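# --- Usage sketch (not part of the original module) -------------------------
# Hypothetical driver for the parameter-based nom_comps above. It only runs
# where the helpers it delegates to (calc_cat_stats, calc_word_freq) are
# importable from the surrounding module; the demo name, sample column, and
# parameter values are made up for illustration.
def _demo_nom_comps() -> None:
    import dask

    pdf = pd.Series(["apple pie", "apple", "banana", None], name="food")
    srs = dd.from_pandas(pdf, npartitions=1)
    data = nom_comps(
        srs,
        first_rows=pdf.head(),
        ngroups=2,
        largest=True,
        bins=10,
        top_words=5,
        stopword=True,
        lemmatize=False,
        stem=False,
    )
    (data,) = dask.compute(data)  # materialize all lazy results at once
    print(data["nrows"], data["nuniq"])  # 4 rows, 3 distinct non-null groups
    print(data["bar"])  # counts of the two largest groups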