def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a categorical series.

    Args:
        series: The Series to describe. It is cast to ``str`` before any of
            the text-based summaries are computed.
        series_description: The dict containing the series description so far.
            Its ``"value_counts_without_nan"`` entry is read and must hold at
            least one value, because position 0 is accessed unconditionally.

    Returns:
        A dict containing calculated series description values. ``"top"``,
        ``"freq"`` and the frequency histogram entries are always computed;
        the remaining entries depend on the ``config["vars"]`` switches.
    """
    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Only run if at least 1 non-missing value
    value_counts = series_description["value_counts_without_nan"]

    # presumably value_counts is ordered by descending frequency, so that
    # position 0 is the most common value - verify against the caller.
    stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

    # "redact" is an on/off switch, so it is read as a bool like every other
    # flag in this function (it was previously requested as a float).
    redact = config["vars"]["cat"]["redact"].get(bool)
    if not redact:
        stats["first_rows"] = series.head(5)

    stats.update(
        histogram_compute(
            value_counts, len(value_counts), name="histogram_frequencies"
        )
    )

    # NOTE(review): the threshold is taken from the "num" section although
    # this is the categorical summary - confirm that this is intended.
    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
        float
    )
    if chi_squared_threshold > 0.0:
        stats["chi_squared"] = list(chisquare(value_counts.values))

    check_length = config["vars"]["cat"]["length"].get(bool)
    if check_length:
        stats.update(length_summary(series))
        # The "length" entry read below is expected to come from
        # length_summary, hence this histogram lives inside the branch.
        stats.update(
            histogram_compute(
                stats["length"], stats["length"].nunique(), name="histogram_length"
            )
        )

    check_unicode = config["vars"]["cat"]["characters"].get(bool)
    if check_unicode:
        stats.update(unicode_summary(series))
        # Keep the value reported as "n_characters" under a separate key, then
        # overwrite "n_characters" with the sum of all character counts.
        stats["n_characters_distinct"] = stats["n_characters"]
        stats["n_characters"] = stats["character_counts"].values.sum()

        # Show alias names with spaces instead of underscores.
        stats["category_alias_counts"].index = stats[
            "category_alias_counts"
        ].index.str.replace("_", " ")

    # Resolve the switch to an actual bool instead of relying on the
    # truthiness of the raw configuration node.
    check_words = config["vars"]["cat"]["words"].get(bool)
    if check_words:
        stats.update(word_summary(series))

    coerce_str_to_date = config["vars"]["cat"]["coerce_str_to_date"].get(bool)
    if coerce_str_to_date:
        stats["date_warning"] = warning_type_date(series)

    return stats
def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
    """Summarise a categorical series.

    Args:
        series: The Series to summarise; it is converted to ``str`` first.
        series_description: The description computed so far. Its
            ``"value_counts_without_nan"`` entry is read here.

    Returns:
        A dict of summary values: ``"top"`` and ``"freq"`` are always set,
        the other entries depend on the ``config["vars"]`` switches.
    """
    # Work on string values only (Issue #100).
    series = series.astype(str)

    # Callers are expected to invoke this only when at least one non-missing
    # value exists: position 0 of the counts is read unconditionally.
    counts = series_description["value_counts_without_nan"]
    summary = {}
    summary["top"] = counts.index[0]
    summary["freq"] = counts.iloc[0]

    # A threshold that is not strictly positive skips the chi-squared entry.
    if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
        summary["chi_squared"] = list(chisquare(counts.values))

    cat_options = config["vars"]["cat"]

    if cat_options["length"].get(bool):
        # Imported lazily, only when the length summary is enabled.
        from visions.application.summaries.series.text_summary import length_summary

        summary.update(length_summary(series))

    if cat_options["unicode"].get(bool):
        # Imported lazily, only when the unicode summary is enabled.
        from visions.application.summaries.series.text_summary import (
            unicode_summary,
        )

        summary.update(unicode_summary(series))

        # Show alias names with spaces instead of underscores.
        alias_counts = summary["category_alias_counts"]
        alias_counts.index = alias_counts.index.str.replace("_", " ")

    if cat_options["coerce_str_to_date"].get(bool):
        summary["date_warning"] = warning_type_date(series)

    return summary