def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a categorical series.

    The series is cast to ``str`` first, so the text statistics below operate
    on the string form of each value.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            Must provide ``"value_counts_without_nan"``; its first entry
            supplies ``top`` (index label) and ``freq`` (count).

    Returns:
        A dict with ``top``, ``freq`` and ``date_warning``. When the
        ``vars.cat.check_composition`` setting is enabled it also holds
        ``max_length``, ``mean_length``, ``min_length`` and ``composition``
        (a dict of boolean flags: ``chars``, ``digits``, ``spaces``,
        ``non-words``).
    """
    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Expects at least one non-missing value: the positional lookups below
    # fail on an empty value_counts.
    value_counts = series_description["value_counts_without_nan"]
    stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

    if config["vars"]["cat"]["check_composition"].get(bool):
        # Compute the per-value string lengths once and reuse them for all
        # three aggregates instead of re-running ``str.len()`` each time.
        lengths = series.str.len()
        stats["max_length"] = lengths.max()
        stats["mean_length"] = lengths.mean()
        stats["min_length"] = lengths.min()

        # Each flag is True when at least one value matches the pattern.
        stats["composition"] = {
            "chars": series.str.contains(r"[a-zA-Z]", case=False, regex=True).any(),
            "digits": series.str.contains(r"[0-9]", case=False, regex=True).any(),
            "spaces": series.str.contains(r"\s", case=False, regex=True).any(),
            "non-words": series.str.contains(r"\W", case=False, regex=True).any(),
        }

    stats["date_warning"] = warning_type_date(series)

    return stats
Ejemplo n.º 2
0
    def describe_categorical_1d(series: pd.Series,
                                series_description: dict) -> dict:
        """Describe a categorical series.

        The series is cast to ``str`` first, so the summaries below operate on
        the string form of each value.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.
                Must provide ``"value_counts_without_nan"``; its first entry
                supplies ``top`` (index label) and ``freq`` (count).

        Returns:
            A dict containing calculated series description values: always
            ``top``, ``freq`` and the entries produced by ``histogram_compute``
            for the value counts; the remaining entries (``first_rows``,
            ``chi_squared``, length / character / word summaries,
            ``date_warning``) are added only when the matching setting
            enables them.
        """
        # Make sure we deal with strings (Issue #100)
        series = series.astype(str)

        # Only run if at least 1 non-missing value
        value_counts = series_description["value_counts_without_nan"]

        stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

        # NOTE(review): "redact" is resolved as a float but only used for its
        # truthiness -- confirm the intended type of this setting.
        redact = config["vars"]["cat"]["redact"].get(float)
        if not redact:
            stats["first_rows"] = series.head(5)

        stats.update(
            histogram_compute(value_counts,
                              len(value_counts),
                              name="histogram_frequencies"))

        # The threshold lives under the "num" settings even though this is the
        # categorical description.
        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            stats["chi_squared"] = list(chisquare(value_counts.values))

        check_length = config["vars"]["cat"]["length"].get(bool)
        if check_length:
            stats.update(length_summary(series))
            # Reads the "length" entry present in stats after the update above.
            stats.update(
                histogram_compute(stats["length"],
                                  stats["length"].nunique(),
                                  name="histogram_length"))

        check_unicode = config["vars"]["cat"]["characters"].get(bool)
        if check_unicode:
            stats.update(unicode_summary(series))
            # Order matters: keep the value currently stored under
            # "n_characters" as "n_characters_distinct" before overwriting
            # "n_characters" with the sum of all character counts.
            stats["n_characters_distinct"] = stats["n_characters"]
            stats["n_characters"] = stats["character_counts"].values.sum()

            stats["category_alias_counts"].index = stats[
                "category_alias_counts"].index.str.replace("_", " ")

        # Resolve the flag with .get(bool) like every other setting in this
        # function instead of testing the raw configuration view.
        check_words = config["vars"]["cat"]["words"].get(bool)
        if check_words:
            stats.update(word_summary(series))

        coerce_str_to_date = config["vars"]["cat"]["coerce_str_to_date"].get(
            bool)
        if coerce_str_to_date:
            stats["date_warning"] = warning_type_date(series)

        return stats
def describe_unique_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a unique series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far
            (not used by this function).

    Returns:
        A dict with a single ``date_warning`` entry holding the result of
        ``warning_type_date(series)``.
    """
    return {"date_warning": warning_type_date(series)}
    def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict:
        """Summarise a categorical series.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far;
                its ``"value_counts_without_nan"`` entry is read here.

        Returns:
            A dict containing calculated series description values. ``top`` and
            ``freq`` are always present; ``chi_squared``, the length and unicode
            summaries and ``date_warning`` are added only when the matching
            setting enables them.
        """
        # Work on the string form of every value (Issue #100)
        text = series.astype(str)

        # Expects at least one non-missing value
        counts = series_description["value_counts_without_nan"]
        stats = {"top": counts.index[0], "freq": counts.iloc[0]}

        if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
            stats["chi_squared"] = list(chisquare(counts.values))

        if config["vars"]["cat"]["length"].get(bool):
            # Imported only when the length summary is actually requested.
            from visions.application.summaries.series.text_summary import length_summary

            stats.update(length_summary(text))

        if config["vars"]["cat"]["unicode"].get(bool):
            # Imported only when the unicode summary is actually requested.
            from visions.application.summaries.series.text_summary import unicode_summary

            stats.update(unicode_summary(text))

            # Show alias names with spaces instead of underscores.
            alias_counts = stats["category_alias_counts"]
            alias_counts.index = alias_counts.index.str.replace("_", " ")

        if config["vars"]["cat"]["coerce_str_to_date"].get(bool):
            stats["date_warning"] = warning_type_date(text)

        return stats
Ejemplo n.º 5
0
def describe_categorical_1d(series: pd.Series,
                            series_description: dict) -> dict:
    """Describe a categorical series.

    The series is cast to ``str`` first, so the text statistics below operate
    on the string form of each value.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.
            Must provide ``"value_counts_without_nan"``; its first entry
            supplies ``top`` (index label) and ``freq`` (count).

    Returns:
        A dict with ``top``, ``freq`` and ``date_warning``; ``chi_squared``
        when the chi-squared threshold setting is positive; and, when the
        ``vars.cat.check_composition`` setting is enabled, ``max_length``,
        ``mean_length``, ``min_length``, the entries returned by
        ``text_summary`` and ``length`` (the per-value string lengths).
    """
    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Only run if at least 1 non-missing value
    value_counts = series_description["value_counts_without_nan"]

    stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

    # The threshold lives under the "num" settings even though this is the
    # categorical description.
    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
        float)
    if chi_squared_threshold > 0.0:
        stats["chi_squared"] = list(chisquare(value_counts.values))

    check_composition = config["vars"]["cat"]["check_composition"].get(bool)
    if check_composition:
        # Compute the per-value string lengths once and reuse them for the
        # aggregates and for the stored "length" entry.
        lengths = series.str.len()
        stats["max_length"] = lengths.max()
        stats["mean_length"] = lengths.mean()
        stats["min_length"] = lengths.min()

        # Imported only when the composition check is actually requested.
        from visions.application.summaries.series.text_summary import text_summary

        stats.update(text_summary(series))
        # Assigned after the update so this value wins over any "length"
        # entry the summary may have provided.
        stats["length"] = lengths

    stats["date_warning"] = warning_type_date(series)

    return stats