Beispiel #1
0
def _nom_calcs(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a nominal column in plot(df)
    """
    # dictionary of data for the bar chart and related insights
    data: Dict[str, Any] = {}

    # value counts for barchart and uniformity insight
    grps = srs.value_counts(sort=False)

    if cfg.bar.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )
        data["nuniq"] = grps.shape[0]

    if cfg.insight.enable:
        data["chisq"] = chisquare(grps.values)  # chi-squared test for uniformity
        data["nuniq"] = grps.shape[0]  # number of unique values
        data["npres"] = grps.sum()  # number of present (not null) values
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
        data["min_len"], data["max_len"] = srs.str.len().min(), srs.str.len().max()

    return data
Beispiel #2
0
def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a continuous column in plot(df)
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    if cfg.insight.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values

    # drop infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # histogram
    data["hist"] = da.histogram(srs, bins=cfg.hist.bins, range=(srs.min(), srs.max()))

    if cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
        data["norm"] = normaltest(data["hist"][0])
        data["skew"] = skewtest(data["hist"][0])
        data["nneg"] = (srs < 0).sum()  # number of negative values
        data["nuniq"] = srs.nunique_approx()  # number of unique values
        data["nzero"] = (srs == 0).sum()  # number of zeros
        data["nreals"] = srs.shape[0]  # number of non-inf values
    return data
Beispiel #3
0
def calc_nom_col(srs: dd.Series, first_rows: pd.Series, ngroups: int,
                 largest: bool) -> Dict[str, Any]:
    """
    Bar chart data and insight values for a categorical column in plot(df)

    Every value is computed unconditionally; there are no configuration
    switches in this function.

    Parameters
    ----------
    srs
        srs over which to compute the barchart and insights
    first_rows
        first rows of the dataset read into memory
    ngroups
        number of groups to show in the barchart
    largest
        whether to show the largest or smallest groups

    Returns
    -------
    Dict[str, Any]
        the keys "bar", "chisq", "nuniq", "npres", "min_len" and "max_len"
    """
    # frequency of every distinct value
    counts = srs.value_counts(sort=False)

    # choose which end of the frequency distribution goes into the bar chart
    pick_groups = counts.nlargest if largest else counts.nsmallest

    stats = {
        "bar": pick_groups(ngroups),
        # chi-squared test on the frequency distribution
        "chisq": chisquare(counts.values),
        # total number of groups
        "nuniq": counts.shape[0],
        # number of present (not null) values
        "npres": counts.sum(),
    }

    # value lengths can only be taken from strings, so cast when needed
    all_strings = first_rows.apply(lambda x: isinstance(x, str)).all()
    if not all_strings:
        srs = srs.astype(str)
    lens = srs.str.len()
    stats["min_len"] = lens.min()
    stats["max_len"] = lens.max()

    return stats
Beispiel #4
0
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Histogram and insight values for a numerical column in plot(df)

    Every value is computed unconditionally; there are no configuration
    switches in this function.

    Parameters
    ----------
    srs
        srs over which to compute the histogram and insights
    bins
        number of bins in the histogram

    Returns
    -------
    Dict[str, Any]
        the keys "npres", "ninf", "hist", "chisq", "norm", "nneg", "skew",
        "nuniq" and "nzero"
    """
    # size of the column as passed in (infinite values still included)
    result: Dict[str, Any] = {"npres": srs.shape[0]}

    # count the infinite values, then work on the finite ones only
    inf_mask = srs.isin({np.inf, -np.inf})
    result["ninf"] = inf_mask.sum()
    finite = srs[~inf_mask]

    # histogram spanning the whole finite range
    result["hist"] = da.histogram(
        finite, bins=bins, range=[finite.min(), finite.max()]
    )
    counts = result["hist"][0]  # the tests below run on the bin counts

    result["chisq"] = chisquare(counts)
    result["norm"] = normaltest(counts)
    result["nneg"] = (finite < 0).sum()  # number of negative values
    result["skew"] = skewtest(counts)
    result["nuniq"] = finite.nunique()  # number of unique values
    result["nzero"] = (finite == 0).sum()  # number of zeros

    return result
Beispiel #5
0
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)

    Parameters
    ----------
    srs
        one numerical column
    cfg
        configuration; the ``enable`` flags of ``stats``, ``hist``, ``qqnorm``,
        ``insight``, ``box``, ``kde`` and ``value_table`` select what is computed

    Returns
    -------
    Dict[str, Any]
        intermediate results keyed by name; a key is only present when one of
        the enabled components needs it
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values

    # the histogram is shared by the histogram plot and the normality insight
    if cfg.hist.enable or (cfg.qqnorm.enable and cfg.insight.enable):
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])

    # compute only the required amount of quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])

    if cfg.stats.enable or (cfg.hist.enable and cfg.insight.enable):
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)

    # compute the density histogram
    if cfg.kde.enable:
        # "min" and "max" are stored in `data` only when stats are enabled, so
        # evaluate the bounds from the series itself; reading them from `data`
        # raised a KeyError when the KDE was enabled without the stats
        lowest, highest = dask.compute(srs.min(), srs.max())
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        if not math.isclose(lowest, highest):
            data["dens"] = da.histogram(srs,
                                        cfg.kde.bins, (srs.min(), srs.max()),
                                        density=True)
            # gaussian kernel density estimate on at most 1000 sampled rows
            # per partition
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                                   meta=srs))
        else:
            data["kde"] = None

    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))

    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            # the value counts are needed anyway, so take the unique count from them
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
Beispiel #6
0
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)

    Parameters
    ----------
    srs
        one categorical column
    head
        in-memory values used to check whether the column holds strings
    cfg
        configuration; the ``enable`` flags of ``stats``, ``bar``, ``pie``,
        ``insight``, ``wordlen``, ``wordcloud`` and ``wordfreq`` select what is
        computed, and the remaining attributes parameterize the computations

    Returns
    -------
    Dict[str, Any]
        intermediate results keyed by name; "nrows" and "geo" are always
        present, the other keys depend on the enabled components
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(sort=False)  # counts of unique values in the series
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending
                       else grps.nsmallest(cfg.bar.bars))

        if cfg.bar.bars == cfg.pie.slices and cfg.bar.sort_descending == cfg.pie.sort_descending:
            data["pie"] = data["bar"]
        else:
            data["pie"] = (grps.nlargest(cfg.pie.slices)
                           if cfg.pie.sort_descending else grps.nsmallest(
                               cfg.pie.slices))

        # the value table always shows the largest groups; reuse bar/pie when
        # they already hold exactly that selection
        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif cfg.pie.slices == cfg.value_table.ngroups and cfg.pie.sort_descending:
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

        if cfg.insight.enable:
            data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    # the "len_stats" branch below also takes string lengths of srs, so it must
    # trigger the cast as well; previously only stats/wordlen did, and a
    # non-string column failed on `srs.str`
    wants_len_stats = (not cfg.stats.enable and cfg.wordfreq.enable
                       and cfg.insight.enable)
    srs_needs_str = cfg.stats.enable or cfg.wordlen.enable or wants_len_stats
    df_needs_str = cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable
    if srs_needs_str or df_needs_str:
        if not head.apply(lambda x: isinstance(x, str)).all():
            if srs_needs_str:
                # srs must be a string to compute the value lengths
                srs = srs.astype(str)
            if df_needs_str:
                name_col = df.columns[0]
                df[name_col] = df[name_col].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {
            "Minimum": srs.str.len().min(),
            "Maximum": srs.str.len().max()
        }
    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins,
                                        (lens.min(), lens.max()))
    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        if all(
                getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
                for att in ("top_words", "stopword", "stem", "lemmatize")):
            # identical settings: one computation serves both plots
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = word_freqs
        else:
            # NOTE(review): the first call gets a copy, presumably because
            # _calc_word_freq modifies its frame - confirm against the helper
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )

        # both word cloud values come from the word cloud computation; the
        # unique word count used to be read from the word frequency result
        data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
        data["nuniq_words_cloud"] = word_freqs_cloud["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
Beispiel #7
0
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Run every computation needed by plot(df, Continuous())

    Nothing here is optional: all statistics, both histograms, the kernel
    density estimate and the box plot data are always produced.

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram

    Returns
    -------
    Dict[str, Any]
        the values computed below keyed by name, merged with whatever
        ``calc_box`` returns
    """
    # total number of rows, counted before anything is dropped
    stats: Dict[str, Any] = {"nrows": srs.shape[0]}

    # number of not null (present) values
    present = srs.dropna()
    stats["npres"] = present.shape[0]

    # everything else works on the finite values only
    finite = present[~present.isin({np.inf, -np.inf})]

    # the range is shared by the count histogram and the density histogram
    lowest, highest = finite.min(), finite.max()
    stats["min"] = lowest
    stats["max"] = highest
    stats["hist"] = da.histogram(finite, bins=bins, range=[lowest, highest])
    bin_counts = stats["hist"][0]

    stats["norm"] = normaltest(bin_counts)
    # the 1st to 99th percentiles
    stats["qntls"] = finite.quantile(np.linspace(0.01, 0.99, 99))
    stats["skew"] = skew(finite)

    # descriptive statistics
    stats["nuniq"] = finite.nunique()
    stats["nreals"] = finite.shape[0]
    stats["nzero"] = (finite == 0).sum()
    stats["nneg"] = (finite < 0).sum()
    stats["mean"] = finite.mean()
    stats["std"] = finite.std()
    stats["kurt"] = kurtosis(finite)
    stats["mem_use"] = finite.memory_usage(deep=True)

    stats["chisq"] = chisquare(bin_counts)

    # density histogram over the same bins and range
    stats["dens"] = da.histogram(
        finite, bins=bins, range=[lowest, highest], density=True
    )

    # gaussian kernel density estimate on at most 1000 sampled rows per partition
    sampled = finite.map_partitions(
        lambda part: part.sample(min(1000, part.shape[0])), meta=finite
    )
    stats["kde"] = gaussian_kde(sampled)

    # box plot data derived from the quantiles
    stats.update(calc_box(finite, stats["qntls"]))

    return stats
Beispiel #8
0
def nom_comps(
    srs: dd.Series,
    first_rows: pd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    Run every computation needed by plot(df, Nominal())

    Nothing here is optional: the group counts, the chi-squared test, the
    category statistics and the word frequencies are always produced.

    Parameters
    ----------
    srs
        one categorical column
    first_rows
        in-memory values used to check whether the column holds strings
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        number of bins for the category length frequency histogram
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't

    Returns
    -------
    Dict[str, Any]
        "nrows", "nuniq", "bar", "pie" and "chisq", merged with whatever
        ``calc_cat_stats`` and ``calc_word_freq`` return
    """  # pylint: disable=too-many-arguments

    # count the rows first, then ignore the null values from here on
    total_rows = srs.shape[0]
    srs = srs.dropna()

    # counts of unique values in the series
    counts = srs.value_counts(sort=False)
    group_total = counts.shape[0]

    # the bar chart and the pie chart show the same selection of groups
    if largest:
        shown = counts.nlargest(ngroups)
    else:
        shown = counts.nsmallest(ngroups)

    result: Dict[str, Any] = {
        "nrows": total_rows,
        "nuniq": group_total,
        "bar": shown,
        "pie": shown,
        "chisq": chisquare(counts.values),
    }

    # dataframe with group names and counts
    frame = counts.reset_index()

    # lengths and words can only be taken from strings, so cast both the
    # column and the group names when needed
    holds_strings = first_rows.apply(lambda x: isinstance(x, str)).all()
    if not holds_strings:
        srs = srs.astype(str)
        name_col = frame.columns[0]
        frame[name_col] = frame[name_col].astype(str)

    result.update(calc_cat_stats(srs, frame, bins, total_rows, group_total))
    result.update(calc_word_freq(frame, top_words, stopword, lemmatize, stem))

    return result