Example #1
def _calc_box_stats(grp_srs: dd.Series,
                    grp: str,
                    dlyd: bool = False) -> pd.DataFrame:
    """
    Auxiliary function to calculate the Tukey box plot statistics
    dlyd indicates whether this function is called from within dask.delayed, i.e. while dask is already computing in parallel
    """
    stats: Dict[str, Any] = dict()

    try:  # crude workaround for the case where no data is passed to this function
        if dlyd:
            qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]), 3)
        else:
            qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]).compute(), 3)
        stats["q1"], stats["q2"], stats["q3"] = qntls[0.25], qntls[
            0.50], qntls[0.75]
    except ValueError:
        stats["q1"], stats["q2"], stats["q3"] = np.nan, np.nan, np.nan

    iqr = stats["q3"] - stats["q1"]
    stats["lw"] = grp_srs[grp_srs >= stats["q1"] - 1.5 * iqr].min()
    stats["uw"] = grp_srs[grp_srs <= stats["q3"] + 1.5 * iqr].max()
    if not dlyd:
        stats["lw"], stats["uw"] = dask.compute(stats["lw"], stats["uw"])

    otlrs = grp_srs[(grp_srs < stats["lw"]) | (grp_srs > stats["uw"])]
    if len(otlrs) > 100:  # sample 100 outliers
        otlrs = otlrs.sample(frac=100 / len(otlrs))
    stats["otlrs"] = list(otlrs) if dlyd else list(otlrs.compute())

    return pd.DataFrame({grp: stats})
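
A minimal usage sketch for _calc_box_stats on its eager path (dlyd=False), assuming numpy, pandas, dask and dask.dataframe are imported as the snippet expects; the synthetic data and the group label "all" are illustrative:

import numpy as np
import pandas as pd
import dask.dataframe as dd

values = dd.from_pandas(pd.Series(np.random.normal(size=500), name="x"), npartitions=2)
box_df = _calc_box_stats(values, grp="all")  # one column per group, here a single group "all"
print(box_df)  # the index holds q1, q2, q3, lw, uw and the sampled otlrs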
Example #2
def calc_box(srs: dd.Series, qntls: dd.Series) -> Dict[str, Any]:
    """
    Box plot calculations

    Parameters
    ----------
    srs
        one numerical column
    qntls
        quantiles of the column
    """
    data: Dict[str, Any] = {}

    # quartiles
    data["qrtl1"] = qntls.loc[0.25].sum()
    data["qrtl2"] = qntls.loc[0.5].sum()
    data["qrtl3"] = qntls.loc[0.75].sum()
    iqr = data["qrtl3"] - data["qrtl1"]
    srs_iqr = srs[srs.between(data["qrtl1"] - 1.5 * iqr,
                              data["qrtl3"] + 1.5 * iqr)]
    # outliers
    otlrs = srs[~srs.between(data["qrtl1"] - 1.5 * iqr, data["qrtl3"] +
                             1.5 * iqr)]
    # randomly sample at most 100 outliers from each partition without replacement
    smp_otlrs = otlrs.map_partitions(lambda x: x.sample(min(100, x.shape[0])),
                                     meta=otlrs)
    data["lw"] = srs_iqr.min()
    data["uw"] = srs_iqr.max()
    data["otlrs"] = smp_otlrs.values
    ##    if cfg.insights_enable
    data["notlrs"] = otlrs.shape[0]

    return data
Example #3
def _nom_calcs(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a nominal column in plot(df)
    """
    # dictionary of data for the bar chart and related insights
    data: Dict[str, Any] = {}

    # value counts for barchart and uniformity insight
    grps = srs.value_counts(sort=False)

    if cfg.bar.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )
        data["nuniq"] = grps.shape[0]

    if cfg.insight.enable:
        data["chisq"] = chisquare(grps.values)  # chi-squared test for uniformity
        data["nuniq"] = grps.shape[0]  # number of unique values
        data["npres"] = grps.sum()  # number of present (not null) values
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
        data["min_len"], data["max_len"] = srs.str.len().min(), srs.str.len().max()

    return data
Example #4
def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a continuous column in plot(df)
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    if cfg.insight.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values

    # drop infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # histogram
    data["hist"] = da.histogram(srs, bins=cfg.hist.bins, range=(srs.min(), srs.max()))

    if cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
        data["norm"] = normaltest(data["hist"][0])
        data["skew"] = skewtest(data["hist"][0])
        data["nneg"] = (srs < 0).sum()  # number of negative values
        data["nuniq"] = srs.nunique_approx()  # number of unique values
        data["nzero"] = (srs == 0).sum()  # number of zeros
        data["nreals"] = srs.shape[0]  # number of non-inf values
    return data
Example #5
def uni_histogram(
    srs: dd.Series,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""

    if is_dtype(detect_dtype(srs, dtype), Continuous()):

        counts, edges = da.histogram(srs, bins, range=[srs.min(), srs.max()])
        centers = (edges[:-1] + edges[1:]) / 2

        return counts, centers, edges

    elif is_dtype(detect_dtype(srs, dtype), Nominal()):
        # Dask array's unique is way slower than the value_counts on Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)

        value_counts = srs.value_counts()

        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()

        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")
Example #6
def _calc_box(srs: dd.Series, qntls: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Box plot calculations
    """
    # quartiles
    data = {
        f"qrtl{i + 1}": qntls.loc[qnt].sum()
        for i, qnt in enumerate((0.25, 0.5, 0.75))
    }

    # inter-quartile range
    iqr = data["qrtl3"] - data["qrtl1"]
    srs_iqr = srs[srs.between(data["qrtl1"] - 1.5 * iqr,
                              data["qrtl3"] + 1.5 * iqr)]
    # lower and upper whiskers
    data["lw"], data["uw"] = srs_iqr.min(), srs_iqr.max()

    # outliers
    otlrs = srs[~srs.between(data["qrtl1"] - 1.5 * iqr, data["qrtl3"] +
                             1.5 * iqr)]
    # randomly sample at most 100 outliers from each partition without replacement
    smp_otlrs = otlrs.map_partitions(lambda x: x.sample(min(100, x.shape[0])),
                                     meta=otlrs)
    data["otlrs"] = smp_otlrs.values
    if cfg.insight.enable:
        data["notlrs"] = otlrs.shape[0]

    return data
Example #7
def calc_hist(srs: dd.Series, bins: int,
              orig_df_len: int) -> Tuple[pd.DataFrame, float]:
    """
    Calculate a histogram over a given series.

    Parameters
    ----------
    srs : dd.Series
        one numerical column over which to compute the histogram
    bins : int
        number of bins to use in the histogram
    orig_df_len : int
        length of the original dataframe

    Returns
    -------
    Tuple[pd.DataFrame, float]:
        The histogram in a dataframe and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    data = srs.dropna().values
    if len(data) == 0:  # all values in column are missing
        return pd.DataFrame({"left": [], "right": [], "freq": []}), miss_pct
    minv, maxv = data.min(), data.max()
    hist_arr, bins_arr = np.histogram(data, range=[minv, maxv], bins=bins)
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
        "pct": hist_arr / orig_df_len * 100,
    })
    return hist_df, miss_pct
Example #8
def uni_histogram(
    srs: dd.Series,
    srs_dtype: DType,
    cfg: Config,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical."""

    if isinstance(srs_dtype, Continuous):

        counts, edges = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        centers = (edges[:-1] + edges[1:]) / 2

        return counts, centers, edges

    elif isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        # Dask array's unique is way slower than the value_counts on Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)

        value_counts = srs.value_counts()

        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()

        return (counts, centers)
    else:
        raise ValueError(f"Unsupported dtype {srs.dtype}")
Example #9
def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
    """
    Calculate stats from a datetime column

    Parameters
    ----------
    srs
        a datetime column
    Returns
    -------
    Dict[str, str]
        Dictionary containing the overview statistics
    """
    size = len(srs)  # include nan
    count = srs.count()  # exclude nan
    uniq_count = srs.nunique()
    overview_dict = {
        "Distinct Count": uniq_count,
        "Unique (%)": uniq_count / count,
        "Missing": size - count,
        "Missing (%)": 1 - (count / size),
        "Memory Size": srs.memory_usage(),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }

    return overview_dict
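
A hedged usage sketch for calc_stats_dt; the column name "ts" and the sample dates are made up, and dask.compute materialises the lazy scalars in the returned dict:

import dask
import pandas as pd
import dask.dataframe as dd

dates = pd.Series(pd.date_range("2021-01-01", periods=10, freq="D"), name="ts")
dates.iloc[3] = pd.NaT  # one missing value
ddf = dd.from_pandas(dates.to_frame(), npartitions=2)
stats = dask.compute(calc_stats_dt(ddf["ts"]))[0]
print(stats["Missing"], stats["Minimum"], stats["Maximum"])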
Example #10
def calc_cat_stats(
    srs: dd.Series,
    df: dd.DataFrame,
    bins: int,
    nrows: int,
    nuniq: Optional[dd.core.Scalar] = None,
) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column

    Parameters
    ----------
    srs
        a categorical column
    df
        groupby-count on the categorical column as a dataframe
    bins
        number of bins for the category length frequency histogram
    nrows
        number of rows before dropping null values
    nuniq
        number of unique values in the column
    """
    # pylint: disable=too-many-locals
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,  # if cfg.bar_endable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }
    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }
    # letter stats
    # computed on groupby-count:
    # compute the statistic for each group then multiply by the count of the group
    grp, col = df.columns
    lc_cnt = (df[grp].str.count(r"[a-z]") * df[col]).sum()
    uc_cnt = (df[grp].str.count(r"[A-Z]") * df[col]).sum()
    letter = {
        "Count": lc_cnt + uc_cnt,
        "Lowercase Letter": lc_cnt,
        "Space Separator": (df[grp].str.count(r"[ ]") * df[col]).sum(),
        "Uppercase Letter": uc_cnt,
        "Dash Punctuation": (df[grp].str.count(r"[-]") * df[col]).sum(),
        "Decimal Number": (df[grp].str.count(r"[0-9]") * df[col]).sum(),
    }

    return {"stats": stats, "len_stats": leng, "letter_stats": letter, "len_hist": hist}
Example #11
def calc_nom_col(srs: dd.Series, first_rows: pd.Series, ngroups: int,
                 largest: bool) -> Dict[str, Any]:
    """
    Computations for a categorical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the barchart and insights
    first_rows
        first rows of the dataset read into memory
    ngroups
        number of groups to show in the barchart
    largest
        whether to show the largest or smallest groups
    """
    # dictionary of data for the bar chart and related insights
    data = {}

    ## if cfg.barchart_enable or cfg.insight.uniform_enable:
    grps = srs.value_counts(sort=False)

    ##    if cfg.barchart_enable:
    ##       nbars = cfg.barchart_nbars
    ##       largest = cfg.barchart_largest
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(
        ngroups)

    ##    if cfg.insight.uniform_enable:
    # compute a chi-squared test on the frequency distribution
    data["chisq"] = chisquare(grps.values)

    ##    if cfg.barchart_enable or cfg.insight.unique_enable:
    # total number of groups
    data["nuniq"] = grps.shape[0]

    ##    if cfg.insight.missing_enable:
    # number of present (not null) values
    data["npres"] = grps.sum()

    ## if cfg.insight.unique_enable and not cfg.barchart_enable:
    ## data["nuniq"] = srs.nunique()

    ## if cfg.insight.missing_enable and not cfg.barchart_enable:
    ## data["npresent"] = srs.shape[0]

    ## if cfg.insight.constant_length_enable:
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(
            str)  # srs must be a string to compute the value lengths
    length = srs.str.len()
    data["min_len"], data["max_len"] = length.min(), length.max()

    return data
Example #12
def calc_cat_stats(srs: dd.Series,
                   bins: int,
                   nrows: int,
                   nuniq: Optional[dd.core.Scalar] = None) -> Dict[str, Any]:
    """
    Calculate stats for a categorical column

    Parameters
    ----------
    srs
        a categorical column
    nrows
        number of rows before dropping null values
    bins
        number of bins for the category length frequency histogram
    nuniq
        number of unique values in the column
    """
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq":
        nuniq,  # if cfg.bar_endable or cfg.pie_enable else srs.nunique(),
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }
    # length stats
    lengths = srs.str.len()
    minv, maxv = lengths.min(), lengths.max()
    hist = da.histogram(lengths.values, bins=bins, range=[minv, maxv])
    leng = {
        "Mean": lengths.mean(),
        "Standard Deviation": lengths.std(),
        "Median": lengths.quantile(0.5),
        "Minimum": minv,
        "Maximum": maxv,
    }
    # letter stats
    letter = {
        "Count": srs.str.count(r"[a-zA-Z]").sum(),
        "Lowercase Letter": srs.str.count(r"[a-z]").sum(),
        "Space Separator": srs.str.count(r"[ ]").sum(),
        "Uppercase Letter": srs.str.count(r"[A-Z]").sum(),
        "Dash Punctuation": srs.str.count(r"[-]").sum(),
        "Decimal Number": srs.str.count(r"[0-9]").sum(),
    }

    return {
        "stats": stats,
        "len_stats": leng,
        "letter_stats": letter,
        "len_hist": hist
    }
Example #13
def _calc_box_stats(grp_srs: dd.Series, grp: str) -> pd.DataFrame:
    """
    Auxiliary function to calculate the Tukey box plot statistics

    Parameters
    ----------
    grp_srs: dd.Series
        one numerical column
    grp: str
        Name of the group of the corresponding series values

    Returns
    -------
    pd.DataFrame
        A dataframe containing box plot statistics
    """
    stats: Dict[str, Any] = dict()

    try:  # crude workaround for the case where no data is passed to this function
        qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]).compute(), 3)
        stats["q1"], stats["q2"], stats["q3"] = qntls[0.25], qntls[
            0.50], qntls[0.75]
    except ValueError:
        stats["q1"], stats["q2"], stats["q3"] = np.nan, np.nan, np.nan

    iqr = stats["q3"] - stats["q1"]
    stats["lw"] = grp_srs[grp_srs >= stats["q1"] - 1.5 * iqr].min().compute()
    stats["uw"] = grp_srs[grp_srs <= stats["q3"] + 1.5 * iqr].max().compute()

    otlrs = grp_srs[(grp_srs < stats["lw"]) | (grp_srs > stats["uw"])]
    if len(otlrs) > 100:  # sample 100 outliers
        otlrs = otlrs.sample(frac=100 / len(otlrs))
    stats["otlrs"] = list(otlrs.compute())

    return pd.DataFrame({grp: stats})
Example #14
def is_geopoint(col: dd.Series) -> bool:
    """
    Given a column, return if its type is a geopoint type
    """
    lat_long = pd.Series(col.compute()[:100], dtype="string")
    lat_long_ratio: float = np.sum(validate_lat_long(lat_long)) / lat_long.shape[0]
    return lat_long_ratio > 0.8
Example #15
def is_geography(col: dd.Series) -> bool:
    """
    Given a column, return if its type is a geography type
    """
    geo = col.compute()[:100]
    geo_ratio: float = np.sum(validate_country(geo)) / geo.shape[0]
    return geo_ratio > 0.8
Example #16
def detect_without_known(col: dd.Series, detect_small_distinct: bool) -> DType:
    # pylint: disable=too-many-return-statements
    """
    This function detects dtypes of column when users didn't specify.
    """
    if is_nominal(col.dtype):
        if is_geography(col):
            return GeoGraphy()
        if is_geopoint(col):
            return GeoPoint()
        else:
            return Nominal()

    elif is_continuous(col.dtype):
        if detect_small_distinct:
            # detect as categorical if distinct value is small
            nuniques = col.nunique_approx().compute()
            if nuniques < 10:
                return Nominal()
            else:
                return Continuous()
        else:
            return Continuous()

    elif is_datetime(col.dtype):
        return DateTime()
    else:
        raise UnreachableError
Example #17
def get_type(data: dd.Series) -> DataType:
    """ Returns the type of the input data.
        Identified types are according to the DataType Enumeration.

    Parameter
    __________
    The data for which the type needs to be identified.

    Returns
    __________
    str representing the type of the data.
    """
    col_type = DataType.TYPE_UNSUP
    try:
        if pd.api.types.is_bool_dtype(data):
            col_type = DataType.TYPE_CAT
        elif (pd.api.types.is_numeric_dtype(data)
              # dask.compute returns a tuple, so take its first element before comparing
              and dask.compute(data.dropna().unique().size)[0] == 2):
            col_type = DataType.TYPE_CAT
        elif pd.api.types.is_numeric_dtype(data):
            col_type = DataType.TYPE_NUM
        else:
            col_type = DataType.TYPE_CAT
    except NotImplementedError as error:  # TO-DO
        LOGGER.info("Type cannot be determined due to : %s", error)

    return col_type
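
get_type relies on a project-specific DataType enumeration and LOGGER; the stand-ins below are minimal hypothetical versions (only the members referenced above) so the sketch is self-contained:

import logging
from enum import Enum, auto

import dask
import pandas as pd
import dask.dataframe as dd

LOGGER = logging.getLogger(__name__)

class DataType(Enum):
    # minimal stand-in with only the members used by get_type
    TYPE_CAT = auto()
    TYPE_NUM = auto()
    TYPE_UNSUP = auto()

nums = dd.from_pandas(pd.Series([1.0, 2.5, 3.1]), npartitions=1)
flags = dd.from_pandas(pd.Series([0, 1, 1, 0]), npartitions=1)
print(get_type(nums))   # numeric with more than two distinct values -> TYPE_NUM
print(get_type(flags))  # binary numeric column, intended to be detected as TYPE_CAT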
Example #18
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Computations for a numerical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the histogram and insights
    bins
        number of bins in the histogram
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    ## if cfg.insight.missing_enable:
    data["npres"] = srs.shape[0]

    ## if cfg.insight.infinity_enable:
    is_inf_srs = srs.isin({np.inf, -np.inf})
    data["ninf"] = is_inf_srs.sum()

    # remove infinite values
    srs = srs[~is_inf_srs]

    ## if cfg.hist_enable or config.insight.uniform_enable or cfg.insight.normal_enable:
    ## bins = cfg.hist_bins
    data["hist"] = da.histogram(srs, bins=bins, range=[srs.min(), srs.max()])

    ## if cfg.insight.uniform_enable:
    data["chisq"] = chisquare(data["hist"][0])

    ## if cfg.insight.normal_enable
    data["norm"] = normaltest(data["hist"][0])

    ## if cfg.insight.negative_enable:
    data["nneg"] = (srs < 0).sum()

    ## if cfg.insight.skew_enabled:
    data["skew"] = skewtest(data["hist"][0])

    ## if cfg.insight.unique_enabled:
    data["nuniq"] = srs.nunique()

    ## if cfg.insight.zero_enabled:
    data["nzero"] = (srs == 0).sum()

    return data
Example #19
def histogram(
    srs: dd.Series,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
    dtype: Optional[DTypeDef] = None,
) -> Union[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array, da.Array]]:
    """
    Calculate "histogram" for both numerical and categorical
    """

    if is_dtype(detect_dtype(srs, dtype), Continuous()):
        if range is not None:
            minimum, maximum = range
        else:
            minimum, maximum = srs.min(axis=0), srs.max(axis=0)
        minimum, maximum = dask.compute(minimum, maximum)

        assert (
            bins is not None
        ), "num_bins cannot be None if calculating numerical histograms"

        counts, edges = da.histogram(srs.to_dask_array(),
                                     bins,
                                     range=[minimum, maximum])
        centers = (edges[:-1] + edges[1:]) / 2

        if not return_edges:
            return counts, centers
        return counts, centers, edges
    elif is_dtype(detect_dtype(srs, dtype), Nominal()):
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()

        # Dask array doesn't understand the pandas dtypes such as categorical type.
        # We convert these types into str before calling into `to_dask_array`.

        if is_pandas_categorical(value_counts.index.dtype):
            centers = value_counts.index.astype("str").to_dask_array()
        else:
            centers = value_counts.index.to_dask_array()
        return (counts, centers)
    else:
        raise UnreachableError()
Example #20
def calc_qqnorm(srs: dd.Series) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculate QQ plot given a series.

    Parameters
    ----------
    srs
        One numerical column from which to compute the quantiles

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        A tuple of (actual quantiles, theoretical quantiles)
    """
    q_range = np.linspace(0.01, 0.99, 100)
    actual_qs, mean, std = dask.compute(srs.quantile(q_range), srs.mean(),
                                        srs.std())
    theory_qs = np.sort(np.asarray(norm.ppf(q_range, mean, std)))
    return actual_qs, theory_qs
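
A short usage sketch for calc_qqnorm, assuming numpy, dask and scipy.stats.norm are imported as the snippet expects; the synthetic normal sample is illustrative:

import numpy as np
import pandas as pd
import dask.dataframe as dd

sample = dd.from_pandas(pd.Series(np.random.normal(loc=5.0, scale=2.0, size=2000)), npartitions=4)
actual_qs, theory_qs = calc_qqnorm(sample)
# for a roughly normal column the two quantile sequences lie close to the y = x line
print(np.corrcoef(actual_qs, theory_qs)[0, 1])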
Example #21
def cast_column_to_type(col: dd.Series, expected_type: str):
    """Cast the given column to the expected type"""
    current_type = col.dtype

    if similar_type(current_type, expected_type):
        logger.debug("...not converting.")
        return None

    current_float = pd.api.types.is_float_dtype(current_type)
    expected_integer = pd.api.types.is_integer_dtype(expected_type)
    if current_float and expected_integer:
        logger.debug("...truncating...")
        # Currently "trunc" can not be applied to NA (the pandas missing value type),
        # because NA is a different type. It works with np.NaN though.
        # For our use case, that does not matter, as the conversion to integer later
        # will convert both NA and np.NaN to NA.
        col = da.trunc(col.fillna(value=np.NaN))

    logger.debug(f"Need to cast from {current_type} to {expected_type}")
    return col.astype(expected_type)
Example #22
def _calc_nom_stats(
    srs: dd.Series,
    df: dd.DataFrame,
    nrows: int,
    nuniq: dd.core.Scalar,
) -> Dict[str, Any]:
    """
    Calculate statistics for a nominal column
    """
    # overview stats
    stats = {
        "nrows": nrows,
        "npres": srs.shape[0],
        "nuniq": nuniq,
        "mem_use": srs.memory_usage(deep=True),
        "first_rows": srs.reset_index(drop=True).loc[:4],
    }
    # length stats
    leng = {
        "Mean": srs.str.len().mean(),
        "Standard Deviation": srs.str.len().std(),
        "Median": srs.str.len().quantile(0.5),
        "Minimum": srs.str.len().min(),
        "Maximum": srs.str.len().max(),
    }
    # letter stats
    # computed on groupby-count:
    # compute the statistic for each group then multiply by the count of the group
    grp, col = df.columns
    lc_cnt = (df[grp].str.count(r"[a-z]") * df[col]).sum()
    uc_cnt = (df[grp].str.count(r"[A-Z]") * df[col]).sum()
    letter = {
        "Count": lc_cnt + uc_cnt,
        "Lowercase Letter": lc_cnt,
        "Space Separator": (df[grp].str.count(r"[ ]") * df[col]).sum(),
        "Uppercase Letter": uc_cnt,
        "Dash Punctuation": (df[grp].str.count(r"[-]") * df[col]).sum(),
        "Decimal Number": (df[grp].str.count(r"[0-9]") * df[col]).sum(),
    }

    return {"stats": stats, "len_stats": leng, "letter_stats": letter}
Example #23
def calc_bar_pie(srs: dd.Series, ngroups: int,
                 largest: bool) -> Tuple[pd.DataFrame, int, float]:
    """
    Calculates the group counts given a series.

    Parameters
    ----------
    srs
        One categorical column
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count

    Returns
    -------
    Tuple[pd.DataFrame, float]
        A dataframe of the group counts, the total count of groups,
        and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    try:
        grp_srs = srs.groupby(srs).size()
    except TypeError:
        srs = srs.astype(str)
        grp_srs = srs.groupby(srs).size()
    # select largest or smallest groups
    smp_srs = grp_srs.nlargest(n=ngroups) if largest else grp_srs.nsmallest(
        n=ngroups)
    df = smp_srs.to_frame().rename(columns={srs.name: "cnt"}).reset_index()
    # add a row containing the sum of the other groups
    other_cnt = len(srs) - df["cnt"].sum()
    df = df.append(pd.DataFrame({srs.name: ["Others"], "cnt": [other_cnt]}))
    # add a column containing the percent of count in each group
    df["pct"] = df["cnt"] / len(srs) * 100
    df.columns = ["col", "cnt", "pct"]
    df["col"] = df["col"].astype(
        str)  # needed when numeric is cast as categorical
    return df, len(grp_srs), miss_pct
Example #24
def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
    """
    Calculate stats from a datetime column
    """
    size = srs.shape[0]  # include nan
    count = srs.count()  # exclude nan
    # nunique_approx() raises an error when the dtype is datetime
    try:
        uniq_count = srs.nunique_approx()
    except:  # pylint: disable=W0702
        uniq_count = srs.nunique()
    overview_dict = {
        "Distinct Count": uniq_count,
        "Approximate Unique (%)": uniq_count / count,
        "Missing": size - count,
        "Missing (%)": 1 - (count / size),
        "Memory Size": srs.memory_usage(deep=True),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }

    return overview_dict
Example #25
def coerce_code(v: dd.Series, codes: List[int]) -> dd.Series:
    # Set non-ints and unexpected codes to missing (-1)
    v = dd.to_numeric(v, errors="coerce")
    v = v.where(v.isin(codes), np.nan)
    return v.fillna(-1).astype("int8")
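
A small usage sketch for coerce_code, assuming the imports the snippet expects (dask.dataframe as dd, numpy as np, typing.List); the raw string codes and the allowed code list are made up:

import numpy as np
import pandas as pd
import dask.dataframe as dd

raw = dd.from_pandas(pd.Series(["1", "2", "x", "9", None]), npartitions=1)
cleaned = coerce_code(raw, codes=[1, 2, 3])
print(cleaned.compute().tolist())  # non-numeric and out-of-range entries become -1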
Example #26
def round_series_up(s: dd.Series) -> dd.Series:
    """Apply roundup function to all elements of `s`"""
    return s.apply(roundup, meta=pd.Series(data=[], dtype=np.float32))
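
round_series_up depends on a roundup helper that is not shown here; below is a minimal sketch with a hypothetical ceiling-based stand-in, only to illustrate the element-wise apply:

import math
import numpy as np
import pandas as pd
import dask.dataframe as dd

def roundup(x: float) -> float:
    # hypothetical stand-in for the real helper
    return float(math.ceil(x))

s = dd.from_pandas(pd.Series([1.2, 3.7, 5.0]), npartitions=1)
print(round_series_up(s).compute().tolist())  # e.g. [2.0, 4.0, 5.0]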
Example #27
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values
    if cfg.hist.enable or cfg.qqnorm.enable and cfg.insight.enable:
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
    # compute only the required amount of quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    if cfg.stats.enable or cfg.hist.enable and cfg.insight.enable:
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)
    # compute the density histogram
    if cfg.kde.enable:
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        if not math.isclose(
                dask.compute(data["min"])[0],
                dask.compute(data["max"])[0]):
            data["dens"] = da.histogram(srs,
                                        cfg.kde.bins, (srs.min(), srs.max()),
                                        density=True)
            # gaussian kernel density estimate
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                                   meta=srs))
        else:
            data["kde"] = None
    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))
    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
Example #28
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = dict()

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(
        sort=False)  # counts of unique values in the series
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending
                       else grps.nsmallest(cfg.bar.bars))

        if cfg.bar.bars == cfg.pie.slices and cfg.bar.sort_descending == cfg.pie.sort_descending:
            data["pie"] = data["bar"]
        else:
            data["pie"] = (grps.nlargest(cfg.pie.slices)
                           if cfg.pie.sort_descending else grps.nsmallest(
                               cfg.pie.slices))

        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif cfg.pie.slices == cfg.value_table.ngroups and cfg.pie.sort_descending:
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

        if cfg.insight.enable:
            data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    if cfg.stats.enable or cfg.wordlen.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(
                str)  # srs must be a string to compute the value lengths
    if cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            df[df.columns[0]] = df[df.columns[0]].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {
            "Minimum": srs.str.len().min(),
            "Maximum": srs.str.len().max()
        }
    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins,
                                        (lens.min(), lens.max()))
    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        if all(
                getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
                for att in ("top_words", "stopword", "stem", "lemmatize")):
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            data["word_cnts_cloud"] = word_freqs["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]
        else:
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )
            data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
Example #29
def nom_comps(
    srs: dd.Series,
    first_rows: pd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Nominal())

    Parameters
    ----------
    srs
        one categorical column
    first_rows
        first rows of the dataset read into memory
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        number of bins for the category length frequency histogram
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """  # pylint: disable=too-many-arguments

    data: Dict[str, Any] = {}

    # total rows
    data["nrows"] = srs.shape[0]
    # drop null values
    srs = srs.dropna()

    ## if cfg.bar_enable or cfg.pie_enable
    # counts of unique values in the series
    grps = srs.value_counts(sort=False)
    # total number of groups
    data["nuniq"] = grps.shape[0]
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(
        ngroups)
    ##     if cfg.barchart_bars == cfg.piechart_slices:
    data["pie"] = data["bar"]
    ##     else
    ##     data["pie"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)
    ##     if cfg.insights.evenness_enable
    data["chisq"] = chisquare(grps.values)

    ## if cfg.stats_enable
    df = grps.reset_index()
    ## if cfg.stats_enable or cfg.word_freq_enable
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(
            str)  # srs must be a string to compute the value lengths
        df[df.columns[0]] = df[df.columns[0]].astype(str)
    data.update(calc_cat_stats(srs, df, bins, data["nrows"], data["nuniq"]))
    # ## if cfg.word_freq_enable
    data.update(calc_word_freq(df, top_words, stopword, lemmatize, stem))

    return data
Example #30
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Continuous())

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram
    """

    data: Dict[str, Any] = {}

    ## if cfg.stats_enable or cfg.hist_enable or
    # calculate the total number of rows then drop the missing values
    data["nrows"] = srs.shape[0]
    srs = srs.dropna()
    ## if cfg.stats_enable
    # number of not null (present) values
    data["npres"] = srs.shape[0]
    # remove infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # shared computations
    ## if cfg.stats_enable or cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["min"], data["max"] = srs.min(), srs.max()
    ## if cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["hist"] = da.histogram(srs,
                                bins=bins,
                                range=[data["min"], data["max"]])
    ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
    data["norm"] = normaltest(data["hist"][0])
    ## if cfg.qqplot_enable
    data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    ## elif cfg.stats_enable
    ## data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    ## elif cfg.boxplot_enable
    ## data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    ## if cfg.stats_enable or cfg.hist_enable and cfg.insights_enable:
    data["skew"] = skew(srs)

    # if cfg.stats_enable
    data["nuniq"] = srs.nunique()
    data["nreals"] = srs.shape[0]
    data["nzero"] = (srs == 0).sum()
    data["nneg"] = (srs < 0).sum()
    data["mean"] = srs.mean()
    data["std"] = srs.std()
    data["kurt"] = kurtosis(srs)
    data["mem_use"] = srs.memory_usage(deep=True)

    ## if cfg.hist_enable and cfg.insight_enable
    data["chisq"] = chisquare(data["hist"][0])

    # compute the density histogram
    data["dens"] = da.histogram(srs,
                                bins=bins,
                                range=[data["min"], data["max"]],
                                density=True)
    # gaussian kernel density estimate
    data["kde"] = gaussian_kde(
        srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                           meta=srs))

    ## if cfg.box_enable
    data.update(calc_box(srs, data["qntls"]))

    return data