Example 1
def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """ Drops completely empty columns and rows by default and optionally provides \
        flexibility to loosen restrictions to drop additional non-empty columns and \
        rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 1
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do \
        not affect the drop thresholds, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[
            :, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
        ].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    data_cleaned = data.drop(
        index=data.loc[
            _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
        ].index
    ).dropna(axis=0, how="all")
    return data_cleaned
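A minimal usage sketch for drop_missing on a small hypothetical frame; it assumes the helpers used above (_validate_input_range, _missing_vals) are importable from the same module:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": [1, 2, 3, np.nan],                 # mostly filled
    "b": [np.nan, np.nan, np.nan, np.nan],  # completely empty
    "c": [np.nan, np.nan, np.nan, 4],       # 75% missing
})

# Default thresholds (1) only remove the fully empty column "b" and fully empty rows.
cleaned = drop_missing(df)

# A looser column threshold also drops "c" (NA-ratio 0.75 > 0.7),
# unless it is explicitly protected via col_exclude.
cleaned_loose = drop_missing(df, drop_threshold_cols=0.7, col_exclude=["c"])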
Example 2
def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """ Converts columns to best possible dtypes using dtypes supporting pd.NA.
    Temporarily not converting to integers due to an issue in pandas. This is expected \
        to be fixed in pandas 1.1. See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold \
        using cat_threshold or exclude columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted datatypes
    """

    # Validate Inputs
    _validate_input_bool(category, "Category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")

        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data
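A brief, hypothetical convert_datatypes example; it assumes the validation helpers as well as optimize_ints and optimize_floats are available in the same module:

import pandas as pd

df = pd.DataFrame({
    "city": ["Berlin", "Munich", "Berlin", "Berlin"] * 100,  # low unique-value ratio -> "category"
    "flag": [True, False, None, True] * 100,                 # booleans with missing values
    "value": [1.0, 2.5, None, 4.0] * 100,                    # floats (integers are not converted here)
})

converted = convert_datatypes(df, cat_threshold=0.05, cat_exclude=["flag"])
print(converted.dtypes)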
Example 3
def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Checks for duplicates in subsets of columns and pools them. This can reduce \
        the number of columns in the data without loosing much information. Suitable \
        columns are combined to subsets and tested for duplicates. In case sufficient \
        duplicates can be found, the respective columns are aggregated into a \
        "pooled_var" column. Identical numbers in the "pooled_var" column indicate \
        identical information in the respective rows.

        Note:  It is advised to exclude features that provide sufficient informational \
        content by themselves as well as the target column by using the "exclude" \
        setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
        considered in the further analysis. Columns with a lower ratio are not \
        considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate threshold higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with continuously smaller subsets until "min_col_pool" is reached, \
        by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible to suitable subsets and stops when "min_col_pool" is \
        reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are \
        passed through without modification, by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """

    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        check_list = [
            col for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list,
                                                  len(check_list) - i)
        else:
            continue

        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(),
                 combinations)
        ]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(
                itertools.combinations(check_list,
                                       len(check_list) - i),
                max_idx,
                max_idx + 1,
            )
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates().reset_index().rename(
                    columns={"index": "pooled_vars"}))
            data = data.merge(unique_subset,
                              how="left",
                              on=best_subset.columns.tolist()).drop(
                                  columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data
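A small, hypothetical call to pool_duplicate_subsets. The toy frame contains three low-information columns that can be pooled into a single "pooled_vars" column, while the high-information "price" column is excluded; itertools, numpy, pandas and the validation helper are assumed to be imported:

import pandas as pd

df = pd.DataFrame({
    "country": ["DE", "DE", "FR", "DE", "FR", "DE"] * 50,
    "currency": ["EUR"] * 300,
    "continent": ["Europe"] * 300,
    "price": range(300),  # informative column, passed through untouched
})

pooled, subset_cols = pool_duplicate_subsets(df, exclude=["price"], return_details=True)
print(subset_cols)   # columns that were merged into "pooled_vars"
print(pooled.head())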
Example 4
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Converts columns with a high ratio of missing values into binary features and \
    eventually drops them based on their correlation with other features and the \
    target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
    identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. E.g. label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is overstepped, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = (data_local[cols_mv].applymap(
        lambda x: 1 if not pd.isnull(x) else x).fillna(0))

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target,
                         colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[
            abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
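A hypothetical sketch of mv_col_handling on a frame with one sparsely populated column. It assumes corr_mat, _missing_vals and the validation helpers from this module are available; whether the flagged column is ultimately dropped depends on its correlations:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
base = rng.normal(size=n)
df = pd.DataFrame({
    "feature": base,
    "sparse": np.where(rng.random(n) < 0.6, np.nan, base),  # ~60% missing -> candidate column
})
target = pd.Series(base * 2 + rng.normal(scale=0.1, size=n), name="target")

reduced, cols_mv, dropped = mv_col_handling(df, target=target, return_details=True)
print(cols_mv, dropped)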
Example 5
def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    clean_col_names: bool = True,
    show: str = "changes",
) -> pd.DataFrame:
    """ Perform initial data cleaning tasks on a dataset, such as dropping single \
        valued and empty rows, empty columns as well as optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the \
        dropping of missing values, by default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of "object" columns to "category". Set threshold using \
        cat_threshold. Requires convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.03
    cat_exclude : Optional[List[str]], optional
        List of columns to exclude from categorical conversion, by default None
    clean_col_names : bool, optional
        Cleans the column names and provides hints on duplicate and long names, by \
        default True
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:

            * "all": Print information about the data before and after cleaning as \
            well as information about  changes and memory usage (deep). Please be \
            aware, that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the \
    same categories.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_dtypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data,
                                drop_threshold_cols,
                                drop_threshold_rows,
                                col_exclude=col_exclude)

    if clean_col_names:
        data_cleaned = clean_column_names(data_cleaned)

    single_val_cols = data_cleaned.columns[data_cleaned.nunique(
        dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned
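A minimal end-to-end sketch of data_cleaning on a toy frame; it assumes drop_missing, convert_datatypes, clean_column_names, _drop_duplicates, _diff_report and the validators are importable alongside this function:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "First Name ": ["Ada", "Ada", "Grace", None],   # messy column name, duplicate rows
    "constant": [1, 1, 1, 1],                       # single-valued -> dropped
    "empty": [np.nan, np.nan, np.nan, np.nan],      # completely empty -> dropped
    "score": [1.0, 1.0, 9.5, np.nan],
})

cleaned = data_cleaning(df, show="changes")
print(cleaned.columns.tolist())
print(cleaned.dtypes)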
Example 6
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    figsize: Tuple = (16, 2),
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Dict[str, Any] = None,
    rug_kws: Dict[str, Any] = None,
    fill_kws: Dict[str, Any] = None,
    font_kws: Dict[str, Any] = None,
):
    """ Two-dimensional visualization of the distribution of non binary numerical features.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default "orange"
    figsize : Tuple, optional
        Controls the figure size, by default (16, 2)
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about two std. deviations \
        above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75, "linewidth": 1.5, "bw": 0.4}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333", "alpha": 0.05, "linewidth": 4, \
        "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff", "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color":  "#111111", "weight": "normal", "size": \
        11}

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """

    # Validate Inputs
    _validate_input_range(fill_range[0], "fill_range_lower", 0, 1)
    _validate_input_range(fill_range[1], "fill_range_upper", 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], "fill_range")
    _validate_input_bool(showall, "showall")

    # Handle dictionary defaults
    kde_kws = {
        "alpha": 0.75,
        "linewidth": 1.5,
        "bw": 0.4
    } if kde_kws is None else kde_kws.copy()
    rug_kws = ({
        "color": "#ff3333",
        "alpha": 0.05,
        "linewidth": 4,
        "height": 0.075
    } if rug_kws is None else rug_kws.copy())
    fill_kws = {
        "color": "#80d4ff",
        "alpha": 0.2
    } if fill_kws is None else fill_kws.copy()
    font_kws = {
        "color": "#111111",
        "weight": "normal",
        "size": 11
    } if font_kws is None else font_kws.copy()

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    data = data.loc[:, data.nunique() > 2]
    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]
    data = data.loc[:, data.nunique() > 2]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return

    elif len(cols) >= 20 and showall is False:
        print(
            f"Note: The number of non binary numerical features is very large ({len(cols)}), please consider"
            " splitting the data. Showing plots for the first 20 numerical features. Override this by setting"
            " showall=True.")
        cols = cols[:20]

    for col in cols:
        num_dropped_vals = data[col].isna().sum()
        if num_dropped_vals > 0:
            col_data = data[col].dropna(axis=0)
            print(
                f"Dropped {num_dropped_vals} missing values from column {col}."
            )

        else:
            col_data = data[col]

        _, ax = plt.subplots(figsize=figsize)
        ax = sns.distplot(
            col_data,
            hist=False,
            rug=True,
            kde_kws=kde_kws,
            rug_kws=rug_kws,
        )

        # Vertical lines and fill
        x, y = ax.lines[0].get_xydata().T
        ax.fill_between(
            x,
            y,
            where=((x >= np.quantile(col_data, fill_range[0])) &
                   (x <= np.quantile(col_data, fill_range[1]))),
            label=f"{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_data)
        std = scipy.stats.tstd(col_data)
        ax.vlines(x=mean,
                  ymin=0,
                  ymax=np.interp(mean, x, y),
                  ls="dotted",
                  color=mean_color,
                  lw=2,
                  label="mean")
        ax.vlines(
            x=np.median(col_data),
            ymin=0,
            ymax=np.interp(np.median(col_data), x, y),
            ls=":",
            color=".3",
            label="median",
        )
        ax.vlines(
            x=[mean - std, mean + std],
            ymin=0,
            ymax=[np.interp(mean - std, x, y),
                  np.interp(mean + std, x, y)],
            ls=":",
            color=".5",
            label="\u03BC \u00B1 \u03C3",
        )

        ax.set_ylim(0)
        ax.set_xlim(ax.get_xlim()[0] * 1.15, ax.get_xlim()[1] * 1.15)

        # Annotations and legend
        ax.text(0.01,
                0.85,
                f"Mean: {mean:.2f}",
                fontdict=font_kws,
                transform=ax.transAxes)
        ax.text(0.01,
                0.7,
                f"Std. dev: {std:.2f}",
                fontdict=font_kws,
                transform=ax.transAxes)
        ax.text(
            0.01,
            0.55,
            f"Skew: {scipy.stats.skew(col_data):.2f}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.01,
            0.4,
            f"Kurtosis: {scipy.stats.kurtosis(col_data):.2f}",  # Excess Kurtosis
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(0.01,
                0.25,
                f"Count: {len(col_data)}",
                fontdict=font_kws,
                transform=ax.transAxes)
        ax.legend(loc="upper right")

    return ax
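A hypothetical plotting call for the dist_plot variant above. It assumes numpy, pandas, scipy, matplotlib.pyplot as plt and an older seaborn release in which sns.distplot is still available, plus the validation helpers:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
df = pd.DataFrame({
    "normal": rng.normal(loc=5, scale=2, size=1000),
    "skewed": rng.lognormal(mean=0, sigma=0.6, size=1000),
})

ax = dist_plot(df, fill_range=(0.05, 0.95), figsize=(12, 2))
plt.show()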
Example 7
def cat_plot(
    data: pd.DataFrame,
    figsize: Tuple = (18, 18),
    top: int = 3,
    bottom: int = 3,
    bar_color_top: str = "#5ab4ac",
    bar_color_bottom: str = "#d8b365",
):
    """ Two-dimensional visualization of the number and frequency of categorical features.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    figsize : Tuple, optional
        Use to control the figure size, by default (18, 18)
    top : int, optional
        Show the "top" most frequent values in a column, by default 3
    bottom : int, optional
        Show the "bottom" most frequent values in a column, by default 3
    bar_color_top : str, optional
        Use to control the color of the bars indicating the most common values, by default "#5ab4ac"
    bar_color_bottom : str, optional
        Use to control the color of the bars indicating the least common values, by default "#d8b365"

    Returns
    -------
    Gridspec
        gs: Figure with array of Axes objects
    """

    # Validate Inputs
    _validate_input_int(top, "top")
    _validate_input_int(bottom, "bottom")
    _validate_input_range(top, "top", 0, data.shape[1])
    _validate_input_range(bottom, "bottom", 0, data.shape[1])
    _validate_input_sum_larger(1, "top and bottom", top, bottom)

    data = pd.DataFrame(data).copy()
    cols = data.select_dtypes(exclude=["number"]).columns.tolist()
    data = data[cols]
    for col in data.columns:
        if data[col].dtype.name == "category" or data[
                col].dtype.name == "string":
            data[col] = data[col].astype("object")

    if len(cols) == 0:
        print("No columns with categorical data were detected.")
        return None

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=len(cols), wspace=0.21)

    for count, col in enumerate(cols):

        n_unique = data[col].nunique(dropna=True)
        value_counts = data[col].value_counts()
        lim_top, lim_bot = top, bottom

        if n_unique < top + bottom:
            lim_top = int(n_unique // 2)
            lim_bot = int(n_unique // 2) + 1

        if n_unique <= 2:
            lim_top = lim_bot = int(n_unique // 2)

        value_counts_top = value_counts[0:lim_top]
        value_counts_idx_top = value_counts_top.index.tolist()
        value_counts_bot = value_counts[-lim_bot:]
        value_counts_idx_bot = value_counts_bot.index.tolist()

        if top == 0:
            value_counts_top = value_counts_idx_top = []

        if bottom == 0:
            value_counts_bot = value_counts_idx_bot = []

        data.loc[data[col].isin(value_counts_idx_top), col] = 10
        data.loc[data[col].isin(value_counts_idx_bot), col] = 0
        data.loc[((data[col] != 10) & (data[col] != 0)), col] = 5
        data[col] = data[col].rolling(2, min_periods=1).mean()

        value_counts_idx_top = [elem[:20] for elem in value_counts_idx_top]
        value_counts_idx_bot = [elem[:20] for elem in value_counts_idx_bot]

        # Barcharts
        ax_top = fig.add_subplot(gs[:1, count:count + 1])
        ax_top.bar(value_counts_idx_top,
                   value_counts_top,
                   color=bar_color_top,
                   width=0.85)
        ax_top.bar(value_counts_idx_bot,
                   value_counts_bot,
                   color=bar_color_bottom,
                   width=0.85)
        ax_top.set(frame_on=False)
        ax_top.tick_params(axis="x", labelrotation=90)

        # Summary stats
        ax_bottom = fig.add_subplot(gs[1:2, count:count + 1])
        plt.subplots_adjust(hspace=0.075)
        ax_bottom.get_yaxis().set_visible(False)
        ax_bottom.get_xaxis().set_visible(False)
        ax_bottom.set(frame_on=False)
        ax_bottom.text(
            0,
            0,
            f"Unique values: {n_unique}\n\n"
            f"Top {lim_top} vals: {sum(value_counts_top)} ({sum(value_counts_top)/data.shape[0]*100:.1f}%)\n"
            f"Bot {lim_bot} vals: {sum(value_counts_bot)} ({sum(value_counts_bot)/data.shape[0]*100:.1f}%)",
            transform=ax_bottom.transAxes,
            color="#111111",
            fontsize=11,
        )

    # Heatmap
    color_bot_rgb = to_rgb(bar_color_bottom)
    color_white = to_rgb("#FFFFFF")
    color_top_rgb = to_rgb(bar_color_top)
    cat_plot_cmap = LinearSegmentedColormap.from_list(
        "cat_plot_cmap", [color_bot_rgb, color_white, color_top_rgb], N=200)
    ax_hm = fig.add_subplot(gs[2:, :])
    sns.heatmap(data,
                cmap=cat_plot_cmap,
                cbar=False,
                vmin=0,
                vmax=10,
                ax=ax_hm)
    ax_hm.set_yticks(np.round(ax_hm.get_yticks()[0::5], -1))
    ax_hm.set_yticklabels(ax_hm.get_yticks())
    ax_hm.set_xticklabels(ax_hm.get_xticklabels(),
                          horizontalalignment="center",
                          fontweight="light",
                          fontsize="medium")
    ax_hm.tick_params(length=1, colors="#111111")

    gs.figure.suptitle("Categorical data plot",
                       x=0.5,
                       y=0.91,
                       fontsize=18,
                       color="#111111")

    return gs
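A small, hypothetical cat_plot call; numpy, pandas, seaborn, matplotlib.pyplot as plt, to_rgb, LinearSegmentedColormap and the validation helpers are assumed to be imported:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

rng = np.random.default_rng(2)
df = pd.DataFrame({
    "color": rng.choice(["red", "green", "blue", "black", "white"], size=400),
    "size": rng.choice(["S", "M", "L", "XL"], size=400, p=[0.5, 0.3, 0.15, 0.05]),
})

gs = cat_plot(df, top=2, bottom=2, figsize=(10, 10))
plt.show()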
Example 8
def corr_plot(
    data: pd.DataFrame,
    split: Optional[str] = None,
    threshold: float = 0,
    target: Optional[Union[pd.Series, str]] = None,
    method: str = "pearson",
    cmap: str = "BrBG",
    figsize: Tuple = (12, 10),
    annot: bool = True,
    dev: bool = False,
    **kwargs,
):
    """ Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed {None, "pos", "neg", "high", "low"}, by default None
            * None: visualize all correlations between the feature-columns
            * pos: visualize all positive correlations between the feature-columns above the threshold
            * neg: visualize all negative correlations between the feature-columns below the threshold
            * high: visualize all correlations between the feature-columns for which abs(corr) > threshold \
                is True
            * low: visualize all correlations between the feature-columns for which abs(corr) < threshold \
                is True

    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless split = "high" \
        or split = "low", in which case default is 0.3
    target : Optional[Union[pd.Series, str]], optional
        Specify target for correlation. E.g. label column to generate only the correlations between each \
        feature and the label, by default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
            * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
            * spearman: ranked/ordinal correlation, measures monotonic relationships.
            * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more \
                expensive but more robust in smaller datasets than "spearman".

    cmap : str, optional
        The mapping from data values to color space, matplotlib colormap name or object, or list of colors, \
        by default "BrBG"
    figsize : Tuple, optional
        Use to control the figure size, by default (12, 10)
    annot : bool, optional
        Use to show or hide annotations, by default True
    dev : bool, optional
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed, \
        by default False

    Keyword Arguments : optional
        Additional elements to control the visualization of the plot, e.g.:

            * mask: bool, default True
                If set to False the entire correlation matrix, including the upper triangle is shown. Set \
                dev = False in this case to avoid overlap.
            * vmax: float, default is calculated from the given correlation coefficients.
                Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
            * vmin: float, default is calculated from the given correlation coefficients.
                Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
            * linewidths: float, default 0.5
                Controls the line-width between the squares.
            * annot_kws: dict, default {"size" : 10}
                Controls the font size of the annotations. Only available when annot = True.
            * cbar_kws: dict, default {"shrink": .95, "aspect": 30}
                Controls the size of the colorbar.
            * Many more kwargs are available, i.e. "alpha" to control blending, or options to adjust labels, \
                ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """

    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(annot, "annot")
    _validate_input_bool(dev, "dev")

    data = pd.DataFrame(data)

    corr = corr_mat(data,
                    split=split,
                    threshold=threshold,
                    target=target,
                    method=method,
                    colored=False)

    mask = np.zeros_like(corr, dtype=bool)

    if target is None:
        mask = np.triu(np.ones_like(corr, dtype=bool))

    vmax = np.round(np.nanmax(corr.where(~mask)) - 0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask)) + 0.05, 2)

    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap
    kwargs = {
        "mask": mask,
        "cmap": cmap,
        "annot": annot,
        "vmax": vmax,
        "vmin": vmin,
        "linewidths": 0.5,
        "annot_kws": {
            "size": 10
        },
        "cbar_kws": {
            "shrink": 0.95,
            "aspect": 30
        },
        **kwargs,
    }

    # Draw heatmap with mask and default settings
    sns.heatmap(corr, center=0, fmt=".2f", **kwargs)

    ax.set_title(f"Feature-correlation ({method})", fontdict={"fontsize": 18})

    # Settings
    if dev:
        fig.suptitle(
            f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - method: {method} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
            fontsize=12,
            color="gray",
            x=0.35,
            y=0.85,
            ha="left",
        )

    return ax
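A hypothetical corr_plot example; corr_mat, seaborn, matplotlib.pyplot as plt and numpy are assumed to be available:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

rng = np.random.default_rng(3)
x = rng.normal(size=300)
df = pd.DataFrame({
    "x": x,
    "y": 0.8 * x + rng.normal(scale=0.5, size=300),  # strongly correlated with x
    "z": rng.normal(size=300),                       # roughly uncorrelated
})

ax = corr_plot(df, method="spearman", annot=True)
plt.show()

# Only visualize strong relationships:
corr_plot(df, split="high", threshold=0.5)
plt.show()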
Example 9
def corr_mat(
    data: pd.DataFrame,
    split: Optional[str] = None,  # Optional[Literal['pos', 'neg', 'high', 'low']]
    threshold: float = 0,
    target: Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]] = None,
    method: str = "pearson",  # Literal['pearson', 'spearman', 'kendall']
    colored: bool = True,
) -> Union[pd.DataFrame, Any]:
    """ Returns a color-encoded correlation matrix.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed, by default None
        {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless split = "high" \
        or split = "low", in which case default is 0.3
    target : Optional[Union[pd.DataFrame, str]], optional
        Specify target for correlation. E.g. label column to generate only the correlations between each \
        feature and the label, by default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more \
            expensive but more robust in smaller datasets than "spearman".
    colored : bool, optional
        If True the negative values in the correlation matrix are colored in red, by default True

    Returns
    -------
    Union[pd.DataFrame, pd.Styler]
        If colored = True - corr: Pandas Styler object
        If colored = False - corr: Pandas DataFrame
    """

    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(colored, "colored")

    def color_negative_red(val):
        color = "#FF3344" if val < 0 else None
        return "color: %s" % color

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        target_data = []
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)

        elif isinstance(target, (list, pd.Series, np.ndarray)):
            target_data = pd.Series(target)
            target = target_data.name

        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]

    else:
        corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if colored:
        return corr.style.applymap(color_negative_red).format("{:.2f}",
                                                              na_rep="-")
    else:
        return corr
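A short corr_mat usage sketch on hypothetical data; _corr_selector and the validators are assumed to exist in this module:

import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
x = rng.normal(size=200)
df = pd.DataFrame({
    "x": x,
    "y": -x + rng.normal(scale=0.3, size=200),
    "noise": rng.normal(size=200),
})

# Colored Styler object, e.g. for notebook display:
styled = corr_mat(df)

# Plain DataFrame of correlations with a single target column:
print(corr_mat(df, target="y", colored=False))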
Example 10
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    size: int = 2.5,
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Dict[str, Any] = None,
    rug_kws: Dict[str, Any] = None,
    fill_kws: Dict[str, Any] = None,
    font_kws: Dict[str, Any] = None,
):
    """ Two-dimensional visualization of the distribution of non binary numerical features.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame \
        is provided, the index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default "orange"
    size : int, optional
        Controls the plot size, by default 2.5
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about \
        two std. deviations above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75, \
        "linewidth": 1.5, "bw_adjust": 0.8}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333", \
        "alpha": 0.15, "lw": 3, "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff", \
        "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color":  "#111111", \
        "weight": "normal", "size": 11}

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """

    # Validate Inputs
    _validate_input_range(fill_range[0], "fill_range_lower", 0, 1)
    _validate_input_range(fill_range[1], "fill_range_upper", 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], "fill_range")
    _validate_input_bool(showall, "showall")

    # Handle dictionary defaults
    kde_kws = ({
        "alpha": 0.75,
        "linewidth": 1.5,
        "bw_adjust": 0.8
    } if kde_kws is None else kde_kws.copy())
    rug_kws = ({
        "color": "#ff3333",
        "alpha": 0.15,
        "lw": 3,
        "height": 0.075
    } if rug_kws is None else rug_kws.copy())
    fill_kws = ({
        "color": "#80d4ff",
        "alpha": 0.2
    } if fill_kws is None else fill_kws.copy())
    font_kws = ({
        "color": "#111111",
        "weight": "normal",
        "size": 11
    } if font_kws is None else font_kws.copy())

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    df = data.copy()
    data = data.loc[:, data.nunique() > 2]
    if data.shape[0] > 10000:
        data = data.sample(n=10000, random_state=408)
        print(
            "Large dataset detected, using 10000 random samples for the plots. Summary"
            " statistics are still based on the entire dataset.")
    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return None

    if len(cols) >= 20 and showall is False:
        print(
            "Note: The number of non binary numerical features is very large "
            f"({len(cols)}), please consider splitting the data. Showing plots for "
            "the first 20 numerical features. Override this by setting showall=True."
        )
        cols = cols[:20]

    g = None
    for col in cols:
        col_data = data[col].dropna(axis=0)
        col_df = df[col].dropna(axis=0)

        g = sns.displot(
            col_data,
            kind="kde",
            rug=True,
            height=size,
            aspect=5,
            legend=False,
            rug_kws=rug_kws,
            **kde_kws,
        )

        # Vertical lines and fill
        x, y = g.axes[0, 0].lines[0].get_xydata().T
        g.axes[0, 0].fill_between(
            x,
            y,
            where=((x >= np.quantile(col_df, fill_range[0]))
                   & (x <= np.quantile(col_df, fill_range[1]))),
            label=f"{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_df)
        std = scipy.stats.tstd(col_df)
        g.axes[0, 0].vlines(
            x=mean,
            ymin=0,
            ymax=np.interp(mean, x, y),
            ls="dotted",
            color=mean_color,
            lw=2,
            label="mean",
        )
        g.axes[0, 0].vlines(
            x=np.median(col_df),
            ymin=0,
            ymax=np.interp(np.median(col_df), x, y),
            ls=":",
            color=".3",
            label="median",
        )
        g.axes[0, 0].vlines(
            x=[mean - std, mean + std],
            ymin=0,
            ymax=[np.interp(mean - std, x, y),
                  np.interp(mean + std, x, y)],
            ls=":",
            color=".5",
            label="\u03BC \u00B1 \u03C3",
        )

        g.axes[0, 0].set_ylim(0)
        g.axes[0, 0].set_xlim(
            g.axes[0, 0].get_xlim()[0] - g.axes[0, 0].get_xlim()[1] * 0.05,
            g.axes[0, 0].get_xlim()[1] * 1.03,
        )

        # Annotations and legend
        g.axes[0, 0].text(
            0.005,
            0.9,
            f"Mean: {mean:.2f}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.7,
            f"Std. dev: {std:.2f}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.5,
            f"Skew: {scipy.stats.skew(col_df):.2f}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.3,
            f"Kurtosis: {scipy.stats.kurtosis(col_df):.2f}",  # Excess Kurtosis
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.1,
            f"Count: {len(col_df)}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].legend(loc="upper right")

    return g.axes[0, 0]
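This newer dist_plot variant draws with seaborn's displot and scales via a "size" parameter instead of "figsize"; a brief, hypothetical call (imports as for the variant above, with a seaborn release that provides sns.displot):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

rng = np.random.default_rng(5)
df = pd.DataFrame({"duration": rng.exponential(scale=3.0, size=2000)})

ax = dist_plot(df, size=2.5, fill_range=(0.025, 0.975))
plt.show()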
Example 11
def train_dev_test_split(data,
                         target,
                         dev_size=0.1,
                         test_size=0.1,
                         stratify=None,
                         random_state=408):
    """
    Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame \
    is provided, the index/column information is preserved in the resulting splits.

    target: string, list, np.array or pd.Series, default None
        Specify the label. If a column name is provided, that column is split off \
        from "data"; a list, np.array or pd.Series is used as the label directly.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the \
        dataset to include in the dev split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the \
        dataset to include in the test split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the \
        class labels.

    random_state: integer, default 408
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing train-dev-test split of inputs.
    """

    # Validate Inputs
    _validate_input_range(dev_size, "dev_size", 0, 1)
    _validate_input_range(test_size, "test_size", 0, 1)
    _validate_input_int(random_state, "random_state")
    _validate_input_sum_smaller(1, "Dev and test", dev_size, test_size)

    target_data = []
    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)

    X_train, X_dev_test, y_train, y_dev_test = train_test_split(
        data,
        target_data,
        test_size=dev_size + test_size,
        random_state=random_state,
        stratify=stratify,
    )

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    else:
        X_dev, X_test, y_dev, y_test = train_test_split(
            X_dev_test,
            y_dev_test,
            test_size=test_size / (dev_size + test_size),
            random_state=random_state,
            stratify=y_dev_test,
        )
        return X_train, X_dev, X_test, y_train, y_dev, y_test
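A minimal, hypothetical split using train_dev_test_split; sklearn's train_test_split and the validation helpers are assumed to be imported:

import numpy as np
import pandas as pd

rng = np.random.default_rng(6)
df = pd.DataFrame({
    "f1": rng.normal(size=1000),
    "f2": rng.integers(0, 5, size=1000),
    "label": rng.integers(0, 2, size=1000),
})

X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    df, target="label", dev_size=0.1, test_size=0.1, stratify=df["label"]
)
print(len(X_train), len(X_dev), len(X_test))  # 800, 100, 100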