def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """ Drops completely empty columns and rows by default and optionally provides \
        flexibility to loosen restrictions to drop additional non-empty columns and \
        rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with an NA-ratio above the specified threshold, by default 1
    drop_threshold_rows : float, optional
        Drop rows with an NA-ratio above the specified threshold, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do \
        not affect the drop thresholds, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    # Drop columns first. The NA-ratio is computed on the frame without the excluded
    # columns so the boolean mask aligns with "data_dropped".
    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[
            :, _missing_vals(data_dropped)["mv_cols_ratio"] > drop_threshold_cols
        ].columns
    ).dropna(axis=1, how="all")

    # Re-attach the excluded columns, then drop rows above the row threshold
    data = pd.concat([data_dropped, data_exclude], axis=1)

    data_cleaned = data.drop(
        index=data.loc[
            _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
        ].index
    ).dropna(axis=0, how="all")
    return data_cleaned
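
# Usage sketch (illustrative, not part of the library): the frame below is made up
# for demonstration.
#
#   df = pd.DataFrame({"a": [1, 2, None], "b": [None, None, None]})
#   drop_missing(df)  # removes the empty column "b" and the row left without values
#   drop_missing(df, drop_threshold_rows=0.5)  # also removes rows with an NA-ratio above 0.5
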
def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """ Converts columns to the best possible dtypes using dtypes supporting pd.NA.
    Temporarily not converting to integers due to an issue in pandas. This is expected \
        to be fixed in pandas 1.1. See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold \
        using cat_threshold or exclude columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted Datatypes
    """

    # Validate Inputs
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")

        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data
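
# Usage sketch (illustrative): low-cardinality "object" columns become "category",
# the remaining dtypes are converted to their pd.NA-aware counterparts.
#
#   df = pd.DataFrame({"flag": [True, False, None], "city": ["Berlin", "Berlin", "Paris"]})
#   convert_datatypes(df).dtypes  # "flag" -> boolean; "city" -> string, since its
#                                 # unique-value ratio is above cat_threshold here
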
def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Checks for duplicates in subsets of columns and pools them. This can reduce \
        the number of columns in the data without losing much information. Suitable \
        columns are combined to subsets and tested for duplicates. In case sufficient \
        duplicates can be found, the respective columns are aggregated into a \
        "pooled_var" column. Identical numbers in the "pooled_var" column indicate \
        identical information in the respective rows.

    Note: It is advised to exclude features that provide sufficient informational \
        content by themselves as well as the target column by using the "exclude" \
        setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
        considered in the further analysis. Columns with a lower ratio are not \
        considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate threshold higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with continuously smaller subsets until "min_col_pool" is reached, \
        by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible to suitable subsets and stops when "min_col_pool" is \
        reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are \
        passed through without modification, by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """

    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Columns with a sufficient ratio of duplicated values are pooling candidates
        check_list = [
            col
            for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Duplicate-ratio of each candidate subset of the current size
        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)
        ]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i),
                max_idx,
                max_idx + 1,
            )

            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            # Replace the subset columns by a single "pooled_vars" identifier
            unique_subset = (
                best_subset.drop_duplicates()
                .reset_index()
                .rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(
                unique_subset, how="left", on=best_subset.columns.tolist()
            ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data
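
# Usage sketch (illustrative): pool redundant low-cardinality columns into a single
# "pooled_vars" column; "target" below is a hypothetical label column.
#
#   pooled, subset_cols = pool_duplicate_subsets(
#       df, col_dupl_thresh=0.2, subset_thresh=0.2, exclude=["target"], return_details=True
#   )
#   # subset_cols lists the columns that were replaced by "pooled_vars"
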
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Converts columns with a high ratio of missing values into binary features and \
        eventually drops them based on their correlation with other features and the \
        target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with a high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
        identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. I.e. label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is overstepped, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met, the feature \
        is ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # Binarize the high-mv columns: 1 where a value is present, 0 where it is missing
    data_local[cols_mv] = (
        data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)
    )

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
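
# Usage sketch (illustrative): "label" is a hypothetical target column.
#
#   cleaned, cols_mv, drop_cols = mv_col_handling(
#       df, target="label", mv_threshold=0.1, return_details=True
#   )
#   # cols_mv:   columns whose NA-ratio exceeded mv_threshold
#   # drop_cols: the subset that correlated highly with other features
#   #            but not with the target
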
def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    clean_col_names: bool = True,
    show: str = "changes",
) -> pd.DataFrame:
    """ Perform initial data cleaning tasks on a dataset, such as dropping single \
        valued and empty rows, empty columns as well as optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with an NA-ratio above the specified threshold, by default 0.9
    drop_threshold_rows : float, optional
        Drop rows with an NA-ratio above the specified threshold, by default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the \
        dropping of missing values, by default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of "object" columns to "category". Set threshold using \
        cat_threshold. Requires convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.03
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None
    clean_col_names : bool, optional
        Cleans the column names and provides hints on duplicate and long names, by \
        default True
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:
            * "all": Print information about the data before and after cleaning as \
            well as information about changes and memory usage (deep). Please be \
            aware, that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the \
        same categories.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_dtypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(
        data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
    )

    if clean_col_names:
        data_cleaned = clean_column_names(data_cleaned)

    single_val_cols = data_cleaned.columns[
        data_cleaned.nunique(dropna=False) == 1
    ].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None
    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)

    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    figsize: Tuple = (16, 2),
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Dict[str, Any] = None,
    rug_kws: Dict[str, Any] = None,
    fill_kws: Dict[str, Any] = None,
    font_kws: Dict[str, Any] = None,
):
    """ Two-dimensional visualization of the distribution of non binary numerical features.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default "orange"
    figsize : Tuple, optional
        Controls the figure size, by default (16, 2)
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about two std. deviations \
        above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75, "linewidth": 1.5, "bw": 0.4}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333", "alpha": 0.05, "linewidth": 4, \
        "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff", "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color": "#111111", "weight": "normal", "size": \
        11}

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """

    # Validate Inputs
    _validate_input_range(fill_range[0], "fill_range_lower", 0, 1)
    _validate_input_range(fill_range[1], "fill_range_upper", 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], "fill_range")
    _validate_input_bool(showall, "showall")

    # Handle dictionary defaults
    kde_kws = (
        {"alpha": 0.75, "linewidth": 1.5, "bw": 0.4}
        if kde_kws is None
        else kde_kws.copy()
    )
    rug_kws = (
        {"color": "#ff3333", "alpha": 0.05, "linewidth": 4, "height": 0.075}
        if rug_kws is None
        else rug_kws.copy()
    )
    fill_kws = (
        {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
    )
    font_kws = (
        {"color": "#111111", "weight": "normal", "size": 11}
        if font_kws is None
        else font_kws.copy()
    )

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    data = data.loc[:, data.nunique() > 2]
    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return

    if len(cols) >= 20 and showall is False:
        print(
            f"Note: The number of non binary numerical features is very large ({len(cols)}), please consider"
            " splitting the data. Showing plots for the first 20 numerical features. Override this by setting"
            " showall=True."
        )
        cols = cols[:20]

    for col in cols:
        num_dropped_vals = data[col].isna().sum()
        if num_dropped_vals > 0:
            col_data = data[col].dropna(axis=0)
            print(f"Dropped {num_dropped_vals} missing values from column {col}.")
        else:
            col_data = data[col]

        _, ax = plt.subplots(figsize=figsize)
        ax = sns.distplot(
            col_data, hist=False, rug=True, kde_kws=kde_kws, rug_kws=rug_kws
        )

        # Vertical lines and fill
        x, y = ax.lines[0].get_xydata().T
        ax.fill_between(
            x,
            y,
            where=(
                (x >= np.quantile(col_data, fill_range[0]))
                & (x <= np.quantile(col_data, fill_range[1]))
            ),
            label=f"{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_data)
        std = scipy.stats.tstd(col_data)
        ax.vlines(
            x=mean,
            ymin=0,
            ymax=np.interp(mean, x, y),
            ls="dotted",
            color=mean_color,
            lw=2,
            label="mean",
        )
        ax.vlines(
            x=np.median(col_data),
            ymin=0,
            ymax=np.interp(np.median(col_data), x, y),
            ls=":",
            color=".3",
            label="median",
        )
        ax.vlines(
            x=[mean - std, mean + std],
            ymin=0,
            ymax=[np.interp(mean - std, x, y), np.interp(mean + std, x, y)],
            ls=":",
            color=".5",
            label="\u03BC \u00B1 \u03C3",
        )

        ax.set_ylim(0)
        ax.set_xlim(ax.get_xlim()[0] * 1.15, ax.get_xlim()[1] * 1.15)

        # Annotations and legend
        ax.text(
            0.01, 0.85, f"Mean: {mean:.2f}", fontdict=font_kws, transform=ax.transAxes
        )
        ax.text(
            0.01, 0.7, f"Std. dev: {std:.2f}", fontdict=font_kws, transform=ax.transAxes
        )
        ax.text(
            0.01,
            0.55,
            f"Skew: {scipy.stats.skew(col_data):.2f}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.01,
            0.4,
            f"Kurtosis: {scipy.stats.kurtosis(col_data):.2f}",  # Excess Kurtosis
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.text(
            0.01,
            0.25,
            f"Count: {len(col_data)}",
            fontdict=font_kws,
            transform=ax.transAxes,
        )
        ax.legend(loc="upper right")

    return ax
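
# Usage sketch (illustrative): one figure per non-binary numeric column; the returned
# Axes belongs to the last plot and can be tweaked further. Requires a seaborn version
# that still ships sns.distplot().
#
#   ax = dist_plot(df, fill_range=(0.05, 0.95))
#   ax.set_xlabel("value")
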
def cat_plot(
    data: pd.DataFrame,
    figsize: Tuple = (18, 18),
    top: int = 3,
    bottom: int = 3,
    bar_color_top: str = "#5ab4ac",
    bar_color_bottom: str = "#d8b365",
):
    """ Two-dimensional visualization of the number and frequency of categorical features.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    figsize : Tuple, optional
        Use to control the figure size, by default (18, 18)
    top : int, optional
        Show the "top" most frequent values in a column, by default 3
    bottom : int, optional
        Show the "bottom" most frequent values in a column, by default 3
    bar_color_top : str, optional
        Use to control the color of the bars indicating the most common values, by default "#5ab4ac"
    bar_color_bottom : str, optional
        Use to control the color of the bars indicating the least common values, by default "#d8b365"

    Returns
    -------
    Gridspec
        gs: Figure with array of Axes objects
    """

    # Validate Inputs
    _validate_input_int(top, "top")
    _validate_input_int(bottom, "bottom")
    _validate_input_range(top, "top", 0, data.shape[1])
    _validate_input_range(bottom, "bottom", 0, data.shape[1])
    _validate_input_sum_larger(1, "top and bottom", top, bottom)

    data = pd.DataFrame(data).copy()
    cols = data.select_dtypes(exclude=["number"]).columns.tolist()
    data = data[cols]

    for col in data.columns:
        if data[col].dtype.name in ("category", "string"):
            data[col] = data[col].astype("object")

    if len(cols) == 0:
        print("No columns with categorical data were detected.")
        return None

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=len(cols), wspace=0.21)

    for count, col in enumerate(cols):
        n_unique = data[col].nunique(dropna=True)
        value_counts = data[col].value_counts()
        lim_top, lim_bot = top, bottom

        if n_unique < top + bottom:
            lim_top = int(n_unique // 2)
            lim_bot = int(n_unique // 2) + 1

        if n_unique <= 2:
            lim_top = lim_bot = int(n_unique // 2)

        value_counts_top = value_counts[0:lim_top]
        value_counts_idx_top = value_counts_top.index.tolist()
        value_counts_bot = value_counts[-lim_bot:]
        value_counts_idx_bot = value_counts_bot.index.tolist()

        if top == 0:
            value_counts_top = value_counts_idx_top = []
        if bottom == 0:
            value_counts_bot = value_counts_idx_bot = []

        # Encode the column for the heatmap: 10 = top value, 0 = bottom value, 5 = rest
        data.loc[data[col].isin(value_counts_idx_top), col] = 10
        data.loc[data[col].isin(value_counts_idx_bot), col] = 0
        data.loc[((data[col] != 10) & (data[col] != 0)), col] = 5
        data[col] = data[col].rolling(2, min_periods=1).mean()

        value_counts_idx_top = [elem[:20] for elem in value_counts_idx_top]
        value_counts_idx_bot = [elem[:20] for elem in value_counts_idx_bot]

        # Barcharts
        ax_top = fig.add_subplot(gs[:1, count : count + 1])
        ax_top.bar(
            value_counts_idx_top, value_counts_top, color=bar_color_top, width=0.85
        )
        ax_top.bar(
            value_counts_idx_bot, value_counts_bot, color=bar_color_bottom, width=0.85
        )
        ax_top.set(frame_on=False)
        ax_top.tick_params(axis="x", labelrotation=90)

        # Summary stats
        ax_bottom = fig.add_subplot(gs[1:2, count : count + 1])
        plt.subplots_adjust(hspace=0.075)
        ax_bottom.get_yaxis().set_visible(False)
        ax_bottom.get_xaxis().set_visible(False)
        ax_bottom.set(frame_on=False)
        ax_bottom.text(
            0,
            0,
            f"Unique values: {n_unique}\n\n"
            f"Top {lim_top} vals: {sum(value_counts_top)} ({sum(value_counts_top)/data.shape[0]*100:.1f}%)\n"
            f"Bot {lim_bot} vals: {sum(value_counts_bot)} ({sum(value_counts_bot)/data.shape[0]*100:.1f}%)",
            transform=ax_bottom.transAxes,
            color="#111111",
            fontsize=11,
        )

    # Heatmap
    color_bot_rgb = to_rgb(bar_color_bottom)
    color_white = to_rgb("#FFFFFF")
    color_top_rgb = to_rgb(bar_color_top)
    cat_plot_cmap = LinearSegmentedColormap.from_list(
        "cat_plot_cmap", [color_bot_rgb, color_white, color_top_rgb], N=200
    )
    ax_hm = fig.add_subplot(gs[2:, :])
    sns.heatmap(data, cmap=cat_plot_cmap, cbar=False, vmin=0, vmax=10, ax=ax_hm)
    ax_hm.set_yticks(np.round(ax_hm.get_yticks()[0::5], -1))
    ax_hm.set_yticklabels(ax_hm.get_yticks())
    ax_hm.set_xticklabels(
        ax_hm.get_xticklabels(),
        horizontalalignment="center",
        fontweight="light",
        fontsize="medium",
    )
    ax_hm.tick_params(length=1, colors="#111111")
    gs.figure.suptitle(
        "Categorical data plot", x=0.5, y=0.91, fontsize=18, color="#111111"
    )

    return gs
def corr_plot(
    data: pd.DataFrame,
    split: Optional[str] = None,
    threshold: float = 0,
    target: Optional[Union[pd.Series, str]] = None,
    method: str = "pearson",
    cmap: str = "BrBG",
    figsize: Tuple = (12, 10),
    annot: bool = True,
    dev: bool = False,
    **kwargs,
):
    """ Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed {None, "pos", "neg", "high", "low"}, by default None
            * None: visualize all correlations between the feature-columns
            * pos: visualize all positive correlations between the feature-columns above the threshold
            * neg: visualize all negative correlations between the feature-columns below the threshold
            * high: visualize all correlations between the feature-columns for which abs(corr) > threshold \
            is True
            * low: visualize all correlations between the feature-columns for which abs(corr) < threshold \
            is True
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless split = "high" \
        or split = "low", in which case default is 0.3
    target : Optional[Union[pd.Series, str]], optional
        Specify target for correlation. E.g. label column to generate only the correlations between each \
        feature and the label, by default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
            * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
            * spearman: ranked/ordinal correlation, measures monotonic relationships.
            * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more \
            expensive but more robust in smaller datasets than "spearman".
    cmap : str, optional
        The mapping from data values to color space, matplotlib colormap name or object, or list of colors, \
        by default "BrBG"
    figsize : Tuple, optional
        Use to control the figure size, by default (12, 10)
    annot : bool, optional
        Use to show or hide annotations, by default True
    dev : bool, optional
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed, \
        by default False

    Keyword Arguments : optional
        Additional elements to control the visualization of the plot, e.g.:
            * mask: bool, default True
                If set to False the entire correlation matrix, including the upper triangle, is shown. Set \
                dev = False in this case to avoid overlap.
            * vmax: float, default is calculated from the given correlation coefficients.
                Value between -1 and 1 (with vmin <= vmax), limits the range of the colorbar.
            * vmin: float, default is calculated from the given correlation coefficients.
                Value between -1 and 1 (with vmin <= vmax), limits the range of the colorbar.
            * linewidths: float, default 0.5
                Controls the line-width in between the squares.
            * annot_kws: dict, default {"size" : 10}
                Controls the font size of the annotations. Only available when annot = True.
            * cbar_kws: dict, default {"shrink": .95, "aspect": 30}
                Controls the size of the colorbar.
            * Many more kwargs are available, i.e. "alpha" to control blending, or options to adjust labels, \
            ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """

    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(annot, "annot")
    _validate_input_bool(dev, "dev")

    data = pd.DataFrame(data)

    corr = corr_mat(
        data,
        split=split,
        threshold=threshold,
        target=target,
        method=method,
        colored=False,
    )

    mask = np.zeros_like(corr, dtype=bool)

    if target is None:
        mask = np.triu(np.ones_like(corr, dtype=bool))

    vmax = np.round(np.nanmax(corr.where(~mask)) - 0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask)) + 0.05, 2)

    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap
    kwargs = {
        "mask": mask,
        "cmap": cmap,
        "annot": annot,
        "vmax": vmax,
        "vmin": vmin,
        "linewidths": 0.5,
        "annot_kws": {"size": 10},
        "cbar_kws": {"shrink": 0.95, "aspect": 30},
        **kwargs,
    }

    # Draw heatmap with mask and default settings
    sns.heatmap(corr, center=0, fmt=".2f", **kwargs)
    ax.set_title(f"Feature-correlation ({method})", fontdict={"fontsize": 18})

    # Settings
    if dev:
        fig.suptitle(
            f"Settings (dev-mode): \n"
            f"- split-mode: {split} \n"
            f"- threshold: {threshold} \n"
            f"- method: {method} \n"
            f"- annotations: {annot} \n"
            f"- cbar: \n"
            f"- vmax: {vmax} \n"
            f"- vmin: {vmin} \n"
            f"- linewidths: {kwargs['linewidths']} \n"
            f"- annot_kws: {kwargs['annot_kws']} \n"
            f"- cbar_kws: {kwargs['cbar_kws']}",
            fontsize=12,
            color="gray",
            x=0.35,
            y=0.85,
            ha="left",
        )

    return ax
def corr_mat(
    data: pd.DataFrame,
    split: Optional[str] = None,  # Optional[Literal["pos", "neg", "high", "low"]]
    threshold: float = 0,
    target: Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]] = None,
    method: str = "pearson",  # Literal["pearson", "spearman", "kendall"]
    colored: bool = True,
) -> Union[pd.DataFrame, Any]:
    """ Returns a color-encoded correlation matrix.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed, by default None
        {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless split = "high" \
        or split = "low", in which case default is 0.3
    target : Optional[Union[pd.DataFrame, str]], optional
        Specify target for correlation. E.g. label column to generate only the correlations between each \
        feature and the label, by default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
            * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
            * spearman: ranked/ordinal correlation, measures monotonic relationships.
            * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more \
            expensive but more robust in smaller datasets than "spearman"
    colored : bool, optional
        If True the negative values in the correlation matrix are colored in red, by default True

    Returns
    -------
    Union[pd.DataFrame, pd.Styler]
        If colored = True - corr: Pandas Styler object
        If colored = False - corr: Pandas DataFrame
    """

    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(colored, "colored")

    def color_negative_red(val):
        color = "#FF3344" if val < 0 else None
        return "color: %s" % color

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        target_data = []
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)

        elif isinstance(target, (list, pd.Series, np.ndarray)):
            target_data = pd.Series(target)
            target = target_data.name

        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]

    else:
        corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if colored:
        return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep="-")
    return corr
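
# Usage sketch (illustrative): "label" is a hypothetical target column.
#
#   corr_mat(df, colored=False)                      # plain DataFrame of pairwise correlations
#   corr_mat(df, target="label", method="spearman")  # per-feature correlation with the label,
#                                                    # returned as a color-encoded Styler
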
def dist_plot(
    data: pd.DataFrame,
    mean_color: str = "orange",
    size: float = 2.5,
    fill_range: Tuple = (0.025, 0.975),
    showall: bool = False,
    kde_kws: Dict[str, Any] = None,
    rug_kws: Dict[str, Any] = None,
    fill_kws: Dict[str, Any] = None,
    font_kws: Dict[str, Any] = None,
):
    """ Two-dimensional visualization of the distribution of non binary numerical features.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame \
        is provided, the index/column information is used to label the plots
    mean_color : str, optional
        Color of the vertical line indicating the mean of the data, by default "orange"
    size : float, optional
        Controls the plot size, by default 2.5
    fill_range : Tuple, optional
        Set the quantiles for shading. Default spans 95% of the data, which is about \
        two std. deviations above and below the mean, by default (0.025, 0.975)
    showall : bool, optional
        Set to True to remove the output limit of 20 plots, by default False
    kde_kws : Dict[str, Any], optional
        Keyword arguments for kdeplot(), by default {"alpha": 0.75, \
        "linewidth": 1.5, "bw_adjust": 0.8}
    rug_kws : Dict[str, Any], optional
        Keyword arguments for rugplot(), by default {"color": "#ff3333", \
        "alpha": 0.15, "lw": 3, "height": 0.075}
    fill_kws : Dict[str, Any], optional
        Keyword arguments to control the fill, by default {"color": "#80d4ff", \
        "alpha": 0.2}
    font_kws : Dict[str, Any], optional
        Keyword arguments to control the font, by default {"color": "#111111", \
        "weight": "normal", "size": 11}

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    """

    # Validate Inputs
    _validate_input_range(fill_range[0], "fill_range_lower", 0, 1)
    _validate_input_range(fill_range[1], "fill_range_upper", 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], "fill_range")
    _validate_input_bool(showall, "showall")

    # Handle dictionary defaults
    kde_kws = (
        {"alpha": 0.75, "linewidth": 1.5, "bw_adjust": 0.8}
        if kde_kws is None
        else kde_kws.copy()
    )
    rug_kws = (
        {"color": "#ff3333", "alpha": 0.15, "lw": 3, "height": 0.075}
        if rug_kws is None
        else rug_kws.copy()
    )
    fill_kws = (
        {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
    )
    font_kws = (
        {"color": "#111111", "weight": "normal", "size": 11}
        if font_kws is None
        else font_kws.copy()
    )

    data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
    df = data.copy()
    data = data.loc[:, data.nunique() > 2]

    if data.shape[0] > 10000:
        data = data.sample(n=10000, random_state=408)
        print(
            "Large dataset detected, using 10000 random samples for the plots. Summary"
            " statistics are still based on the entire dataset."
        )

    cols = list(data.select_dtypes(include=["number"]).columns)
    data = data[cols]

    if len(cols) == 0:
        print("No columns with numeric data were detected.")
        return None

    if len(cols) >= 20 and showall is False:
        print(
            "Note: The number of non binary numerical features is very large "
            f"({len(cols)}), please consider splitting the data. Showing plots for "
            "the first 20 numerical features. Override this by setting showall=True."
        )
        cols = cols[:20]

    g = None
    for col in cols:
        col_data = data[col].dropna(axis=0)
        col_df = df[col].dropna(axis=0)

        g = sns.displot(
            col_data,
            kind="kde",
            rug=True,
            height=size,
            aspect=5,
            legend=False,
            rug_kws=rug_kws,
            **kde_kws,
        )

        # Vertical lines and fill
        x, y = g.axes[0, 0].lines[0].get_xydata().T
        g.axes[0, 0].fill_between(
            x,
            y,
            where=(
                (x >= np.quantile(col_df, fill_range[0]))
                & (x <= np.quantile(col_df, fill_range[1]))
            ),
            label=f"{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%",
            **fill_kws,
        )

        mean = np.mean(col_df)
        std = scipy.stats.tstd(col_df)
        g.axes[0, 0].vlines(
            x=mean,
            ymin=0,
            ymax=np.interp(mean, x, y),
            ls="dotted",
            color=mean_color,
            lw=2,
            label="mean",
        )
        g.axes[0, 0].vlines(
            x=np.median(col_df),
            ymin=0,
            ymax=np.interp(np.median(col_df), x, y),
            ls=":",
            color=".3",
            label="median",
        )
        g.axes[0, 0].vlines(
            x=[mean - std, mean + std],
            ymin=0,
            ymax=[np.interp(mean - std, x, y), np.interp(mean + std, x, y)],
            ls=":",
            color=".5",
            label="\u03BC \u00B1 \u03C3",
        )

        g.axes[0, 0].set_ylim(0)
        g.axes[0, 0].set_xlim(
            g.axes[0, 0].get_xlim()[0] - g.axes[0, 0].get_xlim()[1] * 0.05,
            g.axes[0, 0].get_xlim()[1] * 1.03,
        )

        # Annotations and legend
        g.axes[0, 0].text(
            0.005,
            0.9,
            f"Mean: {mean:.2f}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.7,
            f"Std. dev: {std:.2f}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.5,
            f"Skew: {scipy.stats.skew(col_df):.2f}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.3,
            f"Kurtosis: {scipy.stats.kurtosis(col_df):.2f}",  # Excess Kurtosis
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].text(
            0.005,
            0.1,
            f"Count: {len(col_df)}",
            fontdict=font_kws,
            transform=g.axes[0, 0].transAxes,
        )
        g.axes[0, 0].legend(loc="upper right")

    return g.axes[0, 0]
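
# Usage sketch (illustrative): the displot-based variant samples large datasets for
# plotting while keeping the summary statistics based on the full data.
#
#   ax = dist_plot(df, size=3, fill_range=(0.025, 0.975))
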
def train_dev_test_split(
    data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408
):
    """ Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.
    target: string, list, np.array or pd.Series
        Name of the label column in "data" or the label values themselves. The \
        labels are split off and returned separately from the features.
    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the \
        dataset to include in the dev split.
    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the \
        dataset to include in the test split.
    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the \
        class labels.
    random_state: integer, default 408
        Seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing train-dev-test split of inputs.
    """

    # Validate Inputs
    _validate_input_range(dev_size, "dev_size", 0, 1)
    _validate_input_range(test_size, "test_size", 0, 1)
    _validate_input_int(random_state, "random_state")
    _validate_input_sum_smaller(1, "Dev and test", dev_size, test_size)

    target_data = []
    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)

    X_train, X_dev_test, y_train, y_dev_test = train_test_split(
        data,
        target_data,
        test_size=dev_size + test_size,
        random_state=random_state,
        stratify=stratify,
    )

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    X_dev, X_test, y_dev, y_test = train_test_split(
        X_dev_test,
        y_dev_test,
        test_size=test_size / (dev_size + test_size),
        random_state=random_state,
        stratify=y_dev_test,
    )
    return X_train, X_dev, X_test, y_train, y_dev, y_test
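
# Usage sketch (illustrative): "label" is a hypothetical target column.
#
#   X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
#       df, target="label", dev_size=0.1, test_size=0.1, stratify=df["label"]
#   )
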