def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Drops completely empty columns and rows by default and optionally provides \
    flexibility to loosen restrictions to drop additional non-empty columns and \
    rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with an NA-ratio above the specified threshold. Completely \
    empty columns are always dropped, by default 1
    drop_threshold_rows : float, optional
        Drop rows with an NA-ratio above the specified threshold. Completely empty \
    rows are always dropped, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. Excluded columns are \
    never removed by the column step, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Raises
    ------
    KeyError
        If 'col_exclude' names a column that is not present in the data.

    Notes
    -----
    Columns are dropped first. The excluded columns are re-appended after the \
    remaining columns, so they end up last in the column order, and they are part \
    of the frame on which the row NA-ratios are evaluated.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    # Coerce before selecting columns so that array-like / dict input, which has
    # no label-based column access of its own, is handled as well.
    data = pd.DataFrame(data).copy()
    col_exclude = [] if col_exclude is None else list(col_exclude)
    data_exclude = data[col_exclude]

    # Column step: only non-excluded columns are candidates for dropping.
    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    mv_cols_ratio = _missing_vals(data)["mv_cols_ratio"]
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[:, mv_cols_ratio > drop_threshold_cols].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    # Row step: evaluated on the re-assembled frame.
    mv_rows_ratio = _missing_vals(data)["mv_rows_ratio"]
    data_cleaned = data.drop(
        index=data.loc[mv_rows_ratio > drop_threshold_rows, :].index
    ).dropna(axis=0, how="all")

    return data_cleaned
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List, List]]:
    """
    Converts columns with a high ratio of missing values into binary features and \
    eventually drops them based on their correlation with other features and the \
    target variable.

    This function follows a three step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
    identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. I.e. label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is overstepped, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional (only if 'return_details' is True, returned as a tuple after the \
    DataFrame):
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    if cols_mv:
        # Binary indicator per cell: 1 where a value is present, 0 where missing.
        data_local[cols_mv] = data_local[cols_mv].notna().astype(int)

    high_corr_features = []
    data_temp = data_local.copy()
    corrmat = None
    for col in cols_mv:
        # data_temp only changes when a column is dropped, so the correlation
        # matrix is recomputed only after such a change.
        if corrmat is None:
            corrmat = corr_mat(data_temp, colored=False)

        # The largest absolute value is the self-correlation; the runner-up is
        # the strongest correlation with any other remaining feature. Use
        # positional access so integer column labels are not misread as keys.
        top_two = corrmat[col].abs().nlargest(2)
        if len(top_two) > 1 and top_two.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])
            corrmat = None

    if target is None:
        # Without a target every highly correlated candidate is dropped; report
        # them so that 'drop_cols' matches what was actually removed.
        drop_cols = list(high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[
            high_corr_features
        ]
        drop_cols = corrs.loc[
            abs(corrs.iloc[:, 0]) < corr_thresh_target
        ].index.tolist()

    data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
def missingval_plot(
    data: pd.DataFrame,
    cmap: str = "PuBuGn",
    figsize: Tuple = (20, 20),
    sort: bool = False,
    spine_color: str = "#EEEEEE",
):
    """
    Two-dimensional visualization of the missing values in a dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    cmap : str, optional
        Any valid colormap can be used. E.g. "Greys", "RdPu". More information can be found in the \
        matplotlib documentation, by default "PuBuGn"
    figsize : Tuple, optional
        Use to control the figure size, by default (20, 20)
    sort : bool, optional
        Sort columns based on missing values in descending order and drop columns without any missing \
        values, by default False
    spine_color : str, optional
        Set to "None" to hide the spines on all plots or use any valid matplotlib color argument, by default \
        "#EEEEEE"

    Returns
    -------
    GridSpec
        gs: Figure with array of Axes objects. None is returned (and a message is printed) if the \
        dataset contains no missing values.
    """

    # Validate Inputs
    _validate_input_bool(sort, "sort")

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        # Keep only columns that actually contain missing values
        final_cols = mv_cols_sorted[mv_cols_sorted != 0].index.tolist()
        data = data[final_cols]
        print("Displaying only columns with missing values.")

    # Identify missing values
    mv_total, mv_rows, mv_cols, _, mv_cols_ratio = _missing_vals(data).values()
    total_datapoints = data.shape[0] * data.shape[1]

    if mv_total == 0:
        # Nothing to plot; return explicitly instead of falling through with no
        # figure defined.
        print("No missing values found in the dataset.")
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=6, left=0.1, wspace=0.05)
    ax1 = fig.add_subplot(gs[:1, :5])
    ax2 = fig.add_subplot(gs[1:, :5])
    ax3 = fig.add_subplot(gs[:1, 5:])
    ax4 = fig.add_subplot(gs[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio) * 100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-0.5, len(mv_cols) - 0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio) * 100)
    ax1.grid(linestyle=":", linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis="y", colors="#111111", length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(
            0.1 + rect.get_x() + rect.get_width() / 2,
            height + 0.5,
            label,
            ha="center",
            va="bottom",
            rotation=90,  # numeric: matplotlib rejects numeric strings here
            alpha=0.5,
            fontsize=11,
        )

    ax1.set_frame_on(True)
    for _, spine in ax1.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines["top"].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap="binary", ax=ax2)
    # Thin out the row ticks to every fifth one, rounded to the nearest ten
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment="center",
        fontweight="light",
        fontsize=12,
    )
    ax2.tick_params(length=1, colors="#111111")
    for _, spine in ax2.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {"color": "#111111", "weight": "normal", "size": 14}
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(
        0.025,
        0.875,
        f"Total: {np.round(total_datapoints/1000,1)}K",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.675,
        f"Missing: {np.round(mv_total/1000,1)}K",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.475,
        f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.275,
        f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )
    ax3.text(
        0.025,
        0.075,
        f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
        transform=ax3.transAxes,
        fontdict=fontax3,
    )

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for _, spine in ax4.spines.items():
        spine.set_color(spine_color)
    ax4.tick_params(axis="x", colors="#111111", length=1)

    ax4.scatter(
        mv_rows,
        range(len(mv_rows)),
        s=mv_rows,
        c=mv_rows,
        cmap=cmap,
        marker=".",
        vmin=1,
    )
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows) + 0.5)
    ax4.grid(linestyle=":", linewidth=1)

    gs.figure.suptitle(
        "Missing value plot", x=0.45, y=0.94, fontsize=18, color="#111111"
    )

    return gs