def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
    """Compute the importance of each predictor in the model.

    Importance is measured as the Pearson correlation between each
    predictor's values and the model scores on ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Data on which to score the model.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns "predictor" and "importance", sorted by
        importance in descending order.
    """
    scores = self.score_model(data)

    # Map each cleaned predictor name to its correlation with the model
    # score; pearsonr returns (statistic, p-value) — keep the statistic.
    importances = {
        utils.clean_predictor_name(pred): stats.pearsonr(data[pred], scores)[0]
        for pred in self.predictors
    }

    result = pd.DataFrame.from_dict(importances, orient='index').reset_index()
    result.columns = ["predictor", "importance"]

    result = result.sort_values(by="importance", ascending=False)
    return result.reset_index(drop=True)
def compute_correlations(target_enc_train_data: pd.DataFrame,
                         predictors: list) -> pd.DataFrame:
    """Compute the correlations amongst the given predictors.

    Parameters
    ----------
    target_enc_train_data : pd.DataFrame
        Data to compute correlation.
    predictors : list
        List of column names of the DataFrame between which to compute
        the correlation matrix.

    Returns
    -------
    pd.DataFrame
        The correlation matrix of the training set.
    """
    correlation_matrix = target_enc_train_data[predictors].corr()

    # Relabel both axes with the cleaned version of the predictor names,
    # e.g. "var1_enc" becomes "var1".
    cleaned_names = [utils.clean_predictor_name(pred) for pred in predictors]
    correlation_matrix.index = cleaned_names
    correlation_matrix.columns = cleaned_names

    return correlation_matrix
def compute_pig_table(basetable: pd.DataFrame, predictor_column_name: str,
                      target_column_name: str,
                      id_column_name: str) -> pd.DataFrame:
    """Compute the PIG table of a given predictor for a given target.

    Parameters
    ----------
    basetable : pd.DataFrame
        Input data from which to compute the pig table.
    predictor_column_name : str
        Predictor name of which to compute the pig table.
    target_column_name : str
        Name of the target variable.
    id_column_name : str
        Name of the id column (used to count population size).

    Returns
    -------
    pd.DataFrame
        PIG table with columns "variable", "label", "pop_size",
        "global_avg_target" and "avg_target".
    """
    global_avg_target = basetable[target_column_name].mean()

    # Per bin of the predictor: the incidence (= mean of the target within
    # the bin) and the bin size (COUNT of the id column).
    grouped = basetable.groupby(predictor_column_name).agg({
        target_column_name: "mean",
        id_column_name: "size"
    })
    pig_table = grouped.reset_index().rename(columns={
        predictor_column_name: "label",
        target_column_name: "avg_target",
        id_column_name: "pop_size",
    })

    # Tag every row with the cleaned predictor name, attach the global
    # average target, and express the bin size as a share of the total
    # population instead of an absolute count.
    pig_table["variable"] = utils.clean_predictor_name(predictor_column_name)
    pig_table["global_avg_target"] = global_avg_target
    pig_table["pop_size"] = pig_table["pop_size"] / len(basetable.index)

    # Make sure to always return the data with the proper column order.
    ordered_columns = [
        "variable", "label", "pop_size", "global_avg_target", "avg_target"
    ]
    return pig_table[ordered_columns]
def compute_univariate_preselection(
        target_enc_train_data: pd.DataFrame,
        target_enc_selection_data: pd.DataFrame,
        predictors: list,
        target_column: str,
        model_type: str = "classification",
        preselect_auc_threshold: float = 0.053,
        preselect_rmse_threshold: float = 5,
        preselect_overtrain_threshold: float = 0.05) -> pd.DataFrame:
    """Perform a preselection of predictors based on an AUC (in case of
    classification) or a RMSE (in case of regression) threshold of a
    univariate model on a train and selection dataset and return a DataFrame
    containing for each variable the train and selection AUC or RMSE along
    with a boolean "preselection" column.

    As the AUC just calculates the quality of a ranking, all monotonous
    transformations of a given ranking (i.e. transformations that do not
    alter the ranking itself) will lead to the same AUC. Hence, pushing a
    categorical variable (incl. a binned continuous variable) through a
    logistic regression will produce exactly the same ranking as pushing
    it through incidence replacement (i.e. target encoding), as it will
    produce the exact same output: a ranking of the categories on the
    training set. Therefore, no univariate model is trained here as the
    target encoded train and selection data is/must be used as inputs for
    this function. These will be used as predicted scores to compute the
    AUC with against the target.

    Parameters
    ----------
    target_enc_train_data : pd.DataFrame
        Train data.
    target_enc_selection_data : pd.DataFrame
        Selection data.
    predictors : list
        List of predictors (e.g. column names in the train set and
        selection data sets).
    target_column : str
        Name of the target column.
    model_type : str
        Model type ("classification" or "regression").
    preselect_auc_threshold : float, optional
        Threshold on min. AUC to select predictor. Ignored if model_type
        is "regression".
    preselect_rmse_threshold : float, optional
        Threshold on max. RMSE to select predictor. Ignored if model_type
        is "classification". It is important to note that the threshold
        depends heavily on the scale of the target variable, and should be
        modified accordingly.
    preselect_overtrain_threshold : float, optional
        Threshold on the absolute difference between train and selection
        AUC (resp. selection and train RMSE) above which a predictor is
        considered overtrained.

    Returns
    -------
    pd.DataFrame
        DataFrame containing for each variable the train AUC or RMSE and
        selection AUC or RMSE along with a boolean indicating whether or
        not it is selected based on the criteria.

    Raises
    ------
    ValueError
        If ``model_type`` is neither "classification" nor "regression".
    """
    # Fail fast on an unsupported model type; without this guard the
    # function would fall through both branches and raise a confusing
    # NameError on the final return statement.
    if model_type not in ("classification", "regression"):
        raise ValueError(
            f"Unsupported model_type '{model_type}'; expected "
            "'classification' or 'regression'.")

    result = []
    if model_type == "classification":
        for predictor in predictors:
            cleaned_predictor = utils.clean_predictor_name(predictor)

            auc_train = roc_auc_score(
                y_true=target_enc_train_data[target_column],
                y_score=target_enc_train_data[predictor])
            auc_selection = roc_auc_score(
                y_true=target_enc_selection_data[target_column],
                y_score=target_enc_selection_data[predictor])

            result.append({
                "predictor": cleaned_predictor,
                "AUC train": auc_train,
                "AUC selection": auc_selection
            })

        df_auc = pd.DataFrame(result)

        # Filter based on min. AUC
        auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold

        # Identify those variables for which the AUC difference between
        # train and selection is within a user-defined ratio
        auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
                         < preselect_overtrain_threshold)

        df_auc["preselection"] = auc_thresh & auc_overtrain

        df_out = df_auc.sort_values(by="AUC selection",
                                    ascending=False).reset_index(drop=True)
    else:  # model_type == "regression" (validated above)
        for predictor in predictors:
            cleaned_predictor = utils.clean_predictor_name(predictor)

            rmse_train = sqrt(
                mean_squared_error(y_true=target_enc_train_data[target_column],
                                   y_pred=target_enc_train_data[predictor]))
            rmse_selection = sqrt(
                mean_squared_error(
                    y_true=target_enc_selection_data[target_column],
                    y_pred=target_enc_selection_data[predictor]))

            result.append({
                "predictor": cleaned_predictor,
                "RMSE train": rmse_train,
                "RMSE selection": rmse_selection
            })

        df_rmse = pd.DataFrame(result)

        # Filter based on max. RMSE
        rmse_thresh = (df_rmse.loc[:, "RMSE selection"]
                       < preselect_rmse_threshold)

        # Identify those variables for which the RMSE difference between
        # train and selection is within a user-defined ratio
        rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]
                           )  # flip subtraction vs. AUC
                          < preselect_overtrain_threshold)

        df_rmse["preselection"] = rmse_thresh & rmse_overtrain

        df_out = df_rmse.sort_values(by="RMSE selection",
                                     ascending=True).reset_index(
                                         drop=True)  # lower is better

    return df_out
def compute_univariate_preselection(
        target_enc_train_data: pd.DataFrame,
        target_enc_selection_data: pd.DataFrame,
        predictors: list,
        target_column: str,
        preselect_auc_threshold: float = 0.053,
        preselect_overtrain_threshold: float = 0.05) -> pd.DataFrame:
    """Perform a preselection of predictors based on an AUC threshold of a
    univariate model on a train and selection dataset and return a
    DataFrame containing for each variable the train and selection AUC
    along with a boolean "preselection" column.

    NOTE(review): this re-definition shadows the earlier, model_type-aware
    ``compute_univariate_preselection`` defined above in this file, silently
    dropping its regression support — the two definitions should be merged;
    confirm which one callers rely on.

    As the AUC just calculates the quality of a ranking, all monotonous
    transformations of a given ranking (i.e. transformations that do not
    alter the ranking itself) will lead to the same AUC. Hence, pushing a
    categorical variable (incl. a binned continuous variable) through a
    logistic regression will produce exactly the same ranking as pushing
    it through incidence replacement (i.e. target encoding), as it will
    produce the exact same output: a ranking of the categories on the
    training set. Therefore, no univariate model is trained here as the
    target encoded train and selection data is/must be used as inputs for
    this function. These will be used as predicted scores to compute the
    AUC with against the target.

    Parameters
    ----------
    target_enc_train_data : pd.DataFrame
        Train data.
    target_enc_selection_data : pd.DataFrame
        Selection data.
    predictors : list
        List of predictors (e.g. column names in the train set and
        selection data sets).
    target_column : str
        Name of the target column.
    preselect_auc_threshold : float, optional
        Threshold on AUC to select a predictor.
    preselect_overtrain_threshold : float, optional
        Threshold on the difference between train and selection AUC.

    Returns
    -------
    pd.DataFrame
        DataFrame containing for each variable the train AUC and selection
        AUC along with a boolean indicating whether or not it is selected
        based on the criteria.
    """
    rows = []
    for predictor in predictors:
        cleaned_name = utils.clean_predictor_name(predictor)

        train_auc = roc_auc_score(y_true=target_enc_train_data[target_column],
                                  y_score=target_enc_train_data[predictor])
        selection_auc = roc_auc_score(
            y_true=target_enc_selection_data[target_column],
            y_score=target_enc_selection_data[predictor])

        rows.append({
            "predictor": cleaned_name,
            "AUC train": train_auc,
            "AUC selection": selection_auc
        })

    df_auc = pd.DataFrame(rows)

    # Keep predictors whose selection AUC clears the minimum threshold...
    passes_threshold = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold
    # ...and whose train/selection gap stays within the overtrain margin.
    not_overtrained = ((df_auc["AUC train"] - df_auc["AUC selection"])
                       < preselect_overtrain_threshold)
    df_auc["preselection"] = passes_threshold & not_overtrained

    df_auc = df_auc.sort_values(by='AUC selection', ascending=False)
    return df_auc.reset_index(drop=True)