# ========= 2.a.i. Model =========
# Initialize the model at the first iteration
# Model
model = sk.linear_model.LogisticRegression(solver="liblinear",
                                           max_iter=1000,
                                           class_weight={0: 1, 1: 3000}).fit(XTrain, yTrain)

# Predict
pred = model.predict_proba(XValid)[:, 1]

# Evaluate
evaluator = Evaluation.Evaluation(yValid, pred)

# Save ROC plot
_ = evaluator.roc_plot(plot=False, title=MODEL_NAME,
                       save_path=DYNAMIC_PATH + f"roc_{time_pred}")

# Save summary
summary_data = evaluator.summary()
summary_data.to_csv(DYNAMIC_PATH + f"summary_{time_pred}.csv", index=False)

# Store predictions
pred_valid1 = pred

# Permutation test
imp_means, imp_vars = feature_importance_permutation(
    predict_method=model.predict_proba_single,
                       hidden_size = HIDDEN_SIZE).to(device)
criterion = nn.CrossEntropyLoss(weight = torch.FloatTensor([1, CLASS_WEIGHT])).to(device)
# Optimize the sub-model's own parameters (the original referenced model.parameters())
optimizer = torch.optim.Adam(model_sub.parameters(), lr = LEARNING_RATE)
model_sub, loss = model_sub.fit(XTrain_good, yTrain_good, NUM_EPOCHS, BATCH_SIZE,
                                optimizer, criterion)
pred_prob_sub = model_sub.predict_proba_single(XTest)
y_pred_sub = threshold_predict(pred_prob_sub, yTest, fpr = FPR_THRESHOLD)
tpr_sub = true_positive_rate(yTest, y_pred_sub)

# Summary
print("TPR on valid. full: {}. sub: {}".format(tpr_full, tpr_sub))

evaluator = Evaluation.Evaluation(yTest, pred_prob_full)
# Save ROC plot
_ = evaluator.roc_plot(plot = True, title = MODEL_NAME)

evaluator = Evaluation.Evaluation(yTest, pred_prob_sub)
# Save ROC plot
_ = evaluator.roc_plot(plot = True, title = MODEL_NAME)
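# ---------------------------------------------------------------------------
# The helpers `threshold_predict` and `true_positive_rate` are used above but
# not defined in this excerpt. Below is a minimal sketch of one possible
# implementation, assuming `threshold_predict` binarizes probabilities at the
# highest cutoff whose false-positive rate on the given labels stays at or
# below `fpr`. Signatures mirror the calls above; the project's own
# definitions may differ.
# ---------------------------------------------------------------------------
import numpy as np

def threshold_predict(pred_prob, y_true, fpr=0.01):
    """Binarize probabilities at the largest threshold with FPR <= `fpr` (sketch)."""
    pred_prob = np.asarray(pred_prob)
    y_true = np.asarray(y_true)
    negatives = pred_prob[y_true == 0]
    # Cutoff such that at most a `fpr` fraction of true negatives score above it
    cutoff = np.quantile(negatives, 1.0 - fpr)
    return (pred_prob > cutoff).astype(int)

def true_positive_rate(y_true, y_pred):
    """TPR = TP / (TP + FN) (sketch)."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    positives = (y_true == 1)
    if positives.sum() == 0:
        return 0.0
    return float((y_pred[positives] == 1).mean())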
pred_train = model.predict_proba_single(x_data=XTrain,
                                        batch_size=BATCH_SIZE,
                                        transformation=transformation)

# ========= 2.a.ii. Feature importance by permutation test =========
# Permutation test
imp_means, imp_vars = feature_importance_permutation(
    predict_method=model.predict_proba_single,
    X=np.array(XTest),
    y=np.array(yTest),
    metric=true_positive_rate,
    fpr_threshold=FPR_THRESHOLD,
    num_rounds=5,
    seed=RANDOM_SEED)

# Save feature importance plot
fi_evaluator = Evaluation.FeatureImportance(imp_means, imp_vars, XTest.columns, MODEL_NAME)
fi_evaluator.FI_plot(save_path=DYNAMIC_PATH, y_fontsize=4, eps=True)

# ========= 2.b. Evaluation =========
evaluator = Evaluation.Evaluation(yTest, pred)

# Save ROC plot
_ = evaluator.roc_plot(plot=False, title=MODEL_NAME,
                       save_path=DYNAMIC_PATH + f"roc_{time_pred}")

# Save summary
summary_data = evaluator.summary()
summary_data.to_csv(DYNAMIC_PATH + f"summary_{time_pred}.csv", index=False)

# ========= 2.c. Save predicted results =========
pred = pd.DataFrame(pred, columns=["pred_prob"])
pred.to_csv(REPORTS_DIR + f"predicted_result_{time_pred}.csv", index=False, header=True)

# Save probs for train set (for stacked model)
pred_train = prediction_model.predict_proba_single(
    eval_features=train_features,
    batch_size=EVAL_BATCH_SIZE,
    transformation=transformation)
pred_train = pd.DataFrame(pred_train, columns=["pred_prob"])
pred_train.to_csv(DYNAMIC_PATH + f"predicted_result_train_{time_pred}.csv", index=False)

# ========= 2.b. Evaluation =========
evaluator = Evaluation.Evaluation(yTest, pred)

# Save ROC plot
_ = evaluator.roc_plot(plot=False, title=MODEL_NAME,
                       save_path=REPORTS_DIR + f"roc_{time_pred}")

# Save summary
summary_data = evaluator.summary()
summary_data.to_csv(REPORTS_DIR + f"summary_{time_pred}.csv", index=False)

# ========= 2.c. Save predicted results =========
pred = pd.DataFrame(pred, columns=["pred_prob"])
pred.to_csv(REPORTS_DIR + f"predicted_result_{time_pred}.csv", index=False)

# ========= End of iteration =========
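# ---------------------------------------------------------------------------
# The train-set probabilities saved above are marked "for stacked model".
# A minimal sketch of how a stacking step could consume them is given below:
# load each base model's saved train-set probabilities, stack them as
# meta-features, and fit a simple meta-learner. The `base_pred_paths`
# argument, the `fit_stacked_model` name, and the choice of LogisticRegression
# as the meta-learner are assumptions for illustration, not this project's
# actual stacking code.
# ---------------------------------------------------------------------------
import pandas as pd
import sklearn.linear_model

def fit_stacked_model(base_pred_paths, y_train, time_pred):
    """Fit a meta-learner on the base models' saved train-set probabilities (sketch)."""
    meta_features = pd.concat(
        [pd.read_csv(path + f"predicted_result_train_{time_pred}.csv")["pred_prob"]
         for path in base_pred_paths],
        axis=1)
    meta_features.columns = [f"base_{i}" for i in range(len(base_pred_paths))]
    meta_model = sklearn.linear_model.LogisticRegression(solver="liblinear")
    return meta_model.fit(meta_features, y_train)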
def collect_summary(path_list, time_span):
    evaluator_nn = Evaluation.Evaluation(1, 1)
    aggregate_summary = evaluator_nn.roc_aggregate(path_list[0], time_span, eps=True)
    for path in path_list[1:]:
        new_summary = evaluator_nn.roc_aggregate(path, time_span, eps=True)
        new_summary["FPR"] = aggregate_summary["FPR"][:new_summary.shape[0]]
        aggregate_summary = pd.concat([aggregate_summary, new_summary], axis=0)
    aggregate_summary = np.round(aggregate_summary, 3)
    return aggregate_summary

# NN aggregate
evaluator_nn = Evaluation.Evaluation(1, 1)
aggregate_summary_nn = collect_summary(FIG_ROOT_PATH_NN_LST, time_span)

# LR aggregate ROC
evaluator_lr = Evaluation.Evaluation(1, 1)
aggregate_summary_lr = collect_summary(FIG_ROOT_PATH_LR_LST, time_span)

# RF aggregate
evaluator_rf = Evaluation.Evaluation(1, 1)
aggregate_summary_rf = collect_summary(FIG_ROOT_PATH_RF_LST, time_span)

# IF aggregate
evaluator_if = Evaluation.Evaluation(1, 1)
aggregate_summary_if = evaluator_if.roc_aggregate(FIG_ROOT_PATH_IF, time_span, eps=True)
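# ---------------------------------------------------------------------------
# The four aggregate summaries built above can be lined up for a side-by-side
# model comparison. The sketch below assumes roc_aggregate returns a DataFrame
# (as its use with pd.concat and np.round above suggests), tags each summary
# with its model name, and writes one combined table. The output filename and
# the "Model" column name are assumptions for illustration.
# ---------------------------------------------------------------------------
combined_summary = pd.concat(
    [aggregate_summary_nn.assign(Model="NN"),
     aggregate_summary_lr.assign(Model="LR"),
     aggregate_summary_rf.assign(Model="RF"),
     aggregate_summary_if.assign(Model="IF")],
    axis=0)
combined_summary.to_csv("aggregate_summary_all_models.csv", index=False)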
pred_new = model_new.predict_proba(XTest)[:, 1]

# ========= 2.a.ii. Plot beta values =========
# Plot the features whose coefficients are the top 50 largest in magnitude
coeffs = model_new.coef_.ravel()
non_zero_idx = np.flatnonzero(coeffs)
# Top-50 non-zero coefficients, with indices expressed in the original column order
indices = non_zero_idx[np.argsort(np.abs(coeffs[non_zero_idx]))[::-1][:50]]
_ = plt.figure()
_ = plt.title("Logistic Regression Coefficient Values")
_ = sns.barplot(y=XTest.columns[indices], x=coeffs[indices])
_ = plt.yticks(fontsize=4)
plt.savefig(dynamic_path + f"coeffs_{time_pred}.eps", format='eps', dpi=800)
plt.close()

# ========= 2.b. Evaluation =========
evaluator = Evaluation.Evaluation(yTest, pred_new)

# Save ROC plot
_ = evaluator.roc_plot(plot=False, title="LR",
                       save_path=dynamic_path + f"roc_{time_pred}")

# Save summary
summary_data = evaluator.summary()
summary_data.to_csv(dynamic_path + f"summary_{time_pred}.csv", index=False)

# ========= 2.c. Feature importance =========
# Permutation test
imp_means, imp_vars = mlxtend.evaluate.feature_importance_permutation(
    predict_method=model_new.predict,
    X=np.array(XTest),
    y=np.array(yTest),
    metric=sk.metrics.f1_score,
    num_rounds=15,
# Plot and save losses
_ = sns.scatterplot(x = np.arange(len(loss_train_vec)), y = loss_train_vec, label = "train")
# Note: the validation loss is plotted scaled by a factor of 10
_ = sns.scatterplot(x = np.arange(len(loss_valid_vec)), y = 10 * loss_valid_vec, label = "valid")
plt.savefig(DYNAMIC_PATH + f"losses_{time_pred}.png")
plt.close()

# Prediction on train
# test_loader = torch.utils.data.DataLoader(dataset = np.array(XTrain),
#                                           batch_size = len(yTrain),
#                                           shuffle = False)

evaluator = Evaluation.Evaluation(yValid, pred)

# Save ROC plot
_ = evaluator.roc_plot(plot = False, title = MODEL_NAME,
                       save_path = DYNAMIC_PATH + f"roc_{time_pred}")

# Store predictions
pred_valid1 = pred

# Permutation test
imp_means, imp_vars = feature_importance_permutation(
    predict_method = model.predict_proba_single,
    X = np.array(XValid),
    y = np.array(yValid),
    metric = true_positive_rate,
    fpr_threshold = FPR_THRESHOLD,