import argparse

import pandas as pd

from alaska2.submissions import blend_predictions_ranked


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("submissions", nargs="+", type=str)
    parser.add_argument("-o", "--output", type=str, required=True)
    args = parser.parse_args()

    # Align all submissions by Id so rows match positionally before blending.
    submissions = [pd.read_csv(x).sort_values(by="Id").reset_index(drop=True) for x in args.submissions]

    # Force 1.01 value of OOR values in my submission.
    # The script assumes ABBA's submission goes first.
    oor_mask = submissions[0].Label > 1.0
    for s in submissions[1:]:
        s.loc[oor_mask, "Label"] = 1.01

    submissions_blend = blend_predictions_ranked(submissions)
    print(submissions_blend.describe())

    submissions_blend.to_csv(args.output, index=False)
    print("Saved blend to", args.output)


if __name__ == "__main__":
    main()
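# For reference: a minimal sketch of what a rank-based blend such as
# blend_predictions_ranked typically does; replace each submission's Label
# column with its rank, average the ranks across submissions, and rescale
# into (0, 1]. This is an assumption about the helper's behaviour, not the
# repo's actual code. Because only orderings enter the blend, differences in
# calibration between the blended models do not matter.
from scipy.stats import rankdata


def blend_predictions_ranked_sketch(submissions):
    blend = submissions[0].copy()
    mean_rank = sum(rankdata(s.Label) for s in submissions) / len(submissions)
    blend["Label"] = mean_rank / len(mean_rank)
    return blend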
import os

# NOTE: the import locations below are a best guess; adjust them to wherever
# these helpers actually live in this repo.
from alaska2 import alaska_weighted_auc
from alaska2.submissions import (
    blend_predictions_ranked,
    make_binary_predictions,
    make_binary_predictions_calibrated,
    make_classifier_predictions,
    make_classifier_predictions_calibrated,
)
from submission_utils import compute_checksum_v2, get_predictions_csv


def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    scoring_fn = alaska_weighted_auc

    for metric in [
        # "loss",
        # "bauc",
        "cauc",
    ]:
        holdout_predictions_d4 = get_predictions_csv(experiments, metric, "holdout", "d4")
        oof_predictions_d4 = get_predictions_csv(experiments, metric, "oof", "d4")
        test_predictions_d4 = get_predictions_csv(experiments, metric, "test", "d4")
        fnames_for_checksum = [x + metric for x in experiments]

        # Score each blend flavour on the holdout set: binary, classifier,
        # and the product of the two.
        bin_pred_d4 = make_binary_predictions(holdout_predictions_d4)
        y_true = bin_pred_d4[0].y_true_type.values
        bin_pred_d4_score = scoring_fn(y_true, blend_predictions_ranked(bin_pred_d4).Label)

        cls_pred_d4 = make_classifier_predictions(holdout_predictions_d4)
        cls_pred_d4_score = scoring_fn(y_true, blend_predictions_ranked(cls_pred_d4).Label)

        prod_pred_d4_score = scoring_fn(
            y_true, blend_predictions_ranked(cls_pred_d4).Label * blend_predictions_ranked(bin_pred_d4).Label
        )

        if False:  # calibrated variants are disabled; flip to True to re-evaluate them
            bin_pred_d4_cal = make_binary_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
            bin_pred_d4_cal_score = scoring_fn(y_true, blend_predictions_ranked(bin_pred_d4_cal).Label)

            cls_pred_d4_cal = make_classifier_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
            cls_pred_d4_cal_score = scoring_fn(y_true, blend_predictions_ranked(cls_pred_d4_cal).Label)

            prod_pred_d4_cal_score = scoring_fn(
                y_true,
                blend_predictions_ranked(cls_pred_d4_cal).Label * blend_predictions_ranked(bin_pred_d4_cal).Label,
            )
        else:
            bin_pred_d4_cal_score = 0
            cls_pred_d4_cal_score = 0
            prod_pred_d4_cal_score = 0

        print(metric, "Bin NC", "d4", bin_pred_d4_score)
        print(metric, "Cls NC", "d4", cls_pred_d4_score)
        print(metric, "Prod NC", "d4", prod_pred_d4_score)
        print(metric, "Bin CL", "d4", bin_pred_d4_cal_score)
        print(metric, "Cls CL", "d4", cls_pred_d4_cal_score)
        print(metric, "Prod CL", "d4", prod_pred_d4_cal_score)

        max_score = max(
            bin_pred_d4_score,
            cls_pred_d4_score,
            bin_pred_d4_cal_score,
            cls_pred_d4_cal_score,
            prod_pred_d4_score,
            prod_pred_d4_cal_score,
        )

        # Re-blend the test predictions with whichever flavour won on holdout
        # and save it under a name embedding the score and a checksum.
        if bin_pred_d4_score == max_score:
            predictions = make_binary_predictions(test_predictions_d4)
            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(output_dir, f"rank_{max_score:.4f}_bin_{compute_checksum_v2(fnames_for_checksum)}.csv"),
                index=False,
            )

        if bin_pred_d4_cal_score == max_score:
            predictions = make_binary_predictions_calibrated(test_predictions_d4, oof_predictions_d4)
            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(
                    output_dir, f"rank_{max_score:.4f}_bin_cal_{compute_checksum_v2(fnames_for_checksum)}.csv"
                ),
                index=False,
            )

        if cls_pred_d4_score == max_score:
            predictions = make_classifier_predictions(test_predictions_d4)
            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(output_dir, f"rank_{max_score:.4f}_cls_{compute_checksum_v2(fnames_for_checksum)}.csv"),
                index=False,
            )

        if cls_pred_d4_cal_score == max_score:
            predictions = make_classifier_predictions_calibrated(test_predictions_d4, oof_predictions_d4)
            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(
                    output_dir, f"rank_{max_score:.4f}_cls_cal_{compute_checksum_v2(fnames_for_checksum)}.csv"
                ),
                index=False,
            )

        if prod_pred_d4_score == max_score:
            cls_predictions = make_classifier_predictions(test_predictions_d4)
            bin_predictions = make_binary_predictions(test_predictions_d4)
            predictions1 = blend_predictions_ranked(cls_predictions)
            predictions2 = blend_predictions_ranked(bin_predictions)
            predictions = predictions1.copy()
            predictions.Label = predictions1.Label * predictions2.Label
            predictions.to_csv(
                os.path.join(output_dir, f"rank_{max_score:.4f}_prod_{compute_checksum_v2(fnames_for_checksum)}.csv"),
                index=False,
            )

        if prod_pred_d4_cal_score == max_score:
            cls_predictions = make_classifier_predictions_calibrated(test_predictions_d4, oof_predictions_d4)
            bin_predictions = make_binary_predictions_calibrated(test_predictions_d4, oof_predictions_d4)
            predictions1 = blend_predictions_ranked(cls_predictions)
            predictions2 = blend_predictions_ranked(bin_predictions)
            predictions = predictions1.copy()
            predictions.Label = predictions1.Label * predictions2.Label
            predictions.to_csv(
                os.path.join(
                    output_dir, f"rank_{max_score:.4f}_prod_cal_{compute_checksum_v2(fnames_for_checksum)}.csv"
                ),
                index=False,
            )


if __name__ == "__main__":
    main()
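# Model selection above keys on alaska_weighted_auc. For reference, a sketch
# of the ALASKA2 competition metric, assuming the repo's implementation
# follows the official definition: ROC AUC where the TPR band [0.0, 0.4] is
# weighted 2x and the band [0.4, 1.0] is weighted 1x.
import numpy as np
from sklearn import metrics


def alaska_weighted_auc_sketch(y_true, y_pred):
    tpr_thresholds = [0.0, 0.4, 1.0]
    weights = [2, 1]
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    normalization = np.dot(np.diff(tpr_thresholds), weights)
    score = 0.0
    for weight, y_min, y_max in zip(weights, tpr_thresholds[:-1], tpr_thresholds[1:]):
        # Clip the ROC curve to the current TPR band and integrate over FPR.
        band_tpr = np.clip(tpr, y_min, y_max) - y_min
        score += weight * metrics.auc(fpr, band_tpr)
    return score / normalization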
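# The make_*_calibrated helpers presumably fit a calibration on the OOF
# predictions and apply it to the holdout/test scores; a minimal isotonic
# sketch under that assumption (the function and argument names here are
# hypothetical, not the repo's API).
import numpy as np
from sklearn.isotonic import IsotonicRegression


def calibrate_scores_sketch(test_scores, oof_scores, oof_targets):
    # Fit a monotonic map from raw OOF scores to empirical probabilities and
    # apply it to unseen scores. A monotonic map preserves ranking, so it
    # leaves rank blends unchanged but does change product blends.
    ir = IsotonicRegression(out_of_bounds="clip")
    ir.fit(np.asarray(oof_scores), np.asarray(oof_targets))
    return ir.predict(np.asarray(test_scores))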
import pandas as pd
from scipy.stats import spearmanr

from alaska2.submissions import blend_predictions_ranked, blend_predictions_mean

# Align both submissions by Id so the rows match positionally before blending.
submission_v25_xl_NR_moreTTA = pd.read_csv("submission_v25_xl_NR_moreTTA.csv").sort_values(by="Id").reset_index(drop=True)
stacked_b6_xgb_cv = pd.read_csv("662cfbbddf616db0df6f59ee2a96cc20_xgb_cv_0.9485.csv").sort_values(by="Id").reset_index(drop=True)

print(spearmanr(submission_v25_xl_NR_moreTTA.Label, stacked_b6_xgb_cv.Label))

blend_1_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA, stacked_b6_xgb_cv])
blend_1_ranked.to_csv("blend_1_ranked.csv", index=False)

blend_1_mean = blend_predictions_mean([submission_v25_xl_NR_moreTTA, stacked_b6_xgb_cv])
blend_1_mean.to_csv("blend_1_mean.csv", index=False)
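# blend_predictions_mean presumably averages the raw Label columns, while the
# ranked variant averages per-submission ranks. A minimal sketch under that
# assumption: mean blending keeps each model's calibration (and any OOR
# values above 1.0), whereas rank blending keeps only the orderings.
def blend_predictions_mean_sketch(submissions):
    blend = submissions[0].copy()
    blend["Label"] = sum(s.Label.values for s in submissions) / len(submissions)
    return blend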
import pandas as pd
from scipy.stats import spearmanr

from alaska2.submissions import blend_predictions_ranked, blend_predictions_mean

# Align both submissions by Id so the rows match positionally.
submission_v25_xl_NR_moreTTA = pd.read_csv("submission_v25_xl_NR_moreTTA.csv").sort_values(by="Id").reset_index(drop=True)
submission_b6_mean_calibrated = pd.read_csv(
    "662cfbbddf616db0df6f59ee2a96cc20_best_cauc_blend_cls_mean_calibrated_0.9422.csv"
).sort_values(by="Id").reset_index(drop=True)

# Force 1.01 value of OOR values in my submission: where the first submission
# is out of range (> 1.0), force the same rows in the partner submission so
# the blend keeps them at the very top.
oor_mask = submission_v25_xl_NR_moreTTA.Label > 1.0
submission_b6_mean_calibrated.loc[oor_mask, "Label"] = 1.01

print(spearmanr(submission_v25_xl_NR_moreTTA.Label, submission_b6_mean_calibrated.Label))

blend_3_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated])
blend_3_ranked.to_csv("blend_3_ranked_from_v25_xl_NR_moreTTA_and_b6_cauc_mean_calibrated.csv", index=False)

blend_3_mean = blend_predictions_mean([submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated])
blend_3_mean.to_csv("blend_3_mean_from_v25_xl_NR_moreTTA_and_b6_cauc_mean_calibrated.csv", index=False)
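# Toy demonstration (hypothetical values) of why the forcing step matters for
# rank blends: once both submissions carry 1.01 on the OOR rows, those rows
# receive the maximal rank in every input and stay at the top of the blend.
import pandas as pd
from scipy.stats import rankdata

a = pd.Series([0.20, 1.01, 0.70])  # "my" submission; row 1 is out of range
b = pd.Series([0.30, 0.50, 0.90])  # partner submission before forcing
b[a > 1.0] = 1.01                  # the forcing step used above
print((rankdata(a) + rankdata(b)) / 2)  # [1.0, 3.0, 2.0]; row 1 ranks highest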
print(cm)

# disp = ConfusionMatrixDisplay(
#     confusion_matrix=cm,
#     display_labels=["v25_xl_NR_moreTTA", "v25_xl_NR_moreTTA_b4mish", "mean_09406", "xgb_cls_gs_09445"],
# )
# plt.figure(figsize=(8, 8))
# disp.plot(include_values=True, cmap="Blues", ax=plt.gca(), xticks_rotation=45)
# plt.show()

# 939
# blend_6_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09445])
# blend_6_ranked.to_csv("blend_7_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_xgb_cls_gs_09445.csv", index=False)

blend_7_ranked = blend_predictions_ranked([v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, mean_9415])
blend_7_ranked.to_csv(
    "blend_7_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_mean_0.9415_prod_Gf0cauc_Gf3cauc_Hnrmishf2cauc_nrmishf1cauc.csv",
    index=False,
)

blend_7_ranked = blend_predictions_ranked([v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09419])
blend_7_ranked.to_csv(
    "blend_7_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_xgb_cls_gs_0.9419_Gf0cauc_Gf3cauc_Hnrmishf2cauc_nrmishf1cauc.csv",
    index=False,
)

# blend_6_ranked = blend_predictions_ranked([v25_xl_NR_moreTTA_b4mish, xgb_cls_gs_09445])
# blend_6_ranked.to_csv(
"xgb_gs_0.9434_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv" ) # Force 1.01 value of OOR values in my submission oor_mask = submission_v25_xl_NR_moreTTA.Label > 1.0 submission_b6_mean_calibrated.loc[oor_mask, "Label"] = 1.01 submission_b6_cmb_uncalibrated.loc[oor_mask, "Label"] = 1.01 submission_b6_xgb.loc[oor_mask, "Label"] = 1.01 print( spearmanr(submission_v25_xl_NR_moreTTA.Label, submission_b6_cmb_uncalibrated.Label)) print( spearmanr(submission_v25_xl_NR_moreTTA.Label, submission_b6_mean_calibrated.Label)) print(spearmanr(submission_v25_xl_NR_moreTTA.Label, submission_b6_xgb.Label)) # # blend_4_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated]) # blend_4_ranked.to_csv("blend_3_ranked_from_v25_xl_NR_moreTTA_and_mean_0.9391_cls_cal_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv", index=False) # blend_4_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated]) # blend_4_ranked.to_csv("blend_3_ranked_from_v25_xl_NR_moreTTA_and_mean_0.9391_cls_cal_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv", index=False) blend_4_mean = blend_predictions_ranked( [submission_v25_xl_NR_moreTTA, submission_b6_xgb]) blend_4_mean.to_csv( "blend_4_ranked_from_v25_xl_NR_moreTTA_and_xgb_gs_0.9434_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv", index=False, )
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from alaska2.submissions import blend_predictions_ranked

# Align both submissions by Id so the rows match positionally.
v25_xl_NR_moreTTA_b4mish_b2mish_xlmish = (
    pd.read_csv("submission_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish.csv").sort_values(by="Id").reset_index(drop=True)
)
xgb_cls_gs_09420 = (
    pd.read_csv("xgb_cls_0.9420_Gf0_Gf1_Gf2_Gf3_Hnrmishf2_Hnrmishf1_.csv").sort_values(by="Id").reset_index(drop=True)
)

# Force 1.01 value of OOR values in my submission
oor_mask = v25_xl_NR_moreTTA_b4mish_b2mish_xlmish.Label > 1.0
xgb_cls_gs_09420.loc[oor_mask, "Label"] = 1.01

# Pairwise Spearman correlation between the submissions' rankings.
submissions = [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09420]
cm = np.zeros((len(submissions), len(submissions)))
for i in range(len(submissions)):
    for j in range(len(submissions)):
        cm[i, j] = spearmanr(submissions[i].Label, submissions[j].Label).correlation
print(cm)

blend_8_ranked = blend_predictions_ranked([v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09420])
blend_8_ranked.to_csv(
    "blend_8_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_xgb_cls_0.9420_Gf0_Gf1_Gf2_Gf3_Hnrmishf2_Hnrmishf1.csv",
    index=False,
)
"v25", "v26", "emb_09411", "avg_0_9417", "cmb_0_9424", "xgb_0_9424", "lgb_0_9421" ], ) plt.figure(figsize=(8, 8)) disp.plot(include_values=True, cmap="Blues", ax=plt.gca(), xticks_rotation=45) plt.savefig(fname="predictions_corr.png") plt.show() # Submit 1 - v25 + embedding # Submit 2 - v25 + tuned models # Submit 3 - v26 + tuned models # Submit 4 - # Submit 5 - blend_10_ranked = blend_predictions_ranked( [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, embeddings_09411]) print(blend_10_ranked.describe()) blend_10_ranked.to_csv( "blend_10_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_embeddings_09411.csv", index=False) blend_10_ranked = blend_predictions_ranked( [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, mean_0_9417]) print(blend_10_ranked.describe()) blend_10_ranked.to_csv( "blend_10_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_mean_0.9417_cls_Kmishf0cauc_Jnrmishf1cauc_Hnrmishf2cauc_Kmishf3cauc.csv", index=False, ) blend_10_ranked = blend_predictions_ranked( [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_0_9424])