Example 1
import argparse

import pandas as pd

from alaska2.submissions import blend_predictions_ranked


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("submissions", nargs="+", type=str)
    parser.add_argument("-o", "--output", type=str, required=True)
    args = parser.parse_args()

    submissions = [
        pd.read_csv(x).sort_values(by="Id").reset_index()
        for x in args.submissions
    ]

    # Out-of-range (OOR) trick: where the first submission flags a row with a
    # value above 1.0, pin the same row to 1.01 in all other submissions.
    # The script assumes ABBA's submission is passed first.
    oor_mask = submissions[0].Label > 1.0
    for s in submissions[1:]:
        s.loc[oor_mask, "Label"] = 1.01

    submissions_blend = blend_predictions_ranked(submissions)

    print(submissions_blend.describe())
    submissions_blend.to_csv(args.output, index=False)
    print("Saved blend to", args.output)
Example 2
import os

# get_predictions_csv, make_binary_predictions(_calibrated),
# make_classifier_predictions(_calibrated), blend_predictions_ranked,
# alaska_weighted_auc and compute_checksum_v2 are helpers from this
# project's alaska2 package.


def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    all_predictions = []
    labels = experiments
    scoring_fn = alaska_weighted_auc

    for metric in [
        # "loss",
        # "bauc",
        "cauc"
    ]:
        holdout_predictions_d4 = get_predictions_csv(experiments, metric, "holdout", "d4")
        oof_predictions_d4 = get_predictions_csv(experiments, metric, "oof", "d4")
        test_predictions_d4 = get_predictions_csv(experiments, metric, "test", "d4")

        fnames_for_checksum = [x + metric for x in experiments]

        bin_pred_d4 = make_binary_predictions(holdout_predictions_d4)
        y_true = bin_pred_d4[0].y_true_type.values

        bin_pred_d4_score = scoring_fn(y_true, blend_predictions_ranked(bin_pred_d4).Label)

        cls_pred_d4 = make_classifier_predictions(holdout_predictions_d4)
        cls_pred_d4_score = scoring_fn(y_true, blend_predictions_ranked(cls_pred_d4).Label)

        prod_pred_d4_score = scoring_fn(
            y_true, blend_predictions_ranked(cls_pred_d4).Label * blend_predictions_ranked(bin_pred_d4).Label
        )

        if False:  # flip to True to also score the calibrated variants
            bin_pred_d4_cal = make_binary_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
            bin_pred_d4_cal_score = scoring_fn(y_true, blend_predictions_ranked(bin_pred_d4_cal).Label)

            cls_pred_d4_cal = make_classifier_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
            cls_pred_d4_cal_score = scoring_fn(y_true, blend_predictions_ranked(cls_pred_d4_cal).Label)

            prod_pred_d4_cal_score = scoring_fn(
                y_true,
                blend_predictions_ranked(cls_pred_d4_cal).Label * blend_predictions_ranked(bin_pred_d4_cal).Label,
            )
        else:
            bin_pred_d4_cal_score = 0
            cls_pred_d4_cal_score = 0
            prod_pred_d4_cal_score = 0

        print(metric, "Bin  NC", "d4", bin_pred_d4_score)
        print(metric, "Cls  NC", "d4", cls_pred_d4_score)
        print(metric, "Prod NC", "d4", prod_pred_d4_score)
        print(metric, "Bin  CL", "d4", bin_pred_d4_cal_score)
        print(metric, "Cls  CL", "d4", cls_pred_d4_cal_score)
        print(metric, "Prod CL", "d4", prod_pred_d4_cal_score)

        max_score = max(
            bin_pred_d4_score,
            cls_pred_d4_score,
            bin_pred_d4_cal_score,
            cls_pred_d4_cal_score,
            prod_pred_d4_score,
            prod_pred_d4_cal_score,
        )

        if bin_pred_d4_score == max_score:
            predictions = make_binary_predictions(test_predictions_d4)

            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(output_dir, f"rank_{max_score:.4f}_bin_{compute_checksum_v2(fnames_for_checksum)}.csv"),
                index=False,
            )
        if bin_pred_d4_cal_score == max_score:
            predictions = make_binary_predictions_calibrated(test_predictions_d4, oof_predictions_d4)

            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(
                    output_dir, f"rank_{max_score:.4f}_bin_cal_{compute_checksum_v2(fnames_for_checksum)}.csv"
                ),
                index=False,
            )
        if cls_pred_d4_score == max_score:
            predictions = make_classifier_predictions(test_predictions_d4)

            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(output_dir, f"rank_{max_score:.4f}_cls_{compute_checksum_v2(fnames_for_checksum)}.csv"),
                index=False,
            )
        if cls_pred_d4_cal_score == max_score:
            predictions = make_classifier_predictions_calibrated(test_predictions_d4, oof_predictions_d4)

            predictions = blend_predictions_ranked(predictions)
            predictions.to_csv(
                os.path.join(
                    output_dir, f"rank_{max_score:.4f}_cls_cal_{compute_checksum_v2(fnames_for_checksum)}.csv"
                ),
                index=False,
            )
        if prod_pred_d4_score == max_score:
            cls_predictions = make_classifier_predictions(test_predictions_d4)
            bin_predictions = make_binary_predictions(test_predictions_d4)

            predictions1 = blend_predictions_ranked(cls_predictions)
            predictions2 = blend_predictions_ranked(bin_predictions)
            predictions = predictions1.copy()
            predictions.Label = predictions1.Label * predictions2.Label

            predictions.to_csv(
                os.path.join(output_dir, f"rank_{max_score:.4f}_prod_{compute_checksum_v2(fnames_for_checksum)}.csv"),
                index=False,
            )
        if prod_pred_d4_cal_score == max_score:
            cls_predictions = make_classifier_predictions_calibrated(test_predictions_d4, oof_predictions_d4)
            bin_predictions = make_binary_predictions_calibrated(test_predictions_d4, oof_predictions_d4)

            predictions1 = blend_predictions_ranked(cls_predictions)
            predictions2 = blend_predictions_ranked(bin_predictions)
            predictions = predictions1.copy()
            predictions.Label = predictions1.Label * predictions2.Label

            predictions.to_csv(
                os.path.join(
                    output_dir, f"rank_{max_score:.4f}_prod_cal_{compute_checksum_v2(fnames_for_checksum)}.csv"
                ),
                index=False,
            )


if __name__ == "__main__":
    main()
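alaska_weighted_auc is the ALASKA2 competition metric: a ROC AUC in which the true-positive-rate band [0, 0.4] is weighted twice as heavily as [0.4, 1.0]. A sketch along the lines of the public Kaggle implementation (edge cases such as an empty TPR band are not handled):

import numpy as np
from sklearn import metrics


def weighted_auc(y_true, y_valid):
    tpr_thresholds = [0.0, 0.4, 1.0]
    weights = [2, 1]

    fpr, tpr, _ = metrics.roc_curve(y_true, y_valid, pos_label=1)

    # Normalize by the weighted total area so the score lands in [0, 1].
    areas = np.array(tpr_thresholds[1:]) - np.array(tpr_thresholds[:-1])
    normalization = np.dot(areas, weights)

    competition_metric = 0
    for idx, weight in enumerate(weights):
        y_min = tpr_thresholds[idx]
        y_max = tpr_thresholds[idx + 1]
        mask = (y_min < tpr) & (tpr < y_max)

        # Pad the curve out to fpr=1 so each band is integrated over the full x range.
        x_padding = np.linspace(fpr[mask][-1], 1, 100)
        x = np.concatenate([fpr[mask], x_padding])
        y = np.concatenate([tpr[mask], [y_max] * len(x_padding)])
        y = y - y_min  # shift so the band starts at y=0
        competition_metric += metrics.auc(x, y) * weight

    return competition_metric / normalization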
Example 3
import pandas as pd
from scipy.stats import spearmanr

from alaska2.submissions import blend_predictions_ranked, blend_predictions_mean

submission_v25_xl_NR_moreTTA = pd.read_csv(
    "submission_v25_xl_NR_moreTTA.csv").sort_values(by="Id")
stacked_b6_xgb_cv = pd.read_csv(
    "662cfbbddf616db0df6f59ee2a96cc20_xgb_cv_0.9485.csv")

print(spearmanr(submission_v25_xl_NR_moreTTA.Label, stacked_b6_xgb_cv.Label))

blend_1_ranked = blend_predictions_ranked(
    [submission_v25_xl_NR_moreTTA, stacked_b6_xgb_cv])
blend_1_ranked.to_csv("blend_1_ranked.csv", index=False)

blend_1_mean = blend_predictions_mean(
    [submission_v25_xl_NR_moreTTA, stacked_b6_xgb_cv])
blend_1_mean.to_csv("blend_1_mean.csv", index=False)
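blend_predictions_mean is presumably the arithmetic counterpart of the rank blend: averaging raw scores instead of ranks. A minimal sketch under the same DataFrame assumptions (the helper name mean_blend is ours, not the repo's code):

def mean_blend(submissions):
    # Average the raw Label scores across submissions; unlike the rank
    # blend, this is sensitive to how each model's scores are scaled
    # and calibrated.
    blend = submissions[0][["Id"]].copy()
    blend["Label"] = sum(s.Label.values for s in submissions) / len(submissions)
    return blend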
Example 4
import pandas as pd
from scipy.stats import spearmanr

from alaska2.submissions import blend_predictions_ranked, blend_predictions_mean

submission_v25_xl_NR_moreTTA = pd.read_csv(
    "submission_v25_xl_NR_moreTTA.csv").sort_values(by="Id").reset_index()
submission_b6_mean_calibrated = pd.read_csv(
    "662cfbbddf616db0df6f59ee2a96cc20_best_cauc_blend_cls_mean_calibrated_0.9422.csv"
)

# Force out-of-range (OOR) rows to 1.01: where the first submission flags a
# row with a value above 1.0, pin the same row in the second submission
oor_mask = submission_v25_xl_NR_moreTTA.Label > 1.0
submission_b6_mean_calibrated.loc[oor_mask, "Label"] = 1.01
print(
    spearmanr(submission_v25_xl_NR_moreTTA.Label,
              submission_b6_mean_calibrated.Label))

blend_3_ranked = blend_predictions_ranked(
    [submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated])
blend_3_ranked.to_csv(
    "blend_3_ranked_from_v25_xl_NR_moreTTA_and_b6_cauc_mean_calibrated.csv",
    index=False)

blend_3_mean = blend_predictions_mean(
    [submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated])
blend_3_mean.to_csv(
    "blend_3_mean_from_v25_xl_NR_moreTTA_and_b6_cauc_mean_calibrated.csv",
    index=False)
Example 5
print(cm)

# disp = ConfusionMatrixDisplay(
#     confusion_matrix=cm,
#     display_labels=["v25_xl_NR_moreTTA", "v25_xl_NR_moreTTA_b4mish", "mean_09406", "xgb_cls_gs_09445"],
# )
# plt.figure(figsize=(8, 8))
# disp.plot(include_values=True, cmap="Blues", ax=plt.gca(), xticks_rotation=45)
# plt.show()

# 939
# blend_6_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09445])
# blend_6_ranked.to_csv("blend_7_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_xgb_cls_gs_09445.csv", index=False)

#
blend_7_ranked = blend_predictions_ranked([v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, mean_9415])
blend_7_ranked.to_csv(
    "blend_7_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_mean_0.9415_prod_Gf0cauc_Gf3cauc_Hnrmishf2cauc_nrmishf1cauc.csv",
    index=False,
)


blend_7_ranked = blend_predictions_ranked([v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09419])
blend_7_ranked.to_csv(
    "blend_7_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_xgb_cls_gs_0.9419_Gf0cauc_Gf3cauc_Hnrmishf2cauc_nrmishf1cauc.csv",
    index=False,
)


# blend_6_ranked = blend_predictions_ranked([v25_xl_NR_moreTTA_b4mish, xgb_cls_gs_09445])
# blend_6_ranked.to_csv(
    "xgb_gs_0.9434_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv"
)

# Force out-of-range (OOR) rows to 1.01 in the other submissions
oor_mask = submission_v25_xl_NR_moreTTA.Label > 1.0
submission_b6_mean_calibrated.loc[oor_mask, "Label"] = 1.01
submission_b6_cmb_uncalibrated.loc[oor_mask, "Label"] = 1.01
submission_b6_xgb.loc[oor_mask, "Label"] = 1.01

print(
    spearmanr(submission_v25_xl_NR_moreTTA.Label,
              submission_b6_cmb_uncalibrated.Label))
print(
    spearmanr(submission_v25_xl_NR_moreTTA.Label,
              submission_b6_mean_calibrated.Label))
print(spearmanr(submission_v25_xl_NR_moreTTA.Label, submission_b6_xgb.Label))

#
# blend_4_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated])
# blend_4_ranked.to_csv("blend_3_ranked_from_v25_xl_NR_moreTTA_and_mean_0.9391_cls_cal_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv", index=False)

# blend_4_ranked = blend_predictions_ranked([submission_v25_xl_NR_moreTTA, submission_b6_mean_calibrated])
# blend_4_ranked.to_csv("blend_3_ranked_from_v25_xl_NR_moreTTA_and_mean_0.9391_cls_cal_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv", index=False)

blend_4_ranked = blend_predictions_ranked(
    [submission_v25_xl_NR_moreTTA, submission_b6_xgb])
blend_4_ranked.to_csv(
    "blend_4_ranked_from_v25_xl_NR_moreTTA_and_xgb_gs_0.9434_BrgbB6f0cauc_BrgbB6f1cauc_BrgbB6f2cauc_BrgbB6f3cauc_CrgbB2f2cauc_DrgbB7f1cauc_DrgbB7f2cauc_ErgbB6f0istego100kcauc_FrgbB3f0cauc.csv",
    index=False,
)
Example 7
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from alaska2.submissions import blend_predictions_ranked

v25_xl_NR_moreTTA_b4mish_b2mish_xlmish = (pd.read_csv(
    "submission_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish.csv").sort_values(
        by="Id").reset_index())

xgb_cls_gs_09420 = pd.read_csv(
    "xgb_cls_0.9420_Gf0_Gf1_Gf2_Gf3_Hnrmishf2_Hnrmishf1_.csv")

# Force out-of-range (OOR) rows to 1.01 in the second submission
oor_mask = v25_xl_NR_moreTTA_b4mish_b2mish_xlmish.Label > 1.0

xgb_cls_gs_09420.loc[oor_mask, "Label"] = 1.01

submissions = [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09420]

cm = np.zeros((len(submissions), len(submissions)))
for i in range(len(submissions)):
    for j in range(len(submissions)):
        cm[i, j] = spearmanr(submissions[i].Label,
                             submissions[j].Label).correlation

print(cm)

blend_8_ranked = blend_predictions_ranked(
    [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_cls_gs_09420])
blend_8_ranked.to_csv(
    "blend_8_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_xgb_cls_0.9420_Gf0_Gf1_Gf2_Gf3_Hnrmishf2_Hnrmishf1.csv",
    index=False,
)
        "v25", "v26", "emb_09411", "avg_0_9417", "cmb_0_9424", "xgb_0_9424",
        "lgb_0_9421"
    ],
)
plt.figure(figsize=(8, 8))
disp.plot(include_values=True, cmap="Blues", ax=plt.gca(), xticks_rotation=45)
plt.savefig(fname="predictions_corr.png")
plt.show()

# Submit 1 - v25 + embedding
# Submit 2 - v25 + tuned models
# Submit 3 - v26 + tuned models
# Submit 4 -
# Submit 5 -

blend_10_ranked = blend_predictions_ranked(
    [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, embeddings_09411])
print(blend_10_ranked.describe())
blend_10_ranked.to_csv(
    "blend_10_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_embeddings_09411.csv",
    index=False)

blend_10_ranked = blend_predictions_ranked(
    [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, mean_0_9417])
print(blend_10_ranked.describe())
blend_10_ranked.to_csv(
    "blend_10_ranked_v25_xl_NR_moreTTA_b4mish_b2mish_xlmish_with_mean_0.9417_cls_Kmishf0cauc_Jnrmishf1cauc_Hnrmishf2cauc_Kmishf3cauc.csv",
    index=False,
)

blend_10_ranked = blend_predictions_ranked(
    [v25_xl_NR_moreTTA_b4mish_b2mish_xlmish, xgb_0_9424])