Ejemplo n.º 1
0
def main():
    """Fit blending weights on holdout predictions and write a submission.

    Loads the "cauc" / "d4" holdout and test prediction CSVs of the listed
    experiments, minimizes ``_auc_loss`` over one coefficient per feature
    column with bounded Nelder-Mead, passes the optimum through ``softmax`` and
    uses the resulting weights for a weighted sum of the columns. The holdout
    score from ``alaska_weighted_auc`` is printed and embedded in the name of
    the submission CSV, which is written next to this script.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        "May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        "May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        "May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        "Jun02_12_26_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        "Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        "Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        "Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum(test_predictions)

    X, y = get_x_y(holdout_predictions)
    print(X.shape, y.shape)

    X_public_lb, _ = get_x_y(test_predictions)
    print(X_public_lb.shape)

    loss_partial = partial(_auc_loss, X=X, y=y)
    # Start from uniform weights: one coefficient per feature column.
    initial_coef = np.ones(X.shape[1]) / X.shape[1]
    result = sp.optimize.minimize(
        loss_partial,
        initial_coef,
        bounds=Bounds(0, 1),
        method="nelder-mead",
        # Nelder-Mead has no "gtol"/"maxfun" options (they were ignored with an
        # "Unknown solver options" warning); its evaluation cap is "maxfev".
        options={"maxiter": 5000, "disp": True, "maxfev": 99999},
        tol=1e-6,
    )
    print(result)

    # The optimum is renormalized with softmax before it is used as weights.
    best_coef = softmax(result.x)
    print(best_coef)

    x_pred = (np.expand_dims(best_coef, 0) * X).sum(axis=1)
    auc = alaska_weighted_auc(y, x_pred)
    print(auc)

    x_test_pred = (np.expand_dims(best_coef, 0) * X_public_lb).sum(axis=1)

    submit_fname = os.path.join(output_dir, f"wmean_{np.mean(auc):.4f}_{checksum}.csv")

    # The first test CSV only supplies the image ids for the submission.
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = x_test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
Ejemplo n.º 2
0
def main():
    """Tune an XGBoost stacker with randomized search and write a submission.

    Command-line arguments:
        experiments: one or more experiment names whose "cauc" / "d4" holdout
            and test prediction CSVs are stacked.
        -o / --output: optional submission path. When omitted, the file is
            written next to this script under a name built from the best CV
            score and the experiments checksum.
        -dd / --data-dir: dataset root (defaults to the KAGGLE_2020_ALASKA2
            environment variable); only read by the disabled image-feature
            branch.

    Side effects: writes the search results to
    ``xgb-random-grid-search-results-01.csv`` in the working directory, the
    submission CSV, and the best hyperparameters as JSON to the path returned
    by ``fs.change_extension(submit_fname, ".json")``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("experiments", nargs="+", type=str)
    cli.add_argument("-o", "--output", type=str, required=False)
    cli.add_argument(
        "-dd",
        "--data-dir",
        type=str,
        default=os.environ.get("KAGGLE_2020_ALASKA2"),
    )
    cli_args = cli.parse_args()

    output_dir = os.path.dirname(__file__)
    data_dir = cli_args.data_dir
    experiments = cli_args.experiments
    output_file = cli_args.output

    # Hard-coded experiment switches for the feature pipeline below.
    with_logits = True
    add_image_features = False
    standardize = True
    reduce_with_pca = False
    append_quality = True

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids_h = [fs.id_from_fname(fname) for fname in holdout_ds.images]
    holdout_quality = torch.tensor(holdout_ds.quality).long()
    quality_h = F.one_hot(holdout_quality, 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    test_quality = torch.tensor(test_ds.quality).long()
    quality_t = F.one_hot(test_quality, 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(
        holdout_predictions, with_logits=with_logits, tta_logits=with_logits
    )
    # Collapse the target to binary: anything above zero is the positive class.
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(
        test_predictions, with_logits=with_logits, tta_logits=with_logits
    )
    print(x_test.shape)

    if add_image_features:
        image_fnames_h = []
        for image_id, method in zip(image_ids_h, y):
            image_fnames_h.append(
                os.path.join(data_dir, INDEX_TO_METHOD[method], f"{image_id}.jpg")
            )
        test_image_ids = pd.read_csv(test_predictions[0]).image_id.tolist()
        image_fnames_t = []
        for image_id in test_image_ids:
            image_fnames_t.append(os.path.join(data_dir, "Test", image_id))

        entropy_t = compute_image_features(image_fnames_t)
        x_test = np.column_stack([x_test, entropy_t])

        entropy_h = compute_image_features(image_fnames_h)
        x = np.column_stack([x, entropy_h])
        print("Added image features", entropy_h.shape, entropy_t.shape)

    if standardize:
        # Fit on holdout features only, then apply the same scaling to test.
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if reduce_with_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        # One-hot quality columns are appended after scaling, so stay unscaled.
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    # NOTE(review): 1e-2 appears twice in "gamma", which doubles its sampling
    # weight; possibly a typo for 1e-1 - confirm before changing.
    param_grid = {
        "min_child_weight": [1, 5, 10],
        "gamma": [1e-3, 1e-2, 1e-2, 0.5, 2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "max_depth": [2, 3, 4, 5, 6],
        "n_estimators": [16, 32, 64, 128, 256, 1000],
        "learning_rate": [0.001, 0.01, 0.05, 0.2, 1],
    }

    classifier = XGBClassifier(objective="binary:logistic", nthread=1)
    scorer = make_scorer(
        alaska_weighted_auc, greater_is_better=True, needs_proba=True
    )
    # Folds are grouped by image id.
    folds = GroupKFold(n_splits=5).split(x, y, groups=image_ids_h)

    search = RandomizedSearchCV(
        classifier,
        param_distributions=param_grid,
        scoring=scorer,
        n_jobs=4,
        n_iter=25,
        cv=folds,
        verbose=3,
        random_state=42,
    )
    search.fit(x, y)

    print("\n All results:")
    print(search.cv_results_)
    print("\n Best estimator:")
    print(search.best_estimator_)
    print(search.best_score_)
    print("\n Best hyperparameters:")
    print(search.best_params_)
    pd.DataFrame(search.cv_results_).to_csv(
        "xgb-random-grid-search-results-01.csv", index=False
    )

    test_pred = search.predict_proba(x_test)[:, 1]

    if output_file is not None:
        submit_fname = output_file
    else:
        logits_suffix = "_with_logits" if with_logits else ""
        submit_fname = os.path.join(
            output_dir,
            f"xgb_cls_gs_{search.best_score_:.4f}_{checksum}{logits_suffix}.csv",
        )

    # The first test CSV only supplies the image ids for the submission.
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)

    import json

    params_fname = fs.change_extension(submit_fname, ".json")
    with open(params_fname, "w") as params_file:
        json.dump(search.best_params_, params_file, indent=2)
def main():
    """Mean-blend the fold models and save the best-scoring variant(s).

    For every metric in the hard-coded list, the "d4" holdout, OOF and test
    prediction CSVs of ``experiments`` are loaded. Five mean blends are scored
    on the holdout set with ``alaska_weighted_auc``: binary, classifier, the
    ``*_calibrated`` versions of both, and the product of the two calibrated
    blends. Every variant whose holdout score equals the maximum is rebuilt
    from the test predictions and written as a CSV next to this script.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    scoring_fn = alaska_weighted_auc

    def blend_product(cls_predictions, bin_predictions):
        """Return the classifier blend with Label multiplied by the binary blend."""
        blended_cls = blend_predictions_mean(cls_predictions)
        blended_bin = blend_predictions_mean(bin_predictions)
        product = blended_cls.copy()
        product.Label = blended_cls.Label * blended_bin.Label
        return product

    # (file-name tag, builder(test, oof)) pairs, in the order variants are saved.
    test_builders = [
        ("bin", lambda test, oof: blend_predictions_mean(make_binary_predictions(test))),
        (
            "bin_cal",
            lambda test, oof: blend_predictions_mean(make_binary_predictions_calibrated(test, oof)),
        ),
        ("cls", lambda test, oof: blend_predictions_mean(make_classifier_predictions(test))),
        (
            "cls_cal",
            lambda test, oof: blend_predictions_mean(make_classifier_predictions_calibrated(test, oof)),
        ),
        (
            "prod_cal",
            lambda test, oof: blend_product(
                make_classifier_predictions_calibrated(test, oof),
                make_binary_predictions_calibrated(test, oof),
            ),
        ),
    ]

    for metric in [
        # "loss",
        # "bauc",
        "cauc"
    ]:
        holdout_predictions_d4 = get_predictions_csv(experiments, metric, "holdout", "d4")
        oof_predictions_d4 = get_predictions_csv(experiments, metric, "oof", "d4")
        test_predictions_d4 = get_predictions_csv(experiments, metric, "test", "d4")

        # The checksum covers the experiment names plus the metric; compute it once.
        checksum = compute_checksum_v2([x + f"{metric}" for x in experiments])

        bin_pred_d4 = make_binary_predictions(holdout_predictions_d4)
        # Ground truth is taken from the first experiment's holdout frame.
        y_true = bin_pred_d4[0].y_true_type.values

        bin_pred_d4_score = scoring_fn(y_true, blend_predictions_mean(bin_pred_d4).Label)

        cls_pred_d4 = make_classifier_predictions(holdout_predictions_d4)
        cls_pred_d4_score = scoring_fn(y_true, blend_predictions_mean(cls_pred_d4).Label)

        bin_pred_d4_cal = make_binary_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
        bin_pred_d4_cal_score = scoring_fn(y_true, blend_predictions_mean(bin_pred_d4_cal).Label)

        cls_pred_d4_cal = make_classifier_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
        cls_pred_d4_cal_score = scoring_fn(y_true, blend_predictions_mean(cls_pred_d4_cal).Label)

        prod_pred_d4_cal_score = scoring_fn(
            y_true, blend_predictions_mean(cls_pred_d4_cal).Label * blend_predictions_mean(bin_pred_d4_cal).Label
        )

        # NC = not calibrated, CL = calibrated. The labels now match the values
        # printed (the "Bin CL" and "Cls NC" labels used to be swapped).
        print(metric, "Bin NC", "d4", bin_pred_d4_score)
        print(metric, "Cls NC", "d4", cls_pred_d4_score)
        print(metric, "Bin CL", "d4", bin_pred_d4_cal_score)
        print(metric, "Cls CL", "d4", cls_pred_d4_cal_score)
        print(metric, "Prod  ", "d4", prod_pred_d4_cal_score)

        max_score = max(
            bin_pred_d4_score, cls_pred_d4_score, bin_pred_d4_cal_score, cls_pred_d4_cal_score, prod_pred_d4_cal_score
        )
        holdout_scores = {
            "bin": bin_pred_d4_score,
            "bin_cal": bin_pred_d4_cal_score,
            "cls": cls_pred_d4_score,
            "cls_cal": cls_pred_d4_cal_score,
            "prod_cal": prod_pred_d4_cal_score,
        }

        # Ties are intentional: every variant that reaches the maximum is saved.
        for tag, build in test_builders:
            if holdout_scores[tag] != max_score:
                continue
            predictions = build(test_predictions_d4, oof_predictions_d4)
            predictions.to_csv(
                os.path.join(output_dir, f"mean_{max_score:.4f}_{tag}_{checksum}.csv"),
                index=False,
            )
Ejemplo n.º 4
0
def main():
    """Benchmark LazyClassifier models on stacked holdout features, fold by fold.

    Stacking features come from the "cauc" / "d4" holdout predictions of the
    enabled experiments, with one-hot quality columns appended. For each of
    the five GroupKFold splits (grouped by image id) a ``LazyClassifier`` is
    fitted with ``alaska_weighted_auc`` as custom metric, and its model table
    and predictions are written as CSV files next to this script.
    """
    output_dir = os.path.dirname(__file__)

    # Commented-out names are experiments currently excluded from the stack.
    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    append_quality = True

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]

    holdout_quality = torch.tensor(holdout_ds.quality).long()
    quality_h = F.one_hot(holdout_quality, 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    test_quality = torch.tensor(test_ds.quality).long()
    quality_t = F.one_hot(test_quality, 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    # Test features are built and printed for parity with the sibling scripts;
    # the fold loop below never reads them.
    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    splitter = GroupKFold(n_splits=5)
    fold_splits = splitter.split(x, y, groups=image_ids)

    for fold_index, (train_index, valid_index) in enumerate(fold_splits):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]

        lazy = LazyClassifier(
            verbose=True,
            ignore_warnings=False,
            custom_metric=alaska_weighted_auc,
            predictions=True,
        )
        models, predictions = lazy.fit(x_train, x_valid, y_train, y_valid)
        print(models)

        models_fname = f"lazypredict_models_{fold_index}_{checksum}.csv"
        models.to_csv(os.path.join(output_dir, models_fname))

        preds_fname = f"lazypredict_preds_{fold_index}_{checksum}.csv"
        predictions.to_csv(os.path.join(output_dir, preds_fname))
Ejemplo n.º 5
0
def main():
    """Stack fold predictions with an XGBoost booster under grouped 5-fold CV.

    Features come from the "cauc" / "d4" holdout predictions of the enabled
    experiments, are standardized, and get one-hot quality columns appended.
    One booster is trained per GroupKFold split (grouped by image id), scored
    on its validation part with ``alaska_weighted_auc``, and its test
    predictions are averaged over the folds into a submission CSV written next
    to this script.
    """
    output_dir = os.path.dirname(__file__)

    # Commented-out names are experiments currently excluded from the stack.
    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    # Hard-coded experiment switches for the feature pipeline below.
    standardize = True
    reduce_with_pca = False
    append_quality = True

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    # The checksum covers the experiment names with the metric name appended.
    checksum = compute_checksum_v2([name + "cauc" for name in experiments])

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]
    print("Unique image ids", len(np.unique(image_ids)))
    holdout_quality = torch.tensor(holdout_ds.quality).long()
    quality_h = F.one_hot(holdout_quality, 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    test_quality = torch.tensor(test_ds.quality).long()
    quality_t = F.one_hot(test_quality, 3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if standardize:
        # Fit on holdout features only, then apply the same scaling to test.
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if reduce_with_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        # One-hot quality columns are appended after scaling, so stay unscaled.
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    test_dmatrix = xgb.DMatrix(x_test)

    splitter = GroupKFold(n_splits=5)
    fold_weight = 1.0 / splitter.n_splits
    cv_scores = []
    test_pred = None

    # NOTE(review): "learning_rate"/"eta" and "reg_lambda"/"lambda" look like
    # alias pairs given conflicting values - confirm which one XGBoost applies.
    # NOTE(review): "n_estimators" is presumably ignored by xgb.train, which
    # takes num_boost_round; the tree-specific keys may be unused with the
    # "gblinear" booster - verify against the XGBoost parameter docs.
    booster_params = {
        "base_score": 0.5,
        "booster": "gblinear",
        # "booster": "gbtree",
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 1,
        # "gamma": 1.0,
        "learning_rate": 0.01,
        "max_delta_step": 0,
        "objective": "binary:logistic",
        "eta": 0.1,
        "reg_lambda": 0,
        "subsample": 0.8,
        "scale_pos_weight": 1,
        "min_child_weight": 2,
        "max_depth": 5,
        "tree_method": "exact",
        "seed": 42,
        "alpha": 0.01,
        "lambda": 0.01,
        "n_estimators": 256,
        "gamma": 0.01,
        "disable_default_eval_metric": 1,
        # "eval_metric": "wauc",
    }

    fold_splits = splitter.split(x, y, groups=image_ids)
    for fold_index, (train_index, valid_index) in enumerate(fold_splits):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]

        train_dmatrix = xgb.DMatrix(x_train.copy(), y_train.copy())
        valid_dmatrix = xgb.DMatrix(x_valid.copy(), y_valid.copy())

        booster = xgb.train(
            booster_params,
            train_dmatrix,
            num_boost_round=5000,
            verbose_eval=True,
            feval=xgb_weighted_auc,
            maximize=True,
            evals=[(valid_dmatrix, "validation")],
        )

        valid_pred = booster.predict(valid_dmatrix)
        cv_scores.append(alaska_weighted_auc(y_valid, valid_pred))

        # Each fold contributes 1/n of the final test prediction.
        fold_test_pred = booster.predict(test_dmatrix) * fold_weight
        if test_pred is None:
            test_pred = fold_test_pred
        else:
            test_pred += fold_test_pred

    for fold_score in cv_scores:
        print(fold_score)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(
        output_dir, f"xgb_{np.mean(cv_scores):.4f}_{checksum}_.csv"
    )
    # The first test CSV only supplies the image ids for the submission.
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
Ejemplo n.º 6
0
def main():
    """Rank-blend the fold models and save the best-scoring variant(s).

    For every metric in the hard-coded list, the "d4" holdout, OOF and test
    prediction CSVs of ``experiments`` are loaded. The binary blend, the
    classifier blend and their product (all via ``blend_predictions_ranked``)
    are scored on the holdout set with ``alaska_weighted_auc``. The
    ``*_calibrated`` counterparts are scored only when ``use_calibration`` is
    enabled. Every enabled variant whose holdout score equals the maximum is
    rebuilt from the test predictions and written as a CSV next to this script.
    """
    output_dir = os.path.dirname(__file__)

    # Commented-out names are experiments currently excluded from the blend.
    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    scoring_fn = alaska_weighted_auc
    # Calibrated variants are switched off; they are printed with score 0 and
    # excluded from selection.
    use_calibration = False

    def ranked_product(cls_predictions, bin_predictions):
        """Return the classifier blend with Label multiplied by the binary blend."""
        blended_cls = blend_predictions_ranked(cls_predictions)
        blended_bin = blend_predictions_ranked(bin_predictions)
        product = blended_cls.copy()
        product.Label = blended_cls.Label * blended_bin.Label
        return product

    # (file-name tag, builder(test, oof)) pairs, in the order variants are saved.
    test_builders = [
        ("bin", lambda test, oof: blend_predictions_ranked(make_binary_predictions(test))),
        (
            "bin_cal",
            lambda test, oof: blend_predictions_ranked(make_binary_predictions_calibrated(test, oof)),
        ),
        ("cls", lambda test, oof: blend_predictions_ranked(make_classifier_predictions(test))),
        (
            "cls_cal",
            lambda test, oof: blend_predictions_ranked(make_classifier_predictions_calibrated(test, oof)),
        ),
        (
            "prod",
            lambda test, oof: ranked_product(
                make_classifier_predictions(test),
                make_binary_predictions(test),
            ),
        ),
        (
            "prod_cal",
            lambda test, oof: ranked_product(
                make_classifier_predictions_calibrated(test, oof),
                make_binary_predictions_calibrated(test, oof),
            ),
        ),
    ]

    for metric in [
        # "loss",
        # "bauc",
        "cauc"
    ]:
        holdout_predictions_d4 = get_predictions_csv(experiments, metric, "holdout", "d4")
        oof_predictions_d4 = get_predictions_csv(experiments, metric, "oof", "d4")
        test_predictions_d4 = get_predictions_csv(experiments, metric, "test", "d4")

        # The checksum covers the experiment names plus the metric; compute it once.
        checksum = compute_checksum_v2([x + f"{metric}" for x in experiments])

        bin_pred_d4 = make_binary_predictions(holdout_predictions_d4)
        # Ground truth is taken from the first experiment's holdout frame.
        y_true = bin_pred_d4[0].y_true_type.values

        # Holdout scores of the enabled variants, keyed by file-name tag.
        scores = {}
        scores["bin"] = scoring_fn(y_true, blend_predictions_ranked(bin_pred_d4).Label)

        cls_pred_d4 = make_classifier_predictions(holdout_predictions_d4)
        scores["cls"] = scoring_fn(y_true, blend_predictions_ranked(cls_pred_d4).Label)

        scores["prod"] = scoring_fn(
            y_true, blend_predictions_ranked(cls_pred_d4).Label * blend_predictions_ranked(bin_pred_d4).Label
        )

        if use_calibration:
            bin_pred_d4_cal = make_binary_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
            scores["bin_cal"] = scoring_fn(y_true, blend_predictions_ranked(bin_pred_d4_cal).Label)

            cls_pred_d4_cal = make_classifier_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
            scores["cls_cal"] = scoring_fn(y_true, blend_predictions_ranked(cls_pred_d4_cal).Label)

            scores["prod_cal"] = scoring_fn(
                y_true,
                blend_predictions_ranked(cls_pred_d4_cal).Label * blend_predictions_ranked(bin_pred_d4_cal).Label,
            )

        # NC = not calibrated, CL = calibrated (0 when calibration is disabled).
        print(metric, "Bin  NC", "d4", scores["bin"])
        print(metric, "Cls  NC", "d4", scores["cls"])
        print(metric, "Prod NC", "d4", scores["prod"])
        print(metric, "Bin  CL", "d4", scores.get("bin_cal", 0))
        print(metric, "Cls  CL", "d4", scores.get("cls_cal", 0))
        print(metric, "Prod CL", "d4", scores.get("prod_cal", 0))

        max_score = max(scores.values())

        # Ties are intentional: every enabled variant at the maximum is saved.
        for tag, build in test_builders:
            if tag not in scores or scores[tag] != max_score:
                continue
            predictions = build(test_predictions_d4, oof_predictions_d4)
            predictions.to_csv(
                os.path.join(output_dir, f"rank_{max_score:.4f}_{tag}_{checksum}.csv"),
                index=False,
            )
def main():
    """Stack fold predictions with shrinkage LDA under grouped 5-fold CV.

    Features come from the "cauc" / "d4" holdout predictions of the enabled
    experiments, with one-hot quality columns appended. One
    ``LinearDiscriminantAnalysis`` model is fitted per GroupKFold split
    (grouped by image id) and scored on its validation part with
    ``alaska_weighted_auc``; the class-1 test probabilities are averaged over
    the folds into a submission CSV written next to this script.
    """
    output_dir = os.path.dirname(__file__)

    # Commented-out names are experiments currently excluded from the stack.
    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    # Hard-coded experiment switches for the feature pipeline below.
    standardize = False
    reduce_with_pca = False
    append_quality = True

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]

    holdout_quality = torch.tensor(holdout_ds.quality).long()
    quality_h = F.one_hot(holdout_quality, 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    test_quality = torch.tensor(test_ds.quality).long()
    quality_t = F.one_hot(test_quality, 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    if standardize:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if reduce_with_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    splitter = GroupKFold(n_splits=5)
    fold_weight = 1.0 / splitter.n_splits
    cv_scores = []
    test_pred = None

    for train_index, valid_index in splitter.split(x, y, groups=image_ids):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]
        print(np.bincount(y_train), np.bincount(y_valid))

        # NOTE(review): priors=[0.5, 0.5] presumes y holds exactly two classes;
        # y is not binarized here - confirm what get_x_y_for_stacking returns.
        lda = LinearDiscriminantAnalysis(
            solver="lsqr",
            shrinkage="auto",
            priors=[0.5, 0.5],
        )
        lda.fit(x_train, y_train)

        valid_proba = lda.predict_proba(x_valid)[:, 1]
        cv_scores.append(alaska_weighted_auc(y_valid, valid_proba))

        # Each fold contributes 1/n of the final test prediction.
        fold_test_proba = lda.predict_proba(x_test)[:, 1] * fold_weight
        if test_pred is None:
            test_pred = fold_test_proba
        else:
            test_pred += fold_test_proba

    for fold_score in cv_scores:
        print(fold_score)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(
        output_dir, f"lda_{np.mean(cv_scores):.4f}_{checksum}.csv"
    )
    # The first test CSV only supplies the image ids for the submission.
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
Ejemplo n.º 8
0
def _load_embeddings(experiments, split):
    """Load embedding-only stacking features for one data split.

    Args:
        experiments: Experiment names forwarded to ``get_predictions_csv``.
        split: Split name forwarded to ``get_predictions_csv``
            (``"train"``, ``"test"`` or ``"holdout"`` in this script).

    Returns:
        The ``(x, y)`` pair produced by ``get_x_y_for_stacking`` with only
        the embedding features enabled.
    """
    predictions = get_predictions_csv(experiments, "cauc", split, tta="d4", need_embedding=True)
    return get_x_y_for_stacking(
        predictions,
        with_embeddings=True,
        with_logits=False,
        with_probas=False,
        tta_probas=False,
        tta_logits=False,
    )


def main():
    """Dump stacking embeddings of the selected experiments to ``.npy`` files.

    The files are written with relative names (``embeddings_<x|y>_<split>_<checksum>.npy``),
    i.e. into the current working directory. Which splits are exported is
    controlled by the ``export_*`` switches below.
    """
    experiments = [
        #
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        #
        "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
    ]

    # The checksum ties every output file name to this exact experiment list.
    checksum = compute_checksum_v2(experiments)

    # Per-split switches: flip to True to (re)generate the corresponding files.
    export_train = False
    export_test = False
    export_holdout = True

    if export_train:
        x, y = _load_embeddings(experiments, "train")
        print("Train", x.shape, y.shape)
        np.save(f"embeddings_x_train_{checksum}.npy", x)
        np.save(f"embeddings_y_train_{checksum}.npy", y)
        # Drop the references before the next split is loaded.
        del x, y

    if export_test:
        # The second return value is ignored for the test split.
        x_test, _ = _load_embeddings(experiments, "test")
        print("Test", x_test.shape)
        np.save(f"embeddings_x_test_{checksum}.npy", x_test)
        del x_test

    if export_holdout:
        x_hld, y_hld = _load_embeddings(experiments, "holdout")
        print("Holdout", x_hld.shape)
        np.save(f"embeddings_x_holdout_{checksum}.npy", x_hld)
        np.save(f"embeddings_y_holdout_{checksum}.npy", y_hld)
        del x_hld, y_hld
def main():
    """Randomized LightGBM hyper-parameter search on stacked holdout predictions.

    Stacking features are built from the holdout/test prediction files of the
    listed experiments (logits and TTA logits enabled), standardized, and
    extended with a 3-way one-hot of each dataset's ``quality`` field.
    Cross-validation folds are grouped by image id. The fitted search object
    then predicts the test set; the submission CSV is written next to this
    script and the full search table to the current working directory.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        #
        "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    # Holdout side: image ids drive the CV grouping, quality becomes a feature.
    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]
    holdout_quality = torch.tensor(holdout_ds.quality).long()
    quality_h = F.one_hot(holdout_quality, 3).numpy().astype(np.float32)

    # Test side: only the quality feature is needed.
    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    test_quality = torch.tensor(test_ds.quality).long()
    quality_t = F.one_hot(test_quality, 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    # Feature-pipeline switches (same configuration as before: scale, no PCA, add quality).
    standardize = True
    reduce_with_pca = False
    append_quality = True

    if standardize:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if reduce_with_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "boosting_type": ["gbdt", "dart", "rf", "goss"],
        "num_leaves": [16, 32, 64, 128],
        "reg_alpha": [0, 0.01, 0.1, 0.5],
        "reg_lambda": [0, 0.01, 0.1, 0.5],
        "learning_rate": [0.001, 0.01, 0.1, 0.5],
        "n_estimators": [32, 64, 126, 512],
        "max_depth": [2, 4, 8],
        "min_child_samples": [20, 40, 80, 100],
    }

    lgb_estimator = lgb.LGBMClassifier(objective="binary", silent=True)
    wauc_scorer = make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True)
    grouped_folds = group_kfold.split(x, y, groups=image_ids)

    random_search = RandomizedSearchCV(
        lgb_estimator,
        param_distributions=params,
        scoring=wauc_scorer,
        n_jobs=3,
        n_iter=50,
        cv=grouped_folds,
        verbose=2,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    test_pred = random_search.predict_proba(x_test)[:, 1]
    print(test_pred)

    # Submission: ids come from the first test prediction file.
    submit_fname = os.path.join(output_dir, f"lgbm_gs_{random_search.best_score_:.4f}_{checksum}.csv")
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)

    search_table = pd.DataFrame(random_search.cv_results_)
    search_table.to_csv("lgbm-random-grid-search-results-01.csv", index=False)
def main():
    """Inspect test-set class predictions and holdout score stability.

    First averages the per-experiment class probabilities parsed from the
    ``pred_modification_type`` column of every test prediction file, prints
    the arg-max class counts (raw, and with column 0 zeroed wherever it is
    below 0.5), and plots per-class probability histograms. Then blends the
    holdout "product" predictions and plots the score distribution returned by
    ``evaluate_wauc_shakeup_using_bagging``.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # A models trained on old folds without holdout, so it will have a leak if evaluated.
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        # #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        # #
        # # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        # #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    # ---- Test set: averaged class probabilities -------------------------
    test_predictions_d4 = get_predictions_csv(experiments, "cauc", "test", "d4")
    per_experiment_probas = [
        pd.read_csv(csv_path)
        .rename(columns={"image_id": "Id"})["pred_modification_type"]
        .apply(parse_and_softmax)
        .tolist()
        for csv_path in test_predictions_d4
    ]
    classes = np.mean(per_experiment_probas, axis=0)
    print("Class distribution", np.bincount(classes.argmax(axis=1)))

    # Row 0 holds p(column 0), row 1 its complement; arg-max 1 means p < 0.5.
    first_column = classes[:, 0]
    bin_classes = np.stack([first_column, 1 - first_column]).argmax(axis=0)

    # Zero column 0 for those samples so the arg-max must pick another class.
    classes_cp = classes.copy()
    classes_cp[bin_classes == 1, 0] = 0
    print("Class distribution", np.bincount(classes_cp.argmax(axis=1)))

    plt.figure()
    for column, label in enumerate(("Cover", "JMiPOD", "JUNIWARD", "UERD")):
        plt.hist(classes[:, column], bins=100, alpha=0.25, label=label)
    plt.yscale("log")
    plt.legend()
    plt.show()

    # ---- Holdout: score distribution of the blended predictions ----------
    holdout_csvs = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    holdout_products = make_product_predictions(holdout_csvs)
    y_true_type = holdout_products[0].y_true_type

    holdout_blend = blend_predictions_mean(holdout_products)
    scores = evaluate_wauc_shakeup_using_bagging(holdout_blend, y_true_type, 10000)

    score_mean = np.mean(scores)
    score_std = np.std(scores)

    plt.figure()
    plt.hist(scores, bins=100, alpha=0.5, label=f"{score_mean:.5f} +- {score_std:.6f}")
    plt.legend()
    plt.show()

    print(score_mean, score_std)
def main():
    """Grouped 5-fold stacking with a fixed-parameter XGBoost classifier.

    Builds stacking features from holdout/test prediction files (logits and
    TTA logits enabled), standardizes them, appends a 3-way one-hot of the
    ``quality`` field, and trains one ``XGBClassifier`` per ``GroupKFold``
    split (groups are image ids). Each fold model scores its validation part
    and contributes ``1 / n_splits`` of its test probabilities to the final
    prediction, which is written as a submission CSV next to this script.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        #
        "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]
    holdout_quality = torch.tensor(holdout_ds.quality).long()
    quality_h = F.one_hot(holdout_quality, 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    test_quality = torch.tensor(test_ds.quality).long()
    quality_t = F.one_hot(test_quality, 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    # Feature-pipeline switches (same configuration as before: scale, no PCA, add quality).
    standardize = True
    reduce_with_pca = False
    append_quality = True

    if standardize:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if reduce_with_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    # Identical settings for every fold model.
    xgb_params = dict(
        base_score=0.5,
        booster="gbtree",
        colsample_bylevel=1,
        colsample_bynode=1,
        colsample_bytree=0.6,
        gamma=0.5,
        gpu_id=-1,
        importance_type="gain",
        interaction_constraints="",
        learning_rate=0.01,
        max_delta_step=0,
        max_depth=3,
        min_child_weight=10,
        # missing=nan,
        monotone_constraints="()",
        n_estimators=1000,
        n_jobs=8,
        nthread=1,
        num_parallel_tree=1,
        objective="binary:logistic",
        random_state=0,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        silent=True,
        subsample=0.8,
        tree_method="exact",
        validate_parameters=1,
        verbosity=2,
    )

    group_kfold = GroupKFold(n_splits=5)
    one_over_n = 1.0 / group_kfold.n_splits
    cv_scores = []
    test_pred = None

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]
        print(np.bincount(y_train), np.bincount(y_valid))

        cls = XGBClassifier(**xgb_params)
        cls.fit(x_train, y_train)

        y_valid_pred = cls.predict_proba(x_valid)[:, 1]
        cv_scores.append(alaska_weighted_auc(y_valid, y_valid_pred))

        # Running equal-weight average of the fold models' test probabilities.
        fold_test_pred = cls.predict_proba(x_test)[:, 1] * one_over_n
        test_pred = fold_test_pred if test_pred is None else test_pred + fold_test_pred

    for fold_score in cv_scores:
        print(fold_score)
    mean_score = np.mean(cv_scores)
    print(mean_score, np.std(cv_scores))

    with_logits_sfx = "_with_logits" if with_logits else ""
    submit_fname = os.path.join(output_dir, f"xgb_cls_{mean_score:.4f}_{checksum}{with_logits_sfx}.csv")

    # Submission: ids come from the first test prediction file.
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def main():
    """Randomized CatBoost hyper-parameter search on stacked holdout predictions.

    Stacking features are built from the holdout/test prediction files of the
    listed experiments (default ``get_x_y_for_stacking`` options), standardized,
    and extended with a 3-way one-hot of each dataset's ``quality`` field.
    The search uses two image-id-grouped folds. The fitted search object then
    predicts the test set and the submission CSV is written next to this script.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        # "H_" prefix added so this run is named like every other listed experiment.
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    # The checksum covers experiment name + metric, so it changes with either.
    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    # Holdout side: image ids drive the CV grouping, quality becomes a feature.
    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions)
    print(x_test.shape)

    # Feature-pipeline switches.
    standardize = True
    reduce_with_pca = False
    append_quality = True

    if standardize:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if reduce_with_pca:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=2)

    params = {
        "depth": [3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
        # "iterations": [250, 100, 500, 1000],
        "learning_rate": [0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
        "l2_leaf_reg": [3, 1, 5, 10, 100],
    }

    catboost_estimator = cat.CatBoostClassifier(
        verbose=True,
        iterations=2500,
        # use_best_model=True, eval_metric="AUC",
        task_type="GPU",
    )

    random_search = RandomizedSearchCV(
        catboost_estimator,
        param_distributions=params,
        n_iter=10,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        cv=group_kfold.split(x, y, groups=image_ids),
        verbose=3,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    test_pred = random_search.predict_proba(x_test)[:, 1]
    print(test_pred)

    # Submission: ids come from the first test prediction file.
    submit_fname = os.path.join(output_dir, f"catboost_gs_{random_search.best_score_:.4f}_{checksum}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved predictions to ", submit_fname)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)
def _oof_mean_scores(y_true_per_model, y_pred_per_model):
    """Average each score function over per-model out-of-fold predictions.

    Args:
        y_true_per_model: List with one ground-truth column per model.
        y_pred_per_model: List with the matching prediction column per model.

    Returns:
        Three means, in the order ``alaska_weighted_auc``, ``shaky_wauc``,
        ``shaky_wauc_public``.
    """
    return [
        np.mean([score_fn(y_true, y_pred) for y_true, y_pred in zip(y_true_per_model, y_pred_per_model)])
        for score_fn in (alaska_weighted_auc, shaky_wauc, shaky_wauc_public)
    ]


def _holdout_scores(y_true, y_pred):
    """Score one blended holdout prediction with the same three functions, same order."""
    return [score_fn(y_true, y_pred) for score_fn in (alaska_weighted_auc, shaky_wauc, shaky_wauc_public)]


def main():
    """Print a table of out-of-fold and holdout scores for several prediction flavours.

    For every metric in the loop below, the rows are: binary ("Bin") and
    classifier ("Cls") predictions, each non-calibrated ("NC") and calibrated
    ("CL"), plus the element-wise product of binary and classifier predictions
    ("Prd"). OOF columns are means over the individual experiments; holdout
    columns score the mean blend of all experiments.
    """
    experiments = [
        "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    six_columns = "{:.6f}\t{:.6f}\t{:.6f}\t{:.6f}\t{:.6f}\t{:.6f}"
    # Left padding keeps the three holdout values under the HLD headers.
    holdout_columns = "                                    {:.6f}\t{:.6f}\t{:.6f}"

    for metric in [
        # "loss",
        # "bauc",
        "cauc"
    ]:
        holdout_predictions_d4 = get_predictions_csv(experiments, metric, "holdout", "d4")
        oof_predictions_d4 = get_predictions_csv(experiments, metric, "oof", "d4")
        # Result is not used below; the call is kept in case the lookup itself
        # matters (e.g. failing early on missing test files) - TODO confirm.
        get_predictions_csv(experiments, metric, "test", "d4")

        hld_bin_pred_d4 = make_binary_predictions(holdout_predictions_d4)
        hld_y_true = hld_bin_pred_d4[0].y_true_type.values

        oof_bin_pred_d4 = make_binary_predictions(oof_predictions_d4)

        hld_cls_pred_d4 = make_classifier_predictions(holdout_predictions_d4)
        oof_cls_pred_d4 = make_classifier_predictions(oof_predictions_d4)

        bin_pred_d4_cal = make_binary_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
        cls_pred_d4_cal = make_classifier_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)

        # Blend every holdout flavour once and reuse it for all three score
        # columns instead of re-blending for each of them.
        hld_bin_blend = blend_predictions_mean(hld_bin_pred_d4).Label
        hld_cls_blend = blend_predictions_mean(hld_cls_pred_d4).Label
        hld_bin_cal_blend = blend_predictions_mean(bin_pred_d4_cal).Label
        hld_cls_cal_blend = blend_predictions_mean(cls_pred_d4_cal).Label
        hld_prd_blend = hld_bin_cal_blend * hld_cls_cal_blend

        # Per-experiment OOF inputs; binary and classifier frames are paired by position.
        oof_bin_true = [frame.y_true_type for frame in oof_bin_pred_d4]
        oof_bin_labels = [frame.Label for frame in oof_bin_pred_d4]
        oof_cls_true = [frame.y_true_type for frame in oof_cls_pred_d4]
        oof_cls_labels = [frame.Label for frame in oof_cls_pred_d4]
        oof_prd_pairs = list(zip(oof_bin_pred_d4, oof_cls_pred_d4))
        oof_prd_true = [bin_frame.y_true_type for bin_frame, _ in oof_prd_pairs]
        oof_prd_labels = [bin_frame.Label * cls_frame.Label for bin_frame, cls_frame in oof_prd_pairs]

        print(
            "   ", "      ", "  ", "   OOF", "     OOF 5K", "     OOF 1K", "        HLD", "     HLD 5K", "     HLD 1K"
        )
        print(
            metric,
            "Bin NC",
            six_columns.format(
                *_oof_mean_scores(oof_bin_true, oof_bin_labels),
                *_holdout_scores(hld_y_true, hld_bin_blend),
            ),
        )
        print(
            metric,
            "Cls NC",
            six_columns.format(
                *_oof_mean_scores(oof_cls_true, oof_cls_labels),
                *_holdout_scores(hld_y_true, hld_cls_blend),
            ),
        )
        print(metric, "Bin CL", holdout_columns.format(*_holdout_scores(hld_y_true, hld_bin_cal_blend)))
        print(metric, "Cls CL", holdout_columns.format(*_holdout_scores(hld_y_true, hld_cls_cal_blend)))
        # NOTE(review): this row is labelled "NC", yet its holdout columns are
        # built from the *calibrated* blends while its OOF columns use the
        # non-calibrated predictions. Kept as-is; confirm which was intended.
        print(
            metric,
            "Prd NC",
            six_columns.format(
                *_oof_mean_scores(oof_prd_true, oof_prd_labels),
                *_holdout_scores(hld_y_true, hld_prd_blend),
            ),
        )
Ejemplo n.º 14
0
def _mean_of_rows(rows):
    """Blend prediction rows by plain averaging (one row per model)."""
    return rows.mean(axis=0)


def _mean_of_row_ranks(rows):
    """Blend prediction rows by averaging each row's within-row ranks."""
    return rankdata(rows, axis=1).mean(axis=0)


def _rank_blend_frames(frames):
    """Blend prediction frames by mean of ranks, mirroring ``_mean_of_row_ranks``.

    ``blend_predictions_mean`` is used only to obtain a frame with the usual
    output layout; its ``Label`` column is then replaced by the rank blend.
    Assumes every frame exposes a ``Label`` column, as the holdout frames
    produced by the same factory functions do - TODO confirm for test frames.
    """
    blended = blend_predictions_mean(frames)
    labels = np.array([frame.Label.values for frame in frames])
    blended["Label"] = _mean_of_row_ranks(labels)
    return blended


def _best_combination(X, y_true, indices, r, blend_rows):
    """Exhaustively find the ``r``-subset of rows of ``X`` with the best blended score.

    Args:
        X: 2-D array, one row of holdout predictions per model.
        y_true: Ground truth passed to ``alaska_weighted_auc``.
        indices: Candidate row indices of ``X``.
        r: Subset size.
        blend_rows: Callable reducing the selected rows to one prediction vector.

    Returns:
        ``(best_auc, best_comb)``; ``best_comb`` is ``None`` if no subset
        scored above 0.
    """
    best_comb = None
    best_auc = 0
    # Materialized so that tqdm knows the total and can show progress.
    combs = list(itertools.combinations(indices, r))

    for c in tqdm(combs, desc=f"{r}"):
        score = alaska_weighted_auc(y_true, blend_rows(X[np.array(c)]))
        if score > best_auc:
            best_auc = score
            best_comb = c

    return best_auc, best_comb


def main():
    """Brute-force the best model subsets for mean and rank blending.

    Binary and classifier predictions of every experiment are treated as
    separate candidate models. For each subset size from 2 to 7 the subset
    with the highest holdout score is selected - once for mean blending, once
    for mean-of-ranks blending - and the matching test blend is written next
    to this script as ``cmb_mean_*.csv`` / ``cmb_rank_*.csv``.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        #
        "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
        #
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")

    # One checksum name per candidate row, in the same order as the rows of X:
    # all binary heads first, then all classifier heads.
    fnames_for_checksum = np.array(
        [x + "cauc_bin" for x in experiments]
        # + [x + "loss_bin" for x in experiments]
        + [x + "cauc_cls" for x in experiments]
        # + [x + "loss_cls" for x in experiments]
    )

    X = make_binary_predictions(holdout_predictions) + make_classifier_predictions(holdout_predictions)
    y_true = X[0].y_true_type.values
    X = np.array([x.Label.values for x in X])

    assert len(fnames_for_checksum) == X.shape[0]

    X_test = make_binary_predictions(test_predictions) + make_classifier_predictions(test_predictions)

    indices = np.arange(len(X))

    # Each strategy pairs the holdout blend used for selection with the test
    # blend that is written out, so the saved file uses the blending scheme
    # its subset was chosen for (the rank pass previously saved a mean blend).
    strategies = (
        ("cmb_mean", _mean_of_rows, blend_predictions_mean),
        ("cmb_rank", _mean_of_row_ranks, _rank_blend_frames),
    )

    for prefix, blend_rows, blend_frames in strategies:
        for r in range(2, 8):
            best_auc, best_comb = _best_combination(X, y_true, indices, r, blend_rows)
            print(r, best_auc, best_comb)

            checksum = compute_checksum_v2(fnames_for_checksum[np.array(best_comb)])

            test_preds = blend_frames([X_test[i] for i in best_comb])
            test_preds.to_csv(
                os.path.join(output_dir, f"{prefix}_{best_auc:.4f}_{r}_{checksum}.csv"), index=False
            )
Ejemplo n.º 15
0
def main():
    """Randomized XGBoost hyper-parameter search on stacked model embeddings.

    Embedding features of the listed experiments (no TTA) are reduced to 512
    PCA components and extended with a 3-way one-hot of each dataset's
    ``quality`` field. Cross-validation folds are grouped by image id. The
    search table goes to the current working directory; the test-set
    submission produced by the fitted search object is written next to this
    script.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", tta=None, need_embedding=True)
    test_predictions = get_predictions_csv(experiments, "cauc", "test", tta=None, need_embedding=True)
    checksum = compute_checksum_v2(experiments)

    # Holdout side: image ids drive the CV grouping, quality becomes a feature.
    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_embedding_for_stacking(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_embedding_for_stacking(test_predictions)
    print(x_test.shape)

    # Feature-pipeline switches.
    standardize = False
    reduce_with_pca = True
    append_quality = True

    if standardize:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if reduce_with_pca:
        sc = PCA(n_components=512)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "min_child_weight": [1, 5, 10],
        # Every candidate value appears once: a repeated entry would let the
        # sampler draw the same grid point more than once.
        "gamma": [1e-3, 1e-2, 1e-1, 0.5, 2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "max_depth": [2, 3, 4, 5, 6],
        "n_estimators": [16, 32, 64, 128, 256, 1000],
        "learning_rate": [0.001, 0.01, 0.05, 0.2, 1],
    }

    xgb_estimator = XGBClassifier(objective="binary:logistic", nthread=1)

    random_search = RandomizedSearchCV(
        xgb_estimator,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        n_jobs=4,
        n_iter=25,
        cv=group_kfold.split(x, y, groups=image_ids),
        verbose=3,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)
    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("xgb-embedding-random-grid-search-results-01.csv", index=False)

    test_pred = random_search.predict_proba(x_test)[:, 1]

    # Submission: ids come from the first test prediction file.
    submit_fname = os.path.join(output_dir, f"xgb_cls_emb_gs_{random_search.best_score_:.4f}_{checksum}_.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def main():
    """Train a stacked ensemble on holdout predictions and write a submission.

    Steps:
      1. Load per-experiment prediction CSVs for the holdout and test sets and
         turn them into feature matrices via ``get_x_y``.
      2. Standardize the features and append a one-hot encoding of the
         dataset's ``quality`` attribute (3 classes).
      3. Estimate the quality of every base classifier and of the stack with a
         grouped 5-fold cross-validation, so rows sharing an image id never
         end up on both sides of a split.
      4. Refit the stack on all holdout rows, predict the test set and save
         the result next to this file as
         ``stacking_<mean stack CV AUC>_<checksum>.csv``.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        #
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    # Feature toggles for the preprocessing pipeline below.
    use_standard_scaler = True
    use_pca = False
    append_quality = True

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout",
                                              "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    # The checksum ends up in the output file name.
    checksum = compute_checksum_v2([name + "cauc" for name in experiments])

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    # Group labels for the splitters; stored as an array so that a fold's
    # subset can be taken with an index array.
    image_ids = np.asarray(
        [fs.id_from_fname(fname) for fname in holdout_ds.images])

    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(),
                          3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(),
                          3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    # Positional indexing with fold indices below requires a plain array.
    y = np.asarray(y)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if use_standard_scaler:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if use_pca:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    # Read the submission template early so a missing file fails before the
    # expensive training starts.
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})

    classifier1 = LGBMClassifier()
    classifier2 = CatBoostClassifier()
    classifier3 = LogisticRegression()
    classifier4 = CalibratedClassifierCV()
    classifier5 = LinearDiscriminantAnalysis()

    sclf = StackingCVClassifier(
        classifiers=[
            classifier1, classifier2, classifier3, classifier4, classifier5
        ],
        shuffle=False,
        use_probas=True,
        # An integer ``cv`` builds a splitter that ignores ``groups``; a
        # GroupKFold is required for the ``groups`` passed to fit() to have
        # any effect on the out-of-fold meta-features.
        cv=GroupKFold(n_splits=4),
        # meta_classifier=SVC(degree=2, probability=True),
        meta_classifier=LogisticRegression(solver="lbfgs"),
    )

    classifiers = {
        "LGBMClassifier": classifier1,
        "CatBoostClassifier": classifier2,
        "LogisticRegression": classifier3,
        "CalibratedClassifierCV": classifier4,
        "LinearDiscriminantAnalysis": classifier5,
        "Stack": sclf,
    }

    # Grouped cross-validation: gives every model a validation split it was
    # not trained on and collects the stack's score for the file name.
    group_kfold = GroupKFold(n_splits=5)
    auc_cv = []

    splits = group_kfold.split(x, y, groups=image_ids)
    for fold, (train_index, valid_index) in enumerate(splits):
        x_train, x_valid = x[train_index], x[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        for key, classifier in classifiers.items():
            if classifier is sclf:
                # The stack works on clones of the base classifiers, so it is
                # independent of the explicit fits done for them in this loop.
                classifier.fit(x_train, y_train,
                               groups=image_ids[train_index])
            else:
                classifier.fit(x_train, y_train)

            y_pred = classifier.predict_proba(x_valid)[:, 1]
            score = alaska_weighted_auc(y_valid, y_pred)
            print(fold, key, score)

            if classifier is sclf:
                auc_cv.append(score)

    print("Stack", np.mean(auc_cv), np.std(auc_cv))

    # Final model: refit the stack on every holdout row, then predict test.
    sclf.fit(x, y, groups=image_ids)
    y_test = sclf.predict_proba(x_test)[:, 1]

    df["Label"] = y_test
    df.to_csv(os.path.join(output_dir,
                           f"stacking_{np.mean(auc_cv):.4f}_{checksum}.csv"),
              index=False)