def main():
    """Blend model predictions with AUC-optimised weights and save a submission.

    Holdout predictions of the selected experiments are loaded as a matrix
    ``X`` (one column per blended input, as implied by ``X.shape[1]``) with
    targets ``y``.  A coefficient vector is optimised with Nelder-Mead to
    minimise ``_auc_loss``; the softmax of the optimum is used as the blending
    weights for both the holdout score and the test predictions.  The
    submission is written next to this file as
    ``wmean_<auc>_<checksum>.csv``.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        "May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        "May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        "May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        # "Jun02_12_26_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        # "Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        "Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        # "Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum(test_predictions)

    X, y = get_x_y(holdout_predictions)
    print(X.shape, y.shape)

    X_public_lb, _ = get_x_y(test_predictions)
    print(X_public_lb.shape)

    loss_partial = partial(_auc_loss, X=X, y=y)
    # Start from uniform coefficients, one per column of X.
    initial_coef = np.ones(X.shape[1]) / X.shape[1]

    result = sp.optimize.minimize(
        loss_partial,
        initial_coef,
        bounds=Bounds(0, 1),
        method="nelder-mead",
        # Nelder-Mead has no "gtol"/"maxfun" options (SciPy only warns about
        # unknown solver options and ignores them); the function-evaluation
        # cap for this solver is called "maxfev".
        options={"maxiter": 5000, "disp": True, "maxfev": 99999},
        tol=1e-6,
    )
    print(result)

    # _auc_loss applies softmax to the raw coefficients, so the effective
    # blending weights are the softmax of the optimiser's solution.
    best_coef = softmax(result.x)
    print(best_coef)

    x_pred = (np.expand_dims(best_coef, 0) * X).sum(axis=1)
    auc = alaska_weighted_auc(y, x_pred)
    print(auc)

    x_test_pred = (np.expand_dims(best_coef, 0) * X_public_lb).sum(axis=1)

    submit_fname = os.path.join(output_dir, f"wmean_{np.mean(auc):.4f}_{checksum}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = x_test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def _auc_loss(coef, X, y):
    """Return ``1 - alaska_weighted_auc`` of the softmax-weighted blend of ``X``.

    ``coef`` is passed through ``softmax`` before use; the blend is the sum
    over axis 1 of ``X`` scaled by those weights.
    """
    weights = softmax(coef)
    weighted_columns = np.expand_dims(weights, axis=0) * X
    blended = weighted_columns.sum(axis=1)
    return 1 - alaska_weighted_auc(y, blended)
def xgb_weighted_auc(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Evaluation callback returning ``("wauc", score)`` for a DMatrix.

    Labels are read from ``dtrain`` and cast to ``int`` before being scored
    against ``predt`` with ``alaska_weighted_auc``.
    """
    labels = dtrain.get_label().astype(int)
    score = alaska_weighted_auc(labels, predt)
    return "wauc", score
def main():
    """Stack holdout predictions with ``xgb.train`` and write a submission CSV.

    Features from ``get_x_y`` are standardised, extended with a 3-way one-hot
    encoding of the dataset ``quality`` attribute, and used to train one
    booster per ``GroupKFold`` split (grouped by image id).  Each fold is
    scored with ``alaska_weighted_auc`` on its validation part, and the test
    predictions of all folds are averaged.  The result is saved next to this
    file as ``xgb_<mean cv score>_<checksum>_.csv``.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2([name + "cauc" for name in experiments])

    # Image ids are the CV groups; quality is appended as one-hot features.
    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]
    print("Unique image ids", len(np.unique(image_ids)))
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    # Preprocessing toggles (edit in place to experiment).
    use_standard_scaler = True
    use_pca = False
    append_quality = True

    if use_standard_scaler:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if use_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    test_dmatrix = xgb.DMatrix(x_test)

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    # NOTE(review): "learning_rate"/"eta" and "reg_lambda"/"lambda" are alias
    # pairs in the XGBoost parameter docs and are given different values
    # here; confirm which value is actually intended.
    params = {
        "base_score": 0.5,
        "booster": "gblinear",
        # "booster": "gbtree",
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 1,
        # "gamma": 1.0,
        "learning_rate": 0.01,
        "max_delta_step": 0,
        "objective": "binary:logistic",
        "eta": 0.1,
        "reg_lambda": 0,
        "subsample": 0.8,
        "scale_pos_weight": 1,
        "min_child_weight": 2,
        "max_depth": 5,
        "tree_method": "exact",
        "seed": 42,
        "alpha": 0.01,
        "lambda": 0.01,
        "n_estimators": 256,
        "gamma": 0.01,
        "disable_default_eval_metric": 1,
        # "eval_metric": "wauc",
    }

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]

        train_dmatrix = xgb.DMatrix(x_train.copy(), y_train.copy())
        valid_dmatrix = xgb.DMatrix(x_valid.copy(), y_valid.copy())

        booster = xgb.train(
            params,
            train_dmatrix,
            num_boost_round=5000,
            evals=[(valid_dmatrix, "validation")],
            feval=xgb_weighted_auc,
            maximize=True,
            verbose_eval=True,
        )

        y_valid_pred = booster.predict(valid_dmatrix)
        cv_scores.append(alaska_weighted_auc(y_valid, y_valid_pred))

        # Running average of the per-fold test predictions.
        fold_test_pred = booster.predict(test_dmatrix) * one_over_n
        test_pred = fold_test_pred if test_pred is None else test_pred + fold_test_pred

    for fold_score in cv_scores:
        print(fold_score)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(output_dir, f"xgb_{np.mean(cv_scores):.4f}_{checksum}_.csv")
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def main():
    """Stack the selected probability columns with ``XGBClassifier``.

    The columns listed below are read from the holdout and leaderboard CSVs
    via ``get_x_y_for_stacking``, standardised, and extended with the quality
    features returned by that helper.  One classifier is fitted per
    ``GroupKFold`` split (grouped by image id), scored on the validation part
    with ``alaska_weighted_auc``, and its test probabilities are averaged into
    the final prediction, saved as ``xgb_cls_2_<mean cv score>_<checksum>.csv``.
    """
    output_dir = os.path.dirname(__file__)
    checksum = "DCTR_JRM_B4_B5_B6_MixNet_XL_SRNET"

    columns = [
        "DCTR",
        "JRM",
        # "MixNet_xl_pc",
        # "MixNet_xl_pjm",
        # "MixNet_xl_pjuni",
        # "MixNet_xl_puerd",
        # "efn_b4_pc",
        # "efn_b4_pjm",
        # "efn_b4_pjuni",
        # "efn_b4_puerd",
        # "efn_b2_pc",
        # "efn_b2_pjm",
        # "efn_b2_pjuni",
        # "efn_b2_puerd",
        # "MixNet_s_pc",
        # "MixNet_s_pjm",
        # "MixNet_s_pjuni",
        # "MixNet_s_puerd",
        # "SRNet_pc",
        # "SRNet_pjm",
        # "SRNet_pjuni",
        # "SRNet_puerd",
        # "SRNet_noPC70_pc",
        # "SRNet_noPC70_pjm",
        # "SRNet_noPC70_pjuni",
        # "SRNet_noPC70_puerd",
        "efn_b4_mish_pc",
        "efn_b4_mish_pjm",
        "efn_b4_mish_pjuni",
        "efn_b4_mish_puerd",
        "efn_b5_mish_pc",
        "efn_b5_mish_pjm",
        "efn_b5_mish_pjuni",
        "efn_b5_mish_puerd",
        # "efn_b2_NR_mish_pc",
        # "efn_b2_NR_mish_pjm",
        # "efn_b2_NR_mish_pjuni",
        # "efn_b2_NR_mish_puerd",
        "MixNet_xl_mish_pc",
        "MixNet_xl_mish_pjm",
        "MixNet_xl_mish_pjuni",
        "MixNet_xl_mish_puerd",
        "efn_b6_NR_mish_pc",
        "efn_b6_NR_mish_pjm",
        "efn_b6_NR_mish_pjuni",
        "efn_b6_NR_mish_puerd",
        "SRNet_noPC70_mckpt_pc",
        "SRNet_noPC70_mckpt_pjm",
        "SRNet_noPC70_mckpt_pjuni",
        "SRNet_noPC70_mckpt_puerd",
    ]

    x, y, quality_h, image_ids = get_x_y_for_stacking("probabilities_zoo_holdout_0718.csv", columns)
    print(x.shape, y.shape)

    x_test, _, quality_t, image_ids_test = get_x_y_for_stacking("probabilities_zoo_lb_0718.csv", columns)
    print(x_test.shape)

    # Preprocessing toggles (edit in place to experiment).
    use_standard_scaler = True
    use_pca = False
    append_quality = True

    if use_standard_scaler:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if use_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    # Hyper-parameters shared by every fold's classifier.
    xgb_kwargs = dict(
        base_score=0.5,
        booster="gbtree",
        colsample_bylevel=1,
        colsample_bynode=1,
        colsample_bytree=0.8,
        gamma=2,
        gpu_id=-1,
        importance_type="gain",
        interaction_constraints="",
        learning_rate=0.01,
        max_delta_step=0,
        max_depth=6,
        min_child_weight=5,
        # missing=nan,
        monotone_constraints="()",
        n_estimators=256,
        n_jobs=8,
        nthread=1,
        num_parallel_tree=1,
        objective="binary:logistic",
        random_state=0,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        silent=True,
        subsample=0.6,
        tree_method="exact",
        validate_parameters=1,
        verbosity=2,
    )

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]
        print(np.bincount(y_train), np.bincount(y_valid))

        model = XGBClassifier(**xgb_kwargs)
        model.fit(x_train, y_train)

        y_valid_pred = model.predict_proba(x_valid)[:, 1]
        cv_scores.append(alaska_weighted_auc(y_valid, y_valid_pred))

        # Running average of the per-fold test probabilities.
        fold_test_pred = model.predict_proba(x_test)[:, 1] * one_over_n
        test_pred = fold_test_pred if test_pred is None else test_pred + fold_test_pred

    for fold_score in cv_scores:
        print(fold_score)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(output_dir, f"xgb_cls_2_{np.mean(cv_scores):.4f}_{checksum}.csv")
    submission = {"Label": test_pred, "Id": image_ids_test}
    pd.DataFrame.from_dict(submission).to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def main():
    """Stack holdout predictions with shrinkage LDA and write a submission CSV.

    Features come from ``get_x_y_for_stacking`` (called with
    ``with_logits=True, tta_logits=True``) and are extended with a 3-way
    one-hot encoding of the dataset ``quality`` attribute.  One
    ``LinearDiscriminantAnalysis`` model is fitted per ``GroupKFold`` split
    (grouped by image id), scored with ``alaska_weighted_auc``, and the test
    probabilities of all folds are averaged.  Output file:
    ``lda_<mean cv score>_<checksum>.csv`` next to this script.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    # Image ids are the CV groups; quality is appended as one-hot features.
    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    # Preprocessing toggles (edit in place to experiment).
    use_standard_scaler = False
    use_pca = False
    append_quality = True

    if use_standard_scaler:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if use_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]
        print(np.bincount(y_train), np.bincount(y_valid))

        # model = LinearDiscriminantAnalysis()
        model = LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto", priors=[0.5, 0.5])
        model.fit(x_train, y_train)

        y_valid_pred = model.predict_proba(x_valid)[:, 1]
        cv_scores.append(alaska_weighted_auc(y_valid, y_valid_pred))

        # Running average of the per-fold test probabilities.
        fold_test_pred = model.predict_proba(x_test)[:, 1] * one_over_n
        test_pred = fold_test_pred if test_pred is None else test_pred + fold_test_pred

    for fold_score in cv_scores:
        print(fold_score)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(output_dir, f"lda_{np.mean(cv_scores):.4f}_{checksum}.csv")
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def wauc_metric(y_true, y_pred):
    """Return the triple ``("wauc", alaska_weighted_auc(y_true, y_pred), True)``.

    NOTE(review): the trailing ``True`` is presumably a "higher is better"
    flag for the library consuming this metric - confirm against the caller.
    """
    return "wauc", alaska_weighted_auc(y_true, y_pred), True
def main():
    """Stack holdout predictions (with logits) using ``XGBClassifier``.

    Features come from ``get_x_y_for_stacking``; the target is binarised as
    ``y > 0``.  Features are standardised and extended with a 3-way one-hot
    encoding of the dataset ``quality`` attribute.  One classifier is fitted
    per ``GroupKFold`` split (grouped by image id), scored with
    ``alaska_weighted_auc``, and the test probabilities of all folds are
    averaged.  Output file:
    ``xgb_cls_<mean cv score>_<checksum>[_with_logits].csv``.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    # Image ids are the CV groups; quality is appended as one-hot features.
    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(fname) for fname in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    # Preprocessing toggles (edit in place to experiment).
    use_standard_scaler = True
    use_pca = False
    append_quality = True

    if use_standard_scaler:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        x_test = scaler.transform(x_test)

    if use_pca:
        pca = PCA(n_components=16)
        x = pca.fit_transform(x)
        x_test = pca.transform(x_test)

    if append_quality:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    # Hyper-parameters shared by every fold's classifier.
    xgb_kwargs = dict(
        base_score=0.5,
        booster="gbtree",
        colsample_bylevel=1,
        colsample_bynode=1,
        colsample_bytree=0.6,
        gamma=0.5,
        gpu_id=-1,
        importance_type="gain",
        interaction_constraints="",
        learning_rate=0.01,
        max_delta_step=0,
        max_depth=3,
        min_child_weight=10,
        # missing=nan,
        monotone_constraints="()",
        n_estimators=1000,
        n_jobs=8,
        nthread=1,
        num_parallel_tree=1,
        objective="binary:logistic",
        random_state=0,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        silent=True,
        subsample=0.8,
        tree_method="exact",
        validate_parameters=1,
        verbosity=2,
    )

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, y_train = x[train_index], y[train_index]
        x_valid, y_valid = x[valid_index], y[valid_index]
        print(np.bincount(y_train), np.bincount(y_valid))

        model = XGBClassifier(**xgb_kwargs)
        model.fit(x_train, y_train)

        y_valid_pred = model.predict_proba(x_valid)[:, 1]
        cv_scores.append(alaska_weighted_auc(y_valid, y_valid_pred))

        # Running average of the per-fold test probabilities.
        fold_test_pred = model.predict_proba(x_test)[:, 1] * one_over_n
        test_pred = fold_test_pred if test_pred is None else test_pred + fold_test_pred

    for fold_score in cv_scores:
        print(fold_score)
    print(np.mean(cv_scores), np.std(cv_scores))

    if with_logits:
        with_logits_sfx = "_with_logits"
    else:
        with_logits_sfx = ""

    submit_fname = os.path.join(output_dir, f"xgb_cls_{np.mean(cv_scores):.4f}_{checksum}{with_logits_sfx}.csv")
    submission = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    submission["Label"] = test_pred
    submission[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def main():
    """Print a table comparing raw and calibrated binary/classifier predictions.

    For every checkpoint metric in the loop, out-of-fold (OOF) and holdout
    (HLD) predictions of the selected experiments are converted to binary and
    classifier predictions, with and without calibration, and scored with
    ``alaska_weighted_auc``, ``shaky_wauc`` and ``shaky_wauc_public``.  OOF
    scores are averaged over experiments; holdout scores are computed on the
    mean blend of all experiments.  Nothing is written to disk.
    """
    experiments = [
        "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    # The three scores reported per table section, in column order.
    scorers = (alaska_weighted_auc, shaky_wauc, shaky_wauc_public)

    def mean_oof_scores(pairs):
        # One value per scorer, averaged over (y_true, y_pred) pairs.
        return [np.mean([scorer(t, p) for (t, p) in pairs]) for scorer in scorers]

    def holdout_scores(y_true, y_pred):
        # One value per scorer for a single blended prediction.
        return [scorer(y_true, y_pred) for scorer in scorers]

    six_columns = "{:.6f}\t{:.6f}\t{:.6f}\t{:.6f}\t{:.6f}\t{:.6f}"
    three_columns = " {:.6f}\t{:.6f}\t{:.6f}"

    for metric in [
        # "loss",
        # "bauc",
        "cauc"
    ]:
        holdout_predictions_d4 = get_predictions_csv(experiments, metric, "holdout", "d4")
        oof_predictions_d4 = get_predictions_csv(experiments, metric, "oof", "d4")
        # Not used below; the call is kept because it may validate that the
        # test predictions exist - TODO confirm and remove if it is pure.
        test_predictions_d4 = get_predictions_csv(experiments, metric, "test", "d4")  # noqa: F841

        hld_bin_pred_d4 = make_binary_predictions(holdout_predictions_d4)
        hld_y_true = hld_bin_pred_d4[0].y_true_type.values

        oof_bin_pred_d4 = make_binary_predictions(oof_predictions_d4)

        hld_cls_pred_d4 = make_classifier_predictions(holdout_predictions_d4)
        oof_cls_pred_d4 = make_classifier_predictions(oof_predictions_d4)

        bin_pred_d4_cal = make_binary_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)
        cls_pred_d4_cal = make_classifier_predictions_calibrated(holdout_predictions_d4, oof_predictions_d4)

        # Each blend is needed for several table cells; compute it once
        # instead of re-blending for every score.
        hld_bin_blend = blend_predictions_mean(hld_bin_pred_d4).Label
        hld_cls_blend = blend_predictions_mean(hld_cls_pred_d4).Label
        bin_cal_blend = blend_predictions_mean(bin_pred_d4_cal).Label
        cls_cal_blend = blend_predictions_mean(cls_pred_d4_cal).Label

        print(
            " ", " ", " ", " OOF", " OOF 5K", " OOF 1K", " HLD", " HLD 5K", " HLD 1K"
        )

        print(
            metric,
            "Bin NC",
            six_columns.format(
                *mean_oof_scores([(p.y_true_type, p.Label) for p in oof_bin_pred_d4]),
                *holdout_scores(hld_y_true, hld_bin_blend),
            ),
        )

        print(
            metric,
            "Cls NC",
            six_columns.format(
                *mean_oof_scores([(p.y_true_type, p.Label) for p in oof_cls_pred_d4]),
                *holdout_scores(hld_y_true, hld_cls_blend),
            ),
        )

        print(metric, "Bin CL", three_columns.format(*holdout_scores(hld_y_true, bin_cal_blend)))

        print(metric, "Cls CL", three_columns.format(*holdout_scores(hld_y_true, cls_cal_blend)))

        # Product of the binary and classifier outputs.
        # NOTE(review): this row is labelled "NC", yet its holdout columns use
        # the *calibrated* blends (as in the original code) while the OOF
        # columns use uncalibrated predictions - confirm which is intended.
        oof_products = [
            (b.y_true_type, b.Label * c.Label) for (b, c) in zip(oof_bin_pred_d4, oof_cls_pred_d4)
        ]
        print(
            metric,
            "Prd NC",
            six_columns.format(
                *mean_oof_scores(oof_products),
                *holdout_scores(hld_y_true, bin_cal_blend * cls_cal_blend),
            ),
        )
def _best_combination(X, y_true, r, reduce_fn):
    """Return ``(best_auc, best_comb)`` over all size-``r`` row subsets of ``X``.

    ``reduce_fn`` maps the selected rows to one prediction vector, which is
    scored with ``alaska_weighted_auc``.  ``best_comb`` is ``None`` when no
    subset scores above 0 (including when there are no subsets of that size).
    """
    best_comb = None
    best_auc = 0

    combs = list(itertools.combinations(np.arange(len(X)), r))
    for c in tqdm(combs, desc=f"{r}"):
        score = alaska_weighted_auc(y_true, reduce_fn(X[np.array(c)]))
        if score > best_auc:
            best_auc = score
            best_comb = c

    return best_auc, best_comb


def main():
    """Search for the best subsets of prediction sets and save their blends.

    Binary and classifier predictions of every experiment form the rows of
    ``X``.  For every subset size from 2 to 7 the subset with the highest
    ``alaska_weighted_auc`` on holdout is selected, first using plain
    averaging (``cmb_mean_*`` files) and then using the average of per-row
    ranks (``cmb_rank_*`` files).  For each winner the corresponding test
    predictions are blended with ``blend_predictions_mean`` and written next
    to this script.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
        # "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")

    # One name per row of X, in the same order (binary first, then classifier).
    fnames_for_checksum = np.array(
        [x + "cauc_bin" for x in experiments]
        # + [x + "loss_bin" for x in experiments]
        + [x + "cauc_cls" for x in experiments]
        # + [x + "loss_cls" for x in experiments]
    )

    X = make_binary_predictions(holdout_predictions) + make_classifier_predictions(holdout_predictions)
    y_true = X[0].y_true_type.values
    X = np.array([x.Label.values for x in X])

    assert len(fnames_for_checksum) == X.shape[0]

    X_test = make_binary_predictions(test_predictions) + make_classifier_predictions(test_predictions)

    def mean_of_predictions(rows):
        return rows.mean(axis=0)

    def mean_of_ranks(rows):
        # Rank each prediction set separately, then average the ranks.
        return rankdata(rows, axis=1).mean(axis=0)

    strategies = (
        ("cmb_mean", mean_of_predictions),
        ("cmb_rank", mean_of_ranks),
    )

    for prefix, reduce_fn in strategies:
        for r in range(2, 8):
            if r > len(X):
                # No subsets of this size exist; previously this left
                # best_comb as None and crashed on the indexing below.
                break

            best_auc, best_comb = _best_combination(X, y_true, r, reduce_fn)
            print(r, best_auc, best_comb)
            if best_comb is None:
                continue

            checksum = compute_checksum_v2(fnames_for_checksum[np.array(best_comb)])

            # NOTE(review): for "cmb_rank" the subset is selected by rank
            # averaging but the test predictions are still mean-blended, as
            # in the original code - confirm this is intended.
            test_preds = blend_predictions_mean([X_test[i] for i in best_comb])
            test_preds.to_csv(
                os.path.join(output_dir, f"{prefix}_{best_auc:.4f}_{r}_{checksum}.csv"), index=False
            )
def main():
    """Stack holdout predictions with ``StackingCVClassifier`` and save a submission.

    Features from ``get_x_y`` are standardised and extended with a 3-way
    one-hot encoding of the dataset ``quality`` attribute.  For every
    ``GroupKFold`` split (grouped by image id) a fresh stacking ensemble is
    fitted on the training part; the fitted base classifiers and the stack
    are scored on the validation part with ``alaska_weighted_auc``, and the
    stack's test probabilities are averaged over folds.  Output file:
    ``stacking_<mean stack score>_<checksum>.csv`` next to this script,
    containing the ``Id`` and ``Label`` columns.

    The previous version evaluated on undefined ``x_valid``/``y_valid``,
    queried the unfitted classifier prototypes, and never filled ``auc_cv``.
    """
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    # Image ids are the CV groups; quality is appended as one-hot features.
    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = np.asarray([fs.id_from_fname(x) for x in holdout_ds.images])
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    one_over_n = 1.0 / group_kfold.n_splits

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})

    # Report labels, in the same order as the base classifiers below.
    base_names = [
        "LGBMClassifier",
        "CatBoostClassifier",
        "LogisticRegression",
        "CalibratedClassifierCV",
        "LinearDiscriminantAnalysis",
    ]

    auc_cv = []
    test_pred = None

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, x_valid = x[train_index], x[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        sclf = StackingCVClassifier(
            classifiers=[
                LGBMClassifier(),
                CatBoostClassifier(),
                LogisticRegression(),
                CalibratedClassifierCV(),
                LinearDiscriminantAnalysis(),
            ],
            shuffle=False,
            use_probas=True,
            cv=4,
            # meta_classifier=SVC(degree=2, probability=True),
            meta_classifier=LogisticRegression(solver="lbfgs"),
        )
        sclf.fit(x_train, y_train, groups=image_ids[train_index])

        # The stack fits clones of the base classifiers; the fitted clones are
        # exposed as `clfs_`, so those are the ones that can be evaluated.
        for name, fitted_clf in zip(base_names, sclf.clfs_):
            y_pred = fitted_clf.predict_proba(x_valid)[:, 1]
            print(name, alaska_weighted_auc(y_valid, y_pred))

        stack_score = alaska_weighted_auc(y_valid, sclf.predict_proba(x_valid)[:, 1])
        print("Stack", stack_score)
        auc_cv.append(stack_score)

        # Running average of the per-fold test probabilities.
        fold_test_pred = sclf.predict_proba(x_test)[:, 1] * one_over_n
        test_pred = fold_test_pred if test_pred is None else test_pred + fold_test_pred

    print(np.mean(auc_cv), np.std(auc_cv))

    submit_fname = os.path.join(output_dir, f"stacking_{np.mean(auc_cv):.4f}_{checksum}.csv")
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)