# Shared imports for the stacking scripts collected below. Each main() originally lived in
# its own script; project-level helpers (get_predictions_csv, get_holdout, get_test_dataset,
# get_x_y, get_x_y_for_stacking, get_x_y_embedding_for_stacking, compute_checksum_v2,
# compute_image_features, alaska_weighted_auc, xgb_weighted_auc, make_binary_predictions,
# make_classifier_predictions, blend_predictions_mean, INPUT_IMAGE_KEY, INDEX_TO_METHOD, fs)
# are assumed to come from the repository's own package and are not redefined here.
import argparse
import itertools
import os

import catboost as cat
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import xgboost as xgb
from catboost import CatBoostClassifier
from lazypredict.Supervised import LazyClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
from scipy.stats import rankdata
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from xgboost import XGBClassifier


# --- XGBoost (gblinear) stacker over holdout predictions with manual GroupKFold CV ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")

    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    print("Unique image ids", len(np.unique(image_ids)))
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    test_dmatrix = xgb.DMatrix(x_test)

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    params = {
        "base_score": 0.5,
        "booster": "gblinear",
        # "booster": "gbtree",
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 1,
        # "gamma": 1.0,
        "learning_rate": 0.01,
        "max_delta_step": 0,
        "objective": "binary:logistic",
        "eta": 0.1,
        "reg_lambda": 0,
        "subsample": 0.8,
        "scale_pos_weight": 1,
        "min_child_weight": 2,
        "max_depth": 5,
        "tree_method": "exact",
        "seed": 42,
        "alpha": 0.01,
        "lambda": 0.01,
        "n_estimators": 256,
        "gamma": 0.01,
        "disable_default_eval_metric": 1,
        # "eval_metric": "wauc",
    }

    for fold_index, (train_index, valid_index) in enumerate(group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )

        train_dmatrix = xgb.DMatrix(x_train.copy(), y_train.copy())
        valid_dmatrix = xgb.DMatrix(x_valid.copy(), y_valid.copy())

        xgb_model = xgb.train(
            params,
            train_dmatrix,
            num_boost_round=5000,
            verbose_eval=True,
            feval=xgb_weighted_auc,
            maximize=True,
            evals=[(valid_dmatrix, "validation")],
        )

        y_valid_pred = xgb_model.predict(valid_dmatrix)
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        # Average the per-fold test predictions
        if test_pred is not None:
            test_pred += xgb_model.predict(test_dmatrix) * one_over_n
        else:
            test_pred = xgb_model.predict(test_dmatrix) * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(output_dir, f"xgb_{np.mean(cv_scores):.4f}_{checksum}_.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
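# Both alaska_weighted_auc and the xgb_weighted_auc feval used above come from the project
# code and are not shown in this section. The sketch below is a minimal, hedged version of
# what they might look like, assuming the public ALASKA2 metric definition (ROC AUC with the
# TPR band [0, 0.4] weighted 2x and [0.4, 1.0] weighted 1x) and the standard xgboost
# custom-feval contract; the real implementations may differ in details.
from sklearn import metrics


def alaska_weighted_auc_sketch(y_true, y_pred):
    tpr_thresholds = [0.0, 0.4, 1.0]
    weights = [2, 1]

    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred, pos_label=1)
    # Normalization so that a perfect classifier scores 1.0: 0.4 * 2 + 0.6 * 1 = 1.4
    normalization = sum((tpr_thresholds[i + 1] - tpr_thresholds[i]) * w for i, w in enumerate(weights))

    competition_metric = 0.0
    for idx, weight in enumerate(weights):
        y_min, y_max = tpr_thresholds[idx], tpr_thresholds[idx + 1]
        mask = (tpr >= y_min) & (tpr <= y_max)
        if not mask.any():
            continue
        # Pad the partial ROC curve out to fpr = 1 at the top of the band, then integrate
        x_padding = np.linspace(fpr[mask][-1], 1, 100)
        x_vals = np.concatenate([fpr[mask], x_padding])
        y_vals = np.concatenate([tpr[mask], [y_max] * len(x_padding)]) - y_min
        competition_metric += metrics.auc(x_vals, y_vals) * weight

    return competition_metric / normalization


def xgb_weighted_auc_sketch(preds, dmatrix):
    # xgb.train feval contract: return (name, value); maximize=True is passed to xgb.train above
    return "wauc", alaska_weighted_auc_sketch(dmatrix.get_label(), preds)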
# --- XGBClassifier stacker with RandomizedSearchCV; experiments are passed on the command line ---
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("experiments", nargs="+", type=str)
    parser.add_argument("-o", "--output", type=str, required=False)
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    output_dir = os.path.dirname(__file__)
    data_dir = args.data_dir
    experiments = args.experiments
    output_file = args.output

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids_h = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    if False:
        image_fnames_h = [
            os.path.join(data_dir, INDEX_TO_METHOD[method], f"{image_id}.jpg")
            for (image_id, method) in zip(image_ids_h, y)
        ]

        test_image_ids = pd.read_csv(test_predictions[0]).image_id.tolist()
        image_fnames_t = [os.path.join(data_dir, "Test", image_id) for image_id in test_image_ids]

        entropy_t = compute_image_features(image_fnames_t)
        x_test = np.column_stack([x_test, entropy_t])

        # entropy_h = entropy_t.copy()
        # x = x_test.copy()
        entropy_h = compute_image_features(image_fnames_h)
        x = np.column_stack([x, entropy_h])
        print("Added image features", entropy_h.shape, entropy_t.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "min_child_weight": [1, 5, 10],
        "gamma": [1e-3, 1e-2, 1e-2, 0.5, 2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "max_depth": [2, 3, 4, 5, 6],
        "n_estimators": [16, 32, 64, 128, 256, 1000],
        "learning_rate": [0.001, 0.01, 0.05, 0.2, 1],
    }

    # Note: this local name shadows the xgboost module alias for the rest of the function
    xgb = XGBClassifier(objective="binary:logistic", nthread=1)

    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        n_jobs=4,
        n_iter=25,
        cv=group_kfold.split(x, y, groups=image_ids_h),
        verbose=3,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("xgb-random-grid-search-results-01.csv", index=False)

    test_pred = random_search.predict_proba(x_test)[:, 1]

    if output_file is None:
        with_logits_sfx = "_with_logits" if with_logits else ""
        submit_fname = os.path.join(
            output_dir, f"xgb_cls_gs_{random_search.best_score_:.4f}_{checksum}{with_logits_sfx}.csv"
        )
    else:
        submit_fname = output_file

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)

    import json

    with open(fs.change_extension(submit_fname, ".json"), "w") as f:
        json.dump(random_search.best_params_, f, indent=2)
# --- Linear discriminant analysis stacker with manual GroupKFold CV ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    if False:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )
        print(np.bincount(y_train), np.bincount(y_valid))

        # cls = LinearDiscriminantAnalysis()
        cls = LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto", priors=[0.5, 0.5])
        cls.fit(x_train, y_train)

        y_valid_pred = cls.predict_proba(x_valid)[:, 1]
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        if test_pred is not None:
            test_pred += cls.predict_proba(x_test)[:, 1] * one_over_n
        else:
            test_pred = cls.predict_proba(x_test)[:, 1] * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(output_dir, f"lda_{np.mean(cv_scores):.4f}_{checksum}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
# --- LazyClassifier sweep over off-the-shelf classifiers, one report per GroupKFold fold ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    for fold_index, (train_index, valid_index) in enumerate(group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )

        clf = LazyClassifier(
            verbose=True, ignore_warnings=False, custom_metric=alaska_weighted_auc, predictions=True
        )
        models, predictions = clf.fit(x_train, x_valid, y_train, y_valid)
        print(models)

        models.to_csv(os.path.join(output_dir, f"lazypredict_models_{fold_index}_{checksum}.csv"))
        predictions.to_csv(os.path.join(output_dir, f"lazypredict_preds_{fold_index}_{checksum}.csv"))
# --- Export train/test/holdout embeddings from the prediction CSVs to .npy files ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        #
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
    ]
    checksum = compute_checksum_v2(experiments)

    if False:
        train_predictions = get_predictions_csv(experiments, "cauc", "train", tta="d4", need_embedding=True)
        x, y = get_x_y_for_stacking(
            train_predictions,
            with_embeddings=True,
            with_logits=False,
            with_probas=False,
            tta_probas=False,
            tta_logits=False,
        )
        print("Train", x.shape, y.shape)
        np.save(f"embeddings_x_train_{checksum}.npy", x)
        np.save(f"embeddings_y_train_{checksum}.npy", y)
        del x, y, train_predictions

    if False:
        test_predictions = get_predictions_csv(experiments, "cauc", "test", tta="d4", need_embedding=True)
        x_test, _ = get_x_y_for_stacking(
            test_predictions,
            with_embeddings=True,
            with_logits=False,
            with_probas=False,
            tta_probas=False,
            tta_logits=False,
        )
        print("Test", x_test.shape)
        np.save(f"embeddings_x_test_{checksum}.npy", x_test)
        del x_test, test_predictions

    if True:
        holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", tta="d4", need_embedding=True)
        x_hld, y_hld = get_x_y_for_stacking(
            holdout_predictions,
            with_embeddings=True,
            with_logits=False,
            with_probas=False,
            tta_probas=False,
            tta_logits=False,
        )
        print("Holdout", x_hld.shape)
        np.save(f"embeddings_x_holdout_{checksum}.npy", x_hld)
        np.save(f"embeddings_y_holdout_{checksum}.npy", y_hld)
        del x_hld, y_hld, holdout_predictions
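# The .npy files written above can be loaded back later (memory-mapped, since the embedding
# matrices are large) and fed into any of the stacking models in this file. A small sketch;
# load_saved_embeddings is an illustrative helper, not part of the original scripts.
def load_saved_embeddings(checksum):
    x_holdout = np.load(f"embeddings_x_holdout_{checksum}.npy", mmap_mode="r")
    y_holdout = np.load(f"embeddings_y_holdout_{checksum}.npy")
    x_test = np.load(f"embeddings_x_test_{checksum}.npy", mmap_mode="r")
    return x_holdout, y_holdout, x_test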
# --- LightGBM stacker with RandomizedSearchCV over boosting / regularization parameters ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16"
        #
        #
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "boosting_type": ["gbdt", "dart", "rf", "goss"],
        "num_leaves": [16, 32, 64, 128],
        "reg_alpha": [0, 0.01, 0.1, 0.5],
        "reg_lambda": [0, 0.01, 0.1, 0.5],
        "learning_rate": [0.001, 0.01, 0.1, 0.5],
        "n_estimators": [32, 64, 126, 512],
        "max_depth": [2, 4, 8],
        "min_child_samples": [20, 40, 80, 100],
    }

    lgb_estimator = lgb.LGBMClassifier(objective="binary", silent=True)

    random_search = RandomizedSearchCV(
        lgb_estimator,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        n_jobs=3,
        n_iter=50,
        cv=group_kfold.split(x, y, groups=image_ids),
        verbose=2,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    test_pred = random_search.predict_proba(x_test)[:, 1]
    print(test_pred)

    submit_fname = os.path.join(output_dir, f"lgbm_gs_{random_search.best_score_:.4f}_{checksum}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("lgbm-random-grid-search-results-01.csv", index=False)
# --- XGBClassifier stacker with fixed hyperparameters and manual GroupKFold CV ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16"
        #
        #
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )
        print(np.bincount(y_train), np.bincount(y_valid))

        cls = XGBClassifier(
            base_score=0.5,
            booster="gbtree",
            colsample_bylevel=1,
            colsample_bynode=1,
            colsample_bytree=0.6,
            gamma=0.5,
            gpu_id=-1,
            importance_type="gain",
            interaction_constraints="",
            learning_rate=0.01,
            max_delta_step=0,
            max_depth=3,
            min_child_weight=10,
            # missing=nan,
            monotone_constraints="()",
            n_estimators=1000,
            n_jobs=8,
            nthread=1,
            num_parallel_tree=1,
            objective="binary:logistic",
            random_state=0,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            silent=True,
            subsample=0.8,
            tree_method="exact",
            validate_parameters=1,
            verbosity=2,
        )

        cls.fit(x_train, y_train)

        y_valid_pred = cls.predict_proba(x_valid)[:, 1]
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        if test_pred is not None:
            test_pred += cls.predict_proba(x_test)[:, 1] * one_over_n
        else:
            test_pred = cls.predict_proba(x_test)[:, 1] * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    with_logits_sfx = "_with_logits" if with_logits else ""
    submit_fname = os.path.join(output_dir, f"xgb_cls_{np.mean(cv_scores):.4f}_{checksum}{with_logits_sfx}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
# --- CatBoost stacker with RandomizedSearchCV over depth / learning rate / l2_leaf_reg ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")

    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=2)

    params = {
        "depth": [3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
        # "iterations": [250, 100, 500, 1000],
        "learning_rate": [0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
        "l2_leaf_reg": [3, 1, 5, 10, 100],
    }

    # Variable name kept from the LightGBM version of this script; this is a CatBoost model
    lgb_estimator = cat.CatBoostClassifier(
        verbose=True,
        iterations=2500,
        # use_best_model=True,
        eval_metric="AUC",
        task_type="GPU",
    )

    random_search = RandomizedSearchCV(
        lgb_estimator,
        param_distributions=params,
        n_iter=10,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        cv=group_kfold.split(x, y, groups=image_ids),
        verbose=3,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    test_pred = random_search.predict_proba(x_test)[:, 1]
    print(test_pred)

    submit_fname = os.path.join(output_dir, f"catboost_gs_{random_search.best_score_:.4f}_{checksum}.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved predictions to ", submit_fname)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)
# --- Brute-force search over model subsets: mean and rank-average blending of binary + classifier predictions ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
        # "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")

    fnames_for_checksum = np.array(
        [x + "cauc_bin" for x in experiments]
        # + [x + "loss_bin" for x in experiments]
        + [x + "cauc_cls" for x in experiments]
        # + [x + "loss_cls" for x in experiments]
    )

    X = make_binary_predictions(holdout_predictions) + make_classifier_predictions(holdout_predictions)
    y_true = X[0].y_true_type.values
    X = np.array([x.Label.values for x in X])
    assert len(fnames_for_checksum) == X.shape[0]

    X_test = make_binary_predictions(test_predictions) + make_classifier_predictions(test_predictions)

    indices = np.arange(len(X))

    # For every subset size r, try all combinations of models and score the holdout
    # probability average with the weighted AUC; save a test blend for the best subset.
    for r in range(2, 8):
        best_comb = None
        best_auc = 0

        combs = list(itertools.combinations(indices, r))
        for c in tqdm(combs, desc=f"{r}"):
            avg_preds = X[np.array(c)].mean(axis=0)
            score_averaging = alaska_weighted_auc(y_true, avg_preds)
            if score_averaging > best_auc:
                best_auc = score_averaging
                best_comb = c

        print(r, best_auc, best_comb)
        checksum = compute_checksum_v2(fnames_for_checksum[np.array(best_comb)])

        test_preds = [X_test[i] for i in best_comb]
        test_preds = blend_predictions_mean(test_preds)
        test_preds.to_csv(os.path.join(output_dir, f"cmb_mean_{best_auc:.4f}_{r}_{checksum}.csv"), index=False)

    # Same search, but holdout candidates are scored by rank averaging; the test-side
    # blend below still uses the plain mean of the selected models.
    for r in range(2, 8):
        best_comb = None
        best_auc = 0

        combs = list(itertools.combinations(indices, r))
        for c in tqdm(combs, desc=f"{r}"):
            rnk_preds = rankdata(X[np.array(c)], axis=1).mean(axis=0)
            score_averaging = alaska_weighted_auc(y_true, rnk_preds)
            if score_averaging > best_auc:
                best_auc = score_averaging
                best_comb = c

        print(r, best_auc, best_comb)
        checksum = compute_checksum_v2(fnames_for_checksum[np.array(best_comb)])

        test_preds = [X_test[i] for i in best_comb]
        test_preds = blend_predictions_mean(test_preds)
        test_preds.to_csv(os.path.join(output_dir, f"cmb_rank_{best_auc:.4f}_{r}_{checksum}.csv"), index=False)
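# blend_predictions_mean is referenced above but defined elsewhere in the project. A
# plausible sketch, assuming each prediction frame exposes a "Label" column and shares the
# same row order; column handling in the real helper may differ (e.g. renaming image_id to Id).
def blend_predictions_mean_sketch(frames):
    blended = frames[0].copy()
    blended["Label"] = np.mean([f["Label"].values for f in frames], axis=0)
    return blended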
# --- XGBClassifier on PCA-reduced model embeddings with RandomizedSearchCV ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", tta=None, need_embedding=True)
    test_predictions = get_predictions_csv(experiments, "cauc", "test", tta=None, need_embedding=True)

    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_embedding_for_stacking(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_embedding_for_stacking(test_predictions)
    print(x_test.shape)

    if False:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        sc = PCA(n_components=512)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "min_child_weight": [1, 5, 10],
        "gamma": [1e-3, 1e-2, 1e-2, 0.5, 2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "max_depth": [2, 3, 4, 5, 6],
        "n_estimators": [16, 32, 64, 128, 256, 1000],
        "learning_rate": [0.001, 0.01, 0.05, 0.2, 1],
    }

    # Note: this local name shadows the xgboost module alias for the rest of the function
    xgb = XGBClassifier(objective="binary:logistic", nthread=1)

    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        n_jobs=4,
        n_iter=25,
        cv=group_kfold.split(x, y, groups=image_ids),
        verbose=3,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("xgb-embedding-random-grid-search-results-01.csv", index=False)

    test_pred = random_search.predict_proba(x_test)[:, 1]

    submit_fname = os.path.join(output_dir, f"xgb_cls_emb_gs_{random_search.best_score_:.4f}_{checksum}_.csv")
    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
# --- mlxtend StackingCVClassifier over five base models with a logistic-regression meta-model ---
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")

    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    auc_cv = []

    classifier1 = LGBMClassifier()
    classifier2 = CatBoostClassifier()
    classifier3 = LogisticRegression()
    classifier4 = CalibratedClassifierCV()
    classifier5 = LinearDiscriminantAnalysis()

    sclf = StackingCVClassifier(
        classifiers=[classifier1, classifier2, classifier3, classifier4, classifier5],
        shuffle=False,
        use_probas=True,
        use_clones=False,  # fit the classifier objects above in place so they can be scored individually
        cv=4,
        # meta_classifier=SVC(degree=2, probability=True),
        meta_classifier=LogisticRegression(solver="lbfgs"),
    )

    # Hold out one group-aware fold for scoring; the stack is fit on the remaining folds
    train_index, valid_index = next(group_kfold.split(x, y, groups=image_ids))
    x_train, x_valid, y_train, y_valid = x[train_index], x[valid_index], y[train_index], y[valid_index]
    groups_train = np.asarray(image_ids)[train_index]

    sclf.fit(x_train, y_train, groups=groups_train)

    classifiers = {
        "LGBMClassifier": classifier1,
        "CatBoostClassifier": classifier2,
        "LogisticRegression": classifier3,
        "CalibratedClassifierCV": classifier4,
        "LinearDiscriminantAnalysis": classifier5,
        "Stack": sclf,
    }

    # Get results
    for key in classifiers:
        # Make prediction on the held-out fold
        y_pred = classifiers[key].predict_proba(x_valid)[:, 1]
        score = alaska_weighted_auc(y_valid, y_pred)
        auc_cv.append(score)
        print(key, score)

    # Making prediction on test set
    y_test = sclf.predict_proba(x_test)[:, 1]
    df["Label"] = y_test
    df.to_csv(os.path.join(output_dir, f"stacking_{np.mean(auc_cv):.4f}_{checksum}.csv"), index=False)