def custom_metric(self, y_pred: np.ndarray,
                  dtrain: xgb.DMatrix) -> Tuple[str, np.ndarray]:
    y_true = dtrain.get_label().astype(float)
    loss = normalized_absolute_errors(y_true, y_pred)
    return "normalized_mae", loss
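`normalized_absolute_errors` is defined elsewhere in the repo. A minimal sketch, assuming the TReNDS-style feature-normalized MAE (sum of absolute errors divided by the sum of the true values) that the weighted scoring loops below imply, plus the assumed wiring of the metric into `xgb.train`:

import numpy as np


def normalized_absolute_errors(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # Assumed definition: feature-normalized MAE, sum|y - y_hat| / sum(y).
    return np.sum(np.abs(y_true - y_pred)) / np.sum(y_true)


# Assumed wiring, not shown in this section: the metric is registered as a
# feval-style callable, with "disable_default_eval_metric": 1 in the params.
# booster = xgb.train(params, dtrain, evals=[(dvalid, "valid")],
#                     feval=model.custom_metric)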
def main(config: DictConfig) -> None:
    prepair_dir(config)
    train_df, test_df = load_data(config)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    # From adversarial validation
    # feature_cols.remove("IC_20")
    train_df["age_rank"] = train_df["age"] // 10 * 10
    age_rank = train_df["age_rank"].to_array()
    if config.randomize_age:
        set_seed(100)
        train_df["age"] += [randomize_age(age) for age in train_df["age"]]
    skf = StratifiedKFold(n_splits=5, random_state=config.data.seed,
                          shuffle=True)
    for label_col, k in zip(label_cols, [20, 160, 180, 120, 120]):
        y_oof = np.zeros(train_df.shape[0])
        for n_fold, (train_index, val_index) in enumerate(
                skf.split(age_rank, age_rank)):
            train_df_fold = train_df.iloc[train_index]
            valid_df_fold = train_df.iloc[val_index]
            train_df_fold = train_df_fold[train_df_fold[label_col].notnull()]
            model = KNeighborsRegressor(n_neighbors=k)
            model.fit(train_df_fold[feature_cols], train_df_fold[label_col])
            y_oof[val_index] = model.predict(
                valid_df_fold[feature_cols]).to_array()
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols])
        train_df[f"{label_col}_pred"] = y_oof
        notnull_idx = train_df[label_col].notnull()
        score = normalized_absolute_errors(
            train_df[notnull_idx][label_col].values,
            train_df[notnull_idx][f"{label_col}_pred"].values,
        )
        logger.info(f"{label_col}, score: {score}")
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
    score = 0
    for label_col, weight in zip(label_cols,
                                 [0.3, 0.175, 0.175, 0.175, 0.175]):
        notnull_idx = train_df[label_col].notnull()
        score += (normalized_absolute_errors(
            train_df[notnull_idx][label_col].to_array(),
            train_df[notnull_idx][f"{label_col}_pred"].to_array(),
        ) * weight)
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)
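`randomize_age` returns a per-sample offset that is added to `age` as light label noise. Its exact behavior is project-specific and not shown in this section; the following is a hypothetical stand-in only:

import numpy as np


def randomize_age(age: float) -> int:
    # Hypothetical jitter of up to +/-2 years; the actual range used by
    # the repo's helper is an assumption here.
    return int(np.random.randint(-2, 3))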
def validation_epoch_end(self, outputs):
    preds = np.concatenate(
        [x["preds"].detach().cpu().numpy() for x in outputs], axis=0)
    labels = np.concatenate(
        [x["labels"].detach().cpu().numpy() for x in outputs], axis=0)
    loss = np.mean([x["loss"].detach().cpu().numpy() for x in outputs])
    ids = list(
        chain.from_iterable(
            [x["Id"].detach().cpu().numpy() for x in outputs]))
    label_cols = self.valid_dataset.label_cols
    df_dict = {"Id": ids}
    for i, label_col in enumerate(label_cols):
        df_dict[f"{label_col}_pred"] = preds[:, i]
        df_dict[label_col] = labels[:, i]
    df = pd.DataFrame(df_dict)
    if self.store_config.save_feature:
        feature = np.concatenate(
            [x["feature"].detach().cpu().numpy() for x in outputs], axis=0)
        for i in range(feature.shape[-1]):
            df[f"feature{i}"] = feature[:, i]
    # For handling log_loss None Error
    results = {
        f"{label_col}_nae": normalized_absolute_errors(
            df[label_col].values, df[f"{label_col}_pred"].values)
        for label_col in label_cols
    }
    avg_score = weighted_normalized_absolute_errors(
        df[label_cols].values,
        df[[f"{col}_pred" for col in label_cols]].values,
        weights=self.data_config.weights,
    ).astype(np.float32)
    if self.use_ddp:
        metrics = {"avg_loss": loss, "avg_score": avg_score}
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        aggregated_metrics = {}
        for metric_name, metric_val in metrics.items():
            metric_tensor = torch.tensor(metric_val).to(f"cuda:{rank}")
            dist.barrier()
            dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
            reduced_metric = metric_tensor.item() / world_size
            aggregated_metrics[metric_name] = reduced_metric
        loss = aggregated_metrics["avg_loss"]
        avg_score = aggregated_metrics["avg_score"]
    else:
        rank = 0
    res = {}
    res["step"] = int(self.global_step)
    res["epoch"] = int(self.current_epoch)
    if avg_score <= self.best_score:
        self.best_score = avg_score
        self.save_flg = True
        res["best_score"] = float(self.best_score)
        df.to_csv(
            os.path.join(self.store_config.result_path,
                         f"valid_result_{rank}.csv"),
            index=False,
        )
        with open(
                os.path.join(self.store_config.log_path, "best_score.yaml"),
                "w") as f:
            yaml.dump(res, f, default_flow_style=False)
    metrics = {}
    metrics["progress_bar"] = {
        "val_loss": avg_score,
        "avg_val_score": torch.tensor(avg_score),
        "best_score": self.best_score,
        "lr": self.optimizer.param_groups[0]["lr"],
    }
    metrics["progress_bar"].update(results)
    metrics["log"] = {
        "val_loss": avg_score,
        "avg_val_score": torch.tensor(avg_score),
        "best_score": self.best_score,
    }
    metrics["log"].update(results)
    return metrics
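The DDP branch above averages scalar metrics by all-reducing a SUM and dividing by the world size, so every rank ends up with the same mean value. The same pattern in isolation (a sketch, not repo code):

import torch
import torch.distributed as dist


def average_across_ranks(value: float, device: torch.device) -> float:
    # All-reduce the scalar with SUM, then divide by the process count,
    # giving each rank the mean of the per-rank values.
    t = torch.tensor(value, device=device)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    return t.item() / dist.get_world_size()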
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(777777)
    train_df, test_df = load_data(config)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    # From adversarial validation
    feature_cols.remove("IC_20")
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=config.data.seed,
                          shuffle=True)
    for i, (_, val_index) in enumerate(
            skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i
    if config.randomize_age:
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    params = {}
    params["age"] = {
        "alpha": 0.8, "reg_lambda": 0.8, "max_leaves": 3,
        "colsample_bytree": 0.6, "subsample": 0.8, "min_child_weight": 50,
        "booster": "gbtree", "objective": "reg:squarederror",
        "eval_metric": "mae", "max_depth": 10, "learning_rate": 0.01,
        "nthread": -1, "max_bin": 256, "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain1_var1"] = {
        "alpha": 0.8, "reg_lambda": 0.8, "max_leaves": 4,
        "colsample_bytree": 0.5, "subsample": 0.8, "min_child_weight": 5,
        "booster": "gbtree", "objective": "reg:squarederror",
        "eval_metric": "mae", "max_depth": 7, "learning_rate": 0.01,
        "nthread": -1, "max_bin": 256, "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain1_var2"] = {
        "alpha": 0.8, "reg_lambda": 0.8, "max_leaves": 4,
        "colsample_bytree": 0.8, "subsample": 0.6, "min_child_weight": 5,
        "booster": "gbtree", "objective": "reg:squarederror",
        "eval_metric": "mae", "max_depth": 7, "learning_rate": 0.01,
        "nthread": -1, "max_bin": 256, "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain2_var1"] = {
        "alpha": 0.8, "reg_lambda": 0.8, "max_leaves": 4,
        "colsample_bytree": 0.7, "subsample": 0.4, "min_child_weight": 5,
        "booster": "gbtree", "objective": "reg:squarederror",
        "eval_metric": "mae", "max_depth": 7, "learning_rate": 0.01,
        "nthread": -1, "max_bin": 256, "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain2_var2"] = {
        "alpha": 0.8, "reg_lambda": 0.8, "max_leaves": 4,
        "colsample_bytree": 0.7, "subsample": 0.4, "min_child_weight": 5,
        "booster": "gbtree", "objective": "reg:squarederror",
        "eval_metric": "mae", "max_depth": 7, "learning_rate": 0.01,
        "nthread": -1, "max_bin": 256, "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    for label_col in label_cols:
        model = XGBModel(feature_cols, label_col, params[label_col])
        train_df, test_df = model.cv(train_df, test_df)
        score = normalized_absolute_errors(
            train_df[label_col].values, train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        # test_df[label_col] = test_df[
        #     [f"{label_col}_pred_fold{i}" for i in range(5)]
        # ].mean(1)
        save_importance(model.importance_df, label_col, config.store)
    score = weighted_normalized_absolute_errors(
        train_df[label_cols].values,
        train_df[[f"{label_col}_pred" for label_col in label_cols]].values,
    )
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    # Feature selection: retrain on the top features by importance
    for label_col in label_cols:
        feature_cols = list(
            pd.read_csv(
                f"{config.store.workdir}/output/{config.store.model_name}/result/importance_{label_col}.csv"
            )["index"].values[:1024])
        model = XGBModel(feature_cols, label_col, params[label_col])
        train_df, test_df = model.cv(train_df, test_df)
        score = normalized_absolute_errors(
            train_df[label_col].values, train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
        save_importance(
            model.importance_df,
            label_col,
            config.store,
            suffix=f"_feature_section{config.n_feature}",
        )
    score = weighted_normalized_absolute_errors(
        train_df[label_cols].values,
        train_df[[f"{label_col}_pred"
                  for label_col in label_cols]].values.copy(),
    )
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(
            config.store.result_path,
            f"{config.store.model_name}_feature_section{config.n_feature}_train.csv",
        ),
        index=False,
    )
    test_df.to_csv(
        os.path.join(
            config.store.result_path,
            f"{config.store.model_name}_feature_section{config.n_feature}_test.csv",
        ),
        index=False,
    )
    sub_df = make_submission(test_df)
    sub_df.to_csv(
        os.path.join(
            config.store.result_path,
            f"{config.store.model_name}_feature_section{config.n_feature}_submission.csv",
        ),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)
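`make_submission` is a repo helper that is not shown in this section. Given the TReNDS submission layout (one `{Id}_{target}` row per label with a `Predicted` column), it presumably does something like this sketch:

import pandas as pd

LABEL_COLS = [
    "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
]


def make_submission(test_df: pd.DataFrame) -> pd.DataFrame:
    # Melt the wide per-label predictions into the long competition format:
    # Id like "10003_age", prediction in a "Predicted" column (assumed).
    sub = test_df.melt(id_vars="Id", value_vars=LABEL_COLS,
                       var_name="target", value_name="Predicted")
    sub["Id"] = sub["Id"].astype(str) + "_" + sub["target"]
    return sub[["Id", "Predicted"]]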
def main(config: DictConfig) -> None:
    prepair_dir(config)
    train_df, test_df = load_data(config)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=config.data.seed,
                          shuffle=True)
    for i, (_, val_index) in enumerate(
            skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i
    if config.randomize_age:
        set_seed(100)
        train_df["age"] += [randomize_age(age) for age in train_df["age"]]
    for label_col in label_cols:
        best_score = np.inf
        best_alpha = 0.0
        best_pred = np.zeros([train_df.shape[0]])
        for alpha in [0.01, 0.001, 0.0003, 0.0001]:
            for n_fold in range(5):
                if not config.use_bagging:
                    model = Ridge(alpha=alpha)
                else:
                    model = BaggingRegressor(
                        Ridge(alpha=alpha),
                        n_estimators=30,
                        random_state=42,
                        max_samples=0.3,
                        max_features=0.3,
                    )
                X_train = train_df.query("fold!=@n_fold")[feature_cols]
                y_train = train_df.query("fold!=@n_fold")[label_col]
                X_train = X_train[y_train.notnull()]
                y_train = y_train[y_train.notnull()]
                model.fit(X_train, y_train)
                train_df.loc[train_df.query("fold==@n_fold").index,
                             f"{label_col}_pred"] = model.predict(
                                 train_df.query("fold==@n_fold")[feature_cols])
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} alpha: {alpha}, score: {score}")
            if score <= best_score:
                best_score = score
                best_alpha = alpha
                best_pred[:] = train_df[f"{label_col}_pred"].values
        train_df[f"{label_col}_pred"] = best_pred
        for n_fold in range(5):
            if not config.use_bagging:
                model = Ridge(alpha=best_alpha)
            else:
                model = BaggingRegressor(
                    Ridge(alpha=best_alpha),
                    n_estimators=30,
                    random_state=42,
                    max_samples=0.3,
                    max_features=0.3,
                )
            X_train = train_df.query("fold!=@n_fold")[feature_cols]
            y_train = train_df.query("fold!=@n_fold")[label_col]
            X_train = X_train[y_train.notnull()]
            y_train = y_train[y_train.notnull()]
            model.fit(X_train, y_train)
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols])
        score = normalized_absolute_errors(
            train_df[label_col].values, train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} alpha: {best_alpha}, score: {score}")
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
    score = weighted_normalized_absolute_errors(
        train_df[label_cols].values,
        train_df[[f"{label_col}_pred" for label_col in label_cols]].values,
    )
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)
def custom_metric(
        self, y_pred: np.ndarray,
        dtrain: lgb.basic.Dataset) -> Tuple[str, np.ndarray, bool]:
    y_true = dtrain.get_label().astype(float)
    loss = normalized_absolute_errors(y_true, y_pred)
    return "normalized_mae", loss, False
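`weighted_normalized_absolute_errors` aggregates the per-label metric with the competition weights (0.3 for age, 0.175 for each domain score, matching the weight lists used elsewhere in this section). A sketch under that assumption, with NaN labels masked out, plus the assumed feval registration for LightGBM:

import numpy as np


def weighted_normalized_absolute_errors(
        y_true: np.ndarray, y_pred: np.ndarray,
        weights=(0.3, 0.175, 0.175, 0.175, 0.175)):
    # Column-wise normalized MAE over non-null labels, then a weighted sum.
    scores = []
    for i, w in enumerate(weights):
        mask = ~np.isnan(y_true[:, i])
        err = np.abs(y_true[mask, i] - y_pred[mask, i]).sum()
        scores.append(w * err / y_true[mask, i].sum())
    return np.sum(scores)


# Assumed wiring, mirroring the xgboost version:
# booster = lgb.train(params, dtrain, valid_sets=[dvalid],
#                     feval=model.custom_metric)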
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(777)
    train_df, test_df = load_data(config)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2",
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    # From adversarial validation
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=config.data.seed,
                          shuffle=True)
    for i, (_, val_index) in enumerate(
            skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i
    if config.randomize_age:
        set_seed(777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    params = {}
    params["age"] = {
        "lambda_l1": 0.8, "lambda_l2": 0.8, "num_leaves": 2,
        "feature_fraction": 0.4, "bagging_fraction": 0.6, "bagging_freq": 1,
        "min_child_samples": 20, "task": "train", "boosting_type": "gbdt",
        "objective": "mae", "metric": "normalized_mae", "max_depth": 7,
        "learning_rate": 0.01, "num_thread": 4, "max_bin": 256,
        "verbose": -1, "device": "cpu",
    }
    params["domain1_var1"] = {
        "lambda_l1": 0.8, "lambda_l2": 0.8, "num_leaves": 4,
        "feature_fraction": 0.5, "bagging_fraction": 0.8, "bagging_freq": 1,
        "min_child_samples": 5, "task": "train", "boosting_type": "gbdt",
        "objective": "mae", "metric": "normalized_mae", "max_depth": 7,
        "learning_rate": 0.01, "num_thread": 4, "max_bin": 256,
        "verbose": -1, "device": "cpu",
    }
    params["domain1_var2"] = {
        "lambda_l1": 0.8, "lambda_l2": 0.8, "num_leaves": 4,
        "feature_fraction": 0.8, "bagging_fraction": 0.6, "bagging_freq": 6,
        "min_child_samples": 5, "task": "train", "boosting_type": "gbdt",
        "objective": "mae", "metric": "normalized_mae", "max_depth": 7,
        "learning_rate": 0.01, "num_thread": 4, "max_bin": 256,
        "verbose": -1, "device": "cpu",
    }
    params["domain2_var1"] = {
        "lambda_l1": 0.8, "lambda_l2": 0.8, "num_leaves": 4,
        "feature_fraction": 0.7, "bagging_fraction": 0.4, "bagging_freq": 1,
        "min_child_samples": 5, "task": "train", "boosting_type": "gbdt",
        "objective": "mae", "metric": "normalized_mae", "max_depth": 7,
        "learning_rate": 0.01, "num_thread": 4, "max_bin": 256,
        "verbose": -1, "device": "cpu",
    }
    params["domain2_var2"] = {
        "lambda_l1": 0.8, "lambda_l2": 0.8, "num_leaves": 4,
        "feature_fraction": 0.4, "bagging_fraction": 0.8, "bagging_freq": 1,
        "min_child_samples": 5, "task": "train", "boosting_type": "gbdt",
        "objective": "mae", "metric": "normalized_mae", "max_depth": 7,
        "learning_rate": 0.01, "num_thread": 4, "max_bin": 256,
        "verbose": -1, "device": "cpu",
    }
    if not config.is_null_importance and not config.is_adversarial_validation:
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params[label_col])
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            # test_df[label_col] = test_df[
            #     [f"{label_col}_pred_fold{i}" for i in range(5)]
            # ].mean(1)
            save_importance(model.importance_df, label_col, config.store)
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"all score: {score}")
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_train.csv"),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_test.csv"),
            index=False,
        )
        if config.store.gcs_project is not None:
            upload_directory(config.store)
        # Feature Selection
        for label_col in label_cols:
            feature_cols = list(
                pd.read_csv(
                    f"{config.store.workdir}/output/{config.store.model_name}/result/importance_{label_col}.csv"
                )["index"].values[:config.n_feature])
            model = LGBMModel(feature_cols, label_col, params[label_col])
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            test_df[label_col] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
            save_importance(
                model.importance_df,
                label_col,
                config.store,
                suffix=f"_feature_section{config.n_feature}",
            )
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"all score: {score}")
        train_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_feature_section{config.n_feature}_train.csv",
            ),
            index=False,
        )
        test_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_feature_section{config.n_feature}_test.csv",
            ),
            index=False,
        )
        sub_df = make_submission(test_df)
        sub_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_feature_section{config.n_feature}_submission.csv",
            ),
            index=False,
        )
        if config.store.gcs_project is not None:
            upload_directory(config.store)
    elif config.is_adversarial_validation:
        skf = StratifiedKFold(n_splits=5, random_state=config.data.seed,
                              shuffle=True)
        if True:
            train_df["is_train"] = 1
            test_df["is_train"] = 0
            label_col = "is_train"
            train_df = pd.concat([train_df, test_df],
                                 axis=0).reset_index(drop=True)
        if False:
            site2_ids = pd.read_csv(
                f"{config.store.workdir}/input/reveal_ID_site2.csv"
            )["Id"].values
            test_df.loc[test_df.query("Id in @site2_ids").index,
                        "is_site2"] = 1
            train_df = pd.concat(
                [train_df, test_df.query("is_site2==1")],
                axis=0).reset_index(drop=True)
            train_df["is_site2"].fillna(0, inplace=True)
            label_col = "is_site2"
        for i, (_, val_index) in enumerate(
                skf.split(train_df, train_df[label_col])):
            train_df.loc[val_index, "fold"] = i
            print(i, train_df.loc[val_index, label_col].sum())
        param = {
            "lambda_l1": 0.1, "lambda_l2": 0.1, "num_leaves": 32,
            "feature_fraction": 0.4, "bagging_fraction": 0.8,
            "bagging_freq": 1, "min_child_samples": 20, "task": "train",
            "boosting_type": "gbdt", "objective": "binary", "metric": "auc",
            "max_depth": 10, "learning_rate": 0.01, "num_thread": 4,
            "max_bin": 256, "verbose": -1, "device": "cpu",
        }
        model = LGBMModel(feature_cols, label_col, param)
        train_df, test_df = model.cv(train_df, test_df)
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
        score = roc_auc_score(train_df[label_col].values,
                              train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        save_importance(model.importance_df, label_col, config.store)
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_adv_val.csv"),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_adv_val_test.csv"),
            index=False,
        )
    else:
        for label_col in label_cols:
            null_imp_df = pd.DataFrame()
            for i in range(config.n_run):
                train_df[label_col] = np.random.permutation(
                    train_df[label_col])
                model = LGBMModel(feature_cols, label_col, params[label_col])
                train_df, test_df = model.cv(train_df, test_df)
                score = normalized_absolute_errors(
                    train_df[label_col].values,
                    train_df[f"{label_col}_pred"].values)
                logger.info(f"{label_col} score: {score}")
                test_df[label_col] = test_df[[
                    f"{label_col}_pred_fold{i}" for i in range(5)
                ]].mean(1)
                importance_df = model.importance_df
                importance_df["run"] = i + 1
                null_imp_df = pd.concat([null_imp_df, importance_df])
            save_importance(null_imp_df, label_col, config.store)
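The null-importance branch above only saves the importances collected from label-permuted fits. The usual follow-up, comparing each feature's real importance against its null distribution, is not shown in this section; one common scoring is sketched below, with the `index`/`importance` column names assumed from the importance CSVs read elsewhere in this file:

import numpy as np
import pandas as pd


def null_importance_scores(actual_imp: pd.DataFrame,
                           null_imp: pd.DataFrame) -> pd.Series:
    # Log-gap of the real-label importance over the 75th percentile of the
    # permuted-label importances; larger means a more trustworthy feature.
    scores = {}
    for feature, null_vals in null_imp.groupby("index")["importance"]:
        actual = actual_imp.loc[actual_imp["index"] == feature,
                                "importance"].mean()
        scores[feature] = np.log(
            1e-10 + actual / (1 + np.percentile(null_vals, 75)))
    return pd.Series(scores).sort_values(ascending=False)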
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(config.data.seed)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    train_dfs, names = load_train_data(config.store.workdir)
    test_dfs = load_test_data(config.store.workdir)
    remove_cols = [
        "knn_age_pred", "knn_domain1_var1",
        "densenet121_age_pred", "densenet121_domain1_var1_pred",
        "densenet121_domain1_var2_pred", "densenet121_domain2_var2_pred",
        "3dcnn_resnet18_domain1_var2_pred",
        "3dcnn_resnet18_domain2_var1_pred",
        "3dcnn_resnet18_domain2_var2_pred",
        "1dresnet18_domain1_var1_pred", "1dresnet18_domain1_var2_pred",
        "1dresnet18_domain2_var2_pred",
        "simple_3dcnn_domain1_var1_pred", "simple_3dcnn_domain1_var2_pred",
        "simple_3dcnn_domain2_var2_pred",
        "transformer_domain2_var1_pred", "transformer_domain2_var2_pred",
        "transformer_domain1_var1_pred", "transformer_domain1_var2_pred",
        "lgbm_gnn_feature_domain1_var2_pred",
        "lgbm_gnn_feature_domain2_var2_pred",
        "lgbm_gnn_featured_domain1_var2_pred",
        "lgbm_gnn_featured_domain2_var2_pred",
        "lgbm_cnn_feature_domain1_var2_pred",
        "lgbm_cnn_feature_domain2_var2_pred",
        "lgbm_2plus1dcnn_feature_domain1_var2_pred",
        "lgbm_2plus1dcnn_feature_domain2_var2_pred",
        "xgb_2plus1dcnn_feature_age_pred",
        "xgb_2plus1dcnn_feature_domain1_var2_pred",
        "xgb_2plus1dcnn_feature_domain2_var2_pred",
        "simple_3dcnn_domain2_var1_pred",
        "simple_3dcnn_3label_domain1_var2_pred",
        "gin_domain1_var1_pred", "gin_domain2_var1_pred",
        "2plus1dcnn_resnet10_domain1_var2_pred",
        "resnest14d_domain1_var1_pred", "resnest14d_domain1_var2_pred",
        "resnest14d_domain2_var2_pred",
    ]
    train_ft_dict = {}
    test_ft_dict = {}
    feature_cols = []
    train_ft_dict["Id"] = train_dfs[0]["Id"]
    test_ft_dict["Id"] = test_dfs[0]["Id"]
    for label_col in label_cols:
        train_ft_dict[label_col] = train_dfs[0][label_col]
    for name, df in zip(names, train_dfs):
        for label_col in label_cols:
            if (f"{label_col}_pred" in df.columns
                    and f"{name}_{label_col}_pred" not in remove_cols):
                train_ft_dict[f"{name}_{label_col}_pred"] = df[
                    f"{label_col}_pred"]
                feature_cols += [f"{name}_{label_col}_pred"]
            elif f"{name}_{label_col}_pred" in remove_cols:
                df.drop(f"{label_col}_pred", axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="train", name=name)
        train_ft_dict.update(feat_dict)
        feature_cols += list(feat_dict.keys())
    for name, df in zip(names, test_dfs):
        for label_col in label_cols:
            for i in range(5):
                if (f"{label_col}_pred_fold{i}" in df.columns
                        and f"{name}_{label_col}_pred" not in remove_cols):
                    test_ft_dict[f"{name}_{label_col}_pred_fold{i}"] = df[
                        f"{label_col}_pred_fold{i}"]
                elif (f"{name}_{label_col}_pred" in remove_cols
                      and f"{label_col}_pred_fold{i}" in df.columns):
                    df.drop(f"{label_col}_pred_fold{i}", axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="test", name=name)
        test_ft_dict.update(feat_dict)
    train_df = pd.DataFrame(train_ft_dict)
    test_df = pd.DataFrame(test_ft_dict)
    train_df["age"] = (
        pd.read_csv(f"{config.store.workdir}/input/train_scores.csv"
                    ).sort_values("Id").reset_index(drop=True)["age"])
    age_rank = train_df["age"].values // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
    train_df, test_df = preprocess(train_df, test_df, feature_cols)
    for feature_col in feature_cols:
        train_df[feature_col].fillna(0, inplace=True)
        test_df[feature_col].fillna(0, inplace=True)
    train_df = cudf.from_pandas(train_df)
    test_df = cudf.from_pandas(test_df)
    if config.randomize_age:
        set_seed(777_777_777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
    train_df = train_df.reset_index(drop=True)
    logger.info("=" * 10 + "parameter search" + "=" * 10)
    best_c = {}
    for label_col in label_cols:
        best = np.inf
        if label_col == "age":
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for c in [2**(i) for i in range(-14, 1)]:
            y_oof = np.zeros(train_df.shape[0])
            for n_fold, (train_index, val_index) in enumerate(
                    skf.split(age_rank, age_rank)):
                train_df_fold = train_df.iloc[train_index]
                valid_df_fold = train_df.iloc[val_index]
                train_df_fold = train_df_fold[
                    train_df_fold[label_col].notnull()]
                model = SVR(kernel="linear", C=c, cache_size=3000.0)
                model.fit(train_df_fold[feature_cols_],
                          train_df_fold[label_col])
                y_oof[val_index] = model.predict(
                    valid_df_fold[feature_cols_]).to_array()
                test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                    test_df[feature_cols_])
            train_df[f"{label_col}_pred"] = y_oof
            notnull_idx = train_df[label_col].notnull()
            score = normalized_absolute_errors(
                train_df[notnull_idx][label_col].values,
                train_df[notnull_idx][f"{label_col}_pred"].values,
            )
            logger.info(f"c={c}, {label_col}: {score}")
            if score <= best:
                best = score
                best_c[label_col] = c
    logger.info("=" * 10 + "prediction" + "=" * 10)
    for label_col in label_cols:
        y_oof = np.zeros(train_df.shape[0])
        if label_col == "age":
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for n_fold, (train_index, val_index) in enumerate(
                skf.split(age_rank, age_rank)):
            train_df_fold = train_df.iloc[train_index]
            valid_df_fold = train_df.iloc[val_index]
            train_df_fold = train_df_fold[train_df_fold[label_col].notnull()]
            model = SVR(kernel="linear", C=best_c[label_col],
                        cache_size=3000.0)
            model.fit(train_df_fold[feature_cols_], train_df_fold[label_col])
            y_oof[val_index] = model.predict(
                valid_df_fold[feature_cols_]).to_array()
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols_])
        train_df[f"{label_col}_pred"] = y_oof
        notnull_idx = train_df[label_col].notnull()
        score = normalized_absolute_errors(
            train_df[notnull_idx][label_col].values,
            train_df[notnull_idx][f"{label_col}_pred"].values,
        )
        logger.info(f"c={best_c[label_col]}, {label_col}: {score}")
    score = 0
    for label_col, weight in zip(label_cols,
                                 [0.3, 0.175, 0.175, 0.175, 0.175]):
        notnull_idx = train_df[label_col].notnull()
        score += (normalized_absolute_errors(
            train_df[notnull_idx][label_col].to_array(),
            train_df[notnull_idx][f"{label_col}_pred"].to_array(),
        ) * weight)
    logger.info(f"all: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)
    sub_df = make_submission(test_df)
    sub_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_submission.csv"),
        index=False,
    )
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(config.data.seed)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    train_dfs, names = load_train_data(config.store.workdir)
    test_dfs = load_test_data(config.store.workdir)
    params = {
        "lambda_l1": 0.1, "lambda_l2": 0.1, "num_leaves": 2,
        "feature_fraction": 0.6, "bagging_fraction": 0.6, "bagging_freq": 1,
        "min_child_samples": 10, "task": "train", "boosting_type": "gbdt",
        "objective": "mae", "metric": "normalized_mae", "max_depth": 7,
        "learning_rate": 0.01, "num_thread": 4, "max_bin": 256,
        "verbose": -1, "device": "cpu",
    }
    remove_cols = [
        "knn_age_pred", "knn_domain1_var1",
        "densenet121_age_pred", "densenet121_domain1_var1_pred",
        "densenet121_domain1_var2_pred", "densenet121_domain2_var2_pred",
        "3dcnn_resnet18_domain1_var2_pred",
        "3dcnn_resnet18_domain2_var1_pred",
        "3dcnn_resnet18_domain2_var2_pred",
        "1dresnet18_domain1_var1_pred", "1dresnet18_domain1_var2_pred",
        "1dresnet18_domain2_var2_pred",
        "simple_3dcnn_domain1_var1_pred", "simple_3dcnn_domain1_var2_pred",
        "simple_3dcnn_domain2_var2_pred",
        "transformer_domain2_var1_pred", "transformer_domain2_var2_pred",
        "transformer_domain1_var1_pred", "transformer_domain1_var2_pred",
        "lgbm_gnn_feature_domain1_var2_pred",
        "lgbm_gnn_feature_domain2_var2_pred",
        "lgbm_gnn_featured_domain1_var2_pred",
        "lgbm_gnn_featured_domain2_var2_pred",
        "lgbm_cnn_feature_domain1_var2_pred",
        "lgbm_cnn_feature_domain2_var2_pred",
        "lgbm_2plus1dcnn_feature_domain1_var2_pred",
        "lgbm_2plus1dcnn_feature_domain2_var2_pred",
        "xgb_2plus1dcnn_feature_age_pred",
        "xgb_2plus1dcnn_feature_domain1_var2_pred",
        "xgb_2plus1dcnn_feature_domain2_var2_pred",
        "simple_3dcnn_domain2_var1_pred",
        "simple_3dcnn_3label_domain1_var2_pred",
        "gin_domain1_var1_pred", "gin_domain2_var1_pred",
        "2plus1dcnn_resnet10_domain1_var2_pred",
        "resnest14d_domain1_var1_pred", "resnest14d_domain1_var2_pred",
        "resnest14d_domain2_var2_pred",
    ]
    train_ft_dict = {}
    test_ft_dict = {}
    feature_cols = []
    train_ft_dict["Id"] = train_dfs[0]["Id"]
    test_ft_dict["Id"] = test_dfs[0]["Id"]
    for label_col in label_cols:
        train_ft_dict[label_col] = train_dfs[0][label_col]
    for name, df in zip(names, train_dfs):
        for label_col in label_cols:
            if (f"{label_col}_pred" in df.columns
                    and f"{name}_{label_col}_pred" not in remove_cols):
                train_ft_dict[f"{name}_{label_col}_pred"] = df[
                    f"{label_col}_pred"]
                feature_cols += [f"{name}_{label_col}_pred"]
            elif f"{name}_{label_col}_pred" in remove_cols:
                df.drop(f"{label_col}_pred", axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="train", name=name)
        train_ft_dict.update(feat_dict)
        feature_cols += list(feat_dict.keys())
    for name, df in zip(names, test_dfs):
        for label_col in label_cols:
            for i in range(5):
                if (f"{label_col}_pred_fold{i}" in df.columns
                        and f"{name}_{label_col}_pred" not in remove_cols):
                    test_ft_dict[f"{name}_{label_col}_pred_fold{i}"] = df[
                        f"{label_col}_pred_fold{i}"]
                elif f"{name}_{label_col}_pred" in remove_cols:
                    df.drop(f"{label_col}_pred_fold{i}", axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="test", name=name)
        test_ft_dict.update(feat_dict)
    train_df = pd.DataFrame(train_ft_dict)
    test_df = pd.DataFrame(test_ft_dict)
    train_df["age"] = (
        pd.read_csv(f"{config.store.workdir}/input/train_scores.csv"
                    ).sort_values("Id").reset_index(drop=True)["age"])
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
    for i, (train_index, val_index) in enumerate(
            skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i
    if config.randomize_age:
        set_seed(777777777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    if (not config.is_null_importance
            and not config.is_adversarial_validation
            and not config.is_quantile):
        for label_col in label_cols:
            if not config.is_split_label:
                model = LGBMModel(feature_cols, label_col, params)
            else:
                model = LGBMModel(
                    [col for col in feature_cols if f"{label_col}" in col],
                    label_col,
                    params,
                )
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            test_df[label_col] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
            save_importance(model.importance_df, label_col, config.store)
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"{names} all score: {score}")
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_train.csv"),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_test.csv"),
            index=False,
        )
        if config.store.gcs_project is not None:
            upload_directory(config.store)
        sub_df = make_submission(test_df)
        sub_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_submission.csv",
            ),
            index=False,
        )
    elif config.is_quantile:
        params = {
            "lambda_l1": 0.1, "lambda_l2": 0.1, "num_leaves": 13,
            "feature_fraction": 1.0, "bagging_fraction": 0.6,
            "bagging_freq": 1, "min_child_samples": 10, "task": "train",
            "boosting_type": "gbdt", "objective": "quantile", "alpha": 0.75,
            "metric": None, "max_depth": 7, "learning_rate": 0.01,
            "num_thread": 4, "max_bin": 256, "verbose": -1, "device": "cpu",
        }
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params)
            train_df, test_df = model.cv(train_df, test_df)
            train_df = train_df.rename(
                columns={f"{label_col}_pred": f"{label_col}_pred_upper"})
            for i in range(5):
                test_df = test_df.rename(columns={
                    f"{label_col}_pred_fold{i}":
                    f"{label_col}_pred_fold{i}_upper"
                })
            test_df[f"{label_col}_pred_upper"] = test_df[[
                f"{label_col}_pred_fold{i}_upper" for i in range(5)
            ]].mean(1)
        params["alpha"] = 0.25
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params)
            train_df, test_df = model.cv(train_df, test_df)
            train_df = train_df.rename(
                columns={f"{label_col}_pred": f"{label_col}_pred_lower"})
            for i in range(5):
                test_df = test_df.rename(columns={
                    f"{label_col}_pred_fold{i}":
                    f"{label_col}_pred_fold{i}_lower"
                })
            test_df[f"{label_col}_pred_lower"] = test_df[[
                f"{label_col}_pred_fold{i}_lower" for i in range(5)
            ]].mean(1)
        params["alpha"] = 0.5
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params)
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            test_df[label_col] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
            save_importance(model.importance_df,
                            label_col,
                            config.store,
                            suffix="_quantile")
            test_df[f"{label_col}_pred"] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"{names} all score: {score}")
        train_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_quantile_train.csv",
            ),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_quantile_test.csv"),
            index=False,
        )
    elif config.is_adversarial_validation:
        skf = StratifiedKFold(n_splits=5, random_state=config.data.seed,
                              shuffle=True)
        if True:
            train_df["is_train"] = 1
            test_df["is_train"] = 0
            label_col = "is_train"
            train_df = pd.concat([train_df, test_df],
                                 axis=0).reset_index(drop=True)
        if False:
            site2_ids = pd.read_csv(
                f"{config.store.workdir}/input/reveal_ID_site2.csv"
            )["Id"].values
            train_df.loc[train_df.query("Id in @site2_ids").index,
                         "is_site2"] = 1
            train_df["is_site2"].fillna(0, inplace=True)
            label_col = "is_site2"
        for i, (_, val_index) in enumerate(
                skf.split(train_df, train_df[label_col])):
            train_df.loc[val_index, "fold"] = i
        param = {
            "lambda_l1": 0.1, "lambda_l2": 0.1, "num_leaves": 32,
            "feature_fraction": 0.4, "bagging_fraction": 0.8,
            "bagging_freq": 1, "min_child_samples": 20, "task": "train",
            "boosting_type": "gbdt", "objective": "binary", "metric": "auc",
            "max_depth": 10, "learning_rate": 0.01, "num_thread": -1,
            "max_bin": 256, "verbose": -1, "device": "cpu",
        }
        model = LGBMModel(feature_cols, label_col, param)
        train_df, test_df = model.cv(train_df, test_df)
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
        score = roc_auc_score(train_df[label_col].values,
                              train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        save_importance(model.importance_df, label_col, config.store)
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_adv_val.csv"),
            index=False,
        )