def custom_metric(self, y_pred: np.ndarray,
                   dtrain: xgb.DMatrix) -> Tuple[str, float]:
     y_true = dtrain.get_label().astype(float)
     loss = normalized_absolute_errors(y_true, y_pred)
     return "normalized_mae", loss
def main(config: DictConfig) -> None:
    prepair_dir(config)
    train_df, test_df = load_data(config)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    # From adversarial validation
    # feature_cols.remove("IC_20")
    train_df["age_rank"] = train_df["age"] // 10 * 10
    age_rank = train_df["age_rank"].to_array()
    if config.randomize_age:
        set_seed(100)
        train_df["age"] += [randomize_age(age) for age in train_df["age"]]

    skf = StratifiedKFold(n_splits=5,
                          random_state=config.data.seed,
                          shuffle=True)
    for label_col, k in zip(label_cols, [20, 160, 180, 120, 120]):
        y_oof = np.zeros(train_df.shape[0])
        for n_fold, (train_index,
                     val_index) in enumerate(skf.split(age_rank, age_rank)):
            train_df_fold = train_df.iloc[train_index]
            valid_df_fold = train_df.iloc[val_index]
            train_df_fold = train_df_fold[train_df_fold[label_col].notnull()]
            model = KNeighborsRegressor(n_neighbors=k)
            model.fit(train_df_fold[feature_cols], train_df_fold[label_col])
            y_oof[val_index] = model.predict(
                valid_df_fold[feature_cols]).to_array()
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols])
        train_df[f"{label_col}_pred"] = y_oof
        notnull_idx = train_df[label_col].notnull()
        score = normalized_absolute_errors(
            train_df[notnull_idx][label_col].values,
            train_df[notnull_idx][f"{label_col}_pred"].values,
        )
        logger.info(f"{label_col}, score: {score}")
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
    score = 0
    for label_col, weight in zip(label_cols,
                                 [0.3, 0.175, 0.175, 0.175, 0.175]):
        notnull_idx = train_df[label_col].notnull()
        score += (normalized_absolute_errors(
            train_df[notnull_idx][label_col].to_array(),
            train_df[notnull_idx][f"{label_col}_pred"].to_array(),
        ) * weight)
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)
 def validation_epoch_end(self, outputs):
     preds = np.concatenate(
         [x["preds"].detach().cpu().numpy() for x in outputs], axis=0)
     labels = np.concatenate(
         [x["labels"].detach().cpu().numpy() for x in outputs], axis=0)
     loss = np.mean([x["loss"].detach().cpu().numpy() for x in outputs])
     ids = list(
         chain.from_iterable(
             [x["Id"].detach().cpu().numpy() for x in outputs]))
     label_cols = self.valid_dataset.label_cols
     df_dict = {"Id": ids}
     for i, label_col in enumerate(label_cols):
         df_dict[f"{label_col}_pred"] = preds[:, i]
         df_dict[label_col] = labels[:, i]
     df = pd.DataFrame(df_dict)
     if self.store_config.save_feature:
         feature = np.concatenate(
             [x["feature"].detach().cpu().numpy() for x in outputs], axis=0)
         for i in range(feature.shape[-1]):
             df[f"feature{i}"] = feature[:, i]
     # For handling the log_loss None error
     results = {
         f"{label_col}_nae":
         normalized_absolute_errors(df[label_col].values,
                                    df[f"{label_col}_pred"].values)
         for label_col in label_cols
     }
     avg_score = weighted_normalized_absolute_errors(
         df[label_cols].values,
         df[[f"{col}_pred" for col in label_cols]].values,
         weights=self.data_config.weights,
     ).astype(np.float32)
     if self.use_ddp:
         metrics = {"avg_loss": loss, "avg_score": avg_score}
         world_size = dist.get_world_size()
         rank = dist.get_rank()
         aggregated_metrics = {}
         for metric_name, metric_val in metrics.items():
             metric_tensor = torch.tensor(metric_val).to(f"cuda:{rank}")
             dist.barrier()
             dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
             reduced_metric = metric_tensor.item() / world_size
             aggregated_metrics[metric_name] = reduced_metric
         loss = aggregated_metrics["avg_loss"]
         avg_score = aggregated_metrics["avg_score"]
     else:
         rank = 0
     res = {}
     res["step"] = int(self.global_step)
     res["epoch"] = int(self.current_epoch)
     if avg_score <= self.best_score:
         self.best_score = avg_score
         self.save_flg = True
         res["best_score"] = float(self.best_score)
         df.to_csv(
             os.path.join(self.store_config.result_path,
                          f"valid_result_{rank}.csv"),
             index=False,
         )
         with open(
                 os.path.join(self.store_config.log_path,
                              "best_score.yaml"), "w") as f:
             yaml.dump(res, f, default_flow_style=False)
     metrics = {}
     metrics["progress_bar"] = {
         "val_loss": avg_score,
         "avg_val_score": torch.tensor(avg_score),
         "best_score": self.best_score,
         "lr": self.optimizer.param_groups[0]["lr"],
     }
     metrics["progress_bar"].update(results)
     metrics["log"] = {
         "val_loss": avg_score,
         "avg_val_score": torch.tensor(avg_score),
         "best_score": self.best_score,
     }
     metrics["log"].update(results)
     return metrics
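validation_epoch_end also relies on weighted_normalized_absolute_errors. A minimal sketch, assuming it is the column-wise version of the metric above combined with the competition weights (age 0.3, each domain score 0.175); the weights argument mirrors self.data_config.weights:

import numpy as np

def weighted_normalized_absolute_errors(y_true, y_pred,
                                        weights=(0.3, 0.175, 0.175, 0.175, 0.175)):
    # Assumed definition: per-column normalized MAE, weighted and summed.
    # NaN labels are masked out, matching the notnull handling elsewhere.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    score = 0.0
    for i, w in enumerate(weights):
        mask = ~np.isnan(y_true[:, i])
        score += w * (np.sum(np.abs(y_true[mask, i] - y_pred[mask, i]))
                      / np.sum(y_true[mask, i]))
    return np.float64(score)  # numpy scalar, so the .astype(np.float32) above works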
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(777777)
    train_df, test_df = load_data(config)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    # From adversarial validation
    feature_cols.remove("IC_20")
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5,
                          random_state=config.data.seed,
                          shuffle=True)
    for i, (_,
            val_index) in enumerate(skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i
    if config.randomize_age:
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])

    params = {}
    params["age"] = {
        "alpha": 0.8,
        "reg_lambda": 0.8,
        "max_leaves": 3,
        "colsample_bytree": 0.6,
        "subsample": 0.8,
        "min_child_weight": 50,
        "booster": "gbtree",
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "max_depth": 10,
        "learning_rate": 0.01,
        "nthread": -1,
        "max_bin": 256,
        "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain1_var1"] = {
        "alpha": 0.8,
        "reg_lambda": 0.8,
        "max_leaves": 4,
        "colsample_bytree": 0.5,
        "subsample": 0.8,
        "min_child_weight": 5,
        "booster": "gbtree",
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "nthread": -1,
        "max_bin": 256,
        "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain1_var2"] = {
        "alpha": 0.8,
        "reg_lambda": 0.8,
        "max_leaves": 4,
        "colsample_bytree": 0.8,
        "subsample": 0.6,
        "min_child_weight": 5,
        "booster": "gbtree",
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "nthread": -1,
        "max_bin": 256,
        "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain2_var1"] = {
        "alpha": 0.8,
        "reg_lambda": 0.8,
        "max_leaves": 4,
        "colsample_bytree": 0.7,
        "subsample": 0.4,
        "min_child_weight": 5,
        "booster": "gbtree",
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "nthread": -1,
        "max_bin": 256,
        "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    params["domain2_var2"] = {
        "alpha": 0.8,
        "reg_lambda": 0.8,
        "max_leaves": 4,
        "colsample_bytree": 0.7,
        "subsample": 0.4,
        "min_child_weight": 5,
        "booster": "gbtree",
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "nthread": -1,
        "max_bin": 256,
        "tree_method": "gpu_hist",
        "disable_default_eval_metric": 1,
    }
    for label_col in label_cols:
        model = XGBModel(feature_cols, label_col, params[label_col])
        train_df, test_df = model.cv(train_df, test_df)
        score = normalized_absolute_errors(
            train_df[label_col].values, train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        # test_df[label_col] = test_df[
        #     [f"{label_col}_pred_fold{i}" for i in range(5)]
        # ].mean(1)
        save_importance(model.importance_df, label_col, config.store)
    score = weighted_normalized_absolute_errors(
        train_df[label_cols].values,
        train_df[[f"{label_col}_pred" for label_col in label_cols]].values,
    )
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    # with pseudo label
    for label_col in label_cols:
        feature_cols = list(
            pd.read_csv(
                f"{config.store.workdir}/output/{config.store.model_name}/result/importance_{label_col}.csv"
            )["index"].values[:1024])
        model = XGBModel(feature_cols, label_col, params[label_col])
        train_df, test_df = model.cv(train_df, test_df)
        score = normalized_absolute_errors(
            train_df[label_col].values, train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
        save_importance(
            model.importance_df,
            label_col,
            config.store,
            suffix=f"_feature_section{config.n_feature}",
        )
    score = weighted_normalized_absolute_errors(
        train_df[label_cols].values,
        train_df[[f"{label_col}_pred"
                  for label_col in label_cols]].values.copy(),
    )
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(
            config.store.result_path,
            f"{config.store.model_name}_feature_section{config.n_feature}_train.csv",
        ),
        index=False,
    )
    test_df.to_csv(
        os.path.join(
            config.store.result_path,
            f"{config.store.model_name}_feature_section{config.n_feature}_test.csv",
        ),
        index=False,
    )
    sub_df = make_submission(test_df)
    sub_df.to_csv(
        os.path.join(
            config.store.result_path,
            f"{config.store.model_name}_feature_section{config.n_feature}_submission.csv",
        ),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)
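XGBModel itself is not shown on this page; the calls above only pin down its interface (a constructor taking feature_cols, label_col, params; a cv() method returning both frames with "{label}_pred" out-of-fold and "{label}_pred_fold{i}" test columns; an importance_df attribute). A hypothetical sketch under those assumptions, not the repository's actual class:

import pandas as pd
import xgboost as xgb


class XGBModel:
    def __init__(self, feature_cols, label_col, params, num_boost_round=10000):
        self.feature_cols = feature_cols
        self.label_col = label_col
        self.params = params
        self.num_boost_round = num_boost_round
        self.importance_df = pd.DataFrame()

    def cv(self, train_df, test_df):
        # Assumes a "fold" column as assigned in main(); the repository
        # presumably also passes custom_metric via feval, to match
        # disable_default_eval_metric in the params above.
        dtest = xgb.DMatrix(test_df[self.feature_cols])
        for n_fold in range(5):
            trn = train_df[train_df["fold"] != n_fold]
            trn = trn[trn[self.label_col].notnull()]
            val = train_df[train_df["fold"] == n_fold]
            dtrain = xgb.DMatrix(trn[self.feature_cols],
                                 label=trn[self.label_col])
            dvalid = xgb.DMatrix(val[self.feature_cols],
                                 label=val[self.label_col])
            booster = xgb.train(self.params, dtrain,
                                num_boost_round=self.num_boost_round,
                                evals=[(dvalid, "valid")],
                                early_stopping_rounds=100,
                                verbose_eval=False)
            # Out-of-fold predictions for train, per-fold columns for test.
            train_df.loc[val.index,
                         f"{self.label_col}_pred"] = booster.predict(dvalid)
            test_df[f"{self.label_col}_pred_fold{n_fold}"] = booster.predict(dtest)
            imp = pd.DataFrame(
                list(booster.get_score(importance_type="gain").items()),
                columns=["index", "importance"])
            self.importance_df = pd.concat([self.importance_df, imp],
                                           ignore_index=True)
        return train_df, test_df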
Example #5
def main(config: DictConfig) -> None:
    prepair_dir(config)
    train_df, test_df = load_data(config)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5,
                          random_state=config.data.seed,
                          shuffle=True)
    for i, (_,
            val_index) in enumerate(skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i
    if config.randomize_age:
        set_seed(100)
        train_df["age"] += [randomize_age(age) for age in train_df["age"]]

    for label_col in label_cols:
        best_score = np.inf
        best_alpha = 0.0
        best_pred = np.zeros([train_df.shape[0]])
        for alpha in [0.01, 0.001, 0.0003, 0.0001]:
            for n_fold in range(5):
                if not config.use_bagging:
                    model = Ridge(alpha=alpha)
                else:
                    model = BaggingRegressor(
                        Ridge(alpha=alpha),
                        n_estimators=30,
                        random_state=42,
                        max_samples=0.3,
                        max_features=0.3,
                    )
                X_train = train_df.query("fold!=@n_fold")[feature_cols]
                y_train = train_df.query("fold!=@n_fold")[label_col]
                X_train = X_train[y_train.notnull()]
                y_train = y_train[y_train.notnull()]
                model.fit(X_train, y_train)
                train_df.loc[train_df.query("fold==@n_fold").index,
                             f"{label_col}_pred"] = model.predict(
                                 train_df.query("fold==@n_fold")[feature_cols])
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} alpha: {alpha}, score: {score}")
            if score <= best_score:
                best_score = score
                best_alpha = alpha
                best_pred[:] = train_df[f"{label_col}_pred"].values
        train_df[f"{label_col}_pred"] = best_pred
        for n_fold in range(5):
            if not config.use_bagging:
                model = Ridge(alpha=best_alpha)
            else:
                model = BaggingRegressor(
                    Ridge(alpha=best_alpha),
                    n_estimators=30,
                    random_state=42,
                    max_samples=0.3,
                    max_features=0.3,
                )
            X_train = train_df.query("fold!=@n_fold")[feature_cols]
            y_train = train_df.query("fold!=@n_fold")[label_col]
            X_train = X_train[y_train.notnull()]
            y_train = y_train[y_train.notnull()]
            model.fit(X_train, y_train)
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols])

        score = normalized_absolute_errors(
            train_df[label_col].values, train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} alpha: {best_alpha}, score: {score}")
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
    score = weighted_normalized_absolute_errors(
        train_df[label_cols].values,
        train_df[[f"{label_col}_pred" for label_col in label_cols]].values,
    )
    logger.info(f"all score: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)
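randomize_age is another helper defined elsewhere; the call sites only show that it maps an age to an additive offset. A minimal sketch, assuming a small uniform jitter so the coarsely binned train ages cannot be matched exactly (the actual offset distribution is an assumption here):

import numpy as np

def randomize_age(age: float) -> float:
    # Assumed behavior: small random offset added to each age label.
    # Hypothetical; the repository's real distribution is not shown here.
    return float(np.random.uniform(-2.5, 2.5))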
Example #6
 def custom_metric(
         self, y_pred: np.ndarray,
         dtrain: lgb.basic.Dataset) -> Tuple[str, float, bool]:
     y_true = dtrain.get_label().astype(float)
     loss = normalized_absolute_errors(y_true, y_pred)
     return "normalized_mae", loss, False
Example #7
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(777)
    train_df, test_df = load_data(config)
    label_cols = [
        "age",
        "domain1_var1",
        "domain1_var2",
        "domain2_var1",
        "domain2_var2",
    ]
    feature_cols = [
        col for col in train_df.columns if col not in label_cols + ["Id"]
    ]
    # From adversarial validation
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5,
                          random_state=config.data.seed,
                          shuffle=True)
    for i, (_,
            val_index) in enumerate(skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i
    if config.randomize_age:
        set_seed(777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])

    params = {}
    params["age"] = {
        "lambda_l1": 0.8,
        "lambda_l2": 0.8,
        "num_leaves": 2,
        "feature_fraction": 0.4,
        "bagging_fraction": 0.6,
        "bagging_freq": 1,
        "min_child_samples": 20,
        "task": "train",
        "boosting_type": "gbdt",
        "objective": "mae",
        "metric": "normalized_mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "num_thread": 4,
        "max_bin": 256,
        "verbose": -1,
        "device": "cpu",
    }
    params["domain1_var1"] = {
        "lambda_l1": 0.8,
        "lambda_l2": 0.8,
        "num_leaves": 4,
        "feature_fraction": 0.5,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "min_child_samples": 5,
        "task": "train",
        "boosting_type": "gbdt",
        "objective": "mae",
        "metric": "normalized_mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "num_thread": 4,
        "max_bin": 256,
        "verbose": -1,
        "device": "cpu",
    }
    params["domain1_var2"] = {
        "lambda_l1": 0.8,
        "lambda_l2": 0.8,
        "num_leaves": 4,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.6,
        "bagging_freq": 6,
        "min_child_samples": 5,
        "task": "train",
        "boosting_type": "gbdt",
        "objective": "mae",
        "metric": "normalized_mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "num_thread": 4,
        "max_bin": 256,
        "verbose": -1,
        "device": "cpu",
    }
    params["domain2_var1"] = {
        "lambda_l1": 0.8,
        "lambda_l2": 0.8,
        "num_leaves": 4,
        "feature_fraction": 0.7,
        "bagging_fraction": 0.4,
        "bagging_freq": 1,
        "min_child_samples": 5,
        "task": "train",
        "boosting_type": "gbdt",
        "objective": "mae",
        "metric": "normalized_mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "num_thread": 4,
        "max_bin": 256,
        "verbose": -1,
        "device": "cpu",
    }
    params["domain2_var2"] = {
        "lambda_l1": 0.8,
        "lambda_l2": 0.8,
        "num_leaves": 4,
        "feature_fraction": 0.4,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "min_child_samples": 5,
        "task": "train",
        "boosting_type": "gbdt",
        "objective": "mae",
        "metric": "normalized_mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "num_thread": 4,
        "max_bin": 256,
        "verbose": -1,
        "device": "cpu",
    }
    if not config.is_null_importance and not config.is_adversarial_validation:
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params[label_col])
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            # test_df[label_col] = test_df[
            #     [f"{label_col}_pred_fold{i}" for i in range(5)]
            # ].mean(1)
            save_importance(model.importance_df, label_col, config.store)
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"all score: {score}")
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_train.csv"),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_test.csv"),
            index=False,
        )
        if config.store.gcs_project is not None:
            upload_directory(config.store)

        # Feature Selection
        for label_col in label_cols:
            feature_cols = list(
                pd.read_csv(
                    f"{config.store.workdir}/output/{config.store.model_name}/result/importance_{label_col}.csv"
                )["index"].values[:config.n_feature])
            model = LGBMModel(feature_cols, label_col, params[label_col])
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            test_df[label_col] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
            save_importance(
                model.importance_df,
                label_col,
                config.store,
                suffix=f"_feature_section{config.n_feature}",
            )
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"all score: {score}")
        train_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_feature_section{config.n_feature}_train.csv",
            ),
            index=False,
        )
        test_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_feature_section{config.n_feature}_test.csv",
            ),
            index=False,
        )
        sub_df = make_submission(test_df)
        sub_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_feature_section{config.n_feature}_submission.csv",
            ),
            index=False,
        )
        if config.store.gcs_project is not None:
            upload_directory(config.store)

    elif config.is_adversarial_validation:
        skf = StratifiedKFold(n_splits=5,
                              random_state=config.data.seed,
                              shuffle=True)
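        # Manual toggles: the "if True:" branch (train-vs-test adversarial
        # validation on is_train) is active; the "if False:" branch (site2
        # detection via is_site2) is kept for reference but disabled.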
        if True:
            train_df["is_train"] = 1
            test_df["is_train"] = 0
            label_col = "is_train"
            train_df = pd.concat([train_df, test_df],
                                 axis=0).reset_index(drop=True)
        if False:
            site2_ids = pd.read_csv(
                f"{config.store.workdir}/input/reveal_ID_site2.csv"
            )["Id"].values
            test_df.loc[test_df.query("Id in @site2_ids").index,
                        "is_site2"] = 1
            train_df = pd.concat(
                [train_df, test_df.query("is_site2==1")],
                axis=0).reset_index(drop=True)
            train_df["is_site2"].fillna(0, inplace=True)
            label_col = "is_site2"

        for i, (_, val_index) in enumerate(
                skf.split(train_df, train_df[label_col])):
            train_df.loc[val_index, "fold"] = i
            print(i, train_df.loc[val_index, label_col].sum())
        param = {
            "lambda_l1": 0.1,
            "lambda_l2": 0.1,
            "num_leaves": 32,
            "feature_fraction": 0.4,
            "bagging_fraction": 0.8,
            "bagging_freq": 1,
            "min_child_samples": 20,
            "task": "train",
            "boosting_type": "gbdt",
            "objective": "binary",
            "metric": "auc",
            "max_depth": 10,
            "learning_rate": 0.01,
            "num_thread": 4,
            "max_bin": 256,
            "verbose": -1,
            "device": "cpu",
        }
        model = LGBMModel(feature_cols, label_col, param)
        train_df, test_df = model.cv(train_df, test_df)
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
        score = roc_auc_score(train_df[label_col].values,
                              train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        save_importance(model.importance_df, label_col, config.store)
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_adv_val.csv"),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_adv_val_test.csv"),
            index=False,
        )
    else:
        for label_col in label_cols:
            null_imp_df = pd.DataFrame()
            for i in range(config.n_run):
                train_df[label_col] = np.random.permutation(
                    train_df[label_col])
                model = LGBMModel(feature_cols, label_col, params[label_col])
                train_df, test_df = model.cv(train_df, test_df)
                score = normalized_absolute_errors(
                    train_df[label_col].values,
                    train_df[f"{label_col}_pred"].values)
                logger.info(f"{label_col} score: {score}")
                test_df[label_col] = test_df[[
                    f"{label_col}_pred_fold{i}" for i in range(5)
                ]].mean(1)
                importance_df = model.importance_df
                importance_df["run"] = i + 1
                null_imp_df = pd.concat([null_imp_df, importance_df])
            save_importance(null_imp_df, label_col, config.store)
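The null-importance loop above refits the model on a permuted target config.n_run times and saves the pooled importances; features whose real importance does not clear the null distribution are candidates for removal. A minimal scoring sketch, assuming a hypothetical actual_imp_df from an unshuffled fit and the null_imp_df saved above, both with "index" and "importance" columns:

import numpy as np
import pandas as pd

def null_importance_score(actual_imp_df: pd.DataFrame,
                          null_imp_df: pd.DataFrame) -> pd.Series:
    # Log-ratio of actual gain to the 75th percentile of the null runs;
    # scores near or below zero mean the feature did no better than chance.
    scores = {}
    for feature, null_imp in null_imp_df.groupby("index")["importance"]:
        actual = actual_imp_df.loc[actual_imp_df["index"] == feature,
                                   "importance"].mean()
        scores[feature] = np.log(1e-10 + actual /
                                 (1.0 + np.percentile(null_imp, 75)))
    return pd.Series(scores).sort_values(ascending=False)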
Example #8
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(config.data.seed)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    train_dfs, names = load_train_data(config.store.workdir)
    test_dfs = load_test_data(config.store.workdir)
    remove_cols = [
        "knn_age_pred",
        "knn_domain1_var1",
        "densenet121_age_pred",
        "densenet121_domain1_var1_pred",
        "densenet121_domain1_var2_pred",
        "densenet121_domain2_var2_pred",
        "3dcnn_resnet18_domain1_var2_pred",
        "3dcnn_resnet18_domain2_var1_pred",
        "3dcnn_resnet18_domain2_var2_pred",
        "1dresnet18_domain1_var1_pred",
        "1dresnet18_domain1_var2_pred",
        "1dresnet18_domain2_var2_pred",
        "simple_3dcnn_domain1_var1_pred",
        "simple_3dcnn_domain1_var2_pred",
        "simple_3dcnn_domain2_var2_pred",
        "transformer_domain2_var1_pred",
        "transformer_domain2_var2_pred",
        "transformer_domain1_var1_pred",
        "transformer_domain1_var2_pred",
        "lgbm_gnn_feature_domain1_var2_pred",
        "lgbm_gnn_feature_domain2_var2_pred",
        "lgbm_gnn_featured_domain1_var2_pred",
        "lgbm_gnn_featured_domain2_var2_pred",
        "lgbm_cnn_feature_domain1_var2_pred",
        "lgbm_cnn_feature_domain2_var2_pred",
        "lgbm_2plus1dcnn_feature_domain1_var2_pred",
        "lgbm_2plus1dcnn_feature_domain2_var2_pred",
        "xgb_2plus1dcnn_feature_age_pred",
        "xgb_2plus1dcnn_feature_domain1_var2_pred",
        "xgb_2plus1dcnn_feature_domain2_var2_pred",
        "simple_3dcnn_domain2_var1_pred",
        "simple_3dcnn_3label_domain1_var2_pred",
        "gin_domain1_var1_pred",
        "gin_domain2_var1_pred",
        "2plus1dcnn_resnet10_domain1_var2_pred",
        "resnest14d_domain1_var1_pred",
        "resnest14d_domain1_var2_pred",
        "resnest14d_domain2_var2_pred",
    ]
    train_ft_dict = {}
    test_ft_dict = {}
    feature_cols = []
    train_ft_dict["Id"] = train_dfs[0]["Id"]
    test_ft_dict["Id"] = test_dfs[0]["Id"]
    for label_col in label_cols:
        train_ft_dict[label_col] = train_dfs[0][label_col]
    for name, df in zip(names, train_dfs):
        for label_col in label_cols:
            if (f"{label_col}_pred" in df.columns
                    and f"{name}_{label_col}_pred" not in remove_cols):
                train_ft_dict[f"{name}_{label_col}_pred"] = df[
                    f"{label_col}_pred"]
                feature_cols += [f"{name}_{label_col}_pred"]
            elif f"{name}_{label_col}_pred" in remove_cols:
                df.drop(f"{label_col}_pred", axis=1, inplace=True)

        feat_dict = make_domain_feature(df, mode="train", name=name)
        train_ft_dict.update(feat_dict)
        feature_cols += list(feat_dict.keys())

    for name, df in zip(names, test_dfs):
        for label_col in label_cols:
            for i in range(5):
                if (f"{label_col}_pred_fold{i}" in df.columns
                        and f"{name}_{label_col}_pred" not in remove_cols):
                    test_ft_dict[f"{name}_{label_col}_pred_fold{i}"] = df[
                        f"{label_col}_pred_fold{i}"]
                elif (f"{name}_{label_col}_pred" in remove_cols
                      and f"{label_col}_pred_fold{i}" in df.columns):
                    df.drop(f"{label_col}_pred_fold{i}", axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="test", name=name)
        test_ft_dict.update(feat_dict)
    train_df = pd.DataFrame(train_ft_dict)
    test_df = pd.DataFrame(test_ft_dict)
    train_df["age"] = (
        pd.read_csv(f"{config.store.workdir}/input/train_scores.csv"
                    ).sort_values("Id").reset_index(drop=True)["age"])
    age_rank = train_df["age"].values // 10 * 10

    train_df, test_df = preprocess(train_df, test_df, feature_cols)
    for feature_col in feature_cols:
        train_df[feature_col].fillna(0, inplace=True)
        test_df[feature_col].fillna(0, inplace=True)
    train_df = cudf.from_pandas(train_df)
    test_df = cudf.from_pandas(test_df)
    if config.randomize_age:
        set_seed(777_777_777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
    train_df = train_df.reset_index(drop=True)
    logger.info("=" * 10 + "parameter search" + "=" * 10)
    best_c = {}
    for label_col in label_cols:
        best = np.inf
        if label_col == "age":
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for c in [2**(i) for i in range(-14, 1)]:
            y_oof = np.zeros(train_df.shape[0])
            for n_fold, (train_index,
                         val_index) in enumerate(skf.split(age_rank,
                                                           age_rank)):
                train_df_fold = train_df.iloc[train_index]
                valid_df_fold = train_df.iloc[val_index]
                train_df_fold = train_df_fold[
                    train_df_fold[label_col].notnull()]
                model = SVR(kernel="linear", C=c, cache_size=3000.0)
                model.fit(train_df_fold[feature_cols_],
                          train_df_fold[label_col])
                y_oof[val_index] = model.predict(
                    valid_df_fold[feature_cols_]).to_array()
                test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                    test_df[feature_cols_])
            train_df[f"{label_col}_pred"] = y_oof
            notnull_idx = train_df[label_col].notnull()
            score = normalized_absolute_errors(
                train_df[notnull_idx][label_col].values,
                train_df[notnull_idx][f"{label_col}_pred"].values,
            )
            logger.info(f"c={c}, {label_col}: {score}")
            if score <= best:
                best = score
                best_c[label_col] = c
    logger.info("=" * 10 + "prediction" + "=" * 10)
    for label_col in label_cols:
        y_oof = np.zeros(train_df.shape[0])
        if label_col == "age":
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for n_fold, (train_index,
                     val_index) in enumerate(skf.split(age_rank, age_rank)):
            train_df_fold = train_df.iloc[train_index]
            valid_df_fold = train_df.iloc[val_index]
            train_df_fold = train_df_fold[train_df_fold[label_col].notnull()]
            model = SVR(kernel="linear",
                        C=best_c[label_col],
                        cache_size=3000.0)
            model.fit(train_df_fold[feature_cols_], train_df_fold[label_col])
            y_oof[val_index] = model.predict(
                valid_df_fold[feature_cols_]).to_array()
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols_])
        train_df[f"{label_col}_pred"] = y_oof
        notnull_idx = train_df[label_col].notnull()
        score = normalized_absolute_errors(
            train_df[notnull_idx][label_col].values,
            train_df[notnull_idx][f"{label_col}_pred"].values,
        )
        logger.info(f"c={c}, {label_col}: {score}")
    score = 0
    for label_col, weight in zip(label_cols,
                                 [0.3, 0.175, 0.175, 0.175, 0.175]):
        notnull_idx = train_df[label_col].notnull()
        score += (normalized_absolute_errors(
            train_df[notnull_idx][label_col].to_array(),
            train_df[notnull_idx][f"{label_col}_pred"].to_array(),
        ) * weight)
    logger.info(f"all: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)

    sub_df = make_submission(test_df)
    sub_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_submission.csv"),
        index=False,
    )
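make_submission is also defined elsewhere; the competition expects a long-form file with one Id_target row per prediction. A plausible sketch under that assumption (plain pandas; the cuDF frame above would need to_pandas() first or cuDF's own melt):

import pandas as pd

def make_submission(test_df: pd.DataFrame) -> pd.DataFrame:
    # Melt the five target columns into the "10003_age"-style long format
    # with the required Id and Predicted columns.
    label_cols = ["age", "domain1_var1", "domain1_var2",
                  "domain2_var1", "domain2_var2"]
    sub_df = test_df[["Id"] + label_cols].melt(
        id_vars=["Id"], value_vars=label_cols,
        var_name="target", value_name="Predicted")
    sub_df["Id"] = sub_df["Id"].astype(str) + "_" + sub_df["target"]
    return sub_df[["Id", "Predicted"]].sort_values("Id").reset_index(drop=True)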
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(config.data.seed)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    train_dfs, names = load_train_data(config.store.workdir)
    test_dfs = load_test_data(config.store.workdir)
    params = {
        "lambda_l1": 0.1,
        "lambda_l2": 0.1,
        "num_leaves": 2,
        "feature_fraction": 0.6,
        "bagging_fraction": 0.6,
        "bagging_freq": 1,
        "min_child_samples": 10,
        "task": "train",
        "boosting_type": "gbdt",
        "objective": "mae",
        "metric": "normalized_mae",
        "max_depth": 7,
        "learning_rate": 0.01,
        "num_thread": 4,
        "max_bin": 256,
        "verbose": -1,
        "device": "cpu",
    }
    remove_cols = [
        "knn_age_pred",
        "knn_domain1_var1",
        "densenet121_age_pred",
        "densenet121_domain1_var1_pred",
        "densenet121_domain1_var2_pred",
        "densenet121_domain2_var2_pred",
        "3dcnn_resnet18_domain1_var2_pred",
        "3dcnn_resnet18_domain2_var1_pred",
        "3dcnn_resnet18_domain2_var2_pred",
        "1dresnet18_domain1_var1_pred",
        "1dresnet18_domain1_var2_pred",
        "1dresnet18_domain2_var2_pred",
        "simple_3dcnn_domain1_var1_pred",
        "simple_3dcnn_domain1_var2_pred",
        "simple_3dcnn_domain2_var2_pred",
        "transformer_domain2_var1_pred",
        "transformer_domain2_var2_pred",
        "transformer_domain1_var1_pred",
        "transformer_domain1_var2_pred",
        "lgbm_gnn_feature_domain1_var2_pred",
        "lgbm_gnn_feature_domain2_var2_pred",
        "lgbm_gnn_featured_domain1_var2_pred",
        "lgbm_gnn_featured_domain2_var2_pred",
        "lgbm_cnn_feature_domain1_var2_pred",
        "lgbm_cnn_feature_domain2_var2_pred",
        "lgbm_2plus1dcnn_feature_domain1_var2_pred",
        "lgbm_2plus1dcnn_feature_domain2_var2_pred",
        "xgb_2plus1dcnn_feature_age_pred",
        "xgb_2plus1dcnn_feature_domain1_var2_pred",
        "xgb_2plus1dcnn_feature_domain2_var2_pred",
        "simple_3dcnn_domain2_var1_pred",
        "simple_3dcnn_3label_domain1_var2_pred",
        "gin_domain1_var1_pred",
        "gin_domain2_var1_pred",
        "2plus1dcnn_resnet10_domain1_var2_pred",
        "resnest14d_domain1_var1_pred",
        "resnest14d_domain1_var2_pred",
        "resnest14d_domain2_var2_pred",
    ]

    train_ft_dict = {}
    test_ft_dict = {}
    feature_cols = []
    train_ft_dict["Id"] = train_dfs[0]["Id"]
    test_ft_dict["Id"] = test_dfs[0]["Id"]
    for label_col in label_cols:
        train_ft_dict[label_col] = train_dfs[0][label_col]
    for name, df in zip(names, train_dfs):
        for label_col in label_cols:
            if (f"{label_col}_pred" in df.columns
                    and f"{name}_{label_col}_pred" not in remove_cols):
                train_ft_dict[f"{name}_{label_col}_pred"] = df[
                    f"{label_col}_pred"]
                feature_cols += [f"{name}_{label_col}_pred"]
            elif f"{name}_{label_col}_pred" in remove_cols:
                df.drop(f"{label_col}_pred", axis=1, inplace=True)

        feat_dict = make_domain_feature(df, mode="train", name=name)
        train_ft_dict.update(feat_dict)
        feature_cols += list(feat_dict.keys())

    for name, df in zip(names, test_dfs):
        for label_col in label_cols:
            for i in range(5):
                if (f"{label_col}_pred_fold{i}" in df.columns
                        and f"{name}_{label_col}_pred" not in remove_cols):
                    test_ft_dict[f"{name}_{label_col}_pred_fold{i}"] = df[
                        f"{label_col}_pred_fold{i}"]
                elif f"{name}_{label_col}_pred" in remove_cols:
                    df.drop(f"{label_col}_pred_fold{i}", axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="test", name=name)
        test_ft_dict.update(feat_dict)
    train_df = pd.DataFrame(train_ft_dict)
    test_df = pd.DataFrame(test_ft_dict)
    train_df["age"] = (
        pd.read_csv(f"{config.store.workdir}/input/train_scores.csv"
                    ).sort_values("Id").reset_index(drop=True)["age"])
    train_df["age_rank"] = train_df["age"] // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
    for i, (_,
            val_index) in enumerate(skf.split(train_df, train_df["age_rank"])):
        train_df.loc[val_index, "fold"] = i

    if config.randomize_age:
        set_seed(777777777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    if (not config.is_null_importance and not config.is_adversarial_validation
            and not config.is_quantile):
        for label_col in label_cols:
            if not config.is_split_label:
                model = LGBMModel(feature_cols, label_col, params)
            else:
                model = LGBMModel(
                    [col for col in feature_cols if f"{label_col}" in col],
                    label_col,
                    params,
                )
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            test_df[label_col] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
            save_importance(model.importance_df, label_col, config.store)
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"{names} all score: {score}")
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_train.csv"),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_test.csv"),
            index=False,
        )
        if config.store.gcs_project is not None:
            upload_directory(config.store)
        sub_df = make_submission(test_df)
        sub_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_submission.csv",
            ),
            index=False,
        )
    elif config.is_quantile:
        params = {
            "lambda_l1": 0.1,
            "lambda_l2": 0.1,
            "num_leaves": 13,
            "feature_fraction": 1.0,
            "bagging_fraction": 0.6,
            "bagging_freq": 1,
            "min_child_samples": 10,
            "task": "train",
            "boosting_type": "gbdt",
            "objective": "quantile",
            "alpha": 0.75,
            "metric": None,
            "max_depth": 7,
            "learning_rate": 0.01,
            "num_thread": 4,
            "max_bin": 256,
            "verbose": -1,
            "device": "cpu",
        }
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params)
            train_df, test_df = model.cv(train_df, test_df)
            train_df = train_df.rename(
                columns={f"{label_col}_pred": f"{label_col}_pred_upper"})
            for i in range(5):
                test_df = test_df.rename(columns={
                    f"{label_col}_pred_fold{i}":
                    f"{label_col}_pred_fold{i}_upper"
                })
            test_df[f"{label_col}_pred_upper"] = test_df[[
                f"{label_col}_pred_fold{i}_upper" for i in range(5)
            ]].mean(1)
        params["alpha"] = 0.25
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params)
            train_df, test_df = model.cv(train_df, test_df)
            train_df = train_df.rename(
                columns={f"{label_col}_pred": f"{label_col}_pred_lower"})
            for i in range(5):
                test_df = test_df.rename(columns={
                    f"{label_col}_pred_fold{i}":
                    f"{label_col}_pred_fold{i}_lower"
                })
            test_df[f"{label_col}_pred_lower"] = test_df[[
                f"{label_col}_pred_fold{i}_lower" for i in range(5)
            ]].mean(1)
        params["alpha"] = 0.5
        for label_col in label_cols:
            model = LGBMModel(feature_cols, label_col, params)
            train_df, test_df = model.cv(train_df, test_df)
            score = normalized_absolute_errors(
                train_df[label_col].values,
                train_df[f"{label_col}_pred"].values)
            logger.info(f"{label_col} score: {score}")
            test_df[label_col] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
            save_importance(model.importance_df,
                            label_col,
                            config.store,
                            suffix="_quantile")
            test_df[f"{label_col}_pred"] = test_df[[
                f"{label_col}_pred_fold{i}" for i in range(5)
            ]].mean(1)
        score = weighted_normalized_absolute_errors(
            train_df[label_cols].values,
            train_df[[f"{label_col}_pred"
                      for label_col in label_cols]].values.copy(),
        )
        logger.info(f"{names} all score: {score}")
        train_df.to_csv(
            os.path.join(
                config.store.result_path,
                f"{config.store.model_name}_quantile_train.csv",
            ),
            index=False,
        )
        test_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_quantile_test.csv"),
            index=False,
        )

    elif config.is_adversarial_validation:
        skf = StratifiedKFold(n_splits=5,
                              random_state=config.data.seed,
                              shuffle=True)
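        # Same manual toggles as in the earlier adversarial-validation block:
        # is_train (active) vs. the disabled is_site2 variant below.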
        if True:
            train_df["is_train"] = 1
            test_df["is_train"] = 0
            label_col = "is_train"
        train_df = pd.concat([train_df, test_df],
                             axis=0).reset_index(drop=True)
        if False:
            site2_ids = pd.read_csv(
                f"{config.store.workdir}/input/reveal_ID_site2.csv"
            )["Id"].values
            train_df.loc[train_df.query("Id in @site2_ids").index,
                         "is_site2"] = 1
            train_df["is_site2"].fillna(0, inplace=True)
            label_col = "is_site2"

        for i, (_, val_index) in enumerate(
                skf.split(train_df, train_df[label_col])):
            train_df.loc[val_index, "fold"] = i
        param = {
            "lambda_l1": 0.1,
            "lambda_l2": 0.1,
            "num_leaves": 32,
            "feature_fraction": 0.4,
            "bagging_fraction": 0.8,
            "bagging_freq": 1,
            "min_child_samples": 20,
            "task": "train",
            "boosting_type": "gbdt",
            "objective": "binary",
            "metric": "auc",
            "max_depth": 10,
            "learning_rate": 0.01,
            "num_thread": -1,
            "max_bin": 256,
            "verbose": -1,
            "device": "cpu",
        }
        model = LGBMModel(feature_cols, label_col, param)
        train_df, test_df = model.cv(train_df, test_df)
        test_df[label_col] = test_df[[
            f"{label_col}_pred_fold{i}" for i in range(5)
        ]].mean(1)
        score = roc_auc_score(train_df[label_col].values,
                              train_df[f"{label_col}_pred"].values)
        logger.info(f"{label_col} score: {score}")
        save_importance(model.importance_df, label_col, config.store)
        train_df.to_csv(
            os.path.join(config.store.result_path,
                         f"{config.store.model_name}_adv_val.csv"),
            index=False,
        )