def run(args):
    """Train the selected model, predict on the test set, and write a submission file.

    Args:
        args: Namespace with attributes MODEL ("lgbm" | "cat" | "xgb"),
            YEAR_MONTH (target month for feature engineering), and
            FILE_NAME (suffix for the output CSV).

    Raises:
        ValueError: If args.MODEL is not one of the supported models.
    """
    # Load the training data; parse order_date as datetime.
    data = pd.read_csv(data_dir + "/train.csv", parse_dates=["order_date"])

    # Run feature engineering for the target year-month.
    train, test, y, features = feature_engineering(data, args.YEAR_MONTH)

    # Train with cross-validation (out-of-fold) and predict on the test set.
    if args.MODEL == "lgbm":
        y_oof, test_preds = lgbm(train, y, test, features, lgbm_params(), WANDB_USE=True)
    elif args.MODEL == "cat":
        y_oof, test_preds = cat(train, y, test, features, cat_params(), WANDB_USE=True)
    elif args.MODEL == "xgb":
        y_oof, test_preds = xgboost(train, y, test, features, xgb_params(), WANDB_USE=True)
    else:
        # Fail fast: otherwise test_preds would be undefined below (NameError).
        raise ValueError(
            f"Unknown MODEL {args.MODEL!r}; expected 'lgbm', 'cat', or 'xgb'"
        )

    # Read the sample submission and fill in the predicted probabilities.
    sub = pd.read_csv(data_dir + "/sample_submission.csv")
    sub["probability"] = test_preds

    # Write the submission file.
    os.makedirs(output_dir, exist_ok=True)
    sub.to_csv(
        os.path.join(output_dir, f"{args.MODEL}_{args.FILE_NAME}.csv"), index=False
    )
# ---- Beispiel #2 (0) ----
def main():
    """Blend base-model predictions with engineered features and write a submission.

    Loads the raw train/test CSVs, reads each base model's pickled validation
    (.pd) and test (.pkl) predictions, joins them onto the engineered feature
    frame, and writes a dated submission CSV.

    Expects sys.argv[1]: number of best features passed to fit_and_predict.
    """
    df_train = pd.read_csv("data/training-set.csv")
    # Drop the target column from the test frame; it is predicted below.
    df_test = pd.read_csv("data/testing-set.csv").drop("Next_Premium", axis=1)
    df_features = feature_engineering(df_train, df_test)
    # print("\nFeatures:")
    # print(df_features.sample(10))

    # Base-model names: MODEL_PATH matches ending in ".pd", extension removed.
    model_files = [
        ".".join(x.split(".")[:-1]) for x in glob.glob(MODEL_PATH)
        if x.endswith(".pd")
    ]
    print(model_files)
    val_tmp, test_tmp = [], []
    print("Validation")
    for filename in model_files:
        # First column of the pickled validation frame, clipped to [0, 2e8].
        val_tmp.append(
            np.clip(pd.read_pickle(filename + ".pd").values[:, 0], 0, 2e8))
        # NOTE(review): these stats run over ALL predictions accumulated so
        # far, not just the current model's column — confirm that is intended.
        print("%.2f %.2f %.2f %.2f %.2f" %
              (np.min(val_tmp), np.percentile(val_tmp, 25), np.median(val_tmp),
               np.percentile(val_tmp, 75), np.max(val_tmp)))
    # One column per base model, named after its model file.
    df_val_ens = pd.DataFrame(np.stack(val_tmp, axis=1), columns=model_files)
    # print(df_val_ens.head())

    print("=" * 20)
    print("Test")
    for filename in model_files:
        # Test-set predictions are stored with joblib under ".pkl".
        test_tmp.append(np.clip(joblib.load(filename + ".pkl"), 0, 2e8))
        # NOTE(review): same cumulative-stats caveat as the loop above.
        print("%.2f %.2f %.2f %.2f %.2f" %
              (np.min(test_tmp), np.percentile(test_tmp, 25),
               np.median(test_tmp), np.percentile(test_tmp,
                                                  75), np.max(test_tmp)))
    df_test_ens = pd.DataFrame(np.stack(test_tmp, axis=1), columns=model_files)
    print("=" * 20)
    # print(df_test_ens.head())

    # Join engineered features, then align the ensemble frames to the same
    # Policy_Number index so the concat below matches row-for-row.
    df_train = df_train.set_index("Policy_Number").join(df_features)
    df_test = df_test.set_index("Policy_Number").join(df_features)
    df_val_ens.set_index(df_train.index, inplace=True)
    df_test_ens.set_index(df_test.index, inplace=True)
    df_train = pd.concat([df_train, df_val_ens], axis=1, ignore_index=False)
    df_test = pd.concat([df_test, df_test_ens], axis=1, ignore_index=False)
    print(df_train.head())
    # df_train["ratio_pred_nom"] = np.clip(
    #     df_train["ratio_pred"], 0, 2) * df_train["total_premium"]
    # df_test["ratio_pred_nom"] = np.clip(
    #     df_test["ratio_pred"], 0, 2) * df_test["total_premium"]
    # Drop the leftover "index" column (presumably produced by
    # feature_engineering — confirm against its implementation).
    del df_train["index"]
    del df_test["index"]

    df_test["Next_Premium"], loss = fit_and_predict(df_train,
                                                    df_test,
                                                    n_best_features=int(
                                                        sys.argv[1]),
                                                    ens_features=model_files)
    # Clamp predictions to [0, 5e5] before writing the submission.
    df_test["Next_Premium"] = np.clip(df_test["Next_Premium"], 0, 5e5)
    # Filename encodes today's date (MMDD) and the reported loss x100.
    df_test[["Next_Premium"]].to_csv("sub_ens_{}_{:.0f}.csv".format(
        date.today().strftime("%m%d"), loss * 100),
                                     float_format="%.2f")
# ---- Beispiel #3 (0) ----
def main():
    """Stack base-model predictions, standardize them, and fit a second-level model.

    Unlike the other ensemble entry point, this keeps Next_Premium in the
    test frame, runs preprocess_features, scales the per-model prediction
    columns, and feeds only those columns (plus the target) to fit_and_predict.
    """
    df_train = pd.read_csv("data/training-set.csv")
    df_test = pd.read_csv("data/testing-set.csv")
    df_features = feature_engineering(df_train, df_test)

    # Base-model names: MODEL_PATH matches ending in ".pd", extension removed.
    model_files = [
        ".".join(x.split(".")[:-1]) for x in glob.glob(MODEL_PATH)
        if x.endswith(".pd")
    ]

    val_tmp, test_tmp = [], []
    print("Validation")
    for filename in model_files:
        # First column of the pickled validation frame, clipped to [0, 2e8].
        val_tmp.append(
            np.clip(pd.read_pickle(filename + ".pd").values[:, 0], 0, 2e8))
        # NOTE(review): these stats run over ALL predictions accumulated so
        # far, not just the current model's column — confirm that is intended.
        print("%.2f %.2f %.2f %.2f %.2f" %
              (np.min(val_tmp), np.percentile(val_tmp, 25), np.median(val_tmp),
               np.percentile(val_tmp, 75), np.max(val_tmp)))
    # One column per base model, named after its model file.
    df_val_ens = pd.DataFrame(np.stack(val_tmp, axis=1), columns=model_files)
    # print(df_val_ens.head())

    print("=" * 20)
    print("Test")
    for filename in model_files:
        # Test-set predictions are stored with joblib under ".pkl".
        test_tmp.append(np.clip(joblib.load(filename + ".pkl"), 0, 2e8))
        # NOTE(review): same cumulative-stats caveat as the loop above.
        print("%.2f %.2f %.2f %.2f %.2f" %
              (np.min(test_tmp), np.percentile(test_tmp, 25),
               np.median(test_tmp), np.percentile(test_tmp,
                                                  75), np.max(test_tmp)))
    df_test_ens = pd.DataFrame(np.stack(test_tmp, axis=1), columns=model_files)
    print("=" * 20)
    # print(df_test_ens.head())

    # Join engineered features and preprocess, then align the ensemble
    # frames to the same Policy_Number index before concatenating.
    df_train = df_train.set_index("Policy_Number").join(df_features)
    df_test = df_test.set_index("Policy_Number").join(df_features)
    df_train, df_test = preprocess_features(df_train, df_test)

    df_val_ens.set_index(df_train.index, inplace=True)
    df_test_ens.set_index(df_test.index, inplace=True)
    df_train = pd.concat([df_train, df_val_ens], axis=1)
    df_test = pd.concat([df_test, df_test_ens], axis=1)

    # Standardize the per-model prediction columns; the scaler is fit on the
    # union of test and validation predictions.
    scaler = StandardScaler(copy=True)
    columns = model_files
    scaler.fit(
        pd.concat([df_test_ens[columns], df_val_ens[columns]], axis=0).values)
    df_train[columns] = scaler.transform(df_train[columns].values)
    df_test[columns] = scaler.transform(df_test[columns].values)
    print("train:\n", df_train[columns].describe())
    print("test:\n", df_test[columns].describe())

    # The second-level model sees only the scaled predictions plus the target.
    df_test["Next_Premium"], loss = fit_and_predict(
        df_train[columns + ["Next_Premium"]],
        df_test[columns + ["Next_Premium"]])
    # Clamp predictions to [0, 5e5] before writing the submission.
    df_test["Next_Premium"] = np.clip(df_test["Next_Premium"], 0, 5e5)
    # Filename encodes today's date (MMDD) and the reported loss x10000.
    df_test[["Next_Premium"]].to_csv("sub_ens_dnn_{}_{:.0f}.csv".format(
        date.today().strftime("%m%d"), loss * 10000),
                                     float_format="%.2f")
# ---- Beispiel #4 (0) ----
def main():
    """Join engineered features onto train/test and run model fitting.

    Expects sys.argv[1]: number of best features passed to fit_and_predict.
    """
    train_df = pd.read_csv("data/training-set.csv")
    # The test target column is removed; it is what the model predicts.
    test_df = pd.read_csv("data/testing-set.csv").drop("Next_Premium", axis=1)
    features = feature_engineering(train_df, test_df)

    # Attach the engineered features, keyed on policy number.
    train_df = train_df.set_index("Policy_Number").join(features)
    test_df = test_df.set_index("Policy_Number").join(features)
    # Remove the leftover "index" column (presumably introduced by
    # feature_engineering — confirm against its implementation).
    for frame in (train_df, test_df):
        del frame["index"]

    n_best = int(sys.argv[1])
    fit_and_predict(train_df, test_df, n_best_features=n_best)
# ---- Beispiel #5 (0) ----
def main():
    """Build the joined feature matrix for train/test, preprocess, and fit."""
    train_df = pd.read_csv("data/training-set.csv")
    test_df = pd.read_csv("data/testing-set.csv")
    features = feature_engineering(train_df, test_df)

    # Attach the engineered features, keyed on policy number.
    train_df = train_df.set_index("Policy_Number").join(features)
    test_df = test_df.set_index("Policy_Number").join(features)
    del features  # release the standalone feature frame early
    # Remove the leftover "index" column (presumably introduced by
    # feature_engineering — confirm against its implementation).
    del train_df["index"]
    del test_df["index"]
    # del train_df["overlap_test"]
    # del test_df["overlap_test"]

    train_df, test_df = preprocess_features(train_df, test_df)
    fit_and_predict(train_df, test_df)