Example #1
0
def run():
    """Train and validate a default-risk classifier on application data.

    Pipeline: load train/test application tables -> join preprocessed
    bureau (and optionally previous-application) aggregates -> optionally
    impute selected columns -> validate (k-fold CV or hold-out) -> and,
    unless stopped after validation, fit on the full data, dump the model,
    and write train/test probability CSVs.

    Relies on project helpers defined elsewhere: ``Watch`` (a stopwatch
    with ``start``/``stop``/``print_all``), ``load_app_data``,
    ``get_preprocessed_bureau_data`` and ``preprocess_app``.
    """
    # --- Run configuration flags ------------------------------------
    cross_validation = True        # True: k-fold CV; False: hold-out split
    perform_imputation = False     # fill missing values via SimpleImputer
    stop_after_validation = True   # skip final training / submission output
    rand_seed = 1                  # shared seed for shuffle/split/classifier

    print("Reading data")
    read_watch = Watch("Reading data")
    read_watch.start()
    df_app_train, df_app_test = load_app_data()
    read_watch.stop()
    print("Finish reading data")

    # Columns imputed with the column mean vs. the most frequent value.
    missing_fill_mean = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]
    missing_fill_most_freq = [
        "CNT_FAM_MEMBERS", "AMT_ANNUITY", "DAYS_LAST_PHONE_CHANGE"
    ]

    # Fit on train, reuse (transform only) on test to avoid leakage.
    mean_imputer = SimpleImputer(strategy="mean")
    most_freq_imputer = SimpleImputer(strategy="most_frequent")

    preprocess_watch = Watch("Preprocess")
    print("Preprocess training data")
    preprocess_watch.start()

    # Auxiliary aggregates joined into the application features.
    # Previous-application aggregation is currently disabled (see the
    # commented-out call below); preprocess_app receives None for it.
    df_bureau_agg = None
    df_prev_app_agg = None
    df_bureau_agg = get_preprocessed_bureau_data()
    print("Finish preprocessing bureau data")
    # df_prev_app_agg = get_preprocessed_previous_app_data(False, False)
    # print("Finish preprocessing previous application data")

    # Shuffle rows so CV folds / hold-out splits are reproducibly random.
    df_app_train = shuffle(df_app_train, random_state=rand_seed)
    X_train = preprocess_app(df_app_train, df_bureau_agg, df_prev_app_agg)
    if perform_imputation:
        # NOTE(review): these pd.DataFrame(...) wrappers pass index= but no
        # columns=, so the built frame has integer column labels while the
        # assignment target uses named columns — confirm the pandas version
        # in use aligns this positionally as intended.
        X_train[missing_fill_mean] = pd.DataFrame(mean_imputer.fit_transform(
            df_app_train[missing_fill_mean]),
                                                  index=df_app_train.index)
        X_train[missing_fill_most_freq] = pd.DataFrame(
            most_freq_imputer.fit_transform(
                df_app_train[missing_fill_most_freq]),
            index=df_app_train.index)
    else:
        # No imputation: copy the raw (possibly NaN-containing) columns.
        X_train[missing_fill_mean] = df_app_train[missing_fill_mean]
        X_train[missing_fill_most_freq] = df_app_train[missing_fill_most_freq]
    y_train = df_app_train["TARGET"]

    print("Preprocess test data")
    X_test = preprocess_app(df_app_test, df_bureau_agg, df_prev_app_agg)
    if perform_imputation:
        # transform (not fit_transform): use statistics learned on train.
        X_test[missing_fill_mean] = pd.DataFrame(mean_imputer.transform(
            df_app_test[missing_fill_mean]),
                                                 index=df_app_test.index)
        X_test[missing_fill_most_freq] = pd.DataFrame(
            most_freq_imputer.transform(df_app_test[missing_fill_most_freq]),
            index=df_app_test.index)
    else:
        X_test[missing_fill_mean] = df_app_test[missing_fill_mean]
        X_test[missing_fill_most_freq] = df_app_test[missing_fill_most_freq]

    # Align test features to the training feature set: add train-only
    # columns as 0, drop test-only columns, then match column order.
    if not X_test.columns.equals(X_train.columns):
        X_test[X_train.columns.difference(X_test.columns)] = 0
        X_test.drop(X_test.columns.difference(X_train.columns),
                    axis=1,
                    inplace=True)
        X_test = X_test.reindex(columns=X_train.columns, axis=1)
    assert X_train.columns.equals(X_test.columns)

    preprocess_watch.stop()

    print("Training data shape:", X_train.shape)
    X_train.info(verbose=5)

    print("Initializing classifier")
    # Only used by the commented-out class-weighted classifiers below.
    weight_dict = {0: 1, 1: 1}
    # GPU-accelerated gradient boosting; seed matches the shuffle/split.
    clf = XGBClassifier(max_depth=10,
                        min_child_weight=10,
                        seed=rand_seed,
                        tree_method="gpu_hist")
    # clf = XGBClassifier(max_depth=8, min_child_weight=12, seed=1)
    # clf = GradientBoostingClassifier(max_depth=10, min_samples_split=15, verbose=5)
    # clf = DecisionTreeClassifier(class_weight=weight_dict, max_depth=15, min_samples_split=4)
    # clf = LogisticRegression(class_weight=weight_dict)

    # clf = LGBMClassifier(
    #     n_jobs=8,
    #     n_estimators=10000,
    #     learning_rate=0.02,
    #     num_leaves=34,
    #     colsample_bytree=0.9497036,
    #     subsample=0.8715623,
    #     max_depth=8,
    #     reg_alpha=0.041545473,
    #     reg_lambda=0.0735294,
    #     min_split_gain=0.0222415,
    #     min_child_weight=39.3259775,
    #     silent=-1,
    #     verbose=-1)

    print("Choosing classifier parameters")
    # model_selection_watch = Watch("Model selection")
    # params = {"max_depth": [5, 8, 10], "min_child_weight": [10, 12]}
    # model_selection_watch.start()
    # grid_clf = GridSearchCV(clf, param_grid=params, scoring="roc_auc", cv=5, verbose=5).fit(X_train, y_train)
    # model_selection_watch.stop()
    # print(grid_clf.best_score_)
    # print(grid_clf.best_params_)
    # print(grid_clf.cv_results_)
    # clf = grid_clf.best_estimator_
    w = Watch("Validation")
    w.start()
    if cross_validation:
        k_fold = 5
        print("Perform {:d}-fold cross validation".format(k_fold))
        # Mean ROC-AUC across the k folds.
        score_val = sum(
            cross_val_score(clf,
                            X_train,
                            y_train,
                            cv=k_fold,
                            scoring="roc_auc",
                            verbose=5,
                            n_jobs=2)) / k_fold
    else:
        test_size = 0.1
        print("Perform hold-out validation (Test size: {:.0%})".format(
            test_size))
        # NOTE(review): X_train/y_train are reassigned to the reduced split
        # here, so the final fit further below (when stop_after_validation
        # is False) trains on 90% of the data — confirm this is intended.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=test_size, random_state=rand_seed)
        print(X_train[:10])
        clf.fit(X_train, y_train)
        # mean_imputer.transform(X_val[missing_fill_mean])
        # most_freq_imputer.transform(X_val[missing_fill_most_freq])
        prob_val = clf.predict_proba(X_val)[:, 1]
        score_val = roc_auc_score(y_val, prob_val)
    w.stop()
    print("Validation AUC: %.6f" % score_val)
    # print(clf.feature_importances_)

    # Early exit: report timings only, skip final training and outputs.
    if stop_after_validation:
        Watch.print_all()
        return

    print("Training classifier")
    train_watch = Watch("Training")
    train_watch.start()
    clf.fit(X_train, y_train)
    train_watch.stop()

    # Persist the fitted model for later reuse.
    print("Dumping trained classifier")
    from joblib import dump
    dump(clf, 'boost_tree_gpu_0.joblib')

    # Write positive-class probabilities for train (diagnostics) and
    # test (competition submission format: TARGET column).
    print("Classify test set")
    train_prob_df = pd.DataFrame(clf.predict_proba(X_train)[:, 1],
                                 index=X_train.index,
                                 columns=["PRED_PROB"])
    train_prob_df.to_csv("train_prob.csv")
    test_prob_df = pd.DataFrame(clf.predict_proba(X_test)[:, 1],
                                index=X_test.index,
                                columns=["TARGET"])
    test_prob_df.to_csv("submission.csv")

    Watch.print_all()
Example #2
0
def clean_inst_pay():
    """Clean and condense the installments-payments table, then save a CSV.

    Two-stage aggregation:
      1. Rows with identical (SK_ID_PREV, installment number, payment date,
         payment amount) represent one payment covering multiple
         installment versions — collapse them, summing AMT_INSTALMENT
         where two versions exist.
      2. Multiple payments toward one (SK_ID_PREV, installment number,
         version) are summed into a single record with first/last payment
         dates, plus derived overdue / 30-day-past-due / unpaid amounts.

    Relies on project helpers defined elsewhere: ``load_install_payments``,
    ``print_memory_usage`` and ``Watch`` (a stopwatch utility).
    Writes the result to ``data\\installments_payments_processed.csv``.
    """
    df_inst_pay = load_install_payments(False)
    print_memory_usage(df_inst_pay, "installment_payments")

    # Sentinel fills for missing values: 0 for payment date, -1 for payment
    # amount.  Both sentinels are converted back later (-1 -> -inf -> NaN,
    # 0 -> NaN) once the groupby aggregations are done.
    df_inst_pay.DAYS_ENTRY_PAYMENT.fillna(0, inplace=True)
    df_inst_pay.AMT_PAYMENT.fillna(-1, inplace=True)
    # Keep rows where either the payment or the installment is positive.
    df_inst_pay_valid_filter = (df_inst_pay["AMT_PAYMENT"] >
                                0) | (df_inst_pay["AMT_INSTALMENT"] > 0)
    print("Remove {:d} invalid records.".format(
        (~df_inst_pay_valid_filter).sum()))
    # Group key: identical payment events (same loan, installment number,
    # date, amount) that were recorded once per installment version.
    df_inst_pay_group = df_inst_pay[df_inst_pay_valid_filter].groupby([
        "SK_ID_PREV", "NUM_INSTALMENT_NUMBER", "DAYS_ENTRY_PAYMENT",
        "AMT_PAYMENT"
    ])
    del df_inst_pay_valid_filter

    w = Watch("Aggregation 1")
    print("Aggregate multiple installments for one payment")
    w.start()
    df_inst_pay_group_cnt = df_inst_pay_group.size()
    df_inst_agg = df_inst_pay_group.agg({
        "SK_ID_CURR": ["min", "max"],
        "NUM_INSTALMENT_VERSION": ["max", "nunique"],
        "DAYS_INSTALMENT": ["min", "max"],
        "AMT_INSTALMENT": ["min", "max", "sum"]
    })
    # Flatten the (column, stat) MultiIndex into e.g. "SK_ID_CURR_min".
    df_inst_agg.columns = ['_'.join(col) for col in df_inst_agg.columns]
    del df_inst_pay_group
    w.stop()

    print_memory_usage(df_inst_agg, "installment_pay_aggregation_1")

    print("Processing 1")
    # Every row of a group must belong to the same client.
    assert (
        df_inst_agg["SK_ID_CURR_min"] == df_inst_agg["SK_ID_CURR_max"]).all(
            axis=None), "Inconsistent SK_ID_CURR"
    df_inst_pay_processed = pd.DataFrame(index=df_inst_agg.index)
    df_inst_pay_processed["SK_ID_CURR"] = df_inst_agg["SK_ID_CURR_min"]

    # Invariant: a group is either a single installment version, or exactly
    # two rows (two versions covered by one payment).
    df_inst_pay_group_cnt_distict = df_inst_agg[
        "NUM_INSTALMENT_VERSION_nunique"]
    df_inst_pay_group_check = ((df_inst_pay_group_cnt == 2) |
                               (df_inst_pay_group_cnt_distict == 1))
    assert df_inst_pay_group_check.all(axis=None)
    del df_inst_pay_group_cnt, df_inst_pay_group_check
    df_inst_pay_processed["NUM_INSTALMENT_VERSION"] = df_inst_agg[
        "NUM_INSTALMENT_VERSION_max"]

    # Due date must be identical within a group.
    assert (df_inst_agg["DAYS_INSTALMENT_min"] ==
            df_inst_agg["DAYS_INSTALMENT_max"]).all(axis=None)
    df_inst_pay_processed["DAYS_INSTALMENT"] = df_inst_agg[
        "DAYS_INSTALMENT_min"]

    # Where two distinct versions exist, the payment covers the SUM of the
    # installment amounts; otherwise min == max and min is used.
    df_agg_filter = (df_inst_pay_group_cnt_distict == 2)
    assert (df_agg_filter |
            (df_inst_agg["AMT_INSTALMENT_min"]
             == df_inst_agg["AMT_INSTALMENT_max"])).all(axis=None)
    df_inst_pay_processed["AMT_INSTALMENT"] = df_inst_agg["AMT_INSTALMENT_min"]
    df_inst_pay_processed.loc[
        df_agg_filter, "AMT_INSTALMENT"] = df_inst_agg["AMT_INSTALMENT_sum"]
    print("%d payments aggregated" % df_agg_filter.sum())
    del df_inst_pay_group_cnt_distict, df_agg_filter

    # Move the group keys back into columns for the second grouping.
    df_inst_pay_processed.reset_index(inplace=True)
    # df_inst_pay_processed["DAYS_ENTRY_PAYMENT"].astype(np.float16, copy=False)
    # Downcast to save memory before the next (larger) aggregation.
    df_inst_pay_processed["DAYS_ENTRY_PAYMENT"] = df_inst_pay_processed[
        "DAYS_ENTRY_PAYMENT"].astype(np.float16, copy=False)
    df_inst_pay_processed["AMT_PAYMENT"] = df_inst_pay_processed[
        "AMT_PAYMENT"].astype(np.float32, copy=False)
    # -1 sentinel -> -inf, so any later sum containing an unknown payment
    # becomes negative and can be detected after aggregation 2.
    df_inst_pay_processed["AMT_PAYMENT"].replace(-1, -np.inf, inplace=True)
    # An unknown payment amount must coincide with an unknown payment date.
    assert ((df_inst_pay_processed["AMT_PAYMENT"] >= 0) |
            (df_inst_pay_processed["DAYS_ENTRY_PAYMENT"] == 0)).all(axis=None)
    # Days between actual payment and due date (<= 0 means paid on time).
    df_diff_entry_offset = df_inst_pay_processed[
        "DAYS_ENTRY_PAYMENT"] - df_inst_pay_processed["DAYS_INSTALMENT"]
    # Payment amount counted only if paid by the due date / within 30 days.
    df_inst_pay_processed["AMT_DUE_PAYMENT"] = (
        np.fmax(df_inst_pay_processed["AMT_PAYMENT"], 0) *
        (df_diff_entry_offset <= 0))
    df_inst_pay_processed["AMT_DUE30_PAYMENT"] = (
        np.fmax(df_inst_pay_processed["AMT_PAYMENT"], 0) *
        (df_diff_entry_offset <= 30))
    print_memory_usage(df_inst_pay_processed, "inst_pay_processed_1")
    # print(df_inst_pay_processed.query("(SK_ID_PREV == 1001758) & (NUM_INSTALMENT_NUMBER == 24)").transpose())

    # Second grouping: collapse multiple payments for one installment.
    df_inst_pay_group = df_inst_pay_processed.groupby(
        ["SK_ID_PREV", "NUM_INSTALMENT_NUMBER", "NUM_INSTALMENT_VERSION"])
    del df_diff_entry_offset, df_inst_pay_processed, df_inst_agg

    w = Watch("Aggregation 2")
    print("Aggregate multiple payments for one installment")
    w.start()
    df_inst_pay_group_cnt = df_inst_pay_group.size()
    # NOTE(review): dict-style GroupBy.agg with a forwarded skipna=False
    # kwarg — verify the pandas version in use accepts this; newer versions
    # may reject extra kwargs for string-named aggregations.
    df_inst_agg = df_inst_pay_group.agg(
        {
            "SK_ID_CURR": ["min", "max"],
            # "NUM_INSTALMENT_VERSION": ["min", "max"],
            "DAYS_INSTALMENT": ["min", "max"],
            "DAYS_ENTRY_PAYMENT": ["min", "max"],
            "AMT_INSTALMENT": ["min", "max", "sum"],
            "AMT_PAYMENT": ["sum"],
            "AMT_DUE_PAYMENT": ["sum"],
            "AMT_DUE30_PAYMENT": ["sum"]
        },
        skipna=False)
    df_inst_agg.columns = ['_'.join(col) for col in df_inst_agg.columns]
    del df_inst_pay_group
    w.stop()
    print("Finish aggregations")

    gc.collect()
    print_memory_usage(df_inst_agg, "installment_pay_aggregation_2")

    print("Processing 2")
    w = Watch("Processing 2")
    w.start()
    assert (df_inst_agg["SK_ID_CURR_min"] == df_inst_agg["SK_ID_CURR_max"]
            ).all(), "Inconsistent SK_ID_CURR"
    df_inst_pay_processed = pd.DataFrame(index=df_inst_agg.index)
    df_inst_pay_processed["SK_ID_CURR"] = df_inst_agg["SK_ID_CURR_min"]

    # df_inst_agg_INST_VER = df_inst_agg["NUM_INSTALMENT_VERSION"]
    # assert (df_inst_agg_INST_VER["min"] == df_inst_agg_INST_VER["max"]).all(axis=None), "Inconsistent NUM_INSTALMENT_VERSION"
    # df_inst_pay_processed["NUM_INSTALMENT_VERSION"] = df_inst_agg_INST_VER["min"]

    assert (df_inst_agg["DAYS_INSTALMENT_min"] ==
            df_inst_agg["DAYS_INSTALMENT_max"]).all(
                axis=None), "Inconsistent DAYS_INSTALMENT"
    df_inst_pay_processed["DAYS_INSTALMENT"] = df_inst_agg[
        "DAYS_INSTALMENT_min"]

    # 0 was the missing-date sentinel; restore it to NaN in the output.
    df_inst_pay_processed["DAYS_FIRST_PAYMENT"] = df_inst_agg[
        "DAYS_ENTRY_PAYMENT_min"].replace(0, np.nan)
    df_inst_pay_processed["DAYS_LAST_PAYMENT"] = df_inst_agg[
        "DAYS_ENTRY_PAYMENT_max"].replace(0, np.nan)

    assert (df_inst_agg["AMT_INSTALMENT_min"] ==
            df_inst_agg["AMT_INSTALMENT_max"]).all(axis=None)
    df_inst_pay_processed["AMT_INSTALMENT"] = df_inst_agg["AMT_INSTALMENT_min"]

    # Fix missing installment info
    # df_prev_app_ann = pd.read_csv(r"data\previous_application.csv", index_col=0, usecols=[0, 3])
    # df_inst_agg = df_inst_agg.join(df_prev_app_ann, how="left")
    #
    # df_annuity_check = ((df_inst_agg.index.get_level_values(2) != 1) | df_inst_agg["AMT_ANNUITY"].isna() |
    #                     (df_inst_agg["AMT_INSTALMENT_min"] == 0) |
    #                     ((df_inst_agg["AMT_ANNUITY"] - df_inst_agg["AMT_INSTALMENT_min"]).abs() < 0.01))
    # assert df_annuity_check.all(axis=None)
    # inst_fix_filter = ((df_inst_agg["NUM_INSTALMENT_VERSION"] == 1) & (df_inst_agg["AMT_INSTALMENT_min"] == 0))
    # df_inst_pay_processed.loc[inst_fix_filter, "AMT_INSTALMENT"] = df_inst_agg.loc[inst_fix_filter, "AMT_ANNUITY"]
    # del df_annuity_check, inst_fix_filter

    # inst_fix_filter = (df_inst_agg["AMT_INSTALMENT_min"] == 0)
    # df_inst_pay_processed.loc[inst_fix_filter, "AMT_INSTALMENT"] = df_inst_agg.loc[inst_fix_filter, "AMT_PAYMENT_sum"]
    # del inst_fix_filter

    # A negative sum means an -inf sentinel contaminated the group (unknown
    # payment amount); such groups must be single-row, and the amount
    # becomes NaN in the output.
    df_inst_pay_invalid_filter = (df_inst_agg["AMT_PAYMENT_sum"] < 0)
    assert ((~df_inst_pay_invalid_filter) |
            (df_inst_pay_group_cnt == 1)).all(axis=None)
    df_inst_pay_processed["AMT_PAYMENT"] = df_inst_agg["AMT_PAYMENT_sum"]
    df_inst_pay_processed.loc[df_inst_pay_invalid_filter,
                              "AMT_PAYMENT"] = np.nan
    assert (df_inst_pay_processed["AMT_PAYMENT"] != 0).all(axis=None)

    # Payment count per installment; 0 when the payment amount is unknown.
    df_inst_pay_invalid_filter = df_inst_pay_processed["AMT_PAYMENT"].isnull()
    df_inst_pay_processed["NUM_PAYMENTS"] = df_inst_pay_group_cnt.astype(
        np.uint16)
    df_inst_pay_processed.loc[df_inst_pay_invalid_filter,
                              "NUM_PAYMENTS"] = np.uint16(0)
    print("%d installments aggregated" % (df_inst_pay_group_cnt > 1).sum())
    del df_inst_pay_group_cnt, df_inst_pay_invalid_filter

    # Amount still owed at the due date; the `*= (x >= 0.01)` pattern
    # zeroes out tiny positive residues from float rounding.
    df_inst_pay_processed["AMT_OVERDUE"] = np.fmax(
        df_inst_pay_processed["AMT_INSTALMENT"] -
        df_inst_agg["AMT_DUE_PAYMENT_sum"], 0)
    df_inst_pay_processed["AMT_OVERDUE"] *= (
        df_inst_pay_processed["AMT_OVERDUE"] >= 0.01)
    # Amount still owed 30 days past the due date.
    df_inst_pay_processed["AMT_DPD30"] = np.fmax(
        df_inst_pay_processed["AMT_INSTALMENT"] -
        df_inst_agg["AMT_DUE30_PAYMENT_sum"], 0)
    df_inst_pay_processed["AMT_DPD30"] *= (df_inst_pay_processed["AMT_DPD30"]
                                           >= 0.01)
    # Amount never paid at all (unknown payments treated as 0).
    df_inst_pay_processed["AMT_UNPAID"] = np.fmax(
        df_inst_pay_processed["AMT_INSTALMENT"] -
        df_inst_pay_processed["AMT_PAYMENT"].fillna(0), 0)
    df_inst_pay_processed["AMT_UNPAID"] *= (df_inst_pay_processed["AMT_UNPAID"]
                                            >= 0.01)
    df_inst_pay_processed.reset_index(inplace=True)
    # df_inst_pay_processed.rename(columns={"NUM_INSTALMENT_NUMBER": "NUM_INSTALMENT_NUMBER",
    #                                       "NUM_INSTALMENT_VERSION": "INSTALMENT_VER"})
    del df_inst_agg
    w.stop()
    print("Finish processing")

    print_memory_usage(df_inst_pay_processed, "inst_pay_processed_2")
    gc.collect()

    # Explicit column order for the output file.
    columns_to_write = [
        "SK_ID_PREV", "SK_ID_CURR", "NUM_INSTALMENT_VERSION",
        "NUM_INSTALMENT_NUMBER", "DAYS_INSTALMENT", "DAYS_FIRST_PAYMENT",
        "DAYS_LAST_PAYMENT", "NUM_PAYMENTS", "AMT_INSTALMENT", "AMT_PAYMENT",
        "AMT_OVERDUE", "AMT_DPD30", "AMT_UNPAID"
    ]

    w = Watch("Save file")
    w.start()
    df_inst_pay_processed.to_csv(r"data\installments_payments_processed.csv",
                                 index=False,
                                 columns=columns_to_write)
    w.stop()
    Watch.print_all()