def main():
    """End-to-end training entry point for the accuracy_group model.

    Loads cached train/test frames, builds features via ``preprocess_dfs``,
    trains an LGBM regressor with grouped k-fold CV, rounds the regression
    output to the four accuracy_group classes with ``OptimizedRounder``,
    logs the resulting QWK, and persists OOF predictions plus feature
    importances.

    Relies on module-level globals not visible in this chunk (``pd``,
    feature classes, ``preprocess_dfs``, ``Validation``, ``OptimizedRounder``,
    ``qwk``, ``logger``, ``exp_name``, ``is_debug``, ``EXP_ID``).
    """
    train_df = pd.read_pickle('./mnt/inputs/origin/train.pkl.gz')
    test_df = pd.read_csv('./mnt/inputs/origin/test.csv')

    # ==============================
    # start processing
    # ==============================
    # Feature registry: name -> [feature class, is_overwrite flag].
    # Commented entries are deliberately disabled experiments.
    use_feature = {
        "EventCount": [EventCount, False],  # class, is_overwrite
        "EventCount2": [EventCount2, False],  # class, is_overwrite
        "Worldcount": [Worldcount, False],
        "SessionTime": [SessionTime2, False],
        #     "AssessEventCount": [AssessEventCount, False],
        "EncodingTitles": [EncodingTitles, False],
        # "encodingTitleOrder": [encodingTitleOrder, False],
        #     "PrevAssessResult":[PrevAssessResult, True],
        #     "PrevAssessAcc": [PrevAssessAcc, True],
        "PrevAssessAccByTitle": [PrevAssessAccByTitle, False],
        "dtFeatures": [dtFeatures, False],
        # "eventCodeRatioFeatures": [eventCodeRatioFeatures, False],
        # "eventIDRatioFeatures": [eventIDRatioFeatures, False],
        "immediatelyBeforeFeatures": [immediatelyBeforeFeatures, False],
        # "worldLabelEncodingDiffFeatures": [worldLabelEncodingDiffFeatures, False],
        "worldNumeriacalFeatures": [worldNumeriacalFeatures, False],
    }

    is_local = False

    if is_local:
        base_path = "../input"  # at local
        train_df, test_df = preprocess_dfs(use_feature,
                                           is_local=is_local,
                                           logger=None,
                                           debug=False)

    else:
        base_path = './mnt/inputs/origin'  # at kaggle kernel
        sub = pd.read_csv(f'{base_path}/sample_submission.csv')
        #        base_path = '/kaggle/input/data-science-bowl-2019'  # at kaggle kernel
        #        if len(sub) == 1000:
        # Dead branch kept from the kernel-submission shortcut (originally
        # gated on len(sub) == 1000).
        if False:
            sub.to_csv('submission.csv', index=False)
            # NOTE(review): prefer sys.exit(0); the builtin exit() comes from
            # the site module and may be absent under `python -S`.
            exit(0)
        else:
            train_df, test_df = preprocess_dfs(use_feature,
                                               is_local=is_local,
                                               logger=None,
                                               debug=is_debug)

    # remove , to avoid error of lgbm
    train_df.columns = [col.replace(',', '_') for col in train_df.columns]
    test_df.columns = [col.replace(',', '_') for col in test_df.columns]

    # Earlier multiclass configuration, kept for reference:
    # train_params = {
    #     'learning_rate': 0.01,
    #     'bagging_fraction': 0.90,
    #     'feature_fraction': 0.85,
    #     'max_depth': 5,
    #     'lambda_l1': 0.7,
    #     'lambda_l2': 0.7,
    #     'metric': 'multiclass',
    #     'objective': 'multiclass',
    #     'num_classes': 4,
    #     'random_state': 773,
    #     "n_estimators": 3000

    # }

    # Current setup: RMSE regression, thresholded to classes afterwards.
    train_params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 64,
        # 'num_leaves': 16,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'lambda_l1': 0.2,
        'lambda_l2': 0.4,
        'seed': 19930802,
        'n_estimators': 100000,
        'importance_type': 'gain',
    }

    # Features found to hurt validation; excluded from training.
    bad_feats = [
        'prev_gs_duration', 'session_intervalrmin', 'session_intervalrstd',
        'session_intervalrmax', 'session_interval', 'accum_acc_gr_-99',
        'session_intervalrmean', 'ass_session_interval',
        'prev_gs_durationrmean', 'prev_gs_durationrmax', 'ev_cnt4070',
        'prev_gs_durationrstd', 'mean_g_duration_meaan', 'ev_cnt3010',
        'g_duration_std', 'ev_cnt4030', 'ev_cnt3110', 'g_duration_mean',
        'meaan_g_duration_min', 'ass_session_interval_rmin', 'accum_acc_gr_3',
        'g_duration_min', 'mean_g_duraation_std'
    ]

    # Drop identifiers/targets, train-only columns, and the bad features.
    no_use_cols = [
        "accuracy", "accuracy_group", "game_session", "installation_id",
        "title", "type", "world", "pred_y"
    ] + list(set(train_df.columns) - set(test_df.columns)) + bad_feats

    train_cols = [c for c in list(train_df.columns) if c not in no_use_cols]

    print(f"train_df shape: {train_df.shape}")
    print(train_cols)

    cat_cols = []

    # logger.log(logging.DEBUG, f"categorical cols: {cat_cols}")

    target = "accuracy_group"
    # target = "accuracy"

    model_conf = {
        "predict_type": "regressor",
        "train_params": train_params,
        "train_cols": train_cols,
        "cat_cols": cat_cols,
        "target": target,
        "is_debug": is_debug,
    }

    validation_param = {
        "model_name": "LGBM",
    }

    exp_conf = {
        "train_small_dataset": False,
        "use_feature": {
            "sample": True
        },
        "train_params": train_params,
        "exp_name": exp_name
    }

    v = Validation(validation_param, exp_conf, train_df, test_df, logger)
    # NOTE(review): this unpacks 4 values; confirm the Validation variant in
    # scope returns exactly (clf, oof, prediction, feature_importance_df) —
    # other versions of do_valid_kfold return 6 items.
    clf, oof, prediction, feature_importance_df \
        = v.do_valid_kfold(model_conf)

    optR = OptimizedRounder()
    optR.fit(oof, train_df['accuracy_group'], initial_coef=[1.0, 1.5, 2.9])
    # optR.fit(oof, train_df['accuracy_group'])
    coefficients = optR.coefficients()

    opt_preds = optR.predict(oof, coefficients)

    oof_dir = f'./mnt/oofs/{EXP_ID}'
    # makedirs(..., exist_ok=True) replaces the exists()+mkdir pair: it is
    # race-free and also creates any missing parent directories, where
    # os.mkdir would raise on either condition.
    os.makedirs(oof_dir, exist_ok=True)
    with open(f'{oof_dir}/{EXP_ID}_oof.pkl', 'wb') as fout:
        pickle.dump(oof, fout)

    res_qwk = qwk(train_df['accuracy_group'], opt_preds)
    print(f'res_qwk : {res_qwk}')
    logger.log(logging.DEBUG, f'qwk -- {res_qwk}')

    #     print(f'qwk -- {np.mean(valid_qwks)} +- {np.std(valid_qwks)}')
    #     logger.log(
    #         logging.DEBUG,
    #         f'qwk -- {np.mean(valid_qwks)} +- {np.std(valid_qwks)}')

    # save info
    feature_importance_df.to_csv(f'./mnt/importances/{EXP_ID}.csv',
                                 index=False)
# Exemple #2 (score: 0) — scrape-site snippet separator, commented out so the file parses
    def do_valid_kfold(self, model_conf, n_splits=5):
        """Run grouped k-fold training and return per-fold artifacts.

        Folds are grouped by installation_id (one player never appears in
        both train and valid of the same fold). Each fold trains a model,
        accumulates OOF and averaged test predictions, and fits an
        OptimizedRounder on the fold's OOF to report a per-fold QWK.

        Returns:
            (clf_list, oof, prediction, feature_importance_df,
             optimizers, valid_qwks)
        """
        sp = Splitter()
        target = model_conf["target"]
        # Fixed seed so the fold assignment is reproducible across runs.
        sp.get_kfold_idx(self.train["installation_id"],
                         self.train[target],
                         773,
                         n_cv=n_splits,
                         stratified=False,
                         group=True,
                         pref=self.exp_conf["exp_name"])

        oof: ndarray = np.zeros(self.train.shape[0])
        prediction = np.zeros(self.test.shape[0])

        clf_list, optimizers, valid_qwks = [], [], []

        self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
        self.logger.log(logging.DEBUG, model_conf["train_cols"])
        self.validation_scores = []

        for fold, (train_idx, valid_idx) in enumerate(sp.idx_list):
            self.logger.log(logging.DEBUG, "-" * 60)
            self.logger.log(logging.DEBUG, f"start training: {fold}")

            with timer(f"fold {fold}", self.logger):
                fold_train = self.train.loc[train_idx]
                fold_valid = self.train.loc[valid_idx]
                model = self.generate_model(model_conf)
                clf, fold_oof, feature_importance_df = model.train(
                    fold_train, fold_valid, self.logger)

                fold_prediction = model.predict(self.test, self.logger)

                # Per-fold threshold search: round the regression OOF to
                # classes and score it with QWK.
                rounder = OptimizedRounder()
                rounder.fit(fold_oof, fold_valid[target])
                opt_preds = rounder.predict(fold_oof, rounder.coefficients())
                optimizers.append(rounder)
                valid_qwks.append(qwk(fold_valid[target], opt_preds))

                clf_list.append(clf)
                oof[valid_idx] = fold_oof

                # Equal-weight average of the fold test predictions.
                prediction += fold_prediction / n_splits

                feature_importance_df["fold"] = fold
                self.feature_importance.append(feature_importance_df)

        self.feature_importance = pd.concat(self.feature_importance, axis=0)

        return (clf_list, oof, prediction, self.feature_importance,
                optimizers, valid_qwks)
def main():
    """Train an LGBM regressor on accuracy_group and report the valid QWK.

    Earlier variant of the training script: smaller feature registry, no
    OOF/importance persistence. Relies on module-level globals not visible
    in this chunk (pd, feature classes, preprocess_dfs, Validation,
    OptimizedRounder, qwk, logger, exp_name, is_debug).
    """
    # Raw inputs: train was cached as a pickle, test is still the origin CSV.
    train_df = pd.read_pickle('./mnt/inputs/origin/train.pkl.gz')
    test_df = pd.read_csv('./mnt/inputs/origin/test.csv')

    # ==============================
    # start processing
    # ==============================
    # Feature registry: name -> [feature class, is_overwrite flag].
    # Commented entries are deliberately disabled experiments.
    use_feature = {
        "EventCount": [EventCount, False],  # class, is_overwrite
        "EventCount2": [EventCount2, False],  # class, is_overwrite
        # "EventCount": [EventCount, True],  # class, is_overwrite
        # "EventCount2": [EventCount2, True],  # class, is_overwrite
        "Worldcount": [Worldcount, False],
        "SessionTime": [SessionTime2, False],
        #     "AssessEventCount": [AssessEventCount, False],
        "EncodingTitles": [EncodingTitles, False],
        #     "PrevAssessResult":[PrevAssessResult, True],
        #     "PrevAssessAcc": [PrevAssessAcc, True],
        "PrevAssessAccByTitle": [PrevAssessAccByTitle, False]
    }

    is_local = False

    if is_local:
        base_path = "../input"  # at local
        train_df, test_df = preprocess_dfs(use_feature,
                                           is_local=is_local,
                                           logger=None,
                                           debug=False)

    else:
        base_path = './mnt/inputs/origin'  # at kaggle kernel
        sub = pd.read_csv(f'{base_path}/sample_submission.csv')
        #        base_path = '/kaggle/input/data-science-bowl-2019'  # at kaggle kernel
        #        if len(sub) == 1000:
        # Dead branch kept from the kernel-submission shortcut (originally
        # gated on len(sub) == 1000).
        if False:
            sub.to_csv('submission.csv', index=False)
            exit(0)
        else:
            # NOTE(review): is_debug is not defined in this scope — presumably
            # a module-level flag; verify against the surrounding file.
            train_df, test_df = preprocess_dfs(use_feature,
                                               is_local=is_local,
                                               logger=None,
                                               debug=is_debug)

    # remove , to avoid error of lgbm
    train_df.columns = [col.replace(',', '_') for col in train_df.columns]
    test_df.columns = [col.replace(',', '_') for col in test_df.columns]

    # Earlier multiclass configuration, kept for reference:
    # train_params = {
    #     'learning_rate': 0.01,
    #     'bagging_fraction': 0.90,
    #     'feature_fraction': 0.85,
    #     'max_depth': 5,
    #     'lambda_l1': 0.7,
    #     'lambda_l2': 0.7,
    #     'metric': 'multiclass',
    #     'objective': 'multiclass',
    #     'num_classes': 4,
    #     'random_state': 773,
    #     "n_estimators": 3000

    # }

    # Current setup: RMSE regression, thresholded to classes afterwards.
    train_params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 64,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'lambda_l1': 0.2,
        'lambda_l2': 0.4,
        'seed': 19930802,
        'n_estimators': 100000
    }

    # Features found to hurt validation; excluded from training.
    bad_feats = [
        'prev_gs_duration', 'session_intervalrmin', 'session_intervalrstd',
        'session_intervalrmax', 'session_interval', 'accum_acc_gr_-99',
        'session_intervalrmean', 'ass_session_interval',
        'prev_gs_durationrmean', 'prev_gs_durationrmax', 'ev_cnt4070',
        'prev_gs_durationrstd', 'mean_g_duration_meaan', 'ev_cnt3010',
        'g_duration_std', 'ev_cnt4030', 'ev_cnt3110', 'g_duration_mean',
        'meaan_g_duration_min', 'ass_session_interval_rmin', 'accum_acc_gr_3',
        'g_duration_min', 'mean_g_duraation_std'
    ]

    # Drop identifiers/targets, train-only columns, and the bad features.
    no_use_cols = [
        "accuracy", "accuracy_group", "game_session", "installation_id",
        "title", "type", "world", "pred_y"
    ] + list(set(train_df.columns) - set(test_df.columns)) + bad_feats

    train_cols = [c for c in list(train_df.columns) if c not in no_use_cols]

    print(f"train_df shape: {train_df.shape}")
    print(train_cols)

    cat_cols = []

    # logger.log(logging.DEBUG, f"categorical cols: {cat_cols}")

    target = "accuracy_group"

    model_conf = {
        "predict_type": "regressor",
        "train_params": train_params,
        "train_cols": train_cols,
        "cat_cols": cat_cols,
        "target": target,
        "is_debug": is_debug,
    }

    validation_param = {
        "model_name": "LGBM",
    }

    exp_conf = {
        "train_small_dataset": False,
        "use_feature": {
            "sample": True
        },
        "train_params": train_params,
        "exp_name": exp_name
    }

    v = Validation(validation_param, exp_conf, train_df, test_df, logger)
    # NOTE(review): unpacks 4 values; confirm the Validation variant in scope
    # returns exactly 4 items — other versions of do_valid_kfold return 6.
    clf, oof, prediction, feature_importance = v.do_valid_kfold(model_conf)

    test_pred = prediction.copy()

    # Threshold search on the full OOF to map regression output to classes.
    optR = OptimizedRounder()
    optR.fit(oof, train_df[target])
    coefficients = optR.coefficients()

    opt_preds = optR.predict(oof, coefficients)
    # logger.info(f'valid qwk : {qwk(train_df[target], opt_preds)}')
    print(f'valid qwk : {qwk(train_df[target], opt_preds)}')
# Exemple #4 (score: 0) — scrape-site snippet separator, commented out so the file parses
    def do_adversarial_valid_kfold(self, model_conf, n_splits=2):
        """Adversarial validation: train a classifier to separate train/test.

        Fits models on stratified folds against the binary ``is_test``
        target, accumulates OOF predictions, stores them on the train frame
        as ``pred_y``, and concatenates per-fold feature importances.

        Returns:
            (clf_list, oof, prediction, feature_importance_df,
             optimizers, valid_qwks)
        """
        sp = Splitter()
        target = "is_test"
        split_x = self.train["installation_id"]
        split_y = self.train[target]
        seed = 773  # fixed seed for reproducible folds
        sp.get_kfold_idx(split_x,
                         split_y,
                         seed,
                         n_cv=n_splits,
                         stratified=True,
                         pref="adv")

        # (removed unused local `target_length = 1`)
        oof: ndarray = np.zeros(self.train.shape[0])
        # NOTE(review): `prediction` is never filled in this variant and is
        # returned as all zeros — confirm that is intended.
        prediction = np.zeros(self.test.shape[0])

        clf_list = []

        self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
        self.logger.log(logging.DEBUG, model_conf["train_cols"])
        self.validation_scores = []

        optimizers = []
        valid_qwks = []

        for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
            self.logger.log(logging.DEBUG, "-" * 60)
            self.logger.log(logging.DEBUG, f"start training: {i}")

            with timer(f"fold {i}", self.logger):
                train_df, valid_df = self.train.loc[trn_idx], self.train.loc[
                    val_idx]
                model = self.generate_model(model_conf)
                clf, fold_oof, feature_importance_df = model.train(
                    train_df, valid_df, self.logger)

                # calc validation score using clf.best_iteration_
                fold_val_score = get_val_score(valid_df[target], fold_oof)
                self.validation_scores.append(fold_val_score)

                # NOTE(review): OptimizedRounder/QWK against a binary is_test
                # target looks carried over from the regression variant —
                # verify it is meaningful here.
                optR = OptimizedRounder()
                optR.fit(fold_oof, valid_df[target])
                coefficients = optR.coefficients()
                opt_preds = optR.predict(fold_oof, coefficients)
                fold_qwk = qwk(valid_df[target], opt_preds)
                optimizers.append(optR)
                valid_qwks.append(fold_qwk)

                self.logger.log(logging.DEBUG,
                                f"fold_val_score: {fold_val_score:,.5f}")

                clf_list.append(clf)
                oof[val_idx] = fold_oof

                feature_importance_df["fold"] = i
                self.feature_importance.append(feature_importance_df)

        self.logger.log(
            logging.DEBUG,
            f"Total Validation Score: {sum(self.validation_scores) / len(self.validation_scores):,.5f}"
        )

        # NOTE(review): expm1 implies the model predicted log1p-transformed
        # values — confirm the target transform for this variant.
        oof = np.expm1(oof)
        self.train["pred_y"] = oof
        self.feature_importance = pd.concat(self.feature_importance, axis=0)

        return clf_list, oof, prediction, self.feature_importance, optimizers, valid_qwks