    def objective(params):

        iteration_start = time.time()

        # print(params)
        params.update({'n_estimators': 500, 'random_state': 42, 'n_jobs': -1})

        model = lgb_model(params, mode)
        model.fit(Xtrain, ytrain)

        if mode == 'regression':
            pred = model.predict(Xtest)
            loss = np.sqrt(mean_squared_error(ytest, pred))
        elif mode == 'classification':
            pred = model.predict_proba(Xtest)[:, 1]
            loss = -roc_auc_score(ytest, pred)

        iteration_time = time.time() - iteration_start
        print('iteration time %.1f, loss %.5f' % (iteration_time, loss))

        return {
            'loss': loss,
            'status': STATUS_OK,
            'runtime': iteration_time,
            'params': params
        }
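# --- Hedged usage sketch (not part of the original snippet) ---
# An objective written in this form (returning a dict with 'loss' and STATUS_OK)
# is meant to be minimized with hyperopt's fmin; fspace_lgb below stands for the
# LightGBM search space that is sampled elsewhere in these examples, and the
# iteration budget is illustrative.
from hyperopt import Trials, fmin, tpe

trials = Trials()
best = fmin(
    fn=objective,       # the objective defined above
    space=fspace_lgb,   # assumed hyperopt search space over LightGBM params
    algo=tpe.suggest,   # Tree-structured Parzen Estimator
    max_evals=50,       # illustrative number of hyperopt iterations
    trials=trials,      # keeps the per-iteration loss, runtime and params
)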
Example #2
def lgb_importance_fs(df, y, mode, BIG_DATASET_SIZE):
    """choose best features based  on lightgbm feature importance"""

    print('lightgbm feature selection..')

    # coefficient for taking fraction of data (to be sure that there won't be memory error)
    coef = 1

    # dataframe size
    df_size = df.memory_usage(deep=True).sum()

    # get subset of data if df is too big
    subset_size = min(df.shape[0],
                      int(coef * df.shape[0] / (df_size / BIG_DATASET_SIZE)))
    print('subset_size {}'.format(subset_size))
    idx = np.random.choice(df.index, size=subset_size, replace=False)

    # define model
    params = {
        'n_estimators': 100,
        'learning_rate': 0.05,
        'num_leaves': 200,
        'subsample': 1,
        'colsample_bytree': 1,
        'random_state': 42,
        'n_jobs': -1
    }
    model = lgb_model(params, mode)

    # train model
    model.fit(df.loc[idx], y.loc[idx])

    # feature importance
    feature_importance = pd.Series(
        model.booster_.feature_importance('gain'),
        index=df.columns).fillna(0).sort_values(ascending=False)
    # print(feature_importance.head(50))
    # print(feature_importance.tail(10))

    # remove totally unimportant features
    best_features = feature_importance[feature_importance > 0]

    # leave most relevant features for big dataset
    if df_size > BIG_DATASET_SIZE:
        new_feature_count = min(
            df.shape[1],
            int(coef * df.shape[1] / (df_size / BIG_DATASET_SIZE)))
        best_features = best_features.head(new_feature_count)

    # select features
    used_columns = best_features.index.tolist()
    df = df[used_columns]

    print('feature selection done')
    print('number of selected features {}'.format(len(used_columns)))

    return df, used_columns
    cat_features = [
        "is_festival_user",
        "is_LAST_2YEAR_DD_ACTIVE",
        "cafe_tag_is_mop_available",
        "IS_SR_KIT_USER",
    ]
    x_train = x_train[selects]
    x_test = x_test[selects]
    x_btest = df_btest[selects]

    adaboost_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
    lr_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
    gbdt_mdoel(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])

    xgb_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
    lgb_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
    cat_boost_model(x_train,
                    x_test,
                    y_train,
                    y_test,
                    x_btest,
                    df_btest[labels],
                    cat_features=cat_features)

    # from sklearn.feature_selection import RFECV
    # x = df_train.copy()
    # clf1 = RandomForestClassifier()
    # clf2 = GradientBoostingClassifier()
    # clf3 = XGBClassifier()
    # dt_score = make_scorer(precision_score, pos_label=1)
    # label = "new_new_isSuccess"
Example #4
    # xgb model
    xgb_prob_train, xgb_prob_test, xgb_prob_btest = xgb_model(x_train, x_test, y_train, y_test, df_btest.drop("is_sucess_by_contract",axis=1),df_btest["is_sucess_by_contract"])

    # x_train = pd.concat([x_train, rf_prob_train,gbdt_prob_train,xgb_prob_train], axis=1)[["student_no","rf_1","gbdt_1","xgb_1"]]
    # x_test = pd.concat([x_test, rf_prob_btest,gbdt_prob_test,xgb_prob_test], axis=1)[["student_no","rf_1","gbdt_1","xgb_1"]]
    # df_btest = pd.concat([df_btest, rf_prob_btest,gbdt_prob_btest,xgb_prob_btest], axis=1)[["student_no","rf_1","gbdt_1","xgb_1","is_sucess_by_contract"]]

    # x_train = pd.concat([x_train, rf_prob_train, gbdt_prob_train, xgb_prob_train], axis=1)
    # x_test = pd.concat([x_test, rf_prob_btest, gbdt_prob_test, xgb_prob_test], axis=1)
    # df_btest = pd.concat([df_btest, rf_prob_btest, gbdt_prob_btest, xgb_prob_btest], axis=1)

    # rf_prob_train, rf_prob_test, rf_prob_btest = rf_mdoel(x_train, x_test, y_train, y_test, df_btest,rename=["rf_00","rf_11"])
    # #
    # lgb model
    lgb_model(x_train, x_test, y_train, y_test, df_btest.drop("is_sucess_by_contract",axis=1),df_btest["is_sucess_by_contract"])
    #
    # #catboost
    #
    cat_boost_model(x_train, x_test, y_train, y_test, df_btest.drop("is_sucess_by_contract",axis=1),df_btest["is_sucess_by_contract"])
    #
    #major_vote
    from models import major_vote_model
    # major_vote_model(x_train, x_test, y_train, y_test, df_btest, model_weight=[0.2, 0.2, 0.2, 0.4], boundary=0.5)


    #gauss_navie_bayes
    # gauss_navie_bayes(x_train, x_test, y_train, y_test, df_btest.drop("is_sucess_by_contract",axis=1),df_btest["is_sucess_by_contract"])

    #B
    # MLPGradientCheck_model(np.array(x_train), np.array(x_test), y_train, y_test,
Example #5
    rf_mdoel(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1),
             df_btest[labels])
    # # #
    # gbdt model
    gbdt_mdoel(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1),
               df_btest[labels])

    # xgb model
    xgb_model(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1),
              df_btest[labels])

    # lgb model
    lgb_model(x_train,
              x_test,
              y_train,
              y_test,
              df_btest.drop(labels, axis=1),
              df_btest[labels],
              weight_bias=20)

    # lgb_sk
    # lgb_sk_mdoel(x_train, x_test, y_train, y_test, df_btest.drop(labels,axis=1),df_btest[labels])

    # catboost
    cat_boost_model(x_train,
                    x_test,
                    y_train,
                    y_test,
                    df_btest.drop(labels, axis=1),
                    df_btest[labels],
                    cat_features=catfeatures)
    # LR model
    # lr_model(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])
    # rf model
    rf_mdoel(x_train, x_test, y_train, y_test,
             df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])
    # # #
    # gbdt model
    # gbdt_mdoel(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon",axis=1),df_btest["is_pigeon"])

    # xgb model
    xgb_model(x_train, x_test, y_train, y_test,
              df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

    # lgb model
    lgb_model(x_train, x_test, y_train, y_test,
              df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

    # lgb_sk
    # lgb_sk_mdoel(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon",axis=1),df_btest["is_pigeon"])

    # catboost
    cat_boost_model(x_train, x_test, y_train, y_test,
                    df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

    # Gaussian Naive Bayes
    # gauss_navie_bayes(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon",axis=1),df_btest["is_pigeon"])

    #  gbdt+lr
    gbdt_plus_lr(
        x_train,
        x_test,
Example #7
def train_small_data(df, y, model_config, time_limit,
                     include_algos=['et', 'rf', 'lgb', 'xgb'], n_boost=10,
                     model_seed=None, verbose=False):

    """
    training for very small data:
    run several random models, then average them
    """

    start_time = time.time()
    mode = model_config['mode']
    models = []

    if 'et' in include_algos:

        for max_f in [0.2,0.3,0.4,0.5,0.6,0.7,0.8]:
            params = {'n_estimators': 500, 'max_depth': 20, 'max_features': max_f,
                      'n_jobs': -1, 'random_state': model_seed}
            model = et_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time()-start_time >= time_limit*0.95:
                print('time limit exceeded.')
                return models
        print('et done. total time elapsed {}'.format(time.time()-start_time))

    if 'xgb' in include_algos:

        space = [stochastic.sample(fspace_xgb) for i in range(n_boost)]
        for params in space:
            params.update({'n_estimators': 500, 'random_state': model_seed, 'n_jobs': -1})
            model = xgb_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time()-start_time >= time_limit*0.95:
                print('time limit exceeded.')
                return models
        print('xgb done. total time elapsed {}'.format(time.time()-start_time))

    if 'lgb' in include_algos:

        space = [stochastic.sample(fspace_lgb) for i in range(n_boost)]
        for params in space:
            params['num_leaves'] = int(params['num_leaves'])
            params['min_child_samples'] = int(params['min_child_samples'])
            params.update({'n_estimators': 500, 'subsample_freq': 1, 'random_state': model_seed, 'n_jobs': -1})
            model = lgb_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time()-start_time >= time_limit*0.95:
                print('time limit exceeded.')
                return models
        print('lgb done. total time elapsed {}'.format(time.time()-start_time))

    if 'rf' in include_algos:

        for max_f in [0.2,0.3,0.4,0.5,0.6,0.7,0.8]:
            params = {'n_estimators': 500, 'max_depth': 20, 'max_features': max_f,
                      'n_jobs': -1, 'random_state': model_seed}
            model = rf_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time()-start_time >= time_limit*0.95:
                print('time limit exceeded.')
                return models
        print('rf done. total time elapsed {}'.format(time.time()-start_time))

    return models
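# --- Hedged sketch (not in the original) of the "average them" step from the
# docstring above: blend predictions from the models returned by train_small_data.
# Xtest and mode are assumed to follow the same conventions as the other snippets.
import numpy as np

def average_predictions(models, Xtest, mode):
    preds = []
    for model in models:
        if mode == 'classification':
            preds.append(model.predict_proba(Xtest)[:, 1])  # positive-class probability
        else:
            preds.append(model.predict(Xtest))              # regression prediction
    return np.mean(preds, axis=0)  # simple unweighted average over all fitted models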
    if is_big or len(model_config['used_columns']) > 500:
        df, used_columns = lgb_importance_fs(df, y, args.mode,
                                             BIG_DATASET_SIZE)
        model_config['used_columns'] = used_columns
        print('time elapsed: {}'.format(time.time() - start_time))

    # final data shape
    print('final df shape {}'.format(df.shape))

    # hyperopt
    elapsed = time.time() - start_time
    params = hyperopt_lgb(df,
                          y,
                          mode=args.mode,
                          N=HYPEROPT_NUM_ITERATIONS,
                          time_limit=int((TIME_LIMIT - elapsed) * 0.7),
                          max_train_size=HYPEROPT_MAX_TRAIN_SIZE,
                          max_train_rows=HYPEROPT_MAX_TRAIN_ROWS)

    # training
    model = lgb_model(params, args.mode)
    model.fit(df, y)
    model_config['model'] = model

    # save config to file
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))
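# --- Hedged inference sketch (not in the original): load the model_config.pkl
# written above and predict on new data, assuming the 'used_columns' and 'model'
# keys set in the training snippet.
import os
import pickle

def load_and_predict(model_dir, df_new):
    with open(os.path.join(model_dir, 'model_config.pkl'), 'rb') as fin:
        model_config = pickle.load(fin)
    # keep only the columns selected at train time, then predict
    df_new = df_new[model_config['used_columns']]
    return model_config['model'].predict(df_new)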
Example #9
    def objective(params):

        iteration_start = time.time()

        # print(params)
        params.update({'n_estimators': 500, 'random_state': 42, 'n_jobs': -1})

        # define model
        if model_type == 'lgb':
            params['num_leaves'] = int(params['num_leaves'])
            params['min_child_samples'] = int(params['min_child_samples'])
            model = lgb_model(params, mode)
        elif model_type == 'xgb':
            params['n_estimators'] = 500
            params['tree_method'] = 'hist'
            model = xgb_model(params, mode)
        elif model_type == 'rf':
            params['min_samples_leaf'] = int(params['min_samples_leaf'])
            model = rf_model(params, mode)

        # training and prediction
        if cv:

            kf = KFold(n_splits=5, shuffle=True)
            pred = np.zeros(len(y))  # float array, so out-of-fold predictions are not truncated to int

            for i, (train_index, test_index) in enumerate(kf.split(X)):

                # train-validation split
                Xtrain2 = X.iloc[train_index]
                Xtest2 = X.iloc[test_index]
                ytrain2 = y.iloc[train_index]
                ytest2 = y.iloc[test_index]

                model.fit(Xtrain2, ytrain2)
                if mode == 'regression':
                    pred[test_index] = model.predict(Xtest2)
                elif mode == 'classification':
                    pred[test_index] = model.predict_proba(Xtest2)[:, 1]

            if mode == 'regression':
                loss = np.sqrt(mean_squared_error(y, pred))
            elif mode == 'classification':
                loss = -roc_auc_score(y, pred)

            model.fit(X, y)

        else:

            model.fit(Xtrain, ytrain)
            if mode == 'regression':
                pred = model.predict(Xtest)
                loss = np.sqrt(mean_squared_error(ytest, pred))
            elif mode == 'classification':
                pred = model.predict_proba(Xtest)[:, 1]
                loss = -roc_auc_score(ytest, pred)

        if blend or return_preds:
            models.append(model)
            preds.append(pred)
            scores.append(loss)

        iteration_time = time.time() - iteration_start
        print('iteration time %.1f, loss %.5f' % (iteration_time, loss))

        return {
            'loss': loss,
            'status': STATUS_OK,
            'runtime': iteration_time,
            'params': params
        }
	train_df = base_process(train_df)
	test_df = base_process(test_df)


	# 2 -- feature engineering
	train_df = create_features(train_df)
	test_df = create_features(test_df)

	## drop useless features
	drop_cols = ['用户编码', '是否黑名单客户']
	X = train_df.drop(drop_cols + ['信用分'], axis=1)
	X_submit = test_df.drop(drop_cols, axis=1)


	# 3 -- train model
	start_time = time.time()
	cv_pred, model_score = lgb_model(train_df, test_df, X, X_submit)
	print('training time: ' + str(time.time() - start_time) + 's')


	# 4 -- submit
	submit_df = test_df[['用户编码']]
	submit_df['score'] = cv_pred
	submit_df.columns = ['id', 'score']
	submit_df['score'] = submit_df['score'].apply(lambda x: int(np.round(x)))

	csv_name = './submission/baseline_' + str(time.strftime('%Y%m%d-%H:%M:%S')) + '_{}_'.format(model_score) + '.csv'
	print('saving ' + csv_name + ' <|-.-|>')
	submit_df.to_csv(csv_name, index=False)