def objective(params):
    iteration_start = time.time()
    # print(params)
    # fixed settings applied to every trial
    params.update({'n_estimators': 500, 'random_state': 42, 'n_jobs': -1})

    model = lgb_model(params, mode)
    model.fit(Xtrain, ytrain)

    if mode == 'regression':
        pred = model.predict(Xtest)
        loss = np.sqrt(mean_squared_error(ytest, pred))
    elif mode == 'classification':
        pred = model.predict_proba(Xtest)[:, 1]
        loss = -roc_auc_score(ytest, pred)

    iteration_time = time.time() - iteration_start
    print('iteration time %.1f, loss %.5f' % (iteration_time, loss))

    return {
        'loss': loss,
        'status': STATUS_OK,
        'runtime': iteration_time,
        'params': params
    }
def lgb_importance_fs(df, y, mode, BIG_DATASET_SIZE):
    """Choose the best features based on LightGBM feature importance."""
    print('lightgbm feature selection..')

    # coefficient for taking a fraction of the data (to avoid memory errors)
    coef = 1

    # dataframe size in bytes
    df_size = df.memory_usage(deep=True).sum()

    # take a row subset if df is too big
    subset_size = min(df.shape[0], int(coef * df.shape[0] / (df_size / BIG_DATASET_SIZE)))
    print('subset_size {}'.format(subset_size))
    idx = np.random.choice(df.index, size=subset_size, replace=False)

    # define model
    params = {
        'n_estimators': 100,
        'learning_rate': 0.05,
        'num_leaves': 200,
        'subsample': 1,
        'colsample_bytree': 1,
        'random_state': 42,
        'n_jobs': -1
    }
    model = lgb_model(params, mode)

    # train model
    model.fit(df.loc[idx], y.loc[idx])

    # feature importance by gain
    feature_importance = pd.Series(
        model.booster_.feature_importance('gain'),
        index=df.columns).fillna(0).sort_values(ascending=False)
    # print(feature_importance.head(50))
    # print(feature_importance.tail(10))

    # remove totally unimportant features
    best_features = feature_importance[feature_importance > 0]

    # keep only the most relevant features for a big dataset
    if df_size > BIG_DATASET_SIZE:
        new_feature_count = min(
            df.shape[1],
            int(coef * df.shape[1] / (df_size / BIG_DATASET_SIZE)))
        best_features = best_features.head(new_feature_count)

    # select features
    used_columns = best_features.index.tolist()
    df = df[used_columns]

    print('feature selection done')
    print('number of selected features {}'.format(len(used_columns)))
    return df, used_columns
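# Hedged usage sketch (assumed, not part of the original module): how lgb_importance_fs
# could be exercised on a small synthetic frame, and how the returned `used_columns`
# are reused on held-out data. The demo names and the 1 GB threshold are illustrative.
if __name__ == '__main__':
    _rng = np.random.RandomState(42)
    _df_demo = pd.DataFrame(_rng.rand(200, 10), columns=['f%d' % i for i in range(10)])
    _y_demo = pd.Series((_rng.rand(200) > 0.5).astype(int))

    _df_sel, _used_columns = lgb_importance_fs(_df_demo, _y_demo, 'classification',
                                               BIG_DATASET_SIZE=1024 ** 3)
    _df_holdout = _df_demo[_used_columns]  # apply the same column subset at predict time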
cat_features = [
    "is_festival_user",
    "is_LAST_2YEAR_DD_ACTIVE",
    "cafe_tag_is_mop_available",
    "IS_SR_KIT_USER",
]

x_train = x_train[selects]
x_test = x_test[selects]
x_btest = df_btest[selects]

adaboost_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
lr_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
gbdt_mdoel(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
xgb_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
lgb_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels])
cat_boost_model(x_train, x_test, y_train, y_test, x_btest, df_btest[labels],
                cat_features=cat_features)

# from sklearn.feature_selection import RFECV
# x = df_train.copy()
# clf1 = RandomForestClassifier()
# clf2 = GradientBoostingClassifier()
# clf3 = XGBClassifier()
# dt_score = make_scorer(precision_score, pos_label=1)
# label = "new_new_isSuccess"
# xgb model
xgb_prob_train, xgb_prob_test, xgb_prob_btest = xgb_model(
    x_train, x_test, y_train, y_test,
    df_btest.drop("is_sucess_by_contract", axis=1), df_btest["is_sucess_by_contract"])

# x_train = pd.concat([x_train, rf_prob_train, gbdt_prob_train, xgb_prob_train], axis=1)[["student_no", "rf_1", "gbdt_1", "xgb_1"]]
# x_test = pd.concat([x_test, rf_prob_btest, gbdt_prob_test, xgb_prob_test], axis=1)[["student_no", "rf_1", "gbdt_1", "xgb_1"]]
# df_btest = pd.concat([df_btest, rf_prob_btest, gbdt_prob_btest, xgb_prob_btest], axis=1)[["student_no", "rf_1", "gbdt_1", "xgb_1", "is_sucess_by_contract"]]
# x_train = pd.concat([x_train, rf_prob_train, gbdt_prob_train, xgb_prob_train], axis=1)
# x_test = pd.concat([x_test, rf_prob_btest, gbdt_prob_test, xgb_prob_test], axis=1)
# df_btest = pd.concat([df_btest, rf_prob_btest, gbdt_prob_btest, xgb_prob_btest], axis=1)
# rf_prob_train, rf_prob_test, rf_prob_btest = rf_mdoel(x_train, x_test, y_train, y_test, df_btest, rename=["rf_00", "rf_11"])

# lgb model
lgb_model(x_train, x_test, y_train, y_test,
          df_btest.drop("is_sucess_by_contract", axis=1), df_btest["is_sucess_by_contract"])

# catboost
# cat_boost_model(x_train, x_test, y_train, y_test, df_btest.drop("is_sucess_by_contract", axis=1), df_btest["is_sucess_by_contract"])

# major_vote
from models import major_vote_model
# major_vote_model(x_train, x_test, y_train, y_test, df_btest, model_weight=[0.2, 0.2, 0.2, 0.4], boundary=0.5)

# gauss_navie_bayes
# gauss_navie_bayes(x_train, x_test, y_train, y_test, df_btest.drop("is_sucess_by_contract", axis=1), df_btest["is_sucess_by_contract"])

# B
# MLPGradientCheck_model(np.array(x_train), np.array(x_test), y_train, y_test,
rf_mdoel(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1), df_btest[labels])

# gbdt model
gbdt_mdoel(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1), df_btest[labels])

# xgb model
xgb_model(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1), df_btest[labels])

# lgb model
lgb_model(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1), df_btest[labels],
          weight_bias=20)

# lgb_sk
# lgb_sk_mdoel(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1), df_btest[labels])

# catboost
cat_boost_model(x_train, x_test, y_train, y_test, df_btest.drop(labels, axis=1), df_btest[labels],
                cat_features=catfeatures)
# LR model
# lr_model(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# rf model
rf_mdoel(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# gbdt model
# gbdt_mdoel(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# xgb model
xgb_model(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# lgb model
lgb_model(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# lgb_sk
# lgb_sk_mdoel(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# catboost
cat_boost_model(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# Gaussian Naive Bayes
# gauss_navie_bayes(x_train, x_test, y_train, y_test, df_btest.drop("is_pigeon", axis=1), df_btest["is_pigeon"])

# gbdt + lr
gbdt_plus_lr(
    x_train,
    x_test,
def train_small_data(df, y, model_config, time_limit,
                     include_algos=['et', 'rf', 'lgb', 'xgb'],
                     n_boost=10, model_seed=None, verbose=False):
    """Training for very small data: run several random models, then average them."""
    start_time = time.time()
    mode = model_config['mode']
    models = []

    if 'et' in include_algos:
        for max_f in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
            params = {'n_estimators': 500, 'max_depth': 20, 'max_features': max_f,
                      'n_jobs': -1, 'random_state': model_seed}
            model = et_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time() - start_time >= time_limit * 0.95:
                print('time limit exceeded.')
                return models
        print('et done. total time elapsed {}'.format(time.time() - start_time))

    if 'xgb' in include_algos:
        space = [stochastic.sample(fspace_xgb) for i in range(n_boost)]
        for params in space:
            params.update({'n_estimators': 500, 'random_state': model_seed, 'n_jobs': -1})
            model = xgb_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time() - start_time >= time_limit * 0.95:
                print('time limit exceeded.')
                return models
        print('xgb done. total time elapsed {}'.format(time.time() - start_time))

    if 'lgb' in include_algos:
        space = [stochastic.sample(fspace_lgb) for i in range(n_boost)]
        for params in space:
            params['num_leaves'] = int(params['num_leaves'])
            params['min_child_samples'] = int(params['min_child_samples'])
            params.update({'n_estimators': 500, 'subsample_freq': 1,
                           'random_state': model_seed, 'n_jobs': -1})
            model = lgb_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time() - start_time >= time_limit * 0.95:
                print('time limit exceeded.')
                return models
        print('lgb done. total time elapsed {}'.format(time.time() - start_time))

    if 'rf' in include_algos:
        for max_f in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
            params = {'n_estimators': 500, 'max_depth': 20, 'max_features': max_f,
                      'n_jobs': -1, 'random_state': model_seed}
            model = rf_model(params, mode)
            model.fit(df, y)
            models.append(model)
            if verbose:
                print(params)
            if time.time() - start_time >= time_limit * 0.95:
                print('time limit exceeded.')
                return models
        print('rf done. total time elapsed {}'.format(time.time() - start_time))

    return models
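# Hedged sketch (assumed helper, not part of the original file). The docstring above says
# the random models are averaged; one plausible way to do that for the list returned by
# train_small_data is a simple mean over per-model predictions. `X_new` is an illustrative name.
def average_small_data_predictions(models, X_new, mode):
    """Average the predictions of the models returned by train_small_data."""
    if mode == 'regression':
        preds = [m.predict(X_new) for m in models]
    else:  # 'classification': average positive-class probabilities
        preds = [m.predict_proba(X_new)[:, 1] for m in models]
    return np.mean(preds, axis=0)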
if is_big or len(model_config['used_columns']) > 500:
    df, used_columns = lgb_importance_fs(df, y, args.mode, BIG_DATASET_SIZE)
    model_config['used_columns'] = used_columns
    print('time elapsed: {}'.format(time.time() - start_time))

# final data shape
print('final df shape {}'.format(df.shape))

# hyperopt
elapsed = time.time() - start_time
params = hyperopt_lgb(df, y, mode=args.mode,
                      N=HYPEROPT_NUM_ITERATIONS,
                      time_limit=int((TIME_LIMIT - elapsed) * 0.7),
                      max_train_size=HYPEROPT_MAX_TRAIN_SIZE,
                      max_train_rows=HYPEROPT_MAX_TRAIN_ROWS)

# training
model = lgb_model(params, args.mode)
model.fit(df, y)
model_config['model'] = model

# save config to file
model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
with open(model_config_filename, 'wb') as fout:
    pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

print('Train time: {}'.format(time.time() - start_time))
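# Hedged sketch (assumed companion, not from the original script): how a predict step
# might restore the pickled model_config, reapply the stored column subset, and score new
# data. `model_dir`, `df_test`, and `mode` mirror the names above but are assumptions here;
# os and pickle are assumed to be imported as in the training script.
def load_and_predict(model_dir, df_test, mode):
    with open(os.path.join(model_dir, 'model_config.pkl'), 'rb') as fin:
        model_config = pickle.load(fin)
    df_test = df_test[model_config['used_columns']]  # same features as at train time
    if mode == 'regression':
        return model_config['model'].predict(df_test)
    return model_config['model'].predict_proba(df_test)[:, 1]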
def objective(params):
    iteration_start = time.time()
    # print(params)
    params.update({'n_estimators': 500, 'random_state': 42, 'n_jobs': -1})

    # define model
    if model_type == 'lgb':
        params['num_leaves'] = int(params['num_leaves'])
        params['min_child_samples'] = int(params['min_child_samples'])
        model = lgb_model(params, mode)
    elif model_type == 'xgb':
        params['n_estimators'] = 500
        params['tree_method'] = 'hist'
        model = xgb_model(params, mode)
    elif model_type == 'rf':
        params['min_samples_leaf'] = int(params['min_samples_leaf'])
        model = rf_model(params, mode)

    # training and prediction
    if cv:
        kf = KFold(n_splits=5, shuffle=True)
        # float buffer so predicted probabilities are not truncated to the dtype of y
        pred = np.zeros(len(y))

        for i, (train_index, test_index) in enumerate(kf.split(X)):
            # train-validation split
            Xtrain2 = X.iloc[train_index]
            Xtest2 = X.iloc[test_index]
            ytrain2 = y.iloc[train_index]
            ytest2 = y.iloc[test_index]

            model.fit(Xtrain2, ytrain2)
            if mode == 'regression':
                pred[test_index] = model.predict(Xtest2)
            elif mode == 'classification':
                pred[test_index] = model.predict_proba(Xtest2)[:, 1]

        if mode == 'regression':
            loss = np.sqrt(mean_squared_error(y, pred))
        elif mode == 'classification':
            loss = -roc_auc_score(y, pred)

        # refit on the full data after scoring out-of-fold predictions
        model.fit(X, y)
    else:
        model.fit(Xtrain, ytrain)
        if mode == 'regression':
            pred = model.predict(Xtest)
            loss = np.sqrt(mean_squared_error(ytest, pred))
        elif mode == 'classification':
            pred = model.predict_proba(Xtest)[:, 1]
            loss = -roc_auc_score(ytest, pred)

    if blend or return_preds:
        models.append(model)
        preds.append(pred)
        scores.append(loss)

    iteration_time = time.time() - iteration_start
    print('iteration time %.1f, loss %.5f' % (iteration_time, loss))

    return {
        'loss': loss,
        'status': STATUS_OK,
        'runtime': iteration_time,
        'params': params
    }
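# Hedged sketch (assumed, not from the original module): the objective above returns the
# dict format hyperopt expects, so it could be driven by fmin with a Trials object and the
# best trial's stored 'params' reused for a final fit. `fspace_lgb`, `max_evals=50`, and the
# surrounding X/y/mode names are assumptions here.
from hyperopt import fmin, tpe, Trials

def run_hyperopt_search(objective_fn, space, X, y, mode, max_evals=50):
    trials = Trials()
    fmin(fn=objective_fn, space=space, algo=tpe.suggest,
         max_evals=max_evals, trials=trials)
    best_params = trials.best_trial['result']['params']  # params as updated inside objective
    final_model = lgb_model(best_params, mode)
    final_model.fit(X, y)
    return final_model, best_params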
train_df = base_process(train_df)
test_df = base_process(test_df)

# 2 -- feature engineering
train_df = create_features(train_df)
test_df = create_features(test_df)

## drop useless features
drop_cols = ['用户编码', '是否黑名单客户']
X = train_df.drop(drop_cols + ['信用分'], axis=1)
X_submit = test_df.drop(drop_cols, axis=1)

# 3 -- train model
start_time = time.time()
cv_pred, model_score = lgb_model(train_df, test_df, X, X_submit)
print('training time: ' + str(time.time() - start_time) + 's')

# 4 -- submit
submit_df = test_df[['用户编码']].copy()
submit_df['score'] = cv_pred
submit_df.columns = ['id', 'score']
submit_df['score'] = submit_df['score'].apply(lambda x: int(np.round(x)))

csv_name = './submission/baseline_' + str(time.strftime('%Y%m%d-%H:%M:%S')) + '_{}_'.format(model_score) + '.csv'
print('saving ' + csv_name + ' <|-.-|>')
submit_df.to_csv(csv_name, index=False)