# Shared imports used by the snippets below (inferred from the call sites).
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgbm
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def lgbm_regressor():
    train, test = load_datasets(filename='../input/data_560.csv')
    X_train, y_train, X_test, df_columns = getDataSet(train, test)
    models = ['lr_data_560' + str(i) for i in range(201, 300)]
    training_lgbm_regressor(X_train, y_train, X_test, df_columns,
                            ratio=0.8, model_name=models)
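# getDataSet() is not shown in this section. A minimal sketch of what the call
# sites imply, mirroring the Id/Score handling in the xgboost main() further
# below; the real helper may differ.
def getDataSet(train, test):
    """Split the raw frames into model inputs (assumed signature)."""
    X_train = train.drop(['Id', 'Score'], axis=1)
    X_test = test.drop(['Id'], axis=1)
    y_train = train['Score']
    return X_train, y_train, X_test, X_train.columns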
def main():
    print('load train test datasets')
    train, test = load_datasets(dropDuplicate=False)
    batch_size = 1
    submit_df = pd.DataFrame({'Id': test['Id']})
    submit_pred = np.zeros((test.shape[0], 1))
    submit_pred_n = np.zeros((test.shape[0], batch_size))
    submit_train = np.zeros((train.shape[0], 1))
    submit_train_n = np.zeros((train.shape[0], batch_size))
    test_rmses = []

    # bacth() (sic) and threshold() are defined elsewhere in the repo;
    # see the hedged sketches after this function.
    for i, train_all in enumerate(bacth(train, batch_size)):
        print('batch %d dataset: training started' % i)
        y_train_pred, y_test_pred, mean_test_rmse = sub_train(train_all, test,
                                                              hasFilter=False)
        submit_pred_n[:, i] = y_test_pred.reshape(-1)
        submit_train_n[:, i] = y_train_pred.reshape(-1)
        test_rmses.append(mean_test_rmse)
        print('batch %d dataset: training finished' % i)

    print('train finished...')
    test_rmse = np.mean(test_rmses)
    print('mean test rmse: ', test_rmse)
    submit_pred[:] = submit_pred_n.mean(axis=1).reshape(-1, 1)
    submit_train[:] = submit_train_n.mean(axis=1).reshape(-1, 1)

    # test predictions
    submit_df['Score_xgb_bagging'] = submit_pred
    timestamp = time.strftime('%m%d%H%M', time.localtime(time.time()))
    submission_path_raw = '../models/__models__/{}_{}_{}.csv'.format(
        'xgboost_bagging_test', test_rmse, timestamp)
    submission_path_threshold = '../result/{}_{}_{}.csv'.format(
        'xgboost_threshold', test_rmse, timestamp)
    submit_df.to_csv(submission_path_raw, index=False)
    submit_df = threshold(submit_df, feature='Score_xgb_bagging')
    submit_df.to_csv(submission_path_threshold, index=False, header=False)

    # train predictions (copy so we do not write into a view of `train`)
    submit_train_df = train[['Id']].copy()
    submit_train_df['Score_xgb_bagging'] = submit_train
    submit_train_df.to_csv(submission_path_raw.replace('test', 'train'),
                           index=False)
    print('done.')
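# Neither bacth() nor threshold() appears in this section. Minimal sketches of
# what the call sites imply; both are assumptions, not the repo's actual code.
def bacth(train, batch_size):
    """Yield the training frame `batch_size` times (assumed; the real helper
    may resample rows or columns to get diversity for bagging)."""
    for _ in range(batch_size):
        yield train


def threshold(df, feature):
    """Snap bagged regression scores to the valid label range. The 1..5 range
    is assumed from the 5-class setup in the xgboost main() below; the real
    thresholding rule may differ."""
    df[feature] = df[feature].clip(1, 5).round().astype(int)
    return df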
def regression():
    train, test = load_datasets(fillNan=True)
    X_train, y_train, X_test, df_columns = getDataSet(train, test)

    # note: the `normalize` argument was removed in scikit-learn 1.2;
    # this code targets an older release.
    model_name = ['ridge_1', 'ridge_2', 'lasso_1', 'lasso_2']
    clfs = [
        Ridge(fit_intercept=True, alpha=8.858667904100823, max_iter=500,
              normalize=False, tol=0.01),
        Ridge(fit_intercept=True, alpha=8.858667904100823, max_iter=500,
              normalize=True, tol=0.01),
        Lasso(fit_intercept=True, alpha=8.858667904100823, max_iter=500,
              normalize=True, tol=0.01),
        Lasso(fit_intercept=True, alpha=8.858667904100823, max_iter=500,
              normalize=False, tol=0.01)
    ]
    training_regression(X_train, y_train, X_test, df_columns, clfs,
                        kBest=True, k=476, ratio=1, model_name=model_name)

    model_name = ['ridge_3', 'ridge_4', 'lasso_3', 'lasso_4']
    training_regression(X_train, y_train, X_test, df_columns, clfs,
                        minMaxScaler=MinMaxScaler((-1, 1)), kBest=True, k=476,
                        ratio=1, model_name=model_name)
def main():
    print('load train test datasets')
    usebatch = False
    # configuration display
    print('batch is used: {}'.format('Yes' if usebatch else 'No'))
    # batch_with_gain_importance() is defined elsewhere; see the sketch
    # after this function.
    with open('../input/feature.info', 'a+') as outf:
        if usebatch:
            for train, test in batch_with_gain_importance(
                    featurefile='../models/info/gain_importance_data.csv',
                    filename='../input/data.csv'):
                _train(train, test, outf)
        else:
            train, test = load_datasets(filename='../input/data_560.csv')
            _train(train, test, outf)
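# batch_with_gain_importance() is not shown in this section. A sketch of the
# assumed behaviour: read per-feature gain importances, rank them, and yield
# (train, test) pairs restricted to growing feature subsets so _train() runs
# once per batch. The column names ('feature', 'gain') and the batch step are
# assumptions.
def batch_with_gain_importance(featurefile, filename, step=100):
    importance = pd.read_csv(featurefile)
    ranked = importance.sort_values('gain', ascending=False)['feature'].tolist()
    train, test = load_datasets(filename=filename)
    for end in range(step, len(ranked) + step, step):
        cols = ranked[:end]
        yield train[['Id', 'Score'] + cols], test[['Id'] + cols]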
def main():
    # load the datasets
    train, test = load_datasets()
    # load the feature importance ranking
    features = most_importance('split_importance_02181349.csv')
    best_feature = []
    best_score = 0          # score = 1 / (1 + rmse), so higher is better
    drop_feature = []
    drop_len = 10
    drop_freq = defaultdict(int)
    all_feature = features[::]  # full copy of the ranking, kept for reference

    # greedy forward selection: try one feature at a time and keep it if the
    # validation score stays within 0.000245 of the best seen so far
    while len(features) != 0:
        choose_feature = features.pop(0)
        candidate_feature = best_feature.copy()
        candidate_feature.append(choose_feature)
        train_feature = train[candidate_feature]
        train_label = train['Score']
        train_feature, valid_feature, train_label, valid_label = train_test_split(
            train_feature, train_label, test_size=0.3, random_state=0)
        train_feature = np.array(train_feature.values)
        valid_feature = np.array(valid_feature.values)
        train_label = np.array(train_label).reshape(-1)
        valid_label = np.array(valid_label).reshape(-1)

        lgb_train = lgbm.Dataset(train_feature, label=train_label)
        lgb_eval = lgbm.Dataset(valid_feature, label=valid_label,
                                reference=lgb_train)
        lgbm_params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'min_child_weight': 20,
            'num_leaves': 2 ** 5,
            'lambda_l2': 2,
            'subsample': 0.5,
            'colsample_bytree': 1,
            'learning_rate': 0.1,
            'seed': 2017,
            'verbose': 100,
            'silent': True,
        }
        model = lgbm.train(lgbm_params, lgb_train, num_boost_round=5000,
                           valid_sets=lgb_eval, early_stopping_rounds=100)
        valid_pred = model.predict(valid_feature,
                                   num_iteration=model.best_iteration)
        # rmse() is defined elsewhere; see the sketch after this function
        valid_score = 1 / (1 + rmse(valid_label, valid_pred))

        if valid_score >= best_score - 0.000245:
            best_feature.append(choose_feature)
            best_score = max(best_score, valid_score)
            with open('../models/info/sub_feature_02182137.csv', 'a+') as out:
                out.write(str(valid_score) + ',' + choose_feature + ',' +
                          str(len(drop_feature)) + '\n')
        else:
            # park the rejected feature; once the drop queue fills up, requeue
            # the features dropped only once and double the queue capacity
            drop_freq[choose_feature] += 1
            drop_feature.append(choose_feature)
            if len(drop_feature) == drop_len:
                for val in drop_feature[::-1]:
                    if drop_freq[val] > 1:
                        continue
                    features.insert(0, val)
                drop_feature = []
                drop_len += drop_len
            with open('../models/info/sub_feature_02182137.log', 'a+') as out:
                out.write('current drop queue length {}:'.format(
                    len(drop_feature)) + '\n')
                out.write(' || '.join(drop_feature) + '\n')
                for drop in drop_feature:
                    out.write('{} {}'.format(drop_freq[drop], drop) + '\n')
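# rmse() and most_importance() are not shown in this section. rmse is assumed
# to be the standard root-mean-square error; most_importance is assumed to
# return feature names ranked by the importance file (the directory and column
# names are guesses based on the paths used elsewhere in this code).
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))


def most_importance(filename):
    df = pd.read_csv('../models/info/' + filename)
    return df.sort_values('importance', ascending=False)['feature'].tolist()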
def main():
    print('load train test datasets')
    train, test = load_datasets()
    submit_df = pd.DataFrame({'userid': test['Id']})

    X_train = train.drop(['Id', 'Score'], axis=1)
    X_test = test.drop(['Id'], axis=1)
    # shift labels from 1..5 to 0..4 for multi:softmax
    y_train = train['Score'] - 1
    df_columns = X_train.columns

    xgb_params = {
        'eta': 0.01,
        'min_child_weight': 20,
        'colsample_bytree': 0.5,
        'max_depth': 10,
        'subsample': 0.5,
        'lambda': 2.0,
        'scale_pos_weight': 1,
        'eval_metric': 'mlogloss',
        'objective': 'multi:softmax',
        'silent': 1,
        'booster': 'gbtree',
        'num_class': 5
    }
    dtrain_all = xgb.DMatrix(X_train.values, y_train.values,
                             feature_names=df_columns)
    dtest = xgb.DMatrix(X_test.values, feature_names=df_columns)

    # 5-fold cross-validation
    nfold = 5
    cv_result = xgb.cv(dict(xgb_params), dtrain_all, nfold=nfold,
                       stratified=True, num_boost_round=10000,
                       early_stopping_rounds=100, verbose_eval=100,
                       show_stdv=False)
    best_num_boost_rounds = len(cv_result)
    # average the final rounds of the CV curve to smooth the estimate
    mean_train_mlogloss = cv_result.loc[best_num_boost_rounds - 11:
                                        best_num_boost_rounds - 1,
                                        'train-mlogloss-mean'].mean()
    mean_test_mlogloss = cv_result.loc[best_num_boost_rounds - 11:
                                       best_num_boost_rounds - 1,
                                       'test-mlogloss-mean'].mean()
    print('best_num_boost_rounds = {}'.format(best_num_boost_rounds))
    # num_boost_round = int(best_num_boost_rounds * 1.1)
    # print('num_boost_round = ', num_boost_round)
    print('mean_train_mlogloss = {:.7f} , mean_test_mlogloss = {:.7f}\n'.format(
        mean_train_mlogloss, mean_test_mlogloss))

    print('---> training on total dataset')
    model = xgb.train(dict(xgb_params), dtrain_all,
                      num_boost_round=best_num_boost_rounds)

    print('---> predict test')
    y_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    submit_df['Score'] = y_pred
    # shift predicted classes back from 0..4 to 1..5
    submit_df['Score'] = submit_df['Score'] + 1
    print(y_pred)

    submission_path = '../result/{}_{}.csv'.format('xgb', mean_test_mlogloss)
    submit_df.to_csv(submission_path, index=False, header=False)
    print('done.')