def baseline_test_score(n_neighbors, Average_weights, lag, outputlength, train_ratio, horizon, weights):
    """Score the regressor from ``getRegressor`` on shops 1..2000.

    For every shop id the truth values and the ``ItePredict`` output are
    collected, a per-shop ``loss`` is recorded, and finally one overall
    ``loss`` over the concatenated frames is computed.  All losses are
    written to a CSV file under ``HOME/Dropbox/dataset/Analysis/
    IterativeModel/Tuning``.

    The process is terminated with ``sys.exit(0)`` as soon as a shop's
    truth and prediction differ in length.

    Returns:
        The overall loss over all shops.
    """
    print("Getting regressor...")
    regressor = getRegressor(n_neighbors, Average_weights, lag, outputlength,
                             train_ratio, horizon, weights)
    print("Done")

    truth_parts = []
    predicted_parts = []
    loss_rows = []
    for shop_id in range(1, 2001):
        print(shop_id)
        truth = get_truth_value_last_elements(shop_id, lag, outputlength, horizon)
        truth_parts.append(truth)

        features = get_feature_including_mean_diff(shop_id, lag, outputlength)
        predicted = ItePredict(regressor, features, horizon, lag, weights)
        if len(truth) != len(predicted):
            # A length mismatch aborts the whole run after naming the shop.
            print("found %s" % shop_id)
            sys.exit(0)

        predicted = pd.DataFrame(predicted)
        predicted_parts.append(predicted)
        loss_rows.append([shop_id, loss(predicted, truth)])

    all_truth = pd.concat(truth_parts)
    all_predicted = pd.concat(predicted_parts)
    overall_loss = loss(all_predicted, all_truth)
    loss_rows.append(['all', overall_loss])

    out_dir = os.path.join(HOME, "Dropbox", "dataset", "Analysis",
                           "IterativeModel", "Tuning")
    suffix = "neghbours-%s_targetweight-%s_lag-%s_day-%s.csv" % (
        n_neighbors, Average_weights, lag, datetime.datetime.now().date())
    out_path = os.path.join(out_dir, get_name_from_weights(weights) + suffix)

    report = pd.DataFrame(loss_rows, columns=['shop_id', 'loss'])
    report.to_csv(out_path, index=False)
    return overall_loss
def _fit_grid_stage(X, y_target, fixed_params, param_grid, cv_folds):
    """Fit one GridSearchCV stage over ``param_grid`` and return the search.

    ``fixed_params`` are passed unchanged to ``xgb.XGBRegressor``.  The search
    uses the module-level ``scorer`` and a shuffled ``KFold`` seeded with 0.
    The full ``cv_results_`` are printed after fitting.
    """
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(**fixed_params),
        param_grid=param_grid,
        scoring=scorer,
        cv=KFold(n_splits=cv_folds, shuffle=True, random_state=0),
        n_jobs=-1)
    search.fit(X, y_target)
    print(search.cv_results_)
    return search


def _append_grid_report(report_file, header, search, param_grid, fixed_params):
    """Append one grid-search stage (grid, fixed params, scores) to ``report_file``."""
    with open(report_file, 'a+') as fw:
        fw.write(header)
        fw.write(str(search.cv_results_))
        fw.write('\n....:\n')
        fw.write(str(param_grid) + '\n')
        fw.write(str(fixed_params) + '\n')
        fw.write(str(search.cv_results_['mean_test_score']) + '\n')
        fw.write(str(search.best_params_) + str(search.best_score_) + '\n')


def xgBoost_out14(source, day, predictors, predictors_type, ifGS=True,
                  target_variables=('Tar_1', 'Tar_2'), ifCompetition=False,
                  useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
                  X_test_comp='l'):
    """Tune or evaluate an XGBoost regressor for a single target column.

    Only ``target_variables[day - 1]`` is processed (nothing happens when
    that index is out of range).

    * ``ifGS`` truthy: run four consecutive grid-search stages
      (max_depth/min_child_weight, gamma, subsample/colsample_bytree,
      reg_alpha) with 1000 estimators, append every stage to a text report
      in ``ReportFolder`` and then terminate the process via
      ``sys.exit(0)``.
    * ``ifGS == False``: use ``xgb.cv`` with a fixed parameter set to pick
      the number of boosting rounds, train on a 90% split, evaluate on the
      remaining 10% with ``loss`` and append the outcome to a second report.

    Args:
        source: DataFrame holding the predictor columns, the target column
            and the ``shop_id`` / ``day`` columns.
        day: 1-based position of the target inside ``target_variables``.
        predictors: list of predictor column names.
        predictors_type: label used only in the report file names.
        ifGS: selects grid search (truthy) or round selection (``False``).
        target_variables: sequence of target column names.
        ifCompetition, useTrainCV, X_test_comp: accepted for interface
            compatibility; not read by this function.
        cv_folds: number of folds for ``KFold`` and ``xgb.cv``.
        early_stopping_rounds: forwarded to ``xgb.cv`` and ``xgb.train``.

    Returns:
        None.
    """
    for tar in target_variables[day - 1:day]:
        report_file = os.path.join(
            ReportFolder,
            "xgBoost_14out_removeSHOPID_%s_day_%s.txt" % (predictors_type, tar))

        # 1. Get data.
        X = source[predictors]
        y = source[[tar, 'shop_id', 'day']]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.10, random_state=0)

        # Fixed number of estimators shared by every grid-search stage.
        best_rounds = 1000

        if ifGS:
            # 2. Stage 1: tree shape.
            cv_params_1 = {
                'max_depth': [6, 7, 8, 9],
                'min_child_weight': [1, 3]
            }
            ind_params_1 = {
                'learning_rate': 0.1,
                'seed': 0,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'objective': 'reg:linear',
                'silent': 1,
                'n_estimators': best_rounds
            }
            search = _fit_grid_stage(X, y[tar], ind_params_1, cv_params_1,
                                     cv_folds)
            print(type(search.cv_results_))
            print("%s %s" % (search.best_params_, search.best_score_))
            best_params_1 = search.best_params_
            best_score_1 = search.best_score_
            _append_grid_report(report_file, "-------------------!!!!\\:\n",
                                search, cv_params_1, ind_params_1)

            # Stage 2: gamma (optional to tune).
            cv_params_2 = {'gamma': [0.1, 0.3]}
            ind_params_2 = {
                'learning_rate': 0.1,
                'subsample': 0.8,
                'seed': 0,
                'colsample_bytree': 0.8,
                'objective': 'reg:linear',
                'max_depth': 3,
                'min_child_weight': 1,
                'n_estimators': best_rounds
            }
            ind_params_2.update(best_params_1)
            search = _fit_grid_stage(X, y[tar], ind_params_2, cv_params_2,
                                     cv_folds)
            best_params_2 = search.best_params_
            if search.best_score_ < best_score_1:
                # Tuning gamma scored lower than stage 1: fall back to 0.0.
                best_params_2 = {'gamma': 0.0}
            # When the fallback is not taken this updates the dict owned by
            # the search object, so the report line below shows the merged
            # parameters (unchanged from the previous behaviour).
            best_params_2.update(best_params_1)
            _append_grid_report(report_file, "-------------------\\:\n",
                                search, cv_params_2, ind_params_2)

            # 3. Stage 3: row / column subsampling.
            cv_params_3 = {
                'subsample': [0.8, 0.9],
                'colsample_bytree': [0.8, 0.9]
            }
            ind_params_3 = {
                'learning_rate': 0.1,
                'seed': 0,
                'colsample_bytree': 0.8,
                'objective': 'reg:linear',
                'max_depth': 6,
                'min_child_weight': 1,
                'n_estimators': best_rounds
            }
            ind_params_3.update(best_params_2)
            search = _fit_grid_stage(X, y[tar], ind_params_3, cv_params_3,
                                     cv_folds)
            best_params_3 = search.best_params_
            best_params_3.update(best_params_2)
            _append_grid_report(report_file, "-------------------\\:\n",
                                search, cv_params_3, ind_params_3)

            # Stage 4: L1 regularisation.
            cv_params_5 = {'reg_alpha': [100, 150]}
            ind_params_5 = {
                'learning_rate': 0.1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'seed': 0,
                'objective': 'reg:linear',
                'max_depth': 6,
                'min_child_weight': 1,
                'n_estimators': best_rounds
            }
            ind_params_5.update(best_params_3)
            search = _fit_grid_stage(X, y[tar], ind_params_5, cv_params_5,
                                     cv_folds)
            _append_grid_report(report_file, "-------------------\\:\n",
                                search, cv_params_5, ind_params_5)

            # Grid search is a stand-alone run: stop the process here.
            sys.exit(0)

        # 3. xgb.cv chooses the number of boosting rounds.
        # The comparison is kept as ``== False`` on purpose so that values
        # such as None keep skipping this branch exactly as before.
        if ifGS == False:  # noqa: E712
            xgb_param = {
                'reg_alpha': 100,
                'colsample_bytree': 0.8,
                'learning_rate': 0.05,
                'min_child_weight': 3,
                'subsample': 0.9,
                'seed': 0,
                'objective': 'reg:linear',
                'max_depth': 7,
                'gamma': 0.3
            }

            dtrain = xgb.DMatrix(X_train.values, y_train[tar].values,
                                 feature_names=X_train.columns.values)
            deval = xgb.DMatrix(X_test.values, y_test[tar].values,
                                feature_names=X_test.columns.values)
            watchlist = [(dtrain, 'train'), (deval, 'val')]

            # Cross-validation runs on the full data set, not on the split.
            xgtrain = xgb.DMatrix(source[predictors].values,
                                  label=source[tar].values,
                                  feature_names=X_train.columns.values)
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=10000,
                              nfold=cv_folds,
                              early_stopping_rounds=early_stopping_rounds,
                              seed=0, show_stdv=False)
            # Number of rows in the cv result is used as the round count.
            print(cvresult.shape[0])

            clf = xgb.train(xgb_param, dtrain,
                            num_boost_round=cvresult.shape[0],
                            evals=watchlist,
                            early_stopping_rounds=early_stopping_rounds)

            feature_imp = clf.get_fscore()
            sorted_scoreDic = sorted(feature_imp.items(),
                                     key=operator.itemgetter(1), reverse=True)

            report_file_confirm_iter = os.path.join(
                ReportFolder,
                "setRounds_xgBoost_14out_removeSHOPID_%s_day_%s.txt" % (
                    predictors_type, tar))

            y_pred = clf.predict(
                xgb.DMatrix(X_test.values,
                            feature_names=X_test.columns.values))
            loss_score = loss(y_pred, y_test[tar], ifchecked=False)
            print("loss_score:  %s" % (loss_score,))

            # NOTE(review): the header text mentions 0.04 while xgb_param
            # above uses learning_rate 0.05 -- confirm which is intended.
            with open(report_file_confirm_iter, 'a+') as fw:
                fw.write(
                    "-------------------After choose all paras, test best rounds for a smaller learning rate, 0.04\\:\n"
                )
                fw.write(str(sorted_scoreDic))
                fw.write(str(xgb_param))
                fw.write("\n")
                fw.write(str(cvresult.shape[0]))
                fw.write('loss_score: ')
                fw.write(str(loss_score))
def Model_for_competition(algorithm_name, pickle_model_file_name, fgfs, ReportFolder, source, predictors_type, shop_range, key, predictors, target_variables, estimator_output_length=14, required_prediction_length=14, ifSaveModel=False, predict_mode=("drop", "filled"), n_estimators=300, min_samples_split=2, min_samples_leaf=1, iterOrCopy="iterative", estimator_withParams="l", xgb_param="l", early_stopping_rounds=50):
    """Train one XGBoost booster per target ``Tar_1`` .. ``Tar_7`` and log losses.

    Every row of ``source`` except the last is used for training; the last
    row is the validation set used for early stopping and for the reported
    losses.  For each target the overall loss and the per-shop loss on the
    validation rows are collected, and all rows are written once to
    ``oldFeatures_num_rounds_cluster_shopID_tar.csv`` in the working
    directory (the file is overwritten).

    Nothing is trained unless ``algorithm_name == "xgBoost"``.

    Args:
        algorithm_name: only ``"xgBoost"`` triggers training.
        source: DataFrame with ``shop_id``, ``day``, the predictor columns
            and the target columns.
        key: value written to the ``cluster_label`` column.
        predictors: list of predictor column names.
        target_variables: list of target column names.
        Other parameters are accepted for interface compatibility and are
        not read by this function (boosting settings are fixed below).

    Returns:
        True.
    """
    X = source[['shop_id', 'day'] + predictors]
    y = source[target_variables + ['shop_id', 'day']]

    if algorithm_name != "xgBoost":
        return True

    # Hold out the last row for validation; train on everything before it.
    X_fit = X[predictors][:-1]
    y_fit = y[:-1]
    X_val = X[predictors][-1:]
    y_val = y[-1:]

    booster_params = {
        'subsample': 0.9,
        'reg_alpha': 100,
        'seed': 0,
        'colsample_bytree': 0.8,
        'objective': 'reg:linear',
        'learning_rate': 0.04,
        'max_depth': 6,
        'min_child_weight': 1,
        'gamma': 0.0
    }

    result_columns = [
        'cluster_label', 'Tar_label', 'loss_score', 'shop_id',
        'loss_theLast14Days', 'bst.best_ntree_limit'
    ]
    result_rows = []
    # shop_id is kept separately because the per-target frames below hold
    # only the target column.
    val_shop_ids = y_val['shop_id'].values

    for day in range(1, 8):
        tar = 'Tar_%s' % day

        dtrain = xgb.DMatrix(X_fit.values, y_fit[tar].values,
                             feature_names=X_fit.columns.values)
        deval = xgb.DMatrix(X_val.values, y_val[tar].values,
                            feature_names=X_val.columns.values)
        watchlist = [(dtrain, 'train'), (deval, 'val')]

        clf = xgb.train(booster_params, dtrain, num_boost_round=5000,
                        early_stopping_rounds=100, evals=watchlist)

        y_pred = clf.predict(
            xgb.DMatrix(X_val.values, feature_names=X_val.columns.values))
        loss_score = loss(y_pred, y_val[tar], ifchecked=False)

        for sh in sorted(set(val_shop_ids)):
            shop_mask = val_shop_ids == sh
            # NOTE(review): shaped as (rows, 1) prediction vs. one-column
            # frame to mirror the original two-dimensional call; confirm
            # this is what ``loss`` expects for a single target.
            shop_loss = loss(y_pred[shop_mask].reshape(-1, 1),
                             y_val.loc[shop_mask, [tar]])
            result_rows.append([key, day, loss_score, sh, shop_loss,
                                clf.best_ntree_limit])

    results_df = pd.DataFrame(result_rows, columns=result_columns)
    results_df.to_csv('oldFeatures_num_rounds_cluster_shopID_tar.csv',
                      index=False)
    return True
def rfOrETR_para_tuning(source, pickle_model_file_name, ReportFolder, estimator_withParams, fgfs, report_file,report_per_shop, predictors_type, predictors, target_variables, iterOrCopy = ["iterative"], estimator_output_length = 14, required_prediction_length = 14, n_folds = 5, ifSavePickle= False):
    """Fit ``estimator_withParams`` and report losses for both prediction modes.

    The estimator is fitted on the predictor columns and the first
    ``estimator_output_length`` targets, wrapped in an
    ``IterativePredictionModel`` and evaluated on a 10% split for the modes
    ``"iterative"`` and ``"copy"``.  For each mode one summary row (oob
    score, loss, settings, feature importances sorted descending) is written
    to ``report_file`` and one ``shop,loss`` line per test shop is appended
    to a per-shop CSV inside ``ReportFolder``.

    The incoming ``report_per_shop`` and ``iterOrCopy`` values are replaced
    inside the loop.  ``algorithm_name``, ``n_estimators``,
    ``min_samples_split`` and ``min_samples_leaf`` are read from the
    enclosing module scope.
    """
    all_targets = target_variables
    fitted_targets = target_variables[0:estimator_output_length]

    X = source[['shop_id', 'day'] + predictors]
    y = source[all_targets + ['shop_id', 'day']]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10,random_state = 0)
    # NOTE(review): the training half of the split is discarded and the fit
    # uses every row of X / y, including the rows in X_test -- confirm this
    # is intended.
    X_train = X[predictors]
    y_train = y[fitted_targets]
    if len(fitted_targets) == 1:
        y_train = y_train.values.ravel()

    print("fitting regressor..")
    estimator_withParams.fit(X_train, y_train)
    print("finish!")

    model = IterativePredictionModel(estimator_withParams, estimator_output_length, required_prediction_length, fgfs, predictors)

    # Columns of y_test that hold targets (everything but the two id columns).
    is_target_column = np.logical_and(y_test.columns != 'shop_id', y_test.columns != 'day')
    test_shop_ids = y_test['shop_id'].values

    for mode in ["iterative", "copy"]:
        # NOTE(review): ``iter`` is formatted into the name here; unless the
        # module rebinds it, that is the builtin -- verify.
        per_shop_name = "%s_%s_iterLength_%s_n%s_minS%s_minL%s_iterOrCopy_%s.csv" % (
            algorithm_name, predictors_type, iter, n_estimators, min_samples_split, min_samples_leaf, mode)
        per_shop_path = os.path.join(ReportFolder, per_shop_name)

        y_pred = model.do_iterative_prediction(X_test, how = mode)
        loss_score = loss(y_pred, y_test[all_targets])
        print("loss:  %s" % (loss_score,))

        # First occurrence wins if a predictor name is repeated.
        importance_by_name = {}
        for idx, score in enumerate(estimator_withParams.feature_importances_):
            importance_by_name.setdefault(predictors[idx], score)
        ranked = sorted(importance_by_name.items(), key=operator.itemgetter(1), reverse=True)

        oob = estimator_withParams.oob_score_
        summary_values = [oob, loss_score, n_estimators, min_samples_split,
                          min_samples_leaf, predictors_type, iter, mode]
        summary_columns = ['oob_score_result', 'loss_score', 'n_estimators',
                           'min_samples_split', 'min_samples_leaf',
                           'predictors_type', 'iterLength', 'iterOrCopy']
        summary_columns += [name for name, _ in ranked]
        summary_values += [score for _, score in ranked]
        summary = pd.DataFrame([summary_values], columns = summary_columns)

        # Append when the report already exists, otherwise create it.
        write_mode = 'a' if os.path.exists(report_file) else 'w'
        summary.to_csv(report_file, index=False, mode=write_mode)

        # shop id -> [flattened predictions, flattened truths]
        per_shop = {}
        for row_idx, shop_id in enumerate(test_shop_ids):
            bucket = per_shop.setdefault(shop_id, [[], []])
            bucket[1].extend(y_test.iloc[row_idx, is_target_column])
            bucket[0].extend(y_pred[row_idx])

        shop_rows = [
            [shop, loss(per_shop[shop][0], per_shop[shop][1])]
            for shop in range(1, 2001)
            if shop in per_shop
        ]

        with open(per_shop_path, 'a+') as fw:
            for shop_row in shop_rows:
                fw.write(','.join(map(str, shop_row)) + '\n')