def mul_dtree(X, Y2): forest = ExtraTreesRegressor(n_estimators=5, compute_importances=True, random_state=0) forest.fit(X[:200], Y2[:200]) forest.predict(X[200:]) print Y2[200:]
def train(self, verbose=False, training_data=None):
    """Train a two-stage stack of extra-trees regressors.

    Stage 1: for each of two feature sets ('rot_line_test_deriv' and
    'scaled_img'), fit one regressor per solution coordinate (0 and 1).
    Stage 2: on a fresh batch of samples, fuse the two stage-1 predictions
    per coordinate with another extra-trees regressor.

    Stores dtr0/dtr1, str0/str1, ftr0/ftr1 on self and sets
    self.areModelsTrained = True.

    NOTE(review): `verbose` and `training_data` are accepted but unused,
    preserved for interface compatibility. Per-sample predict() on 1-D rows
    is kept as in the original — newer sklearn expects 2-D rows; confirm the
    pinned sklearn version before upgrading.
    """
    n_estimators = 50
    n_samples = 5000

    def _fit(features, targets):
        # Helper: fit a fresh extra-trees regressor on float32 arrays.
        model = ExtraTreesRegressor(n_estimators=n_estimators)
        return model.fit(np.array(features, dtype=np.float32),
                         np.array(targets, dtype=np.float32))

    # Stage 1: two regressors per feature set, one per solution coordinate.
    trainingDataDict = self._getTrainingData(numSamples=n_samples)
    dtr0 = _fit(trainingDataDict['rot_line_test_deriv'],
                trainingDataDict['solution_data'][0])
    dtr1 = _fit(trainingDataDict['rot_line_test_deriv'],
                trainingDataDict['solution_data'][1])
    str0 = _fit(trainingDataDict['scaled_img'],
                trainingDataDict['solution_data'][0])
    str1 = _fit(trainingDataDict['scaled_img'],
                trainingDataDict['solution_data'][1])

    # Fresh samples for the fusion stage so it is not fit on the same rows
    # the stage-1 models were trained on.
    trainingDataDict = self._getTrainingData(startPos=n_samples + 1,
                                             numSamples=n_samples)
    derivRows = trainingDataDict['rot_line_test_deriv']
    imgRows = trainingDataDict['scaled_img']
    dtr0Pred = [dtr0.predict(row) for row in derivRows]
    dtr1Pred = [dtr1.predict(row) for row in derivRows]
    str0Pred = [str0.predict(row) for row in imgRows]
    str1Pred = [str1.predict(row) for row in imgRows]

    # Stage 2: fuse the paired stage-1 predictions per coordinate.
    ftr0 = _fit([[d[0], s[0]] for d, s in zip(dtr0Pred, str0Pred)],
                trainingDataDict['solution_data'][0])
    ftr1 = _fit([[d[0], s[0]] for d, s in zip(dtr1Pred, str1Pred)],
                trainingDataDict['solution_data'][1])

    self.dtr0 = dtr0
    self.dtr1 = dtr1
    self.str0 = str0
    self.str1 = str1
    self.ftr0 = ftr0
    self.ftr1 = ftr1
    self.areModelsTrained = True
def estimate(): from loadData import loadSets from helper import splitDataset, separateTargetFromTrain from sklearn.ensemble import ExtraTreesRegressor import numpy as np import math best_rmsle = 2 best_i = 0 trainingSet, testingSet = loadSets() testingSet = None trainingData, testingData = splitDataset(trainingSet, 0.6) testingData, validationData = splitDataset(testingData, 0.5) trainingSet = None trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData) testingTarget, testingFeatures = separateTargetFromTrain(testingData) validationTarget, validationFeatures = separateTargetFromTrain(validationData) testingTarget = testingTarget.values validationTarget = validationTarget.values trainingData = None testingData = None validationData = None for i in range(2000, 3001, 1000): model = ExtraTreesRegressor(n_estimators = i, n_jobs = -1) model.fit(trainingFeatures, trainingTarget) predictions = model.predict(testingFeatures) cost = pow(np.log(predictions + 1) - np.log(testingTarget + 1), 2) rmsle = math.sqrt(np.mean(cost)) print i, " estimators: ", rmsle if rmsle < best_rmsle: best_rmsle = rmsle best_i = i print "Best: ", best_i, " estimators with rmsle: ", best_rmsle model = ExtraTreesRegressor(n_estimators = best_i, n_jobs = -1) model.fit(trainingFeatures, trainingTarget) predictions = model.predict(validationFeatures) cost = pow(np.log(predictions + 1) - np.log(validationTarget + 1), 2) rmsle = math.sqrt(np.mean(cost)) print "Final model cost: ", rmsle
def do_etrees(filename):
    """Fit extra trees on the merged driver dataset and return in-sample
    predictions for the first 200 trips as a DataFrame."""
    df, Y = create_merged_dataset(filename)
    model = ExtraTreesRegressor(n_estimators=200, n_jobs=-1,
                                min_samples_leaf=5, random_state=SEED)
    # Features are everything except the identifier columns.
    X = df.drop(['driver', 'trip'], 1)
    model.fit(X, Y)
    probs = model.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200],
                         'trip': df['trip'][:200],
                         'probs': probs})
def predict_with_one(X, out_file_name):
    """Score how well each single feature predicts each other feature.

    For every ordered pair (i, j), fits a small extra-trees regressor from
    column i alone to column j and accumulates the test MAE over
    `iter_num` shuffle splits. Writes the averaged n_features x n_features
    MAE matrix to CODE_PATH/out_file_name as CSV.

    FIX: removed a large block of dead commented-out code and hoisted the
    column-i slices out of the inner j loop (they do not depend on j).
    """
    n_samples, n_features = X.shape
    iter_num = 3
    # Legacy ShuffleSplit API (n_iter=...); kept to match the file's sklearn.
    div = ShuffleSplit(n_samples, n_iter=iter_num, test_size=0.2, random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))
    t = time()
    round_num = 0
    for train, test in div:
        round_num += 1
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            # Invariant in j: slice the predictor column once per i.
            X_train = train_samples[:, i:i+1]
            X_test = test_samples[:, i:i+1]
            for j in range(n_features):
                y_train = train_samples[:, j]
                y_test = test_samples[:, j]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time()-t)
    np.savetxt(os.path.join(CODE_PATH, out_file_name), score_matrix/iter_num,
               fmt='%.3f', delimiter=',')
def build_extra_tree_regressor(X_test, X_train_full, y_train_full): print "Building ExtraTrees regressor..." etr = ExtraTreesRegressor(n_estimators=500) etr.fit(X_train_full, y_train_full) etr_predict = etr.predict(X_test) return etr_predict
def classify(self):
    """Perform classification: fit an extra-trees regressor on the driver
    training data and store predictions for the test data on self.

    (A commented-out PCA preprocessing experiment was removed; behavior is
    unchanged.)
    """
    model = ETRegressor(n_estimators=500, min_samples_split=5,
                        min_samples_leaf=2)
    model.fit(self._ClassifyDriver__traindata,
              self._ClassifyDriver__trainlabels)
    self._ClassifyDriver__y = model.predict(self._ClassifyDriver__testdata)
def extra_trees_regressor(x, y, n_estimators, max_depth):
    """Return the mean RMSE of an extra-trees regressor over 3-fold CV."""
    kf = KFold(len(x), n_folds=3)  # legacy sklearn KFold signature
    scores = []
    for train_index, test_index in kf:
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = ExtraTreesRegressor(n_estimators=n_estimators,
                                    max_depth=max_depth, random_state=0)
        model.fit(X_train, y_train)
        fold_rmse = mean_squared_error(model.predict(X_test), y_test) ** 0.5
        scores.append(fold_rmse)
    return np.mean(scores)
def reg_skl_etr(param, data):
    """Fit an extra-trees regressor configured by `param` on the training
    split and return (RMSE, predictions) for the CV split."""
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    model = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                max_features=param['max_features'],
                                n_jobs=param['n_jobs'],
                                random_state=param['random_state'])
    model.fit(X_tr, y_reg_tr)
    pred = model.predict(X_cv)
    return getscoreRMSE(y_reg_cv, pred), pred
class MyExtraTreeReg(MyRegressor): def __init__(self, params=dict()): self._params = params self._extree = ExtraTreesRegressor(**(self._params)) def update_params(self, updates): self._params.update(updates) self._extree = ExtraTreesRegressor(**(self._params)) def fit(self, Xtrain, ytrain): self._extree.fit(Xtrain, ytrain) def predict(self, Xtest, option = None): return self._extree.predict(Xtest) def plt_feature_importance(self, fname_list, f_range = list()): importances = self._extree.feature_importances_ std = np.std([tree.feature_importances_ for tree in self._extree.estimators_], axis=0) indices = np.argsort(importances)[::-1] fname_array = np.array(fname_list) if not f_range: f_range = range(indices.shape[0]) n_f = len(f_range) plt.figure() plt.title("Extra Tree Feature importances") plt.barh(range(n_f), importances[indices[f_range]], color="b", xerr=std[indices[f_range]], ecolor='k',align="center") plt.yticks(range(n_f), fname_array[indices[f_range]]) plt.ylim([-1, n_f]) plt.show() def list_feature_importance(self, fname_list, f_range = list(), return_list = False): importances = self._extree.feature_importances_ indices = np.argsort(importances)[::-1] print 'Extra tree feature ranking:' if not f_range : f_range = range(indices.shape[0]) n_f = len(f_range) for i in range(n_f): f = f_range[i] print '{0:d}. feature[{1:d}] {2:s} ({3:f})'.format(f + 1, indices[f], fname_list[indices[f]], importances[indices[f]]) if return_list: return [indices[f_range[i]] for i in range(n_f)]
def algorithm_ExtraTrees(X_train, Y_train, X_validation, Y_validation, seed=7):
    """Standardize the features, fit an 80-tree extra-trees regressor, and
    print the validation MSE. (`seed` is unused; kept for interface parity.)"""
    # Train the model on standardized inputs.
    scaler = StandardScaler().fit(X_train)
    model = ExtraTreesRegressor(n_estimators=80)
    model.fit(X=scaler.transform(X_train), y=Y_train)
    # Evaluate on the validation split using the same fitted scaler.
    predictions = model.predict(scaler.transform(X_validation))
    print(mean_squared_error(Y_validation, predictions))
class ModelERT: def __init__(self, model_set_name, i_fold): self.model_set_name = model_set_name self.i_fold = i_fold def set_params(self, prms): self.prms = prms def set_data(self, labels_tr, labels_te, data_tr, data_te): self.labels_tr = labels_tr self.labels_te = labels_te self.data_tr = data_tr self.data_te = data_te def train(self): print "start ert" self.model = ExtraTreesRegressor(n_jobs=self.prms["n_jobs"], verbose=1, random_state=self.prms["random_state"], n_estimators=int(self.prms["n_estimators"]), max_features=self.prms["max_features"]) self.model.fit(self.data_tr.values, self.labels_tr) def predict(self): return self.model.predict(self.data_te.values) def predict_train(self): return self.model.predict(self.data_tr.values) def dump_model(self): pass def dump_pred(self, pred, name): folder = config.get_model_folder(self.model_set_name, self.i_fold) Files.mkdir(folder) path = config.get_model_path(self.model_set_name, name, self.i_fold) joblib.dump(pred, path)
def dummie_columns_extra_trees(train, test): from sklearn.ensemble import ExtraTreesRegressor print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks") predicting_columns = list(train._get_numeric_data().columns.values) predicting_columns.remove("LISTPRICE") predicting_columns.remove("SOLDPRICE") rf = ExtraTreesRegressor( n_estimators=300, n_jobs=-1) rf.fit(train[predicting_columns], train["SOLDPRICE"]) score = rf.score(test[predicting_columns], test["SOLDPRICE"]) predictions = rf.predict(test[predicting_columns]) sample_predictions(test, predictions) print "Accuracy: {}\n".format(score) return score, predictions
def baseline_extra(train_x, train_y, test_x, test_y, n, d,
                   result_path="review_baseline_extra.txt"):
    """Extra-trees baseline for review counts: writes a CSV of actual vs.
    predicted to `result_path` and returns the RMSE."""
    model = ExtraTreesRegressor(n_estimators=n, max_depth=d, random_state=0)
    model = model.fit(train_x, train_y)
    predict = model.predict(test_x).tolist()
    result = pd.DataFrame([], columns=['review_count', 'predict'])
    result['review_count'] = test_y
    result['predict'] = predict
    result.to_csv(result_path, index=False)
    return mean_squared_error(predict, test_y) ** 0.5
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y): from sklearn.ensemble import ExtraTreesRegressor print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks") rf = ExtraTreesRegressor( n_estimators=300, n_jobs=-1 ) rf.fit(data_train_x, data_train_y) sample_predictions(rf.predict(data_test_x), data_test_y) score = rf.score(data_test_x, data_test_y) cross_validated_scores = cross_val_score( rf, data_test_x, data_test_y, cv=5) print "MSE Accuracy: {}".format(score) print "MSE Across 5 Folds: {}".format(cross_validated_scores) print "95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (cross_validated_scores.mean(), cross_validated_scores.std() * 1.96)
def predict_for(output, cycles, tests, raw_tests, inputs):
    """Train an extra-trees model for one output column of `cycles` and
    return {datetime: rounded prediction} for the test rows.

    FIX: dropped the `compute_importances=True` flag — it was removed from
    scikit-learn and now raises TypeError; importances remain available via
    `feature_importances_`.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        cycles[inputs], cycles[output], test_size=0.25, random_state=33)
    # Standardize train/test with the training distribution; the unlabeled
    # `tests` frame is standardized with its own scaler (as in the original).
    scaler_x = StandardScaler().fit(x_train)
    scaler_t = StandardScaler().fit(tests)
    x_train = scaler_x.transform(x_train)
    x_test = scaler_x.transform(x_test)
    tests = scaler_t.transform(tests)
    clf_et = ExtraTreesRegressor(n_estimators=10, random_state=42)
    clf_et.fit(x_train, y_train)
    ps = clf_et.predict(tests)
    return {dt: int(round(p)) for dt, p in zip(raw_tests['datetime'], ps)}
def baseline_extra_leave_one_out(train_raw_x, test_raw_x, test_ids, n=40,
                                 d=40, result_path="baseline_extra.txt"):
    """Leave-one-business-out extra-trees baseline for star ratings.

    For each id in `test_ids`, trains on every other business and predicts
    the held-out one. Writes actual vs. predicted (sorted by stars,
    descending) to `result_path` and returns the RMSE.
    """
    predict = []
    for test_id in test_ids:
        keep = train_raw_x.business_id != test_id
        train_y = train_raw_x[keep].stars.as_matrix()
        train_x = train_raw_x[keep].drop(["business_id", "stars"], 1).as_matrix()
        model = ExtraTreesRegressor(n_estimators=n, max_depth=d, random_state=0)
        model = model.fit(train_x, train_y)
        test_x = test_raw_x[test_raw_x.business_id == test_id]
        test_x = test_x.drop(["business_id", "stars"], 1).as_matrix()
        predict.append(model.predict(test_x)[0])
    result = pd.DataFrame([], columns=["stars", "predict"])
    result["stars"] = test_raw_x.stars
    result["predict"] = predict
    result = result.sort("stars", ascending=0)  # legacy pandas DataFrame.sort
    result.to_csv(result_path, index=False)
    return mean_squared_error(predict, test_raw_x.stars.as_matrix()) ** 0.5
def buildModelOheETR(train_data, eval_data, train_labels, seed):
    """Fit a 500-tree extra-trees regressor on sparse one-hot data and
    return (fitted model, predictions in log1p space).

    Predictions are mapped back to the original scale with expm1, floored at
    zero, then returned to log1p space.

    FIXES:
    - the original returned the undefined name `model` (NameError on every
      call); it now returns `clf`.
    - the element-wise negative-clipping loop is replaced by np.clip
      (identical result, vectorized).
    """
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    clf = ExtraTreesRegressor(n_estimators=500, max_depth=38,
                              min_samples_leaf=2, min_samples_split=6,
                              max_features='auto', n_jobs=-1,
                              random_state=seed, verbose=1)
    clf.fit(train_data, train_labels)
    preds = clf.predict(eval_data)
    preds = np.expm1(preds)
    # transform -ve preds to 0
    preds = np.clip(preds, 0, None)
    # convert back to log1p
    preds = np.log1p(preds)
    return (clf, preds)
def predict(class_id): print "predicting: ", class_id salaries_idx = np.where(salaries_enc == class_id) valid_idx = np.where(valid_salaries_enc == class_id) if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0: return [], None classifier = ExtraTreesRegressor(n_estimators=n_trees, verbose=0, n_jobs=4, # 2 jobs on submission / 4 on valid test oob_score=False, min_samples_split=min_samples_split, random_state=3465343) print features[salaries_idx[0], :].shape print salaries[salaries_idx].shape classifier.fit(features[salaries_idx[0], :], salaries[salaries_idx]) predictions_part = classifier.predict(validation_features[valid_idx[0]]) return predictions_part, valid_idx
def get_result():
    """Vectorize training comments with unigram+bigram counts, fit an
    extra-trees regressor on quality, and write test predictions to
    ridge_submit.csv."""
    vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.75,
                                 max_features=2000)
    x = vectorizer.fit_transform(rats_tr.comments.fillna('')).todense()
    y = rats_tr.quality
    model = ExtraTreesRegressor(n_estimators=40, max_depth=20, random_state=0)
    model.fit(x, y)
    # Transform the test comments with the fitted vocabulary (no refit).
    t_x = vectorizer.transform(rats_te.comments.fillna('')).todense()
    t_y = model.predict(t_x)
    submit = pd.DataFrame(data={'id': rats_te.id, 'quality': t_y})
    submit.to_csv('ridge_submit.csv', index=False)
def predict(class_id, param): print "predicting: ", class_id param += "\npredicting: %s\n" % (le_features[col_index].classes_[class_id],) salaries_idx = np.where(feature_category == class_id) valid_idx = np.where(validation_features_category == class_id) param += "Salaries len: %d, valid len: %d\n" % (len(salaries_idx[0]), len(valid_idx[0])) if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0: return [], None, param classifier = ExtraTreesRegressor(n_estimators=n_trees, verbose=0, n_jobs=4, # 2 jobs on submission / 4 on valid test oob_score=False, min_samples_split=min_samples_split, random_state=3465343) print features[salaries_idx[0], :].shape print salaries[salaries_idx].shape print validation_features[0].shape classifier.fit(features[salaries_idx[0], :], salaries[salaries_idx]) predictions_part = classifier.predict(validation_features[valid_idx[0]]) return predictions_part, valid_idx, param
def load_model():
    """Train an extra-trees model from 'bpinall.txt' (features, CSV rows) and
    'bpoutall.txt' (one target per line), pickle it to 'modelb.pkl', and
    return a smoke-test prediction for the last (held-out) row.

    Called only once, when the service starts.

    FIX: the original leaked three file handles (two reads, one pickle
    write); all file access now uses context managers.
    """
    with open('bpinall.txt', 'r') as fin:
        f = fin.readlines()
    num_rows = len(f)
    num_col = len(f[0].split(','))
    x = np.zeros((num_rows, num_col), dtype=float)
    y = np.zeros((num_rows), dtype=float)
    for i, line in enumerate(f):
        line = line.strip('\r\n').strip()
        # Rows without commas are left as zeros (same as the original).
        if line.count(',') > 0:
            x[i] = [float(p) for p in line.split(',')]
    with open('bpoutall.txt', 'r') as fin:
        f2 = fin.readlines()
    for i, line in enumerate(f2):
        line = line.strip('\r\n')
        y[i] = float(line)
    clf = ExtraTreesRegressor(verbose=0)
    print (x)
    # Train on all but the last row; predict the last row as a sanity check.
    clf.fit(x[:-1], y[:-1])
    pq = clf.predict(x[-1])  # NOTE(review): 1-D sample; newer sklearn wants a 2-D row
    print (pq, y[-1])
    with open('modelb.pkl', 'wb') as fout:
        pickle.dump(clf, fout)
    return pq
# Gradient boosting on the feature-selected (transformed) training matrix.
gbr_tr_fit = GradientBoostingRegressor(n_estimators=10, max_depth=7)
gbr_tr_fit = gbr_tr_fit.fit(transformed_train_gbr, target_train)
mix_test_list += [pd.Series(gbr_tr_fit.predict(transformed_test_gbr),
                            index=data_test_in.id.astype(int), name='gbr_tr')]
mix_train_list += [pd.Series(gbr_tr_fit.predict(transformed_train_gbr),
                             index=data_train_in.id.astype(int), name='gbr_tr')]
end_gbr_tr = time.clock()
print >> log, "time_gbr_tr = ", end_gbr_tr-start_gbr_tr

# Extra trees: select features by importance threshold, then refit on the
# reduced matrices (legacy estimator.transform API).
start_xfr_tr = time.clock()
xfr = ExtraTreesRegressor(n_estimators=10, max_depth=7)
xfr_tr = xfr.fit(data_train, target_train)
transformed_train_xfr = xfr_tr.transform(data_train, threshold="0.35*mean")
print >> log, 'transformed_train_xfr', transformed_train_xfr.shape
transformed_test_xfr = xfr_tr.transform(data_test, threshold="0.35*mean")
xfr_tr_fit = ExtraTreesRegressor(n_estimators=10, max_depth=7)
xfr_tr_fit = xfr_tr_fit.fit(transformed_train_xfr, target_train)
mix_test_list += [pd.Series(xfr_tr_fit.predict(transformed_test_xfr),
                            index=data_test_in.id.astype(int), name='xfr_tr')]
mix_train_list += [pd.Series(xfr_tr_fit.predict(transformed_train_xfr),
                             index=data_train_in.id.astype(int), name='xfr_tr')]
end_xfr_tr = time.clock()
print >> log, "time_xfr_tr = ", end_xfr_tr-start_xfr_tr

# Gradient boosting on the categorical columns only.
start_gbr_cat = time.clock()
gbr_cat_fit = GradientBoostingRegressor(n_estimators=10, max_depth=7)
gbr_cat_fit = gbr_cat_fit.fit(data_train[catcol], target_train)
mix_test_list += [pd.Series(gbr_cat_fit.predict(data_test[catcol]),
                            index=data_test_in.id.astype(int), name='gbr_cat')]
mix_train_list += [pd.Series(gbr_cat_fit.predict(data_train[catcol]),
                             index=data_train_in.id.astype(int), name='gbr_cat')]
end_gbr_cat = time.clock()
print >> log, "time_gbr_cat = ", end_gbr_cat-start_gbr_cat

# Extra trees on the categorical columns only.
start_xfr_cat = time.clock()
xfr_cat_fit = ExtraTreesRegressor(n_estimators=10, max_depth=7)
xfr_cat_fit = xfr_cat_fit.fit(data_train[catcol], target_train)
# Pick the linear model for this `method` id, fit on the transactions
# features, and score with NWRMSLE before switching targets to unit_sales.
if (method == 11):
    print('Ridge')
    str_method = 'Ridge'
    r = Ridge()
if (method == 12):
    print('Huber')
    str_method = 'Huber'
    r = HuberRegressor(fit_intercept=True, alpha=0.065, max_iter=160,
                       epsilon=1.2)
r.fit(x1[col], y1)
a1 = NWRMSLE(y2, r.predict(x2[col]), x2['perishable'])
# part of the output file name
N1 = str(a1)
test['transactions'] = r.predict(test[col])
test['transactions'] = test['transactions'].clip(lower=0.+1e-15)
# Switch to the unit_sales target for the next stage.
col = [c for c in x1 if c not in ['id', 'unit_sales', 'perishable']]
y1 = x1['unit_sales'].values
y2 = x2['unit_sales'].values
# set a new seed to generate random numbers
ra2 = round(method + 547*method + 182*method)
np.random.seed(ra2)
# NOTE(review): the first statement references `i` — presumably bound by a
# loop that precedes this fragment; confirm against the surrounding file.
new_final = new_final.append(final[final.index == i])
testeco = pd.concat([test, new_final], axis=1)
testeco.to_csv('testeco_lstm.csv')
print("test data after combining :" + str(testeco.shape))

# Now train the model on the combined economic + LSTM features.
test = pd.read_csv("testeco_lstm.csv")
train = pd.read_csv("traineco_lstm.csv")
gg = train.fillna(train.median())
y = gg['target']
X = gg.drop(['id', 'target'], axis=1)
print("X_shape:" + str(X.shape), " , y_shape :" + str(y.shape))
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2,
                                                random_state=42)
from sklearn.ensemble import ExtraTreesRegressor
extra_tree = ExtraTreesRegressor(n_estimators=500, random_state=1234)
extra_tree.fit(X_train, y_train)
ypredictions = extra_tree.predict(X_cv)
print(" Root Mean Absolute Error : ", sqrt(mean_squared_error(ypredictions, y_cv)))

# Refit on all data and write the submission.
extra_tree.fit(X, y)
test2 = test.drop(['id'], axis=1)
test2 = test2.fillna(test2.median())
predictions = extra_tree.predict(test2)
pred = pd.DataFrame(predictions)
pred = pred.set_index([test['id']])
pred.to_csv("extra_tree_500.csv")
# Our best submission is extra_tree_500, giving accuracy -> 0.98098 on the
# leaderboard, with default ExtraTreesRegressor(n_estimators=500,
# random_state=1234).
# NOTE(review): this fragment closes a hyperparameter-grid dict opened above
# (out of view) that is passed to GridSearchCV as `tuned_parameters`. The
# search is scored with negated MAE; train/test R2, MSE, MAE and RMSE are
# then computed and the fitted search object is dumped to 'rf_s_vs_e.pkl'.
# Left byte-identical because the enclosing statement starts before this view.
'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap } scores = ['neg_mean_absolute_error'] for score in scores: forest = GridSearchCV(ExtraTreesRegressor(random_state=1), tuned_parameters, verbose=10, cv=5, n_jobs=-1, scoring='%s' % score) forest.fit(X_train, y_train) model_train = forest.predict(X_train) model_test = forest.predict(X_test) r2_score_train = r2_score(y_train, model_train) mse_score_train = mean_squared_error(y_train, model_train) mae_score_train = mean_absolute_error(y_train, model_train) rmse_score_train = np.sqrt(mse_score_train) r2_score_test = r2_score(y_test, model_test) mse_score_test = mean_squared_error(y_test, model_test) mae_score_test = mean_absolute_error(y_test, model_test) rmse_score_test = np.sqrt(mse_score_test) dump(forest, 'rf_s_vs_e.pkl') if args.dielectric is True: if args.outlier_removal is True: f = open('hyperpameters_outlier_removal_dielectric.txt', mode='w')
class ExtraTreesRegressor(AutoSklearnRegressionAlgorithm):
    """Auto-sklearn wrapper around sklearn's extra-trees regressor that grows
    the forest incrementally via warm-start fitting.

    FIX: the criterion check was `criterion not in ("mse")` — `("mse")` is a
    plain string, so the test was a substring check and values like "m" or
    "s" were accepted. It is now a proper 1-tuple membership test. Large
    blocks of dead commented-out code were also removed.
    """

    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False, max_leaf_nodes=None, max_depth="None",
                 oob_score=False, n_jobs=1, random_state=None, verbose=0):
        super(ExtraTreesRegressor, self).__init__()
        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        if criterion not in ("mse",):
            raise ValueError("'criterion' is not in ('mse'): "
                             "%s" % criterion)
        self.criterion = criterion
        # Exactly one of max_depth / max_leaf_nodes is active, selected by
        # max_leaf_nodes_or_max_depth; the other is forced to None.
        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None
        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)
        # Config-space values arrive as the strings "True"/"False".
        # NOTE(review): any other value leaves self.bootstrap unset, as in
        # the original — confirm callers only pass these two strings.
        if bootstrap == "True":
            self.bootstrap = True
        elif bootstrap == "False":
            self.bootstrap = False
        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.estimator = None

    def fit(self, X, y, refit=False):
        """Fit by repeated warm-start increments until the forest holds
        n_estimators trees."""
        if self.estimator is None or refit:
            self.iterative_fit(X, y, n_iter=1, refit=refit)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        """Grow the warm-started forest by n_iter trees, creating the
        underlying estimator on the first call."""
        from sklearn.ensemble import ExtraTreesRegressor as ETR
        if refit:
            self.estimator = None
        if self.estimator is None:
            # max_features is a multiplier on log(n_features) + 1, capped at
            # half of the features and floored at 1.
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(n_estimators=0,
                                 criterion=self.criterion,
                                 max_depth=self.max_depth,
                                 min_samples_split=self.min_samples_split,
                                 min_samples_leaf=self.min_samples_leaf,
                                 bootstrap=self.bootstrap,
                                 max_features=max_features,
                                 max_leaf_nodes=self.max_leaf_nodes,
                                 oob_score=self.oob_score,
                                 n_jobs=self.n_jobs,
                                 verbose=self.verbose,
                                 random_state=self.random_state,
                                 warm_start=True)
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y)
        self.estimator = tmp
        return self

    def configuration_fully_fitted(self):
        """True once the warm-started forest has reached n_estimators trees."""
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS, )}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100))
        criterion = cs.add_hyperparameter(Constant("criterion", "mse"))
        max_features = cs.add_hyperparameter(
            UniformFloatHyperparameter("max_features", 0.5, 5, default=1))
        max_depth = cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))
        min_samples_split = cs.add_hyperparameter(
            UniformIntegerHyperparameter("min_samples_split", 2, 20,
                                         default=2))
        min_samples_leaf = cs.add_hyperparameter(
            UniformIntegerHyperparameter("min_samples_leaf", 1, 20,
                                         default=1))
        # The max_leaf_nodes branch is deliberately unparameterized; the
        # min_samples_* hyperparameters act as the regularizers.
        bootstrap = cs.add_hyperparameter(
            CategoricalHyperparameter("bootstrap", ["True", "False"],
                                      default="False"))
        return cs
def hyperopt_obj(param, feat_folder, feat_name, trial_counter):
    """Hyperopt objective: cross-validated log-loss for one parameter set.

    For every Run/Fold split under ``feat_folder``, trains the model family
    selected by ``param['task']`` (optionally bagging ``bagging_size`` models
    per fold), writes the raw fold predictions to disk, and returns
    ``(mean, std)`` of the per-fold log-loss.

    NOTE(review): relies on many module-level globals defined elsewhere in
    this file (config, output_path, bagging_size, bootstrap_replacement,
    bootstrap_ratio, verbose_level, libfm_exe, rgf_exe, call_exe, the model
    libraries, ...).
    """
    log_loss_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    year = datetime.datetime.now().year
    for run in range(1, config.n_runs + 1):  # range excludes the end point
        for fold in range(1, config.n_folds + 1):
            # Per-(run, fold) RNG so every split resamples reproducibly.
            rng = np.random.RandomState(year + 1000 * run + 10 * fold)
            #### all the paths
            path = "%s/Run%d/Fold%d" % (feat_folder, run, fold)
            save_path = "%s/Run%d/Fold%d" % (output_path, run, fold)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            # feat: combined feature files
            feat_train_path = "%s/train.feat" % path
            feat_valid_path = "%s/valid.feat" % path
            # sample weights
            weight_train_path = "%s/train.feat.weight" % path
            weight_valid_path = "%s/valid.feat.weight" % path
            # info
            info_train_path = "%s/train.info" % path
            info_valid_path = "%s/valid.info" % path
            # cdf
            cdf_valid_path = "%s/valid.cdf" % path
            # raw prediction path (rank)
            raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)
            rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)
            ## load features (svmlight / libsvm format -> sparse matrix)
            X_train, labels_train = load_svmlight_file(feat_train_path)
            X_valid, labels_valid = load_svmlight_file(feat_valid_path)
            # align feature dimensionality by zero-padding the narrower side
            if X_valid.shape[1] < X_train.shape[1]:
                X_valid = hstack([
                    X_valid,
                    np.zeros((X_valid.shape[0],
                              X_train.shape[1] - X_valid.shape[1]))
                ])
            elif X_valid.shape[1] > X_train.shape[1]:
                X_train = hstack([
                    X_train,
                    np.zeros((X_train.shape[0],
                              X_valid.shape[1] - X_train.shape[1]))
                ])
            X_train = X_train.tocsr()  # Compressed Sparse Row format
            X_valid = X_valid.tocsr()
            ## load sample weights
            weight_train = np.loadtxt(weight_train_path, dtype=float)
            weight_valid = np.loadtxt(weight_valid_path, dtype=float)
            ## load train/valid info
            info_train = pd.read_csv(info_train_path)
            numTrain = info_train.shape[0]
            info_valid = pd.read_csv(info_valid_path)
            numValid = info_valid.shape[0]
            Y_valid = info_valid["is_duplicate"]
            ## load cdf
            cdf_valid = np.loadtxt(cdf_valid_path, dtype=float)
            # (commented-out evalerror_* feval closures removed — never used)

            ##############
            ## Training ##
            ##############
            ## bagging can be used to stabilise the predictions
            preds_bagging = np.zeros((numValid, bagging_size), dtype=float)
            for n in range(bagging_size):
                if bootstrap_replacement:
                    # sample with replacement; bootstrap_ratio is the
                    # fraction of the training set drawn per bag
                    sampleSize = int(numTrain * bootstrap_ratio)
                    index_base = rng.randint(numTrain, size=sampleSize)
                    index_meta = [
                        i for i in range(numTrain) if i not in index_base
                    ]
                else:
                    # sample without replacement via uniform thresholding
                    randnum = rng.uniform(size=numTrain)
                    index_base = [
                        i for i in range(numTrain)
                        if randnum[i] < bootstrap_ratio
                    ]
                    index_meta = [
                        i for i in range(numTrain)
                        if randnum[i] >= bootstrap_ratio
                    ]
                # xgboost models consume DMatrix inputs
                if "booster" in param:
                    dvalid_base = xgb.DMatrix(
                        X_valid, label=labels_valid)  # , weight=weight_valid
                    dtrain_base = xgb.DMatrix(
                        X_train[index_base], label=labels_train[index_base]
                    )  # , weight=weight_train[index_base]
                    watchlist = []
                    if verbose_level >= 2:
                        watchlist = [(dtrain_base, 'train'),
                                     (dvalid_base, 'valid')]
                ## dispatch on the model family
                if param["task"] in ["regression", "ranking"]:
                    ## regression & pairwise ranking with xgboost
                    bst = xgb.train(param, dtrain_base, param['num_round'],
                                    watchlist)  # , feval=evalerror_regrank_valid
                    pred = bst.predict(dvalid_base)
                # NOTE(review): this is `if`, not `elif` — it is re-evaluated
                # after the branch above (harmless since one task matches one
                # branch, but it breaks the chain; likely a typo).
                if param["task"] in ["classification"]:
                    ## classification with xgboost
                    bst = xgb.train(param, dtrain_base, param['num_round'],
                                    watchlist)  # , feval=evalerror_regrank_valid
                    pred = bst.predict(dvalid_base)
                elif param["task"] in ["softmax"]:
                    ## softmax regression with xgboost
                    bst = xgb.train(param, dtrain_base, param['num_round'],
                                    watchlist)  # , feval=evalerror_softmax_valid
                    pred = bst.predict(dvalid_base)
                    # expectation over class indices; w[np.newaxis, :] turns
                    # the 1-D weight array into a 1 x k row for broadcasting
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)
                elif param["task"] in ["softkappa"]:
                    ## softkappa with xgboost (custom objective)
                    # obj = lambda preds, dtrain: softkappaObj(preds, dtrain,
                    #     hess_scale=param['hess_scale'])
                    bst = xgb.train(param, dtrain_base, param['num_round'],
                                    watchlist)  # , obj=obj, feval=evalerror_softkappa_valid
                    pred = softmax(bst.predict(dvalid_base))
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)
                elif param["task"] in ["ebc"]:
                    ## extended binary classification with xgboost
                    bst = xgb.train(param, dtrain_base, param['num_round'],
                                    watchlist)  # , obj=obj, feval=evalerror_ebc_valid
                    pred = sigmoid(bst.predict(dvalid_base))
                    pred = applyEBCRule(pred,
                                        hard_threshold=ebc_hard_threshold)
                elif param["task"] in ["cocr"]:
                    ## cocr with xgboost (custom objective)
                    bst = xgb.train(param, dtrain_base, param['num_round'],
                                    watchlist)  # , obj=obj, feval=evalerror_cocr_valid
                    pred = bst.predict(dvalid_base)
                    pred = applyCOCRRule(pred)
                elif param['task'] == "reg_skl_rf":
                    ## sklearn random forest regressor
                    rf = RandomForestRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    rf.fit(X_train[index_base], labels_train[index_base]
                           )  # , sample_weight=weight_train[index_base]
                    pred = rf.predict(X_valid)
                elif param['task'] == "reg_skl_etr":
                    ## sklearn extra trees regressor
                    etr = ExtraTreesRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    etr.fit(X_train[index_base], labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = etr.predict(X_valid)
                elif param['task'] == "reg_skl_gbm":
                    ## sklearn gradient boosting regressor (needs dense input)
                    gbm = GradientBoostingRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        learning_rate=param['learning_rate'],
                        max_depth=param['max_depth'],
                        subsample=param['subsample'],
                        random_state=param['random_state'])
                    gbm.fit(X_train.toarray()[index_base],
                            labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = gbm.predict(X_valid.toarray())
                elif param['task'] == "clf_skl_lr":
                    ## sklearn logistic regression; prediction is the
                    ## probability-weighted expectation over classes
                    lr = LogisticRegression(penalty="l2",
                                            dual=True,
                                            tol=1e-5,
                                            C=param['C'],
                                            fit_intercept=True,
                                            intercept_scaling=1.0,
                                            class_weight='auto',
                                            random_state=param['random_state'])
                    lr.fit(X_train[index_base], labels_train[index_base])
                    pred = lr.predict_proba(X_valid)
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)
                elif param['task'] == "reg_skl_svr":
                    ## sklearn support vector regression (dense + scaled)
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)
                    svr = SVR(C=param['C'],
                              gamma=param['gamma'],
                              epsilon=param['epsilon'],
                              degree=param['degree'],
                              kernel=param['kernel'])
                    svr.fit(X_train[index_base], labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = svr.predict(X_valid)
                elif param['task'] == "reg_skl_ridge":
                    ## sklearn ridge regression
                    ridge = Ridge(alpha=param["alpha"], normalize=True)
                    ridge.fit(X_train[index_base], labels_train[index_base]
                              )  # , sample_weight=weight_train[index_base]
                    pred = ridge.predict(X_valid)
                elif param['task'] == "reg_skl_lasso":
                    ## sklearn lasso
                    lasso = Lasso(alpha=param["alpha"], normalize=True)
                    lasso.fit(X_train[index_base], labels_train[index_base])
                    pred = lasso.predict(X_valid)
                elif param['task'] == 'reg_libfm':
                    ## factorization machine via the external libfm binary
                    X_train = X_train.toarray()
                    X_valid = X_valid.toarray()
                    ## scale
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)
                    ## dump features for the external tool
                    dump_svmlight_file(X_train[index_base],
                                       labels_train[index_base],
                                       feat_train_path + ".tmp")
                    dump_svmlight_file(X_valid, labels_valid,
                                       feat_valid_path + ".tmp")
                    ## train fm
                    cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
                        libfm_exe, feat_train_path+".tmp", feat_valid_path+".tmp", raw_pred_valid_path, \
                        param['dim'], param['iter'])
                    os.system(cmd)
                    os.remove(feat_train_path + ".tmp")
                    os.remove(feat_valid_path + ".tmp")
                    ## extract libfm prediction
                    pred = np.loadtxt(raw_pred_valid_path, dtype=float)
                    ## labels are in [0,1,2,3]
                    pred += 1
                elif param['task'] == "reg_keras_dnn":
                    ## feed-forward regressor with keras
                    model = Sequential()
                    ## input layer
                    model.add(Dropout(param["input_dropout"]))
                    ## hidden layers: first one sized to the input dim
                    first = True
                    hidden_layers = param['hidden_layers']
                    while hidden_layers > 0:
                        if first:
                            dim = X_train.shape[1]
                            first = False
                        else:
                            dim = param["hidden_units"]
                        model.add(
                            Dense(dim, param["hidden_units"],
                                  init='glorot_uniform'))
                        if param["batch_norm"]:
                            model.add(
                                BatchNormalization((param["hidden_units"], )))
                        if param["hidden_activation"] == "prelu":
                            model.add(PReLU((param["hidden_units"], )))
                        else:
                            model.add(Activation(param['hidden_activation']))
                        model.add(Dropout(param["hidden_dropout"]))
                        hidden_layers -= 1
                    ## output layer
                    model.add(
                        Dense(param["hidden_units"], 1,
                              init='glorot_uniform'))
                    model.add(Activation('linear'))
                    ## loss
                    model.compile(loss='mean_squared_error', optimizer="adam")
                    ## to array + scale
                    X_train = X_train.toarray()
                    X_valid = X_valid.toarray()
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)
                    ## train
                    model.fit(X_train[index_base],
                              labels_train[index_base],
                              nb_epoch=param['nb_epoch'],
                              batch_size=param['batch_size'],
                              validation_split=0,
                              verbose=0)
                    ## prediction
                    pred = model.predict(X_valid, verbose=0)
                    pred.shape = (X_valid.shape[0], )
                elif param['task'] == "reg_rgf":
                    ## regularized greedy forest via the external rgf binary
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()
                    train_x_fn = feat_train_path + ".x"
                    train_y_fn = feat_train_path + ".y"
                    valid_x_fn = feat_valid_path + ".x"
                    valid_pred_fn = feat_valid_path + ".pred"
                    model_fn_prefix = "rgf_model"
                    np.savetxt(train_x_fn,
                               X_train[index_base],
                               fmt="%.6f",
                               delimiter='\t')
                    np.savetxt(train_y_fn,
                               labels_train[index_base],
                               fmt="%d",
                               delimiter='\t')
                    np.savetxt(valid_x_fn, X_valid, fmt="%.6f", delimiter='\t')
                    # rgf config file, key=value pairs joined into one string
                    pars = [
                        "train_x_fn=", train_x_fn, "\n",
                        "train_y_fn=", train_y_fn, "\n",
                        "model_fn_prefix=", model_fn_prefix, "\n",
                        "reg_L2=", param['reg_L2'], "\n",
                        "algorithm=", "RGF", "\n",
                        "loss=", "LS", "\n",
                        "valid_interval=", param['max_leaf_forest'], "\n",
                        "max_leaf_forest=", param['max_leaf_forest'], "\n",
                        "num_iteration_opt=", param['num_iteration_opt'], "\n",
                        "num_tree_search=", param['num_tree_search'], "\n",
                        "min_pop=", param['min_pop'], "\n",
                        "opt_interval=", param['opt_interval'], "\n",
                        "opt_stepsize=", param['opt_stepsize'], "\n",
                        "NormalizeTarget"
                    ]
                    pars = "".join([str(p) for p in pars])
                    rfg_setting_train = "./rfg_setting_train"
                    with open(rfg_setting_train + ".inp", "wb") as f:
                        f.write(pars)
                    ## train
                    cmd = "perl %s %s train %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_train)
                    os.system(cmd)
                    model_fn = model_fn_prefix + "-01"
                    pars = [
                        "test_x_fn=", valid_x_fn, "\n",
                        "model_fn=", model_fn, "\n",
                        "prediction_fn=", valid_pred_fn
                    ]
                    pars = "".join([str(p) for p in pars])
                    rfg_setting_valid = "./rfg_setting_valid"
                    with open(rfg_setting_valid + ".inp", "wb") as f:
                        f.write(pars)
                    cmd = "perl %s %s predict %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_valid)
                    os.system(cmd)
                    pred = np.loadtxt(valid_pred_fn, dtype=float)
                ## accumulate this bagging iteration
                pred_valid = pred
                preds_bagging[:, n] = pred_valid
                # running mean over the bags trained so far
                pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1)
                # NOTE(review): the first assignment is immediately
                # overwritten by the second (dead store) — kept as-is.
                log_loss_valid = elementwise.log_loss(Y_valid, pred_raw)
                log_loss_valid = log_loss(Y_valid, pred_raw)
                print('Y_valid mean:', np.mean(Y_valid))
                print('pred_raw mean:', np.mean(pred_raw))
                if (n + 1) != bagging_size:
                    print(
                        " {:>3} {:>3} {:>3} {:>6} {} x {}"
                        .format(run, fold, n + 1,
                                np.round(log_loss_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
                else:
                    print(
                        " {:>3} {:>3} {:>3} {:>8} {} x {}"
                        .format(run, fold, n + 1,
                                np.round(log_loss_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
            log_loss_cv[run - 1, fold - 1] = log_loss_valid
            ## save this fold's raw per-row predictions
            dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw})
            dfPred.to_csv(raw_pred_valid_path,
                          index=False,
                          header=True,
                          columns=["target", "prediction"])
            # (commented-out cdf/rank-based prediction save removed)
    log_loss_cv_mean = np.mean(log_loss_cv)
    log_loss_cv_std = np.std(log_loss_cv)
    if verbose_level >= 1:
        print(" Mean: %.6f" % log_loss_cv_mean)
        print(" Std: %.6f" % log_loss_cv_std)
    ####################
    #### Retraining ####
    ####################
    # NOTE(review): a large, fully commented-out retraining/test-prediction
    # stage (mirroring the CV loop above on the "All" split) lived here; it
    # never executed and was removed for readability.
    return log_loss_cv_mean, log_loss_cv_std
# # Random search of parameters, using 3 fold cross validation, # # search across 100 different combinations, and use all available cores # rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 1, verbose=2, random_state=42, n_jobs = -1) # # Fit the random search model # rf_random.fit(local_train, y_local_train) # print(rf_random.best_params_) # In[ ]: #RF classifier for train-validation perf: clf = ExtraTreesRegressor(verbose=2, n_jobs=1,oob_score=True,min_samples_leaf=2, bootstrap=True,criterion='mae', max_depth = 30, n_estimators=200, random_state=0) clf.fit(local_train, y_local_train) p = clf.predict_proba(local_validation) y_validation_pred_binary = clf.predict(local_validation) y_validation_pred_prob = [] for x,y in p: y_validation_pred_prob.append(y) count_match = 0 count_error = 0 deviation = 0.0 assert(len(y_validation_pred_prob)==len(y_local_validation)) validation_gtruth=np.asarray(y_local_validation) for i in range(len(y_local_validation)): deviation +=abs(y_validation_pred_prob[i]-validation_gtruth[i]) if (int(y_validation_pred_binary[i])==int(validation_gtruth[i])): count_match+=1 else: count_error+=1 validation_accuracy = count_match/(count_match+count_error)*100.0
'n_jobs': 4 } #rf = RandomForestRegressor(**params_rf) #rf.fit(x_train,y_train) #y_pre_rf = rf.predict(x_test) params_ext = { 'max_features': 'log2', 'n_estimators': 600, 'max_depth': 12, 'oob_score': True, 'n_jobs': 4, 'bootstrap': True } ext = ExtraTreesRegressor(**params_ext) ext.fit(x_train, y_train) y_pre_ext = ext.predict(x_test) ### ''' plt.scatter(xday[-(start+14):-14],y_train) plt.scatter(xday[-14:],y_pre_ext,color = 'green') plt.plot(xday[-14:],y_pre_ext,color = 'red') path = "e://tianchi_koubei/fig/rf_pre/"+str(i+1)+'.png' plt.savefig(path+".png") plt.clf()#清除图像,所有的都画到一起了 ''' output(fw, i + 1, y_pre_ext) print(i) i += 1 fr1.close() fr2.close()
class PredictiveGraphEmbedder(object):
    """Provide a 2D embedding of high-dimensional data.

    Pipeline: a medium-dimensional embedder, an extra-trees regressor that
    learns to map medium-dimensional points to learned 2D coordinates, and a
    biased nearest-neighbour classifier operating in the 2D plane.
    """

    def __init__(self, n_estimators=250, medium_dim=100, nn_n_estimators=30,
                 nn_negative_bias=1, nn_k=7, nn_p=2, emb_iter=50,
                 emb_confidence=2, emb_sample_fraction=0.5,
                 emb_feature_fraction=1, emb_alpha=1, emb_gamma=1,
                 emb_beta=30):
        """Store hyper-parameters and build the sub-estimators."""
        # NOTE(review): set_params has slightly different *defaults*
        # (emb_iter=10, emb_sample_fraction=0.6) — harmless here because
        # __init__ always passes every argument explicitly, but worth
        # unifying.
        self.set_params(n_estimators, medium_dim, nn_n_estimators,
                        nn_negative_bias, nn_k, nn_p, emb_iter,
                        emb_confidence, emb_sample_fraction,
                        emb_feature_fraction, emb_alpha, emb_gamma, emb_beta)
        # Candidate values explored by random hyper-parameter search.
        self.params_range = dict(
            n_estimators=[250],
            medium_dim=[10, 25, 50, 100, 250, 500],
            nn_n_estimators=[30],
            nn_negative_bias=[0, 1],
            nn_k=[1, 3, 5, 7, 11],
            nn_p=[2],
            emb_iter=[50],
            emb_confidence=[1, 3, 5],
            emb_sample_fraction=[.5, .75, 1],
            emb_feature_fraction=[.01, .05, .1, .3, .5, .7, 1],
            emb_alpha=[0, 1, 3],
            emb_gamma=[1],
            emb_beta=[20, 30, 40])

    def get_params(self):
        """Return the current hyper-parameters as a dict."""
        return dict(n_estimators=self.n_estimators,
                    medium_dim=self.medium_dim,
                    nn_n_estimators=self.nn_n_estimators,
                    nn_negative_bias=self.nn_negative_bias,
                    nn_k=self.nn_k,
                    nn_p=self.nn_p,
                    emb_iter=self.emb_iter,
                    emb_confidence=self.emb_confidence,
                    emb_sample_fraction=self.emb_sample_fraction,
                    emb_feature_fraction=self.emb_feature_fraction,
                    emb_alpha=self.emb_alpha,
                    emb_gamma=self.emb_gamma,
                    emb_beta=self.emb_beta)

    def set_params(self, n_estimators=250, medium_dim=100, nn_n_estimators=30,
                   nn_negative_bias=1, nn_k=7, nn_p=2, emb_iter=10,
                   emb_confidence=2, emb_sample_fraction=0.6,
                   emb_feature_fraction=1, emb_alpha=1, emb_gamma=1,
                   emb_beta=30):
        """Set hyper-parameters and (re)build the sub-estimators."""
        self.n_estimators = n_estimators
        self.medium_dim = medium_dim
        self.nn_n_estimators = nn_n_estimators
        self.nn_negative_bias = nn_negative_bias
        self.nn_k = nn_k
        self.nn_p = nn_p
        self.emb_iter = emb_iter
        self.emb_confidence = emb_confidence
        self.emb_sample_fraction = emb_sample_fraction
        self.emb_feature_fraction = emb_feature_fraction
        self.emb_alpha = emb_alpha
        self.emb_gamma = emb_gamma
        self.emb_beta = emb_beta
        # set objects
        self.est_medium_dim = MediumEmbedder(dim=self.medium_dim)
        self.regress2d = ExtraTreesRegressor(n_estimators=self.n_estimators)
        self.est2d = Biased2DAveragedClassifier(
            negative_bias=self.nn_negative_bias,
            n_estimators=self.nn_n_estimators,
            n_neighbors=self.nn_k,
            weights='distance',
            p=self.nn_p)

    def _repr_params(self, params):
        # Render only the parameters that are actually being searched over.
        txt = ''
        for key in sorted(self.params_range):
            if len(self.params_range[key]) > 1:
                txt += ' %s:%s ' % (key, params[key])
        return txt

    def _params_random_choice(self):
        # One uniformly random draw from each parameter's candidate list.
        params = dict([(key, random.choice(self.params_range[key]))
                       for key in self.params_range])
        return params

    def _avg_score(self, data, target, n_repetitions=3):
        # Average ROC-AUC over n_repetitions random train/test splits.
        scores = []
        for i in range(n_repetitions):
            tr_data, ts_data, tr_target, ts_target = train_test_split(
                data, target, test_size=0.33, random_state=421 + i)
            self.fit(tr_data, tr_target)
            score = self.score(ts_data, ts_target)
            scores.append(score)
        # BUG FIX(review): was np.mean(score), which averaged only the last
        # repetition's scalar and ignored the accumulated list.
        return np.mean(scores)

    def _feature_importance(self, data, target):
        # Supervised feature importances used to bias embedding sampling.
        ec = ExtraTreesClassifier(n_estimators=self.n_estimators)
        feature_p = ec.fit(data, target).feature_importances_
        return feature_p

    def fit(self, data, target):
        """Fit the embedder/regressor/classifier pipeline on (data, target)."""
        # Half the data trains the embedding, the other half calibrates the
        # 2D classifier on out-of-sample projections.
        tr_data, ts_data, tr_target, ts_target = train_test_split(
            data, target, test_size=0.5, random_state=42)
        self.est_medium_dim.fit(tr_data, tr_target)
        tr_data_medium = self.est_medium_dim.transform(tr_data)
        ts_data_medium = self.est_medium_dim.transform(ts_data)
        feature_p = self._feature_importance(tr_data_medium, tr_target)
        tr_data2d, graph = embed(tr_data_medium,
                                 target=tr_target,
                                 confidence=self.emb_confidence,
                                 n_iter=self.emb_iter,
                                 sample_fraction=self.emb_sample_fraction,
                                 feature_fraction=self.emb_feature_fraction,
                                 feature_p=feature_p,
                                 alpha=self.emb_alpha,
                                 gamma=self.emb_gamma,
                                 beta=self.emb_beta)
        # Learn the medium-dim -> 2D map so unseen data can be projected.
        self.regress2d.fit(tr_data_medium, tr_data2d)
        ts_data2d = self.regress2d.predict(ts_data_medium)
        self.est2d.fit(ts_data2d, ts_target)
        return self

    def transform(self, data):
        """Project data to 2D via the medium embedder + regressor."""
        data_medium = self.est_medium_dim.transform(data)
        data_2_dim = self.regress2d.predict(data_medium)
        return data_2_dim

    def predict(self, data):
        """Return the 2D classifier's probability scores for data."""
        data_medium = self.est_medium_dim.transform(data)
        data_2_dim = self.regress2d.predict(data_medium)
        y_score = self.est2d.predict_proba(data_2_dim)
        return y_score

    def score(self, data, target):
        """Return ROC-AUC of predict() scores against target."""
        y_score = self.predict(data)
        auc = metrics.roc_auc_score(target, y_score)
        return auc

    def visualize(self, data, target, title='', region_only=False):
        """Plot the 2D embedding with the classifier's decision regions."""
        auc = self.score(data, target)
        title += 'roc:%.2f' % (auc)
        title += '\nparams:%s' % serialize_dict(self.get_params())
        x2dim = self.transform(data)
        x_min, x_max = x2dim[:, 0].min(), x2dim[:, 0].max()
        y_min, y_max = x2dim[:, 1].min(), x2dim[:, 1].max()
        b = max((x_max - x_min) / 10, (y_max - y_min) / 10)  # border size
        x_min, x_max = x_min - b, x_max + b
        y_min, y_max = y_min - b, y_max + b
        h = b / 20  # step size in the mesh
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        grid2d = np.c_[xx.ravel(), yy.ravel()]
        z = self.est2d.predict_proba(grid2d)
        z = 1 - z.reshape(xx.shape)
        plt.contourf(xx, yy, z,
                     cmap=plt.get_cmap('BrBG'),
                     alpha=.3,
                     levels=[0.05, 0.25, 0.5, 0.75, 0.95],
                     extend='both')
        # Decision boundary drawn twice (wide white under thin black) for
        # contrast against the filled background.
        plt.contour(xx, yy, z,
                    levels=[-1, 0.5, 2],
                    colors='w',
                    linewidths=[.5, 4, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        plt.contour(xx, yy, z,
                    levels=[-1, 0.5, 2],
                    colors='k',
                    linewidths=[.5, 2, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        if region_only is False:
            plt.scatter(x2dim[:, 0], x2dim[:, 1],
                        alpha=.8,
                        c=target,
                        s=30,
                        edgecolors='k',
                        cmap=plt.get_cmap('gray'))
        plt.title(title)
        plt.grid(False)
        plt.axis('off')
        return self

    def visualize_data(self, data, target=None, x_min=None, x_max=None,
                       y_min=None, y_max=None):
        """Scatter-plot data in 2D over the classifier's decision regions."""
        x2dim = self.transform(data)
        if x_min is None or x_max is None or y_min is None or y_max is None:
            x_min, x_max = x2dim[:, 0].min(), x2dim[:, 0].max()
            y_min, y_max = x2dim[:, 1].min(), x2dim[:, 1].max()
        self.visualize_region(x_min, x_max, y_min, y_max)
        if target is None:
            c = 'w'
        else:
            c = target
        plt.scatter(x2dim[:, 0], x2dim[:, 1],
                    alpha=.8,
                    c=c,
                    s=30,
                    edgecolors='k',
                    cmap=plt.get_cmap('gray'))
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.grid()
        return self

    def visualize_region(self, x_min=None, x_max=None, y_min=None,
                         y_max=None):
        """Draw the classifier's decision regions over the given 2D window."""
        b = max((x_max - x_min) / 10, (y_max - y_min) / 10)  # border size
        x_min, x_max = x_min - b, x_max + b
        y_min, y_max = y_min - b, y_max + b
        h = b / 20  # step size in the mesh
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        grid2d = np.c_[xx.ravel(), yy.ravel()]
        z = self.est2d.predict_proba(grid2d)
        z = 1 - z.reshape(xx.shape)
        plt.contourf(xx, yy, z,
                     cmap=plt.get_cmap('BrBG'),
                     alpha=.3,
                     levels=[0.05, 0.25, 0.5, 0.75, 0.95],
                     extend='both')
        plt.contour(xx, yy, z,
                    levels=[-1, 0.5, 2],
                    colors='w',
                    linewidths=[.5, 4, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        plt.contour(xx, yy, z,
                    levels=[-1, 0.5, 2],
                    colors='k',
                    linewidths=[.5, 2, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = train_test_split(
    features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -304012.8776428422
exported_pipeline = ExtraTreesRegressor(
    bootstrap=False,
    max_features=0.8,
    min_samples_leaf=1,
    min_samples_split=16,
    n_estimators=100,
)

# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
class ExtraTreesRegressor(IterativeComponentWithSampleWeight, BaseRegressionModel):
    """Extra-trees regression component.

    Wraps sklearn's ExtraTreesRegressor and exposes a hyperparameter search
    space for both the SMAC and TPE optimizer backends.
    """

    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features, bootstrap,
                 random_state=None):
        if check_none(n_estimators):
            self.n_estimators = None
        else:
            # BUG FIX: was int(self.n_estimators), which read the attribute
            # before it was ever assigned and raised AttributeError.
            self.n_estimators = int(n_estimators)
        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = -1
        self.random_state = random_state
        self.estimator = None
        self.start_time = time.time()
        self.time_limit = None

    def fit(self, X, y, sample_weight=None):
        """Fit the underlying sklearn ExtraTreesRegressor; returns self."""
        from sklearn.ensemble import ExtraTreesRegressor
        # bootstrap may arrive as the string "True"/"False" from the config
        # space; normalise to a real bool before handing it to sklearn.
        self.bootstrap = check_for_bool(self.bootstrap)
        self.estimator = ExtraTreesRegressor(
            n_estimators=self.n_estimators,
            max_leaf_nodes=None,
            criterion=self.criterion,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_depth=None,
            bootstrap=self.bootstrap,
            random_state=self.random_state,
            n_jobs=self.n_jobs)
        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def configuration_fully_fitted(self):
        """Return True once the ensemble holds n_estimators fitted trees."""
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        """Predict with the fitted estimator; raises if fit() was not run."""
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        """Return the hyperparameter space for the given optimizer backend."""
        if optimizer == 'smac':
            cs = ConfigurationSpace()
            n_estimators = Constant("n_estimators", 100)
            criterion = CategoricalHyperparameter(
                "criterion", ["mse", "mae"], default_value="mse")
            # The maximum number of features used in the forest is calculated
            # as m^max_features, where m is the total number of features and
            # max_features is the hyperparameter below.  The default 0.5
            # yields sqrt(m) features, matching Geurts' heuristic.
            max_features = UniformFloatHyperparameter(
                "max_features", 0., 1., default_value=0.5)
            min_samples_split = UniformIntegerHyperparameter(
                "min_samples_split", 2, 20, default_value=2)
            min_samples_leaf = UniformIntegerHyperparameter(
                "min_samples_leaf", 1, 20, default_value=1)
            bootstrap = CategoricalHyperparameter(
                "bootstrap", ["True", "False"], default_value="False")
            cs.add_hyperparameters([n_estimators, criterion, max_features,
                                    min_samples_split, min_samples_leaf,
                                    bootstrap])
            return cs
        elif optimizer == 'tpe':
            # BUG FIX: the min_samples_leaf label contained a stray trailing
            # comma inside the string ('et_min_samples_leaf,'), registering
            # the wrong parameter name with hyperopt.
            space = {'n_estimators': hp.choice('et_n_estimators', [100]),
                     'criterion': hp.choice('et_criterion', ["mse", "mae"]),
                     'max_features': hp.uniform('et_max_features', 0, 1),
                     'min_samples_split': hp.randint('et_min_samples_split', 19) + 2,
                     'min_samples_leaf': hp.randint('et_min_samples_leaf', 20) + 1,
                     'bootstrap': hp.choice('et_bootstrap', ["True", "False"])}
            init_trial = {'n_estimators': 100,
                          'criterion': "mse",
                          'max_features': 0.5,
                          'min_samples_split': 2,
                          'min_samples_leaf': 1,
                          'bootstrap': "False"}
            return space
#ax1.set_title("Training dataset after PCA") #ax2.set_title("Standardized training dataset after PCA") # #for ax in (ax1, ax2): # ax.set_xlabel("1st principal component") # ax.set_ylabel("2nd principal component") # ax.legend(loc="upper right") # ax.grid() # #plt.tight_layout() #plt.show() # Prediction ############ t0 = time.time() y_regr = regr.predict(x_test) regr_predict = time.time() - t0 print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict)) #with open('output.log', 'w') as f: # print("Training time: %.6f s" % regr_fit, file=f) # print("Prediction time: %.6f s" % regr_predict, file=f) # print(" ", file=f) # print("The model performance for training set", file=f) # print("--------------------------------------", file=f) # print('MAE is {}'.format(train_score_mae), file=f) # print('MSE is {}'.format(train_score_mse), file=f) # print('EVS is {}'.format(train_score_evs), file=f) # print('ME is {}'.format(train_score_me), file=f) # print('R2 score is {}'.format(train_score_r2), file=f) # print(" ", file=f)
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings('ignore')

# Load the cleaned Zomato dataset; drop the residual CSV index column.
df=pd.read_csv('zomato_df.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)
print(df.head())

# Features are all columns except the target column 'rate'.
x=df.drop('rate',axis=1)
y=df['rate']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3,random_state=10)

#Preparing Extra Tree Regression
from sklearn.ensemble import ExtraTreesRegressor
ET_Model=ExtraTreesRegressor(n_estimators = 120)
ET_Model.fit(x_train,y_train)
y_predict=ET_Model.predict(x_test)

import pickle
# # Saving model to disk
# NOTE(review): the open() handles below are never closed -- consider using
# 'with open(...)' context managers here.
pickle.dump(ET_Model, open('model.pkl','wb'))
model=pickle.load(open('model.pkl','rb'))
print(y_predict)
# 使用RandomForestRegressor训练模型,并对测试数据做出预测,结果存储在变量rfr_y_predict中。 rfr = RandomForestRegressor() rfr.fit(X_train, y_train) rfr_y_predict = rfr.predict(X_test) # 使用ExtraTreesRegressor训练模型,并对测试数据做出预测,结果存储在变量etr_y_predict中。 ''' 极端随机森林 于普通随机森林不同: 在每当构建一棵树的分裂节点的时候,不会任意地选取特征, 而是先随机收集一部分特征,然后利用信息熵和基尼不纯度等指标挑选最佳的节点特征 ''' etr = ExtraTreesRegressor() etr.fit(X_train, y_train) etr_y_predict = etr.predict(X_test) # 使用GradientBoostingRegressor训练模型,并对测试数据做出预测,结果存储在变量gbr_y_predict中。 gbr = GradientBoostingRegressor() gbr.fit(X_train, y_train) gbr_y_predict = gbr.predict(X_test) from sklearn.metrics import mean_absolute_error,mean_squared_error # 使用R-squared、MSE以及MAE指标对默认配置的随机回归森林在测试集上进行性能评估。 print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test)) print( 'The mean squared error of RandomForestRegressor:', mean_squared_error(y_test, rfr_y_predict)) print( 'The mean absoluate error of RandomForestRegressor:', mean_absolute_error(y_test, rfr_y_predict)) # 使用R-squared、MSE以及MAE指标对默认配置的极端回归森林在测试集上进行性能评估。 print('R-squared value of ExtraTreesRegessor:', etr.score(X_test, y_test))
class QVtree:
    """Fitted Q-iteration solver (Python 2 code: print statements).

    Uses an extra-trees ensemble (Tree) for the Q-function and tilecoding
    approximations for the policy (W_f) and value (V_f) functions.
    """

    def __init__(self, D, maxgrid, radius, para, num_split=40, num_leaf=20,
                 num_est=215):
        # Q-function approximator.
        self.Q_f = Tree(n_estimators=num_est, min_samples_split=num_split,
                        min_samples_leaf=num_leaf, n_jobs=para.CPU_CORES)
        # Tile width per dimension derived from the state-grid radius.
        Twv = (1 / radius) / 1.8
        T = [Twv for t in range(D)]
        L = int(140 / Twv)
        points = maxgrid
        self.W_f = Tilecode(D, T, L, mem_max=1, lin_spline=True, linT=7,
                            cores=para.CPU_CORES)
        self.V_f = Tilecode(D, T, L, mem_max=1, lin_spline=True, linT=7,
                            cores=para.CPU_CORES)
        self.maxgrid = maxgrid
        self.radius = radius
        self.D = D
        self.first = True           # first iterate() call bootstraps W_f/V_f
        self.beta = para.beta       # discount factor
        self.CORES = para.CPU_CORES

    def iterate(self, XA, X1, u, A_low, A_high, ITER=50, Ascaled=False,
                plot=True, xargs=[], output=True, gridsamp=1):
        """Run ITER rounds of fitted Q-iteration on sample (XA, X1, u).

        NOTE(review): xargs=[] is a mutable default argument; and the Ascaled
        branch reads self.W_f_old, which is never assigned in this class --
        confirm it is set elsewhere before enabling Ascaled.
        """
        tic = time()
        self.v_e = 0  # Value function error
        self.p_e = 0  # Policy function error
        tic = time()
        # Subsample the successor states and build the state grid.
        N = int(gridsamp * X1.shape[0])
        grid, m = buildgrid(X1[0:N, :], self.maxgrid, self.radius, scale=True)
        points = grid.shape[0]
        toc = time()
        print 'State grid points: ' + str(points) + ', of maximum: ' + str(
            m) + ', Time taken: ' + str(toc - tic)
        if self.first:
            # Initialise policy and value functions to zero on the grid.
            self.W_f.fit(grid, np.zeros(points))
            self.V_f.fit(grid, np.zeros(points))
            self.first = False
        # Per-grid-point action bounds.
        Al = np.zeros(points)
        Ah = np.zeros(points)
        if Ascaled:
            for i in range(points):
                Ws = self.W_f_old.predict(grid[i, :])
                Al[i] = A_low(grid[i, :], Ws)
                Ah[i] = A_high(grid[i, :], Ws)
        else:
            for i in range(points):
                Al[i] = A_low(grid[i, :])
                Ah[i] = A_high(grid[i, :])

        # ------------------
        # Q-learning
        # ------------------

        # First iteration
        j = 0

        # Q values
        Q = u + self.beta * self.V_f.predict(X1, store_XS=True)

        # Fit Q function
        self.Q_f.fit(XA, Q)

        # Optimise Q function
        ERROR = self.maximise(grid, Al, Ah, Ascaled, output=output)

        for j in range(ITER):
            # Q values (fast_values reuses the stored successor states).
            Q = u + self.beta * self.V_f.fast_values()

            # Fit Q function
            tic = time()
            self.Q_f.fit(XA, Q)
            toc = time()
            print 'Fit time: ' + str(toc - tic)

            # Optimise Q function
            ERROR = self.maximise(grid, Al, Ah, Ascaled, output=output)
            toc = time()
            print 'Solve time: ' + str(toc - tic)

        if plot:
            self.W_f.plot(xargs, showdata=True)
            pylab.show()
            #self.V_f.plot(['x', 1], showdata=True)
            #pylab.show()

    def maximise(self, grid, Al, Ah, Ascaled, plot=False, output=True):
        """Maximise the fitted Q-function over 300 candidate actions per
        state, refit W_f/V_f, and return the relative value-function change.
        """
        tic = time()
        if Ascaled:
            Alow = np.zeros(grid.shape[0])
            Ahigh = np.ones(grid.shape[0])
        else:
            Alow = Al
            Ahigh = Ah
        N = grid.shape[0]
        W_opt = np.zeros(N)
        V = np.zeros(N)
        # Candidate action grid: 300 evenly spaced actions per state.
        Wgrid = np.zeros(0)
        for i in range(N):
            Wgrid = np.append(Wgrid, np.linspace(Alow[i], Ahigh[i], 300))
        x = np.repeat(grid, 300, axis=0)
        X = np.hstack([Wgrid.reshape([N * 300, 1]), x])
        tic = time()
        Qhat = self.Q_f.predict(X)
        toc = time()
        print str(toc - tic)
        # Per-state argmax over each block of 300 candidates.
        j = 0
        for i in range(N):
            idx = np.argmax(Qhat[j:j + 300])
            W_opt[i] = Wgrid[j + idx]
            V[i] = Qhat[j + idx]
            j = j + 300
        if Ascaled:
            # NOTE(review): uses the loop-final scalar `idx` for every element;
            # this looks like it should rescale elementwise as
            # Al + (Ah - Al) * W_opt -- confirm intent before changing.
            W_opt = Al[idx] + (Ah[idx] - Al[idx]) * W_opt
        W_opt_old = self.W_f.predict(grid)
        V_old = self.V_f.predict(grid)
        self.V_f.fit(grid, V)
        self.W_f.fit(grid, W_opt, sgd=1, eta=0.4, n_iters=1, scale=0)
        # Mean relative changes used as convergence diagnostics.
        self.p_e = np.mean(abs(W_opt_old - W_opt) / W_opt_old)
        self.v_e = np.mean(abs(V_old - V) / V_old)
        toc = time()
        if output:
            print 'Maximisation time: ' + str(toc - tic)
            print 'Value function change: ' + str(round(
                self.v_e, 4)) + ', Policy change: ' + str(round(self.p_e, 4))
        if plot:
            self.W_f.plot(['x', 1], showdata=True)
            pylab.show()
            self.V_f.plot(['x', 1], showdata=True)
            pylab.show()
        return self.v_e
# Four base regressors for the averaging/stacking ensemble.
clf1 = ExtraTreesRegressor(n_estimators=1000, max_depth=4, min_samples_leaf=1)
clf2 = RandomForestRegressor(n_estimators=1000, max_depth=7,
                             min_samples_split=20, random_state=0)
clf3 = GradientBoostingRegressor(learning_rate=0.003, max_depth=3,
                                 min_samples_split=35, min_samples_leaf=10,
                                 n_estimators=1500)
clf4 = LassoLarsCV(cv=20)

# Model 3: extra trees.  Fold predictions are accumulated into X_pred3.
# Scores use np.exp on targets and predictions, which suggests the targets
# are log-transformed upstream -- confirm against the enclosing loop.
clf1.fit(dev_X, dev_y)
preds = clf1.predict(val_X)
if len(X_pred3) < 1:
    X_pred3 = preds
else:
    X_pred3 = np.concatenate((X_pred3, preds), axis=0)
scores3.append(r2_score(np.exp(val_y), np.exp(preds)))
print("model 3 scores: ", scores3)

# Model 4: random forest, accumulated the same way into X_pred4.
clf2.fit(dev_X, dev_y)
preds = clf2.predict(val_X)
if len(X_pred4) < 1:
    X_pred4 = preds
else:
    X_pred4 = np.concatenate((X_pred4, preds), axis=0)
def main():
    """Train and compare several regressors (linear, SGD, Keras NN, extra
    trees, XGBoost), soft-vote the best two, and write integer predictions
    into the test CSV given as argv[2].
    """
    # random number initialization
    np.random.seed(123456000)

    # preprocess data by PCA and standardization
    Xtrain__full, ytrain__full, Xtest = load_data(argv[1], argv[2])
    # Xtrain__full, ytrain__full, Xtest = load_data("train_data.csv","test_data.csv")
    Xtrain__full, ytrain__full, Xtest = preprocess(Xtrain__full, ytrain__full,
                                                   Xtest)

    # train-set and validation-set split
    X_train, X_val, y_train, y_val = train_test_split(Xtrain__full,
                                                      ytrain__full,
                                                      test_size=0.20,
                                                      random_state=None)

    # =========================================================================
    print(" ")
    print(" ")
    print("Linear regressor classifier")
    start_time = time.time()
    LR = regressor(0.01)
    LR.fit(X_train, y_train)
    show_performance(LR, X_train, y_train, "Train")
    show_performance(LR, X_val, y_val, "Validation")
    show_time(time.time() - start_time)

    # =========================================================================
    print("Stochastic gradient descent regressor classifier")
    start_time = time.time()
    SGDR = SGDRegressor(loss='huber', penalty='elasticnet', max_iter=100,
                        eta0=0.01)
    SGDR.fit(X_train, y_train.flatten())
    show_performance(SGDR, X_train, y_train, "Train")
    show_performance(SGDR, X_val, y_val, "Validation")
    show_time(time.time() - start_time)

    # =========================================================================
    print("Neural network classifier")
    start_time = time.time()

    def baseline_model(D):
        # Defining the NN based regressor: two relu layers with dropout,
        # linear output, MAE loss.
        model = Sequential()
        model.add(
            Dense(D, input_dim=D, kernel_initializer='glorot_uniform',
                  activation='relu'))
        model.add(Dropout(0.25))
        model.add(
            Dense(D, input_dim=D, kernel_initializer='glorot_uniform',
                  activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(1, kernel_initializer='glorot_uniform'))
        model.compile(loss='mae', optimizer='adam', metrics=['mae'])
        return model

    _, D = np.shape(X_train)
    # KR = KerasRegressor(build_fn=baseline_model(D), epochs=30, batch_size=16, verbose=False)
    KR = baseline_model(D)
    KR.fit(X_train, y_train, epochs=100, batch_size=16, verbose=False)
    show_performance(KR, X_train, y_train, "Train")
    show_performance(KR, X_val, y_val, "Validation")
    show_time(time.time() - start_time)

    # =========================================================================
    print("Extratrees regressor classifier")
    start_time = time.time()
    ET = ExtraTreesRegressor(n_estimators=200, criterion='mae',
                             min_samples_split=2, min_samples_leaf=1)
    ET.fit(X_train, y_train.flatten())
    show_performance(ET, X_train, y_train, "Train")
    show_performance(ET, X_val, y_val, "Validation")
    show_time(time.time() - start_time)

    # =========================================================================
    print("Extreme gradient boosted regressor classifier")
    start_time = time.time()
    n = Xtrain__full.shape[1]
    XGBR = xgbr(n_estimators=400, max_depth=int(np.sqrt(n)))
    XGBR.fit(X_train, y_train.flatten())
    show_performance(XGBR, X_train, y_train, "Train")
    show_performance(XGBR, X_val, y_val, "Validation")
    show_time(time.time() - start_time)

    # =========================================================================
    # Weighted soft vote of the two strongest models (weights 7:10).
    print("Soft voting over best performing ET and XGBR classifiers")
    temp1 = ET.predict(X_val)
    temp2 = XGBR.predict(X_val)
    temp = np.average([temp1, temp2], axis=0, weights=[7, 10])
    mae = mean_absolute_error(y_val, temp)
    print("Validation MAE: %f" % mae)

    # =========================================================================
    print(" ")
    print(" ")
    print("Writing out the results")
    temp1 = ET.predict(Xtest)
    temp2 = XGBR.predict(Xtest)
    temp = np.average([temp1, temp2], axis=0, weights=[7, 10])
    predictions = temp.astype(int)
    df = pd.read_csv(argv[2])
    # df = pd.read_csv("test_data.csv")
    df['predicted_ground_truth'] = predictions
    # NOTE(review): this overwrites the input test CSV (argv[2]) in place --
    # confirm that is intended.
    df.to_csv(argv[2], index=False)
    # df.to_csv('test_data.csv', index=False)
    print("Task completed")
def run(feature_files, training_dates, feature_set_folder):
    """Train six weighted regressors and dump offline predictions to CSV.

    Python 2 code (print statement; map/filter return lists).  Each model is
    weighted by 1/y^2 and scored on two hold-out dates; predictions are
    written under result/<feature_set_folder>/.
    """
    train_set = pd.concat([
        dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date)
        for date in training_dates
    ])
    test_set = dfs(0, len(feature_files), feature_files + ['y'],
                   'dataset/2016-06-01')
    test1_set = dfs(0, len(feature_files), feature_files + ['y'],
                    'dataset/2016-05-25')
    # train_set.to_csv('train_set.csv', index=False)
    # test_set.to_csv('test_set.csv', index=False)
    '''
    unique_size = pd.read_csv('unique_size.csv')
    train_set = pd.merge(train_set, unique_size, how='left')
    train_set = train_set[train_set.unique_size > 1]
    train_set.drop(['unique_size'], axis=1, inplace=True)
    '''
    # Impute missing values and add a log1p-transformed target column.
    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')
    test1_set = test1_set.fillna(-1, downcast='infer')
    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))
    test1_set['y_log'] = test1_set['y'].apply(lambda x: np.log(1 + x))
    # All columns except identifiers/targets are model features.
    feature_set = filter(
        lambda x: x not in
        ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
        train_set.columns)
    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1: weighted linear regression on standardized features.
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    print zip(feature_set, model1.coef_)
    test_set['predictY'] = model1.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model1_offline.csv')
    test1_set['predictY'] = model1.predict(
        scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result/' + feature_set_folder + '/model1_offline1.csv')

    # model2: XGBoost on raw (unscaled) features.
    model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6,
                          colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model2_offline.csv')
    test1_set['predictY'] = model2.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model2_offline1.csv')

    # model3: linear SVR on standardized features.
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model3.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')
    test1_set['predictY'] = model3.predict(
        scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result/' + feature_set_folder + '/model3_offline1.csv')

    # model4: random forest.
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7,
                                   max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=np.array(
                   map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())))
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_offline.csv')
    test1_set['predictY'] = model4.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model4_offline1.csv')

    # model15: extra trees.
    model15 = ExtraTreesRegressor(n_estimators=1000, max_depth=12,
                                  max_features=0.3, max_leaf_nodes=400)
    model15.fit(train_set[feature_set].as_matrix(),
                train_set['y'].as_matrix(),
                sample_weight=np.array(
                    map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())))
    test_set['predictY'] = model15.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model15_offline.csv')
    test1_set['predictY'] = model15.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model15_offline1.csv')

    # model5: XGBoost with a different seed.
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6,
                          colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_offline.csv')
    test1_set['predictY'] = model5.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model5_offline1.csv')

    # model6: shallower XGBoost.
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5,
                          colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_offline.csv')
    test1_set['predictY'] = model6.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model6_offline1.csv')
    pass
# NOTE(review): the ']]' below closes a column-selection list that begins
# before this chunk.
]]
# Targets for the on-line / late-or-on-time / early-or-on-time subsets.
y_ol = ol['ActualDays']
y_lo = lo['ActualDays']
y_eo = eo['ActualDays']
#
X = data[train_target]
y = data[test_target]
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.4,
                                                    random_state=0)

# Global model: extra trees with the MAE splitting criterion.
clf = ExtraTreesRegressor(n_estimators=100, criterion='mae')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = mean_absolute_error(y_pred, y_test)
print(score)

# Round predictions toward zero and re-score.
y_pred_1 = [math.floor(x) if x > 0 else math.ceil(x) for x in y_pred]
score2 = mean_absolute_error(y_pred_1, y_test)
print(score2)

# On the line point region regression
# NOTE(review): clf_ol.fit(y_ol, y_ol) fits the target against itself; the
# first argument is presumably meant to be a feature matrix -- confirm.
clf_ol = ExtraTreesRegressor(n_estimators=100, criterion='mse',
                             bootstrap=False)
clf_ol.fit(y_ol, y_ol)

# Later or on time data region regression
clf_lo = ExtraTreesRegressor(n_estimators=10, criterion='mse',
                             bootstrap=False)
# Build this timestep's feature frame from the gym-style market environment
# observation `o`.
test = o.features[col]
n = test.isnull().sum(axis=1)  # per-row count of missing values
for c in test.columns:
    # Add a missingness-indicator column for every feature.
    test[c + '_nan_'] = pd.isnull(test[c])
test = test.fillna(d_mean)  # impute with precomputed column means
test['znull'] = n

# Predict, clip into the allowed target range, then blend via get_weighted_y.
pred = o.target
pred['y'] = model_et.predict(test).clip(low_y_cut, high_y_cut)
pred['y'] = pred.apply(get_weighted_y, axis=1)

# Step the environment with (id, y) predictions and accumulate the running
# reward over all timesteps seen so far.
o, reward, done, info = env.step(pred[['id', 'y']])
pred_y = list(pred.y.values)
y_actual_list.extend(actual_y)
y_pred_list.extend(pred_y)
overall_reward = get_reward(np.array(y_actual_list), np.array(y_pred_list))
et_overall_reward_list.append(overall_reward)
rf.fit(x_train,y_train)
y_pre_rf = rf.predict(x_test)
rf_pre.append(y_pre_rf)
'''
# NOTE(review): the lines above close a triple-quoted string opened before
# this chunk -- a disabled random-forest branch, not live code.
###
# Active branch: extra-trees regressor; bootstrap=True enables bagging so
# that oob_score can be computed.
params_ExtraTrees = {
    'max_features': 'log2',
    'n_estimators': 600,
    'max_depth': 10,
    'oob_score': True,
    'n_jobs': 4,
    'bootstrap': True
}
ext = ExtraTreesRegressor(**params_ExtraTrees)
ext.fit(x_train, y_train)
y_pre_ext = ext.predict(x_test)
#ext_pre.append(y_pre_ext)
#y_pre_ext1 = ext.predict(x_test[:-7])
#y_pre_ext7d = np.append(y_pre_ext1,y_pre_ext1)
#ext_pre7d.append(y_pre_ext7d)
#print(y_pre_ext)
#print(y_pre_ext1)
###
# Disabled gradient-boosting branch kept for reference:
'''
params_gbrt = {'loss':'huber','n_estimators': 500,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
gbrt = GradientBoostingRegressor(**params_gbrt)
gbrt.fit(x_train,y_train)
y_pre_gbrt = gbrt.predict(x_test)
gbrt_pre.append(y_pre_gbrt)
'''
# Assemble the feature matrix by transposing the per-column lists; column 13
# is the regression target.
X = list(zip(*X1))
Y = cols[13]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=rd.randrange(1000))
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
#print(y_test)

lin_reg_mod = ExtraTreesRegressor(n_estimators=500)
lin_reg_mod.fit(X_train, y_train)
pred = lin_reg_mod.predict(X_test)
#print(pred)
#print(y_test)

# Accumulate R^2 across repeated random splits (tr2 lives in the enclosing
# scope).
test_set_r2 = r2_score(y_test, pred)
print(test_set_r2)
tr2 += test_set_r2
#abs_er = mean_absolute_error(y_test, pred)
#tabse+=abs_er

# Median absolute percentage error.
# NOTE(review): divides by the true value i -- raises ZeroDivisionError for a
# zero target; confirm targets are strictly positive.
temp = []
for (i, j) in zip(y_test, pred):
    t = (abs(i - j)) / float(i)
    temp.append(t)
#print(temp)
print(np.median(temp))
class ExtraTreesRegressor(ParamSklearnRegressionAlgorithm):
    """Warm-started extra-trees regression component.

    Grows the underlying sklearn forest (ETR) incrementally via
    iterative_fit until n_estimators trees have been fitted.
    """

    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False, max_leaf_nodes=None, max_depth="None",
                 oob_score=False, n_jobs=1, random_state=None, verbose=0):
        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        # BUG FIX: the original membership test used ("mse") -- just the
        # string "mse", so any substring such as "ms" passed validation.
        # A one-element tuple is required for the intended check.
        if criterion not in ("mse",):
            raise ValueError("'criterion' is not in ('mse'): "
                             "%s" % criterion)
        self.criterion = criterion
        # Exactly one of max_leaf_nodes / max_depth is active; the string
        # "None" (from the config space) means "no limit".
        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)

        # BUG FIX: the original if/elif left self.bootstrap unset for any
        # value other than the strings "True"/"False" -- including the
        # signature's own default bootstrap=False -- which caused an
        # AttributeError later in iterative_fit.  Accept real booleans too.
        if bootstrap in (True, "True"):
            self.bootstrap = True
        else:
            self.bootstrap = False

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.estimator = None

    def fit(self, X, y, refit=False):
        """Fit by repeated iterative_fit calls until fully fitted."""
        if self.estimator is None or refit:
            self.iterative_fit(X, y, n_iter=1, refit=refit)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        """Grow the warm-started forest by n_iter additional trees."""
        if refit:
            self.estimator = None
        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(
                n_estimators=0, criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True
            )
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y,)
        self.estimator = tmp
        return self

    def configuration_fully_fitted(self):
        """Return True once the forest holds n_estimators fitted trees."""
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        # NOTE(review): sklearn's ExtraTreesRegressor has no predict_proba;
        # this can only work if ETR is bound to a classifier elsewhere.
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                # TODO find out if this is good because of sparcity...
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                # But rather fortran or C-contiguous?
                'preferred_dtype': np.float32}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build the ConfigSpace search space for this component."""
        cs = ConfigurationSpace()
        n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100))
        criterion = cs.add_hyperparameter(Constant("criterion", "mse"))
        max_features = cs.add_hyperparameter(UniformFloatHyperparameter(
            "max_features", 0.5, 5, default=1))
        max_depth = cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))
        min_samples_split = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default=2))
        min_samples_leaf = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default=1))
        # Unparametrized: min_samples_* act as the regularizers; the
        # max_leaf_nodes/max_depth switch is intentionally not exposed.
        bootstrap = cs.add_hyperparameter(CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default="False"))
        return cs
# Split people by id so evaluation is on unseen people.
train = data[targets < 30]
test = data[targets >= 30]  # Test on independent people

# Upper half of each face image is the input, lower half is the multi-output
# regression target.
n_pixels = data.shape[1]
X_train = train[:, :int(0.5 * n_pixels)]  # Upper half of the faces
Y_train = train[:, int(0.5 * n_pixels):]  # Lower half of the faces
X_test = test[:, :int(0.5 * n_pixels)]
Y_test = test[:, int(0.5 * n_pixels):]

# Build a multi-output forest
forest = ExtraTreesRegressor(n_estimators=10, max_features=32,
                             random_state=0)
forest.fit(X_train, Y_train)
Y_test_predict = forest.predict(X_test)

# Plot the completed faces
n_faces = 5
image_shape = (64, 64)
pl.figure(figsize=(2. * n_faces, 2.26 * 2))
pl.suptitle("Face completion with multi-output forests", size=16)
for i in range(1, 1 + n_faces):
    # Pick a random test face; pair its true and model-completed lower half.
    face_id = np.random.randint(X_test.shape[0])
    true_face = np.hstack((X_test[face_id], Y_test[face_id]))
    completed_face = np.hstack((X_test[face_id], Y_test_predict[face_id]))
    pl.subplot(2, n_faces, i)
    # (loop body continues beyond this chunk)
def sens(sample=20):
    """Sensitivity analysis over pickled simulation runs (thesis chapter 7).

    Loads per-run, per-scenario result pickles (scenarios: CS, SWA, OA,
    CS-HL), builds LaTeX summary tables and charts of each welfare series
    relative to CS, fits an extra-trees regressor/classifier on run
    parameters, and writes importance tables and contour/hexbin figures.

    Python 2 code (print statements, xrange).  Indentation below is a
    reconstruction of a whitespace-mangled original -- verify against VCS.

    :param sample: one past the largest run number to load (runs 1..sample-1).
    """
    home = '/home/nealbob'
    folder = '/Dropbox/Model/results/chapter7/chapter7/'
    out = '/Dropbox/Thesis/IMG/chapter7/'
    img_ext = '.pdf'
    table_out = '/Dropbox/Thesis/STATS/chapter7/'
    rows = ['CS', 'SWA', 'OA', 'CS-HL']
    results = {run_no: {row: 0 for row in rows} for run_no in range(1, sample)}
    samprange = []  # run numbers whose pickles loaded cleanly with finite SW
    for run_no in range(1, sample):
        try:
            for row in rows:
                with open(home + folder + str(run_no) + '_' + row + '_result.pkl', 'rb') as f:
                    results[run_no][row] = pickle.load(f)
                    f.close()  # redundant inside `with`, kept as-is
                m = len(results[run_no]['CS'][0]['S']['Annual']['Mean']) - 1
                SW = results[run_no][row][0]['SW']['Annual']['Mean'][m]
                # Reject the whole run if any scenario produced nan/inf welfare
                if math.isnan(SW) or math.isinf(SW):
                    raise Exception("Found a nan")
            samprange.append(run_no)
        # NOTE(review): bare except also hides unrelated errors (IOError etc.)
        except:
            print row
            print 'Run no: ' + str(run_no) + ' failed.'
    n = len(samprange)
    print str(n) + ' good runs of ' + str(sample - 1) + ' total'

    ###### Summary tables #####
    series = ['SW', 'Profit', 'B', 'Budget', 'S']
    title = {'SW': 'Social welfare relative to CS',
             'Profit': 'Profit relative to CS',
             'B': 'Environmental benefits relative to CS',
             'S': 'Storage relative to CS',
             'Budget': 'Environmental trade relative to CS'}
    # Unit scaling: $ series -> millions, volume series -> thousands
    scale = {'SW': 1000000, 'Profit': 1000000, 'S': 1000, 'W': 1000, 'E': 1000,
             'B': 1000000, 'Z': 1000, 'Q_low': 1000, 'Q_high': 1000,
             'Q_env': 1000, 'A_low': 1000, 'A_high': 1000, 'A_env': 1000,
             'S_low': 1000, 'S_high': 1000, 'S_env': 1000, 'U_low': 1000000,
             'U_high': 1000000, 'Budget': 1000000}
    # m indexes the final year of the annual mean series
    m = len(results[1]['CS'][0]['S']['Annual']['Mean']) - 1
    X = {}
    XI = {}
    for x in series:
        data0 = []
        data1 = []
        data2 = []
        for row in rows:
            temp = np.zeros(n)
            record = {}
            record1 = {}
            i = 0
            for run_no in samprange:
                temp[i] = results[run_no][row][0][x]['Annual']['Mean'][m] / scale[x]
                i += 1
                record[run_no] = results[run_no][row][0][x]['Annual']['Mean'][m] / scale[x]
            record1['Mean'] = np.mean(temp)
            record1['Min'] = np.min(temp)
            record1['Q1'] = np.percentile(temp, 25)
            record1['Q3'] = np.percentile(temp, 75)
            record1['Max'] = np.max(temp)
            X[row] = temp
            data0.append(record)
            data1.append(record1)
        data = pandas.DataFrame(data0)
        data.index = rows
        data1 = pandas.DataFrame(data1)
        data1.index = rows  #['Mean', 'Min', 'Q1', 'Q3', 'Max']
        # Ratios of each scenario to CS, run by run
        for row in rows:
            record2 = {}
            temp1 = np.zeros(n)
            for i in range(n):
                temp1[i] = X[row][i] / X['CS'][i]
            XI[row] = temp1
            record2['Mean'] = np.mean(temp1)
            record2['Min'] = np.min(temp1)
            record2['Q1'] = np.percentile(temp1, 25)
            record2['Q3'] = np.percentile(temp1, 75)
            record2['Max'] = np.max(temp1)
            data2.append(record2)
        data2 = pandas.DataFrame(data2)
        data2.index = rows  #['Mean', 'Min', 'Q1', 'Q3', 'Max']
        with open(home + table_out + 'sens_full' + x + '.txt', 'w') as f:
            f.write(data.to_latex(float_format='{:,.2f}'.format, columns=samprange))
            f.close()
        with open(home + table_out + 'sens_sum' + x + '.txt', 'w') as f:
            f.write(data1.to_latex(float_format='{:,.2f}'.format,
                    columns=['Mean', 'Min', 'Q1', 'Q3', 'Max']))
            f.close()
        with open(home + table_out + 'sens_table' + x + '.txt', 'w') as f:
            f.write(data2.to_latex(float_format='{:,.2f}'.format,
                    columns=['Mean', 'Min', 'Q1', 'Q3', 'Max']))
            f.close()
        # Chart limits: 1st/99th percentile of per-scenario extremes, padded
        minx = np.percentile([min(XI[i]) for i in XI], 1)
        maxx = np.percentile([max(XI[i]) for i in XI], 99)
        chart_ch7(XI, 0.985 * minx, 1.015 * maxx, title[x], out, str(x) + '_sens')

    ##################################################################################### Regression
    # Y: welfare of each scenario relative to CS, one column per scenario
    Y = np.zeros([n, 4])
    j = 0
    for row in rows:
        i = 0
        for run_no in samprange:
            Y[i, j] = results[run_no][row][0]['SW']['Annual']['Mean'][m] / results[run_no]['CS'][0]['SW']['Annual']['Mean'][m]
            i += 1
        j += 1
    # NOTE(review): parameter pickles are loaded for ALL runs 1..sample-1,
    # then indexed by samprange[i]-1 below, so failed runs are skipped there.
    paras = []
    for run_no in range(1, sample):
        with open(home + folder + str(run_no) + '_para.pkl', 'rb') as f:
            paras.append(pickle.load(f))
            f.close()
    pname1 = ['delta0', 'I_K', 'SD_I', 't_cost', 'N_high', 'rho_I', 'alpha',
              'rho_eps', 'sig_eta', 'LL']
    numpara1 = len(pname1)
    pname2 = ['omega_mu', 'omega_sig', 'omegadelta', 'delta_a', 'delta_Ea',
              'delta_Eb', 'delta_R', 'b_1', 'b_value', 'e_sig']
    numpara2 = len(pname2)
    para_labels = pname1 + pname2 + ['lambda', 'lambdaHL', 'lambdae']
    numpara = numpara1 + numpara2 + 3
    X = np.zeros([n, numpara])
    # LaTeX display names, aligned with para_labels
    para_names = ['$\delta0$', '$E[I]/K$', '$c_v$', '$\tau$', '$n_{high}$',
                  '$\rho_I$', '$\alpha$', '$\rho_e$', '$\sigma_{\eta}$',
                  '${\aA_{low} \over E[I]/K}$', '$\mu_\omega$',
                  '$\sigma_\omega$', '$\omega_\delta$', '$\delta_a$',
                  '$\delta_{Ea}$', '$\delta_{Eb}$', '$\delta_R$', '$b_1$',
                  '$b_{\$} \over \bar I$', '$\sigma_{e0}$',
                  '$\Lambda_{high} - \hat \Lambda_{high}$',
                  '$\Lambda_{high}^{CS-HL} - \hat \Lambda_{high}^{CS-HL}$',
                  '$\lambda_0 - \hat \lambda_0$']
    for j in range(numpara1):
        for i in range(n):
            if pname1[j] == 'LL':
                # LL is normalised by mean-inflow-to-capacity
                X[i, j] = paras[samprange[i]-1].para_list[pname1[j]] / paras[samprange[i]-1].para_list['I_K']
            else:
                X[i, j] = paras[samprange[i]-1].para_list[pname1[j]]
    for j in range(numpara1, numpara2+numpara1):
        for i in range(n):
            if pname2[j - numpara1] == 'b_value':
                X[i, j] = paras[samprange[i]-1].ch7[pname2[j - numpara1]] / (paras[samprange[i]-1].para_list['I_K']*1000000)
            else:
                X[i, j] = paras[samprange[i]-1].ch7[pname2[j - numpara1]]
    # Linear-fit coefficients used to impute lambda for early runs (i <= 20)
    CS_c = -0.153007555
    CS_b = 0.00930613
    CSHL_c = -0.0891846
    CSHL_b = 0.0047009
    for i in range(n):
        if i > 20:
            y = paras[samprange[i]-1].y
        else:
            y = CS_c + CS_b * paras[samprange[i]-1].para_list['N_high']
        X[i, numpara2 + numpara1] = paras[samprange[i]-1].Lambda_high - y
    for i in range(n):
        if i > 20:
            yhl = paras[samprange[i]-1].yhl
        else:
            yhl = CSHL_c + CSHL_b * paras[samprange[i]-1].para_list['N_high']
        X[i, numpara2 + numpara1 + 1] = paras[samprange[i]-1].Lambda_high_HL - yhl
    # Hard-coded E[lambda_hat] values for the first 20 runs
    yelist = [0.4443, 0.1585, 0.1989, 0.2708, 0.3926, 0.0697, 0.1290, 0.1661,
              0.2687, 0.0868, 0.1239, 0.3598, 0.3543, 0.2883, 0.2367, 0.2139,
              0.2485, 0.2641, 0.5730, 0.1745]
    lambdae = np.zeros(n)
    for i in range(n):
        if i >= 20:
            ye = paras[samprange[i]-1].E_lambda_hat
        else:
            ye = yelist[samprange[i]-1]
        X[i, numpara2 + numpara1 + 2] = paras[samprange[i]-1].ch7['inflow_share'] - ye
        lambdae[i] = paras[samprange[i]-1].ch7['inflow_share']
    index = lambdae < 0.5
    pylab.hexbin(lambdae[index], X[index,1], C=Y[index, 2], gridsize=15)
    pylab.xlabel('Environmental share, $\lambda_0$')
    pylab.ylabel('Mean Inflow to Capacity, $E[I_t]/K$')
    cb = pylab.colorbar()
    cb.set_label('OA welfare relative to CS')
    #pylab.ylim(0, 1000)
    pylab.savefig(home + out + 'OAversusCS.pdf', bbox_inches='tight')
    pylab.show()
    pylab.hexbin(X[:, numpara -1], X[:, 1], C=Y[:, 3], gridsize=15)
    pylab.xlabel('Environmental share, $\lambda_0 - \hat \lambda_0$')
    pylab.ylabel('Mean Inflow to Capacity, $E[I_t]/K$')
    cb = pylab.colorbar()
    cb.set_label('CS-HL welfare relative to CS')
    #pylab.ylim(0, 1000)
    pylab.savefig(home + out + 'CSHLversusCS.pdf', bbox_inches='tight')
    pylab.show()
    # Multi-output regression of relative welfare on parameters
    tree = Tree(n_estimators=500, n_jobs=4)
    tree.fit(X, Y)
    rank = tree.feature_importances_ * 100
    data0 = []
    inn = 0
    for p in para_names:
        record = {}
        record['Importance'] = rank[inn]
        data0.append(record)
        inn = inn + 1
    tab = pandas.DataFrame(data0)
    tab.index = para_names
    # NOTE(review): DataFrame.sort is long-deprecated (sort_values in modern pandas)
    tab = tab.sort(columns=['Importance'], ascending=False)
    tab_text = tab.to_latex(float_format='{:,.2f}'.format, escape=False)
    print tab_text
    with open(home + table_out + 'importance.txt', 'w') as f:
        f.write(tab_text)
        f.close()
    # Partial-dependence style sweep: vary one parameter, hold others at mean
    for i in range(numpara):
        Xtemp = np.zeros([200, numpara])
        for j in range(numpara):
            Xtemp[:, j] = np.ones(200) * np.mean(X[:, j])
        Xtemp[:, i] = np.linspace(np.min(X[:, i]), np.max(X[:, i]), 200)
        Ytemp = tree.predict(Xtemp)
        data = [[Xtemp[:, i], Ytemp]]
        data0 = []
        for k in range(200):
            record = {}
            record['SWA'] = Ytemp[k, 1]
            record['OA'] = Ytemp[k, 2]
            record['CS-HL'] = Ytemp[k, 3]
            data0.append(record)
        data = pandas.DataFrame(data0)
        data.index = Xtemp[:, i]
        chart_data = {'OUTFILE': home + out + 'SW_' + para_labels[i] + img_ext,
                      'XLABEL': '', 'YLABEL': '', 'YMIN': 0.85, 'YMAX': 1.03}
        print para_labels[i]
        build_chart(chart_data, data, chart_type='date', ylim=True, save=True)

    ##################################################################################### Classifier
    srnum = {'CS': 0, 'SWA': 1, 'OA': 2, 'CS-HL': 3}
    # Label each run with the scenario that maximises welfare
    Y = np.zeros(n)
    for i in range(n):
        SW = 0
        SWmax = -1
        for row in rows:
            SW = results[samprange[i]][row][0]['SW']['Annual']['Mean'][m]
            if SW > SWmax:
                SWmax = SW
                Y[i] = srnum[row]
    for row in rows:
        idx = np.where(Y == srnum[row])
        print row + ': ' + str(len(Y[idx]))
    treec = Tree_classifier(n_estimators=500, n_jobs=4) #min_samples_split=3, min_samples_leaf=2)
    treec.fit(X, Y)
    rank = treec.feature_importances_ * 100
    data0 = []
    inn = 0
    for p in para_names:
        record = {}
        record['Importance'] = rank[inn]
        record['CS'] = np.mean(X[np.where(Y == 0), inn])
        record['SWA'] = np.mean(X[np.where(Y == 1), inn])
        record['OA'] = np.mean(X[np.where(Y == 2), inn])
        record['CS-HL'] = np.mean(X[np.where(Y == 3), inn])
        data0.append(record)
        inn = inn + 1
    tab = pandas.DataFrame(data0)
    tab.index = para_names
    tab = tab.sort(columns=['Importance'], ascending=False)
    tab_text = tab.to_latex(float_format='{:,.2f}'.format, escape=False)
    with open(home + table_out + 'classifier_table.txt', 'w') as f:
        f.write(tab.to_latex(float_format='{:,.2f}'.format, escape=False,
                columns=['Importance', 'CS', 'SWA', 'OA', 'CS-HL']))
        f.close()
    # Figure setup for the 2-D decision-region plot
    pylab.ioff()
    fig_width_pt = 350
    inches_per_pt = 1.0 / 72.27
    golden_mean = 1.2360679774997898 / 2.0
    fig_width = fig_width_pt * inches_per_pt
    fig_height = fig_width * golden_mean
    fig_size = [fig_width, fig_height]
    params = {'backend': 'ps', 'axes.labelsize': 10, 'text.fontsize': 10,
              'legend.fontsize': 10, 'xtick.labelsize': 8,
              'ytick.labelsize': 8, 'text.usetex': True,
              'figure.figsize': fig_size}
    pylab.rcParams.update(params)
    plot_colors = 'rybg'
    cmap = pylab.cm.RdYlBu
    yi = numpara-1  # y-axis feature: lambda_0 deviation (last column of X)
    minyi = -0.1
    maxyi = 0.1
    # Grid over (mean inflow/capacity, lambda deviation); other features at mean
    (xx, yy,) = np.meshgrid(np.arange(min(X[:, 1]), max(X[:, 1]), 0.02),
                            np.arange(min(X[:, yi]), max(X[:, yi]), 0.01))
    nnn = xx.ravel().shape[0]
    Xlist = [np.mean(X[:,i])*np.ones(nnn) for i in range(numpara)]
    Xlist[1] = xx.ravel()
    Xlist[yi] = yy.ravel()
    XX = np.array(Xlist).T
    Z = treec.predict(XX).reshape(xx.shape)
    fig = pylab.contourf(xx, yy, Z, [0, 0.9999, 1.9999, 2.9999, 3.9999],
                         colors=('red', 'yellow', 'blue', 'green'), alpha=0.5,
                         antialiased=False, extend='both')
    for (i, c,) in zip(xrange(4), plot_colors):
        idx0 = np.where(Y == i)
        pylab.scatter(X[idx0, 1], X[idx0, yi], c=c, cmap=cmap, label=rows[i],
                      s = 12, lw=0.5 )
    pylab.legend(bbox_to_anchor=(0.0, 1.02, 1.0, 0.102), loc=3, ncol=4,
                 mode='expand', borderaxespad=0.0)
    pylab.xlabel('Mean inflow over capacity')
    pylab.ylabel('Environmental inflow share')
    pylab.ylim(minyi, maxyi)
    OUT = home + out + 'class_fig.pdf'
    pylab.savefig(OUT, bbox_inches='tight')
    pylab.show()
# Rebuild and evaluate the regressor from the best hyperopt trial for this
# horizon.  Free names (pruebas, k, X_train, y_train, X_test, y_test,
# train_loss/val_loss/test_loss, h) come from earlier in the file.
best = pruebas[k].best_trial
params = best['misc']['vals']
estimators = [50, 100, 150, 300]

# hyperopt stores each choice as a length-1 array of indices/values; decode
# them back into ExtraTreesRegressor arguments.
model = ExtraTreesRegressor(
    n_estimators=estimators[int(np.array(params['estimators']))],
    min_samples_leaf=int(np.array(params['leaf'])),
    min_samples_split=int(np.array(params['split'])),
    max_features=int(np.array(params['features'])),
)
model.fit(X_train, y_train)

# Score the refit model; the stored validation loss is a negated score, so
# flip its sign before recording it.
pred = model.predict(X_train)
train_loss.append(r2_score(y_train, pred))
pred = model.predict(X_test)
test_loss.append(r2_score(y_test, pred))
val_loss.append(best['result']['loss'] * -1)

# Persist the accumulated metric series for forecast horizon t+(h+1).
metrics = {'train': train_loss, 'val': val_loss, 'test': test_loss}
with open('ETR_metrics_t+' + str(h + 1) + '.pkl', 'wb') as f:
    pickle.dump(metrics, f)
# Wrap the per-shop aggregate feature matrices (mean/max/min/median) into
# labelled DataFrames, join them column-wise, and emit a first "aggressive"
# prediction with a single extra-trees model.  The raw train_x_* / test_x_*
# arrays, train_y, mean_11_2016 and day_time are defined earlier in the file.
train_x_mean = DataFrame(train_x_mean, columns=['train_mean_'+str(i) for i in range(len(train_x_mean[0]))])
train_x_max = DataFrame(train_x_max, columns=['train_max_'+str(i) for i in range(len(train_x_max[0]))])
train_x_min = DataFrame(train_x_min, columns=['train_min_'+str(i) for i in range(len(train_x_min[0]))])
train_x_median = DataFrame(train_x_median, columns=['train_median_'+str(i) for i in range(len(train_x_median[0]))])
test_x_mean = DataFrame(test_x_mean, columns=['test_mean_'+str(i) for i in range(len(test_x_mean[0]))])
test_x_max = DataFrame(test_x_max, columns=['test_max_'+str(i) for i in range(len(test_x_max[0]))])
test_x_min = DataFrame(test_x_min, columns=['test_min_'+str(i) for i in range(len(test_x_min[0]))])
test_x_median = DataFrame(test_x_median, columns=['test_median_'+str(i) for i in range(len(test_x_median[0]))])
train_x = train_x_mean.join(train_x_max, how='left')
train_x = train_x.join(train_x_min, how='left')
train_x = train_x.join(train_x_median, how='left')
test_x = test_x_mean.join(test_x_max, how='left')
test_x = test_x.join(test_x_min, how='left')
test_x = test_x.join(test_x_median, how='left')
test_x = test_x.fillna(1)  # NOTE(review): fill value 1 (not 0) -- presumably deliberate for the sqrt/scale step below; confirm
train_y = DataFrame(train_y)
# ---------------------------------------------------------------------------- directly produce the "aggressive" result
ET = ExtraTreesRegressor(n_estimators=2600, random_state=1, n_jobs=-1, min_samples_split=2, min_samples_leaf=2, max_depth=12, max_features='sqrt')
ET.fit(train_x, train_y)
pre = ET.predict(test_x)
# Undo the target transform: model was trained on a squared/scaled target,
# so take sqrt and rescale by the November 2016 mean -- TODO confirm upstream.
pre = sqrt(pre)
pre = pre * mean_11_2016
pre = DataFrame(pre.round())
# Prepend shop_id 1..2000 and write the submission without index/header.
pre.insert(0, 'shop_id', [i for i in range(1, 2001)])
pre.to_csv('../results/result'+day_time+'_pre.csv', index=False, header=False)
# NOTE(review): this chunk opens mid-statement -- the two lines below are the
# tail of a print(...) call started before this chunk; preserved verbatim.
    "The MAE of DecisionTreeRegressor is ",
    mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dtr_y_pred)))
# Import RandomForestRegressor, ExtraTreesRegressor and GradientBoostingRegressor from sklearn.ensemble
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
# Train a RandomForestRegressor and predict on the test set
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train.ravel())
rfr_y_pred = rfr.predict(X_test)
# Train an ExtraTreesRegressor and predict on the test set
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train.ravel())
etr_y_pred = etr.predict(X_test)
# Train a GradientBoostingRegressor and predict on the test set
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train.ravel())
gbr_y_pred = gbr.predict(X_test)
# Evaluate the default-parameter random forest on the test set with
# R-squared, MSE and MAE (targets were standardised by ss_y upstream).
print("R-squared value of RandomForestRegressor is ", rfr.score(X_test, y_test))
print(
    "The MSE of RandomForestRegressor is ",
    mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_pred)))
# NOTE(review): chunk is cut off mid-statement below; continuation lies outside this view.
print(
    "The MAE of RandomForestRegressor is ",
def decision_tree(X, y1, y2, y3):
    """Compare five tree ensembles on a bike-sharing-style split target.

    Trains each model separately on the 'registered' and 'casual' targets
    (first 50% of rows), predicts on the remaining 50%, and prints the RMSLE
    of the summed predictions against the total count y1.

    Python 2 code (print statements).  Relies on module-level `rmsle` and the
    sklearn ensemble classes imported elsewhere in the file.

    :param X:  feature matrix, shape (n, d)
    :param y1: total count target
    :param y2: registered-user target
    :param y3: casual-user target
    """
    n, _ = X.shape
    nTrain = int(0.5*n)  #training on 50% of the data
    Xtrain = X[:nTrain,:]
    ytrain = y1[:nTrain]
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]
    Xtest = X[nTrain:,:]
    ytest = y1[nTrain:]
    #regular
    clf_1 = DecisionTreeRegressor(max_depth=None)
    clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None), n_estimators=500)
    # NOTE(review): min_samples_split=1 is only accepted by old sklearn
    # versions (modern releases require >= 2) -- confirm pinned version.
    clf_4 = RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_5 = ExtraTreesRegressor(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_3 = GradientBoostingRegressor(n_estimators=500, max_depth=None, random_state=0)
    print "finished generating tree"
    # Fit all five models on the 'registered' target
    clf_1.fit(Xtrain, ytrain_registered)
    clf_2.fit(Xtrain, ytrain_registered)
    clf_3.fit(Xtrain, ytrain_registered)
    clf_4.fit(Xtrain, ytrain_registered)
    clf_5.fit(Xtrain, ytrain_registered)
    print 'Finished fitting'
    dt_regular = clf_1.predict(Xtest)
    ada_regular = clf_2.predict(Xtest)
    grad_regular = clf_3.predict(Xtest)
    rf_regular = clf_4.predict(Xtest)
    et_regular = clf_5.predict(Xtest)
    #casual
    print "finished generating tree"
    # Refit the SAME estimator objects on the 'casual' target (registered
    # fits are discarded -- predictions above were already captured).
    clf_1.fit(Xtrain, ytrain_casual)
    clf_2.fit(Xtrain, ytrain_casual)
    clf_3.fit(Xtrain, ytrain_casual)
    clf_4.fit(Xtrain, ytrain_casual)
    clf_5.fit(Xtrain, ytrain_casual)
    print 'Finished fitting'
    dt_casual = clf_1.predict(Xtest)
    ada_casual = clf_2.predict(Xtest)
    grad_casual = clf_3.predict(Xtest)
    rf_casual = clf_4.predict(Xtest)
    et_casual = clf_5.predict(Xtest)
    # Importances come from the casual-target random forest (last fit)
    feature_imps = clf_4.feature_importances_
    # Score each model: predicted total = registered + casual predictions
    print "regular decision tree"
    print rmsle(ytest, dt_regular + dt_casual)
    print "boosted decision tree"
    print rmsle(ytest, ada_regular + ada_casual)
    print "gradient tree boosting"
    print rmsle(ytest, grad_regular + grad_casual)
    print "random forest classifier"
    print rmsle(ytest, rf_regular + rf_casual)
    print "extra trees classifier"
    print rmsle(ytest, et_casual + et_regular)
    print "feature importances"
    print feature_imps
def ensemble(self):
    '''
    Create ensemble of gradient boosting regressor and random forest regressor

    Drops a fixed set of uninformative columns, splits the dataset
    (self.split_dataset populates Xt/yt and Xv/yv), fits GBR, RF and ETR
    models, and scores both the individual models and their simple average
    on train (Xt) and validation (Xv).  Sets self.training_score and
    self.test_score as side effects.  Python 2 code (print statements).
    '''
    self.remove_columns([
        'institute_latitude', 'institute_longitude', 'institute_state',
        'institute_country', 'var10', 'var11', 'var12', 'var13', 'var14',
        'var15', 'instructor_past_performance',
        'instructor_association_industry_expert', 'secondary_area', 'var24'
    ])
    self.split_dataset()
    gbr_model = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8)
    rf_model = RandomForestRegressor(n_estimators=50)
    etr_model = ExtraTreesRegressor(n_estimators=50)
    # NOTE(review): all scores below are computed on the TRAINING features
    # for training_score and held-out Xv for test_score; eval_score's metric
    # is defined elsewhere in the class.
    gbr_model.fit(self.Xt, self.yt)
    yt_pred_gbr = gbr_model.predict(self.Xt)
    gbr_training_score = self.eval_score(self.yt, yt_pred_gbr)
    print 'GBR training score ', gbr_training_score
    rf_model.fit(self.Xt, self.yt)
    yt_pred_rf = rf_model.predict(self.Xt)
    rf_training_score = self.eval_score(self.yt, yt_pred_rf)
    print 'RF training score ', rf_training_score
    etr_model.fit(self.Xt, self.yt)
    yt_pred_etr = etr_model.predict(self.Xt)
    etr_training_score = self.eval_score(self.yt, yt_pred_etr)
    print 'ETR training score ', etr_training_score
    # Equal-weight average of the three models
    self.training_score = self.eval_score(
        self.yt, (yt_pred_rf + yt_pred_gbr + yt_pred_etr) / 3.)
    yv_pred_gbr = gbr_model.predict(self.Xv)
    gbr_test_score = self.eval_score(self.yv, yv_pred_gbr)
    print 'GBR test score ', gbr_test_score
    yv_pred_rf = rf_model.predict(self.Xv)
    rf_test_score = self.eval_score(self.yv, yv_pred_rf)
    print 'Rf score ', rf_test_score
    yv_pred_etr = etr_model.predict(self.Xv)
    etr_test_score = self.eval_score(self.yv, yv_pred_etr)
    print 'ETR test score ', etr_test_score
    self.test_score = self.eval_score(
        self.yv, (yv_pred_rf + yv_pred_gbr + yv_pred_etr) / 3.)
    # Pairwise correlation of the three models' validation predictions
    print 'Correlation between predictions of these two models ', pd.DataFrame(
        {
            'rf_test_score': yv_pred_rf,
            'gbr_test_score': yv_pred_gbr,
            'etr_test_score': yv_pred_etr
        }).corr()
# Walk the rolling seasonal train/test splits, fit one extra-trees model per
# target (casual, registered, count) on log1p-transformed targets, and blend
# the back-transformed predictions into the running `output` list.
test_itr_first = test_itr_last
output = []
for tr_df, te_df in getExtensiveSeasonalTrainTestData():
    #tr_df,te_df = normalize(tr_df.drop(['casual','registered','count'],axis=1),te_df)
    print("TrainEnd:" + str(tr_df['year'].iloc[len(tr_df) - 1]) + ":" + str(tr_df['month'].iloc[len(tr_df) - 1]) + "TrainStart:" + str(tr_df['year'].iloc[0]) + ":" + str(tr_df['month'].iloc[0]))
    print("TestEnd:" + str(te_df['year'].iloc[len(te_df) - 1]) + ":" + str(te_df['month'].iloc[len(te_df) - 1]) + "TestStart:" + str(te_df['year'].iloc[0]) + ":" + str(te_df['month'].iloc[0]))
    tr_df.drop('season', axis=1, inplace=True)
    te_df.drop('season', axis=1, inplace=True)

    # One model per target, all trained on the same feature block.
    features = tr_df.drop(['casual', 'registered', 'count'], axis=1)
    preds = {}
    for target in ('casual', 'registered', 'count'):
        regressor = ExtraTreesRegressor(n_estimators=100)
        regressor.fit(features, np.log(tr_df[target] + 1))
        preds[target] = np.exp(regressor.predict(te_df)) - 1

    # Blend: average of (casual + registered) and the direct count model.
    blended = ((preds['casual'] + preds['registered']) + preds['count']) / 2
    #blended = preds['count']
    #blended = preds['casual'] + preds['registered']
    #print(blended.astype(int))
    output.extend(blended.astype(int))
    #print(str(blended.astype(int).shape[0]))
def Modeller(X_train, X_test, Y_train, Y_test, dt_, params, epochs):
    """Fit six models (RF, GBR, ETR, XGB, LightGBM, RNN) and return a
    timestamp-indexed DataFrame of their test-set projections plus the
    row-wise average.

    Relies on module-level names: lgb, RNN, forecast_window, price, and the
    sklearn/xgboost classes imported elsewhere.

    :param dt_:    timestamps aligned with X_test rows
    :param params: shared hyper-parameter dict -- MUTATED in place below
    :param epochs: epochs passed through to RNN()
    """
    # required by LGBM
    train_data = lgb.Dataset(X_train, Y_train)
    valid_data = lgb.Dataset(X_test, Y_test)
    # NOTE(review): this chain scales min_child_samples / n_estimators by the
    # training-set size, but each condition has the form
    # (rows < p//k) or (rows > p//(k+1)), which is true for almost any value,
    # so the FIRST branch nearly always fires and the rest are dead code.
    # Looks like a bug (perhaps `and` was intended) -- confirm before use.
    if X_train.shape[0] < params['min_child_samples'] // 2 or X_train.shape[
            0] > params['min_child_samples'] // 3:
        params['min_child_samples'] //= 100
        params['n_estimators'] //= 1
    elif X_train.shape[0] < params['min_child_samples'] // 4 or X_train.shape[
            0] > params['min_child_samples'] // 5:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 4
    elif X_train.shape[0] < params['min_child_samples'] // 5 or X_train.shape[
            0] > params['min_child_samples'] // 6:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 5
    elif X_train.shape[0] < params['min_child_samples'] // 7 or X_train.shape[
            0] > params['min_child_samples'] // 8:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 5
    elif X_train.shape[0] < params['min_child_samples'] // 8 or X_train.shape[
            0] > params['min_child_samples'] // 9:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 10 or X_train.shape[
            0] > params['min_child_samples'] // 11:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 12 or X_train.shape[
            0] > params['min_child_samples'] // 13:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 14 or X_train.shape[
            0] > params['min_child_samples'] // 14:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 15 or X_train.shape[
            0] > params['min_child_samples'] // 16:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 17:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    else:
        # No-op expressions: leave params unchanged (kept as in original)
        params['min_child_samples']
        params['n_estimators']
    Regress1 = RandomForestRegressor(max_depth=params['max_depth'],
                                     random_state=params['random_state'],
                                     n_estimators=params['n_estimators'])
    Regress2 = GradientBoostingRegressor(learning_rate=params['learning_rate'],
                                         loss=params['loss'],
                                         n_estimators=params['n_estimators'])
    Regress3 = ExtraTreesRegressor(max_depth=params['max_depth'],
                                   random_state=params['random_state'],
                                   n_estimators=params['n_estimators'])
    Regress4 = XGBRegressor(max_depth=params['max_depth'],
                            n_estimators=params['n_estimators'],
                            min_child_weight=params['min_child_weight'],
                            colsample_bytree=params['colsample_bytree'],
                            subsample=params['subsample'],
                            eta=params['eta'],
                            seed=params['seed'])
    Regress1.fit(X_train, Y_train)
    Regress2.fit(X_train, Y_train)
    Regress3.fit(X_train, Y_train)
    Regress4.fit(X_train, Y_train, eval_metric="rmse")
    print('Parameter value: {}\nN_estimators:{}'.format(
        params['min_child_samples'], params['n_estimators']))
    Regress5 = lgb.train(params, train_data,
                         valid_sets=[train_data, valid_data],
                         num_boost_round=2500)
    Predic_ = Regress1.predict(X_test)
    Predic_2 = Regress2.predict(X_test)
    Predic_3 = Regress3.predict(X_test)
    Predic_4 = Regress4.predict(X_test)
    Predic_5 = Regress5.predict(X_test)
    # RNN() presumably returns one-element sequences per step; take element 0
    Predic_6 = [x[0] for x in RNN(forecast_window, epochs)]
    forcast_date = pd.DataFrame({
        'timestamp': dt_,
        'RandForest_{}_Projection'.format(price): Predic_,
        'GradBoost_{}_Projection'.format(price): Predic_2,
        'ExtraTrees_{}_Projection'.format(price): Predic_3,
        'XGB_{}_Projection'.format(price): Predic_4,
        'LGB_{}_Projection'.format(price): Predic_5,
        'RNN_{}_Projection'.format(price): Predic_6
    })
    # Row-wise mean across the six numeric projection columns
    forcast_date['Average_{}_Projection'.format(price)] = forcast_date.mean(
        axis=1)
    forcast_date.set_index('timestamp', inplace=True)
    return forcast_date
# Print the feature ranking print "Extra Tree Feature ranking:" for f in xrange(12): if indices[f] < 8: print "%d. %s (%f)" % (f + 1, feature_list[indices[f]],xt_importances[indices[f]]) else: print "%d. feature %d (%f)" % (f + 1, indices[f], xt_importances[indices[f]]) with open('xt_all.pkl', 'wb') as f: cPickle.dump(clf_xt, f) with open('xt_all.pkl', 'rb') as f: clf_xt = cPickle.load(f) #joblib.dump(clf_xt, 'xt.pkl', compress=9) #clf_xt = joblib.load('xt.pkl') abs_err = np.abs(bp_test - clf_xt.predict(data_test)) t1 = time.time() - t0 print "xtr sbp mean: %.2f (sd: %.2f)" % (np.mean(abs_err[:, 0]), np.std(abs_err[:, 0])), print "xtr dbp mean: %.2f (sd: %.2f)" % (np.mean(abs_err[:, 1]), np.std(abs_err[:, 1])), "took", round(t1, 2), "sec" scores = cross_val_score(clf_xt, data_train, bp_train) print "xv scores", scores.mean() print "explained_variance_score (sbp)", explained_variance_score(bp_test[:, 0], clf_xt.predict(data_test)[:, 0]) print "explained_variance_score (dbp)", explained_variance_score(bp_test[:, 1], clf_xt.predict(data_test)[:, 1]) print "r2_score (sbp)", r2_score(bp_test[:, 0], clf_xt.predict(data_test)[:, 0]) print "r2_score (dbp)", r2_score(bp_test[:, 1], clf_xt.predict(data_test)[:, 1]) print "----" ''' ## svr, need some fine tuning t0 = time.time() clf_svr = SVR(C=1.0, epsilon=0.4) clf_svr.fit(X=data_train, y=bp_train[:, 0])
def hyperopt_obj(param, feat_folder, feat_name, trial_counter): kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float) for run in range(1, config.n_runs + 1): for fold in range(1, config.n_folds + 1): rng = np.random.RandomState(2015 + 1000 * run + 10 * fold) #### all the path path = "%s/Run%d/Fold%d" % (feat_folder, run, fold) save_path = "%s/Run%d/Fold%d" % (output_path, run, fold) if not os.path.exists(save_path): os.makedirs(save_path) # feat feat_train_path = "%s/train.feat" % path feat_valid_path = "%s/valid.feat" % path # weight weight_train_path = "%s/train.feat.weight" % path weight_valid_path = "%s/valid.feat.weight" % path # info info_train_path = "%s/train.info" % path info_valid_path = "%s/valid.info" % path # cdf cdf_valid_path = "%s/valid.cdf" % path # raw prediction path (rank) raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % ( save_path, feat_name, trial_counter) rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % ( save_path, feat_name, trial_counter) ## load feat X_train, labels_train = load_svmlight_file(feat_train_path) X_valid, labels_valid = load_svmlight_file(feat_valid_path) if X_valid.shape[1] < X_train.shape[1]: X_valid = hstack([ X_valid, np.zeros((X_valid.shape[0], X_train.shape[1] - X_valid.shape[1])) ]) elif X_valid.shape[1] > X_train.shape[1]: X_train = hstack([ X_train, np.zeros((X_train.shape[0], X_valid.shape[1] - X_train.shape[1])) ]) # ??? why augment the features here? 
X_train = X_train.tocsr() X_valid = X_valid.tocsr() ## load weight weight_train = np.loadtxt(weight_train_path, dtype=float) weight_valid = np.loadtxt(weight_valid_path, dtype=float) ## load valid info info_train = pd.read_csv(info_train_path) numTrain = info_train.shape[0] info_valid = pd.read_csv(info_valid_path) numValid = info_valid.shape[0] Y_valid = info_valid["median_relevance"] ## load cdf cdf_valid = np.loadtxt(cdf_valid_path, dtype=float) ## make evalerror func evalerror_regrank_valid = lambda preds, dtrain: evalerror_regrank_cdf( preds, dtrain, cdf_valid) evalerror_softmax_valid = lambda preds, dtrain: evalerror_softmax_cdf( preds, dtrain, cdf_valid) evalerror_softkappa_valid = lambda preds, dtrain: evalerror_softkappa_cdf( preds, dtrain, cdf_valid) evalerror_ebc_valid = lambda preds, dtrain: evalerror_ebc_cdf( preds, dtrain, cdf_valid, ebc_hard_threshold) evalerror_cocr_valid = lambda preds, dtrain: evalerror_cocr_cdf( preds, dtrain, cdf_valid) ############## ## Training ## ############## ## you can use bagging to stabilize the predictions preds_bagging = np.zeros((numValid, bagging_size), dtype=float) for n in range(bagging_size): if bootstrap_replacement: sampleSize = int(numTrain * bootstrap_ratio) index_base = rng.randint(numTrain, size=sampleSize) index_meta = [ i for i in range(numTrain) if i not in index_base ] else: randnum = rng.uniform(size=numTrain) index_base = [ i for i in range(numTrain) if randnum[i] < bootstrap_ratio ] index_meta = [ i for i in range(numTrain) if randnum[i] >= bootstrap_ratio ] if param.has_key("booster"): dvalid_base = xgb.DMatrix(X_valid, label=labels_valid, weight=weight_valid) dtrain_base = xgb.DMatrix(X_train[index_base], label=labels_train[index_base], weight=weight_train[index_base]) watchlist = [] if verbose_level >= 2: watchlist = [(dtrain_base, 'train'), (dvalid_base, 'valid')] ## various models if param["task"] in ["regression", "ranking"]: ## regression & pairwise ranking with xgboost bst = xgb.train(param, 
dtrain_base, param['num_round'], watchlist, feval=evalerror_regrank_valid) pred = bst.predict(dvalid_base) elif param["task"] in ["softmax"]: ## softmax regression with xgboost bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, feval=evalerror_softmax_valid) pred = bst.predict(dvalid_base) w = np.asarray(range(1, numOfClass + 1)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param["task"] in ["softkappa"]: ## softkappa with xgboost obj = lambda preds, dtrain: softkappaObj( preds, dtrain, hess_scale=param['hess_scale']) bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_softkappa_valid) pred = softmax(bst.predict(dvalid_base)) w = np.asarray(range(1, numOfClass + 1)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param["task"] in ["ebc"]: ## ebc with xgboost obj = lambda preds, dtrain: ebcObj(preds, dtrain) bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_ebc_valid) pred = sigmoid(bst.predict(dvalid_base)) pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold) elif param["task"] in ["cocr"]: ## cocr with xgboost obj = lambda preds, dtrain: cocrObj(preds, dtrain) bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_cocr_valid) pred = bst.predict(dvalid_base) pred = applyCOCRRule(pred) elif param['task'] == "reg_skl_rf": ## regression with sklearn random forest regressor rf = RandomForestRegressor( n_estimators=param['n_estimators'], max_features=param['max_features'], n_jobs=param['n_jobs'], random_state=param['random_state']) rf.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = rf.predict(X_valid) elif param['task'] == "reg_skl_etr": ## regression with sklearn extra trees regressor etr = ExtraTreesRegressor( n_estimators=param['n_estimators'], max_features=param['max_features'], n_jobs=param['n_jobs'], random_state=param['random_state']) 
etr.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = etr.predict(X_valid) elif param['task'] == "reg_skl_gbm": ## regression with sklearn gradient boosting regressor gbm = GradientBoostingRegressor( n_estimators=param['n_estimators'], max_features=param['max_features'], learning_rate=param['learning_rate'], max_depth=param['max_depth'], subsample=param['subsample'], random_state=param['random_state']) gbm.fit(X_train.toarray()[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = gbm.predict(X_valid.toarray()) elif param['task'] == "clf_skl_lr": ## classification with sklearn logistic regression lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5, C=param['C'], fit_intercept=True, intercept_scaling=1.0, class_weight='auto', random_state=param['random_state']) lr.fit(X_train[index_base], labels_train[index_base] + 1) pred = lr.predict_proba(X_valid) w = np.asarray(range(1, numOfClass + 1)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param['task'] == "reg_skl_svr": ## regression with sklearn support vector regression X_train, X_valid = X_train.toarray(), X_valid.toarray() scaler = StandardScaler() X_train[index_base] = scaler.fit_transform( X_train[index_base]) X_valid = scaler.transform(X_valid) svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'], degree=param['degree'], kernel=param['kernel']) svr.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = svr.predict(X_valid) elif param['task'] == "reg_skl_ridge": ## regression with sklearn ridge regression ridge = Ridge(alpha=param["alpha"], normalize=True) ridge.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = ridge.predict(X_valid) elif param['task'] == "reg_skl_lasso": ## regression with sklearn lasso lasso = Lasso(alpha=param["alpha"], normalize=True) lasso.fit(X_train[index_base], 
labels_train[index_base] + 1) pred = lasso.predict(X_valid) elif param['task'] == 'reg_libfm': ## regression with factorization machine (libfm) ## to array X_train = X_train.toarray() X_valid = X_valid.toarray() ## scale scaler = StandardScaler() X_train[index_base] = scaler.fit_transform( X_train[index_base]) X_valid = scaler.transform(X_valid) ## dump feat dump_svmlight_file(X_train[index_base], labels_train[index_base], feat_train_path + ".tmp") dump_svmlight_file(X_valid, labels_valid, feat_valid_path + ".tmp") ## train fm cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \ libfm_exe, feat_train_path+".tmp", feat_valid_path+".tmp", raw_pred_valid_path, \ param['dim'], param['iter']) os.system(cmd) os.remove(feat_train_path + ".tmp") os.remove(feat_valid_path + ".tmp") ## extract libfm prediction pred = np.loadtxt(raw_pred_valid_path, dtype=float) ## labels are in [0,1,2,3] pred += 1 elif param['task'] == "reg_keras_dnn": ## regression with keras' deep neural networks model = Sequential() ## input layer model.add(Dropout(param["input_dropout"])) ## hidden layers first = True hidden_layers = param['hidden_layers'] while hidden_layers > 0: if first: dim = X_train.shape[1] first = False else: dim = param["hidden_units"] model.add( Dense(dim, param["hidden_units"], init='glorot_uniform')) if param["batch_norm"]: model.add( BatchNormalization((param["hidden_units"], ))) if param["hidden_activation"] == "prelu": model.add(PReLU((param["hidden_units"], ))) else: model.add(Activation(param['hidden_activation'])) model.add(Dropout(param["hidden_dropout"])) hidden_layers -= 1 ## output layer model.add( Dense(param["hidden_units"], 1, init='glorot_uniform')) model.add(Activation('linear')) ## loss model.compile(loss='mean_squared_error', optimizer="adam") ## to array X_train = X_train.toarray() X_valid = X_valid.toarray() ## scale scaler = StandardScaler() X_train[index_base] = scaler.fit_transform( X_train[index_base]) X_valid = 
scaler.transform(X_valid) ## train model.fit(X_train[index_base], labels_train[index_base] + 1, nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], validation_split=0, verbose=0) ##prediction pred = model.predict(X_valid, verbose=0) pred.shape = (X_valid.shape[0], ) elif param['task'] == "reg_rgf": ## regression with regularized greedy forest (rgf) ## to array X_train, X_valid = X_train.toarray(), X_valid.toarray() train_x_fn = feat_train_path + ".x" train_y_fn = feat_train_path + ".y" valid_x_fn = feat_valid_path + ".x" valid_pred_fn = feat_valid_path + ".pred" model_fn_prefix = "rgf_model" np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t') np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t') np.savetxt(valid_x_fn, X_valid, fmt="%.6f", delimiter='\t') # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t') pars = [ "train_x_fn=", train_x_fn, "\n", "train_y_fn=", train_y_fn, "\n", #"train_w_fn=",weight_train_path,"\n", "model_fn_prefix=", model_fn_prefix, "\n", "reg_L2=", param['reg_L2'], "\n", #"reg_depth=", 1.01, "\n", "algorithm=", "RGF", "\n", "loss=", "LS", "\n", #"opt_interval=", 100, "\n", "valid_interval=", param['max_leaf_forest'], "\n", "max_leaf_forest=", param['max_leaf_forest'], "\n", "num_iteration_opt=", param['num_iteration_opt'], "\n", "num_tree_search=", param['num_tree_search'], "\n", "min_pop=", param['min_pop'], "\n", "opt_interval=", param['opt_interval'], "\n", "opt_stepsize=", param['opt_stepsize'], "\n", "NormalizeTarget" ] pars = "".join([str(p) for p in pars]) rfg_setting_train = "./rfg_setting_train" with open(rfg_setting_train + ".inp", "wb") as f: f.write(pars) ## train fm cmd = "perl %s %s train %s >> rgf.log" % ( call_exe, rgf_exe, rfg_setting_train) #print cmd os.system(cmd) model_fn = model_fn_prefix + "-01" pars = [ "test_x_fn=", valid_x_fn, "\n", "model_fn=", model_fn, "\n", "prediction_fn=", valid_pred_fn ] pars = "".join([str(p) for p in pars]) rfg_setting_valid = 
"./rfg_setting_valid" with open(rfg_setting_valid + ".inp", "wb") as f: f.write(pars) cmd = "perl %s %s predict %s >> rgf.log" % ( call_exe, rgf_exe, rfg_setting_valid) #print cmd os.system(cmd) pred = np.loadtxt(valid_pred_fn, dtype=float) ## weighted averageing over different models pred_valid = pred ## this bagging iteration preds_bagging[:, n] = pred_valid pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1) # why do we need to do this average over different bagging sample?2 pred_rank = pred_raw.argsort().argsort() pred_score, cutoff = getScore(pred_rank, cdf_valid, valid=True) kappa_valid = quadratic_weighted_kappa(pred_score, Y_valid) if (n + 1) != bagging_size: print( " {:>3} {:>3} {:>3} {:>6} {} x {}" .format(run, fold, n + 1, np.round(kappa_valid, 6), X_train.shape[0], X_train.shape[1])) else: print( " {:>3} {:>3} {:>3} {:>8} {} x {}" .format(run, fold, n + 1, np.round(kappa_valid, 6), X_train.shape[0], X_train.shape[1])) kappa_cv[run - 1, fold - 1] = kappa_valid ## save this prediction dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw}) dfPred.to_csv(raw_pred_valid_path, index=False, header=True, columns=["target", "prediction"]) ## save this prediction dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_rank}) dfPred.to_csv(rank_pred_valid_path, index=False, header=True, columns=["target", "prediction"]) kappa_cv_mean = np.mean(kappa_cv) kappa_cv_std = np.std(kappa_cv) if verbose_level >= 1: print(" Mean: %.6f" % kappa_cv_mean) print(" Std: %.6f" % kappa_cv_std) #################### #### Retraining #### #################### #### all the path path = "%s/All" % (feat_folder) save_path = "%s/All" % output_path subm_path = "%s/Subm" % output_path if not os.path.exists(save_path): os.makedirs(save_path) if not os.path.exists(subm_path): os.makedirs(subm_path) # feat feat_train_path = "%s/train.feat" % path feat_test_path = "%s/test.feat" % path # weight weight_train_path = "%s/train.feat.weight" % path # info info_train_path = 
"%s/train.info" % path info_test_path = "%s/test.info" % path # cdf cdf_test_path = "%s/test.cdf" % path # raw prediction path (rank) raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % ( save_path, feat_name, trial_counter) rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % ( save_path, feat_name, trial_counter) # submission path (relevance as in [1,2,3,4]) subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % ( subm_path, feat_name, trial_counter, kappa_cv_mean, kappa_cv_std) #### load data ## load feat X_train, labels_train = load_svmlight_file(feat_train_path) X_test, labels_test = load_svmlight_file(feat_test_path) if X_test.shape[1] < X_train.shape[1]: X_test = hstack([ X_test, np.zeros((X_test.shape[0], X_train.shape[1] - X_test.shape[1])) ]) elif X_test.shape[1] > X_train.shape[1]: X_train = hstack([ X_train, np.zeros((X_train.shape[0], X_test.shape[1] - X_train.shape[1])) ]) X_train = X_train.tocsr() X_test = X_test.tocsr() ## load train weight weight_train = np.loadtxt(weight_train_path, dtype=float) ## load test info info_train = pd.read_csv(info_train_path) numTrain = info_train.shape[0] info_test = pd.read_csv(info_test_path) numTest = info_test.shape[0] id_test = info_test["id"] ## load cdf cdf_test = np.loadtxt(cdf_test_path, dtype=float) ## evalerror_regrank_test = lambda preds, dtrain: evalerror_regrank_cdf( preds, dtrain, cdf_test) evalerror_softmax_test = lambda preds, dtrain: evalerror_softmax_cdf( preds, dtrain, cdf_test) evalerror_softkappa_test = lambda preds, dtrain: evalerror_softkappa_cdf( preds, dtrain, cdf_test) evalerror_ebc_test = lambda preds, dtrain: evalerror_ebc_cdf( preds, dtrain, cdf_test, ebc_hard_threshold) evalerror_cocr_test = lambda preds, dtrain: evalerror_cocr_cdf( preds, dtrain, cdf_test) ## bagging preds_bagging = np.zeros((numTest, bagging_size), dtype=float) for n in range(bagging_size): if bootstrap_replacement: sampleSize = int(numTrain * bootstrap_ratio) #index_meta = rng.randint(numTrain, 
size=sampleSize) #index_base = [i for i in range(numTrain) if i not in index_meta] index_base = rng.randint(numTrain, size=sampleSize) index_meta = [i for i in range(numTrain) if i not in index_base] else: randnum = rng.uniform(size=numTrain) index_base = [ i for i in range(numTrain) if randnum[i] < bootstrap_ratio ] index_meta = [ i for i in range(numTrain) if randnum[i] >= bootstrap_ratio ] if param.has_key("booster"): dtest = xgb.DMatrix(X_test, label=labels_test) dtrain = xgb.DMatrix(X_train[index_base], label=labels_train[index_base], weight=weight_train[index_base]) watchlist = [] if verbose_level >= 2: watchlist = [(dtrain, 'train')] ## train if param["task"] in ["regression", "ranking"]: bst = xgb.train(param, dtrain, param['num_round'], watchlist, feval=evalerror_regrank_test) pred = bst.predict(dtest) elif param["task"] in ["softmax"]: bst = xgb.train(param, dtrain, param['num_round'], watchlist, feval=evalerror_softmax_test) pred = bst.predict(dtest) w = np.asarray(range(1, numOfClass + 1)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param["task"] in ["softkappa"]: obj = lambda preds, dtrain: softkappaObj( preds, dtrain, hess_scale=param['hess_scale']) bst = xgb.train(param, dtrain, param['num_round'], watchlist, obj=obj, feval=evalerror_softkappa_test) pred = softmax(bst.predict(dtest)) w = np.asarray(range(1, numOfClass + 1)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param["task"] in ["ebc"]: obj = lambda preds, dtrain: ebcObj(preds, dtrain) bst = xgb.train(param, dtrain, param['num_round'], watchlist, obj=obj, feval=evalerror_ebc_test) pred = sigmoid(bst.predict(dtest)) pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold) elif param["task"] in ["cocr"]: obj = lambda preds, dtrain: cocrObj(preds, dtrain) bst = xgb.train(param, dtrain, param['num_round'], watchlist, obj=obj, feval=evalerror_cocr_test) pred = bst.predict(dtest) pred = applyCOCRRule(pred) elif param['task'] == "reg_skl_rf": ## random 
forest regressor rf = RandomForestRegressor(n_estimators=param['n_estimators'], max_features=param['max_features'], n_jobs=param['n_jobs'], random_state=param['random_state']) rf.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = rf.predict(X_test) elif param['task'] == "reg_skl_etr": ## extra trees regressor etr = ExtraTreesRegressor(n_estimators=param['n_estimators'], max_features=param['max_features'], n_jobs=param['n_jobs'], random_state=param['random_state']) etr.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = etr.predict(X_test) elif param['task'] == "reg_skl_gbm": ## gradient boosting regressor gbm = GradientBoostingRegressor( n_estimators=param['n_estimators'], max_features=param['max_features'], learning_rate=param['learning_rate'], max_depth=param['max_depth'], subsample=param['subsample'], random_state=param['random_state']) gbm.fit(X_train.toarray()[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = gbm.predict(X_test.toarray()) elif param['task'] == "clf_skl_lr": lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5, C=param['C'], fit_intercept=True, intercept_scaling=1.0, class_weight='auto', random_state=param['random_state']) lr.fit(X_train[index_base], labels_train[index_base] + 1) pred = lr.predict_proba(X_test) w = np.asarray(range(1, numOfClass + 1)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param['task'] == "reg_skl_svr": ## regression with sklearn support vector regression X_train, X_test = X_train.toarray(), X_test.toarray() scaler = StandardScaler() X_train[index_base] = scaler.fit_transform(X_train[index_base]) X_test = scaler.transform(X_test) svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'], degree=param['degree'], kernel=param['kernel']) svr.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = 
svr.predict(X_test) elif param['task'] == "reg_skl_ridge": ridge = Ridge(alpha=param["alpha"], normalize=True) ridge.fit(X_train[index_base], labels_train[index_base] + 1, sample_weight=weight_train[index_base]) pred = ridge.predict(X_test) elif param['task'] == "reg_skl_lasso": lasso = Lasso(alpha=param["alpha"], normalize=True) lasso.fit(X_train[index_base], labels_train[index_base] + 1) pred = lasso.predict(X_test) elif param['task'] == 'reg_libfm': ## to array X_train, X_test = X_train.toarray(), X_test.toarray() ## scale scaler = StandardScaler() X_train[index_base] = scaler.fit_transform(X_train[index_base]) X_test = scaler.transform(X_test) ## dump feat dump_svmlight_file(X_train[index_base], labels_train[index_base], feat_train_path + ".tmp") dump_svmlight_file(X_test, labels_test, feat_test_path + ".tmp") ## train fm cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \ libfm_exe, feat_train_path+".tmp", feat_test_path+".tmp", raw_pred_test_path, \ param['dim'], param['iter']) os.system(cmd) os.remove(feat_train_path + ".tmp") os.remove(feat_test_path + ".tmp") ## extract libfm prediction pred = np.loadtxt(raw_pred_test_path, dtype=float) ## labels are in [0,1,2,3] pred += 1 elif param['task'] == "reg_keras_dnn": ## regression with keras deep neural networks model = Sequential() ## input layer model.add(Dropout(param["input_dropout"])) ## hidden layers first = True hidden_layers = param['hidden_layers'] while hidden_layers > 0: if first: dim = X_train.shape[1] first = False else: dim = param["hidden_units"] model.add( Dense(dim, param["hidden_units"], init='glorot_uniform')) if param["batch_norm"]: model.add(BatchNormalization((param["hidden_units"], ))) if param["hidden_activation"] == "prelu": model.add(PReLU((param["hidden_units"], ))) else: model.add(Activation(param['hidden_activation'])) model.add(Dropout(param["hidden_dropout"])) hidden_layers -= 1 ## output layer model.add(Dense(param["hidden_units"], 1, 
init='glorot_uniform')) model.add(Activation('linear')) ## loss model.compile(loss='mean_squared_error', optimizer="adam") ## to array X_train = X_train.toarray() X_test = X_test.toarray() ## scale scaler = StandardScaler() X_train[index_base] = scaler.fit_transform(X_train[index_base]) X_test = scaler.transform(X_test) ## train model.fit(X_train[index_base], labels_train[index_base] + 1, nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], verbose=0) ##prediction pred = model.predict(X_test, verbose=0) pred.shape = (X_test.shape[0], ) elif param['task'] == "reg_rgf": ## to array X_train, X_test = X_train.toarray(), X_test.toarray() train_x_fn = feat_train_path + ".x" train_y_fn = feat_train_path + ".y" test_x_fn = feat_test_path + ".x" test_pred_fn = feat_test_path + ".pred" model_fn_prefix = "rgf_model" np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t') np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t') np.savetxt(test_x_fn, X_test, fmt="%.6f", delimiter='\t') # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t') pars = [ "train_x_fn=", train_x_fn, "\n", "train_y_fn=", train_y_fn, "\n", #"train_w_fn=",weight_train_path,"\n", "model_fn_prefix=", model_fn_prefix, "\n", "reg_L2=", param['reg_L2'], "\n", #"reg_depth=", 1.01, "\n", "algorithm=", "RGF", "\n", "loss=", "LS", "\n", "test_interval=", param['max_leaf_forest'], "\n", "max_leaf_forest=", param['max_leaf_forest'], "\n", "num_iteration_opt=", param['num_iteration_opt'], "\n", "num_tree_search=", param['num_tree_search'], "\n", "min_pop=", param['min_pop'], "\n", "opt_interval=", param['opt_interval'], "\n", "opt_stepsize=", param['opt_stepsize'], "\n", "NormalizeTarget" ] pars = "".join([str(p) for p in pars]) rfg_setting_train = "./rfg_setting_train" with open(rfg_setting_train + ".inp", "wb") as f: f.write(pars) ## train fm cmd = "perl %s %s train %s >> rgf.log" % (call_exe, rgf_exe, rfg_setting_train) #print cmd os.system(cmd) model_fn = 
model_fn_prefix + "-01" pars = [ "test_x_fn=", test_x_fn, "\n", "model_fn=", model_fn, "\n", "prediction_fn=", test_pred_fn ] pars = "".join([str(p) for p in pars]) rfg_setting_test = "./rfg_setting_test" with open(rfg_setting_test + ".inp", "wb") as f: f.write(pars) cmd = "perl %s %s predict %s >> rgf.log" % (call_exe, rgf_exe, rfg_setting_test) #print cmd os.system(cmd) pred = np.loadtxt(test_pred_fn, dtype=float) ## weighted averageing over different models pred_test = pred preds_bagging[:, n] = pred_test pred_raw = np.mean(preds_bagging, axis=1) pred_rank = pred_raw.argsort().argsort() # ## write output = pd.DataFrame({"id": id_test, "prediction": pred_raw}) output.to_csv(raw_pred_test_path, index=False) ## write output = pd.DataFrame({"id": id_test, "prediction": pred_rank}) output.to_csv(rank_pred_test_path, index=False) ## write score pred_score = getScore(pred, cdf_test) output = pd.DataFrame({"id": id_test, "prediction": pred_score}) output.to_csv(subm_path, index=False) #""" return kappa_cv_mean, kappa_cv_std
# --- Continuation of a blending/stacking script: `clf_RF`, `pte`, `pth`, `ID`,
# --- `x_train`, `y_train` and `feature_names` are defined earlier in the file.
# Targets appear to have been log(1+Sales)-transformed upstream; exp(.)-1
# inverts that for the submission values — TODO confirm against the fit code.
preds_RF_py = np.exp(clf_RF.predict(pte[feature_names]))-1
RF_py_sub = pd.DataFrame({'Id':ID.Id, 'Sales':preds_RF_py})
RF_py_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/RF_subs.csv", index = False)

# Extremely Randomized Trees #
reg_ET = ExtraTreesRegressor(n_estimators = 1000, max_features = 0.75, max_depth = 8, min_samples_split = 12, n_jobs = -1, random_state = 737, verbose = 2)
reg_ET = reg_ET.fit(x_train, y_train)
# Holdout-set (pth) predictions, saved side-by-side with actuals for stacking.
preds_h = reg_ET.predict(pth[feature_names])
ET_holdout = pd.DataFrame({'Date':pth.Date, 'Dow':pth.DayOfWeek, 'Actual':np.exp(pth.Sales)-1, 'Predicted':np.exp(preds_h)-1})
ET_holdout.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_holdout.csv", index = False)
# Test-set (pte) predictions -> submission file.
preds_ET = np.exp(reg_ET.predict(pte[feature_names]))-1
ET_sub = pd.DataFrame({'Id':ID.Id, 'Sales':preds_ET})
ET_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_subs.csv", index = False)

# SVR with RBF kernel #
svr_rbf = SVR(kernel = 'rbf', C = 1e4, gamma = 0.05, epsilon = 0.03, max_iter = 10000)
svr_rbf = svr_rbf.fit(x_train, y_train)
def train_for_atom(atom, dataset_path, pred_save_path):
    '''
    Build and persist the full model stack (first-level R0, second-level
    R1 without SHIFTY++ and R2 with SHIFTY++) for a single atom type.

    args:
        atom = the atom that the models are trained for (str)
        dataset_path = the path to which datasets can be found (expected to
            have three .csv files under the path, for train/validation/test)
        pred_save_path = the path for saving all the predictions for analysis

    Side effects: writes prediction CSVs under pred_save_path, and (unless
    DEBUG) serialized models under pipelines/.
    '''
    print(" ====== Training model for:", atom, "under folder", dataset_path,
          " ====== ")
    # Columns withheld from the first-level model's feature matrix.
    held_out_cols = ["SHIFTY_" + atom, "MAX_IDENTITY", "AVG_IDENTITY"]

    features, targets, metas = prep_data(
        [dataset_path + "train_['%s'].csv" % atom,
         dataset_path + "val_['%s'].csv" % atom],
        atom, "train", filter_outlier=True, notnull=True)
    features_test, targets_test, metas_test = prep_data(
        dataset_path + "test_['%s'].csv" % atom, atom, "test",
        filter_outlier=False, notnull=False)

    # Out-of-fold first-level (R0) predictions: one K-fold job per process.
    print("Training R0...")
    splitter = KFold(n_splits=K)
    params = [
        [features.drop(held_out_cols, axis=1), targets, train_idx, test_idx]
        for train_idx, test_idx in splitter.split(range(len(features)))
    ]
    pool = multiprocessing.Pool(processes=K)
    fold_preds = pool.starmap(train_with_test, params)

    # Stitch the K out-of-fold prediction lists back into a single Series
    # indexed by the rows each fold actually predicted on.
    all_test_idx = []
    all_first_preds = []
    for fold in range(K):
        all_test_idx.extend(params[fold][-1])
        all_first_preds.extend(fold_preds[fold])
    first_preds = pd.Series(all_first_preds, index=all_test_idx)
    features["FIRST_PRED"] = first_preds
    evaluate(first_preds, targets, metas,
             pred_save_path + "first_pred_%s.csv" % atom)

    # Refit R0 on all training rows for use on the test set.
    model1 = ExtraTreesRegressor(bootstrap=False, max_features=0.3,
                                 min_samples_leaf=3, min_samples_split=15,
                                 n_estimators=500)
    model1.fit(features.drop(held_out_cols + ["FIRST_PRED"], axis=1),
               targets.values.ravel())
    # First-level predictions become a feature column of the test set.
    features_test["FIRST_PRED"] = model1.predict(
        features_test.drop(held_out_cols, axis=1))
    if not DEBUG:
        joblib.dump(model1, "pipelines/%s_model1.sav" % atom)

    # Second-level model without SHIFTY++ columns (R1).
    print("Training second level model without SHIFTY++ with %d examples..."
          % len(features))
    model_2 = RandomForestRegressor(bootstrap=False, max_features=0.5,
                                    min_samples_leaf=7, min_samples_split=12,
                                    n_estimators=500)
    model_2.fit(features.drop(held_out_cols, axis=1), targets.values.ravel())
    pred_2 = model_2.predict(
        features_test.drop(held_out_cols, axis=1)).ravel()
    evaluate(pred_2, targets_test, metas_test,
             pred_save_path + "second_pred_%s_nosy.csv" % atom)
    if not DEBUG:
        joblib.dump(model_2, "pipelines/%s_model2_ny.sav" % atom)

    # Second-level model with SHIFTY++ predictions (R2) — trained only on
    # rows where SHIFTY_<atom> is present; its predictions overwrite R1's
    # on the corresponding test rows.
    model_21 = RandomForestRegressor(bootstrap=False, max_features=0.5,
                                     min_samples_leaf=7, min_samples_split=12,
                                     n_estimators=500)
    not_null_idx = features["SHIFTY_" + atom].notnull()
    not_null_idx_test = features_test["SHIFTY_" + atom].notnull()
    print("Training second level model with SHIFTY++ with %d examples..."
          % np.sum(not_null_idx))
    model_21.fit(features[not_null_idx], targets[not_null_idx].values.ravel())
    pred_21 = pred_2.copy()
    pred_21[not_null_idx_test] = model_21.predict(
        features_test[not_null_idx_test])
    evaluate(pred_21, targets_test, metas_test,
             pred_save_path + "second_pred_%s_withsy.csv" % atom)
    if not DEBUG:
        joblib.dump(model_21, "pipelines/%s_model2_wy.sav" % atom)
    print("Finish for", atom)
def do_validation(data_path, steps=10):
    """Cross-validate a zoo of regressors/classifiers on driver trip data.

    For each of the first `steps` drivers, shuffles the merged dataset,
    fits every model on the first 100 rows and scores ROC AUC on the next
    300, then prints each model's AUC averaged over all drivers.

    Legacy Python 2 code (`print` statements, `xrange`).  Relies on
    module-level helpers `initialize` and `create_merged_dataset`.

    Args:
        data_path: directory handed to `initialize` to enumerate driver files.
        steps: number of drivers (files) to evaluate.
    """
    allfiles = initialize(data_path)
    # Tree / ensemble / neighbour models: fit on the full feature set.
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    # Linear models: fit on a reduced feature set (see second `to_drop`).
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    # NOTE(review): `probability` was accepted by SVR only in old
    # scikit-learn releases; modern versions reject it — confirm the
    # pinned sklearn version before reusing this code.
    svr = SVR(kernel="linear", probability=True)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)
    # Running AUC totals; divided by `steps` when printed at the end.
    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0
    # RBM features piped into logistic regression (the "Neural Networks" row).
    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    for i in xrange(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y
        # Shuffle DF so the 100/300 split below is random per driver.
        df = df.reindex(np.random.permutation(df.index))
        train = df[:100]
        label = train['label']
        del train['label']
        test = df[100:400]
        Y = test['label']
        del test['label']
        # (An earlier variant also dropped the per-second speed columns
        # speed1..speed80 here.)
        to_drop = ['driver', 'trip']
        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr)
        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)
        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)
        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)
        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)
        # Linear models.
        to_drop = [
            'driver', 'trip', 'distance', 'sd_acceleration', 'final_angle',
            'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
            'sd_avg_speed', 'mean_inst_speed', 'points'
        ]
        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        logit.fit(X_train, label)
        # Score with P(label == 1), the second predict_proba column.
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)
        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)
        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)
        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)
        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)
        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)
    print ""
    print "GBM:", gbm_metrics / steps
    print "AdaBoost:", ada_metrics / steps
    print "Extra Trees:", etree_metrics / steps
    print "RF:", rf_metrics / steps
    print "KN:", kn_metrics / steps
    print ""
    print "Logit:", logit_metrics / steps
    print "SVR:", svr_metrics / steps
    print "Ridge:", ridge_metrics / steps
    print "BayesianRidge:", bridge_metrics / steps
    print "Elastic Net:", enet_metrics / steps
    print "Neural Networks:", nnet_metrics / steps
    print ""
#print('manual rescaledX\n', manual_scaled[1:5,0]) #Save Scaler scaler_filename = variable + '_scaler.sav' dump(scaler, scaler_filename) #model = KNeighborsRegressor(n_neighbors = 3) #model = LinearRegression() #model = DecisionTreeRegressor() #model = GradientBoostingRegressor() model = ExtraTreesRegressor() model.fit(rescaledX, Y_train) # Transform the validation dataset rescaledValidationX = scaler.transform(X_validation) predictions = model.predict(rescaledValidationX) print('\n', 'min: ', numpy.amin(Y_validation), 'max: ', numpy.amax(Y_validation)) print('\n', '0.5 of data in range: ', numpy.percentile(Y_validation, 25), ' - ', numpy.percentile(Y_validation, 75)) rmse = numpy.sqrt(mean_squared_error(Y_validation, predictions)) print('\n', 'RMSE: ', numpy.sqrt(mean_squared_error(Y_validation, predictions)), ' Median percentage %: ', 100 * rmse / numpy.percentile(Y_validation, 50)) print('\nModel score: ', model.score(rescaledValidationX, Y_validation)) # Save the model to disk filename = variable + '_model.sav'