def mul_dtree(X, Y2):
    forest = ExtraTreesRegressor(n_estimators=5, compute_importances=True, random_state=0)
    forest.fit(X[:200], Y2[:200])
    forest.predict(X[200:])
    print(Y2[200:])
def main(): for ind in range(1, 15+1): print "TrainingSet/ACT%d_competition_training.csv" % ind #read in data, parse into training and target sets cols, molecules1, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind) target = np.array( [x[0] for x in train] ) #load train train = filter_cols(train, cols, "../selected/cor9/selected_%d.txt" % ind) train = np.array(train) #print("Train: ", len(train), " cols:", len(train[0])) # seeds used: orig=1279, cor8=1278, cor9=1277 cfr = ExtraTreesRegressor(n_estimators=2000, max_features=(len(train[0])//3), n_jobs=8, random_state=1277) #min_samples_leaf=2, min_samples_split=2, random_state=1279) rf = cfr.fit(train, target) #predict train pred = rf.predict(train) write_file("erStacking/cor9/er_stacking_%d.csv" % ind, molecules1, pred) #load test cols, molecules2, test = read_data("../TestSet/ACT%d_competition_test.csv" % ind) test = filter_cols(test, cols, "../selected/cor9/selected_%d.txt" % ind) test = np.array(test) #predict test pred = rf.predict(test) write_file("erStacking/test/cor9/er_submission_%d.csv" % ind, molecules2, pred)
def predict_with_one(X, out_file_name):
    n_samples, n_features = X.shape
    iter_num = 3
    div = ShuffleSplit(n_samples, n_iter=iter_num, test_size=0.2, random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))
    t = time()
    round_num = 0
    for train, test in div:
        round_num += 1
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            for j in range(n_features):
                X_train = train_samples[:, i:i+1]
                X_test = test_samples[:, i:i+1]
                y_train = train_samples[:, j]
                y_test = test_samples[:, j]
                # for i in range(len(fl)):
                #     for j in range(len(fl)):
                #         if fl[j][1]-fl[j][0] != 1:
                #             continue
                #         X_train = train_samples[:, fl[i][0]:fl[i][1]]
                #         X_test = test_samples[:, fl[i][0]:fl[i][1]]
                #         y_train = train_samples[:, fl[j][0]]
                #         y_test = test_samples[:, fl[j][0]]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time() - t)
    np.savetxt(os.path.join(CODE_PATH, out_file_name), score_matrix / iter_num,
               fmt='%.3f', delimiter=',')
def fit(self, X, y, weights=None, **kwargs):
    if weights is None:
        weights = np.ones(y.shape[0])
    data = np.hstack((y.reshape(y.shape[0], 1), X))
    S = wcov(data, weights)
    corr = wcorr(data, weights)
    wsd = np.sqrt(S.diagonal())
    ExtraTrees = ExtraTreesRegressor(**kwargs)
    ExtraTrees.fit(X, y, sample_weight=weights)
    Rsquare = (S[0, 1:].dot(np.linalg.inv(S[1:, 1:]).dot(S[1:, 0]))) / S[0, 0]
    # assign proportion of Rsquare to each covariate dep. on importance
    self.importances = ExtraTrees.feature_importances_ * Rsquare
    model = self.constrained_optimization(corr)
    if self.fit_intercept:
        w = np.diagflat(weights / np.sum(weights), k=0)
        wmean = np.sum(w.dot(data), axis=0)
        self.intercept_ = wmean[0] - wsd[0] * np.sum(wmean[1:] * model.x / wsd[1:])
    self.coef_ = wsd[0] * model.x / wsd[1:]
    return self
def fit(self, X, y, **kwargs):
    for key, value in kwargs.items():
        if key in self.INITPARAMS.keys():
            self.INITPARAMS[key] = value
    model = ExtraTreesRegressor(**self.INITPARAMS)
    model.fit(X, y)
    self.model = model
def main():
    for ind in range(1, 15 + 1):
        # for ind in [3,4,5,7,9,11,12,13,14,15]:  # no 1,2,6,8,10
        print("TrainingSet/ACT%d_competition_training.csv" % ind)
        # read in data, parse into training and target sets
        cols, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind)
        target = np.array([x[0] for x in train])
        train = filter_cols(train, cols, "../selected/selected_%d.txt" % ind)
        # print("Train: ", len(train), " cols:", len(train[0]))
        train = np.array(train)
        # in this case we'll use extremely randomized trees, but this could be any regressor
        cfr = ExtraTreesRegressor(n_estimators=1000,
                                  max_features=(len(train[0]) // 3),
                                  n_jobs=8, random_state=1279)
        # simple K-Fold cross validation, 10 folds
        cv = cross_validation.KFold(len(train), k=10, indices=False, shuffle=True)
        # iterate through the training and test cross validation segments and
        # run the regressor on each one, aggregating the results into a list
        results = []
        for traincv, testcv in cv:
            ft = cfr.fit(train[traincv], target[traincv])
            score = ft.score(train[testcv], target[testcv])
            results.append(score)
            print("\tFold %d: %f" % (len(results), score))
        # print out the mean of the cross-validated results
        print("Results: " + str(np.array(results).mean()))
def build_models(self):
    self.remove_columns([
        "institute_latitude", "institute_longitude", "institute_state",
        "institute_country", "var10", "var11", "var12", "var13", "var14",
        "var15", "instructor_past_performance",
        "instructor_association_industry_expert", "secondary_area", "var24",
    ])
    model1 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8)
    model2 = RandomForestRegressor(n_estimators=50)
    model3 = ExtraTreesRegressor(n_estimators=50)
    model1.fit(self.X, self.y)
    model2.fit(self.X, self.y)
    model3.fit(self.X, self.y)
    return [model1, model2, model3]
def fit(self, data_train, target):
    self.target_train = target
    self.catcol = data_train.filter(like='var').columns.tolist()
    # start_gbr_tr = time.clock()
    self.gbr = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
    self.gbr.fit(data_train, self.target_train)
    self.transformed_train_gbr = self.gbr.transform(data_train, threshold="0.35*mean")
    self.gbr_tr_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
    self.gbr_tr_fit.fit(self.transformed_train_gbr, self.target_train)
    # end_gbr_tr = time.clock()
    # print >> log, "time_gbr_tr = ", end_gbr_tr - start_gbr_tr
    # start_xfr_tr = time.clock()
    self.xfr = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
    self.xfr.fit(data_train, self.target_train)
    self.transformed_train_xfr = self.xfr.transform(data_train, threshold="0.35*mean")
    self.xfr_tr_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
    self.xfr_tr_fit.fit(self.transformed_train_xfr, self.target_train)
    # end_xfr_tr = time.clock()
    # print >> log, "time_xfr_tr = ", end_xfr_tr - start_xfr_tr
    # start_gbr_cat = time.clock()
    self.gbr_cat_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
    self.gbr_cat_fit.fit(data_train[self.catcol], self.target_train)
    # end_gbr_cat = time.clock()
    # print >> log, "time_gbr_cat = ", end_gbr_cat - start_gbr_cat
    # start_xfr_cat = time.clock()
    self.xfr_cat_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
    self.xfr_cat_fit.fit(data_train[self.catcol], self.target_train)
    # end_xfr_cat = time.clock()
    # print >> log, "time_xfr_cat = ", end_xfr_cat - start_xfr_cat
    return self
def do_etrees(filename):
    df, Y = create_merged_dataset(filename)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5, random_state=SEED)
    X = df.drop(['driver', 'trip'], 1)
    etree.fit(X, Y)
    probs = etree.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200],
                         'trip': df['trip'][:200],
                         'probs': probs})
def cal_important_features(batch=10, threshold=1e-4):
    X_samples, Y_samples, scaler = dat.data_prepare('ocpm', 'lifetime_ecpm', outlier=0.05)
    tot_goot_atrs = {}
    for a in ATRS[5:]:
        tot_goot_atrs[a] = {}
    for i in np.arange(1, batch + 1):
        Ts = timeit.default_timer()
        model = ExtraTreesRegressor(n_jobs=6)
        model.fit(X_samples, Y_samples)
        print("Totally %i features." % len(model.feature_importances_))
        print("[Labels] %i categories, %i interests, %i client_names, %i auto_tags" %
              (num.categories_len, num.interests_len, num.client_names_len, num.auto_tags_len))
        good_atrs = show_important_features(model.feature_importances_, threshold)
        for a in reversed(ATRS[5:]):
            for b in good_atrs[a]:
                if b in tot_goot_atrs[a]:
                    tot_goot_atrs[a][b] += 1
                else:
                    tot_goot_atrs[a][b] = 1
        print("%i batch finished in %.1f secs." % (i, (timeit.default_timer() - Ts)))
        print("------------------------------------------------")
    # show performances
    for atr in reversed(ATRS[5:]):
        print("-------[%s]-----------------------" % atr)
        for j in np.arange(1, batch + 1):
            good_keys = [k for k, v in tot_goot_atrs[atr].items() if (v >= j)]
            print("%i keys occurs > %i times." % (len(good_keys), j))
    return tot_goot_atrs
def build_extra_tree_regressor(X_test, X_train_full, y_train_full):
    print("Building ExtraTrees regressor...")
    etr = ExtraTreesRegressor(n_estimators=500)
    etr.fit(X_train_full, y_train_full)
    etr_predict = etr.predict(X_test)
    return etr_predict
def classify(self):
    """Perform classification"""
    clf = ETRegressor(n_estimators=500, min_samples_split=5, min_samples_leaf=2)
    # pca = PCA(n_components=400)
    # self._ClassifyDriver__traindata = pca.fit_transform(self._ClassifyDriver__traindata)
    # self._ClassifyDriver__testdata = pca.transform(self._ClassifyDriver__testdata)
    # print self._ClassifyDriver__traindata.shape
    clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
    self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
def get_forest(X_names=Xs, y_names=ys, num_trees=256, data=data):
    forest = ExtraTreesRegressor(n_estimators=num_trees, n_jobs=62, bootstrap=True)
    X = data.loc[:, [i for i in X_names]]
    y = data.loc[:, [i for i in y_names]]
    start = time()
    rfr = forest.fit(X, y)
    end = time()
    return (rfr, end - start)
def reg_skl_etr(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                              max_features=param['max_features'],
                              n_jobs=param['n_jobs'],
                              random_state=param['random_state'])
    etr.fit(X_tr, y_reg_tr)
    pred = etr.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
def extra_trees_regressor(x, y, n_estimators, max_depth):
    kf = KFold(len(x), n_folds=3)
    scores = []
    for train_index, test_index in kf:
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
        clf.fit(X_train, y_train)
        scores.append(mean_squared_error(clf.predict(X_test), y_test) ** 0.5)
    return np.mean(scores)
class MyExtraTreeReg(MyRegressor):
    def __init__(self, params=dict()):
        self._params = params
        self._extree = ExtraTreesRegressor(**(self._params))

    def update_params(self, updates):
        self._params.update(updates)
        self._extree = ExtraTreesRegressor(**(self._params))

    def fit(self, Xtrain, ytrain):
        self._extree.fit(Xtrain, ytrain)

    def predict(self, Xtest, option=None):
        return self._extree.predict(Xtest)

    def plt_feature_importance(self, fname_list, f_range=list()):
        importances = self._extree.feature_importances_
        std = np.std([tree.feature_importances_ for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        fname_array = np.array(fname_list)
        if not f_range:
            f_range = range(indices.shape[0])
        n_f = len(f_range)
        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[indices[f_range]],
                 color="b", xerr=std[indices[f_range]], ecolor='k', align="center")
        plt.yticks(range(n_f), fname_array[indices[f_range]])
        plt.ylim([-1, n_f])
        plt.show()

    def list_feature_importance(self, fname_list, f_range=list(), return_list=False):
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]
        print('Extra tree feature ranking:')
        if not f_range:
            f_range = range(indices.shape[0])
        n_f = len(f_range)
        for i in range(n_f):
            f = f_range[i]
            print('{0:d}. feature[{1:d}] {2:s} ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]], importances[indices[f]]))
        if return_list:
            return [indices[f_range[i]] for i in range(n_f)]
def algorithm_ExtraTrees(X_train, Y_train, X_validation, Y_validation, seed=7):
    # train the model
    scaler = StandardScaler().fit(X_train)
    rescaledX = scaler.transform(X_train)
    gbr = ExtraTreesRegressor(n_estimators=80)
    gbr.fit(X=rescaledX, y=Y_train)
    # evaluate the model
    rescaledX_validation = scaler.transform(X_validation)
    predictions = gbr.predict(rescaledX_validation)
    print(mean_squared_error(Y_validation, predictions))
def dummie_columns_extra_trees(train, test):
    from sklearn.ensemble import ExtraTreesRegressor
    print("-- {} --".format("Extremely Randomized Trees Regression using all but remarks"))
    predicting_columns = list(train._get_numeric_data().columns.values)
    predicting_columns.remove("LISTPRICE")
    predicting_columns.remove("SOLDPRICE")
    rf = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    rf.fit(train[predicting_columns], train["SOLDPRICE"])
    score = rf.score(test[predicting_columns], test["SOLDPRICE"])
    predictions = rf.predict(test[predicting_columns])
    sample_predictions(test, predictions)
    print("Accuracy: {}\n".format(score))
    return score, predictions
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    from sklearn.ensemble import ExtraTreesRegressor
    print("-- {} --".format("Extremely Randomized Trees Regression using all but remarks"))
    rf = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    rf.fit(data_train_x, data_train_y)
    sample_predictions(rf.predict(data_test_x), data_test_y)
    score = rf.score(data_test_x, data_test_y)
    cross_validated_scores = cross_val_score(rf, data_test_x, data_test_y, cv=5)
    print("MSE Accuracy: {}".format(score))
    print("MSE Across 5 Folds: {}".format(cross_validated_scores))
    print("95%% Confidence Interval: %0.3f (+/- %0.3f)\n" %
          (cross_validated_scores.mean(), cross_validated_scores.std() * 1.96))
def baseline_extra(train_x, train_y, test_x, test_y, n, d, result_path="review_baseline_extra.txt"):
    predict = []
    clf = ExtraTreesRegressor(n_estimators=n, max_depth=d, random_state=0)
    clf = clf.fit(train_x, train_y)
    predict = clf.predict(test_x).tolist()
    result = pd.DataFrame([], columns=['review_count', 'predict'])
    result['review_count'] = test_y
    result['predict'] = predict
    result.to_csv(result_path, index=False)
    rmse = mean_squared_error(predict, test_y) ** 0.5
    return rmse
def main():
    # X, Y = make_top_dataset(100000, 30)
    X, Y = make_friedman1_random_attr(n_samples=100000, n_features=10)
    tX, tY = make_friedman1_random_attr(n_samples=100, n_features=10)
    start_time = time.time()
    ext = ETRs(max_features=None, n_estimators=100, min_samples_split=1, n_jobs=-1)
    # ext = RFR(max_features=None, n_estimators=100, min_samples_split=1, n_jobs=-1)
    ext.fit(X, Y)
    elapsed_time = time.time() - start_time
    print(elapsed_time)
    print(score(ext, tX, tY))
def iterative_fit(self, X, y, n_iter=1, refit=False):
    if refit:
        self.estimator = None
    if self.estimator is None:
        num_features = X.shape[1]
        max_features = int(float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        self.estimator = ETR(
            n_estimators=0,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            warm_start=True
        )
    tmp = self.estimator  # TODO copy?
    tmp.n_estimators += n_iter
    tmp.fit(X, y)
    self.estimator = tmp
    return self
def trainRegressorsAndSave(computeScore=False):
    for db in dbs:
        if not os.path.exists("clfs/" + db):
            clf = ExtraTreesRegressor(n_estimators=500, random_state=1, n_jobs=-1)
            saveTrainedClassifier(db, clf)
        elif computeScore:
            clf = joblib.load("clfs/" + db)
        if computeScore:
            print("Loading test data...")
            loaded = loadDB(db + ".csv")
            X_test = loaded[:, 0:-1]
            y_test = loaded[:, -1]
            print("Normalized score is {}".format(clf.score(X_test, y_test)))
            X_test = y_test = 0
def run():
    cycles = load_and_munge_training_data('train.csv')
    inputs = ['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'hour']
    x_train, x_test, y_train, y_test = train_test_split(cycles[inputs], cycles['count'], test_size=0.25)
    scaler_x = StandardScaler().fit(x_train)
    scaler_y = StandardScaler().fit(y_train)
    x_train = scaler_x.transform(x_train)
    y_train = scaler_y.transform(y_train)
    x_test = scaler_x.transform(x_test)
    y_test = scaler_y.transform(y_test)
    techniques = {}
    clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None)
    clf_sgd.fit(x_train, y_train)
    techniques['Linear - no penalty'] = evaluate(clf_sgd, x_train, y_train)
    clf_sgd1 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2')
    clf_sgd1.fit(x_train, y_train)
    techniques['Linear - squared sums of the coefficients penalisation'] = \
        evaluate(clf_sgd1, x_train, y_train)
    clf_svr = svm.SVR(kernel='linear')
    clf_svr.fit(x_train, y_train)
    techniques['SVR - linear'] = evaluate(clf_svr, x_train, y_train)
    clf_svr_poly = svm.SVR(kernel='poly')
    clf_svr_poly.fit(x_train, y_train)
    techniques['SVR - poly'] = evaluate(clf_svr_poly, x_train, y_train)
    clf_svr_rbf = svm.SVR(kernel='rbf')
    clf_svr_rbf.fit(x_train, y_train)
    techniques['SVR - RBF'] = evaluate(clf_svr_rbf, x_train, y_train)
    clf_et = ExtraTreesRegressor(n_estimators=10, compute_importances=True)
    clf_et.fit(x_train, y_train)
    techniques['Random forest'] = evaluate(clf_et, x_train, y_train)
    clf_lr = LinearRegression()
    clf_lr.fit(x_train, y_train)
    techniques['Linear regression'] = evaluate(clf_lr, x_train, y_train)
    return sorted(techniques.iteritems(), key=operator.itemgetter(1))
def train(self):
    print("start ert")
    self.model = ExtraTreesRegressor(n_jobs=self.prms["n_jobs"],
                                     verbose=1,
                                     random_state=self.prms["random_state"],
                                     n_estimators=int(self.prms["n_estimators"]),
                                     max_features=self.prms["max_features"])
    self.model.fit(self.data_tr.values, self.labels_tr)
def predict_for(output, cycles, tests, raw_tests, inputs):
    x_train, x_test, y_train, y_test = train_test_split(cycles[inputs], cycles[output],
                                                        test_size=0.25, random_state=33)
    scaler_x = StandardScaler().fit(x_train)
    scaler_t = StandardScaler().fit(tests)
    x_train = scaler_x.transform(x_train)
    x_test = scaler_x.transform(x_test)
    tests = scaler_t.transform(tests)
    clf_et = ExtraTreesRegressor(n_estimators=10, compute_importances=True, random_state=42)
    clf_et.fit(x_train, y_train)
    ps = clf_et.predict(tests)
    return {dt: int(round(p)) for dt, p in zip(raw_tests['datetime'], ps)}
def baseline_extra_leave_one_out(train_raw_x, test_raw_x, test_ids, n=40, d=40,
                                 result_path="baseline_extra.txt"):
    predict = []
    for test_id in test_ids:
        train_x = train_raw_x[train_raw_x.business_id != test_id]
        train_y = train_raw_x[train_raw_x.business_id != test_id].stars.as_matrix()
        train_x = train_x.drop(["business_id", "stars"], 1).as_matrix()
        clf = ExtraTreesRegressor(n_estimators=n, max_depth=d, random_state=0)
        clf = clf.fit(train_x, train_y)
        test_x = test_raw_x[test_raw_x.business_id == test_id]
        test_x = test_x.drop(["business_id", "stars"], 1).as_matrix()
        predict.append(clf.predict(test_x)[0])
    result = pd.DataFrame([], columns=["stars", "predict"])
    result["stars"] = test_raw_x.stars
    result["predict"] = predict
    result = result.sort("stars", ascending=0)
    result.to_csv(result_path, index=False)
    rmse = mean_squared_error(predict, test_raw_x.stars.as_matrix()) ** 0.5
    return rmse
def buildModelOheETR(train_data, eval_data, train_labels, seed):
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    clf = ExtraTreesRegressor(n_estimators=500, max_depth=38,
                              min_samples_leaf=2, min_samples_split=6,
                              max_features='auto', n_jobs=-1,
                              random_state=seed, verbose=1)
    clf.fit(train_data, train_labels)
    preds = clf.predict(eval_data)
    preds = np.expm1(preds)
    # transform -ve preds to 0
    for i in range(preds.shape[0]):
        if preds[i] < 0:
            preds[i] = 0
    # convert back to log1p
    preds = np.log1p(preds)
    return (clf, preds)
def fit(self, X, Y):
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    self.n_estimators = int(self.n_estimators)
    self.min_samples_leaf = int(self.min_samples_leaf)
    self.min_samples_split = int(self.min_samples_split)
    self.max_features = float(self.max_features)
    self.bootstrap = check_for_bool(self.bootstrap)
    self.n_jobs = int(self.n_jobs)
    self.verbose = int(self.verbose)

    if check_none(self.max_leaf_nodes):
        self.max_leaf_nodes = None
    else:
        self.max_leaf_nodes = int(self.max_leaf_nodes)

    if check_none(self.max_depth):
        self.max_depth = None
    else:
        self.max_depth = int(self.max_depth)

    self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

    num_features = X.shape[1]
    max_features = int(float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))

    estimator = ExtraTreesRegressor(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        random_state=self.random_state)
    estimator.fit(X, Y)
    self.preprocessor = SelectFromModel(estimator=estimator, threshold='mean', prefit=True)
    return self
def get_regressor(x, y, n_estimators=1500, n_tries=5, verbose=False):
    """Calculate an ExtraTreesRegressor on predictor and target variables

    Parameters
    ----------
    x : numpy.array
        Predictor vector
    y : numpy.array
        Target vector
    n_estimators : int, optional
        Number of estimators to use
    n_tries : int, optional
        Number of attempts to calculate regression
    verbose : bool, optional
        If True, output progress statements

    Returns
    -------
    classifier : sklearn.ensemble.ExtraTreesRegressor
        The classifier with the highest out of bag scores of all the
        attempted "tries"
    oob_scores : numpy.array
        Out of bag scores of the classifier
    """
    if verbose:
        sys.stderr.write('Getting regressor\n')
    clfs = []
    oob_scores = []
    for i in range(n_tries):
        if verbose:
            sys.stderr.write('%d.' % i)
        clf = ExtraTreesRegressor(n_estimators=n_estimators, oob_score=True,
                                  bootstrap=True, max_features='sqrt',
                                  n_jobs=1, random_state=i).fit(x, y)
        clfs.append(clf)
        oob_scores.append(clf.oob_score_)
    clf = clfs[np.argmax(oob_scores)]
    clf.feature_importances = pd.Series(clf.feature_importances_, index=x.columns)
    return clf, oob_scores
def hyperopt_obj(param, feat_folder, feat_name, trial_counter): global loaded global X_train, labels_train, X_valid, labels_valid, numTrain, numValid, cdf_valid, Y_valid log_loss_cv = np.zeros((config.n_runs, config.n_folds), dtype=float) year = datetime.datetime.now().year for run in range(1, config.n_runs + 1): # range(start, end)前包括后不包括 for fold in range(1, config.n_folds + 1): rng = np.random.RandomState(datetime.datetime.now().year + 1000 * run + 10 * fold) path = "%s/Run%d/Fold%d" % (feat_folder, run, fold) save_path = "%s/Run%d/Fold%d" % (output_path, run, fold) if not os.path.exists(save_path): os.makedirs(save_path) # feat: combine feat file feat_train_path = "%s/train.feat" % path feat_valid_path = "%s/valid.feat" % path # # weight # weight_train_path = "%s/train.feat.weight" % path # weight_valid_path = "%s/valid.feat.weight" % path # info info_train_path = "%s/train.info" % path info_valid_path = "%s/valid.info" % path # cdf cdf_valid_path = "%s/valid.cdf" % path # raw prediction path (rank) raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % ( save_path, feat_name, trial_counter) # rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % ( save_path, feat_name, trial_counter) # if loaded is None: X_train, labels_train, X_valid, labels_valid, numTrain, numValid, cdf_valid, Y_valid = load_data( run, fold) # ## make evalerror func 评价函数 # evalerror_regrank_valid = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_valid) # evalerror_softmax_valid = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_valid) # evalerror_softkappa_valid = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_valid) # evalerror_ebc_valid = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_valid, ebc_hard_threshold) # evalerror_cocr_valid = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_valid) ############## ## Training ## ############## ## you can use bagging to stabilize the predictions 还可以使用 bagging 来使模型更加稳定 preds_bagging = np.zeros((numValid, bagging_size), dtype=float) for n in range(bagging_size): if bootstrap_replacement: sampleSize = int( numTrain * bootstrap_ratio) # bootstrap_ratio: 使用训练样本的比例 index_base = rng.randint(numTrain, size=sampleSize) index_meta = [ i for i in range(numTrain) if i not in index_base ] else: randnum = rng.uniform(size=numTrain) # 产生 0-1 之间的唯一的随机数 index_base = [ i for i in range(numTrain) if randnum[i] < bootstrap_ratio ] index_meta = [ i for i in range(numTrain) if randnum[i] >= bootstrap_ratio ] # 如果是xgb则先把数据转换成xgb需要的格式 if "booster" in param: dvalid_base = xgb.DMatrix( X_valid, label=labels_valid) # , weight=weight_valid dtrain_base = xgb.DMatrix( X_train[index_base], label=labels_train[index_base] ) # , weight=weight_train[index_base] watchlist = [] if verbose_level >= 2: watchlist = [(dtrain_base, 'train'), (dvalid_base, 'valid')] ## various models if param["task"] in ["regression", "ranking"]: ## regression & pairwise ranking with xgboost bst = xgb.train( param, dtrain_base, param['num_round'], watchlist) # , feval=evalerror_regrank_valid pred = bst.predict(dvalid_base) if param["task"] in ["classification"]: ## regression & pairwise ranking with xgboost bst = xgb.train( param, dtrain_base, param['num_round'], watchlist) # , feval=evalerror_regrank_valid pred = bst.predict(dvalid_base) elif param["task"] in ["softmax"]: ## softmax regression with xgboost bst = xgb.train( param, dtrain_base, param['num_round'], watchlist) # , feval=evalerror_softmax_valid pred = bst.predict(dvalid_base) w = np.asarray(range(1, 
numValid)) pred = pred * w[ np. newaxis, :] # np.newaxis: 插入一个维度,等价于w[np.newaxis],这里pred是n*1矩阵,而w[np.newaxis,:]是1*n矩阵,注意w原是数组 pred = np.sum(pred, axis=1) elif param["task"] in ["softkappa"]: ## softkappa with xgboost 自定义损失函数 # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale']) bst = xgb.train( param, dtrain_base, param['num_round'], watchlist ) # , obj=obj, feval=evalerror_softkappa_valid pred = softmax(bst.predict(dvalid_base)) w = np.asarray(range(1, numValid)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param["task"] in ["ebc"]: ## ebc with xgboost 自定义损失函数 # obj = lambda preds, dtrain: ebcObj(preds, dtrain) bst = xgb.train( param, dtrain_base, param['num_round'], watchlist) # , obj=obj, feval=evalerror_ebc_valid pred = sigmoid(bst.predict(dvalid_base)) pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold) elif param["task"] in ["cocr"]: ## cocr with xgboost 自定义损失函数 # obj = lambda preds, dtrain: cocrObj(preds, dtrain) bst = xgb.train( param, dtrain_base, param['num_round'], watchlist) # , obj=obj, feval=evalerror_cocr_valid pred = bst.predict(dvalid_base) pred = applyCOCRRule(pred) elif param['task'] == "reg_skl_rf": ## regression with sklearn random forest regressor rf = RandomForestRegressor( n_estimators=param['n_estimators'], max_features=param['max_features'], n_jobs=param['n_jobs'], random_state=param['random_state']) rf.fit(X_train[index_base], labels_train[index_base] ) # , sample_weight=weight_train[index_base] pred = rf.predict(X_valid) elif param['task'] == "reg_skl_etr": ## regression with sklearn extra trees regressor etr = ExtraTreesRegressor( n_estimators=param['n_estimators'], max_features=param['max_features'], n_jobs=param['n_jobs'], random_state=param['random_state']) etr.fit(X_train[index_base], labels_train[index_base] ) # , sample_weight=weight_train[index_base] pred = etr.predict(X_valid) elif param['task'] == "reg_skl_gbm": ## regression with sklearn gradient boosting regressor gbm = GradientBoostingRegressor( n_estimators=param['n_estimators'], max_features=param['max_features'], learning_rate=param['learning_rate'], max_depth=param['max_depth'], subsample=param['subsample'], random_state=param['random_state']) gbm.fit(X_train.toarray()[index_base], labels_train[index_base] ) # , sample_weight=weight_train[index_base] pred = gbm.predict(X_valid.toarray()) elif param['task'] == "clf_skl_lr": ## classification with sklearn logistic regression lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5, C=param['C'], fit_intercept=True, intercept_scaling=1.0, class_weight='auto', random_state=param['random_state']) lr.fit(X_train[index_base], labels_train[index_base]) pred = lr.predict_proba(X_valid) w = np.asarray(range(1, numValid)) pred = pred * w[np.newaxis, :] pred = np.sum(pred, axis=1) elif param['task'] == "reg_skl_svr": ## regression with sklearn support vector regression X_train, X_valid = X_train.toarray(), X_valid.toarray() scaler = StandardScaler() X_train[index_base] = scaler.fit_transform( X_train[index_base]) X_valid = scaler.transform(X_valid) svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'], degree=param['degree'], kernel=param['kernel']) svr.fit(X_train[index_base], labels_train[index_base] ) # , sample_weight=weight_train[index_base] pred = svr.predict(X_valid) elif param['task'] == "reg_skl_ridge": ## regression with sklearn ridge regression ridge = Ridge(alpha=param["alpha"], normalize=True) ridge.fit(X_train[index_base], labels_train[index_base] ) # , 
sample_weight=weight_train[index_base] pred = ridge.predict(X_valid) elif param['task'] == "reg_skl_lasso": ## regression with sklearn lasso lasso = Lasso(alpha=param["alpha"], normalize=True) lasso.fit(X_train[index_base], labels_train[index_base]) pred = lasso.predict(X_valid) elif param['task'] == 'reg_libfm': ## regression with factorization machine (libfm) ## to array X_train = X_train.toarray() X_valid = X_valid.toarray() ## scale scaler = StandardScaler() X_train[index_base] = scaler.fit_transform( X_train[index_base]) X_valid = scaler.transform(X_valid) ## dump feat dump_svmlight_file(X_train[index_base], labels_train[index_base], feat_train_path + ".tmp") dump_svmlight_file(X_valid, labels_valid, feat_valid_path + ".tmp") ## train fm cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \ libfm_exe, feat_train_path+".tmp", feat_valid_path+".tmp", raw_pred_valid_path, \ param['dim'], param['iter']) os.system(cmd) os.remove(feat_train_path + ".tmp") os.remove(feat_valid_path + ".tmp") ## extract libfm prediction pred = np.loadtxt(raw_pred_valid_path, dtype=float) ## labels are in [0,1,2,3] pred += 1 # elif param['task'] == "reg_keras_dnn": # ## regression with keras' deep neural networks # model = Sequential() # ## input layer # model.add(Dropout(param["input_dropout"])) # ## hidden layers # first = True # hidden_layers = param['hidden_layers'] # while hidden_layers > 0: # if first: # dim = X_train.shape[1] # first = False # else: # dim = param["hidden_units"] # model.add(Dense(dim, param["hidden_units"], init='glorot_uniform')) # if param["batch_norm"]: # model.add(BatchNormalization((param["hidden_units"],))) # if param["hidden_activation"] == "prelu": # model.add(PReLU((param["hidden_units"],))) # else: # model.add(Activation(param['hidden_activation'])) # model.add(Dropout(param["hidden_dropout"])) # hidden_layers -= 1 # # ## output layer # model.add(Dense(param["hidden_units"], 1, init='glorot_uniform')) # model.add(Activation('linear')) # # ## loss # model.compile(loss='mean_squared_error', optimizer="adam") # # ## to array # X_train = X_train.toarray() # X_valid = X_valid.toarray() # # ## scale # scaler = StandardScaler() # X_train[index_base] = scaler.fit_transform(X_train[index_base]) # X_valid = scaler.transform(X_valid) # # ## train # model.fit(X_train[index_base], labels_train[index_base], # nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], # validation_split=0, verbose=0) # # ##prediction # pred = model.predict(X_valid, verbose=0) # pred.shape = (X_valid.shape[0],) elif param['task'] == "reg_rgf": ## regression with regularized greedy forest (rgf) ## to array X_train, X_valid = X_train.toarray(), X_valid.toarray() train_x_fn = feat_train_path + ".x" train_y_fn = feat_train_path + ".y" valid_x_fn = feat_valid_path + ".x" valid_pred_fn = feat_valid_path + ".pred" model_fn_prefix = "rgf_model" np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t') np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t') np.savetxt(valid_x_fn, X_valid, fmt="%.6f", delimiter='\t') # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t') pars = [ "train_x_fn=", train_x_fn, "\n", "train_y_fn=", train_y_fn, "\n", #"train_w_fn=",weight_train_path,"\n", "model_fn_prefix=", model_fn_prefix, "\n", "reg_L2=", param['reg_L2'], "\n", #"reg_depth=", 1.01, "\n", "algorithm=", "RGF", "\n", "loss=", "LS", "\n", #"opt_interval=", 100, "\n", "valid_interval=", param['max_leaf_forest'], "\n", "max_leaf_forest=", 
param['max_leaf_forest'], "\n", "num_iteration_opt=", param['num_iteration_opt'], "\n", "num_tree_search=", param['num_tree_search'], "\n", "min_pop=", param['min_pop'], "\n", "opt_interval=", param['opt_interval'], "\n", "opt_stepsize=", param['opt_stepsize'], "\n", "NormalizeTarget" ] pars = "".join([str(p) for p in pars]) rfg_setting_train = "./rfg_setting_train" with open(rfg_setting_train + ".inp", "wb") as f: f.write(pars) ## train fm cmd = "perl %s %s train %s >> rgf.log" % ( call_exe, rgf_exe, rfg_setting_train) #print cmd os.system(cmd) model_fn = model_fn_prefix + "-01" pars = [ "test_x_fn=", valid_x_fn, "\n", "model_fn=", model_fn, "\n", "prediction_fn=", valid_pred_fn ] pars = "".join([str(p) for p in pars]) rfg_setting_valid = "./rfg_setting_valid" with open(rfg_setting_valid + ".inp", "wb") as f: f.write(pars) cmd = "perl %s %s predict %s >> rgf.log" % ( call_exe, rgf_exe, rfg_setting_valid) #print cmd os.system(cmd) pred = np.loadtxt(valid_pred_fn, dtype=float) ## weighted averageing over different models pred_valid = pred ## this bagging iteration preds_bagging[:, n] = pred_valid # preds_bagging的第n+1列为pred_valid pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1) # 按行(同行多列)进行平均值 # pred_rank = pred_raw.argsort().argsort() # argsort: 获取排序的索引值(index),但索引值本身不排序,第二次是归位 # pred_score, cutoff = getScore(pred_rank, cdf_valid, valid=True) # 根据cdf来生成分数 # kappa_valid = quadratic_weighted_kappa(pred_score, Y_valid) # 计算kappa分数 log_loss_valid = elementwise.log_loss(Y_valid, pred_raw) print('Y_valid mean:', np.mean(Y_valid)) print('pred_raw mean:', np.mean(pred_raw)) if (n + 1) != bagging_size: print( " {:>3} {:>3} {:>3} {:>6} {} x {}" .format(run, fold, n + 1, np.round(log_loss_valid, 6), X_train.shape[0], X_train.shape[1])) else: print( " {:>3} {:>3} {:>3} {:>8} {} x {}" .format(run, fold, n + 1, np.round(log_loss_valid, 6), X_train.shape[0], X_train.shape[1])) log_loss_cv[run - 1, fold - 1] = log_loss_valid ## save this prediction 保存的是单行的预测值 dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw}) dfPred.to_csv(raw_pred_valid_path, index=False, header=True, columns=["target", "prediction"]) # save this prediction 保存的是根据预测值排序之后,然后使用cdf来生成的预测值 # dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_rank}) # dfPred.to_csv(rank_pred_valid_path, index=False, header=True, columns=["target", "prediction"]) log_loss_cv_mean = np.mean(log_loss_cv) log_loss_cv_std = np.std(log_loss_cv) if verbose_level >= 1: print(" Mean: %.6f" % log_loss_cv_mean) print(" Std: %.6f" % log_loss_cv_std) #################### #### Retraining #### #################### #### all the path # path = "%s/All" % (feat_folder) # save_path = "%s/All" % output_path # subm_path = "%s/Subm" % output_path # if not os.path.exists(save_path): # os.makedirs(save_path) # if not os.path.exists(subm_path): # os.makedirs(subm_path) # # feat # feat_train_path = "%s/train.feat" % path # feat_test_path = "%s/test.feat" % path # # weight # # weight_train_path = "%s/train.feat.weight" % path # # info # info_train_path = "%s/train.info" % path # info_test_path = "%s/test.info" % path # # cdf # cdf_test_path = "%s/test.cdf" % path # # raw prediction path (rank) # raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter) # rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter) # # submission path (is_duplicate as in [0, 1]) # subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (subm_path, feat_name, trial_counter, log_loss_cv_mean, 
log_loss_cv_std) # # #### load data # ## load feat # X_train, labels_train = load_svmlight_file(feat_train_path) # X_test, labels_test = load_svmlight_file(feat_test_path) # if X_test.shape[1] < X_train.shape[1]: # X_test = hstack([X_test, np.zeros((X_test.shape[0], X_train.shape[1]-X_test.shape[1]))]) # elif X_test.shape[1] > X_train.shape[1]: # X_train = hstack([X_train, np.zeros((X_train.shape[0], X_test.shape[1]-X_train.shape[1]))]) # X_train = X_train.tocsr() # X_test = X_test.tocsr() # ## load train weight # # weight_train = np.loadtxt(weight_train_path, dtype=float) # ## load test info # info_train = pd.read_csv(info_train_path) # numTrain = info_train.shape[0] # info_test = pd.read_csv(info_test_path) # numTest = info_test.shape[0] # id_test = info_test["id"] # # ## load cdf # cdf_test = np.loadtxt(cdf_test_path, dtype=float) # # ## 评价函数 # # evalerror_regrank_test = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test) # # evalerror_softmax_test = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_test) # # evalerror_softkappa_test = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_test) # # evalerror_ebc_test = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_test, ebc_hard_threshold) # # evalerror_cocr_test = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_test) # # ## bagging # preds_bagging = np.zeros((numTest, bagging_size), dtype=float) # for n in range(bagging_size): # if bootstrap_replacement: # sampleSize = int(numTrain*bootstrap_ratio) # #index_meta = rng.randint(numTrain, size=sampleSize) # #index_base = [i for i in range(numTrain) if i not in index_meta] # index_base = rng.randint(numTrain, size=sampleSize) # index_meta = [i for i in range(numTrain) if i not in index_base] # else: # randnum = rng.uniform(size=numTrain) # index_base = [i for i in range(numTrain) if randnum[i] < bootstrap_ratio] # index_meta = [i for i in range(numTrain) if randnum[i] >= bootstrap_ratio] # # # 如果是xgb则先把数据转换成xgb需要的格式 # if "booster" in param: # dtest = xgb.DMatrix(X_test, label=labels_test) # dtrain = xgb.DMatrix(X_train[index_base], label=labels_train[index_base]) # , weight=weight_train[index_base] # # watchlist = [] # if verbose_level >= 2: # watchlist = [(dtrain, 'train')] # # ## train # if param["task"] in ["regression", "ranking"]: # bst = xgb.train(param, dtrain, param['num_round'], watchlist) # , feval=evalerror_regrank_test # pred = bst.predict(dtest) # # elif param["task"] in ["softmax"]: # bst = xgb.train(param, dtrain, param['num_round'], watchlist) # , feval=evalerror_softmax_test # pred = bst.predict(dtest) # w = np.asarray(range(1,numValid)) # pred = pred * w[np.newaxis,:] # pred = np.sum(pred, axis=1) # # elif param["task"] in ["softkappa"]: # # 自定义损失函数 # # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale']) # bst = xgb.train(param, dtrain, param['num_round'], watchlist) # , obj=obj, feval=evalerror_softkappa_test # pred = softmax(bst.predict(dtest)) # w = np.asarray(range(1,numValid)) # pred = pred * w[np.newaxis,:] # pred = np.sum(pred, axis=1) # # elif param["task"] in ["ebc"]: # # 自定义损失函数 # # obj = lambda preds, dtrain: ebcObj(preds, dtrain) # bst = xgb.train(param, dtrain, param['num_round'], watchlist) # , obj=obj, feval=evalerror_ebc_test # pred = sigmoid(bst.predict(dtest)) # pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold) # # elif param["task"] in ["cocr"]: # # 自定义损失函数 # obj = lambda preds, dtrain: cocrObj(preds, dtrain) # bst = xgb.train(param, dtrain, 
param['num_round'], watchlist) # , obj=obj, feval=evalerror_cocr_test # pred = bst.predict(dtest) # pred = applyCOCRRule(pred) # # elif param['task'] == "reg_skl_rf": # ## random forest regressor # rf = RandomForestRegressor(n_estimators=param['n_estimators'], # max_features=param['max_features'], # n_jobs=param['n_jobs'], # random_state=param['random_state']) # rf.fit(X_train[index_base], labels_train[index_base]) # , sample_weight=weight_train[index_base] # pred = rf.predict(X_test) # # elif param['task'] == "reg_skl_etr": # ## extra trees regressor # etr = ExtraTreesRegressor(n_estimators=param['n_estimators'], # max_features=param['max_features'], # n_jobs=param['n_jobs'], # random_state=param['random_state']) # etr.fit(X_train[index_base], labels_train[index_base]) # , sample_weight=weight_train[index_base] # pred = etr.predict(X_test) # # elif param['task'] == "reg_skl_gbm": # ## gradient boosting regressor # gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'], # max_features=param['max_features'], # learning_rate=param['learning_rate'], # max_depth=param['max_depth'], # subsample=param['subsample'], # random_state=param['random_state']) # gbm.fit(X_train.toarray()[index_base], labels_train[index_base]) #, sample_weight=weight_train[index_base] # pred = gbm.predict(X_test.toarray()) # # elif param['task'] == "clf_skl_lr": # lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5, # C=param['C'], fit_intercept=True, intercept_scaling=1.0, # class_weight='auto', random_state=param['random_state']) # lr.fit(X_train[index_base], labels_train[index_base]) # pred = lr.predict_proba(X_test) # w = np.asarray(range(1,numValid)) # pred = pred * w[np.newaxis,:] # pred = np.sum(pred, axis=1) # # elif param['task'] == "reg_skl_svr": # ## regression with sklearn support vector regression # X_train, X_test = X_train.toarray(), X_test.toarray() # scaler = StandardScaler() # X_train[index_base] = scaler.fit_transform(X_train[index_base]) # X_test = scaler.transform(X_test) # svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'], # degree=param['degree'], kernel=param['kernel']) # svr.fit(X_train[index_base], labels_train[index_base]) # , sample_weight=weight_train[index_base] # pred = svr.predict(X_test) # # elif param['task'] == "reg_skl_ridge": # ridge = Ridge(alpha=param["alpha"], normalize=True) # ridge.fit(X_train[index_base], labels_train[index_base]) # , sample_weight=weight_train[index_base] # pred = ridge.predict(X_test) # # elif param['task'] == "reg_skl_lasso": # lasso = Lasso(alpha=param["alpha"], normalize=True) # lasso.fit(X_train[index_base], labels_train[index_base]) # pred = lasso.predict(X_test) # # elif param['task'] == 'reg_libfm': # ## to array # X_train, X_test = X_train.toarray(), X_test.toarray() # # ## scale # scaler = StandardScaler() # X_train[index_base] = scaler.fit_transform(X_train[index_base]) # X_test = scaler.transform(X_test) # # ## dump feat # dump_svmlight_file(X_train[index_base], labels_train[index_base], feat_train_path+".tmp") # dump_svmlight_file(X_test, labels_test, feat_test_path+".tmp") # # ## train fm # cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \ # libfm_exe, feat_train_path+".tmp", feat_test_path+".tmp", raw_pred_test_path, \ # param['dim'], param['iter']) # os.system(cmd) # os.remove(feat_train_path+".tmp") # os.remove(feat_test_path+".tmp") # # ## extract libfm prediction # pred = np.loadtxt(raw_pred_test_path, dtype=float) # ## labels are in [0,1,2,3] # pred += 1 # # elif 
param['task'] == "reg_keras_dnn": # ## regression with keras deep neural networks # model = Sequential() # ## input layer # model.add(Dropout(param["input_dropout"])) # ## hidden layers # first = True # hidden_layers = param['hidden_layers'] # while hidden_layers > 0: # if first: # dim = X_train.shape[1] # first = False # else: # dim = param["hidden_units"] # model.add(Dense(dim, param["hidden_units"], init='glorot_uniform')) # if param["batch_norm"]: # model.add(BatchNormalization((param["hidden_units"],))) # if param["hidden_activation"] == "prelu": # model.add(PReLU((param["hidden_units"],))) # else: # model.add(Activation(param['hidden_activation'])) # model.add(Dropout(param["hidden_dropout"])) # hidden_layers -= 1 # # ## output layer # model.add(Dense(param["hidden_units"], 1, init='glorot_uniform')) # model.add(Activation('linear')) # # ## loss # model.compile(loss='mean_squared_error', optimizer="adam") # # ## to array # X_train = X_train.toarray() # X_test = X_test.toarray() # # ## scale # scaler = StandardScaler() # X_train[index_base] = scaler.fit_transform(X_train[index_base]) # X_test = scaler.transform(X_test) # # ## train # model.fit(X_train[index_base], labels_train[index_base], # nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], verbose=0) # # ##prediction # pred = model.predict(X_test, verbose=0) # pred.shape = (X_test.shape[0],) # # elif param['task'] == "reg_rgf": # ## to array # X_train, X_test = X_train.toarray(), X_test.toarray() # # train_x_fn = feat_train_path+".x" # train_y_fn = feat_train_path+".y" # test_x_fn = feat_test_path+".x" # test_pred_fn = feat_test_path+".pred" # # model_fn_prefix = "rgf_model" # # np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t') # np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t') # np.savetxt(test_x_fn, X_test, fmt="%.6f", delimiter='\t') # # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t') # # # pars = [ # "train_x_fn=",train_x_fn,"\n", # "train_y_fn=",train_y_fn,"\n", # #"train_w_fn=",weight_train_path,"\n", # "model_fn_prefix=",model_fn_prefix,"\n", # "reg_L2=", param['reg_L2'], "\n", # #"reg_depth=", 1.01, "\n", # "algorithm=","RGF","\n", # "loss=","LS","\n", # "test_interval=", param['max_leaf_forest'],"\n", # "max_leaf_forest=", param['max_leaf_forest'],"\n", # "num_iteration_opt=", param['num_iteration_opt'], "\n", # "num_tree_search=", param['num_tree_search'], "\n", # "min_pop=", param['min_pop'], "\n", # "opt_interval=", param['opt_interval'], "\n", # "opt_stepsize=", param['opt_stepsize'], "\n", # "NormalizeTarget" # ] # pars = "".join([str(p) for p in pars]) # # rfg_setting_train = "./rfg_setting_train" # with open(rfg_setting_train+".inp", "wb") as f: # f.write(pars) # # ## train fm # cmd = "perl %s %s train %s >> rgf.log" % ( # call_exe, rgf_exe, rfg_setting_train) # #print cmd # os.system(cmd) # # # model_fn = model_fn_prefix + "-01" # pars = [ # "test_x_fn=",test_x_fn,"\n", # "model_fn=", model_fn,"\n", # "prediction_fn=", test_pred_fn # ] # # pars = "".join([str(p) for p in pars]) # # rfg_setting_test = "./rfg_setting_test" # with open(rfg_setting_test+".inp", "wb") as f: # f.write(pars) # cmd = "perl %s %s predict %s >> rgf.log" % ( # call_exe, rgf_exe, rfg_setting_test) # #print cmd # os.system(cmd) # # pred = np.loadtxt(test_pred_fn, dtype=float) # # ## weighted averageing over different models # pred_test = pred # preds_bagging[:,n] = pred_test # pred_raw = np.mean(preds_bagging, axis=1) # pred_rank = pred_raw.argsort().argsort() # # # ## write 
# output = pd.DataFrame({"id": id_test, "prediction": pred_raw}) # output.to_csv(raw_pred_test_path, index=False) # # ## write # output = pd.DataFrame({"id": id_test, "prediction": pred_rank}) # output.to_csv(rank_pred_test_path, index=False) # # ## write score # pred_score = getScore(pred, cdf_test) # output = pd.DataFrame({"id": id_test, "prediction": pred_score}) # output.to_csv(subm_path, index=False) # #""" return log_loss_cv_mean, log_loss_cv_std
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -1814.3293695408152
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    ExtraTreesRegressor(bootstrap=True, max_features=0.35000000000000003,
                        min_samples_leaf=1, min_samples_split=14, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
        predictions = np.column_stack([regr.predict(X) for regr in self.regr_])
        return np.mean(predictions, axis=1)


en = make_pipeline(RobustScaler(), SelectFromModel(Lasso(alpha=0.03)),
                   ElasticNet(alpha=0.001, l1_ratio=0.1))

rf = RandomForestRegressor(n_estimators=250, n_jobs=4, min_samples_split=25,
                           min_samples_leaf=25, max_depth=3)

et = ExtraTreesRegressor(n_estimators=100, n_jobs=4, min_samples_split=25,
                         min_samples_leaf=35, max_features=150)

xgbm = xgb.sklearn.XGBRegressor(max_depth=4, learning_rate=0.005, subsample=0.9,
                                base_score=y_mean, objective='reg:linear',
                                n_estimators=1000)

stack_avg = StackingCVRegressorAveraged((en, rf, et),
                                        ElasticNet(l1_ratio=0.1, alpha=1.4))

stack_with_feats = StackingCVRegressorRetrained((en, rf, et), xgbm,
def perishing_mother_wife(passenger):
    surname, Pclass, person = passenger
    return 1.0 if (surname in perishing_female_surnames) else 0.0

full_data['perishing_mother_wife'] = full_data[['surname', 'Pclass', 'person']].apply(perishing_mother_wife, axis=1)

#### Surviving Males
surviving_male_surnames = list(set(full_data[(full_data.male_adult == 1.0) &
                                             (full_data.Survived == 1.0) &
                                             ((full_data.Parch > 0) | (full_data.SibSp > 0))]['surname'].values))

def surviving_father_husband(passenger):
    surname, Pclass, person = passenger
    return 1.0 if (surname in surviving_male_surnames) else 0.0

full_data['surviving_father_husband'] = full_data[['surname', 'Pclass', 'person']].apply(surviving_father_husband, axis=1)

classers = ['Fare', 'Parch', 'Pclass', 'SibSp', 'TitleCat', 'CabinCat', 'Sex_female', 'Sex_male',
            'EmbarkedCat', 'FamilySize', 'NameLength', 'FamilyId']
age_et = ExtraTreesRegressor(n_estimators=200)
X_train = full_data.loc[full_data.Age.notnull(), classers]
Y_train = full_data.loc[full_data.Age.notnull(), ['Age']]
X_test = full_data.loc[full_data.Age.isnull(), classers]
age_et.fit(X_train, np.ravel(Y_train))
age_preds = age_et.predict(X_test)
full_data.loc[full_data.Age.isnull(), ['Age']] = age_preds

######################################################################
######################################################################
print('Building Model...')

#### Model Build - Random Forest (Categorical Features)
model_dummys = ['Age', 'male_adult', 'female_adult', 'child', 'perishing_mother_wife',
                'surviving_father_husband', 'Fare', 'Parch', 'Pclass', 'SibSp', 'TitleCat',
                'CabinCat', 'Sex_female', 'Sex_male', 'EmbarkedCat', 'FamilySize',
                'NameLength', 'FamilyId']
model_rf = RandomForestClassifier(n_estimators=300, min_samples_leaf=4,
                                  class_weight={0: 0.745, 1: 0.255})
def ETCf():
    import pandas as pd
    import numpy as np
    import seaborn as sns
    from datetime import datetime
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import ExtraTreesRegressor
    from xgboost import XGBClassifier
    from sklearn.svm import LinearSVC
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import r2_score

    data1 = pd.read_excel("littoralis1516.xlsx")
    data1.head()
    data1.drop(["Date", "Larves"], axis=1, inplace=True)
    data1.head()
    data1 = data1.astype(float)
    data1["GDD"] = data1.Temp.astype(float) - 10
    data1.head()
    x = data1.iloc[:, 0].values
    y = data1.iloc[:, 1:8].values
    x
    y
    xtrain, xtest, ytrain, ytest = train_test_split(y, x, test_size=0.2, random_state=0)
    # regressor = LinearRegression()
    # regressor = RandomForestRegressor(n_estimators=10, random_state=0, max_depth=20)  # max depth=10
    regressor = ExtraTreesRegressor(n_estimators=100, random_state=0,
                                    max_depth=10, min_samples_split=5)  # max depth=5
    # regressor = XGBClassifier()
    # regressor = LinearSVC()
    # regressor = LogisticRegression()
    regressor.fit(xtrain, ytrain)
    y_pred = regressor.predict(xtest)
    data1_cmp = pd.DataFrame(list(zip(y_pred, ytest)))
    data1_cmp['Difference'] = abs(data1_cmp[0] - data1_cmp[1])
    data1_cmp.rename(columns={0: "Predicted", 1: "Actual"}, inplace=True)
    data1_cmp.head()
    MAPE = data1_cmp['Difference'].mean()
    x000 = float("{:.5f}".format(MAPE))
    print("MAPE: %.5f" % (MAPE))
    Error = np.mean(data1_cmp["Difference"]) / np.mean(data1_cmp["Actual"])
    x11 = Error * 100
    x111 = float("{:.2f}".format(x11))
    print("Error: %.2f%%" % (Error * 100))
    Accuracy = accuracy_score((ytest * 100).astype(int), (y_pred * 100).astype(int))
    # Accuracy = r2_score(ytest, y_pred)
    print("Accuracy: %.2f%%" % (Accuracy * 100.0))
    x22 = Accuracy * 100
    x222 = float("{:.2f}".format(x22))
    # plt.plot(data1_cmp.Actual, color="r")
    # plt.plot(data1_cmp.Predicted, color="b")
    global Label11
    Label11 = Label(root, text="MAPE=")
    global Label12
    Label12 = Label(root, text=x000)
    global Label21
    Label21 = Label(root, text="Error=")
    global Label22
    Label22 = Label(root, text=x111)
    global Label31
    Label31 = Label(root, text="Accuracy=")
    global Label32
    Label32 = Label(root, text=x222)
    Label11.grid(row=10, column=5)
    Label12.grid(row=10, column=6)
    Label21.grid(row=11, column=5)
    Label22.grid(row=11, column=6)
    Label31.grid(row=12, column=5)
    Label32.grid(row=12, column=6)
    ETC['state'] = DISABLED
    'property_type', 'room_type', 'bed_type', 'cancellation_policy'
]
base_airbnb_cod = pd.get_dummies(data=base_airbnb_cod, columns=colunas_categorias)
print(base_airbnb_cod.head())

# prediction model
def avaliar_modelo(nome_modelo, y_test, previsao):
    r2 = r2_score(y_test, previsao)
    RSME = np.sqrt(mean_squared_error(y_test, previsao))
    return f'Modelo {nome_modelo}:\nR2:{r2:.2%}\nRSME:{RSME:.2f}'

modelo_rf = RandomForestRegressor()
modelo_lr = LinearRegression()
modelo_et = ExtraTreesRegressor()

modelos = {
    'RandomForest': modelo_rf,
    'LinearRegression': modelo_lr,
    'ExtraTreesRegressor': modelo_et
}

y = base_airbnb_cod['price']
x = base_airbnb_cod.drop('price', axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=10)

for nome_modelo, modelo in modelos.items():
    # train
    modelo.fit(X_train, y_train)
    # test
    previsao = modelo.predict(X_test)
    print(avaliar_modelo(nome_modelo, y_test, previsao))
print(a.columns)

final_dataset = a[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                   'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']]
final_dataset["current_year"] = 2020
final_dataset["new_year"] = final_dataset["current_year"] - final_dataset["Year"]
final_dataset.drop(columns=['Year', 'current_year'], inplace=True)
final_dataset = pd.get_dummies(final_dataset, drop_first=True)
print(final_dataset)

x = final_dataset.iloc[:, 1:]
y = final_dataset.iloc[:, 0]

# feature importance
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()
model.fit(x, y)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

from sklearn.ensemble import RandomForestRegressor
r = RandomForestRegressor()

from sklearn.model_selection import RandomizedSearchCV
# Randomized Search CV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split
train = pd.read_csv("../../input/train.csv")  # read train data
test = pd.read_csv("../../input/test.csv")    # read test data

# build a model library (can be improved)
base_models = [
    RandomForestRegressor(n_jobs=1, random_state=0, n_estimators=500, max_features=14),
    RandomForestRegressor(n_jobs=1, random_state=0, n_estimators=500, max_features=20,
                          max_depth=7),
    ExtraTreesRegressor(n_jobs=1, random_state=0, n_estimators=500, max_features=15),
    ExtraTreesRegressor(n_jobs=1, random_state=0, n_estimators=500, max_features=20),
    GradientBoostingRegressor(random_state=0, n_estimators=500, max_features=10,
                              max_depth=6, learning_rate=0.05, subsample=0.8),
    GradientBoostingRegressor(random_state=0, n_estimators=500, max_features=15,
                              max_depth=6, learning_rate=0.05, subsample=0.8),
            # print('vif :', vif)
            print('dropping ' + X[list_factors].columns[maxloc] + ' at index: ' + str(maxloc))
            del list_factors[maxloc]
        else:
            break

print('Final variables:', list_factors)
X = X[list_factors]

# ensembles
ensembles = []
ensembles.append(('AB', AdaBoostRegressor()))
ensembles.append(('GBM', GradientBoostingRegressor()))
ensembles.append(('RF', RandomForestRegressor()))
ensembles.append(('ET', ExtraTreesRegressor()))

r2_results = []
mse_results = []
names = []

# evaluate model
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=200)
for name, model in ensembles:
    fs = SelectKBest(score_func=f_regression)
    pipeline = Pipeline(steps=[('anova', fs), ('model', model)])
    # define the grid
    grid = {
        'anova__k': [i + 1 for i in range(X.shape[1])],
        'model__n_estimators': randint(10, 400)
# Plot the estimated stability scores for a given alpha

# Use 6-fold cross-validation rather than the default 3-fold: it leads to
# a better choice of alpha.
# Suppress the user warnings: they are not necessary for the example,
# as it is specifically set up to be challenging.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    lars_cv = LassoLarsCV(cv=6).fit(X, y)

# Run the RandomizedLasso: we use paths going down to .1*alpha_max
# to avoid exploring the regime in which very noisy variables enter
# the model
alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)  # requires an older scikit-learn: RandomizedLasso was removed in 0.21
trees = ExtraTreesRegressor(n_estimators=100).fit(X, y)  # feature_importances_ is always computed; the old compute_importances flag is gone

# Compare with F-score
F, _ = f_regression(X, y)

pl.figure()
for name, score in [('F-test', F),
                    ('Stability selection', clf.scores_),
                    ('Lasso coefs', np.abs(lars_cv.coef_)),
                    ('Trees', trees.feature_importances_)]:
    precision, recall, thresholds = precision_recall_curve(coef != 0, score)
    pl.semilogy(np.maximum(score / np.max(score), 1e-4),
                label="%s. AUC: %.3f" % (name, auc(recall, precision)))
X_scaled = pca.fit_transform(X_scaled)
test_X_scaled = pca.transform(test_X_scaled)
print(X_scaled.shape, test_X_scaled.shape)

'''modeling & evaluation'''
# 34
# define cross validation strategy
def rmse_cv(model, X, y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse

# 35
# We choose 13 models and use 5-fold cross-validation to evaluate them.
models = [LinearRegression(), Ridge(), Lasso(alpha=0.01, max_iter=10000),
          RandomForestRegressor(), GradientBoostingRegressor(), SVR(), LinearSVR(),
          ElasticNet(alpha=0.001, max_iter=10000), SGDRegressor(), BayesianRidge(),
          KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ExtraTreesRegressor(), XGBRegressor()]

# 36
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR",
         "Ela", "SGD", "Bay", "Ker", "Extra", "Xgb"]
for name, model in zip(names, models):
    score = rmse_cv(model, X_scaled, y_log)
    print("{}: {:.6f}, {:.4f}".format(name, score.mean(), score.std()))

# 37
# Next we do some hyperparameter tuning. First define a grid-search helper.
class grid():
    def __init__(self, model):
        self.model = model

    def grid_get(self, X, y, param_grid):
        grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring="neg_mean_squared_error")
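        # Assumed completion of grid_get (not the original code): fit the search
        # and report the best parameters with the corresponding CV RMSE.
        grid_search.fit(X, y)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
        print(pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'std_test_score']])

# Illustrative usage of the helper (the parameter values are assumptions):
# grid(Lasso()).grid_get(X_scaled, y_log, {'alpha': [0.0004, 0.0006], 'max_iter': [10000]})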
# forest = RandomForestRegressor(n_estimators=100,
#                                max_depth=10,
#                                n_jobs=-1)

# forest = RegressionForest(n_estimators=30,
#                           min_items=5,
#                           max_depth=30,
#                           nb_tests=1000,
#                           test="axis",
#                           verbose=False)
# print forest.get_params()

forest = ExtraTreesRegressor(n_estimators=2000,
                             min_samples_leaf=3,
                             max_depth=60,
                             # bootstrap=True,
                             n_jobs=-1)

forest.fit(all_points, all_responses)

# param_name = "max_depth"
# param_range = np.logspace(0, 2, 10)
# param_range = [60]

# param_name = "min_samples_leaf"
# param_range = np.logspace(0, 2, 5)

# param_name = "bootstrap"
# param_range = np.logspace(-1, 0, 10)
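# The commented param_name / param_range pairs above look like inputs for a
# validation-curve study. A minimal sketch of how they could be used, assuming
# all_points / all_responses are the full training arrays; the smaller forest,
# cv and scoring choices are assumptions to keep the sweep cheap.
from sklearn.model_selection import validation_curve
import numpy as np

param_name = "min_samples_leaf"
param_range = np.unique(np.logspace(0, 2, 5).astype(int))
train_scores, test_scores = validation_curve(
    ExtraTreesRegressor(n_estimators=200, n_jobs=-1),
    all_points, all_responses,
    param_name=param_name, param_range=param_range,
    cv=3, scoring="r2", n_jobs=-1)
print(param_range)
print(test_scores.mean(axis=1))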
import numpy as np

from sklearn.ensemble import ExtraTreesRegressor, VotingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(VotingClassifier([("est", ElasticNet(alpha=1.0, l1_ratio=0.84))]),
               FunctionTransformer(lambda X: X)),
    VarianceThreshold(threshold=26.0),
    ExtraTreesRegressor(max_features=0.6900000000000001, n_estimators=500)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1))
Y_test = ss_Y.transform(Y_test.reshape(-1, 1))  # reshape the 1-D target Y into a 2-D array

rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train.ravel())
rfr_Y_predict = rfr.predict(X_test)  # use the random forest regressor

print 'R-squared of RandomForestRegressor:', rfr.score(X_test, Y_test)
print 'the mean squared error of RandomForestRegressor:', mean_squared_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_Y_predict))
print 'the mean absolute error of RandomForestRegressor:', mean_absolute_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_Y_predict))

etr = ExtraTreesRegressor()
etr.fit(X_train, Y_train.ravel())
etr_Y_predict = etr.predict(X_test)  # use the extra trees regressor

print 'R-squared of ExtraTreesRegressor:', etr.score(X_test, Y_test)
print 'the mean squared error of ExtraTreesRegressor:', mean_squared_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etr_Y_predict))
print 'the mean absolute error of ExtraTreesRegressor:', mean_absolute_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etr_Y_predict))

# use the trained extra trees model to report each feature's contribution to the target
print np.sort(zip(etr.feature_importances_, boston.feature_names), axis=0)

gbr = GradientBoostingRegressor()
gbr.fit(X_train, Y_train.ravel())
# 3.5 Random forest
# tune and fit
rfr = RandomForestRegressor(n_estimators=300, n_jobs=-1)
param_grid = {
    # 'min_samples_leaf': [3, 5, 10],
    # 'max_depth': [20, 30, 40, 55],
    'max_features': [20, 40, 60, 80]
}
rfr = gscvr(x_train, y_train, rfr, param_grid)
# cross-validated score & learning curve
rmse_cv(rfr, x_train, y_train)
plot_learning_curve_r(x_train, y_train, rfr, 'rfr learning_curve')

# 3.6 Extra trees
# tune and fit
etr = ExtraTreesRegressor(n_estimators=500, n_jobs=-1)
param_grid = {
    # 'min_samples_leaf': [3, 5, 10],
    'max_depth': [3, 5, 8],
    'max_features': [40, 60, 80, 120, 160]
}
etr = gscvr(x_train, y_train, etr, param_grid)
# cross-validated score & learning curve
rmse_cv(etr, x_train, y_train)
plot_learning_curve_r(x_train, y_train, etr, 'etr learning_curve')

# 3.7 XGBoost
# tune and fit
xgbr = XGBRegressor(
    colsample_bytree=0.6,
    learning_rate=0.07,
    min_child_weight=1.5,
R = np.clip(R, -1, 1)
toc('Number of non-zero features: %s' % np.count_nonzero(np.mean(F[:-1], axis=0)))

tic('Keeping NZV features')
support = np.var(F, axis=0) != 0  # Keep only features with nonzero variance
toc('Using %s features' % support.sum())

if args.rfs:
    log('Filtering out ZV features')
    F = F[:, support]
    FF = FF[:, support]

    tic('Running RFS')
    ifs_estimator_params = {'n_estimators': ifs_nb_trees, 'n_jobs': -1}
    ifs_params = {'estimator': ExtraTreesRegressor(**ifs_estimator_params),
                  'n_features_step': 1,
                  'cv': None,
                  'scale': True,
                  'verbose': 1,
                  'significance': ifs_significance}
    ifs = IFS(**ifs_params)
    features_names = np.array(map(str, range(F.shape[1])) + ['A'])
    rfs_params = {'feature_selector': ifs,
                  'features_names': features_names,
                  'verbose': 1}
    rfs = RFS(**rfs_params)
    rfs.fit(F, A, FF, R)

    # Process support
    support_rfs = rfs.get_support()
targetActions[actions == -1] = 0
targetActions[actions[:, 0] == 1] = [1, 0]

# Upsample folds
upsampleRatio = 10
foldMask = actions[:, 0] == 1
foldFeatures = np.tile(features[foldMask], (upsampleRatio, 1))
foldTargetActions = np.tile(targetActions[foldMask], (upsampleRatio, 1))

x = np.row_stack((features[rndPlayerMask], foldFeatures))
y = np.row_stack((targetActions[rndPlayerMask], foldTargetActions))
shuffler = np.arange(len(x))
np.random.shuffle(shuffler)

# regressorOld = copy.deepcopy(regressor)
regressor = ExtraTreesRegressor(n_estimators=100, min_samples_leaf=10, min_samples_split=4,
                                verbose=2, n_jobs=-1)
regressor.fit(x[shuffler], y[shuffler])

# %%

nGames = 5000
callPlayerIdx = 0
aiPlayerIdx = 1
seed = 76

initGameStates, initStacks = initRandomGames(nGames, seed=seed)
smallBlinds = initGameStates.boards[:, 1]
equities = getEquities(initGameStates, seed=seed)
def train_test_regression(x_train, y_train, x_test, y_test):
    """
    Train and test a number of regression models using a train/test split of a
    single dataset, and log/report scores. Each regression model used will use
    its default initialization parameters.

    :param x_train:
    :param y_train:
    :param x_test:
    :param y_test:
    :return: None
    """

    # a dictionary of model names to scores we'll populate
    model_scores = {}

    # create and train a linear regression model
    model = LinearRegression()
    model.fit(x_train, y_train)
    model_scores["LinearRegression"] = model.score(x_test, y_test)

    # create and train a ridge regression model
    model = Ridge()
    model.fit(x_train, y_train)
    model_scores["Ridge"] = model.score(x_test, y_test)

    # create and train a random forest regression model
    for trees in [3, 10, 20, 100, 250]:
        model = RandomForestRegressor(n_estimators=trees)
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        _logger.info("Random Forest (trees={t}) score: {result}".format(t=trees, result=score))

    # create and train a K-neighbors regression model
    for k in [1, 3, 5, 10, 20]:
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        _logger.info("K-Neighbors (k={k}) score: {result}".format(k=k, result=score))

    # # create and train an Ada boost regression model, trying various estimators and learning rate parameters
    # for estimators in [1, 3, 5, 10, 20]:
    #     for rate in [0.01, 0.1, 1, 5, 12]:
    #         model = AdaBoostRegressor(n_estimators=estimators, learning_rate=rate)
    #         model.fit(x_train, y_train)
    #         score = model.score(x_test, y_test)
    #         _logger.info("Ada Boost (estimators={n}, learning rate={r}) score: {result}".format(n=estimators,
    #                                                                                             r=rate,
    #                                                                                             result=score))

    # # create and train a bagging regression model
    # model = BaggingRegressor()
    # model.fit(x_train, y_train)
    # score = model.score(x_test, y_test)
    # _logger.info("Bagging score: {result}".format(result=score))

    # create and train an extra trees regression model
    for trees in [3, 6, 10, 20]:
        model = ExtraTreesRegressor(n_estimators=trees)
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        _logger.info("Extra Trees (trees={t}) score: {result}".format(t=trees, result=score))

    # create and train a support vector regression model with a linear kernel
    model = SVR(kernel='linear', C=1e3)
    model.fit(x_train, y_train.ravel())  # X must stay 2-D; only the target is flattened
    score = model.score(x_test, y_test)
    _logger.info("SVR (linear) score: {result}".format(result=score))

    # create and train a support vector regression model with a polynomial kernel
    model = SVR(kernel='poly', C=1e3, degree=2)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    _logger.info("SVR (polynomial) score: {result}".format(result=score))

    # create and train a support vector regression model with an RBF kernel
    model = SVR(kernel='rbf', C=1e3, gamma=0.1)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    _logger.info("SVR (RBF) score: {result}".format(result=score))
    'max_depth': None,
    'min_samples_leaf': 40,
    'min_samples_split': 40
}
args_rf2 = {
    'n_estimators': 1000,
    'max_depth': None,
    'min_samples_leaf': 40,
    'min_samples_split': 40
}

model_lasso1 = linear_model.Lasso(**args_lasso1)
model_lasso2 = linear_model.Lasso(**args_lasso2)
model_rf1 = ExtraTreesRegressor(**args_rf1)
model_rf2 = ExtraTreesRegressor(**args_rf2)
model_nn1 = CL2020.NeuralNet1(101)
model_nn2 = CL2020.NeuralNet2(100)

# I collect the models into a dictionary so that they can be easily iterated over
models = {
    'lasso': [model_lasso1, model_lasso2],
    'rf': [model_rf1, model_rf2],
    'nn': [model_nn1, model_nn2]
}

# This dictionary defines which ML methods use added basis functions and which
# do not. An option is included in the fit method of DDMLCT to generate the
# basis functions.
            num_acc += 1
    return num_acc / len(y_pred)


X_train, X_test, y_train, y_test = train_test_split(X, y_time, test_size=0.3, random_state=0)

Regressor = {
    'Random Forest Regressor': RandomForestRegressor(n_estimators=200),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=500),
    'ExtraTrees Regressor': ExtraTreesRegressor(n_estimators=500, min_samples_split=5),
    'Bayesian Ridge': BayesianRidge(),
    'Elastic Net CV': ElasticNetCV()
}

for name, clf in Regressor.items():
    print(name)
    clf.fit(X_train, y_train)
    print('acc', clf.score(X_test, y_test))
    # print('new_acc', get_acc(y_test, clf.predict(X_test), 10))
    # print(f'R2: {r2_score(y_test, clf.predict(X_test)):.2f}')
    # print(f'MAE: {mean_absolute_error(y_test, clf.predict(X_test)):.2f}')
from sklearn.ensemble import GradientBoostingRegressor
import sys
sys.path.append('../tools')
from tools import get_result

day_time = '_02_16_3'
train_x = pd.read_csv('../train_0/train_x' + day_time + '.csv')
train_y = pd.read_csv('../train_0/train_y' + day_time + '.csv')
test_x = pd.read_csv('../test_0/test_x' + day_time + '.csv')

# RF = RandomForestRegressor(n_estimators=1200, random_state=1, n_jobs=-1,
#                            min_samples_split=2, min_samples_leaf=2, max_depth=25)
# RF.fit(train_x, train_y)
# pre = (RF.predict(test_x)).round()

ET = ExtraTreesRegressor(n_estimators=1200,
                         random_state=1,
                         n_jobs=-1,
                         min_samples_split=2,
                         min_samples_leaf=2,
                         max_depth=25,
                         max_features=270)
ET.fit(train_x, train_y)
pre = (ET.predict(test_x)).round()

result = get_result(pre)
result.to_csv('../results/result' + day_time + '.csv', index=False, header=False)
x_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")
y_train0 = y_train.drop('id', axis=1)
x_train0 = x_train.drop('id', axis=1)

for n_estimators, max_iter in [(e, i) for e in [10, 100] for i in [10, 100]]:
    x_train = x_train0
    y_train = y_train0

    # 1. Missing Values
    est = ExtraTreesRegressor(n_estimators=n_estimators,
                              random_state=42,
                              max_features='sqrt',
                              n_jobs=10,
                              verbose=0)
    imputer = IterativeImputer(estimator=est,
                               max_iter=max_iter,
                               tol=0.001,
                               n_nearest_features=100,
                               initial_strategy='median',
                               imputation_order='ascending',
                               verbose=2,
                               random_state=0)
    x_train_filled = imputer.fit_transform(x_train)
    x_train = pd.DataFrame(x_train_filled)

    # 2. Outliers detection
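    # IterativeImputer is still flagged as experimental in scikit-learn, so the
    # imports this snippet omits need the enabling module first, roughly:
    #     from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    #     from sklearn.impute import IterativeImputer
    # To keep a test set consistent with the training set, the same fitted imputer
    # would be reused rather than refitted (a sketch; x_test is an assumed
    # DataFrame with the same columns as x_train0):
    # x_test_filled = pd.DataFrame(imputer.transform(x_test), columns=x_train0.columns)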
ss_X = StandardScaler()
ss_y = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

### 3. Regression prediction
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_y_predict = rfr.predict(X_test)

etr = ExtraTreesRegressor()
etr.fit(X_train, y_train)
etr_y_predict = etr.predict(X_test)

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)

#### 4. Performance evaluation
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print('R-squared of RandomForestRegressor:', rfr.score(X_test, y_test))
print('MSE of RandomForestRegressor:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))
print('MAE of RandomForestRegressor:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))
    RandomForestRegressor(n_estimators=150, max_depth=8, min_samples_leaf=4,
                          n_jobs=-1, random_state=882),
    Pipeline([('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
              ('Regression', RandomForestRegressor(n_estimators=200, max_depth=8,
                                                   min_samples_leaf=4, max_features=0.4,
                                                   n_jobs=-1, random_state=0))]),
    ExtraTreesRegressor(n_estimators=10, criterion='mse', max_depth=8,
                        min_samples_split=4, min_samples_leaf=2, warm_start=False),
]


def train_xgboost():
    df = pd.read_csv('/home/kshitij/Desktop/Dataset/stage1_labels.csv')

    x = []
    y = []
    did = df['id'].tolist()
    cancer = df['cancer'].tolist()
    for i in range(len(df)):
        g = []
        if os.path.isfile('/home/kshitij/Desktop/Dataset/stage1/%s.npy' %
# run with python3
import pandas as pd
import numpy as np

conc = pd.read_csv('concrete.csv')

from sklearn.model_selection import KFold

y = np.array(conc['strength'])
X = np.array(conc.drop(['strength'], axis=1))
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from mlxtend.regressor import StackingRegressor

rf = RandomForestRegressor(n_estimators=54, max_depth=None, random_state=7)
ext = ExtraTreesRegressor(n_estimators=84, min_samples_split=2, random_state=7)
clf = StackingRegressor(regressors=[ext], meta_regressor=rf)

scores = []
for train, test in kfold.split(X, y):
    clf.fit(X[train], y[train])
    score = clf.score(X[test], y[test])
    print(score)
    scores.append(score)
print("%.3f (+/- %.3f)" % (np.mean(scores), np.std(scores)))
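# The mlxtend StackingRegressor above fits the meta-regressor on the base model's
# in-sample predictions. scikit-learn ships its own StackingRegressor that builds
# the meta-features from out-of-fold predictions instead; a minimal equivalent
# sketch (the internal cv=5 choice is an assumption):
from sklearn.ensemble import StackingRegressor as SkStackingRegressor
from sklearn.model_selection import cross_val_score

sk_stack = SkStackingRegressor(
    estimators=[('ext', ExtraTreesRegressor(n_estimators=84, random_state=7))],
    final_estimator=RandomForestRegressor(n_estimators=54, random_state=7),
    cv=5)
print(cross_val_score(sk_stack, X, y, cv=kfold).mean())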
    [LogisticRegression(random_state=42)],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
    [SGDClassifier(random_state=42)],
    [SVC(kernel='linear', random_state=42)],
    [NuSVC(kernel='linear', random_state=42)],
])
def test_explain_clf_binary_iris(clf, iris_train_binary):
    X, y, feature_names = iris_train_binary
    clf.fit(X, y)
    assert_explain_prediction_single_target(clf, X, feature_names)
    assert_correct_class_explained_binary(clf, X)


@pytest.mark.parametrize(['reg'], [
    [DecisionTreeRegressor(random_state=42)],
    [ExtraTreesRegressor(random_state=42)],
    [RandomForestRegressor(random_state=42)],
])
def test_explain_tree_regressor_multitarget(reg):
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10, random_state=42)
    reg.fit(X, y)
    res = explain_prediction(reg, X[0])
    for expl in format_as_all(res, reg):
        for target in ['y0', 'y1', 'y2']:
            assert target in expl
        assert 'BIAS' in expl
        assert any('x%d' % i in expl for i in range(10))
    check_targets_scores(res)
# use ExtraTrees regression plus a collinearity (VIF) check to eliminate variables
data_start = data_drop[col_temp + col_hum + col_weather + col_target]
train, test = train_test_split(data_start, test_size=0.25, random_state=40)

# standardize the data
train_standed = pd.DataFrame(StandardScaler().fit_transform(train),
                             columns=train.columns,
                             index=train.index)
test_standed = pd.DataFrame(StandardScaler().fit_transform(test),
                            columns=test.columns,
                            index=test.index)
x_train = train_standed[col_temp + col_hum + col_weather]
y_train = train_standed[col_target]
x_test = test_standed[col_temp + col_hum + col_weather]
y_test = test_standed[col_target]

# ExtraTrees regression model
etr = ExtraTreesRegressor()
vif_data = pd.Series([
    variance_inflation_factor(x_train.values.astype(float), i)
    for i in range(x_train.shape[1])
], index=x_train.columns, name='vif')

# collinearity check: iteratively eliminate variables
while (vif_data > 10).sum() > 0:
    etr.fit(x_train[vif_data.index], y_train)
    # importance coefficients of the remaining variables
    selector_data = pd.Series(etr.feature_importances_, index=vif_data.index, name='etr')
    select_etr = np.abs(selector_data).sort_values(ascending=False)
    etr_vif_data = pd.concat([select_etr, vif_data], join='inner', axis=1)
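    # Assumed continuation of the loop (not the original code): among the variables
    # whose VIF exceeds 10, drop the one the ExtraTrees model considers least
    # important, then recompute the VIFs for the remaining columns.
    candidates = etr_vif_data[etr_vif_data['vif'] > 10]
    drop_col = candidates['etr'].idxmin()
    keep_cols = vif_data.index.drop(drop_col)
    vif_data = pd.Series([
        variance_inflation_factor(x_train[keep_cols].values.astype(float), i)
        for i in range(len(keep_cols))
    ], index=keep_cols, name='vif')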
#
# n_components = 30
# pca = PCA(n_components=n_components)
# X = pca.fit_transform(X)

print(X.shape)
print(type(X))
print(y.shape)
print(type(y))

estimator = Ridge()
# selector = RFECV(estimator, step=1, cv=5)
# In current scikit-learn a bare ExtraTreesRegressor no longer exposes transform()
# or n_features_; wrap it in SelectFromModel to select features by importance.
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(ExtraTreesRegressor(n_estimators=50))
selector = selector.fit(X, y)
print("Optimal number of features : %d" % selector.get_support().sum())
X = selector.transform(X)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# build a regressor
clf = RandomForestRegressor(n_estimators=20)

# use a full grid over all parameters
param_grid = {
    "max_depth": [3, None],
        }
        stds = {
            key + '_std': np.std(value)
            for key, value in pipeline_results[pipeline_name].items()
        }
        means.update(stds)
        means['pipeline_name'] = pipeline_name
        results.append(means)
    return pd.DataFrame(results)


# non-default parameters are from https://arxiv.org/pdf/1708.05070.pdf
estimators = {
    'extra_trees_regressor': [
        ('extra_trees_regressor', ExtraTreesRegressor()),
    ],
    'gradient_boosting_regressor': [('gradient_boosting_regressor', GradientBoostingRegressor())],
    'random_forest_regressor': [('random_forest_regressor', RandomForestRegressor())],
    'knn_regressor': [('standard_scaler', StandardScaler()),
                      ('knn_regressor', KNeighborsRegressor())],
    'xgb_regressor': [('xgb_regressor', XGBRegressor())],
    'lightgbm_regressor': [('lightgbm_regressor', LightGBMRegressor())],
    'catboost_regressor': [('catboost_regressor', CatBoostRegressor())],
    'lasso_regressor': [('standard_scaler', StandardScaler()),
                        ('lasso_regressor', Lasso())],
    'ridge_regressor': [('ridge_regressor', Ridge())],
    'elastic_net_regressor': [('elastic_net_regressor', ElasticNet())],
    'sgd_regressor': [('sgd_regressor', SGDRegressor())],
y_test = ss_y.transform(y_test.reshape(-1, 1))  # change 2

# single decision tree regressor
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_y = dtr.predict(X_test)

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

# random forest regressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_y = rfr.predict(X_test)

# extra trees regressor
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train)
etr_y = etr.predict(X_test)

# gradient boosting regressor
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y = gbr.predict(X_test)

# evaluate the single decision tree
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
print('R_squared value of DecisionTreeRegressor is ', dtr.score(X_test, y_test))
print('The mean squared error of DecisionTreeRegressor is ',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dtr_y)))
print('The mean absolute error of DecisionTreeRegressor is ',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dtr_y)))

# evaluate the random forest