# Build training & test sets # data = sb.SetBuilder( target='NumberOfCustomers', autoexclude=True, dataset='best_for_customers.csv', ).exclude('NumberOfSales', 'Month').build() # data = sb.SetBuilder(target='NumberOfSales', autoexclude=True, dataset='mean_var_on_cust_from_tain.csv').build() # Performs simple linear regression depth = 8 dtree = tree.DecisionTreeRegressor(max_depth=depth) dtree.fit(data.xtr, data.ytr) ypred = dtree.predict(data.xts) pr.save_model(dtree, 'decision_tree_cust') dtree = pr.load_model('decision_tree_cust') ypred = dtree.predict(data.xts) print('R2 train = %s' % eval.evaluate(data.ytr, dtree.predict(data.xtr))) print('R2 test = %s' % eval.evaluate(data.yts, ypred)) print("Plain Decision regression tree without bagging") it = 10 yy = [] for i in range(it):
def predict(self, X):
    """Return the mean of the predictions of all fitted base estimators."""
    preds = []
    for t in range(self.n_estimators):
        preds.append(self.estimators_[t].predict(X))
    # Average across estimators (axis 0 = estimator axis).
    return np.mean(np.array(preds), axis=0)


if __name__ == '__main__':
    # NOTE(review): datasets.load_boston() was removed in scikit-learn 1.2;
    # this script requires an older release.
    breast_data = datasets.load_boston()
    X, y = breast_data.data, breast_data.target
    # Fixed split: first 400 rows train, remainder test (no shuffling).
    X_train, y_train = X[:400], y[:400]
    X_test, y_test = X[400:], y[400:]
    # NOTE(review): random_state=True relies on bool being an int (True == 1);
    # an explicit integer seed would be clearer.
    tinyml_decisiontree_reg = tree.DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=5, random_state=True)
    tinyml_decisiontree_reg.fit(X_train, y_train)
    decisiontree_pred = tinyml_decisiontree_reg.predict(X_test)
    print('base estimator:', mean_squared_error(y_test, decisiontree_pred))
    # Home-grown random forest (class defined elsewhere in this project).
    tinyml_rf_reg = RandomForestRegressor(
        n_estimators=100, base_estimator=tree.DecisionTreeRegressor)
    tinyml_rf_reg.fit(X_train, y_train)
    y_pred = tinyml_rf_reg.predict(X_test)
    print('tinyml rf mse:', mean_squared_error(y_test, y_pred))
    # Truncated here: sklearn_rf_reg is constructed but fitted/used outside
    # this view.
    sklearn_rf_reg = ensemble.RandomForestRegressor(n_estimators=100, min_samples_leaf=5, min_samples_split=20, random_state=False)
def regression(X, Y, method='svm'):
    """Train and return a regressor of the requested kind.

    Parameters
    ----------
    X, Y : array-like
        Training features and targets, as accepted by scikit-learn ``fit``.
    method : str
        One of 'svm', 'tree', 'linear', 'knn', 'RFR', 'Adaboost', 'GBR',
        'Bag', 'ETR', 'MLP'.

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        For an unknown ``method``.  The original code left ``clf`` unbound in
        that case and crashed later with UnboundLocalError at ``clf.fit``.
    """
    print("=======开始训练分类器======")
    print('采用的分类器为', method)
    # Estimator submodules are imported lazily, as in the original, so only the
    # chosen branch pays the import cost.
    if method == 'svm':
        # 3. SVM regression
        clf = svm.SVR(gamma='auto')
    elif method == 'tree':
        # 1. Decision-tree regression
        from sklearn import tree
        clf = tree.DecisionTreeRegressor()
    elif method == 'linear':
        # 2. Linear regression
        from sklearn.linear_model import LinearRegression
        clf = LinearRegression()
    elif method == 'knn':
        # 4. k-nearest-neighbours regression
        from sklearn import neighbors
        clf = neighbors.KNeighborsRegressor()
    elif method == 'RFR':
        # 5. Random-forest regression (20 trees)
        from sklearn import ensemble
        clf = ensemble.RandomForestRegressor(n_estimators=20)
    elif method == 'Adaboost':
        # 6. AdaBoost regression (50 estimators)
        from sklearn import ensemble
        clf = ensemble.AdaBoostRegressor(n_estimators=50)
    elif method == 'GBR':
        # 7. Gradient-boosted regression trees (100 estimators)
        from sklearn import ensemble
        clf = ensemble.GradientBoostingRegressor(n_estimators=100)
    elif method == 'Bag':
        # 8. Bagging regression
        from sklearn import ensemble
        clf = ensemble.BaggingRegressor()
    elif method == 'ETR':
        # 9. Extremely-randomized tree regression
        from sklearn.tree import ExtraTreeRegressor
        clf = ExtraTreeRegressor()
    elif method == 'MLP':
        # Multi-layer perceptron regression
        from sklearn.neural_network import MLPRegressor
        clf = MLPRegressor(solver='adam', alpha=1e-5,
                           hidden_layer_sizes=(100, 4), random_state=1)
    else:
        raise ValueError('unknown method: %r' % (method,))
    clf.fit(X, Y)
    print("==========训练完毕=========")
    return clf
# Fit four regressors on (X, Y) and overlay their predictions on the grid Xp.

# Ordinary least squares.
reglr = linear_model.LinearRegression()
reglr.fit(X, Y)
Ylr = reglr.predict(Xp)

# Kernel ridge regression with an RBF kernel.
regkr = KernelRidge(kernel='rbf', gamma=0.1, alpha=0.1)
regkr.fit(X, Y)
Ykr = regkr.predict(Xp)

# Project-local kernel regression helper, evaluated at two bandwidth settings
# (semantics defined by `kernelregress`).
Yp1 = kernelregress(np.hstack((X, Y)), Xp, 10)
Yp2 = kernelregress(np.hstack((X, Y)), Xp, 1)

# Decision-tree regression.
min_samples_split = 3
regtree = tree.DecisionTreeRegressor(min_samples_split=min_samples_split).fit(X, Y)
Ytree = regtree.predict(Xp)

# Ground truth as points, each model as a curve.
plt.plot(X, Y, 'go', label='true')
plt.plot(Xp, Yp1, 'g', label='kerReg10')
plt.plot(Xp, Yp2, 'g:', label='kerReg1')
plt.plot(Xp, Ykr, 'r', label='KernRidge')
plt.plot(Xp, Ytree, 'b', label='tree')
plt.plot(Xp, Ylr, 'm', label='linregres')
plt.legend(loc=3)
plt.show()
# NOTE(review): fragment -- this is the tail of a model-name -> estimator dict
# whose opening brace lies above this view; the commented-out entries were
# disabled because they raised errors ("报错").
'gradient boosting': ensemble.GradientBoostingRegressor(),
# 'gaussian':gaussian_process.GaussianProcessRegressor(),报错
# 'isotonic':isotonic.IsotonicRegression(),报错
'kernelridge': kernel_ridge.KernelRidge(),
'ARD': linear_model.ARDRegression(),
'bayesianridge': linear_model.BayesianRidge(),
# 'elasticnet':linear_model.ElasticNet(),#报错
'HuberRegressor': linear_model.HuberRegressor(),
'LinearRegression': linear_model.LinearRegression(),
# 'logistic':linear_model.LogisticRegression(),报错
# 'linear_model.RidgeClassifier':linear_model.RidgeClassifier(),报错
'k-neighbor': neighbors.KNeighborsRegressor(),
'SVR': svm.LinearSVR(),
'NUSVR': svm.NuSVR(),
'extra tree': tree.ExtraTreeRegressor(),
'decesion tree': tree.DecisionTreeRegressor(),
# 'random losgistic':linear_model.RandomizedLogisticRegression(),报错
# 'dummy':dummy.DummyRegressor()报错
}
# NOTE(review): StratifiedKFold is designed for classification labels; confirm
# it is appropriate given the regression-style y constructed below.
cv = StratifiedKFold(n_splits=5)
i = 0
X = train_data
#y=probs
#z=labels[:,2]
# y is simply the row index of each sample -- NOTE(review): verify that using
# range(len(X)) as the target is intentional.
y = np.array(range(len(X)))
#y=int("".join(list(map(str,))))
#from functools import reduce
#y=reduce(lambda x,y: 10*x+y, (y))
emotion = ['P', 'A', 'D']
def fit(self, X, y):
    """Fit the underlying decision-tree regressor.

    X is coerced to a C-contiguous float32 array, and y is converted to a
    0/1 indicator matrix by the class's own encoder before fitting.
    """
    features = np.asanyarray(X, dtype='f', order='C')
    targets = self._y_to_one_zero_mat(y)
    self.model = tree.DecisionTreeRegressor()
    self.model.fit(features, targets)
# 70/15/15 split of the shuffled indices into train / validation / test.
x_train = x[random_indices[:70]]
y_train = y[random_indices[:70]]
# Validation set
x_val = x[random_indices[70:85]]
y_val = y[random_indices[70:85]]
# Test set
x_test = x[random_indices[85:]]
y_test = y[random_indices[85:]]

# Candidate tree depths 1..10.
maximum_depth_of_tree = np.arange(10) + 1
# NOTE(review): these accumulators are never filled within this view -- the
# loop body is truncated here and continues outside it.
train_err_arr = []
val_err_arr = []
test_err_arr = []
for depth in maximum_depth_of_tree:
    model = tree.DecisionTreeRegressor(max_depth=depth)
    # sklearn takes the inputs as matrices, hence we reshape the arrays into
    # column matrices.
    x_train_for_line_fitting = np.matrix(x_train.reshape(len(x_train), 1))
    y_train_for_line_fitting = np.matrix(y_train.reshape(len(y_train), 1))
    # Fit the tree to the training data.
    model.fit(x_train_for_line_fitting, y_train_for_line_fitting)
    # Plot the fitted curve over the full x range.
    plt.figure()
    plt.scatter(x_train, y_train, color='black')
    plt.plot(x.reshape((len(x), 1)), model.predict(x.reshape((len(x), 1))), color='blue')
    plt.xlabel('x-input feature')
    plt.ylabel('y-target values')
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
import time
import warnings, math


def ignore_warn(*args, **kwargs):
    """No-op replacement for warnings.warn."""
    pass


# Silence noisy warnings (from sklearn and seaborn) by monkey-patching
# warnings.warn.  NOTE(review): this suppresses ALL warnings process-wide.
warnings.warn = ignore_warn

# Specify a Chinese font so matplotlib titles render correctly on Windows.
# Fixed: use a raw string for the Windows path -- "\W", "\F" and "\s" are not
# recognized escapes, so the runtime bytes are identical, but the non-raw form
# emits invalid-escape DeprecationWarnings (and breaks if a real escape like
# "\t" ever appears in the path).
myfont = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')

# Model selection
# 1. Decision-tree regression
from sklearn import tree
model_decision_tree_regression = tree.DecisionTreeRegressor()
# 2. Linear regression
from sklearn.linear_model import LinearRegression
model_linear_regression = LinearRegression()
# 3. SVM regression
from sklearn import svm
model_svm = svm.SVR()
# 4. kNN regression
from sklearn import neighbors
model_k_neighbor = neighbors.KNeighborsRegressor()
# 5. Random-forest regression (the model built from `ensemble` is constructed
# outside this view).
from sklearn import ensemble
def treeRegression(train, trainLable, testData):
    """Fit a decision-tree regressor on the training data and return its
    predictions for ``testData``."""
    regressor = tree.DecisionTreeRegressor()
    regressor.fit(train, trainLable)
    return regressor.predict(testData)
from sklearn.model_selection import cross_validate
import pickle as pik
import numpy as np
import matplotlib.pyplot as plt
import graphviz

# Load the cross-validation results and the held-out train/test splits.
decisiontree_cross_val_results = pd.read_csv(
    "DecisionTree_full_crossval_results.csv")
output_test_x = pd.read_csv("output_data/output_test_x.csv")
output_test_y = pd.read_csv("output_data/output_test_y.csv")
output_train_x = pd.read_csv("output_data/output_train_x.csv")
output_train_y = pd.read_csv("output_data/output_train_y.csv")

# Refit the best tree found by cross-validation, export its structure, and
# persist the fitted model.
clf = tree.DecisionTreeRegressor(max_leaf_nodes=26, max_depth=19)
clf.fit(output_train_x, output_train_y['ARRIVAL_DELAY'])
dot_data = tree.export_graphviz(clf, out_file='DecisionTree.dot')
graph = graphviz.Source(dot_data)
# Fixed: close the pickle file deterministically instead of leaking the handle.
with open('best_DecisionTree_model.pickle', 'wb') as model_file:
    pik.dump(clf, model_file)

# Score on the test set.
# Fixed two bugs: r2_score(y_true, y_pred) expects the ground truth FIRST
# (R^2 is not symmetric in its arguments), and the old code rebound the name
# `r2_score`, shadowing the imported metric function.
predict_val = clf.predict(output_test_x)
test_r2 = r2_score(output_test_y, predict_val)
print('decision tree test score using r-squared metric is')
print(test_r2)

# Select the cross-validation rows matching the chosen hyper-parameters so the
# models can be compared on a common dataframe.
decisiontree_cross_val_best_test = decisiontree_cross_val_results.loc[
    decisiontree_cross_val_results['max_leaf_nodes'] == 26, :]
decisiontree_cross_val_best_test = decisiontree_cross_val_best_test.loc[
    decisiontree_cross_val_best_test['max_depth'] == 19, :]
#testRegressor( train, ARDRegression() , target, 'ARDRegression' )

# Passive-aggressive regressors, one run per supported loss.
for pa_loss, pa_label in (
        ('epsilon_insensitive', 'PassiveAggressiveRegressor'),
        ('squared_epsilon_insensitive', 'PassiveAggressiveRegressor squared loss')):
    testRegressor(train, linear_model.PassiveAggressiveRegressor(loss=pa_loss),
                  target, pa_label)

# Support Vector machines, one run per kernel.
for svm_kernel in ('poly', 'rbf', 'sigmoid'):
    testRegressor(train, svm.SVR(kernel=svm_kernel), target, 'SVM ' + svm_kernel)

# Nearest neighbors over a range of neighbourhood sizes.
for k in (1, 2, 3, 4, 8, 16, 32):
    testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=k),
                  target, 'NearestNeighbor %d' % k)

# Gaussian process
# testRegressor( train, gaussian_process.GaussianProcess(), target, 'Gaussian process' )

# Regression trees and tree ensembles.
testRegressor(train, tree.DecisionTreeRegressor(), target, 'Regression tree')
testRegressor(train, ensemble.RandomForestRegressor(), target, 'RandomForestRegressor')
testRegressor(train, ensemble.ExtraTreesRegressor(), target, 'ExtraTreesRegressor')

# Gradient tree Boosting
#testRegressor( train, ensemble.GradientBoostingRegressor(loss='ls'), target, 'Gradient tree boosting' )
""" demo07_house.py 预测房屋价格 """ import sklearn.datasets as sd import sklearn.utils as su import sklearn.tree as st import sklearn.metrics as sm # 读取数据 加载波士顿房屋价格 boston = sd.load_boston() print(boston.data.shape) # 数据的维度 print(boston.feature_names) # 数据的特征名 print(boston.target.shape) # 划分测试集与训练集 # 打乱数据集 # 以random_state随机种子作为参数生成数据集 x, y = su.shuffle(boston.data, boston.target, random_state=7) train_size = int(len(x) * 0.8) train_x, test_x, train_y, test_y = \ x[:train_size], x[train_size:], \ y[:train_size], y[train_size:] # 创建决策树回归器模型,使用训练集训练模型, # 测试集测试模型 model = st.DecisionTreeRegressor(max_depth=6) model.fit(train_x, train_y) pred_test_y = model.predict(test_x) print(sm.r2_score(test_y, pred_test_y))
# fig.show()

# Binary-encode the ramp-metering flag in place.
lb = preprocessing.LabelBinarizer()
prebreakdown_df_meta_simple_merge.ramp_metering = lb.fit_transform(
    prebreakdown_df_meta_simple_merge.ramp_metering)
lb.classes_                # inspect the learned classes (value discarded)
lb.get_params()
lb.inverse_transform(prebreakdown_df_meta_simple_merge.ramp_metering)

# Target: pre-breakdown volume.  Features: every column except the target and
# the estimated-capacity column.
y = prebreakdown_df_meta_simple_merge.prebreakdown_vol
excluded = {"prebreakdown_vol", "estimated_capacity_veh_hr_ln"}
X = prebreakdown_df_meta_simple_merge[
    [col for col in prebreakdown_df_meta_simple_merge.columns
     if col not in excluded]]

# Fit a depth-limited regression tree and render it with graphviz.
max_depth_ = 6
clf = tree.DecisionTreeRegressor(max_depth=max_depth_)
clf = clf.fit(X, y)
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=X.columns,
    class_names=[y.name],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render(
    os.path.join(path_figures, f"all_simple_merge_tree_depth_{max_depth_}"))
#print type(X2[0]),type(X1[0]),type(y2[0]),type(y1[0]) #print X2[0] #print y2[0] #y2 = [[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1]] import numpy as np a = np.array(X2) b = np.array(y2) test = encode_bases_5(records[0].seq) from sklearn import tree dtr = tree.DecisionTreeRegressor() dtr.fit(X2, y2) a = dtr.predict(test) known = encode_bases_5(records[1].seq) from sklearn import cross_validation dtrscores = cross_validation.cross_val_score(dtr, X2, y2) print 'decision trees', dtrscores from sklearn import ensemble rfr = ensemble.RandomForestRegressor() rfr.fit(X2, y2) rfrscores = cross_validation.cross_val_score(rfr, X2, y2) print 'random forests: ', rfrscores
# NOTE(review): fragment -- the add_argument(...) call these keyword arguments
# belong to is opened above this view.  This is a Python 2 script
# (`print evaluation` below).
        type=int, default=4,
        help='max depth to grow the tree to')
    return parser


if __name__ == "__main__":
    args = setup(make_parser)

    # Read data.
    # NOTE(review): DataFrame.sort was removed in pandas 0.20 -- this requires
    # an old pandas (sort_values is the modern equivalent).
    tokeep = \
        ['grdpts', 'sid', 'cid', 'termnum', 'major', 'sterm', 'cohort', 'cs']
    tokeep += RVALS
    data = pd.read_csv(args.data_file, usecols=tokeep).sort(['sid', 'termnum'])

    # Build classifier: AdaBoost over depth-limited regression trees, wrapped
    # by the project's sklearn_model helper.
    clf = sklearn_model(
        ensemble.AdaBoostRegressor,
        base_estimator=tree.DecisionTreeRegressor(max_depth=args.max_depth),
        n_estimators=args.n_estimators)
    results = method_error(data, clf, True, predict_cold_start=args.cold_start)
    # Group the evaluation by the plot dimension, falling back to cold-start /
    # term number.
    by = args.plot if args.plot else ('cs' if args.cold_start else 'termnum')
    evaluation = eval_results(results, by=by)
    print evaluation

    if args.plot == 'pred':
        g1, g2 = plot_predictions(results)
    elif args.plot in ['termnum', 'sterm', 'cohort']:
        ax1, ax2 = plot_error_by(args.plot, results)
# Motorcycle-impact data: one predictor (times) and one response (accel).
mcycle = pd.read_csv('data/mcycle.csv')
Xm = mcycle['times'].values.reshape(-1, 1)
ym = mcycle['accel'].values


def mcycle_plot(mod, title, **kwargs):
    """Scatter the raw data and overlay `mod`'s predictions on a 0-60s grid."""
    plt.scatter(mcycle['times'], mcycle['accel'], **kwargs)
    plt.xlabel("seconds", fontsize=16)
    plt.ylabel("acceleration", fontsize=16)
    plt.title(title, fontsize=18, y=1.05)
    grid = np.arange(0, 60, 1).reshape(-1, 1)
    plt.plot(grid, mod.predict(grid), color="red", linewidth=2)


# Plain CART fit.
mcycle_dt = tree.DecisionTreeRegressor(min_samples_leaf=5)
mcycle_dt.fit(Xm, ym)

# Single tree fit with exponential random sample weights (one "bayesian cart"
# draw, per the surrounding experiment).
omega = rn.exponential(1, mcycle.shape[0])
mcycle_bt = tree.DecisionTreeRegressor(min_samples_leaf=5)
mcycle_bt.fit(Xm, ym, sample_weight=omega)

# Forest of 100 trees.  NOTE(review): bootstrap=2 is passed where a boolean is
# expected (truthy, so it acts like True) -- confirm this is intentional.
mcycle_bf = ensemble.RandomForestRegressor(100, min_samples_leaf=5, bootstrap=2)
mcycle_bf.fit(Xm, ym)

fig = plt.figure(figsize=(16, 4))
fig.add_subplot(1, 3, 1)
mcycle_plot(mcycle_dt, "sample CART tree")
import numpy as np

# Tiny hand-made training set: twelve 2-D points with targets in {3, 4}.
x_train = np.array([[-3, 7], [1, 5], [1, 2], [-2, 0], [2, 3], [-4, 0],
                    [-1, 1], [1, 1], [-2, 2], [2, 7], [-4, 1], [-2, 7]])
y_train = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])
x_test = np.array([[1, 2], [3, 9]])

# DECISION TREE
from sklearn import tree

model = tree.DecisionTreeRegressor()
model.fit(x_train, y_train)
model.score(x_train, y_train)  # training score; result computed but not kept
predicted_output = model.predict(x_test)
print(predicted_output)
# NOTE(review): fragment -- `dt`, the training/testing sets and `math` are
# defined above this view; the final loop is truncated below.

# Evaluate the (already constructed) decision-tree classifier.
dt.fit(training_set, training_set_predicted)
correct_prediction = 0
predicted_class_set = dt.predict(testing_set)
#print("testing_set_target",testing_set_predicted)
#print("testing_set actually predicted",predicted_class_set)
for i in range(len(testing_set)):
    original_class = testing_set_predicted[i]
    predicted_class = predicted_class_set[i]
    if (predicted_class == original_class):
        correct_prediction += 1
print("Decision Tree Classifier Accuracy:", float(correct_prediction) / float(len(testing_set)))

#################################Decision tree regressor for continuous data#############################
training_set_predicted_unlog = np.array(
    training_set_predicted_unlog)  # prevent unhashable error
dtr = tree.DecisionTreeRegressor(
)  # using the default param setting will be fine
dtr.fit(training_set, training_set_predicted_unlog)
correct_prediction = 0
absolute_error = 0.0  # accumulate error; filled in outside this view
predicted_class_set = dtr.predict(testing_set)
for i in range(len(testing_set)):
    # now the class becomes a continuous value
    # consider the continuous data
    original_class_cts = testing_set_predicted_unlog[i]
    predicted_class_cts = predicted_class_set[i]
    # consider the discrete data (transform from cts data back to the discrete
    # one, which is the class)
    original_class = testing_set_predicted[i]
    # Map the continuous prediction back to a log10 "class"; guard the zero
    # case to prevent a math domain error on log(0).
    if predicted_class_set[i] != 0.0:
        predicted_class = int(math.log10(predicted_class_set[i]))
    else:
        predicted_class = int(predicted_class_set[i])
def predict(self, X):
    """Boosted prediction: start from the stored mean and add each
    estimator's contribution scaled by the learning rate ``eta``."""
    y_pred = np.ones((X.shape[0], )) * self.mean
    for t in range(self.n_estimators):
        y_pred += (self.eta * self.estimators_[t].predict(X))
    return y_pred


if __name__ == '__main__':
    # NOTE(review): datasets.load_boston() was removed in scikit-learn 1.2;
    # this script requires an older release.
    breast_data = datasets.load_boston()
    X, y = breast_data.data, breast_data.target
    # Fixed split: first 400 rows train, remainder test (no shuffling).
    X_train, y_train = X[:400], y[:400]
    X_test, y_test = X[400:], y[400:]
    # NOTE(review): random_state=False relies on bool being an int (False == 0).
    sklearn_decisiontree_reg = tree.DecisionTreeRegressor(min_samples_split=15, min_samples_leaf=5, random_state=False)
    sklearn_decisiontree_reg.fit(X_train, y_train)
    decisiontree_pred = sklearn_decisiontree_reg.predict(X_test)
    print('base estimator:', mean_squared_error(y_test, decisiontree_pred))
    # Home-grown gradient-boosting implementation (defined in this project).
    tinyml_gbdt_reg = XGBRegressor(n_estimators=100, max_depth=3, gamma=0.)
    tinyml_gbdt_reg.fit(X_train, y_train)
    y_pred = tinyml_gbdt_reg.predict(X_test)
    print('tinyml mse:', mean_squared_error(y_test, y_pred))
    # Truncated here: xgb_reg is constructed but fitted/used outside this view.
    xgb_reg = xgb.sklearn.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, gamma=0, reg_lambda=1)
# NOTE(review): fragment -- this lambda is the tail of a groupby/apply call
# (and `i`, `gc`, `gn`, `base` come from a loop) opened above this view.
        lambda df: len(set(df[i]))).reset_index())
    tp.columns = ['uid', i + '_dstc']
    # First iteration seeds gc; later iterations left-join on uid.
    if gc.empty == True:
        gc = tp
    else:
        gc = pd.merge(gc, tp, on='uid', how='left')

# NOTE(review): gc is merged into the frame twice -- once via the chained
# .merge(gc, on='uid') and again via pd.merge(fn, gc, ...) -- which duplicates
# its columns with _x/_y suffixes; confirm the second merge is intentional.
fn = base.merge(gn, on='uid').merge(gc, on='uid')
fn = pd.merge(fn, gc, on='uid')
fn.shape

# Drop identifiers/date/label columns for the feature matrix; target = bad_ind.
x = fn.drop(['uid', 'oil_actv_dt', 'create_dt', 'bad_ind', 'class_new'], axis=1)
y = fn.bad_ind.copy()

# Shallow regression tree used to find split points on the features.
from sklearn import tree
dtree = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=500, min_samples_split=5000)
dtree = dtree.fit(x, y)

# Render the tree with graphviz (Windows graphviz install path appended).
import pydotplus
from IPython.display import Image
from sklearn.externals.six import StringIO
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
dot_data = StringIO()
tree.export_graphviz(dtree, out_file=dot_data,
                     feature_names=x.columns,
                     class_names=['bad_ind'],
                     filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# =============================================================================
# Assemble the model matrix / target vector and hold out 30% for testing.
# =============================================================================
X = df[predictors].values
y = df[target_column].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)
print(X_train.shape)
print(X_test.shape)

# =============================================================================
# decision tree regressor
# =============================================================================
dtree = tree.DecisionTreeRegressor(max_depth=5,
                                   min_samples_leaf=0.13,
                                   random_state=3)
dtree.fit(X_train, y_train)
tree.plot_tree(dtree)

# =============================================================================
# Training-set fit: RMSE followed by R^2.
# =============================================================================
pred_train_tree = dtree.predict(X_train)
print(np.sqrt(mean_squared_error(y_train, pred_train_tree)))
print(r2_score(y_train, pred_train_tree))

# =============================================================================
# predict on the test data
# NOTE(review): the two methods below belong to a custom DecisionTreeRegressor
# class whose header lies outside this view; indentation reconstructed.
def fit(self, X, y):
    # Track the deepest level reached while recursively building the tree.
    self._depth = 0
    self.tree = self._build_tree(X, y)
    #print(self._depth)

def predict(self, X):
    # self.tree is used as a callable mapping one sample to a prediction.
    return [self.tree(X[i]) for i in range(len(X))]


if __name__ == '__main__':
    # Noise-free sine data on [-3, 3], shuffled before fitting.
    x = np.linspace(-3, 3, 100).reshape(-1, 1)
    y = np.sin(x)  # + np.random.rand(len(x))
    index = np.arange(len(x))
    np.random.shuffle(index)
    #print(x)
    x = x[index]
    y = y[index]
    plt.scatter(x, y)
    #print(x)
    # Custom tree (positional args presumably max depth and an impurity
    # threshold -- confirm against the class __init__).
    t1 = DecisionTreeRegressor(4, 0.001)
    t1.fit(x, y)
    plt.scatter(x, t1.predict(x))
    # NOTE(review): min_impurity_split is deprecated/removed in recent
    # scikit-learn; this comparison requires an older release.
    t2 = tree.DecisionTreeRegressor(max_depth=4, min_impurity_split=0.001)
    t2.fit(x, y)
    plt.scatter(x, t2.predict(x))
    plt.show()