Example #1
# Build training & test sets
#
data = sb.SetBuilder(
    target='NumberOfCustomers',
    autoexclude=True,
    dataset='best_for_customers.csv',
).exclude('NumberOfSales', 'Month').build()

# data = sb.SetBuilder(target='NumberOfSales', autoexclude=True, dataset='mean_var_on_cust_from_tain.csv').build()

# Fit a decision tree regressor

depth = 8

dtree = tree.DecisionTreeRegressor(max_depth=depth)
dtree.fit(data.xtr, data.ytr)
ypred = dtree.predict(data.xts)

pr.save_model(dtree, 'decision_tree_cust')

dtree = pr.load_model('decision_tree_cust')
ypred = dtree.predict(data.xts)

print('R2 train = %s' % eval.evaluate(data.ytr, dtree.predict(data.xtr)))
print('R2 test = %s' % eval.evaluate(data.yts, ypred))
print("Plain Decision regression tree without bagging")

it = 10
yy = []
for i in range(it):
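    # Hypothetical completion (the source is truncated here): bag `it`
    # depth-8 trees on bootstrap resamples and average their test
    # predictions. Assumes numpy is available as np and that
    # data.xtr / data.ytr are indexable arrays.
    idx = np.random.randint(0, len(data.xtr), len(data.xtr))  # bootstrap sample
    bag_tree = tree.DecisionTreeRegressor(max_depth=depth)
    bag_tree.fit(data.xtr[idx], data.ytr[idx])
    yy.append(bag_tree.predict(data.xts))
ypred_bag = np.mean(yy, axis=0)  # ensemble average over the `it` trees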
Example #2
    def predict(self, X):
        preds = []
        for t in range(self.n_estimators):
            preds.append(self.estimators_[t].predict(X))
        return np.mean(np.array(preds), axis=0)


if __name__ == '__main__':
    boston_data = datasets.load_boston()
    X, y = boston_data.data, boston_data.target
    X_train, y_train = X[:400], y[:400]
    X_test, y_test = X[400:], y[400:]

    tinyml_decisiontree_reg = tree.DecisionTreeRegressor(min_samples_split=20,
                                                         min_samples_leaf=5,
                                                         random_state=1)
    tinyml_decisiontree_reg.fit(X_train, y_train)
    decisiontree_pred = tinyml_decisiontree_reg.predict(X_test)
    print('base estimator:', mean_squared_error(y_test, decisiontree_pred))

    tinyml_rf_reg = RandomForestRegressor(
        n_estimators=100, base_estimator=tree.DecisionTreeRegressor)
    tinyml_rf_reg.fit(X_train, y_train)
    y_pred = tinyml_rf_reg.predict(X_test)
    print('tinyml rf mse:', mean_squared_error(y_test, y_pred))

    sklearn_rf_reg = ensemble.RandomForestRegressor(n_estimators=100,
                                                    min_samples_leaf=5,
                                                    min_samples_split=20,
                                                    random_state=0)
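    # Presumable continuation (truncated in the source): fit the
    # scikit-learn forest and compare its test MSE with tinyml's.
    sklearn_rf_reg.fit(X_train, y_train)
    sklearn_pred = sklearn_rf_reg.predict(X_test)
    print('sklearn rf mse:', mean_squared_error(y_test, sklearn_pred))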
Example #3
def regression(X, Y, method='svm'):
    '''
    Train a regressor of the selected type on X, Y.
    '''
    print("====== training the regressor ======")
    print('selected method:', method)
    # Method selection
    if method == 'svm':
        # 3. SVM regression
        clf = svm.SVR(gamma='auto')
    elif method == 'tree':
        # 1. Decision tree regression
        from sklearn import tree
        clf = tree.DecisionTreeRegressor()
    elif method == 'linear':
        # 2. Linear regression
        from sklearn.linear_model import LinearRegression
        clf = LinearRegression()
    elif method == 'knn':
        # 4. kNN regression
        from sklearn import neighbors
        clf = neighbors.KNeighborsRegressor()
    elif method == 'RFR':
        # 5. Random forest regression
        from sklearn import ensemble
        clf = ensemble.RandomForestRegressor(n_estimators=20)  # 20 decision trees
    elif method == 'Adaboost':
        # 6. AdaBoost regression
        from sklearn import ensemble
        clf = ensemble.AdaBoostRegressor(n_estimators=50)  # 50 base estimators
    elif method == 'GBR':
        # 7. GBRT (gradient boosting) regression
        from sklearn import ensemble
        clf = ensemble.GradientBoostingRegressor(n_estimators=100)  # 100 boosting stages
    elif method == 'Bag':
        # 8. Bagging regression
        from sklearn import ensemble
        clf = ensemble.BaggingRegressor()
    elif method == 'ETR':
        # 9. ExtraTree (extremely randomized tree) regression
        from sklearn.tree import ExtraTreeRegressor
        clf = ExtraTreeRegressor()
    elif method == 'MLP':
        # Multi-layer perceptron regression
        from sklearn.neural_network import MLPRegressor
        clf = MLPRegressor(solver='adam', alpha=1e-5,
                           hidden_layer_sizes=(100, 4), random_state=1)
    else:
        raise ValueError('unknown method: %s' % method)

    clf.fit(X, Y)
    print("====== training finished ======")
    return clf
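
# Usage sketch (not in the source): exercise regression() on synthetic
# data. Assumes numpy is available as np; X_demo / Y_demo are hypothetical.
X_demo = np.random.rand(100, 3)
Y_demo = X_demo @ np.array([1.0, -2.0, 0.5])
model = regression(X_demo, Y_demo, method='tree')
print(model.predict(X_demo[:5]))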
Example #4
# Linear Regression
reglr = linear_model.LinearRegression()
reglr.fit(X,Y)
Ylr = reglr.predict(Xp)

# Kernel Ridge Regression
regkr = KernelRidge(kernel='rbf', gamma=0.1,alpha=0.1)
regkr.fit(X,Y)
Ykr = regkr.predict(Xp)

# Kernel Regression
Yp1 = kernelregress(np.hstack((X,Y)),Xp,10)
Yp2 = kernelregress(np.hstack((X,Y)),Xp,1)

# Decision Tree Regressor
min_samples_split = 3
regtree = tree.DecisionTreeRegressor(min_samples_split=min_samples_split)
regtree = regtree.fit(X, Y)
Ytree = regtree.predict(Xp)


plt.plot(X,Y,'go',label='true')
plt.plot(Xp,Yp1,'g',label='kerReg10')
plt.plot(Xp,Yp2,'g:',label='kerReg1')
plt.plot(Xp,Ykr,'r',label='KernRidge')
plt.plot(Xp,Ytree,'b',label='tree')
plt.plot(Xp,Ylr,'m',label='linregres')
plt.legend( loc = 3 )

plt.show()
Example #5
    'gradient boosting': ensemble.GradientBoostingRegressor(),
    #        'gaussian': gaussian_process.GaussianProcessRegressor(),  # errors out
    #        'isotonic': isotonic.IsotonicRegression(),  # errors out
    'kernelridge': kernel_ridge.KernelRidge(),
    'ARD': linear_model.ARDRegression(),
    'bayesianridge': linear_model.BayesianRidge(),
    #        'elasticnet': linear_model.ElasticNet(),  # errors out
    'HuberRegressor': linear_model.HuberRegressor(),
    'LinearRegression': linear_model.LinearRegression(),
    #        'logistic': linear_model.LogisticRegression(),  # errors out
    #        'linear_model.RidgeClassifier': linear_model.RidgeClassifier(),  # errors out
    'k-neighbor': neighbors.KNeighborsRegressor(),
    'SVR': svm.LinearSVR(),
    'NUSVR': svm.NuSVR(),
    'extra tree': tree.ExtraTreeRegressor(),
    'decision tree': tree.DecisionTreeRegressor(),
    #        'random logistic': linear_model.RandomizedLogisticRegression(),  # errors out
    #        'dummy': dummy.DummyRegressor()  # errors out
}

cv = StratifiedKFold(n_splits=5)
i = 0
X = train_data
#y=probs
#z=labels[:,2]
y = np.array(range(len(X)))

#y=int("".join(list(map(str,))))
#from functools import reduce
#y=reduce(lambda x,y: 10*x+y, (y))
emotion = ['P', 'A', 'D']
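
# Sketch (not from the source): score each candidate regressor with K-fold
# CV. `models` stands in for the truncated name of the dictionary above,
# and y_cont for a real-valued target (the snippet's y is just a row index).
# KFold, rather than StratifiedKFold, is the usual choice for regression.
from sklearn.model_selection import KFold, cross_val_score

for name, reg in models.items():
    scores = cross_val_score(reg, X, y_cont, cv=KFold(n_splits=5))
    print(name, scores.mean())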
Example #6
 def fit(self, X, y):
     X = np.asanyarray(X, dtype='f', order='C')
     Y = self._y_to_one_zero_mat(y)
     
     self.model = tree.DecisionTreeRegressor()
     self.model.fit(X, Y)
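
 # Sketch (not in the source): the matching predict step, assuming
 # _y_to_one_zero_mat one-hot encoded the classes into columns of Y.
 def predict(self, X):
     X = np.asanyarray(X, dtype='f', order='C')
     scores = self.model.predict(X)  # one column of scores per class
     return scores.argmax(axis=1)    # map back to class indices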
Example #7
x_train = x[random_indices[:70]]
y_train = y[random_indices[:70]]
# Validation set
x_val = x[random_indices[70:85]]
y_val = y[random_indices[70:85]]
# Test set
x_test = x[random_indices[85:]]
y_test = y[random_indices[85:]]

maximum_depth_of_tree = np.arange(10) + 1
train_err_arr = []
val_err_arr = []
test_err_arr = []

for depth in maximum_depth_of_tree:
    model = tree.DecisionTreeRegressor(max_depth=depth)
    # sklearn takes the inputs as matrices, hence we reshape the arrays into column matrices
    x_train_for_line_fitting = np.matrix(x_train.reshape(len(x_train), 1))
    y_train_for_line_fitting = np.matrix(y_train.reshape(len(y_train), 1))

    # Fit the line to the training data
    model.fit(x_train_for_line_fitting, y_train_for_line_fitting)

    # Plot the line
    plt.figure()
    plt.scatter(x_train, y_train, color='black')
    plt.plot(x.reshape((len(x), 1)),
             model.predict(x.reshape((len(x), 1))),
             color='blue')
    plt.xlabel('x-input feature')
    plt.ylabel('y-target values')
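
    # Presumable continuation (truncated in the source): record the errors
    # that train_err_arr / val_err_arr / test_err_arr were declared for.
    # Assumes sklearn.metrics.mean_squared_error is imported.
    train_err_arr.append(mean_squared_error(
        y_train, model.predict(x_train.reshape(-1, 1))))
    val_err_arr.append(mean_squared_error(
        y_val, model.predict(x_val.reshape(-1, 1))))
    test_err_arr.append(mean_squared_error(
        y_test, model.predict(x_test.reshape(-1, 1))))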
Example #8
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
import time
import warnings,math
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

# Specify a font to fix garbled Chinese titles in matplotlib
myfont = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')


# Method selection
# 1. Decision tree regression
from sklearn import tree
model_decision_tree_regression = tree.DecisionTreeRegressor()

# 2. Linear regression
from sklearn.linear_model import LinearRegression
model_linear_regression = LinearRegression()

# 3. SVM regression
from sklearn import svm
model_svm = svm.SVR()

# 4. kNN regression
from sklearn import neighbors
model_k_neighbor = neighbors.KNeighborsRegressor()

# 5. Random forest regression
from sklearn import ensemble
Example #9
def treeRegression(train, trainLabel, testData):
    clf = tree.DecisionTreeRegressor()
    clf.fit(train, trainLabel)
    predict = clf.predict(testData)
    return predict
Example #10
from sklearn.model_selection import cross_validate
from sklearn import tree
from sklearn.metrics import r2_score
import pickle as pik
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import graphviz

#load in data
decisiontree_cross_val_results = pd.read_csv(
    "DecisionTree_full_crossval_results.csv")
output_test_x = pd.read_csv("output_data/output_test_x.csv")
output_test_y = pd.read_csv("output_data/output_test_y.csv")
output_train_x = pd.read_csv("output_data/output_train_x.csv")
output_train_y = pd.read_csv("output_data/output_train_y.csv")

#test, score, and save tree structure
clf = tree.DecisionTreeRegressor(max_leaf_nodes=26, max_depth=19)
clf.fit(output_train_x, output_train_y['ARRIVAL_DELAY'])
# with out_file=None, export_graphviz returns the dot source as a string
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)

pik.dump(clf, open('best_DecisionTree_model.pickle', 'wb'))
predict_val = clf.predict(output_test_x)
score = r2_score(output_test_y, predict_val)  # r2_score takes (y_true, y_pred)
print('decision tree test score using r-squared metric is')
print(score)

#create full dataframe for testing scores from the various models
decisiontree_cross_val_best_test = decisiontree_cross_val_results.loc[
    decisiontree_cross_val_results['max_leaf_nodes'] == 26, :]
decisiontree_cross_val_best_test = decisiontree_cross_val_best_test.loc[
    decisiontree_cross_val_best_test['max_depth'] == 19, :]
Example #11
# testRegressor(train, ARDRegression(), target, 'ARDRegression')

testRegressor(train, linear_model.PassiveAggressiveRegressor(loss='epsilon_insensitive'), target, 'PassiveAggressiveRegressor')
testRegressor(train, linear_model.PassiveAggressiveRegressor(loss='squared_epsilon_insensitive'), target, 'PassiveAggressiveRegressor squared loss')

# Support Vector machines
testRegressor(train, svm.SVR(kernel='poly'), target, 'SVM poly')
testRegressor(train, svm.SVR(kernel='rbf'), target, 'SVM rbf')
testRegressor(train, svm.SVR(kernel='sigmoid'), target, 'SVM sigmoid')

# Nearest neighbors
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=1), target, 'NearestNeighbor 1')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=2), target, 'NearestNeighbor 2')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=3), target, 'NearestNeighbor 3')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=4), target, 'NearestNeighbor 4')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=8), target, 'NearestNeighbor 8')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=16), target, 'NearestNeighbor 16')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=32), target, 'NearestNeighbor 32')

# Gaussian process
# testRegressor(train, gaussian_process.GaussianProcess(), target, 'Gaussian process')

# Regression trees
testRegressor(train, tree.DecisionTreeRegressor(), target, 'Regression tree')
testRegressor(train, ensemble.RandomForestRegressor(), target, 'RandomForestRegressor')
testRegressor(train, ensemble.ExtraTreesRegressor(), target, 'ExtraTreesRegressor')

# Gradient tree Boosting
# testRegressor(train, ensemble.GradientBoostingRegressor(loss='ls'), target, 'Gradient tree boosting')
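
# testRegressor itself is not shown in this snippet; a minimal stand-in
# consistent with how it is called above (it would need to be defined
# before the calls) might be:
from sklearn.model_selection import cross_val_score

def testRegressor(train, estimator, target, label):
    scores = cross_val_score(estimator, train, target, cv=5)
    print('%s: mean CV score %.4f' % (label, scores.mean()))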
Example #12
"""
demo07_house.py  预测房屋价格
"""
import sklearn.datasets as sd
import sklearn.utils as su
import sklearn.tree as st
import sklearn.metrics as sm

# 读取数据  加载波士顿房屋价格
boston = sd.load_boston()
print(boston.data.shape)  # 数据的维度
print(boston.feature_names)  # 数据的特征名
print(boston.target.shape)

# 划分测试集与训练集
# 打乱数据集
# 以random_state随机种子作为参数生成数据集
x, y = su.shuffle(boston.data, boston.target, random_state=7)
train_size = int(len(x) * 0.8)
train_x, test_x, train_y, test_y = \
    x[:train_size], x[train_size:], \
    y[:train_size], y[train_size:]

# 创建决策树回归器模型,使用训练集训练模型,
# 测试集测试模型
model = st.DecisionTreeRegressor(max_depth=6)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print(sm.r2_score(test_y, pred_test_y))
# fig.show()
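
# Note (not in the source): load_boston was removed in scikit-learn 1.2.
# A sketch of the same setup on the California housing data instead:
housing = sd.fetch_california_housing()
x_alt, y_alt = su.shuffle(housing.data, housing.target, random_state=7)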

Example #13
# imports implied by this fragment (not shown in the source)
from sklearn import preprocessing, tree
import graphviz
import os

lb = preprocessing.LabelBinarizer()
prebreakdown_df_meta_simple_merge.ramp_metering = lb.fit_transform(
    prebreakdown_df_meta_simple_merge.ramp_metering)
lb.classes_
lb.get_params()
lb.inverse_transform(prebreakdown_df_meta_simple_merge.ramp_metering)

y = prebreakdown_df_meta_simple_merge.prebreakdown_vol
X = prebreakdown_df_meta_simple_merge[[
    col for col in prebreakdown_df_meta_simple_merge.columns
    if col not in ["prebreakdown_vol", "estimated_capacity_veh_hr_ln"]
]]
max_depth_ = 6
clf = tree.DecisionTreeRegressor(max_depth=max_depth_)
clf = clf.fit(X, y)

dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=X.columns,
    class_names=[y.name],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render(
    os.path.join(path_figures, f"all_simple_merge_tree_depth_{max_depth_}"))
Example #14
#print type(X2[0]),type(X1[0]),type(y2[0]),type(y1[0])

#print X2[0]
#print y2[0]
#y2 = [[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1],[10,0,0,1,1]]

import numpy as np

a = np.array(X2)
b = np.array(y2)

test = encode_bases_5(records[0].seq)

from sklearn import tree
dtr = tree.DecisionTreeRegressor()
dtr.fit(X2, y2)
a = dtr.predict(test)
known = encode_bases_5(records[1].seq)

from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed

dtrscores = cross_val_score(dtr, X2, y2)
print('decision trees', dtrscores)

from sklearn import ensemble
rfr = ensemble.RandomForestRegressor()
rfr.fit(X2, y2)

rfrscores = cross_val_score(rfr, X2, y2)
print('random forests: ', rfrscores)
Example #15
                        type=int,
                        default=4,
                        help='max depth to grow the tree to')
    return parser


if __name__ == "__main__":
    args = setup(make_parser)

    # Read data.
    tokeep = \
        ['grdpts', 'sid', 'cid', 'termnum', 'major', 'sterm', 'cohort', 'cs']
    tokeep += RVALS
    data = pd.read_csv(args.data_file, usecols=tokeep).sort_values(['sid', 'termnum'])

    # Build classifier.
    clf = sklearn_model(
        ensemble.AdaBoostRegressor,
        base_estimator=tree.DecisionTreeRegressor(max_depth=args.max_depth),
        n_estimators=args.n_estimators)

    results = method_error(data, clf, True, predict_cold_start=args.cold_start)
    by = args.plot if args.plot else ('cs' if args.cold_start else 'termnum')
    evaluation = eval_results(results, by=by)
    print(evaluation)

    if args.plot == 'pred':
        g1, g2 = plot_predictions(results)
    elif args.plot in ['termnum', 'sterm', 'cohort']:
        ax1, ax2 = plot_error_by(args.plot, results)
Example #16
mcycle = pd.read_csv('data/mcycle.csv')
Xm = mcycle['times'].values.reshape(-1, 1)
ym = mcycle['accel'].values


def mcycle_plot(mod, title, **kwargs):
    plt.scatter(mcycle['times'], mcycle['accel'], **kwargs)
    plt.xlabel("seconds", fontsize=16)
    plt.ylabel("acceleration", fontsize=16)
    plt.title(title, fontsize=18, y=1.05)
    xgrid = np.arange(0, 60, 1).reshape(-1, 1)
    plt.plot(xgrid, mod.predict(xgrid), color="red", linewidth=2)


# modal cart fit
mcycle_dt = tree.DecisionTreeRegressor(min_samples_leaf=5)
mcycle_dt.fit(Xm, ym)
# single bayesian cart draw
omega = rn.exponential(1, mcycle.shape[0])
mcycle_bt = tree.DecisionTreeRegressor(min_samples_leaf=5)
mcycle_bt.fit(Xm, ym, sample_weight=omega)
# bayesian forest
mcycle_bf = ensemble.RandomForestRegressor(100,
                                           min_samples_leaf=5,
                                           bootstrap=True)
mcycle_bf.fit(Xm, ym)

fig = plt.figure(figsize=(16, 4))

fig.add_subplot(1, 3, 1)
mcycle_plot(mcycle_dt, "sample CART tree")
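
# Presumable continuation (truncated in the source): the remaining two
# panels; the panel titles here are guesses.
fig.add_subplot(1, 3, 2)
mcycle_plot(mcycle_bt, "single Bayesian-bootstrap draw")
fig.add_subplot(1, 3, 3)
mcycle_plot(mcycle_bf, "Bayesian forest")
plt.show()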
Example #17
import numpy as np

x_train = np.array([[-3, 7], [1, 5], [1, 2], [-2, 0], [2, 3], [-4, 0], [-1, 1],
                    [1, 1], [-2, 2], [2, 7], [-4, 1], [-2, 7]])
y_train = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])
x_test = np.array([[1, 2], [3, 9]])

#DECISION TREE
from sklearn import tree
model = tree.DecisionTreeRegressor()
model.fit(x_train, y_train)
print(model.score(x_train, y_train))  # R^2 on the training data
predicted_output = model.predict(x_test)
print(predicted_output)
Example #18
dt.fit(training_set, training_set_predicted)
correct_prediction = 0
predicted_class_set = dt.predict(testing_set)
#print("testing_set_target",testing_set_predicted)
#print("testing_set actually predicted",predicted_class_set)
for i in range(len(testing_set)):
    original_class = testing_set_predicted[i]
    predicted_class = predicted_class_set[i]
    if (predicted_class == original_class):
        correct_prediction += 1
print("Decision Tree Classifier Accuracy:",
      float(correct_prediction) / float(len(testing_set)))
#################################Decision tree regressor for continuous data#############################
training_set_predicted_unlog = np.array(
    training_set_predicted_unlog)  #prevent unhashable error
dtr = tree.DecisionTreeRegressor()  # the default parameter settings are fine here
dtr.fit(training_set, training_set_predicted_unlog)
correct_prediction = 0
absolute_error = 0.0  # accumulate the error in the L2 sense
predicted_class_set = dtr.predict(testing_set)
for i in range(len(testing_set)):  #now the class become the continuous value
    #consider the continuous data
    original_class_cts = testing_set_predicted_unlog[i]
    predicted_class_cts = predicted_class_set[i]
    #consider the discrete data (transform from cts data back to the discrete one , which is class)
    original_class = testing_set_predicted[i]
    if predicted_class_set[i] != 0.0:  #prevent math domain error of log 0
        predicted_class = int(math.log10(predicted_class_set[i]))
    else:
        predicted_class = int(predicted_class_set[i])
Example #19
    def predict(self, X):
        y_pred = np.ones((X.shape[0], )) * self.mean
        for t in range(self.n_estimators):
            y_pred += (self.eta * self.estimators_[t].predict(X))
        return y_pred


if __name__ == '__main__':
    boston_data = datasets.load_boston()
    X, y = boston_data.data, boston_data.target

    X_train, y_train = X[:400], y[:400]
    X_test, y_test = X[400:], y[400:]

    sklearn_decisiontree_reg = tree.DecisionTreeRegressor(min_samples_split=15,
                                                          min_samples_leaf=5,
                                                          random_state=0)
    sklearn_decisiontree_reg.fit(X_train, y_train)
    decisiontree_pred = sklearn_decisiontree_reg.predict(X_test)
    print('base estimator:', mean_squared_error(y_test, decisiontree_pred))

    tinyml_gbdt_reg = XGBRegressor(n_estimators=100, max_depth=3, gamma=0.)
    tinyml_gbdt_reg.fit(X_train, y_train)
    y_pred = tinyml_gbdt_reg.predict(X_test)
    print('tinyml mse:', mean_squared_error(y_test, y_pred))

    xgb_reg = xgb.sklearn.XGBRegressor(max_depth=3,
                                       learning_rate=0.1,
                                       n_estimators=100,
                                       gamma=0,
                                       reg_lambda=1)
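    # Presumable continuation (truncated in the source): fit the reference
    # XGBoost model and compare its test MSE.
    xgb_reg.fit(X_train, y_train)
    xgb_pred = xgb_reg.predict(X_test)
    print('xgboost mse:', mean_squared_error(y_test, xgb_pred))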
Example #20
                                   lambda df: len(set(df[i]))).reset_index())  
    tp.columns = ['uid',i + '_dstc']  
    if gc.empty:
        gc = tp
    else:
        gc = pd.merge(gc, tp, on='uid', how='left')

fn =  base.merge(gn,on='uid').merge(gc,on='uid')  
fn = pd.merge(fn,gc,on= 'uid')   
fn.shape 

x = fn.drop(['uid','oil_actv_dt','create_dt','bad_ind','class_new'],axis = 1)
y = fn.bad_ind.copy()

from sklearn import tree  
dtree = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=500, min_samples_split=5000)
dtree = dtree.fit(x, y)

import pydotplus
from IPython.display import Image
from io import StringIO  # sklearn.externals.six was removed from scikit-learn
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

dot_data = StringIO()  
tree.export_graphviz(dtree, out_file=dot_data,  
                         feature_names=x.columns,  
                         class_names=['bad_ind'],  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())   
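
# Presumable next step (not in the source): render the tree inline.
Image(graph.create_png())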
Example #21
# =============================================================================
X = df[predictors].values
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)
print(X_train.shape)
print(X_test.shape)

# =============================================================================
# decision tree regressor
# =============================================================================
dtree = tree.DecisionTreeRegressor(max_depth=5,
                                   min_samples_leaf=0.13,
                                   random_state=3)

dtree.fit(X_train, y_train)

tree.plot_tree(dtree)

# =============================================================================
# predict on the training data
# =============================================================================
pred_train_tree = dtree.predict(X_train)
print(np.sqrt(mean_squared_error(y_train, pred_train_tree)))
print(r2_score(y_train, pred_train_tree))

# =============================================================================
# predict on the test data
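# =============================================================================
# Presumable continuation (truncated in the source), mirroring the
# training-data block above.
pred_test_tree = dtree.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, pred_test_tree)))
print(r2_score(y_test, pred_test_tree))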
Example #22
    def fit(self, X, y):
        self._depth = 0
        self.tree = self._build_tree(X, y)
        #print(self._depth)

    def predict(self, X):
        return [self.tree(X[i]) for i in range(len(X))]


if __name__ == '__main__':
    x = np.linspace(-3, 3, 100).reshape(-1, 1)
    y = np.sin(x)  # + np.random.rand(len(x))
    index = np.arange(len(x))
    np.random.shuffle(index)
    #print(x)
    x = x[index]
    y = y[index]
    plt.scatter(x, y)
    #print(x)

    t1 = DecisionTreeRegressor(4, 0.001)
    t1.fit(x, y)
    plt.scatter(x, t1.predict(x))

    t2 = tree.DecisionTreeRegressor(max_depth=4, min_impurity_split=0.001)  # removed in sklearn 1.0; use min_impurity_decrease on newer versions
    t2.fit(x, y)
    plt.scatter(x, t2.predict(x))

    plt.show()