Example #1
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25)

# Train with a generous number of estimators, then measure the validation
# error at every boosting stage to find the best stopping point.
gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120)
gbrt.fit(X_train, Y_train)
errors = [mean_squared_error(Y_val, Y_pred)
          for Y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1  # argmin is 0-based; stage 0 uses 1 estimator

# Retrain a fresh model with the optimal number of estimators.
gbrt_best = GradientBoostingClassifier(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, Y_train)

# Incremental training with warm_start: keep adding trees and stop after the
# validation error has not improved for 5 consecutive iterations.
gbrt = GradientBoostingClassifier(max_depth=2, warm_start=True)
min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, Y_train)
    Y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(Y_val, Y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping
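For reference, recent scikit-learn releases can perform this early stopping internally via the n_iter_no_change, validation_fraction and tol parameters; the sketch below is a minimal equivalent of the loop above (the concrete parameter values are illustrative assumptions, not taken from the original snippet).

# Built-in early stopping: hold out 25% of the training data internally and
# stop once the validation score fails to improve for 5 consecutive stages.
gbrt_auto = GradientBoostingClassifier(max_depth=2,
                                       n_estimators=120,
                                       validation_fraction=0.25,
                                       n_iter_no_change=5,
                                       tol=1e-4)
gbrt_auto.fit(X_train, Y_train)
print(gbrt_auto.n_estimators_)  # number of boosting stages actually fitted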
# --------- 5. Stacking ---------
from sklearn.datasets import load_iris
iris=load_iris()
X,Y=iris.data[:,1:3],iris.target
from sklearn.model_selection import cross_val_score
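The heading above announces stacking, yet the code that follows goes straight back to early stopping. A minimal stacking sketch on the iris data just loaded could look like the following (the choice of base learners and the logistic-regression meta-learner are illustrative assumptions, not part of the original code).

from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Stacking: the base learners' out-of-fold predictions become the input
# features of the final meta-learner.
stack = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=50)),
                ('gb', GradientBoostingClassifier(n_estimators=50))],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5)
print(cross_val_score(stack, X, Y, cv=5).mean())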
'''
Use an early-stopping strategy to find the number of estimators required,
in order to reduce computation and overfitting.
'''

c_gbrt_est = GradientBoostingClassifier(max_depth=8,
                                        min_samples_split=9,
                                        min_samples_leaf=8,
                                        learning_rate=0.09,
                                        warm_start=True)
min_val_error = float("inf")
error_going_up = 0
n_estimators = 0
# Add one tree per iteration (warm_start) and stop once the validation error
# has not improved for 15 consecutive iterations.
for n_estimators in range(1, 1200):
    c_gbrt_est.n_estimators = n_estimators
    c_gbrt_est.fit(cX_real_drop, cy_real_drop)
    y_pred = c_gbrt_est.predict(cX_test_drop)
    val_error = mean_squared_error(cy_test_drop, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 15:
            break
# Enforce a minimum of 100 estimators for the final classifier.
if n_estimators < 100:
    n_estimators = 100
clf_estimator = n_estimators
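A natural follow-up, not part of the original snippet, is to refit a fresh classifier with the selected number of estimators; the hyperparameters and the cX_real_drop/cy_real_drop/cX_test_drop/cy_test_drop names are simply carried over from the search loop above.

# Refit from scratch with the chosen number of estimators (assumes the same
# data splits and hyperparameters as the warm-start search above).
c_gbrt_final = GradientBoostingClassifier(max_depth=8,
                                          min_samples_split=9,
                                          min_samples_leaf=8,
                                          learning_rate=0.09,
                                          n_estimators=clf_estimator)
c_gbrt_final.fit(cX_real_drop, cy_real_drop)
print(mean_squared_error(cy_test_drop, c_gbrt_final.predict(cX_test_drop)))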
Example #3
errors = [mean_squared_error(y_test, y_pred) for y_pred in
          grd_clf.staged_predict(X_test)]
n_estimators_opt = np.argmin(errors) + 1  # argmin is 0-based; stage 0 uses 1 estimator
print('Optimal value:', n_estimators_opt)
# create new model with optimal value of n_estimators
grd_clf_opt_1 = GradientBoostingClassifier(max_depth=2,
                                           n_estimators=n_estimators_opt,
                                           learning_rate=1.0)
print('Training Model..')
grd_clf_opt_1.fit(X_train, y_train)
print('Done.')
y_pred = grd_clf_opt_1.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
# Implementing actual early-stopping
grd_clf = GradientBoostingClassifier(max_depth=2, warm_start=True) # set warm_start
min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 100):
    grd_clf.n_estimators = n_estimators
    grd_clf.fit(X_train, y_train)
    y_pred = grd_clf.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)
    print('n_estimators:', n_estimators, 'Error:', val_error)
    if val_error <= min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping
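When the loop exits through the patience break, grd_clf has already been grown 5 stages past its best point; a small follow-up, added here rather than taken from the original snippet, recovers that best count and refits.

# Assumes the loop above stopped via the patience break (error_going_up == 5),
# so the best model used 5 fewer trees than the last one fitted.
best_n_estimators = grd_clf.n_estimators - 5
grd_clf_best = GradientBoostingClassifier(max_depth=2, n_estimators=best_n_estimators)
grd_clf_best.fit(X_train, y_train)
print('Accuracy:', accuracy_score(y_test, grd_clf_best.predict(X_test)))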
# Binarise the labels: anything truthy becomes 1, everything else stays 0
# (y_trainx is assumed to start as an all-zeros array of the same length).
y_trainx = np.zeros(len(y_train), dtype=int)
for i, d in enumerate(y_train):
    if d:
        y_trainx[i] = 1

y_train = y_trainx
y_train[0]

# Decision Tree Classifier..
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

model = GradientBoostingClassifier(learning_rate=0.09, warm_start=True)
acc = []
# Grow the ensemble one tree at a time (warm_start) and track held-out accuracy.
for i in range(1, 10):
    model.n_estimators = i
    model.fit(train_vector[:900], y_train[:900])
    acc.append(
        accuracy_score(model.predict(train_vector[1500:]), y_train[1500:]))

#model=DecisionTreeClassifier()
model.fit(train_vector[1000:1900], y_train[1000:1900])

from sklearn.metrics import confusion_matrix

test_vector = np.zeros(shape=(100, len(corpus)))

# Binary bag-of-words encoding of the first 100 test documents.
for j, i in enumerate(x_test[0:100]):
    for some in tokenizer.tokenize(i[0]):
        if some not in stopwords:
            test_vector[j][mp[some]] = 1
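confusion_matrix is imported above but never used in the visible snippet; a minimal evaluation sketch on the freshly built test vectors might look like this (that y_test[:100] holds the matching ground-truth labels is an assumption, not something shown in the original code).

# Score the fitted model on the binary bag-of-words test vectors
# (assumption: y_test[:100] are the corresponding true labels).
y_test_pred = model.predict(test_vector)
print(confusion_matrix(y_test[:100], y_test_pred))
print('Accuracy:', accuracy_score(y_test[:100], y_test_pred))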
Example #5
                                  undersampled_train,
                                  y_train,
                                  cv=4,
                                  scoring='roc_auc')
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))
rf_results = pd.DataFrame({'score': scores, 'Max Depth': msl_s})
rf_results

msl_s = [500, 750, 1000]
scores = list()
scores_std = list()
rf = RandomForestClassifier()

for msl in msl_s:
    rf.n_estimators = msl
    this_scores = cross_val_score(rf,
                                  undersampled_train,
                                  y_train,
                                  cv=4,
                                  scoring='roc_auc')
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))
rf_results = pd.DataFrame({'score': scores, 'n_estimators': msl_s})  # the loop sweeps n_estimators, not max depth
rf_results
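A short follow-up, not in the original, reads the best setting off the sweep and refits on the undersampled training set; the names are carried over from the loop above.

# Pick the n_estimators value with the highest mean ROC-AUC and refit.
best_n = msl_s[int(np.argmax(scores))]
rf_best = RandomForestClassifier(n_estimators=best_n)
rf_best.fit(undersampled_train, y_train)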

#clf = GradientBoostingClassifier(learning_rate=0.04, n_estimators=500, subsample=0.93, max_depth=10, max_features=None)
#clf = AdaBoostClassifier(learning_rate=0.5, n_estimators=500)
#clf = MLPClassifier(activation='logistic', solver='lbfgs', alpha=1e-5, max_iter=500,hidden_layer_sizes=(44, 2), learning_rate='invscaling')
#clf = KNeighborsClassifier(weights='distance',algorithm='kd_tree', leaf_size=1000, n_neighbors=2)
clf = GradientBoostingClassifier(n_estimators=100,