Example #1
    def DecisionTreeRegressor(self):

        clf = DecisionTreeRegressor(random_state=self.random_state)
        path = clf.cost_complexity_pruning_path(self.X_, self.y_)

        previous_nodes = -1
        best_nsc = 1
        best_model = None

        # For every possible pruning point, in reverse order
        for ccp_alpha in reversed(path.ccp_alphas):

            model = DecisionTreeRegressor(ccp_alpha=ccp_alpha,
                                          random_state=self.random_state)
            model.fit(self.X_, self.y_)

            # Skip if nothing has changed
            if model.tree_.node_count == previous_nodes:
                continue

            previous_nodes = model.tree_.node_count

            new_nsc = self.nescience_.nescience(model)

            if new_nsc < best_nsc:
                best_nsc = new_nsc
                best_model = model
            else:
                break

        return (best_nsc, best_model, None)
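
This example scores each pruned tree with a custom nescience measure (self.nescience_) that is not shown in the snippet. As a minimal, self-contained sketch of the same search pattern, one could stop on held-out MSE instead; the synthetic data, split, and scoring below are illustrative assumptions rather than part of the original class:

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# illustrative data and hold-out split (assumptions for the sketch)
X, y = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

path = DecisionTreeRegressor(random_state=0).cost_complexity_pruning_path(X_train, y_train)

best_score, best_model, previous_nodes = float("inf"), None, -1
for ccp_alpha in reversed(path.ccp_alphas):        # most heavily pruned tree first
    model = DecisionTreeRegressor(ccp_alpha=ccp_alpha, random_state=0)
    model.fit(X_train, y_train)
    if model.tree_.node_count == previous_nodes:   # this pruning point changed nothing
        continue
    previous_nodes = model.tree_.node_count
    score = mean_squared_error(y_val, model.predict(X_val))
    if score < best_score:
        best_score, best_model = score, model
    else:                                          # stop once the validation error worsens
        break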
Example #2
def item_E_prunning(train_x, train_y, val_x, val_y, plot_option=False):
    E_tree = DecisionTreeRegressor(random_state=0)
    parameters = E_tree.cost_complexity_pruning_path(train_x, train_y)
    ccp_alphas, impurities = parameters.ccp_alphas, parameters.impurities

    regressor_forest = []
    for ccp_alpha in ccp_alphas:
        regressor_tree = arbol_decision(DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha))
        regressor_tree.train_tree(train_x, train_y)
        regressor_forest.append(regressor_tree)
    
    nodo_per_tree = [arbol.tree.tree_.node_count for arbol in regressor_forest]
    max_depth = [arbol.tree.tree_.max_depth for arbol in regressor_forest]
    
    train_scores = [arbol.error(train_x, train_y) for arbol in regressor_forest]
    test_scores = [arbol.error(val_x, val_y) for arbol in regressor_forest]
    
    
    fig,ax = plt.subplots()
    ax.set_ylabel("error")
    ax.set_xlabel("ccp $\\alpha$")
    ax.plot(ccp_alphas[:-2], train_scores[:-2], marker='o', label="train",drawstyle="steps-post", color=cmap(1), alpha=0.7)
    ax.plot(ccp_alphas[:-2], test_scores[:-2], marker='s', label="val",drawstyle="steps-post", color=cmap(2), alpha=0.7)
    ax.legend(loc='center')
    
    ax2=ax.twinx()
    ax2.plot(ccp_alphas[:-2], max_depth[:-2], marker='^', label="depth", drawstyle="steps-post", color=cmap(3), alpha=0.7)
    ax2.set_ylabel("Depth")
    ax2.legend(loc='center right')
    
    
    E_tree_optimo = arbol_decision(DecisionTreeRegressor(max_depth=5, ccp_alpha=0.2))
    E_tree_optimo.train_tree(train_x, train_y)
    y_pred = E_tree_optimo.test_tree(val_x)

    if plot_option:
        E_tree_optimo.plot_save_tree(val_y, y_pred, "E_tree_optimo.pdf")
        
    plt.show()

    E_tree_optimo.acc_error(y_pred, train_x, train_y, val_x, val_y)
Example #3
# Adds a grid to the plot
plt.grid()
# Y-axis label
plt.ylabel('RMS error')
# X-axis label
plt.xlabel('Depth')
# Export the plot
plt.savefig('error.png')


## The code below prunes the decision tree

regr = DecisionTreeRegressor(max_depth=10)
X_train, X_test, y_train, y_test = process_input()
# This is the function which returns ccp alphas and impurity of leaves
path = regr.cost_complexity_pruning_path(X_train, y_train)

# This will store the alphas and their corresponding impurities
ccp_alphas, impurities = path.ccp_alphas, path.impurities

plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas, impurities)
plt.xlabel("Effective alpha")
plt.ylabel("Total Impurity of Leaves")
plt.savefig('AlphavsImpurity.png')

regrs = []

# Build trees based on different CCP values
for ccp_alpha in ccp_alphas:
    regr = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha, max_depth=10)
    regr.fit(X_train, y_train)
    regrs.append(regr)
Example #4
predictions_dt = dt.predict(x_test)

dt.score(x_test, y_test)
mse = mean_squared_error(y_test, predictions_dt)
rmse = mse**(1 / 2)
print(mse)
print(rmse)

# ---------- Checking the score by changing complexity parameter

from sklearn import tree
plt.figure(figsize=(7, 4))
tree.plot_tree(dt, filled=True)

path = dt.cost_complexity_pruning_path(x_train, y_train)
# The weakest link is characterized by an effective alpha; the nodes with the
# smallest effective alpha are pruned first.
ccp_alphas, impurities = path.ccp_alphas, path.impurities
dts = []
for ccp_alpha in ccp_alphas:
    dt = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    dt.fit(x_train, y_train)
    dts.append(dt)
print("Number of nodes in the last tree is :{} with ccp_alpha : {}".format(
    dts[-1].tree_.node_count, ccp_alphas[-1]))

train_scores = [dt.score(x_train, y_train) for dt in dts]
test_scores = [dt.score(x_test, y_test) for dt in dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
Example #5
        #select training Y data from Y-996
        Y_train = Y_996[index_996]

        #select the remaining X as testing data
        X_test = np.delete(X_996, index_996, axis=0)

        #select the remaining Y as testing data
        Y_test = np.delete(Y_996, index_996, axis=0)

        #assign to new variables
        X_resample = X_train
        Y_resample = Y_train

        #call function to find alpha values
        model = regr.cost_complexity_pruning_path(X_resample,
                                                  Y_resample.ravel())

        #find alpha and impurities
        ccp_alphas, impurities = model.ccp_alphas, model.impurities

        for i in range(0, len(ccp_alphas)):
            if ccp_alphas[i] < 0:
                #in rare cases an alpha value is negative due to numerical error; record it and clip it to zero
                ind_minus_alpha_row.append(m)
                ind_minus_alpha_column.append(i)
                minus_alpha.append(ccp_alphas[i])
                ccp_alphas[i] = 0

        #save alpha, shape = (tree_num, len(series))
        alpha_all.append(ccp_alphas)
Example #6
#visualize the tree
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
tree.plot_tree(dtr, filled=True)
plt.show()

# Another alternative is to prune the tree, which is controlled by setting $\alpha$. Reasonable ranges for $\alpha$ depend on the data and need to be tested when optimizing this hyperparameter (see the tuning sketch after the next cell). For instance, a value $\alpha = 0.05$ results in the following structure.

# In[5]:

#we may make use of tree pruning, which is controlled by ccp_alpha
dtr = DecisionTreeRegressor(ccp_alpha=0.05)

#fit the tree
dtr.fit(X, y)

path = dtr.cost_complexity_pruning_path(X, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

#visualize the tree
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
tree.plot_tree(dtr, filled=True)
plt.show()
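
# Since reasonable ranges for $\alpha$ depend on the data, one way to test them is a
# cross-validated grid search over the candidate alphas returned above; a minimal
# sketch (the 5-fold CV and MSE scoring are illustrative assumptions):

from sklearn.model_selection import GridSearchCV

search = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid={"ccp_alpha": ccp_alphas},   # candidate alphas from the pruning path
    scoring="neg_mean_squared_error",
    cv=5,
)
search.fit(X, y)
print("best ccp_alpha:", search.best_params_["ccp_alpha"])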

# ## Decision Trees
#
# So far, we examined a regression problem. How do we derive trees for classification problems? More or less in the same manner; however, we cannot use $RSS$ to evaluate model performance. Instead, we need a loss function suited to classification. Intuitively, we might want to minimize the classification error, but it has been found that this criterion is not sensitive enough to grow good tree structures. Instead, at each split either the **Gini index** or the **cross-entropy** is used to evaluate the quality of the split. The Gini index $G$ is given by:
#
# $$
# G = \sum_k \hat{p}_{lk} (1 - \hat{p}_{lk})
# $$
#
# where $\hat{p}_{lk}$ is the proportion of training observations in region $l$ that belong to class $k$.
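
# As a quick numeric illustration of the formula, the Gini index of a single node can be
# computed from its class counts; a small sketch with made-up counts:

import numpy as np

def gini_index(class_counts):
    # class counts -> proportions \hat{p}_{lk}, then G = sum_k p * (1 - p)
    p = np.asarray(class_counts, dtype=float)
    p = p / p.sum()
    return float(np.sum(p * (1.0 - p)))

# a pure node has G = 0, an evenly mixed two-class node has G = 0.5
print(gini_index([10, 0]))   # 0.0
print(gini_index([5, 5]))    # 0.5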
Example #7
model = DecisionTreeRegressor(random_state=1, max_depth=20, min_samples_split=6, min_samples_leaf=2)
model.fit(X_trspl, y_trspl)
y_pred = model.predict(X_tespl)

rmse = RMSE(y_tespl, y_pred)
print(rmse)

feat_importances = pd.Series(model.feature_importances_, index=X_trspl.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.title('Feature Importance based on Decision Tree Regressor')
plt.xlabel('feature scores')
plt.ylabel('feature names')
plt.show()

# Total impurity of leaves vs effective alphas of pruned tree
path = model.cost_complexity_pruning_path(X_trspl, y_trspl)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

models = []
for ccp_alpha in ccp_alphas:
    model = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    model.fit(X_trspl, y_trspl)
    models.append(model)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
      models[-1].tree_.node_count, ccp_alphas[-1]))
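
# A natural next step (a sketch reusing this example's X_tespl / y_tespl hold-out split,
# not shown in the snippet itself) is to score each pruned tree and keep the alpha that
# generalizes best:
import numpy as np

test_scores = [m.score(X_tespl, y_tespl) for m in models]   # R^2 on the hold-out split
best_idx = int(np.argmax(test_scores))
print("Best ccp_alpha: {:.5f} (R^2 = {:.3f}, {} nodes)".format(
    ccp_alphas[best_idx], test_scores[best_idx],
    models[best_idx].tree_.node_count))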