def cart(max_leaf_nodes, min_impurity_decrease, X_train, y_train, X_test, y_test):
    # Note: criterion="mse" was renamed "squared_error" in scikit-learn 1.0
    # and removed in 1.2.
    tree = DecisionTreeRegressor(criterion="squared_error", splitter="best",
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_impurity_decrease=min_impurity_decrease)
    tree.fit(X_train, y_train)
    y_train_pred = tree.predict(X_train)
    y_test_pred = tree.predict(X_test)
    error_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    error_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    return (tree.get_depth(), tree.get_n_leaves(), tree.feature_importances_,
            y_train_pred, y_test_pred, error_train, error_test)
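A minimal usage sketch (not part of the original): it assumes a standard train/test split, and the dataset and variable names below are illustrative.

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# cart() returns depth, leaf count, feature importances, both prediction
# vectors, and the train/test RMSE values.
depth, n_leaves, importances, _, _, rmse_train, rmse_test = cart(
    max_leaf_nodes=20, min_impurity_decrease=0.0,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
print(depth, n_leaves, rmse_train, rmse_test)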
Example #2
def draw_tree_learning_curves(X_train, y_train, X_test, y_test):

    variances = []
    train_error, test_error = [], []
    leaves_counts = []
    max_l = 21

    np.random.seed(42)

    # Impute missing values once, outside the loop: the fitted imputer does
    # not depend on the loop variable.
    imputer = SimpleImputer()
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    for n in range(1, max_l):
        variances.append(
            estimate_tree_variance(np.concatenate((X_train, X_test)),
                                   np.concatenate((y_train, y_test)),
                                   runs=10,
                                   min_samples_leaf=n))

        model = DecisionTreeRegressor(min_samples_leaf=n)
        model.fit(X_train_imputed, y_train)

        leaves_counts.append(model.get_n_leaves())

        train_error.append(
            mean_squared_error(y_train, model.predict(X_train_imputed)))
        test_error.append(
            mean_squared_error(y_test, model.predict(X_test_imputed)))

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    ax1.plot(leaves_counts, train_error,
             color='green',
             label='Training Error')
    ax1.plot(leaves_counts, test_error,
             color='red',
             linestyle='--',
             label='Testing Error')

    ax1.legend()
    ax1.set_yticks([])
    ax1.set_xlabel("Number of Leaves")

    ax2.plot(leaves_counts,
             np.array(variances),
             color='black',
             label='Variance')
    ax2.plot(leaves_counts, np.array(test_error) - np.array(train_error),
             color='blue',
             linestyle='--',
             label='Test Err - Train Err')

    ax2.legend()
    ax2.set_yticks([])
    ax2.set_xlabel("Number of Leaves")
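The helper estimate_tree_variance is not defined in this snippet. A plausible sketch, assuming it averages the prediction variance of trees refitted on bootstrap resamples (this reconstruction is an assumption, not the original helper):

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor

def estimate_tree_variance(X, y, runs=10, min_samples_leaf=1):
    # Hypothetical reconstruction: fit `runs` trees on bootstrap resamples
    # and return the mean per-point variance of their predictions.
    X = SimpleImputer().fit_transform(X)
    y = np.asarray(y)
    preds = []
    for _ in range(runs):
        idx = np.random.randint(0, len(X), size=len(X))
        t = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf)
        t.fit(X[idx], y[idx])
        preds.append(t.predict(X))
    return np.var(np.stack(preds), axis=0).mean()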
Example #3
def DecisionTree_Regression(x, y):
    """
    Compute and return the results of a decision tree regression.

    Note: this is a non-linear, non-continuous model. The fitted curve looks
    like "stairs" (a horizontal segment followed by a vertical one, then a
    horizontal one, and so on).

    Arguments:
    ----------
        - x: pandas dataframe
            Dataframe containing the independent (explanatory) variables
        - y: pandas dataframe
            Dataframe containing the dependent (target) variable

    Return:
    ----------
        - model: fitted DecisionTreeRegressor object from sklearn
            The fitted model object
        - Results: pandas dataframe
            Statistics about the model such as the score and MSE
    """

    model = DecisionTreeRegressor()
    model.fit(X=x, y=y)

    Predict_y = model.predict(x)

    Results = FT.MachineLearning.Metrics.ModelEvaluation(y, Predict_y,
                                                         Indicators = ["Explained Variance Score", "Max Error", "Mean Squared Error", "R² Score"])

    #Tree Depth
    Results["Tree Depth"] = model.get_depth()

    #Tree Leaves
    Results["Tree Leaves"] = model.get_n_leaves()

    return model, Results
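A hedged usage sketch: it assumes the FT metrics package used inside the function is importable, and the file and column names below are purely illustrative.

import pandas as pd

df = pd.read_csv("data.csv")        # hypothetical input file
x = df.drop(columns=["target"])     # independent variables
y = df[["target"]]                  # dependent variable

model, results = DecisionTree_Regression(x, y)
print(results)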
Example #4
class RamenRatingPredictor:
    def __init__(self):
        self.feature_names = None
        self.class_names = None

    def load_dataset(self, path):
        """
        To load the dataset from a local CSV file
        :param path: String, location of the file
        :return:
        """
        self.data = pd.read_csv(path)
        print(self.data.head(10))

    def preprocess_dataset(self):
        """
        To select features and shuffle data
        :return:
        """
        self.data = self.data.iloc[:, [1, 3, 4, 5]]
        self.feature_names = self.data.iloc[:, :3].columns
        self.class_names = self.data.iloc[:, 3].unique().astype('str')
        self.data = self.data.dropna()
        self.data = self.data.sample(frac=1)
        print(self.data.info())
        # Ordinal-encode every column (note: this includes the target rating).
        ode = OrdinalEncoder()
        data = ode.fit_transform(self.data)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        print(self.data.info())
        print(self.data.head(10))

    def split_dataset(self, test_rate=0.2):
        """
        To split the dataset into a training set and a test set given the test-set fraction
        :param test_rate: double, fraction of the data held out for testing
        :return:
        """
        X = self.data.iloc[:, :3]
        y = self.data.iloc[:, 3]

        self.train_X, self.test_X, self.train_y, self.test_y = train_test_split(
            X, y, test_size=test_rate, random_state=42)

        print("# of Instances in Training Set: ", len(self.train_X))
        print("# of Instances in Test Set: ", len(self.test_X))

    def train_model(self):
        """
        To train model using training set
        :return:
        """
        self.dtr = DecisionTreeRegressor(max_depth=30)
        self.dtr.fit(self.train_X, self.train_y)
        print(self.dtr.get_n_leaves(), self.dtr.get_depth())
        dump(self.dtr, "./Ramen_Rating_Predictor.joblib")

    def predict(self, X):
        """
        To predict the ramen rating for instances X
        :param X: dataframe, dict, an instance for predicting label
        :return: Array, predicted result
        """
        self.dtr = load("./Ramen_Rating_Predictor.joblib")
        result = self.dtr.predict(X)
        return result

    def evaluate_model(self, y_true, y_pred):
        """
        To evaluate trained model
        :param y_true: array-like, ground-truth labels from the original data
        :param y_pred: array-like, labels predicted by the model
        :return: dict, performance metrics
        """
        mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
        mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
        return {"MAE": mae, "MSE": mse}

    def finetune_model(self,
                       criterion="squared_error",
                       max_depth=None,
                       min_samples_leaf=1,
                       max_leaf_nodes=None):
        """
        To change hyperparameters and retrain the model
        :param criterion: string, split criterion ("mse" was renamed
            "squared_error" in scikit-learn 1.0)
        :param max_depth: int or None, maximum depth of the tree
        :param min_samples_leaf: int, minimum number of samples per leaf
        :param max_leaf_nodes: int or None, maximum number of leaf nodes
        :return:
        """
        self.dtr = DecisionTreeRegressor(criterion=criterion,
                                         max_depth=max_depth,
                                         min_samples_leaf=min_samples_leaf,
                                         max_leaf_nodes=max_leaf_nodes)
        self.dtr.fit(self.train_X, self.train_y)

        print(self.dtr.get_n_leaves(), self.dtr.get_depth())
        dump(self.dtr, "./Ramen_Rating_Predictor.joblib"
             )  # To save trained model to local

    def visualize_tree(self):
        """
        To save *.dot file of trained model
        :return:
        """
        # Note: class_names is only displayed for classification trees, so it
        # should have no effect on this regressor's output.
        tree.export_graphviz(self.dtr,
                             out_file="Ramen_Rating_Predictor.dot",
                             feature_names=self.feature_names,
                             class_names=self.class_names,
                             filled=True)
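An end-to-end usage sketch (not in the original; the CSV path is a hypothetical local copy of the ramen-ratings dataset):

predictor = RamenRatingPredictor()
predictor.load_dataset("ramen-ratings.csv")  # hypothetical path
predictor.preprocess_dataset()
predictor.split_dataset(test_rate=0.2)
predictor.train_model()

y_pred = predictor.predict(predictor.test_X)
print(predictor.evaluate_model(predictor.test_y, y_pred))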
Example #5
X_tst_scld = input_scaling(X_tst)
#%%
# Create a vanilla decision tree regressor. This will probably overfit and
# have the worst performance.
regressor = DecisionTreeRegressor(random_state=0)
test = X_trn_scld.head()  # quick look at the scaled training features

# Fit the regressor on X_train and y_train directly, without encoding or scaling.
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

mae, mse, rmse, r2 = compute_metrics(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R-squared:', r2)
print('No. of leaves', regressor.get_n_leaves())

# Fit the regressor on the encoded and scaled training data.
regressor.fit(X_trn_scld, y_train)
y_pred = regressor.predict(X_tst_scld)

mae, mse, rmse, r2 = compute_metrics(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R-squared:', r2)
print('No. of leaves', regressor.get_n_leaves())

# With GridSearchCV and 5-/10-fold cross-validation, R² reaches up to about 0.08.
mae, mse, rmse, r2 = compute_metrics(y_test, myGSCV().predict(X_tst_scld))
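myGSCV is not defined in this snippet. A plausible sketch, assuming it wraps a GridSearchCV over tree hyperparameters and fits on the scaled training data (the function name, grid and scoring are assumptions):

from sklearn.model_selection import GridSearchCV

def myGSCV(cv=5):
    # Hypothetical reconstruction: exhaustive search over a small grid of
    # decision-tree hyperparameters, returning the best refitted estimator.
    param_grid = {
        "max_depth": [3, 5, 10, None],
        "min_samples_leaf": [1, 5, 25],
        "max_leaf_nodes": [10, 50, None],
    }
    gscv = GridSearchCV(DecisionTreeRegressor(random_state=0),
                        param_grid, cv=cv, scoring="r2")
    gscv.fit(X_trn_scld, y_train)
    return gscv.best_estimator_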
Example #6
# Split data into two pieces.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Initialise a decision tree.
melbourne_model = DecisionTreeRegressor(random_state=0)

# Build a decision tree from the training data.
melbourne_model.fit(train_X, train_y)

# Now the model is built, use it to predict the prices from the
# validation data X
val_predictions = melbourne_model.predict(val_X)

# Use the mean absolute error metric to measure accuracy of the model.
print("Mean Absolute Error is", mean_absolute_error(val_y, val_predictions))

# Misc functions
print("Depth of decision tree is", melbourne_model.get_depth())
print("n_leaves of decision tree is", melbourne_model.get_n_leaves())

# Plot the absolute error of each validation prediction.
num_points = len(val_predictions)
val_y_array = val_y.to_numpy()
x = np.linspace(0, 500000, num_points)
errors = np.abs(val_y_array - val_predictions)  # vectorised, no loop needed
plt.scatter(x, errors, c='r', s=0.5)
plt.savefig("basic_decision_tree_errors.png")
plt.close()
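A natural follow-up, not in the original snippet: constrain the tree size and watch how the leaf count and validation MAE respond (reusing the variables above).

# Compare validation MAE for trees of different maximum sizes.
for max_leaf_nodes in [5, 50, 500, 5000]:
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds = model.predict(val_X)
    print(max_leaf_nodes, model.get_n_leaves(),
          mean_absolute_error(val_y, preds))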
Example #7

#%%
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
# The split must actually run, otherwise X_train etc. are undefined below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(clf.get_depth(), clf.get_n_leaves())

# Scratch inspection of the raw dataset object (these bare expressions only
# display in an interactive session).
iris = load_iris()
iris.data
iris.target

#%%
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
# As above, the split must run for the names below to exist.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
regressor = DecisionTreeRegressor(random_state=0)

regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(regressor.get_depth(), regressor.get_n_leaves())
Example #8
# This is a regressor, so name it accordingly; "mse" was renamed
# "squared_error" in scikit-learn 1.0.
regressor = DecisionTreeRegressor(criterion='squared_error', random_state=0)
regressor.fit(X_train, Y_train)

# Predicting train set results (confusion_matrix and accuracy_score are
# classification metrics and do not apply to a regressor).
train_pred = regressor.predict(X_train)
# Train residual values
train_resid = train_pred - Y_train
# RMSE of the training data
train_rmse = np.sqrt(np.mean(train_resid**2))
print(train_rmse)  # 0 -- the unconstrained tree fits the training set exactly

# Predicting test set results
test_pred = regressor.predict(X_test)
# score() expects features and targets, not targets and predictions.
print(regressor.score(X_test, Y_test))  # R² on the test set
print(regressor.get_n_leaves())  # 295 leaves

# Test set residual values
test_resid = test_pred - Y_test

# RMSE for the test data
test_rmse = np.sqrt(np.mean(test_resid**2))
print(test_rmse)  # 2.3780

# Visualizing the Tree
from io import StringIO  # sklearn.externals.six was removed in scikit-learn 0.23
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
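The snippet ends here; a typical pydotplus continuation (not in the original) would be:

# Export the fitted tree into the StringIO buffer and render it as a PNG.
export_graphviz(regressor, out_file=dot_data, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())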
Example #9
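This example follows the fastai tabular tutorial; m, xs, y, valid_xs and valid_y are built earlier in that notebook. The m_rmse helper is defined there roughly as follows (included here for context):

import math

def r_mse(pred, y):
    return round(math.sqrt(((pred - y) ** 2).mean()), 6)

def m_rmse(m, xs, y):
    return r_mse(m.predict(xs), y)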
# In[49]:

#calculating the training error using the function created above
m_rmse(m, xs, y)

# In[50]:

#Calculating the validation error
m_rmse(m, valid_xs, valid_y)

# In[51]:

#Seems like the model is overfitting
#calculating the number of leaf nodes of the unconstrained model
m.get_n_leaves(), len(xs)

# # Creating the Decision Tree with a constraint that every leaf node contains AT LEAST 25 examples -- to assess whether the model still overfits

# In[52]:

m = DecisionTreeRegressor(min_samples_leaf=25)
m.fit(to.train.xs, to.train.y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

# In[53]:

#with the constraint in place, we get a lower validation error and a non-zero training error
#calculating the number of leaves in the model with the constraint applied
m.get_n_leaves()