import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


def cart(max_leaf_nodes, min_impurity_decrease, X_train, y_train, X_test, y_test):
    # Fit a CART regression tree and report its size, feature importances,
    # predictions, and train/test RMSE. Note: the "mse" criterion was renamed
    # to "squared_error" in scikit-learn 1.0.
    tree = DecisionTreeRegressor(criterion="squared_error", splitter="best",
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_impurity_decrease=min_impurity_decrease)
    tree.fit(X_train, y_train)
    y_train_pred = tree.predict(X_train)
    y_test_pred = tree.predict(X_test)
    error_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    error_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    return (tree.get_depth(), tree.get_n_leaves(), tree.feature_importances_,
            y_train_pred, y_test_pred, error_train, error_test)
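# A minimal usage sketch for cart() above on a synthetic regression problem;
# the make_regression/train_test_split setup is illustrative, not part of the
# original snippet.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
depth, n_leaves, importances, _, _, rmse_train, rmse_test = cart(
    max_leaf_nodes=20, min_impurity_decrease=0.0,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
print(f"depth={depth}, leaves={n_leaves}, "
      f"train RMSE={rmse_train:.2f}, test RMSE={rmse_test:.2f}")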
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt


def draw_tree_learning_curves(X_train, y_train, X_test, y_test):
    variances = []
    train_error, test_error = [], []
    leaves_counts = []
    max_l = 21
    np.random.seed(42)
    # Impute missing values once, outside the loop; re-fitting the imputer on
    # every iteration produces the same result.
    imputer = SimpleImputer()
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    for n in range(1, max_l):
        # estimate_tree_variance is a helper defined elsewhere (a sketch
        # follows after this function).
        variances.append(estimate_tree_variance(
            np.concatenate((X_train, X_test)),
            np.concatenate((y_train, y_test)),
            runs=10, min_samples_leaf=n))
        model = DecisionTreeRegressor(min_samples_leaf=n)
        model.fit(X_train_imputed, y_train)
        leaves_counts.append(model.get_n_leaves())
        train_error.append(mean_squared_error(y_train, model.predict(X_train_imputed)))
        test_error.append(mean_squared_error(y_test, model.predict(X_test_imputed)))

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    ax1.plot(leaves_counts, train_error, color='green', label='Training Error')
    ax1.plot(leaves_counts, test_error, color='red', linestyle='--',
             label='Testing Error')
    ax1.legend()
    ax1.set_yticks([])
    ax1.set_xlabel("Number of Leaves")
    ax2.plot(leaves_counts, np.array(variances), color='black', label='Variance')
    ax2.plot(leaves_counts, np.array(test_error) - np.array(train_error),
             color='blue', linestyle='--', label='Test Err - Train Err')
    ax2.legend()
    ax2.set_yticks([])
    ax2.set_xlabel("Number of Leaves")
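# estimate_tree_variance is not shown in the snippet above; this is a minimal
# sketch of what such a helper might look like, assuming it estimates the
# variance of tree predictions across bootstrap resamples. The body is an
# assumption, not the original implementation.
def estimate_tree_variance(X, y, runs=10, min_samples_leaf=1):
    X = SimpleImputer().fit_transform(X)  # the caller passes unimputed data
    y = np.asarray(y)
    rng = np.random.default_rng(0)
    preds = []
    for _ in range(runs):
        # Fit a tree on a bootstrap resample and record its predictions.
        idx = rng.integers(0, len(X), size=len(X))
        t = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf)
        t.fit(X[idx], y[idx])
        preds.append(t.predict(X))
    # Per-point variance across the resampled trees, averaged over all points.
    return np.var(np.stack(preds), axis=0).mean()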
def DecisionTree_Regression(x, y):
    """
    Compute and return the results of a Decision Tree Regression.

    Note: this is a non-linear, non-continuous model. The visualization looks
    like "stairs" (a horizontal line followed by a vertical one, then a
    horizontal one, etc.).

    Arguments:
    ----------
    - x: pandas dataframe
        Dataframe containing the independent variables (features)
    - y: pandas dataframe
        Dataframe containing the dependent variable (target)

    Return:
    ----------
    - model: fitted object from the sklearn decision tree regression class
        The fitted model object
    - Results: pandas dataframe
        Statistics about the model such as the score and MSE
    """
    model = DecisionTreeRegressor()
    model.fit(X=x, y=y)
    Predict_y = model.predict(x)
    # ModelEvaluation is a project-specific helper from the FT package.
    Results = FT.MachineLearning.Metrics.ModelEvaluation(
        y, Predict_y,
        Indicators=["Explained Variance Score", "Max Error",
                    "Mean Squared Error", "R² Score"])
    # Tree depth
    Results["Tree Depth"] = model.get_depth()
    # Tree leaves
    Results["Tree Leaves"] = model.get_n_leaves()
    return model, Results
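# FT.MachineLearning.Metrics.ModelEvaluation is project-specific and not shown
# here; a rough equivalent using plain scikit-learn metrics, as an assumption
# about what it computes (the dict layout is illustrative):
from sklearn.metrics import (explained_variance_score, max_error,
                             mean_squared_error, r2_score)


def model_evaluation_sketch(y_true, y_pred):
    return {
        "Explained Variance Score": explained_variance_score(y_true, y_pred),
        "Max Error": max_error(y_true, y_pred),
        "Mean Squared Error": mean_squared_error(y_true, y_pred),
        "R² Score": r2_score(y_true, y_pred),
    }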
import pandas as pd
from joblib import dump, load
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


class RamenRatingPredictor:
    def __init__(self):
        self.feature_names = None
        self.class_names = None

    def load_dataset(self, path):
        """
        To load the dataset from a local file
        :param path: String, location of the file
        :return:
        """
        self.data = pd.read_csv(path)
        print(self.data.head(10))

    def preprocess_dataset(self):
        """
        To select features and shuffle the data
        :return:
        """
        self.data = self.data.iloc[:, [1, 3, 4, 5]]
        self.feature_names = self.data.iloc[:, :3].columns
        self.class_names = self.data.iloc[:, 3].unique().astype('str')
        self.data = self.data.dropna()
        self.data = self.data.sample(frac=1)
        print(self.data.info())
        ode = OrdinalEncoder()
        data = ode.fit_transform(self.data)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        print(self.data.info())
        print(self.data.head(10))

    def split_dataset(self, test_rate=0.2):
        """
        To split the dataset into a training set and a test set
        :param test_rate: double, the fraction of data held out for testing
        :return:
        """
        X = self.data.iloc[:, :3]
        y = self.data.iloc[:, 3]
        self.train_X, self.test_X, self.train_y, self.test_y = train_test_split(
            X, y, test_size=test_rate, random_state=42)
        print("# of Instances in Training Set: ", len(self.train_X))
        print("# of Instances in Test Set: ", len(self.test_X))

    def train_model(self):
        """
        To train the model using the training set
        :return:
        """
        self.dtr = DecisionTreeRegressor(max_depth=30)
        self.dtr.fit(self.train_X, self.train_y)
        print(self.dtr.get_n_leaves(), self.dtr.get_depth())
        dump(self.dtr, "./Ramen_Rating_Predictor.joblib")

    def predict(self, X):
        """
        To predict the rating for X
        :param X: dataframe, dict, an instance (or instances) to predict
        :return: Array, predicted result
        """
        self.dtr = load("./Ramen_Rating_Predictor.joblib")
        result = self.dtr.predict(X)
        return result

    def evaluate_model(self, y_true, y_pred):
        """
        To evaluate the trained model
        :param y_true: dict, labels from the original data
        :param y_pred: dict, labels from the predicted result
        :return: dict, performance results
        """
        mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
        mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
        return {"MAE": mae, "MSE": mse}

    def finetune_model(self, criterion="squared_error", max_depth=None,
                       min_samples_leaf=1, max_leaf_nodes=None):
        """
        To change hyperparameters and train the model again
        :param criterion: string, split-quality criterion ("mse" was renamed
            to "squared_error" in scikit-learn 1.0)
        :param max_depth: int, maximum depth of the tree
        :param min_samples_leaf: int, minimum number of samples per leaf
        :param max_leaf_nodes: int, maximum number of leaf nodes
        :return:
        """
        self.dtr = DecisionTreeRegressor(criterion=criterion,
                                         max_depth=max_depth,
                                         min_samples_leaf=min_samples_leaf,
                                         max_leaf_nodes=max_leaf_nodes)
        self.dtr.fit(self.train_X, self.train_y)
        print(self.dtr.get_n_leaves(), self.dtr.get_depth())
        # Save the trained model locally.
        dump(self.dtr, "./Ramen_Rating_Predictor.joblib")

    def visualize_tree(self):
        """
        To save a *.dot file of the trained model
        :return:
        """
        # class_names is only meaningful for classifiers; it is ignored for a
        # regression tree.
        tree.export_graphviz(self.dtr,
                             out_file="Ramen_Rating_Predictor.dot",
                             feature_names=self.feature_names,
                             class_names=self.class_names,
                             filled=True)
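# A minimal usage sketch for RamenRatingPredictor, assuming a local copy of
# the ramen-ratings CSV; the file name is hypothetical.
predictor = RamenRatingPredictor()
predictor.load_dataset("ramen-ratings.csv")  # hypothetical path
predictor.preprocess_dataset()
predictor.split_dataset(test_rate=0.2)
predictor.train_model()
y_pred = predictor.predict(predictor.test_X)
print(predictor.evaluate_model(predictor.test_y, y_pred))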
X_tst_scld = input_scaling(X_tst)  # input_scaling is a helper defined elsewhere

# %%
# Create a vanilla decision tree regressor. This will probably overfit and
# have the worst performance.
regressor = DecisionTreeRegressor(random_state=0)
test = X_trn_scld.head()  # quick look at the scaled training features

# Fit the regressor on X_train and y_train directly, without encoding or scaling.
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
mae, mse, rmse, r2 = compute_metrics(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R-squared:', r2)
print('No. of leaves:', regressor.get_n_leaves())

# Fit the regressor on the encoded and scaled training data.
regressor.fit(X_trn_scld, y_train)
y_pred = regressor.predict(X_tst_scld)
mae, mse, rmse, r2 = compute_metrics(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R-squared:', r2)
print('No. of leaves:', regressor.get_n_leaves())

# GridSearchCV with 5-fold and 10-fold cross-validation brings R² up to about
# 0.08; myGSCV is a helper defined elsewhere that returns the fitted search.
# compute_metrics is sketched after this snippet.
mae, mse, rmse, r2 = compute_metrics(y_test, myGSCV().predict(X_tst_scld))
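# compute_metrics is not shown in the snippet above; a minimal sketch
# consistent with how it is called (the body is an assumption):
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2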
# Split the data into training and validation pieces.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Initialise a decision tree.
melbourne_model = DecisionTreeRegressor(random_state=0)

# Build a decision tree from the training data.
melbourne_model.fit(train_X, train_y)

# Now that the model is built, use it to predict prices for the
# validation data val_X.
val_predictions = melbourne_model.predict(val_X)

# Use the mean absolute error metric to measure the accuracy of the model.
print("Mean Absolute Error is", mean_absolute_error(val_y, val_predictions))

# Misc functions
print("Depth of decision tree is", melbourne_model.get_depth())
print("n_leaves of decision tree is", melbourne_model.get_n_leaves())

# Plot the absolute error of each validation prediction.
num_points = len(val_predictions)
val_y_array = val_y.to_numpy()
x = np.linspace(0, 500000, num_points)
errors = []
for i in range(num_points):
    errors.append(abs(val_y_array[i] - val_predictions[i]))
plt.scatter(x, errors, c='r', s=0.5)
plt.savefig("basic_decision_tree_errors.png")
plt.close()
# a = np.load('outputs' + os.sep + 'mlp_friedman_Ep_500_B1Sig_l2.npy.npy').item()

# %%
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(clf.get_depth(), clf.get_n_leaves())

iris = load_iris()
iris.data
iris.target

# %%
from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(regressor.get_depth(), regressor.get_n_leaves())
# The "mse" criterion was renamed to "squared_error" in scikit-learn 1.0.
regressor = DecisionTreeRegressor(criterion='squared_error', random_state=0)
regressor.fit(X_train, Y_train)

# Predicting train set results
train_pred = regressor.predict(X_train)

# Train residual values
train_resid = train_pred - Y_train

# RMSE value of train data
train_rmse = np.sqrt(np.mean(train_resid ** 2))
print(train_rmse)  # 0

# Predicting test set results
test_pred = regressor.predict(X_test)
# score() expects (X, y), not (y_true, y_pred).
regressor.score(X_test, Y_test)
regressor.get_n_leaves()  # 295 leaves

# Test set residual values
test_resid = test_pred - Y_test

# RMSE value for test data
test_rmse = np.sqrt(np.mean(test_resid ** 2))
print(test_rmse)  # 2.3780

# Visualizing the tree
from io import StringIO  # sklearn.externals.six was removed from scikit-learn
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
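# The original snippet stops after creating the StringIO buffer; a short
# sketch of the usual next steps with export_graphviz and pydotplus, as an
# assumption about the intended continuation:
export_graphviz(regressor, out_file=dot_data, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())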
# In[49]:

# Calculating the training error using the function created above
m_rmse(m, xs, y)


# In[50]:

# Calculating the validation error
m_rmse(m, valid_xs, valid_y)


# In[51]:

# It seems the model is overfitting.
# Calculating the number of leaf nodes of the unconstrained model
m.get_n_leaves(), len(xs)


# # Creating the decision tree with a constraint that every leaf node contains AT LEAST 25 examples -- to assess whether the model still overfits

# In[52]:

m = DecisionTreeRegressor(min_samples_leaf=25)
m.fit(to.train.xs, to.train.y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)


# In[53]:

# With the constraint in place, we get a lower validation error and a
# non-zero training error.
# Calculating the number of leaves in the constrained model
m.get_n_leaves()
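# m_rmse is defined earlier in the notebook; a sketch of the conventional
# definition from the fastai tabular tutorial, shown here as an assumption:
import math


def r_mse(pred, y):
    return round(math.sqrt(((pred - y) ** 2).mean()), 6)


def m_rmse(m, xs, y):
    return r_mse(m.predict(xs), y)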