def cart(max_leaf_nodes, min_impurity_decrease, X_train, y_train, X_test, y_test):
    """Fit a CART regression tree and report its size, predictions and RMSE.

    Args:
        max_leaf_nodes: Forwarded to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
        min_impurity_decrease: Forwarded to
            ``DecisionTreeRegressor(min_impurity_decrease=...)``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        A 7-tuple ``(depth, n_leaves, feature_importances, y_train_pred,
        y_test_pred, error_train, error_test)``. The two errors are root mean
        squared errors on the training and test data respectively.
    """
    # The squared-error criterion is the regressor's default, so it is left
    # implicit: its spelling changed between scikit-learn releases ("mse" was
    # removed in 1.2 in favour of "squared_error"), while the default works on
    # every version.
    model = DecisionTreeRegressor(
        splitter="best",
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
    )
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    error_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    error_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

    return (
        model.get_depth(),
        model.get_n_leaves(),
        model.feature_importances_,
        y_train_pred,
        y_test_pred,
        error_train,
        error_test,
    )
class SkCARTPredictor(LearnablePredictor[np.ndarray]):
    """Predictor that wraps a scikit-learn decision tree.

    A ``DecisionTreeClassifier`` is used when ``n_classes > 0`` and a
    ``DecisionTreeRegressor`` otherwise. Every extra keyword argument is
    forwarded to the scikit-learn constructor.
    """

    def __init__(
        self,
        n_features: int,
        n_classes: int,
        **kwargs
    ):
        """Create the underlying tree with a randomly drawn ``random_state``.

        Args:
            n_features: Not used by this class.
            n_classes: A value greater than zero selects classification;
                anything else selects regression.
            **kwargs: Forwarded to the scikit-learn tree. Must not contain
                ``random_state``; the seed is drawn here.
        """
        super().__init__()
        self._kwargs = kwargs
        self._actual_height = -1
        self._is_classification = n_classes > 0
        self._train_invoked = False

        assert 'random_state' not in kwargs
        # ``kwargs`` and ``self._kwargs`` are the same dict, so the seed drawn
        # here is also reported by get_hyperparams().
        kwargs['random_state'] = np.random.randint(low=0, high=10000000)

        if self._is_classification:
            self._model = DecisionTreeClassifier(**kwargs)
        else:
            self._model = DecisionTreeRegressor(**kwargs)

    def get_hyperparams(self) -> Dict[str, Any]:
        """Return the hyperparameters as an ordered mapping.

        ``actual_height`` comes first, followed by the constructor keyword
        arguments in their stored order; ``max_depth`` is reported under the
        key ``height``.
        """
        entries = [('actual_height', self._actual_height)]
        for key, value in self._kwargs.items():
            entries.append(('height', value) if key == 'max_depth' else (key, value))
        return OrderedDict(entries)

    def train(self, train_data: Dataset[np.ndarray], validn_data: Dataset[np.ndarray], test_data: Dataset[np.ndarray]):
        """Fit the tree on ``train_data['x']`` / ``train_data['y']``.

        ``validn_data`` and ``test_data`` are accepted but not used. May be
        called only once per instance. After fitting, the depth actually
        reached is stored and a warning is logged when it differs from a
        requested ``max_depth``.
        """
        assert not self._train_invoked, 'train() already invoked once'
        self._train_invoked = True

        self._model.fit(train_data['x'], train_data['y'])
        self._actual_height = self._model.get_depth()

        # Look the limit up explicitly instead of wrapping the whole check in
        # ``except KeyError``: that also hid KeyErrors raised while logging.
        # ``None`` means "no limit", so there is nothing to compare against.
        max_depth = self._kwargs.get('max_depth')
        if max_depth is not None and self._actual_height != max_depth:
            self.log_f(
                'WARN: actual depth != max_depth: {} != {}\n'.format(self._actual_height, max_depth),
                stdout=False)

    def inference(self, x: np.ndarray) -> np.ndarray:
        """Return ``predict(x)`` of the underlying tree."""
        assert self._model is not None, 'Model doesn\'t exist but inference() called'
        return self._model.predict(x)

    def raw_inference(self, x: np.ndarray) -> np.ndarray:
        """Not supported by this predictor; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def auc_inference(self, x: np.ndarray) -> np.ndarray:
        """Return column 1 of ``predict_proba(x)``.

        NOTE(review): presumably intended for binary classifiers only, where
        column 1 is the score of the second class -- confirm against callers.
        """
        return self._model.predict_proba(x)[:, 1]
def main():
    """Plot test-set MSE against tree depth for repeated random samples.

    Reads ``movies.csv``, then repeats 300 times: draw 1000 random rows, drop
    rows with missing values, one-hot encode the features, fit an
    unconstrained ``DecisionTreeRegressor`` that predicts ``gross`` on an
    80/20 split, and record the fitted depth together with the test-set mean
    squared error. The collected pairs are drawn as a line plot and saved to
    ``movie_success_prediction.png``.
    """
    sns.set()
    df = pd.read_csv('movies.csv', encoding="ISO-8859-1")
    columns = [
        'budget', 'company', 'country', 'director', 'genre', 'gross',
        'rating', 'runtime', 'score', 'star', 'writer', 'year',
    ]

    scores = []
    depths = []
    for _ in range(300):
        sample = df.sample(n=1000)
        # dropna() returns a new frame; the result has to be kept, otherwise
        # rows with missing values stay in the data.
        filtered = sample.loc[:, columns].dropna()

        X = pd.get_dummies(filtered.loc[:, filtered.columns != 'gross'])
        y = filtered['gross']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Train on the training features and labels.
        model = DecisionTreeRegressor()
        model.fit(X_train, y_train)

        # Evaluate on the held-out part.
        y_pred = model.predict(X_test)
        score = mean_squared_error(y_test, y_pred)
        depth = model.get_depth()
        print("Accuracy Score", depth, score)

        depths.append(depth)
        scores.append(score)

    # The 'test_sizes' column holds tree depths; the key is kept as-is because
    # the plot call below refers to it by name.
    score_df = pd.DataFrame({'scores': scores, 'test_sizes': depths})
    sns.relplot(x='test_sizes', y='scores', data=score_df, kind='line')
    plt.title('Mean Squared Error vs Decision Tree Depth')
    plt.xlabel('Decision Tree Depth')
    plt.ylabel('Mean Squared Error')
    plt.savefig('movie_success_prediction.png', bbox_inches='tight')
def DecisionTree_Regression(x, y):
    """
    Fit a decision tree regressor and evaluate it on the data it was fitted on.

    Note: the fitted function is non linear and non continuous. Its plot looks
    like "stairs" (a horizontal segment followed by a vertical jump, then
    another horizontal segment, and so on).

    Arguments:
    ----------
    - x: features (independent variables) given to ``fit`` and ``predict``
    - y: targets (dependent variable) given to ``fit`` and to the evaluation

    Return:
    ----------
    - model: the fitted ``DecisionTreeRegressor``
    - Results: whatever ``FT.MachineLearning.Metrics.ModelEvaluation`` returns
        for the four requested indicators, extended with a "Tree Depth" and a
        "Tree Leaves" entry
    """
    indicators = ["Explained Variance Score", "Max Error", "Mean Squared Error", "R² Score"]

    # fit() returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeRegressor().fit(X=x, y=y)
    in_sample_prediction = model.predict(x)

    evaluation = FT.MachineLearning.Metrics.ModelEvaluation(
        y, in_sample_prediction, Indicators=indicators)
    evaluation["Tree Depth"] = model.get_depth()
    evaluation["Tree Leaves"] = model.get_n_leaves()
    return model, evaluation
class RamenRatingPredictor:
    """Load, preprocess, train, persist and evaluate a decision-tree regressor.

    The methods are meant to be called in pipeline order: ``load_dataset``,
    ``preprocess_dataset``, ``split_dataset``, then ``train_model`` or
    ``finetune_model``.
    """

    # Location the fitted model is written to and read back from.
    _MODEL_PATH = "./Ramen_Rating_Predictor.joblib"

    def __init__(self):
        self.feature_names = None
        self.class_names = None
        # Filled in by the pipeline steps; declared here so the attributes
        # always exist.
        self.data = None
        self.dtr = None
        self.train_X = None
        self.test_X = None
        self.train_y = None
        self.test_y = None

    def load_dataset(self, path):
        """
        To load dataset from local
        :param path: String, location of the CSV file
        :return: None; the frame is stored in ``self.data`` and its first
            10 rows are printed
        """
        self.data = pd.read_csv(path)
        print(self.data.head(10))

    def preprocess_dataset(self):
        """
        To select features, drop incomplete rows, shuffle and encode the data.

        Keeps the columns at positions 1, 3, 4 and 5; the first three of them
        are the features and the last one is the target. Every column,
        including the target, is then ordinal-encoded.
        :return: None; ``self.data``, ``self.feature_names`` and
            ``self.class_names`` are updated
        """
        self.data = self.data.iloc[:, [1, 3, 4, 5]]
        self.feature_names = self.data.iloc[:, :3].columns
        # Drop incomplete rows before collecting the target values, so that a
        # missing target cannot turn into a 'nan' class name.
        self.data = self.data.dropna()
        self.class_names = self.data.iloc[:, 3].unique().astype('str')
        self.data = self.data.sample(frac=1)
        # info() prints by itself and returns None; wrapping it in print()
        # only added a stray "None" line.
        self.data.info()

        ode = OrdinalEncoder()
        data = ode.fit_transform(self.data)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        self.data.info()
        print(self.data.head(10))

    def split_dataset(self, test_rate=0.2):
        """
        To split dataset to training set and test set considering the rate of test set
        :param test_rate: double, the rate of test set
        :return: None; the four parts are stored on the instance
        """
        X = self.data.iloc[:, :3]
        y = self.data.iloc[:, 3]
        self.train_X, self.test_X, self.train_y, self.test_y = train_test_split(
            X, y, test_size=test_rate, random_state=42)
        print("# of Instances in Training Set: ", len(self.train_X))
        print("# of Instances in Test Set: ", len(self.test_X))

    def _fit_and_save(self, model):
        """Fit ``model`` on the training split, report its size and persist it."""
        self.dtr = model
        self.dtr.fit(self.train_X, self.train_y)
        print(self.dtr.get_n_leaves(), self.dtr.get_depth())
        dump(self.dtr, self._MODEL_PATH)  # To save trained model to local

    def train_model(self):
        """
        To train model (max_depth=30) using training set and save it to disk
        :return: None
        """
        self._fit_and_save(DecisionTreeRegressor(max_depth=30))

    def predict(self, X):
        """
        To predict the target for X with the model stored on disk
        :param X: feature rows in the layout used for training
        :return: Array, predicted result
        """
        # NOTE: the model is reloaded from disk on every call. The file is
        # deserialized, so it must only ever be one written by this class.
        self.dtr = load(self._MODEL_PATH)
        result = self.dtr.predict(X)
        return result

    def evaluate_model(self, y_true, y_pred):
        """
        To evaluate trained model
        :param y_true: true target values
        :param y_pred: predicted target values
        :return: dict, with the mean absolute error under "MAE" and the mean
            squared error under "MSE"
        """
        mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
        mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
        return {"MAE": mae, "MSE": mse}

    def finetune_model(self, criterion="mse", max_depth=None, min_samples_leaf=1, max_leaf_nodes=None):
        """
        To change hyperparameters and train model again
        :param criterion: string, split quality measure of the tree
        :param max_depth: int or None, maximum depth of the tree
        :param min_samples_leaf: int, minimum number of samples per leaf
        :param max_leaf_nodes: int or None, maximum number of leaves
        :return: None; the refitted model is stored and saved to disk
        """
        params = {
            "max_depth": max_depth,
            "min_samples_leaf": min_samples_leaf,
            "max_leaf_nodes": max_leaf_nodes,
        }
        # "mse" (removed in scikit-learn 1.2) and "squared_error" both name
        # the regressor's default criterion, so it is left implicit for them;
        # that keeps the default call working on old and new releases alike.
        if criterion not in ("mse", "squared_error"):
            params["criterion"] = criterion
        self._fit_and_save(DecisionTreeRegressor(**params))

    def visualize_tree(self):
        """
        To save *.dot file of trained model
        :return: None
        """
        tree.export_graphviz(self.dtr,
                             out_file="Ramen_Rating_Predictor.dot",
                             feature_names=self.feature_names,
                             class_names=self.class_names,
                             filled=True)
# Train a regression decision tree on the provided data.
# Use a test share of 0.3 and random_state = 42 for both the split and the tree.
# Compute the RMSE and round it to two decimal places.
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error

RANDOM_SEED = 42

train = pd.read_csv('module_7_test/petrol_consumption.csv')

# Target column versus everything else.
y = train['Petrol_Consumption']
X = train.drop(columns=['Petrol_Consumption'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

# fit() returns the estimator, so the tree is created and fitted in one step.
reg_tree = DecisionTreeRegressor(random_state=RANDOM_SEED).fit(X_train, y_train)
y_predict = reg_tree.predict(X_test)

# RMSE on the held-out data, rounded to two decimal places.
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
print(round(rmse, 2))

# What is the depth of the tree?
print(reg_tree.get_depth())
print( "Train error 1: ", np.sum(np.square(y_1_train - PCA_train_ph[:, comp_to_fit])) / len(y_1_train)) print( "Train error 2: ", np.sum(np.square(y_2_train - PCA_train_ph[:, comp_to_fit])) / len(y_2_train)) print( "Test error 1: ", np.sum(np.square(y_1_test - PCA_test_ph[:, comp_to_fit])) / len(y_1_test)) print( "Test error 2: ", np.sum(np.square(y_2_test - PCA_test_ph[:, comp_to_fit])) / len(y_1_test)) print(regr_1.feature_importances_, regr_1.get_depth()) # Plot test results plt.figure() plt.plot(test_theta[:, 0], PCA_test_ph[:, comp_to_fit], 'o', ms=2, c="darkorange", label="true") #plt.plot(test_theta[:,0], y_1_test,'o', color="cornflowerblue",label="max_depth= "+str(regr_1.get_depth()), ms=2) plt.plot(test_theta[:, 0], y_2_test, 'o', color="yellowgreen", label="max_depth= " + str(regr_1.get_depth()),
# Hold out part of the data for validation.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Create the decision tree and build it from the training data in one step
# (fit() returns the estimator itself).
melbourne_model = DecisionTreeRegressor(random_state=0).fit(train_X, train_y)

# Predict the prices for the validation features.
val_predictions = melbourne_model.predict(val_X)

# Mean absolute error on the validation data.
print("Mean Absolute Error is",
      mean_absolute_error(val_y, val_predictions))

# Size of the fitted tree.
print("Depth of decision tree is", melbourne_model.get_depth())
print("n_leaves of decision tree is", melbourne_model.get_n_leaves())

# Scatter the absolute error of every validation point over an evenly spaced
# x axis and save the figure.
num_points = len(val_predictions)
val_y_array = val_y.to_numpy()
x = np.linspace(0, 500000, num_points)
errors = [
    abs(actual - predicted)
    for actual, predicted in zip(val_y_array, val_predictions)
]
plt.scatter(x, errors, c='r', s=0.5)
plt.savefig("basic_decision_tree_errors.png")
plt.close()
# Fit a depth-limited regression tree on the Boston data, export it as an
# image, then refit trees of increasing depth and collect their test MAE.
import matplotlib.pyplot as plot
from sklearn.tree import export_graphviz
import pydotplus

maeList = []
# depthList is not filled anywhere in this part of the script.
depthList = []

# NOTE(review): load_boston was removed from scikit-learn in 1.2 -- confirm
# the version this script is expected to run against.
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target, columns=["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Reference model: tree with max_depth=6; print its test MAE and actual depth.
dtr_model = DecisionTreeRegressor(random_state=0, max_depth=6)
dtr_model.fit(X_train, y_train)
y_preds = dtr_model.predict(X_test)
print(mean_absolute_error(y_test, y_preds), dtr_model.get_depth())

# out_file=None makes export_graphviz return the DOT source as a string,
# which pydotplus then renders to a PNG file.
dot_data = export_graphviz(dtr_model, feature_names=boston.feature_names, out_file=None, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png("boston.png")

# Sweep max_depth from 1 to 18 and record the test MAE of each tree.
for depths in range(1, 19):
    dtr_model = DecisionTreeRegressor(random_state=0, max_depth=depths)
    dtr_model.fit(X_train, y_train)
    preds = dtr_model.predict(X_test)
    this_mae = mean_absolute_error(y_test, preds)
    maeList.append(this_mae)
#a = np.load('outputs'+os.sep+'mlp_friedman_Ep_500_B1Sig_l2.npy.npy').item() #%% from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split X, y = load_iris(return_X_y=True) #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) clf = DecisionTreeClassifier(random_state=0) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) clf.get_depth() clf.get_n_leaves() iris = load_iris() iris.data iris.target #%% from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split X, y = load_diabetes(return_X_y=True) #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) regressor = DecisionTreeRegressor(random_state=0) regressor.fit(X_train, y_train) y_pred = regressor.predict(X_test) regressor.get_depth() regressor.get_n_leaves()
data["young"][i] = -1 X_data = np.matrix(data[["abs(V-I)", "abs(V-R)", "abs(R-I)"]]) X_resul = data["young"] train_data, test_data, train_resul, test_resul = train_test_split( X_data, X_resul, test_size=0.2) d_tree = DecisionTreeRegressor() d_tree.fit(train_data, train_resul) predictions = d_tree.predict(test_data) R2 = d_tree.score(train_data, train_resul) R2_test = d_tree.score(test_data, test_resul) deep = d_tree.get_depth() pres = predictions - test_resul y = [i for i in range(350)] x = np.zeros(350) bbox = dict(boxstyle="round", fc="0.8") plt.hist(predictions, density=False, histtype='bar', rwidth=0.9, bins=2, color="k", label="prediction") plt.hist(test_resul,
coeff.append(rScore) meanSquareList.append(np.sqrt(meanSquare)) total=0 totalRScore=0 totalMeanSquare=0 for i in range(len(meanList)): total+= meanList[i] totalRScore+=coeff[i] totalMeanSquare+=meanSquareList[i] print("Average Mean: ", total/10) print("Average Coefficient: ",totalRScore/10) print("Average Root Mean Square Error: ",totalMeanSquare/10) print(regressionTree.get_depth()) print (datetime.now() - startTime) #2 - Linear Regression on Guiding Question 1 from sklearn import preprocessing import numpy as np import pandas as pd from sklearn.metrics import mean_absolute_error from sklearn.metrics import mean_squared_error from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler from matplotlib.colors import ListedColormap