import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


def cart(max_leaf_nodes, min_impurity_decrease, X_train, y_train, X_test, y_test):
    # "mse" was renamed to "squared_error" in scikit-learn 1.0 and removed in 1.2
    tree = DecisionTreeRegressor(criterion="squared_error", splitter="best",
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_impurity_decrease=min_impurity_decrease)
    tree.fit(X_train, y_train)
    y_train_pred = tree.predict(X_train)
    y_test_pred = tree.predict(X_test)
    error_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    error_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    return (tree.get_depth(), tree.get_n_leaves(), tree.feature_importances_,
            y_train_pred, y_test_pred, error_train, error_test)
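A minimal usage sketch (not part of the original): synthetic data from sklearn's make_regression, split with train_test_split, then cart() as defined above.

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
depth, n_leaves, importances, _, _, rmse_train, rmse_test = cart(
    max_leaf_nodes=20, min_impurity_decrease=0.0,
    X_train=X_tr, y_train=y_tr, X_test=X_te, y_test=y_te)
print(depth, n_leaves, rmse_train, rmse_test)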
Example 2
from collections import OrderedDict
from typing import Any, Dict

import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# LearnablePredictor and Dataset are project-specific types assumed to be in scope.
class SkCARTPredictor(LearnablePredictor[np.ndarray]):
    def __init__(
        self,
        n_features: int,
        n_classes: int,
        **kwargs
    ):
        super().__init__()
        self._kwargs = kwargs
        self._actual_height = -1

        self._is_classification = n_classes > 0

        self._train_invoked = False

        assert 'random_state' not in kwargs
        kwargs['random_state'] = np.random.randint(low=0, high=10000000)

        if self._is_classification:
            self._model = DecisionTreeClassifier(**kwargs)
        else:
            self._model = DecisionTreeRegressor(**kwargs)

    def get_hyperparams(self) -> Dict[str, Any]:
        return OrderedDict(
            [('actual_height', self._actual_height)] +
            [(k, v) if k != 'max_depth' else ('height', v) for k, v in self._kwargs.items()]
        )

    def train(self, train_data: Dataset[np.ndarray], validn_data: Dataset[np.ndarray], test_data: Dataset[np.ndarray]):
        assert not self._train_invoked, 'train() already invoked once'
        self._train_invoked = True
        self._model.fit(train_data['x'], train_data['y'])
        self._actual_height = self._model.get_depth()

        try:
            if self._actual_height != self._kwargs['max_depth']:
                self.log_f('WARN: actual depth != max_depth: {} != {}\n'.format(self._actual_height, self._kwargs['max_depth']), stdout=False)
        except KeyError:
            pass

    def inference(self, x: np.ndarray) -> np.ndarray:
        assert self._model is not None, 'Model doesn\'t exist but inference() called'
        return self._model.predict(x)

    def raw_inference(self, x: np.ndarray) -> np.ndarray:
        raise NotImplementedError

    def auc_inference(self, x: np.ndarray) -> np.ndarray:
        # assumes binary classification: predict_proba column 1 is P(class == 1)
        return self._model.predict_proba(x)[:, 1]
Example 3
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


def main():
    sns.set()
    df = pd.read_csv('movies.csv', encoding="ISO-8859-1")
    columns = [
        'budget', 'company', 'country', 'director', 'genre', 'gross', 'rating',
        'runtime', 'score', 'star', 'writer', 'year'
    ]
    scores = []
    depths = []  # tree depth per run (the original name `test_sizes` was misleading)
    for i in range(300):
        sample = df.sample(n=1000)
        filtered = sample.loc[:, columns]
        filtered = filtered.dropna()  # dropna() returns a copy; the result must be assigned
        X = filtered.loc[:, (filtered.columns != 'gross')]
        X = pd.get_dummies(X)
        y = filtered['gross']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)
        model = DecisionTreeRegressor()

        # train on the training features and labels
        model.fit(X_train, y_train)

        # test on the test set
        y_pred = model.predict(X_test)

        score = mean_squared_error(y_test, y_pred)
        print("Accuracy Score", model.get_depth(), score)
        test_sizes.append(model.get_depth())
        scores.append(score)
    score_df = pd.DataFrame({'scores': scores, 'test_sizes': test_sizes})
    sns.relplot(x='test_sizes', y='scores', data=score_df, kind='line')
    plt.title('Mean Squared Error vs Decision Tree Depth')
    plt.xlabel('Decision Tree Depth')
    plt.ylabel('Mean Squared Error')
    plt.savefig('movie_success_prediction.png', bbox_inches='tight')
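To run the script directly, the conventional entry-point guard can follow:

if __name__ == '__main__':
    main()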
Example 4
def DecisionTree_Regression(x, y):
    """
    This function computes and outputs the results of a Decision Tree Regression.

    Note: this is a non-linear and non-continuous model. The visualization looks like "stairs" (a horizontal line followed by a vertical one, then a horizontal one, etc.)


    Arguments:
    ----------
        - x: pandas dataframe
            Dataframe containing the independent variables (features)
        - y: pandas dataframe
            Dataframe containing the dependent variable (target)

    Return:
    ----------
        - model: decision tree fitted object from sklearn decision tree regression class
            The fitted model object
        - Results: pandas dataframe
            Statistics about the model, such as the score and MSE
    """

    model = DecisionTreeRegressor()
    model.fit(X=x, y=y)

    Predict_y = model.predict(x)

    Results = FT.MachineLearning.Metrics.ModelEvaluation(
        y, Predict_y,
        Indicators=["Explained Variance Score", "Max Error", "Mean Squared Error", "R² Score"])

    #Tree Depth
    Results["Tree Depth"] = model.get_depth()

    #Tree Leaves
    Results["Tree Leaves"] = model.get_n_leaves()

    return model, Results
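To see the "stairs" shape the docstring describes, here is a minimal illustrative sketch using DecisionTreeRegressor directly on one synthetic feature (not part of the original; the FT metrics module above is project-specific, so it is bypassed here):

import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
x = np.sort(rng.uniform(0, 10, 200)).reshape(-1, 1)
y = np.sin(x).ravel() + rng.normal(0, 0.2, 200)

stair_model = DecisionTreeRegressor(max_depth=3).fit(x, y)
plt.plot(x, y, '.', label='data')
plt.step(x.ravel(), stair_model.predict(x), where='post', label='tree prediction ("stairs")')
plt.legend()
plt.show()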
Example 5
import pandas as pd
from joblib import dump, load
from sklearn import tree
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor


class RamenRatingPredictor:
    def __init__(self):
        self.feature_names = None
        self.class_names = None

    def load_dataset(self, path):
        """
        To load the dataset from a local file
        :param path: String, location of the file
        :return:
        """
        self.data = pd.read_csv(path)
        print(self.data.head(10))

    def preprocess_dataset(self):
        """
        To select features and shuffle data
        :return:
        """
        self.data = self.data.iloc[:, [1, 3, 4, 5]]
        self.feature_names = self.data.iloc[:, :3].columns
        self.class_names = self.data.iloc[:, 3].unique().astype('str')
        self.data = self.data.dropna()
        self.data = self.data.sample(frac=1)
        print(self.data.info())
        ode = OrdinalEncoder()
        data = ode.fit_transform(self.data)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        print(self.data.info())
        print(self.data.head(10))

    def split_dataset(self, test_rate=0.2):
        """
        To split the dataset into a training set and a test set given the test-set rate
        :param test_rate: double, the proportion of the test set
        :return:
        """
        X = self.data.iloc[:, :3]
        y = self.data.iloc[:, 3]

        self.train_X, self.test_X, self.train_y, self.test_y = train_test_split(
            X, y, test_size=test_rate, random_state=42)

        print("# of Instances in Training Set: ", len(self.train_X))
        print("# of Instances in Test Set: ", len(self.test_X))

    def train_model(self):
        """
        To train model using training set
        :return:
        """
        self.dtr = DecisionTreeRegressor(max_depth=30)
        self.dtr.fit(self.train_X, self.train_y)
        print(self.dtr.get_n_leaves(), self.dtr.get_depth())
        dump(self.dtr, "./Ramen_Rating_Predictor.joblib")

    def predict(self, X):
        """
        To predict the rating for X
        :param X: dataframe or dict, an instance for which to predict the label
        :return: Array, predicted result
        """
        self.dtr = load("./Ramen_Rating_Predictor.joblib")
        result = self.dtr.predict(X)
        return result

    def evaluate_model(self, y_true, y_pred):
        """
        To evaluate trained model
        :param y_true: array-like, labels from the original data
        :param y_pred: array-like, labels from the predicted result
        :return: dict, performance results
        """
        mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
        mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
        return {"MAE": mae, "MSE": mse}

    def finetune_model(self,
                       criterion="squared_error",
                       max_depth=None,
                       min_samples_leaf=1,
                       max_leaf_nodes=None):
        """
        To change hyperparameters and train the model again
        :param criterion: string, split-quality criterion ("mse" was renamed to "squared_error" in scikit-learn 1.0)
        :param max_depth: int, maximum depth of the tree
        :param min_samples_leaf: int, minimum number of samples required at a leaf node
        :param max_leaf_nodes: int, maximum number of leaf nodes
        :return:
        """
        self.dtr = DecisionTreeRegressor(criterion=criterion,
                                         max_depth=max_depth,
                                         min_samples_leaf=min_samples_leaf,
                                         max_leaf_nodes=max_leaf_nodes)
        self.dtr.fit(self.train_X, self.train_y)

        print(self.dtr.get_n_leaves(), self.dtr.get_depth())
        dump(self.dtr, "./Ramen_Rating_Predictor.joblib"
             )  # To save trained model to local

    def visualize_tree(self):
        """
        To save *.dot file of trained model
        :return:
        """
        # class_names is only meaningful for classifiers; a regression tree's
        # leaves hold continuous values, so the argument is effectively unused here
        tree.export_graphviz(self.dtr,
                             out_file="Ramen_Rating_Predictor.dot",
                             feature_names=self.feature_names,
                             class_names=self.class_names,
                             filled=True)
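A plausible end-to-end usage sketch (illustrative; the CSV path is hypothetical and assumes the ramen-ratings layout implied by the column indices above):

predictor = RamenRatingPredictor()
predictor.load_dataset('ramen-ratings.csv')  # hypothetical path
predictor.preprocess_dataset()
predictor.split_dataset(test_rate=0.2)
predictor.train_model()
preds = predictor.predict(predictor.test_X)
print(predictor.evaluate_model(predictor.test_y, preds))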
Example 6
# Train a regression decision tree on the given data; use a test-set size
# of 0.3 and random_state = 42 for both the split and the tree.
# Compute the RMSE and round it to two decimal places.

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

RANDOM_SEED = 42
train = pd.read_csv('module_7_test/petrol_consumption.csv')
y = train['Petrol_Consumption']
X = train.drop(columns=['Petrol_Consumption'])
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=RANDOM_SEED)
reg_tree = DecisionTreeRegressor(random_state=RANDOM_SEED)
reg_tree.fit(X_train, y_train)

y_predict = reg_tree.predict(X_test)

# Compute the RMSE and round it to two decimal places.
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
print(round(rmse, 2))
# What is the depth of the tree?
print(reg_tree.get_depth())
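For reference, scikit-learn 0.22+ can return the RMSE directly from mean_squared_error (newer releases move this to sklearn.metrics.root_mean_squared_error):

rmse = mean_squared_error(y_test, y_predict, squared=False)
print(round(rmse, 2))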
Example 7
print(
    "Train error 1: ",
    np.sum(np.square(y_1_train - PCA_train_ph[:, comp_to_fit])) /
    len(y_1_train))
print(
    "Train error 2: ",
    np.sum(np.square(y_2_train - PCA_train_ph[:, comp_to_fit])) /
    len(y_2_train))
print(
    "Test error 1: ",
    np.sum(np.square(y_1_test - PCA_test_ph[:, comp_to_fit])) / len(y_1_test))
print(
    "Test error 2: ",
    np.sum(np.square(y_2_test - PCA_test_ph[:, comp_to_fit])) / len(y_2_test))
print(regr_1.feature_importances_, regr_1.get_depth())

# Plot test results
plt.figure()
plt.plot(test_theta[:, 0],
         PCA_test_ph[:, comp_to_fit],
         'o',
         ms=2,
         c="darkorange",
         label="true")
#plt.plot(test_theta[:,0], y_1_test,'o', color="cornflowerblue",label="max_depth= "+str(regr_1.get_depth()), ms=2)
plt.plot(test_theta[:, 0],
         y_2_test,
         'o',
         color="yellowgreen",
         label="max_depth= " + str(regr_1.get_depth()),
Example 8
# Split data into two pieces.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Initialise a decision tree.
melbourne_model = DecisionTreeRegressor(random_state=0)

# Build a decision tree from the training data.
melbourne_model.fit(train_X, train_y)

# Now the model is built, use it to predict the prices from the
# validation data X
val_predictions = melbourne_model.predict(val_X)

# Use the mean absolute error metric to measure accuracy of the model.
print("Mean Absolute Error is", mean_absolute_error(val_y, val_predictions))

# Misc functions
print("Depth of decision tree is", melbourne_model.get_depth())
print("n_leaves of decision tree is", melbourne_model.get_n_leaves())

# Plot the error
num_points = len(val_predictions)
val_y_array = val_y.to_numpy()
x = np.linspace(0, 500000, num_points)
errors = np.abs(val_y_array - val_predictions)
plt.scatter(x, errors, c='r', s=0.5)
plt.savefig("basic_decision_tree_errors.png")
plt.close()
Example 9
import matplotlib.pyplot as plot
import pandas as pd
import pydotplus
from sklearn.datasets import load_boston  # deprecated in scikit-learn 1.0, removed in 1.2
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_graphviz

maeList = []
depthList = []

boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target, columns=["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

dtr_model = DecisionTreeRegressor(random_state=0, max_depth=6)
dtr_model.fit(X_train, y_train)
y_preds = dtr_model.predict(X_test)
print(mean_absolute_error(y_test, y_preds), dtr_model.get_depth())

dot_data = export_graphviz(dtr_model,
                           feature_names=boston.feature_names,
                           out_file=None,
                           filled=True,
                           rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png("boston.png")

for depths in range(1, 19):
    dtr_model = DecisionTreeRegressor(random_state=0, max_depth=depths)
    dtr_model.fit(X_train, y_train)
    preds = dtr_model.predict(X_test)
    this_mae = mean_absolute_error(y_test, preds)
    maeList.append(this_mae)
    depthList.append(depths)  # depthList was declared above but never filled
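The snippet ends before the collected lists are used; a plausible continuation (the output filename is illustrative) plots MAE against depth using the matplotlib alias imported above:

plot.plot(depthList, maeList, marker='o')
plot.xlabel('max_depth')
plot.ylabel('Mean Absolute Error')
plot.savefig('mae_vs_depth.png', bbox_inches='tight')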
Example 10
#a = np.load('outputs'+os.sep+'mlp_friedman_Ep_500_B1Sig_l2.npy.npy').item()

#%%
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
# the split must actually run, otherwise X_train/y_train below are undefined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(clf.get_depth(), clf.get_n_leaves())

iris = load_iris()
iris.data
iris.target

#%%
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
# the split must actually run, otherwise X_train/y_train below are undefined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
regressor = DecisionTreeRegressor(random_state=0)

regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(regressor.get_depth(), regressor.get_n_leaves())
Example 11
        data["young"][i] = -1

X_data = np.matrix(data[["abs(V-I)", "abs(V-R)", "abs(R-I)"]])
X_resul = data["young"]

train_data, test_data, train_resul, test_resul = train_test_split(
    X_data, X_resul, test_size=0.2)

d_tree = DecisionTreeRegressor()
d_tree.fit(train_data, train_resul)

predictions = d_tree.predict(test_data)

R2 = d_tree.score(train_data, train_resul)
R2_test = d_tree.score(test_data, test_resul)
deep = d_tree.get_depth()

pres = predictions - test_resul

y = [i for i in range(350)]
x = np.zeros(350)
bbox = dict(boxstyle="round", fc="0.8")

plt.hist(predictions,
         density=False,
         histtype='bar',
         rwidth=0.9,
         bins=2,
         color="k",
         label="prediction")
plt.hist(test_resul,
         density=False,
         histtype='bar',
         rwidth=0.9,
         bins=2,
         label="true")  # the source snippet was truncated here; the call is closed to mirror the previous hist
Example 12
    coeff.append(rScore)
    meanSquareList.append(np.sqrt(meanSquare))

total = 0
totalRScore = 0
totalMeanSquare = 0
for i in range(len(meanList)):
    total += meanList[i]
    totalRScore += coeff[i]
    totalMeanSquare += meanSquareList[i]

# average over the actual number of folds rather than a hard-coded 10
print("Average Mean: ", total / len(meanList))
print("Average Coefficient: ", totalRScore / len(meanList))
print("Average Root Mean Square Error: ", totalMeanSquare / len(meanList))

print(regressionTree.get_depth())

print (datetime.now() - startTime)


#2 - Linear Regression on Guiding Question 1

from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from matplotlib.colors import ListedColormap