def test_on_housing_dataset(self):
    """
    Compare the custom regression tree with scikit-learn's on the housing data.

    The data file is loaded with ``np.loadtxt``; the last column is the
    target and the remaining columns are the features. For each of the
    5 folds both trees are fitted on the training part and their
    predictions and mean squared errors on the held-out part are logged
    at DEBUG level. Nothing is asserted.

    :return: None
    """
    logger = logging.getLogger("DecisionTreeTest.test_on_housing_dataset")

    dataset = np.loadtxt(DecisionTreeTest.HousingDataPath)
    features = dataset[:, :-1]
    targets = dataset[:, -1]

    folds = KFold(features.shape[0], n_folds=5)
    for train_idx, test_idx in folds:
        train_x = features[train_idx]
        train_y = targets[train_idx]
        test_x = features[test_idx]
        test_y = targets[test_idx]

        # Tree under test.
        own_tree = DecisionTreeRegressor(max_depth=50,
                                         min_list_size=2,
                                         min_list_variance=1e-5)
        own_tree.fit(train_x, train_y)

        # Reference implementation with default settings.
        reference_tree = sklearn_trees.DecisionTreeRegressor()
        reference_tree.fit(train_x, train_y)

        own_prediction = own_tree.predict(test_x)
        reference_prediction = reference_tree.predict(test_x)

        logger.debug("Target: %s" % test_y)
        logger.debug("Prediction: %s" % own_prediction)

        own_error = mean_squared_error(test_y, own_prediction)
        logger.debug("Mean squared error my tree: %f" % own_error)

        reference_error = mean_squared_error(test_y, reference_prediction)
        logger.debug("Mean squared error sklearn tree: %f" % reference_error)
def test_tree(filename):
    """
    Fit, prune and print a regression tree built from a CSV file.

    The ``'Humidity'`` column of the file is used as the target and all
    remaining columns are used as features. The feature matrix, the
    pruned tree and a table of actual versus predicted values are
    written to standard output.

    The ``print`` calls take a single argument so that they produce the
    same output under both Python 2 and Python 3.

    :param filename: path of the CSV file, passed to ``pd.read_csv``
    :return: None
    """
    df = pd.read_csv(filename)
    # pop() removes the target column, so df holds only features afterwards.
    y = df.pop('Humidity').values
    X = df.values
    print(X)

    tree = DecisionTreeRegressor()
    tree.fit(X, y, df.columns)
    # Pruning is done against the same data the tree was fitted on.
    tree.prune(X, y)
    print(tree)

    y_predict = tree.predict(X)
    header_format = '%35s %10s %10s'
    print(header_format % ("FEATURES", "ACTUAL", "PREDICTED"))
    print(header_format % ("----------", "----------", "----------"))
    # Built-in zip works on Python 2 and 3 alike.
    for features, true, predicted in zip(X, y, y_predict):
        # %d drops any fractional part of the displayed values.
        print('%35s %10d %10d' % (str(features), true, predicted))
def test_on_test_dataset(self):
    """
    Check that the tree reproduces the targets of the data it was fitted on.

    The data file is loaded with ``np.loadtxt``; the last column is the
    target and the remaining columns are the features. The tree is fitted
    on the full data set, asked to predict the same rows, and the
    prediction must equal the targets exactly. Both arrays are logged at
    DEBUG level.

    :return: None
    """
    logger = logging.getLogger("DecisionTreeTest.test_on_test_dataset")

    regressor = DecisionTreeRegressor()
    samples = np.loadtxt(DecisionTreeTest.TestDataPath)
    features = samples[:, :-1]

    regressor.fit(features, samples[:, -1])
    predicted = regressor.predict(features)
    expected = samples[:, -1]

    logger.debug("Prediction: {0}".format(predicted))
    logger.debug("Target value: {0}".format(expected))

    # Exact equality: every fitted row must be reproduced.
    self.assertTrue(np.array_equal(predicted, expected))
def fit(self, train, target=None, test=None):
    """
    Fit the random forest to the training set train.

    If a test set is provided then the return value will be the
    predictions of the RandomForest on the test set. If no test set is
    provided nothing is returned.

    Note: Below we set the number of features to use in the splitting
    to be the integer part of the square root of the row length minus
    one. Each fitted tree is appended to ``self.trees``; trees already
    in that list are kept.

    :Parameters:
        **train** (list or `Pandas DataFrame <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_) :
            The training set.

        **target** (str or None) :
            The name of the target variable

        **test** (list or `Pandas DataFrame <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_) :
            The test set.

    :Returns:
        (list or None): If a test set is provided then the return value
        will be the predictions of the RandomForest on the test set.
        If no test set is provided nothing is returned.

    :Raises:
        **ValueError** : If train is not a list and no target is given,
        or if the training set is empty.
    """
    # Anything that is not a list is treated as a dataframe and converted.
    if not isinstance(train, list):
        if target is None:
            raise ValueError(
                'If passing dataframe need to specify target.')
        train = self._convert_dataframe_to_list(train, target)

    # Fail with a clear message instead of an IndexError on train[0].
    # len() is used rather than truthiness so any sequence type is accepted.
    if len(train) == 0:
        raise ValueError('Training set is empty.')

    # set the number of features for the trees to use.
    # presumably one entry of each row is the target, hence the "- 1"
    n_features = int(sqrt(len(train[0]) - 1))

    for _ in range(self.n_trees):
        sample = self._subsample(train)
        tree = DecisionTreeRegressor(self.max_depth,
                                     self.min_size,
                                     self.cost_function)
        tree.fit(sample, n_features)
        self.trees.append(tree)

    # if the test set is not empty then return the predictions
    # NOTE(review): unlike train, a DataFrame test set is not converted
    # here, and iterating a DataFrame yields column labels rather than
    # rows - confirm that callers pass a list of rows.
    if test is not None:
        return [self.predict(row) for row in test]
def fit(self, train, target=None, test=None):
    """
    Grow ``self.n_trees`` regression trees and optionally predict a test set.

    A training set that is not a list is converted with
    ``self._convert_dataframe_to_list(train, target)``. Every tree is
    built from its own ``self._subsample(train)`` draw and appended to
    ``self.trees``; trees already stored there are kept.

    :param train: training rows as a list, or an object that
        ``_convert_dataframe_to_list`` can convert
    :param target: name of the target variable; required when ``train``
        is not a list
    :param test: optional iterable of rows to predict after fitting
    :return: list with ``self.predict(row)`` for each row of ``test``,
        or None when ``test`` is None
    :raises ValueError: if ``train`` is not a list and ``target`` is
        None, or if the training set is empty
    """
    if not isinstance(train, list):
        if target is None:
            raise ValueError('If passing dataframe need to specify target.')
        train = self._convert_dataframe_to_list(train, target)

    # Report an empty training set explicitly; train[0] below would
    # otherwise fail with a bare IndexError. len() is used rather than
    # truthiness so any sequence type is accepted.
    if len(train) == 0:
        raise ValueError('Training set is empty.')

    # set the number of features for the trees to use:
    # integer part of sqrt(row length - 1); presumably one entry of each
    # row is the target - confirm.
    n_features = int(sqrt(len(train[0]) - 1))

    for _ in range(self.n_trees):
        sample = self._subsample(train)
        tree = DecisionTreeRegressor(self.max_depth,
                                     self.min_size,
                                     self.cost_function)
        tree.fit(sample, n_features)
        self.trees.append(tree)

    # if the test set is not empty then return the predictions
    # NOTE(review): test is iterated as given, without the conversion
    # applied to train - confirm that callers pass a list of rows.
    if test is not None:
        return [self.predict(row) for row in test]