Ejemplo n.º 1
0
    def test_on_housing_dataset(self):
        """
        Compare the tree under test with a reference tree on the housing data.

        The data is loaded from ``DecisionTreeTest.HousingDataPath``; the last
        column is used as the target and all preceding columns as features.
        For each of the 5 folds both regressors are fitted on the training
        part, and the targets, the predictions of the tree under test and the
        mean squared errors of both trees on the held-out part are logged at
        DEBUG level. Nothing is asserted.

        :return: None
        """
        log = logging.getLogger("DecisionTreeTest.test_on_housing_dataset")
        data = np.loadtxt(DecisionTreeTest.HousingDataPath)

        # Last column is the target, everything before it is a feature.
        x, y = data[:, :-1], data[:, -1]

        # NOTE(review): the positional sample count, ``n_folds`` and direct
        # iteration look like the legacy scikit-learn KFold interface --
        # confirm against the import at the top of the file before upgrading.
        kf = KFold(x.shape[0], n_folds=5)

        for train, test in kf:
            train_x, train_y = x[train], y[train]
            test_x, test_y = x[test], y[test]

            tree = DecisionTreeRegressor(max_depth=50, min_list_size=2, min_list_variance=1e-5)
            tree.fit(train_x, train_y)

            # Reference model with default settings, fitted on the same fold.
            sktree = sklearn_trees.DecisionTreeRegressor()
            sktree.fit(train_x, train_y)

            prediction = tree.predict(test_x)
            skprediction = sktree.predict(test_x)

            # Pass values as logging arguments so the message is only
            # formatted when a handler actually emits the record.
            log.debug("Target: %s", test_y)
            log.debug("Prediction: %s", prediction)
            log.debug("Mean squared error my tree: %f", mean_squared_error(test_y, prediction))
            log.debug("Mean squared error sklearn tree: %f", mean_squared_error(test_y, skprediction))
Ejemplo n.º 2
0
def test_tree(filename):
    """Fit, prune and print a DecisionTreeRegressor for the CSV at *filename*.

    The ``'Humidity'`` column is used as the target and every remaining
    column as a feature. The tree is fitted and pruned on the same data, then
    a table of features, actual values and predicted values is printed.

    The ``print(...)`` calls take a single argument each, so they produce the
    same output under both Python 2 and Python 3.

    :param filename: path (or anything ``pd.read_csv`` accepts) of the CSV.
    :return: None
    """
    df = pd.read_csv(filename)
    # pop() removes the target column, so ``df`` holds only features after it.
    y = df.pop('Humidity').values
    X = df.values
    print(X)

    tree = DecisionTreeRegressor()
    tree.fit(X, y, df.columns)
    tree.prune(X, y)
    print(tree)

    y_predict = tree.predict(X)
    print('%35s   %10s   %10s' % ("FEATURES", "ACTUAL", "PREDICTED"))
    print('%35s   %10s   %10s' % ("----------", "----------", "----------"))
    # ``%d`` shows non-integer values truncated to their integer part.
    for features, true, predicted in zip(X, y, y_predict):
        print('%35s   %10d   %10d' % (str(features), true, predicted))
Ejemplo n.º 3
0
    def test_on_test_dataset(self):
        """
        Check that the tree reproduces the targets it was fitted on.

        The data is loaded from ``DecisionTreeTest.TestDataPath``; the last
        column is the target and the preceding columns are the features. A
        default ``DecisionTreeRegressor`` is fitted on the whole data set and
        asked to predict the same rows; the test passes only if the
        predictions equal the targets exactly. Both arrays are logged at
        DEBUG level.

        :return: None
        """
        log = logging.getLogger("DecisionTreeTest.test_on_test_dataset")
        tree = DecisionTreeRegressor()
        data = np.loadtxt(DecisionTreeTest.TestDataPath)

        # Last column is the target, everything before it is a feature.
        x, y = data[:, :-1], data[:, -1]
        tree.fit(x, y)

        prediction = tree.predict(x)

        # Pass values as logging arguments so the message is only formatted
        # when a handler actually emits the record.
        log.debug("Prediction: %s", prediction)
        log.debug("Target value: %s", y)

        self.assertTrue(np.array_equal(prediction, y),
                        "tree did not reproduce its training targets exactly")
Ejemplo n.º 4
0
    def fit(self, train, target=None, test=None):
        """
        Fit the random forest to the training set ``train``.

        If a test set is provided, the return value is the list of
        predictions of the forest on that test set. If no test set is
        provided, nothing is returned.

        Note: the number of features each tree uses for splitting is set to
        the (truncated) square root of the number of columns in a training
        row minus one.

        :Parameters:
            **train** (list or `Pandas DataFrame <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_) : The training set.
            Anything that is not a ``list`` is passed through
            ``self._convert_dataframe_to_list`` first.

            **target** (str or None) : The name of the target variable.
            Required when ``train`` is not a list.

            **test** (list or `Pandas DataFrame <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_) : The test set.

        :Returns:
            (list or None): The predictions of the forest on ``test``, one
            per row, or ``None`` when no test set is given.

        :Raises:
            ValueError: If ``train`` is not a list and ``target`` is None.
        """
        if not isinstance(train, list):
            if target is None:
                raise ValueError(
                    'If passing dataframe need to specify target.')
            train = self._convert_dataframe_to_list(train, target)

        # Set the number of features for the trees to use.
        n_features = int(sqrt(len(train[0]) - 1))

        # NOTE(review): trees are appended to ``self.trees`` without clearing
        # it here, so a second call to fit() adds to the trees of the first
        # call -- confirm that this is intended.
        for _ in range(self.n_trees):
            sample = self._subsample(train)
            tree = DecisionTreeRegressor(self.max_depth, self.min_size,
                                         self.cost_function)

            tree.fit(sample, n_features)
            self.trees.append(tree)

        if test is None:
            return None

        # NOTE(review): ``test`` is iterated as-is; unlike ``train`` it is not
        # converted when it is a DataFrame -- confirm callers pass rows.
        return [self.predict(row) for row in test]
Ejemplo n.º 5
0
	def fit(self, train, target=None, test=None):
		"""
		Fit the forest on ``train`` and optionally predict on ``test``.

		``train`` that is not a ``list`` is converted with
		``self._convert_dataframe_to_list(train, target)``, which requires
		``target`` to be given. ``self.n_trees`` trees are then fitted, each
		on its own ``self._subsample(train)`` sample, and appended to
		``self.trees``.

		:param train: training set, a list of rows or a DataFrame-like object.
		:param target: target name; required when ``train`` is not a list.
		:param test: optional iterable of rows to predict after fitting.
		:return: list with one ``self.predict(row)`` result per row of
			``test``, or ``None`` when ``test`` is None.
		:raises ValueError: if ``train`` is not a list and ``target`` is None.
		"""
		if not isinstance(train, list):
			if target is None:
				raise ValueError('If passing dataframe need to specify target.')
			train = self._convert_dataframe_to_list(train, target)

		# Set the number of features for the trees to use: the truncated
		# square root of the number of columns in a training row minus one.
		n_features = int(sqrt(len(train[0]) - 1))

		# NOTE(review): self.trees is not cleared here, so a second call to
		# fit() adds to the trees of the first call -- confirm this is intended.
		for _ in range(self.n_trees):
			sample = self._subsample(train)
			tree = DecisionTreeRegressor(self.max_depth, self.min_size, self.cost_function)

			tree.fit(sample, n_features)
			self.trees.append(tree)

		if test is None:
			return None

		# NOTE(review): test is iterated as-is; unlike train it is not
		# converted when it is a DataFrame -- confirm callers pass rows.
		return [self.predict(row) for row in test]