Ejemplo n.º 1
0
    def test_on_housing_dataset(self):
        """
        Compare the project's regression tree with scikit-learn's on the housing data.

        The matrix read from ``DecisionTreeTest.HousingDataPath`` is split into
        features (every column but the last) and target (the last column).
        For each of the 5 folds both trees are fitted on the training rows and
        their predictions on the held-out rows are scored with
        ``mean_squared_error``.  Nothing is asserted: targets, predictions and
        both errors are only written to the log at DEBUG level.

        :return: None
        """
        log = logging.getLogger("DecisionTreeTest.test_on_housing_dataset")
        data = np.loadtxt(DecisionTreeTest.HousingDataPath)

        # Last column is the regression target, everything before it is a feature.
        x, y = data[:, :-1], data[:, -1]

        # NOTE(review): KFold(n, n_folds=...) is the signature of the legacy
        # sklearn.cross_validation API, presumably what this file imports;
        # confirm before upgrading scikit-learn.
        kf = KFold(x.shape[0], n_folds=5)

        for train, test in kf:
            train_x, train_y = x[train], y[train]
            test_x, test_y = x[test], y[test]

            tree = DecisionTreeRegressor(max_depth=50, min_list_size=2, min_list_variance=1e-5)
            tree.fit(train_x, train_y)

            sktree = sklearn_trees.DecisionTreeRegressor()
            sktree.fit(train_x, train_y)

            prediction = tree.predict(test_x)
            skprediction = sktree.predict(test_x)

            # Pass the values as logging arguments so the message is only
            # formatted when a handler actually emits DEBUG records.
            log.debug("Target: %s", test_y)
            log.debug("Prediction: %s", prediction)
            log.debug("Mean squared error my tree: %f", mean_squared_error(test_y, prediction))
            log.debug("Mean squared error sklearn tree: %f", mean_squared_error(test_y, skprediction))
Ejemplo n.º 2
0
    def test_on_test_dataset(self):
        """
        Check that the tree reproduces the targets of the data it was fitted on.

        The matrix read from ``DecisionTreeTest.TestDataPath`` is split into
        features (every column but the last) and target (the last column).  A
        default ``DecisionTreeRegressor`` is fitted on it and then asked to
        predict the same rows; the test passes only when the predictions are
        exactly equal to the targets.  Predictions and targets are also
        logged at DEBUG level.

        :return: None
        """
        log = logging.getLogger("DecisionTreeTest.test_on_test_dataset")
        tree = DecisionTreeRegressor()
        data = np.loadtxt(DecisionTreeTest.TestDataPath)

        # Last column is the target, everything before it is a feature.
        features, y = data[:, :-1], data[:, -1]

        tree.fit(features, y)
        prediction = tree.predict(features)

        # Lazy logging arguments: the arrays are only rendered to text when a
        # handler actually emits DEBUG records.
        log.debug("Prediction: %s", prediction)
        log.debug("Target value: %s", y)

        self.assertTrue(
            np.array_equal(prediction, y),
            "predictions on the training data differ from the targets",
        )
Ejemplo n.º 3
0
    def fit(self, train, target=None, test=None):
        """
        Train the forest on ``train`` and optionally predict on ``test``.

        ``self.n_trees`` trees are built.  Each one is fitted on the result of
        ``self._subsample(train)`` and appended to ``self.trees``.

        Note: the number of features each tree may use for splitting is the
        integer part of the square root of ``len(train[0]) - 1``, i.e. of the
        row length minus one.

        :Parameters:
            **train** (list or `Pandas DataFrame <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_) : The training set.
            Anything that is not a ``list`` is converted with
            ``self._convert_dataframe_to_list(train, target)``.

            **target** (str or None) : The name of the target variable.
            Required when ``train`` is not a list.

            **test** (list or None) : Rows to predict after training.
            NOTE(review): ``test`` is iterated as-is and never converted, so
            it presumably has to be a list of rows already - confirm before
            passing a DataFrame.

        :Returns:
            (list or None): ``self.predict(row)`` for every row of ``test``
            when a test set is provided; otherwise nothing is returned.

        :Raises:
            ValueError: if ``train`` is not a list and ``target`` is None.
        """
        needs_conversion = not isinstance(train, list)
        if needs_conversion and target is None:
            raise ValueError(
                'If passing dataframe need to specify target.')
        if needs_conversion:
            train = self._convert_dataframe_to_list(train, target)

        # One column of each row is excluded from the feature count.
        n_features = int(sqrt(len(train[0]) - 1))

        for _ in range(self.n_trees):
            # Draw the sample first, then build the tree, as the original did.
            subset = self._subsample(train)
            member = DecisionTreeRegressor(self.max_depth, self.min_size,
                                           self.cost_function)
            member.fit(subset, n_features)
            self.trees.append(member)

        # Without a test set there is nothing to return.
        if test is None:
            return None
        return [self.predict(row) for row in test]
Ejemplo n.º 4
0
	def fit(self, train, target=None, test=None):
		"""
		Train the forest on ``train`` and optionally predict on ``test``.

		A ``train`` value that is not a list is converted with
		``self._convert_dataframe_to_list(train, target)``; in that case
		``target`` must be given, otherwise ValueError is raised.

		``self.n_trees`` trees are then built, each fitted on the result of
		``self._subsample(train)`` with ``int(sqrt(len(train[0]) - 1))``
		features, and appended to ``self.trees``.

		Returns ``self.predict(row)`` for every row of ``test`` when a test
		set is given, otherwise nothing.
		"""
		if not isinstance(train, list):
			if target is None:
				raise ValueError('If passing dataframe need to specify target.')
			train = self._convert_dataframe_to_list(train, target)

		# One column of each row is excluded from the feature count.
		n_features = int(sqrt(len(train[0]) - 1))

		for _ in range(self.n_trees):
			# Draw the sample first, then build the tree, as the original did.
			subset = self._subsample(train)
			member = DecisionTreeRegressor(self.max_depth, self.min_size, self.cost_function)
			member.fit(subset, n_features)
			self.trees.append(member)

		# Without a test set there is nothing to return.
		if test is None:
			return None
		return [self.predict(row) for row in test]
Ejemplo n.º 5
0
def test_tree(filename):
    """
    Fit, prune and print a regression tree for the CSV file ``filename``.

    The ``'Humidity'`` column is removed from the loaded frame and used as the
    target; the remaining columns are the features.  The feature matrix, the
    pruned tree and a table of features / actual / predicted values are
    printed to standard output.

    Every ``print`` call takes a single pre-formatted argument, so the output
    is the same whether ``print`` is the Python 2 statement or the Python 3
    function.

    :param filename: path of the CSV file to load with ``pd.read_csv``.
    :return: None
    """
    df = pd.read_csv(filename)
    # pop() removes the target column, so df.columns below holds feature names only.
    y = df.pop('Humidity').values
    X = df.values
    print(X)

    tree = DecisionTreeRegressor()
    tree.fit(X, y, df.columns)
    # NOTE(review): the tree is pruned on the same rows it was fitted on.
    tree.prune(X, y)
    print(tree)

    y_predict = tree.predict(X)
    header_format = '%35s   %10s   %10s'
    print(header_format % ("FEATURES", "ACTUAL", "PREDICTED"))
    print(header_format % ("----------", "----------", "----------"))
    for features, true, predicted in zip(X, y, y_predict):
        # NOTE(review): %d drops any fractional part of the actual and
        # predicted values - confirm this is intended for the report.
        print('%35s   %10d   %10d' % (str(features), true, predicted))
Ejemplo n.º 6
0
    plt.scatter(regressor.predict(X_test), y_test, marker=".", c="r")
    plt.savefig(name + ".svg")
    plt.clf()


# Unpack the seven values returned by loadData().  Judging by the names these
# are the train / test / validation features and targets plus a column header
# - TODO confirm against loadData().
X_train, y_train, X_test, y_test, X_validate, y_validate, header = loadData()

# Ensemble size: passed below as n_estimators to the scikit-learn ensembles
# and as usedTrees to BoostedDecisionTreeRegressor.
usedTrees = 200

# Depth limit: passed below as max_depth to the scikit-learn ensembles and as
# maxDepth to BoostedDecisionTreeRegressor.
maxDepth = 10
regressors = [
    ("SK Learn Regressor",
     SKLearnDecisionTree(
         tree.DecisionTreeRegressor(min_samples_split=5,
                                    random_state=np.random))),
    ("Regressor", DecisionTreeRegressor()),
    ("SK Learn Boosted",
     SKLearnDecisionTree(
         ensemble.GradientBoostingRegressor(n_estimators=usedTrees,
                                            min_samples_split=5,
                                            max_depth=maxDepth,
                                            learning_rate=0.05,
                                            random_state=np.random))),
    ("Boosted Regressor",
     BoostedDecisionTreeRegressor(usedTrees=usedTrees, maxDepth=maxDepth)),
    ("SK Learn Random",
     SKLearnDecisionTree(
         ensemble.RandomForestRegressor(n_estimators=usedTrees,
                                        min_samples_split=5,
                                        max_depth=maxDepth,
                                        random_state=np.random))),
def test_decision_t():
    """
    Run the regressor on three CSV data sets, one after another.

    For each data set the average error is printed before training; the model
    is then trained, asked for a prediction and plotted.  The first two
    models are queried with ten single-element lists ``[1]`` .. ``[10]``
    passed as separate positional arguments; the third gets one four-element
    list.  For "Position_Salaries.csv" variable 0 is removed before training.
    """
    salary_model = DecisionTreeRegressor("Salary_Data.csv", 1)
    print(salary_model.get_average_error())
    salary_model.train()
    # Expands to predict([1], [2], ..., [10]).
    salary_model.predict(*[[value] for value in range(1, 11)])
    salary_model.plot()

    position_model = DecisionTreeRegressor("Position_Salaries.csv", 2)
    print(position_model.get_average_error())
    position_model.remove_variable(0)
    position_model.train()
    position_model.predict(*[[value] for value in range(1, 11)])
    position_model.plot()

    startup_model = DecisionTreeRegressor("50_Startups.csv", 4)
    print(startup_model.get_average_error())
    startup_model.train()
    startup_query = [1, 3, 400, 3]
    startup_model.predict(startup_query)
    startup_model.plot()
def test_decision_t():
    """
    Train, query and plot the regressor on two CSV data sets.

    Each model is built from a file name and an integer (presumably a column
    index or count - TODO confirm against DecisionTreeRegressor), trained,
    queried with ten single-element lists passed as separate positional
    arguments, and plotted.
    """
    # First data set: train on it as loaded.
    regressor = DecisionTreeRegressor("Salary_Data.csv", 1)
    regressor.train()
    regressor.predict([1], [2], [3], [4], [5], [6], [7], [8], [9], [10])
    regressor.plot()

    # Second data set: variable 0 is removed before training.
    regressor = DecisionTreeRegressor("Position_Salaries.csv", 2)
    regressor.remove_variable(0)
    regressor.train()
    regressor.predict([1], [2], [3], [4], [5], [6], [7], [8], [9], [10])
    regressor.plot()