Example #1
0
def create_classification_trees(prior, partition_prior, prune=False):
    """Build the full set of classification-tree variants under test.

    Returns one perpendicular-split tree plus hyperplane-split trees: one
    with the default optimizer and one per alternative optimizer, all
    sharing the same priors, ``delta=0`` and the given ``prune`` flag.
    """
    trees = [
        PerpendicularClassificationTree(partition_prior, prior, prune=prune),
        HyperplaneClassificationTree(partition_prior,
                                     prior,
                                     delta=0,
                                     prune=prune),
    ]
    # One extra hyperplane tree per alternative optimization strategy;
    # seeds/iteration counts mirror the fixed test configuration.
    alternative_optimizers = (
        ScipyOptimizer(DifferentialEvolutionSolver, 666),
        RandomTwoPointOptimizer(100, 666),
        RandomHyperplaneOptimizer(100, 666),
        SimulatedAnnealingOptimizer(10, 10, 0.9, 666),
    )
    for optimizer in alternative_optimizers:
        trees.append(
            HyperplaneClassificationTree(partition_prior,
                                         prior,
                                         delta=0,
                                         prune=prune,
                                         optimizer=optimizer))
    return trees
Example #2
0
def create_regression_trees(prior, partition_prior):
    """Build the full set of regression-tree variants under test.

    Returns one perpendicular-split tree plus hyperplane-split trees: one
    with the default optimizer and one per alternative optimizer, all
    sharing the same priors.
    """
    trees = [
        PerpendicularRegressionTree(partition_prior, prior),
        HyperplaneRegressionTree(partition_prior, prior),
    ]
    # One extra hyperplane tree per alternative optimization strategy;
    # seeds/iteration counts mirror the fixed test configuration.
    alternative_optimizers = (
        ScipyOptimizer(DifferentialEvolutionSolver, 666),
        RandomHyperplaneOptimizer(100, 666),
        SimulatedAnnealingOptimizer(10, 10, 0.9, 666),
    )
    for optimizer in alternative_optimizers:
        trees.append(
            HyperplaneRegressionTree(partition_prior,
                                     prior,
                                     optimizer=optimizer))
    return trees
    def _fit(self, X, y, verbose, feature_names, side_name):
        """Recursively fit this hyperplane-split node on (X, y).

        Searches for the best separating hyperplane via the configured
        optimizer, and if a split beats the no-split hypothesis, creates
        two children (back/front of the hyperplane) and trains them on
        the corresponding data partitions.

        Args:
            X: data matrix, shape (n_data, n_dim); dense ndarray or scipy
                sparse matrix.
            y: target values aligned with the rows of X.
            verbose: if truthy, print a progress line for this node.
            feature_names: passed through unchanged to child fits.
            side_name: label ('back '/'front') used only for the verbose
                progress message.
        """
        n_data = X.shape[0]
        n_dim = X.shape[1]
        prior = self._get_prior(n_data, n_dim)

        if verbose:
            name = 'level {} {}'.format(self.level, side_name)
            print('Training {} with {:10} data points'.format(name, n_data))

        dense = isinstance(X, np.ndarray)
        if not dense and isinstance(X, csr_matrix):
            # column accesses coming up, so convert to CSC sparse matrix format
            X = csc_matrix(X)

        # Baseline log-probability of the data under the no-split hypothesis;
        # candidate splits are compared against this value.
        log_p_data_no_split = self._compute_log_p_data_no_split(y, prior)

        optimizer = self.optimizer
        if optimizer is None:
            # default to 'Differential Evolution' which works well and is reasonably fast
            optimizer = ScipyOptimizer(DifferentialEvolutionSolver, 666)

        # the function to optimize (depends on X and y, hence we need to instantiate it for every data set anew)
        optimization_function = HyperplaneOptimizationFunction(
            X, y, prior, self._compute_log_p_data_split, log_p_data_no_split,
            optimizer.search_space_is_unit_hypercube, self.split_precision)

        # create and run optimizer
        optimizer.solve(optimization_function)

        # keep the optimization function around so its best-split state is
        # inspectable after fitting
        self.optimization_function = optimization_function

        # retrieve best hyperplane split from optimization function
        self._erase_split_info_base()
        self._erase_split_info()
        # best_hyperplane_normal is None when no split improved on no-split
        if optimization_function.best_hyperplane_normal is not None:
            # split data and target to recursively train children;
            # projections holds each point's signed offset from the hyperplane
            # (negative values fall on the 'back' side)
            projections = X @ optimization_function.best_hyperplane_normal \
                          - np.dot(optimization_function.best_hyperplane_normal, optimization_function.best_hyperplane_origin)
            indices1 = np.where(projections < 0)[0]
            indices2 = np.where(projections >= 0)[0]

            if len(indices1) > 0 and len(indices2) > 0:
                """
                Note: The reason why indices1 or indices2 could be empty is that the optimizer might find a
                'split' that puts all data one one side and nothing on the other side, and that 'split' has
                a higher log probability than 'log_p_data_no_split' because of the partition prior
                overwhelming the data likelihoods (which are of course identical between the 'all data' and
                the 'everything on one side split' scenarios)s.
                """
                X1 = X[indices1]
                X2 = X[indices2]
                y1 = y[indices1]
                y2 = y[indices2]

                n_data1 = X1.shape[0]
                n_data2 = X2.shape[0]

                # compute posteriors of children and priors for further splitting
                prior_child1 = self._compute_posterior(y1, prior, delta=0)
                prior_child2 = self._compute_posterior(y2, prior, delta=0)

                # store split info, create children and continue training them if there's data left to split
                self.best_hyperplane_normal_ = optimization_function.best_hyperplane_normal
                self.best_hyperplane_origin_ = optimization_function.best_hyperplane_origin

                self.log_p_data_no_split_ = optimization_function.log_p_data_no_split
                self.best_log_p_data_split_ = optimization_function.best_log_p_data_split

                # children reuse the (possibly defaulted) optimizer instance
                # and sit one level deeper in the tree
                self.child1_ = self.child_type(self.partition_prior,
                                               prior_child1, self.delta,
                                               self.prune, optimizer,
                                               self.split_precision,
                                               self.level + 1)
                self.child2_ = self.child_type(self.partition_prior,
                                               prior_child2, self.delta,
                                               self.prune, optimizer,
                                               self.split_precision,
                                               self.level + 1)
                self.child1_._erase_split_info_base()
                self.child2_._erase_split_info_base()
                self.child1_._erase_split_info()
                self.child2_._erase_split_info()

                # fit children if there is more than one data point (i.e., there is
                # something to split) and if the targets differ (no point otherwise)
                if n_data1 > 1 and len(np.unique(y1)) > 1:
                    self.child1_._fit(X1, y1, verbose, feature_names, 'back ')
                else:
                    # leaf child: just record its posterior and data count
                    self.child1_.posterior_ = self._compute_posterior(
                        y1, prior)
                    self.child1_.n_data_ = n_data1

                if n_data2 > 1 and len(np.unique(y2)) > 1:
                    self.child2_._fit(X2, y2, verbose, feature_names, 'front')
                else:
                    # leaf child: just record its posterior and data count
                    self.child2_.posterior_ = self._compute_posterior(
                        y2, prior)
                    self.child2_.n_data_ = n_data2

        # compute posterior
        self.n_dim_ = X.shape[1]
        self.n_data_ = n_data
        self.posterior_ = self._compute_posterior(y, prior)