Example #1
    def skill_info(self, examples, feature_names=None):
        """Return the decision path of the first example as a list of
        (feature, inequality, threshold) triples, ordered root to leaf.

        Requires ``import numpy as np`` and ``from sklearn.tree import _tree``.
        """
        tree = self
        tree_ = tree.tree_
        feature_name = [
            feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
            for i in tree_.feature
        ]
        node_indicator = tree.decision_path(examples)
        dense_ind = np.array(node_indicator.todense())

        def recurse(node, ind):
            # Leaf nodes have no split to report.
            if tree_.feature[node] == _tree.TREE_UNDEFINED:
                return []
            left = tree_.children_left[node]
            went_left = bool(ind[left])
            if went_left:
                s = recurse(tree_.children_left[node], ind)
            else:
                s = recurse(tree_.children_right[node], ind)

            name = feature_name[node]
            ineq = "<=" if went_left else ">"
            thresh = str(tree_.threshold[node])
            return [(name.replace("?ele-", ""), ineq, thresh)] + s

        # Note: only the path of the first example is returned.
        for ind in dense_ind:
            return recurse(0, ind)
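
A minimal usage sketch, assuming skill_info is available at module scope; the subclass name ExplainableTree is hypothetical:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

class ExplainableTree(DecisionTreeClassifier):
    skill_info = skill_info  # attach the method defined above

iris = load_iris()
clf = ExplainableTree(max_depth=3).fit(iris.data, iris.target)
# [(feature, '<=' or '>', threshold), ...] along the first sample's path
print(clf.skill_info(iris.data[:1], feature_names=iris.feature_names))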
Example #2
    def closest_decision(self, tree, sample,
                         strategy='informativeness',
                         beta=5):
        """Find the closest decision that is of a class other than the
        target class.

        Args:
            tree: sklearn tree
            sample: Entry to explain
            strategy: Strategy used to pick the foil leaf
                (defaults to 'informativeness')
            beta: Hyperparameter >= 1 that determines how much of the
                tree to search (higher = search a smaller area)

        Returns:
            Ordered descriptive decision path difference,
            confidence of leaf decision
        """
        # Only search part of tree depending on tree size
        decision_path = tree.decision_path(sample.reshape(1, -1)).indices
        if len(decision_path) < 2:
            warnings.warn('Stub tree')
            return None, 0.0
        start_depth = int(round(len(decision_path) / beta))
        start_node = decision_path[start_depth]

        # Get decision for sample
        fact_leaf = tree.apply(sample.reshape(1, -1)).item(0)

        # TODO: Retrain tree if wrong prediction
        if np.argmax(tree.tree_.value[fact_leaf]) != 0:
            warnings.warn('Tree did not predict as fact')

        # Find closest leaf that does not predict output x, based on a strategy
        graph, foil_nodes = self._fact_foil_graph(tree.tree_,
                                                  start_node=start_node)

        if self.verbose:
            print(f'[E] Found {len(foil_nodes)} contrastive decision regions, '
                  f'starting from node {start_node}')

        if len(foil_nodes) == 0:
            return None, 0.0

        # Contrastive decision region
        foil_path, confidence = self._get_path(graph,
                                               fact_leaf,
                                               foil_nodes,
                                               tree.tree_,
                                               strategy)

        return self.descriptive_path(foil_path, sample, tree), confidence
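
The beta heuristic in isolation, as a minimal sketch against a fitted sklearn tree (the data and variable names are illustrative):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
tree = DecisionTreeClassifier().fit(iris.data, iris.target)
sample = iris.data[0]

decision_path = tree.decision_path(sample.reshape(1, -1)).indices
beta = 5
# A path of 10 nodes with beta=5 starts the foil search at depth 2,
# so only the subtree below decision_path[2] is considered.
start_depth = int(round(len(decision_path) / beta))
start_node = decision_path[start_depth]
print(len(decision_path), start_depth, start_node)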
Example #3
def print_decision_path(tree, X, sample_id=0):
    node_indicator = tree.decision_path(X)
    leave_id = tree.apply(X)
    node_index = node_indicator.indices[
        node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]
    print('Rules used to predict sample %s: ' % sample_id)
    print(node_index)
    for node_id in node_index:
        # The leaf stores no decision rule, so skip it.
        if leave_id[sample_id] == node_id:
            continue
        if (X[sample_id, tree.tree_.feature[node_id]] <=
                tree.tree_.threshold[node_id]):
            threshold_sign = "<="
        else:
            threshold_sign = ">"
        print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)" %
              (node_id, sample_id, tree.tree_.feature[node_id],
               X[sample_id, tree.tree_.feature[node_id]], threshold_sign,
               tree.tree_.threshold[node_id]))
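
A usage sketch on the iris data; any fitted sklearn tree works:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)
print_decision_path(clf, iris.data, sample_id=42)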
Example #4
    def decision_path(self, tree, sample):
        """Get a descriptive decision path of a sample.

        Args:
            tree: sklearn tree
            sample: Sample to decide decision path of

        Returns:
            Descriptive decision path for sample
        """
        dp = list(np.nonzero(tree.decision_path(sample.reshape(1, -1)))[1])
        if len(dp) == 0:
            return []
        # At each internal node, record whether the path continued to the
        # right child; the leaf itself gets False.
        turned_right = [dp[i + 1] == tree.tree_.children_right[dp[i]]
                        for i in range(len(dp) - 1)] + [False]

        return self.descriptive_path(list(zip(dp, turned_right)), sample, tree)
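
The turn computation on its own, as a sketch against a fitted sklearn tree:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
tree = DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)
sample = iris.data[0]

dp = list(np.nonzero(tree.decision_path(sample.reshape(1, -1)))[1])
turned_right = [dp[i + 1] == tree.tree_.children_right[dp[i]]
                for i in range(len(dp) - 1)] + [False]
print(list(zip(dp, turned_right)))  # e.g. [(0, False), (2, True), ...]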
Example #5
    def __true_positive(self, tree):
        """
        Take a decision tree and return a numpy matrix of all correctly
        classified transactions, along with an array indicating the tree
        nodes accessed along the decision paths that produced those
        classifications.
        :param tree: A decision tree: either the only tree of a decision
            tree classifier, or a single member of a random forest.
        :return: A numpy matrix of all correctly classified transactions, and
            an array indicating the nodes visited along their decision paths.
        """
        p = tree.predict(self.__X)
        true_p_df = self.__X[(p == 1) & (p == self.__y)].copy()
        if true_p_df.shape[0]:
            return true_p_df.to_numpy(), tree.decision_path(true_p_df).toarray()
        # No true positives: return the empty frame twice.
        return true_p_df, true_p_df
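
The true positive selection on its own, as a sketch with hypothetical pandas data:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

X = pd.DataFrame({'a': [0, 1, 2, 3], 'b': [1, 0, 1, 0]})
y = np.array([0, 1, 1, 0])
tree = DecisionTreeClassifier().fit(X, y)

p = tree.predict(X)
true_p = X[(p == 1) & (p == y)]               # predicted positive and correct
paths = tree.decision_path(true_p).toarray()  # node indicator per row
print(true_p.index.tolist(), paths.shape)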
Example #6
def NCA(forest, samples):
    """Pairwise depth of the nearest common ancestor of two samples,
    combined (as a harmonic mean) over the trees of a forest."""
    n_sample = samples.shape[0]
    d = np.zeros([n_sample, n_sample])
    n_estimator = len(forest)

    for k in range(n_estimator):
        tree = forest[k]
        path = tree.decision_path(samples).todense()

        for i in range(n_sample):
            for j in range(n_sample):
                sample_ids = [i, j]
                # Nodes shared by both paths = depth of the nearest
                # common ancestor (counting the root).
                d[i, j] = d[i, j] + 1 / (path[sample_ids].sum(axis=0)
                                         == len(sample_ids)).sum()

    d = d / n_estimator
    # Invert elementwise to turn the averaged 1/depth back into a depth.
    d_nearest_common_ancestor = 1 / d

    return d_nearest_common_ancestor
Example #7
def SP(forest, samples):
    """Pairwise length of the shortest path between the leaves of two
    samples, averaged over the trees of a forest."""
    n_sample = samples.shape[0]
    d = np.zeros([n_sample, n_sample])
    n_estimator = len(forest)

    for k in range(n_estimator):
        tree = forest[k]
        path = tree.decision_path(samples).todense()

        for i in range(n_sample):
            for j in range(n_sample):
                sample_ids = [i, j]
                # Depth of the deepest node shared by both paths.
                splitting_depth = (path[sample_ids].sum(
                    axis=0) == len(sample_ids)).sum()
                # Depth of each sample's own path (its leaf depth).
                depth_i = path[i].sum()
                depth_j = path[j].sum()
                d[i, j] = d[i, j] + depth_i + depth_j - 2 * splitting_depth

    d_shortest_path = d / n_estimator

    return d_shortest_path
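
A usage sketch for both distances, with the estimators_ list of a fitted random forest serving as forest:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(iris.data, iris.target)

samples = iris.data[:5]
print(NCA(rf.estimators_, samples))  # pairwise common-ancestor depths
print(SP(rf.estimators_, samples))   # pairwise shortest-path lengths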
Example #8
def Tree_path(tree, samples):
    """Extract one rule per unique decision path.

    Takes a fitted tree (the best estimator, if GridSearchCV was used) and
    an array of samples (pure or not).

    Returns a list of dictionaries, one per unique rule. Keys are feature
    indices (key 0 refers to sample[i][0] for any sample i); values are flat
    lists of alternating comparators and thresholds, appended in path order.

    Uncomment the prints to see it in action.
    """
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold

    decision_paths = tree.decision_path(samples)
    leave_ids = tree.apply(samples)

    dic = []

    for sample_id in range(len(samples)):
        d = dict()
        indexes = decision_paths.indices[decision_paths.indptr[sample_id]:
                                         decision_paths.indptr[sample_id + 1]]

        # print('sample id: ', sample_id)

        for node_id in indexes:
            d[feature[node_id]] = []

        for node_id in indexes:
            if leave_ids[sample_id] == node_id:
                # The leaf carries no split; drop its placeholder entry
                # and record the rule if it is new.
                d.pop(feature[node_id], None)
                # print(d)
                if d not in dic:
                    dic.append(d)
                continue

            if samples[sample_id][feature[node_id]] <= threshold[node_id]:
                comparator = "<="
            else:
                comparator = ">"

            # print("X_test[%s,%s] %s %s" % (sample_id, feature[node_id], comparator, threshold[node_id]))

            d[feature[node_id]].append(comparator)
            d[feature[node_id]].append(threshold[node_id])

    # print(dic)

    return dic
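
A usage sketch; each returned dictionary is one unique rule:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=3).fit(iris.data, iris.target)
for rule in Tree_path(clf, iris.data[:10]):
    print(rule)  # e.g. {3: ['<=', 0.8]} for a one-split path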
Example #9
        if i < 3:
            # Single-tree models: collect the threshold of the last split
            # (the leaf's parent) on each true positive's decision path.
            p = clf.predict(x)
            temp = x[(p == 1) & (p == y)].copy()
            temp = temp.to_numpy()
            for n, row in enumerate(clf.decision_path(temp).toarray()):
                # np.nonzero(row)[-1][-2:-1] is the second-to-last node
                # on the path, i.e. the leaf's parent.
                for indx in np.nonzero(row)[-1][-2:-1]:
                    if temp[n, clf.tree_.feature[indx]] <= clf.tree_.threshold[indx]:
                        feature_thresholds[clf.tree_.feature[indx]][1].append(clf.tree_.threshold[indx])
                    else:
                        feature_thresholds[clf.tree_.feature[indx]][0].append(clf.tree_.threshold[indx])
        else:
            # Ensembles: repeat the same collection for every tree.
            for tree in clf.estimators_:
                p = tree.predict(x)
                temp = x[(p == 1) & (p == y)].copy()
                temp = temp.to_numpy()
                for n, row in enumerate(tree.decision_path(temp).toarray()):
                    for indx in np.nonzero(row)[-1][-2:-1]:
                        if temp[n, tree.tree_.feature[indx]] <= tree.tree_.threshold[indx]:
                            feature_thresholds[tree.tree_.feature[indx]][1].append(tree.tree_.threshold[indx])
                        else:
                            feature_thresholds[tree.tree_.feature[indx]][0].append(tree.tree_.threshold[indx])

        # Fall back to defaults for features that never appeared in a split:
        # 0 for the lower side, the feature's maximum for the upper side.
        for k in range(len(features)):
            if len(feature_thresholds[k][0]) == 0:
                feature_thresholds[k][0].append(0)
                if len(feature_thresholds[k][1]) == 0:
                    feature_thresholds[k][1].append(0)
            elif len(feature_thresholds[k][1]) == 0:
                feature_thresholds[k][1].append(max(x[features[k]]))

        number = sum(importances > 0.15)
Example #10
# Assumes x, y and a `features` name list are defined, plus numpy and
# RandomForestRegressor imports.
print(x)
print(y)

n_estimators = 4
max_depth = 4
model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
model.fit(x, y)

_score = model.score(x, y)
print(_score)

for i_sample in range(len(x)):
    for i_estimator, tree in enumerate(model.estimators_):

        # decision_path covers every sample; only row i_sample is used below.
        path = tree.decision_path(x)

        print('i_estimator ', i_estimator)
        # Dense 0/1 indicator of the nodes this sample visits in this tree.
        new_array = np.array(path.todense())[i_sample]
        print(i_sample)
        # Node ids along the decision path, root to leaf.
        new_path = []
        for i in range(len(new_array)):
            if new_array[i] == 1:
                new_path.append(i)

        for i in range(1, len(new_path)):
            # Feature tested and value stored at the parent of each step.
            label = features[tree.tree_.feature[new_path[i - 1]]]
            value_ = tree.tree_.value[new_path[i - 1]]
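
A sketch of how the truncated loop body might continue; went_left and op are illustrative names:

            # Hypothetical continuation: report the split taken at each step
            went_left = new_path[i] == tree.tree_.children_left[new_path[i - 1]]
            op = '<=' if went_left else '>'
            print('%s %s %.3f' % (label, op, tree.tree_.threshold[new_path[i - 1]]))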
Example #11
    def predict_marginalized_over_instances(self, X: np.ndarray):
        """Predict mean and variance marginalized over all instances.

        Returns the predictive mean and variance marginalised over all
        instances for a set of configurations.

        Note
        ----
        This method overrides the same method of ~smac.epm.base_epm.AbstractEPM.
        It is random forest specific, follows the SMAC2 implementation, and
        requires no distributional assumption to marginalize the uncertainty
        estimates.

        Parameters
        ----------
        X : np.ndarray
            [n_samples, n_features (config)]

        Returns
        -------
        means : np.ndarray of shape = [n_samples, 1]
            Predictive mean
        vars : np.ndarray of shape = [n_samples, 1]
            Predictive variance
        """

        if self.instance_features is None or \
                len(self.instance_features) == 0:

            new_X = []
            for tree in self.rf.estimators_:
                tree_X = []
                path = tree.decision_path(X)
                for i in range(path.shape[0]):
                    row = path.getrow(i).toarray().flatten().copy()
                    new_row = []
                    for j in range(len(row)):
                        if row[j] == 0:
                            # Off-path nodes become NaN, mirroring _train;
                            # they are zeroed after the normalization below.
                            new_row.append(np.nan)
                        else:
                            threshold = tree.tree_.threshold[j]
                            feature_idx = tree.tree_.feature[j]
                            diff = threshold - X[i][feature_idx]
                            new_row.append(diff)
                    tree_X.append(new_row)
                new_X.append(np.array(tree_X))
            new_X = np.hstack(new_X)
            new_X = (new_X - self.X_min_) / self.diff_
            new_X[np.isnan(new_X)] = 0
            new_X = new_X / self.max_length_

            #new_X = self.scaler.transform(new_X)
            mean, std = self.gp.predict(new_X, return_std=True)

            mean = mean.reshape((-1, 1))
            var = std.reshape((-1, 1)) ** 2

            var[var < self.var_threshold] = self.var_threshold
            var[np.isnan(var)] = self.var_threshold
            return mean, var
        else:
            raise NotImplementedError()
Example #12
    def _train(self, X: np.ndarray, y: np.ndarray):
        """Trains the random forest on X and y.

        Parameters
        ----------
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Input data points.
        y : np.ndarray [n_samples, ]
            The corresponding target values.

        Returns
        -------
        self
        """

        self.X = X
        self.y = y.flatten()

        self.rf = sklearn.ensemble.RandomForestRegressor(
            max_features=1.0,
            bootstrap=False,
            n_estimators=1,
            max_depth=None,
        )
        #self.rf = sklearn.tree.DecisionTreeRegressor(max_depth=10)
        self.rf.fit(X, y)

        new_X = []
        for tree in self.rf.estimators_:
            tree_X = []
            # There's no reason to also take the leaves into account!
            path = tree.decision_path(X)
            for i in range(path.shape[0]):
                row = path.getrow(i).toarray().flatten().copy()
                new_row = []
                for j in range(len(row)):
                    if row[j] == 0:
                        # Off-path nodes become NaN and are zeroed after
                        # the normalization below.
                        new_row.append(np.nan)
                    else:
                        threshold = tree.tree_.threshold[j]
                        feature_idx = tree.tree_.feature[j]
                        diff = (threshold - X[i][feature_idx])
                        new_row.append(diff)
                tree_X.append(new_row)
            new_X.append(np.array(tree_X))
        new_X = np.hstack(new_X)
        assert X.shape[0] == new_X.shape[0]
        X_min = np.nanmin(new_X, axis=0)
        X_max = np.nanmax(new_X, axis=0)
        diff = X_max - X_min
        diff[diff == 0] = 1
        self.X_min_ = X_min
        self.diff_ = diff
        new_X = (new_X - self.X_min_) / self.diff_
        new_X[np.isnan(new_X)] = 0
        self.max_length_ = np.max(np.sum(new_X, axis=1))
        new_X = new_X / self.max_length_

        # TODO compute the kernel manually by computing the tree similarities and then only compute an additive kernel within each tree...
        # only compare 'same' paths of a tree
        self.gp = sklearn.pipeline.Pipeline([
            # Cannot use the scaler here as it would destroy all knowledge about where the zeros are
            #['preproc', sklearn.preprocessing.MinMaxScaler()],
            [
                'regressor',
                sklearn.gaussian_process.GaussianProcessRegressor(
                    kernel=sklearn.gaussian_process.kernels.ConstantKernel()
                    * sklearn.gaussian_process.kernels.Matern(),
                    # + sklearn.gaussian_process.kernels.WhiteKernel(
                    #     noise_level=1e-7, noise_level_bounds=(1e-14, 1e-6)),
                    n_restarts_optimizer=10,
                    normalize_y=True,
                )
            ]
        ])
        print(new_X.shape)
        self.gp.fit(new_X, y)
        print(self.gp.steps[-1][-1].kernel_)
        return self
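
For reference, the path featurization shared by _train and predict_marginalized_over_instances, shown standalone (a sketch; the function name and data are illustrative):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def path_features(tree, X):
    """One column per tree node: threshold - x[feature] for nodes on the
    decision path, NaN elsewhere (normalized and zero-filled by the caller)."""
    path = tree.decision_path(X).toarray()
    out = np.full(path.shape, np.nan)
    thresholds = tree.tree_.threshold
    features = tree.tree_.feature
    for i in range(X.shape[0]):
        idx = np.nonzero(path[i])[0]
        # Leaf nodes are included here too, matching the code above.
        out[i, idx] = thresholds[idx] - X[i, features[idx]]
    return out

rng = np.random.default_rng(0)
X = rng.random((20, 3))
y = rng.random(20)
t = DecisionTreeRegressor(max_depth=3).fit(X, y)
print(path_features(t, X).shape)  # (20, n_nodes)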