コード例 #1
0
ファイル: lambdamart.py プロジェクト: ofirnachum/pyltr
    def _update_terminal_regions(self, tree, X, y, lambdas, deltas, y_pred,
                                 sample_mask):
        """Refit the leaf values of ``tree`` and fold them into ``y_pred``.

        Every leaf is assigned sum(lambdas) / sum(deltas) taken over the
        samples selected by ``sample_mask`` that land in that leaf, or 0.0
        when the delta sum is zero. ``y_pred`` is then updated in place for
        all samples (masked or not), scaled by ``self.learning_rate``.
        ``y`` is accepted but not read here.
        """
        leaf_of_sample = tree.apply(X)

        # Out-of-mask samples get -1 so they can never match a leaf id.
        in_mask_leaf = leaf_of_sample.copy()
        in_mask_leaf[~sample_mask] = -1

        is_leaf = tree.children_left == sklearn.tree._tree.TREE_LEAF
        for leaf_id in np.where(is_leaf)[0]:
            members = in_mask_leaf == leaf_id
            lambda_total = np.sum(lambdas[members])
            delta_total = np.sum(deltas[members])
            if delta_total == 0.0:
                tree.value[leaf_id, 0, 0] = 0.0
            else:
                tree.value[leaf_id, 0, 0] = lambda_total / delta_total

        step = tree.value[leaf_of_sample, 0, 0] * self.learning_rate
        y_pred += step
コード例 #2
0
    def _update_terminal_regions(self, tree, X, y, lambdas, deltas, y_pred,
                                 sample_mask):
        """Recompute each leaf value of ``tree``, then update ``y_pred``.

        For every leaf, the new value is the ratio of the summed ``lambdas``
        to the summed ``deltas`` over the in-mask samples routed to it; a
        zero delta sum yields 0.0. Afterwards ``y_pred`` is incremented in
        place by the leaf value of every sample times
        ``self.learning_rate``. The ``y`` argument is unused in this method.
        """
        all_regions = tree.apply(X)

        # Copy so the unmasked assignment below still sees every sample.
        kept_regions = all_regions.copy()
        kept_regions[~sample_mask] = -1  # -1 never equals a real leaf id

        leaf_ids = np.where(
            tree.children_left == sklearn.tree._tree.TREE_LEAF)[0]
        for leaf_id in leaf_ids:
            in_leaf = kept_regions == leaf_id
            numerator = np.sum(lambdas[in_leaf])
            denominator = np.sum(deltas[in_leaf])
            new_value = 0.0 if denominator == 0.0 else (numerator / denominator)
            tree.value[leaf_id, 0, 0] = new_value

        y_pred += tree.value[all_regions, 0, 0] * self.learning_rate
コード例 #3
0
    def closest_decision(self,
                         tree,
                         sample,
                         strategy='informativeness',
                         beta=5):
        '''Find the closest decision that is of a class other than the
        target class.

        Args:
            tree: sklearn tree
            sample: Entry to explain
            strategy: Passed unchanged to ``self._get_path`` to choose
                among the candidate foil nodes
            beta: Hyperparameter >= 1 to determine when to only
                search part of tree. The search starts at the node at
                depth ``round(len(decision_path) / beta)`` on the sample's
                decision path, so a larger beta starts closer to the root.

        Returns:
            Ordered descriptive decision path difference,
            confidence of leaf decision. ``(None, 0.0)`` is returned when
            the decision path has fewer than two nodes or when no foil
            node is found.
        '''
        # Only search part of tree depending on tree size
        decision_path = tree.decision_path(sample.reshape(1, -1)).indices
        path_length = len(decision_path)
        if path_length < 2:
            warnings.warn('Stub tree')
            return None, 0.0

        # round(len / beta) reaches len itself when beta is (close to) 1,
        # which would index one past the end of the path; clamp to the
        # last node on the path instead.
        start_depth = min(int(round(path_length / beta)), path_length - 1)
        start_node = decision_path[start_depth]

        # Get decision for sample
        fact_leaf = tree.apply(sample.reshape(1, -1)).item(0)

        # TODO: Retrain tree if wrong prediction
        if np.argmax(tree.tree_.value[fact_leaf]) != 0:
            warnings.warn('Tree did not predict as fact')

        # Find closest leaf that does not predict output x, based on a strategy
        graph, foil_nodes = self._fact_foil_graph(tree.tree_,
                                                  start_node=start_node)

        if self.verbose:
            print(f'[E] Found {len(foil_nodes)} contrastive decision regions, '
                  f'starting from node {start_node}')

        if len(foil_nodes) == 0:
            # Same (None, float) shape as the stub-tree return above.
            return None, 0.0

        # Contrastive decision region
        foil_path, confidence = self._get_path(graph, fact_leaf, foil_nodes,
                                               tree.tree_, strategy)

        return self.descriptive_path(foil_path, sample, tree), confidence
コード例 #4
0
 def get_data_mask_of_ests_vaild(self, X_train, verbose=True):
     """Return a boolean mask of the rows of ``X_train`` accepted by every tree.

     A row is kept only if, for each tree of each forest in
     ``self.estimator``, the node id returned by ``tree.apply`` is listed in
     ``self.est_leaf_index[forest_index][tree_index]``.

     Args:
         X_train: samples to filter; must support ``len`` and be accepted
             by each tree's ``apply``.
         verbose: ``2`` prints per-tree statistics; any truthy value prints
             a newline after each forest.

     Returns:
         1-D boolean array with one entry per row of ``X_train``.
     """
     estimators = self.estimator
     # Explicit dtype: a bare np.array([]) is float64, which cannot be
     # AND-ed with the boolean masks below when X_train is empty.
     last_forest_mask = np.array([True] * len(X_train), dtype=bool)
     for index, forest in enumerate(estimators):
         tree_leaf_index = self.est_leaf_index[index]
         for i_tree, tree in enumerate(forest):
             node_id_lt = tree.apply(X_train)
             pass_data_mask = np.isin(node_id_lt, tree_leaf_index[i_tree])
             last_forest_mask = last_forest_mask & pass_data_mask
             if verbose == 2:
                 print("%d leaf-num:%d[now:%d/all:%d] " % (
                     index,
                     len(tree_leaf_index[i_tree]),
                     pass_data_mask.sum(),
                     last_forest_mask.sum()), end="")
         if verbose:
             print()
     return last_forest_mask
コード例 #5
0
def print_decision_path(tree, X, sample_id=0):
    """Print the split tests one sample passes through in ``tree``.

    Args:
        tree: fitted estimator exposing ``decision_path``, ``apply`` and a
            ``tree_`` attribute with ``feature`` and ``threshold`` arrays.
        X: 2-D array of samples, indexed as ``X[row, feature]``.
        sample_id: row of ``X`` whose path is printed.

    Prints a header, the node ids on the sample's path, and one line per
    split node showing the feature value, the comparison taken and the
    threshold. Returns nothing.
    """
    node_indicator = tree.decision_path(X)
    leave_id = tree.apply(X)
    # Row `sample_id` of the sparse indicator: ids of the nodes on its path.
    node_index = node_indicator.indices[
        node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]
    print('Rules used to predict sample %s: ' % sample_id)
    print(node_index)
    for node_id in node_index:
        if leave_id[sample_id] == node_id:
            # The leaf holds no split test; its feature entry is a
            # placeholder, so indexing X with it would read a wrong column
            # (or fail on single-feature data).
            continue
        if (X[sample_id, tree.tree_.feature[node_id]] <=
                tree.tree_.threshold[node_id]):
            threshold_sign = "<="
        else:
            threshold_sign = ">"
        print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)" %
              (node_id, sample_id, tree.tree_.feature[node_id],
               X[sample_id, tree.tree_.feature[node_id]], threshold_sign,
               tree.tree_.threshold[node_id]))
コード例 #6
0
def Tree_path(tree, samples):
    '''
    Collect the distinct decision rules that ``samples`` follow in ``tree``.

    inputs:

    tree    -- fitted tree exposing ``tree_``, ``decision_path`` and ``apply``
               (e.g. the best estimator of a GridSearchCV)
    samples -- samples indexable as ``samples[i][feature_no]``

    outputs:

    A list of dictionaries, one per unique rule. Each key is a feature
    number tested on the path; each value is a flat list alternating a
    comparison ("<=" or ">") with the threshold it was compared against,
    in path order. The entry for the leaf node itself is dropped.
    '''

    feature = tree.tree_.feature
    threshold = tree.tree_.threshold

    paths = tree.decision_path(samples)
    leaves = tree.apply(samples)

    rules = []

    for row in range(len(samples)):
        # Node ids visited by this sample, root first.
        path = paths.indices[paths.indptr[row]:paths.indptr[row + 1]]

        rule = {feature[node]: [] for node in path}

        for node in path:
            if leaves[row] == node:
                # Reached the leaf: discard its placeholder key and keep
                # the rule only if it has not been recorded already.
                rule.pop(feature[node], None)
                if rule not in rules:
                    rules.append(rule)
                continue

            went_left = samples[row][feature[node]] <= threshold[node]
            sign = "<=" if went_left else ">"
            rule[feature[node]].extend((sign, threshold[node]))

    return rules
コード例 #7
0
    def get_forest_leaf_index(self, clf, X_valid, y_valid):
        """Select, per tree, the node ids whose majority-class share is
        above that tree's average.

        For every tree in ``clf`` each node id returned by ``tree.apply``
        is scored as ``max(#label-1 samples, #label-0 samples) / #samples``
        reaching that id. Ids with a score strictly greater than the mean
        of the tree's positive scores are kept.

        Args:
            clf: iterable of fitted trees exposing ``apply``.
            X_valid: validation samples; boolean-mask indexable (NumPy array).
            y_valid: NumPy array of labels; only labels 1 and 0 are counted.

        Returns:
            List with one integer array per tree holding the selected node
            ids, ordered by decreasing score.
        """
        X_pos = X_valid[y_valid == 1]
        X_neg = X_valid[y_valid == 0]

        forest_leaf_index = []
        for tree in clf:
            node_id_all = tree.apply(X_valid)
            n_ids = int(np.max(node_id_all)) + 1
            cnt_all = np.bincount(node_id_all, minlength=n_ids)

            # minlength aligns the per-class counts with cnt_all; a class
            # with no samples contributes zeros instead of crashing.
            if len(X_pos):
                cnt_pos = np.bincount(tree.apply(X_pos), minlength=n_ids)
            else:
                cnt_pos = np.zeros(n_ids, dtype=cnt_all.dtype)
            if len(X_neg):
                cnt_neg = np.bincount(tree.apply(X_neg), minlength=n_ids)
            else:
                cnt_neg = np.zeros(n_ids, dtype=cnt_all.dtype)
            cnt_major = np.maximum(cnt_pos, cnt_neg)

            # Ids that no sample reaches have a zero count; they get score
            # 0 directly rather than via a 0/0 division patched afterwards.
            score = np.divide(cnt_major, cnt_all,
                              out=np.zeros(n_ids, dtype=float),
                              where=cnt_all > 0)

            order = np.argsort(score, axis=0)[::-1]
            positive = score > 0
            if not positive.any():
                # No scored id at all: nothing can exceed the threshold.
                forest_leaf_index.append(order[:0])
                continue

            threshold_imp = np.mean(score[positive], axis=0)
            forest_leaf_index.append(order[score[order] > threshold_imp])
        return forest_leaf_index
コード例 #8
0
    def get_forest_leaf_index(self, clf, X_valid, y_valid, num_class):
        """Select, per tree, the node ids whose majority-class share is
        above that tree's average.

        For every tree in ``clf`` each node id returned by ``tree.apply``
        is scored as the largest per-class sample count divided by the
        total sample count reaching that id. Ids with a score strictly
        greater than the mean of the tree's positive scores are kept.

        Args:
            clf: iterable of fitted trees exposing ``apply``.
            X_valid: validation samples; boolean-mask indexable (NumPy array).
            y_valid: NumPy array of labels; labels ``0 .. num_class - 1``
                are counted.
            num_class: number of classes; ``None`` means 2.

        Returns:
            List with one integer array per tree holding the selected node
            ids, ordered by decreasing score.
        """
        if num_class is None:
            num_class = 2

        # Classes with no validation samples are skipped entirely.
        class_subsets = [X_valid[y_valid == label]
                         for label in range(num_class)]
        class_subsets = [subset for subset in class_subsets
                         if len(subset) > 0]

        forest_leaf_index = []
        for tree in clf:
            node_id_all = tree.apply(X_valid)
            n_ids = int(np.max(node_id_all)) + 1
            cnt_all = np.bincount(node_id_all, minlength=n_ids)

            # Largest per-class count for every node id; minlength keeps
            # each class histogram aligned with cnt_all.
            cnt_major = np.zeros(n_ids, dtype=cnt_all.dtype)
            for subset in class_subsets:
                cnt_class = np.bincount(tree.apply(subset), minlength=n_ids)
                cnt_major = np.maximum(cnt_major, cnt_class)

            # Ids that no sample reaches have a zero count; they get score
            # 0 directly rather than via a 0/0 division patched afterwards.
            score = np.divide(cnt_major, cnt_all,
                              out=np.zeros(n_ids, dtype=float),
                              where=cnt_all > 0)

            order = np.argsort(score, axis=0)[::-1]
            positive = score > 0
            if not positive.any():
                # No scored id at all: nothing can exceed the threshold.
                forest_leaf_index.append(order[:0])
                continue

            threshold_imp = np.mean(score[positive], axis=0)
            forest_leaf_index.append(order[score[order] > threshold_imp])
        return forest_leaf_index