def _update_terminal_regions(self, tree, X, y, lambdas, deltas, y_pred, sample_mask):
    '''Overwrite each leaf value of ``tree`` and fold the tree into ``y_pred``.

    Every leaf receives ``sum(lambdas) / sum(deltas)`` computed over the
    in-sample rows (``sample_mask`` true) that land in it, or ``0.0`` when
    the ``deltas`` sum is exactly zero.  Afterwards ``y_pred`` is incremented
    in place, for *all* rows of ``X``, by the value of the row's leaf scaled
    by ``self.learning_rate``.

    Args:
        tree: object exposing ``apply``, ``children_left`` and ``value``
            (presumably scikit-learn's low-level ``Tree`` -- confirm
            against the caller).
        X: samples routed through ``tree.apply``.
        y: unused here.
        lambdas: per-sample numerators.
        deltas: per-sample denominators.
        y_pred: running prediction array, modified in place.
        sample_mask: boolean mask of the rows used to fit the leaf values.

    Returns:
        None.
    '''
    leaf_of_sample = tree.apply(X)

    # Rows outside the mask are sent to the sentinel id -1 so that they can
    # never match a real leaf when the per-leaf sums are taken.
    fitted_leaf = leaf_of_sample.copy()
    fitted_leaf[~sample_mask] = -1

    is_leaf = tree.children_left == sklearn.tree._tree.TREE_LEAF
    for leaf in np.nonzero(is_leaf)[0]:
        members = np.nonzero(fitted_leaf == leaf)
        numerator = np.sum(lambdas[members])
        denominator = np.sum(deltas[members])
        if denominator == 0.0:
            tree.value[leaf, 0, 0] = 0.0
        else:
            tree.value[leaf, 0, 0] = numerator / denominator

    # The unmasked assignment is used here: every row gets its update.
    y_pred += tree.value[leaf_of_sample, 0, 0] * self.learning_rate
def _update_terminal_regions(self, tree, X, y, lambdas, deltas, y_pred, sample_mask):
    '''Refit the leaf values of ``tree`` and add its contribution to ``y_pred``.

    For each leaf, the new value is the sum of ``lambdas`` divided by the sum
    of ``deltas`` over the rows that fall in that leaf and are selected by
    ``sample_mask``; a zero ``deltas`` sum yields a leaf value of ``0.0``.
    ``y_pred`` is then updated in place for every row of ``X`` (masked or
    not) with ``leaf value * self.learning_rate``.

    Args:
        tree: fitted tree structure providing ``apply``, ``children_left``
            and a writable ``value`` array.
        X: input samples.
        y: targets; not read by this method.
        lambdas: array indexed per sample, summed as the numerator.
        deltas: array indexed per sample, summed as the denominator.
        y_pred: predictions accumulated so far; mutated in place.
        sample_mask: boolean array, true for rows that take part in the fit.
    '''
    terminal_ids = tree.apply(X)

    # Copy before masking: the untouched ids are needed for the final update.
    in_bag_ids = terminal_ids.copy()
    in_bag_ids[~sample_mask] = -1  # -1 never equals a real node id

    leaf_nodes = np.nonzero(
        tree.children_left == sklearn.tree._tree.TREE_LEAF)[0]
    for leaf in leaf_nodes:
        rows = np.nonzero(in_bag_ids == leaf)
        lambda_total = np.sum(lambdas[rows])
        delta_total = np.sum(deltas[rows])
        leaf_value = 0.0
        if not delta_total == 0.0:
            leaf_value = lambda_total / delta_total
        tree.value[leaf, 0, 0] = leaf_value

    y_pred += tree.value[terminal_ids, 0, 0] * self.learning_rate
def closest_decision(self, tree, sample, strategy='informativeness', beta=5):
    '''Find the closest decision that is of a class other than the target class.

    Args:
        tree: sklearn tree
        sample: Entry to explain
        strategy: Passed through unchanged to ``self._get_path`` to pick
            among the candidate foil nodes
        beta: Hyperparameter >= 1 to determine when to only search part of
            the tree. The search starts at depth
            ``round(len(decision_path) / beta)`` along the sample's own
            decision path, so larger values start nearer the root.

    Returns:
        Ordered descriptive decision path difference, confidence of leaf
        decision. ``(None, 0.0)`` is returned when the sample's decision
        path has fewer than two nodes (a warning is issued) or when no
        foil node is found.
    '''
    # Only search part of tree depending on tree size
    decision_path = tree.decision_path(sample.reshape(1, -1)).indices
    if len(decision_path) < 2:
        warnings.warn('Stub tree')
        return None, 0.0

    # With beta == 1 the rounded depth equals len(decision_path), one past
    # the last valid index; clamp so the search starts at the leaf at worst.
    start_depth = min(int(round(len(decision_path) / beta)),
                      len(decision_path) - 1)
    start_node = decision_path[start_depth]

    # Get decision for sample
    fact_leaf = tree.apply(sample.reshape(1, -1)).item(0)

    # Class index 0 is treated as the fact class here.
    # TODO: Retrain tree if wrong prediction
    if np.argmax(tree.tree_.value[fact_leaf]) != 0:
        warnings.warn('Tree did not predict as fact')

    # Find closest leaf that does not predict output x, based on a strategy
    graph, foil_nodes = self._fact_foil_graph(tree.tree_,
                                              start_node=start_node)

    if self.verbose:
        print(f'[E] Found {len(foil_nodes)} contrastive decision regions, '
              f'starting from node {start_node}')

    if len(foil_nodes) == 0:
        return None, 0.0

    # Contrastive decision region
    foil_path, confidence = self._get_path(graph, fact_leaf, foil_nodes,
                                           tree.tree_, strategy)

    return self.descriptive_path(foil_path, sample, tree), confidence
def get_data_mask_of_ests_vaild(self, X_train, verbose=True):
    '''Return a boolean mask of the rows accepted by every tree of every forest.

    A row is accepted by a tree when the node id returned by
    ``tree.apply(X_train)`` is listed in
    ``self.est_leaf_index[forest][tree]``.  The returned mask is the logical
    AND of that test over all trees in ``self.estimator``.

    Args:
        X_train: samples to filter; ``len(X_train)`` fixes the mask length.
        verbose: ``2`` prints one progress fragment per tree; any truthy
            value prints a line break after each forest.

    Returns:
        Boolean numpy array with one entry per row of ``X_train``.
    '''
    keep = np.ones(len(X_train), dtype=bool)

    for forest_no, forest in enumerate(self.estimator):
        kept_nodes_per_tree = self.est_leaf_index[forest_no]

        for tree_no, tree in enumerate(forest):
            kept_nodes = kept_nodes_per_tree[tree_no]
            accepted = np.isin(tree.apply(X_train), kept_nodes)
            keep = np.logical_and(keep, accepted)

            if verbose == 2:
                progress = "%d leaf-num:%d[now:%d/all:%d] " % (
                    forest_no,
                    len(kept_nodes),
                    np.count_nonzero(accepted),
                    np.count_nonzero(keep),
                )
                print(progress, end="")

        # NOTE(review): reconstructed as one line break per forest; confirm
        # against the original layout.
        if verbose:
            print()

    return keep
def print_decision_path(tree, X, sample_id=0):
    '''Print the split tests a fitted tree applies to one sample.

    Prints a header, the node ids on the sample's decision path, and then
    one line per split node on that path showing the feature index, the
    sample's value for it, the comparison that held and the threshold.
    The leaf itself is skipped because it has no split to report.

    Args:
        tree: fitted estimator exposing ``decision_path``, ``apply`` and
            ``tree_.feature`` / ``tree_.threshold``.
        X: 2-D array of samples, indexed as ``X[sample_id, feature]``.
        sample_id: row of ``X`` to explain.

    Returns:
        None; output goes to stdout.
    '''
    node_indicator = tree.decision_path(X)
    leave_id = tree.apply(X)

    # Slice this sample's node ids out of the CSR-style indicator.
    node_index = node_indicator.indices[
        node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

    print('Rules used to predict sample %s: ' % sample_id)
    print(node_index)

    feature = tree.tree_.feature
    threshold = tree.tree_.threshold
    for node_id in node_index:
        # A leaf carries a placeholder feature/threshold, so indexing X with
        # it would print a meaningless rule or fail on narrow inputs.
        if leave_id[sample_id] == node_id:
            continue

        value = X[sample_id, feature[node_id]]
        threshold_sign = "<=" if value <= threshold[node_id] else ">"

        print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
              % (node_id,
                 sample_id,
                 feature[node_id],
                 value,
                 threshold_sign,
                 threshold[node_id]))
def Tree_path(tree, samples):
    '''Collect the distinct decision rules a fitted tree applies to ``samples``.

    inputs:
        tree: fitted tree (e.g. the best estimator of a GridSearchCV)
            exposing ``decision_path``, ``apply`` and ``tree_``.
        samples: indexable collection of samples, read as
            ``samples[i][feature]``.

    outputs:
        A list of dictionaries, one per distinct rule, in order of first
        appearance.  Keys are feature indices; each value is a flat list of
        alternating comparator (``"<="`` or ``">"``) and threshold, in the
        order the splits on that feature are met along the path.
    '''
    split_feature = tree.tree_.feature
    split_threshold = tree.tree_.threshold
    path_matrix = tree.decision_path(samples)
    leaf_of = tree.apply(samples)

    rules = []
    for row in range(len(samples)):
        begin = path_matrix.indptr[row]
        end = path_matrix.indptr[row + 1]
        visited = path_matrix.indices[begin:end]

        # Pre-create an empty condition list for every feature on the path
        # (this also creates an entry for the leaf's feature slot).
        rule = {split_feature[node]: [] for node in visited}

        for node in visited:
            feat = split_feature[node]

            if leaf_of[row] == node:
                # End of the path: discard the leaf's entry and keep the
                # rule only if an equal one has not been stored already.
                rule.pop(feat, None)
                if rule not in rules:
                    rules.append(rule)
                continue

            if samples[row][feat] <= split_threshold[node]:
                sign = "<="
            else:
                sign = ">"
            rule[feat].extend((sign, split_threshold[node]))

    return rules
def get_forest_leaf_index(self, clf, X_valid, y_valid):
    '''Pick, for each tree of ``clf``, the node ids with above-average purity.

    For every tree the validation samples are routed with ``tree.apply``.
    Each node id gets the score ``max(#label-1, #label-0) / #samples``
    (0 for ids that receive no sample).  The ids whose score is strictly
    greater than the mean of the tree's positive scores are kept, ordered
    by descending score.

    Args:
        clf: iterable of fitted trees exposing ``apply``.
        X_valid: validation samples accepted by ``tree.apply``.
        y_valid: labels aligned with ``X_valid``; only the values ``0`` and
            ``1`` are counted towards the majority.

    Returns:
        List with one integer numpy array of selected node ids per tree.
        An array is empty when no id beats the mean, e.g. when all
        populated ids have the same score.

    Raises:
        ValueError: if ``X_valid`` is empty (no maximum node id exists).
    '''
    labels = np.asarray(y_valid)
    forest_leaf_index = []

    for tree in clf:
        # Route the data once and split the resulting ids by label; this
        # also avoids calling ``apply`` on an empty subset when one of the
        # two classes is absent from the validation data.
        leaf_ids = np.asarray(tree.apply(X_valid))
        n_bins = int(np.max(leaf_ids)) + 1

        cnt_all = np.bincount(leaf_ids, minlength=n_bins)
        cnt_pos = np.bincount(leaf_ids[labels == 1], minlength=n_bins)
        cnt_neg = np.bincount(leaf_ids[labels == 0], minlength=n_bins)
        cnt_major = np.maximum(cnt_pos, cnt_neg)

        # Ids that never receive a sample keep score 0 instead of 0/0.
        score = np.zeros(n_bins, dtype=float)
        np.divide(cnt_major, cnt_all, out=score, where=cnt_all > 0)

        order = np.argsort(score, axis=0)[::-1]
        ranked = score[order]

        positive = score[score > 0]
        if positive.size == 0:
            # Nothing to average over; no id can be selected.
            forest_leaf_index.append(order[:0])
            continue

        forest_leaf_index.append(order[ranked > np.mean(positive)])

    return forest_leaf_index
def get_forest_leaf_index(self, clf, X_valid, y_valid, num_class=None):
    '''Pick, for each tree of ``clf``, the node ids with above-average purity.

    For every tree the validation samples are routed with ``tree.apply``.
    Each node id gets the score ``(largest per-class count) / #samples``
    (0 for ids that receive no sample).  The ids whose score is strictly
    greater than the mean of the tree's positive scores are kept, ordered
    by descending score.

    Args:
        clf: iterable of fitted trees exposing ``apply``.
        X_valid: validation samples accepted by ``tree.apply``.
        y_valid: labels aligned with ``X_valid``; compared against the
            integers ``0 .. num_class - 1``.
        num_class: number of classes to count; ``None`` means 2.

    Returns:
        List with one integer numpy array of selected node ids per tree.
        An array is empty when no id beats the mean, e.g. when all
        populated ids have the same score or no sample carries a label in
        ``range(num_class)``.

    Raises:
        ValueError: if ``X_valid`` is empty (no maximum node id exists).
    '''
    if num_class is None:
        num_class = 2

    labels = np.asarray(y_valid)
    forest_leaf_index = []

    for tree in clf:
        # Route the data once and split the resulting ids by label; classes
        # with no sample simply contribute an all-zero count.
        leaf_ids = np.asarray(tree.apply(X_valid))
        n_bins = int(np.max(leaf_ids)) + 1

        cnt_all = np.bincount(leaf_ids, minlength=n_bins)
        cnt_major = np.zeros(n_bins, dtype=cnt_all.dtype)
        for label in range(num_class):
            cnt_label = np.bincount(leaf_ids[labels == label],
                                    minlength=n_bins)
            np.maximum(cnt_major, cnt_label, out=cnt_major)

        # Ids that never receive a sample keep score 0 instead of 0/0.
        score = np.zeros(n_bins, dtype=float)
        np.divide(cnt_major, cnt_all, out=score, where=cnt_all > 0)

        order = np.argsort(score, axis=0)[::-1]
        ranked = score[order]

        positive = score[score > 0]
        if positive.size == 0:
            # Nothing to average over; no id can be selected.
            forest_leaf_index.append(order[:0])
            continue

        forest_leaf_index.append(order[ranked > np.mean(positive)])

    return forest_leaf_index