def __init__(self,
             tree_model,
             x_data: (pd.DataFrame, np.ndarray),
             y_data: (pd.Series, np.ndarray),
             feature_names: List[str] = None,
             target_name: str = None,
             class_names: (List[str], Mapping[int, str]) = None):
    """
    Store the tree model and its data, then collect the nodes of the shadow tree.

    Parameters
    ----------
    :param tree_model: sklearn.tree.DecisionTreeRegressor, sklearn.tree.DecisionTreeClassifier, xgboost.core.Booster
        The decision tree to be interpreted
    :param x_data: pd.DataFrame, np.ndarray
        Feature values on which the shadow tree will be built.
    :param y_data: pd.Series, np.ndarray
        Target values on which the shadow tree will be built.
    :param feature_names: List[str]
        Names of the features
    :param target_name: str
        Name of the target
    :param class_names: List[str], Mapping[int, str]
        Names of the classes (in case of a classifier)

    :raises Exception: if is_fit() reports that the model has not been fit.
    """
    # The model must be stored first: is_fit() is called on the instance right after.
    self.tree_model = tree_model
    model_is_fit = self.is_fit()
    if not model_is_fit:
        raise Exception(f"Model {tree_model} is not fit.")

    self.feature_names = feature_names
    self.target_name = target_name

    # Raw inputs are passed through the class-level converters before being stored.
    self.x_data = ShadowDecTree._get_x_data(x_data)
    self.y_data = ShadowDecTree._get_y_data(y_data)

    tree_nodes = self._get_tree_nodes()
    self.root, self.leaves, self.internal = tree_nodes

    # class_names is only assigned for classifiers.
    if not self.is_classifier():
        return
    self.class_names = utils._normalize_class_names(class_names, self.nclasses())
def clfviz_univar(model, x: np.ndarray, y: np.ndarray,
                  ntiles=100,
                  binary_threshold=0.5,
                  show=['instances', 'boundaries', 'probabilities', 'misclassified', 'legend'],
                  feature_name=None, target_name=None, class_names=None,
                  markers=None,
                  fontsize=9, fontname="Arial",
                  dot_w=25,
                  yshift=.09,
                  sigma=.09,
                  colors: dict = None,
                  ax=None) -> None:
    """
    Draw a classifier's predictions over a single feature onto a matplotlib axis.

    See comment and parameter descriptions for clfviz() above.

    :param model: classifier handed to _predict_proba() to obtain class probabilities.
    :param x: feature values, 1D or a single column (np.ndarray or pd.Series).
    :param y: class label of each instance in x (np.ndarray or pd.Series).
    :param ntiles: number of evenly spaced grid points between min(x) and max(x).
    :param binary_threshold: with exactly two classes in y, class index 1 is
        predicted when its probability is >= this value.
    :param show: names of the elements to draw; recognised entries are
        'instances', 'boundaries', 'probabilities', 'misclassified', 'legend'.
        The default list is only read, never mutated.
    :param feature_name: x-axis label; no label is set when None.
    :param target_name: passed to add_classifier_legend().
    :param class_names: passed through utils._normalize_class_names() for the legend.
    :param markers: one matplotlib marker per class; defaults to 'o' for every class.
    :param fontsize: font size for tick labels, axis label and legend.
    :param fontname: font name for tick labels, axis label and legend.
    :param dot_w: marker size passed to scatter().
    :param yshift: vertical offset between the rows of dots of successive classes.
    :param sigma: standard deviation of the vertical jitter applied to the dots.
    :param colors: colour overrides, passed through adjust_colors().
    :param ax: axis to draw on; a 5 x 1.2 figure is created when None.
    :raises ValueError: if x has more than one column or more than two dimensions.
    """
    if isinstance(x, pd.Series):
        x = x.values
    if isinstance(y, pd.Series):
        y = y.values

    # Validate before creating a figure so the error path does not leave an
    # orphan figure behind (same order as clfviz_bivar).
    if (len(x.shape) == 2 and x.shape[1] != 1) or len(x.shape) > 2:
        raise ValueError(f"Expecting 1D data not {x.shape}")

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 1.2))

    colors = adjust_colors(colors)

    mu = 0.08
    class_values = np.unique(y)  # returns sorted
    nclasses = len(class_values)
    class_colors = np.array(colors['classes'][nclasses])
    # NOTE: keyed by class *value*, not by position in class_values.
    color_map = {v: class_colors[i] for i, v in enumerate(class_values)}

    x_min, x_max = np.min(x), np.max(x)
    x1r = x_max - x_min
    grid_points, w = np.linspace(x_min, x_max, num=ntiles, endpoint=True, retstep=True)
    grid_proba = _predict_proba(model, grid_points)
    if nclasses == 2:  # is k=2 binary?
        grid_pred = np.where(grid_proba[:, 1] >= binary_threshold, 1, 0)
    else:
        grid_pred = np.argmax(grid_proba, axis=1)  # TODO: assumes classes are 0..k-1

    ymax = ax.get_ylim()[1]

    # compute the stripes on the bottom showing probabilities
    if 'probabilities' in show:
        # Use the caller's (adjusted) colours so the strip agrees with the dots
        # and legend; previously the defaults were always used here.
        color_map, _, grid_proba_colors = \
            _get_grid_colors(grid_proba, grid_pred, class_values, colors=colors)

        pred_box_height = .08 * ymax
        boxes = [
            patches.Rectangle((gx, 0), w, pred_box_height,
                              edgecolor='none', facecolor=grid_proba_colors[i],
                              alpha=colors['tile_alpha'])
            for i, gx in enumerate(grid_points)
        ]
        ax.add_collection(PatchCollection(boxes, match_original=True))
        # drop box around the gradation
        rect = patches.Rectangle((grid_points[0], 0), x1r + w, pred_box_height,
                                 linewidth=.3, edgecolor=colors['rect_edge'],
                                 facecolor='none')
        ax.add_patch(rect)

    if 'boundaries' in show:
        # A non-zero difference between neighbouring grid predictions marks a
        # class transition; the leading 0 realigns diff() with grid_points.
        dx = np.abs(np.diff(grid_pred))
        dx = np.hstack([0, dx])
        dx_edge_idx = np.where(dx)
        for lx in grid_points[dx_edge_idx]:
            ax.plot([lx, lx], [*ax.get_ylim()], '--', lw=.3,
                    c=colors['split_line'], alpha=1.0)

    if 'instances' in show:
        # user should pass in short and wide fig
        x_proba = _predict_proba(model, x)
        if nclasses == 2:  # is k=2 binary?
            x_pred = np.where(x_proba[:, 1] >= binary_threshold, 1, 0)
        else:
            x_pred = np.argmax(x_proba, axis=1)  # TODO: assumes classes are 0..k-1
        class_x = [x[y == cl] for cl in class_values]
        class_x_pred = [x_pred[y == cl] for cl in class_values]

        if markers is None:
            markers = ['o'] * len(class_x)

        show_misclassified = 'misclassified' in show
        for i, x_ in enumerate(class_x):
            cl = class_values[i]
            # Look the colour up by class value: color_map is keyed by label,
            # so indexing it with i breaks for labels other than 0..k-1.
            dot_color = color_map[cl]
            row_y = mu + i * yshift  # each class gets its own row of dots
            if show_misclassified:
                # Show correctly classified markers
                good_x = x_[class_x_pred[i] == cl]
                noise = np.random.normal(mu, sigma, size=len(good_x))
                ax.scatter(good_x, row_y + noise,
                           s=dot_w, c=dot_color, marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'], lw=.5)
                # Show misclassified markers (can't have alpha per marker so do in 2 calls)
                bad_x = x_[class_x_pred[i] != cl]
                noise = np.random.normal(mu, sigma, size=len(bad_x))
                ax.scatter(bad_x, row_y + noise,
                           s=dot_w, c=dot_color, marker=markers[i],
                           alpha=1.0,
                           edgecolors=colors['warning'], lw=.5)
            else:
                noise = np.random.normal(mu, sigma, size=len(x_))
                ax.scatter(x_, row_y + noise,
                           s=dot_w, c=dot_color, marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'], lw=.5)

    for side in ('top', 'left', 'right'):
        ax.spines[side].set_visible(False)
    ax.spines['bottom'].set_linewidth(0.1)
    ax.set_yticks([])
    ax.tick_params(axis='both', which='major', width=.3,
                   labelcolor=colors['tick_label'], labelsize=fontsize)
    for tick in ax.get_xticklabels():
        tick.set_fontname(fontname)
    for tick in ax.get_yticklabels():
        tick.set_fontname(fontname)
    # Leave room for one row of dots per class plus the jitter.
    ax.set_ylim(0, mu + nclasses * yshift + 6 * sigma)

    if feature_name is not None:
        ax.set_xlabel(f"{feature_name}", fontsize=fontsize, fontname=fontname,
                      color=colors['axis_label'])

    if 'legend' in show:
        class_names = utils._normalize_class_names(class_names, nclasses)
        add_classifier_legend(ax, class_names, class_values, color_map, target_name,
                              colors, fontsize=fontsize, fontname=fontname)
def clfviz_bivar(model, X: np.ndarray, y: np.ndarray,
                 ntiles=50, tile_fraction=.9,
                 binary_threshold=0.5,
                 show=['instances', 'boundaries', 'probabilities', 'misclassified', 'legend'],
                 feature_names=None, target_name=None, class_names=None,
                 markers=None,
                 boundary_marker='o', boundary_markersize=.8,
                 fontsize=9, fontname="Arial",
                 dot_w=25,
                 colors: dict = None,
                 ax=None) -> None:
    """
    Draw a classifier's predictions over two features onto a matplotlib axis.

    See comment and parameter descriptions for clfviz() above.

    :param model: classifier handed to _compute_tiling() and _predict_proba().
    :param X: feature matrix with exactly two columns (np.ndarray or pd.DataFrame).
    :param y: class label of each row of X (np.ndarray or pd.Series).
    :param ntiles: passed to _compute_tiling(); the grid predictions are
        reshaped to (ntiles, ntiles).
    :param tile_fraction: passed to _compute_tiling().
    :param binary_threshold: with exactly two classes in y, class index 1 is
        predicted when its probability is >= this value.
    :param show: names of the elements to draw; recognised entries are
        'instances', 'boundaries', 'probabilities', 'misclassified', 'legend'.
        The default list is only read, never mutated.
    :param feature_names: x and y axis labels (first two entries are used);
        no labels are set when None.
    :param target_name: passed to add_classifier_legend().
    :param class_names: passed through utils._normalize_class_names() for the legend.
    :param markers: one matplotlib marker per class; defaults to 'o' for every class.
    :param boundary_marker: passed to _draw_boundary_edges().
    :param boundary_markersize: passed to _draw_boundary_edges().
    :param fontsize: font size for tick labels, axis labels and legend.
    :param fontname: font name for tick labels, axis labels and legend.
    :param dot_w: marker size passed to scatter().
    :param colors: colour overrides, passed through adjust_colors().
    :param ax: axis to draw on; a 5 x 3.5 figure is created when None.
    :raises ValueError: if X is 1D, has more than two dimensions, or does not
        have exactly two columns.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values

    if len(X.shape) == 1 or (len(X.shape) == 2 and X.shape[1] != 2) or len(X.shape) > 2:
        raise ValueError(f"Expecting 2D data not {X.shape}")

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 3.5))

    # Created grid over the range of x1 and x2 variables, get probabilities, predictions
    grid_points, grid_proba, grid_pred_as_matrix, w, x_, class_X, class_values = \
        _compute_tiling(model, X, y, binary_threshold, ntiles, tile_fraction)

    x_proba = _predict_proba(model, X)
    if len(np.unique(y)) == 2:  # is k=2 binary?
        X_pred = np.where(x_proba[:, 1] >= binary_threshold, 1, 0)
    else:
        X_pred = np.argmax(x_proba, axis=1)  # TODO: assumes classes are 0..k-1
    class_X_pred = [X_pred[y == cl] for cl in class_values]

    if markers is None:
        markers = ['o'] * len(class_X)

    colors = adjust_colors(colors)

    class_values = np.unique(y)  # returns sorted

    # Get class to color map for probabilities and predictions
    color_map, grid_pred_colors, grid_proba_colors = \
        _get_grid_colors(grid_proba, grid_pred_as_matrix, class_values, colors)

    # Draw probabilities or class prediction grid
    facecolors = grid_proba_colors if 'probabilities' in show else grid_pred_colors
    _draw_tiles(ax, grid_points, facecolors, colors['tile_alpha'], x_, w)

    # Get grid with class predictions with coordinates (x,y)
    # e.g., y_pred[0,0] is lower left pixel and y_pred[5,5] is top-right pixel
    # for npoints=5
    grid_pred_as_matrix = grid_pred_as_matrix.reshape(ntiles, ntiles)

    if 'boundaries' in show:
        _draw_boundary_edges(ax, grid_points, grid_pred_as_matrix,
                             boundary_marker, boundary_markersize,
                             colors, w, x_)

    # Draw the X instances circles
    if 'instances' in show:
        show_misclassified = 'misclassified' in show
        # The loop variable is deliberately not called x_, which already holds
        # a value returned by _compute_tiling().
        for i, pts in enumerate(class_X):
            cl = class_values[i]
            # Look the colour up by class value: color_map is passed to the
            # legend alongside class_values, so indexing it with i breaks for
            # labels other than 0..k-1.
            dot_color = color_map[cl]
            if show_misclassified:
                # Show correctly classified markers
                good_x = pts[class_X_pred[i] == cl, :]
                ax.scatter(good_x[:, 0], good_x[:, 1],
                           s=dot_w, c=dot_color, marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'], lw=.5)
                # Show misclassified markers (can't have alpha per marker so do in 2 calls)
                bad_x = pts[class_X_pred[i] != cl, :]
                ax.scatter(bad_x[:, 0], bad_x[:, 1],
                           s=dot_w, c=dot_color, marker=markers[i],
                           alpha=1.0,
                           edgecolors=colors['warning'], lw=.5)
            else:
                ax.scatter(pts[:, 0], pts[:, 1],
                           s=dot_w, c=dot_color, marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'], lw=.5)

    if feature_names is not None:
        ax.set_xlabel(f"{feature_names[0]}", fontsize=fontsize, fontname=fontname,
                      color=colors['axis_label'])
        ax.set_ylabel(f"{feature_names[1]}", fontsize=fontsize, fontname=fontname,
                      color=colors['axis_label'])

    if 'legend' in show:
        class_names = utils._normalize_class_names(class_names, nclasses=len(class_values))
        add_classifier_legend(ax, class_names, class_values, color_map, target_name,
                              colors, fontsize=fontsize, fontname=fontname)

    ax.tick_params(axis='both', which='major', width=.3,
                   labelcolor=colors['tick_label'], labelsize=fontsize)
    for tick in ax.get_xticklabels():
        tick.set_fontname(fontname)
    for tick in ax.get_yticklabels():
        tick.set_fontname(fontname)

    ax.spines['top'].set_visible(False)  # turns off the top "spine" completely
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(.5)
    ax.spines['bottom'].set_linewidth(.5)