def leaf_prediction(self, x, *, tree=None):
    if not self.stats:
        return
    prediction_option = tree.leaf_prediction
    if not self.is_active() or prediction_option == tree._MAJORITY_CLASS:
        dist = normalize_values_in_dict(self.stats, inplace=False)
    elif prediction_option == tree._NAIVE_BAYES:
        if self.total_weight >= tree.nb_threshold:
            dist = do_naive_bayes_prediction(x, self.stats, self.splitters)
        else:  # Use majority class
            dist = normalize_values_in_dict(self.stats, inplace=False)
    else:  # Naive Bayes Adaptive
        dist = super().leaf_prediction(x, tree=tree)

    dist_sum = sum(dist.values())
    normalization_factor = dist_sum * self.error_estimation * self.error_estimation

    # Weight the node's responses according to the estimated error monitored by ADWIN.
    # Useful when the predictions of the alternate tree and those of the main tree
    # are combined: it gives preference to the most accurate one.
    dist = normalize_values_in_dict(dist, normalization_factor, inplace=False)

    return dist
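# Every snippet in this file leans on `normalize_values_in_dict`. The sketch
# below captures its assumed semantics, inferred from the call sites (`factor`
# defaulting to the sum of the values, `inplace` controlling whether a copy is
# made); it is not the library's actual implementation.
def _normalize_values_in_dict_sketch(dictionary, factor=None, inplace=True):
    if factor is None:
        factor = sum(dictionary.values())
    if not inplace:
        dictionary = dict(dictionary)  # work on a copy
    if factor != 0:  # leave an all-zero distribution untouched
        for key, value in dictionary.items():
            dictionary[key] = value / factor
    return dictionary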
def predict_proba_one(self, x):
    y_proba = collections.Counter()
    for i, model in enumerate(self):
        epsilon = self.wrong_weight[i] + 1e-16
        epsilon /= (self.correct_weight[i] + self.wrong_weight[i]) + 1e-16
        if epsilon == 0 or epsilon > 0.5:
            model_weight = 1.0
        else:
            beta_inv = (1 - epsilon) / epsilon
            model_weight = math.log(beta_inv) if beta_inv != 0 else 0
        predictions = model.predict_proba_one(x)
        normalize_values_in_dict(predictions, inplace=True)
        scale_values_in_dict(predictions, model_weight, inplace=True)
        y_proba.update(predictions)

    normalize_values_in_dict(y_proba, inplace=True)
    return y_proba
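# A rough illustration of the weighting scheme above with hypothetical error
# counts: the vote weight log((1 - epsilon) / epsilon) grows as the estimated
# error rate epsilon shrinks, while degenerate members (epsilon == 0 or
# epsilon > 0.5) fall back to a unit vote.
import math

correct_weight = [90.0, 60.0, 45.0]  # hypothetical bookkeeping
wrong_weight = [10.0, 40.0, 55.0]

for correct, wrong in zip(correct_weight, wrong_weight):
    epsilon = (wrong + 1e-16) / (correct + wrong + 1e-16)
    weight = 1.0 if epsilon == 0 or epsilon > 0.5 else math.log((1 - epsilon) / epsilon)
    print(f"error={epsilon:.2f} -> vote weight={weight:.2f}")
# error=0.10 -> vote weight=2.20
# error=0.40 -> vote weight=0.41
# error=0.55 -> vote weight=1.00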
def __repr__(self):
    if not self.stats:
        return ""
    text = f"Class {max(self.stats, key=self.stats.get)}:"
    for label, proba in sorted(normalize_values_in_dict(self.stats, inplace=False).items()):
        text += f"\n\tP({label}) = {round_sig_fig(proba)}"
    return text
def predict_proba_one(self, x):
    proba = {c: 0.0 for c in self.classes}
    if self._tree_root is not None:
        found_node = self._tree_root.filter_instance_to_leaf(x, None, -1)
        node = found_node.node
        if node is None:
            node = found_node.parent
        if node.is_leaf():
            proba.update(node.leaf_prediction(x, tree=self))
        else:  # Corner case where a decision node is reached
            proba.update(normalize_values_in_dict(node.stats, inplace=False))
    return proba
def predict_proba_one(self, x):
    proba = {c: 0.0 for c in self.classes}
    if self._root is not None:
        found_nodes = [self._root]
        if isinstance(self._root, DTBranch):
            found_nodes = self._root.traverse(x, until_leaf=True)
        for leaf in found_nodes:
            dist = leaf.prediction(x, tree=self)
            # Option Tree prediction (of sorts): combine the response of all leaves
            # reached by the instance
            proba = add_dict_values(proba, dist, inplace=True)
        proba = normalize_values_in_dict(proba)
    return proba
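# `add_dict_values` is assumed to sum two distributions key-wise, treating
# missing keys as zero. A minimal sketch of that behavior (not the library's
# actual implementation):
def _add_dict_values_sketch(a, b, inplace=False):
    out = a if inplace else dict(a)
    for key, value in b.items():
        out[key] = out.get(key, 0) + value
    return out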
def predict_proba_one(self, x):
    if self._tree_root is None:
        return None

    enc_probas = super().predict_proba_one(x)
    enc_class = max(enc_probas, key=enc_probas.get)

    result = {}
    for lbl in self._labels:
        result[lbl] = {False: 0.0, True: 0.0}

    for label_id, label_val in self._r_label_map[enc_class]:
        result[label_id][label_val] = enc_probas[enc_class]
        result[label_id] = normalize_values_in_dict(result[label_id])

    return result
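# Hypothetical decoding example for the label-combination scheme above. The
# winning encoded class carries its probability mass to each of its per-label
# values; normalizing each per-label distribution then makes that value certain.
# The map and probabilities below are made up for illustration.
enc_probas = {2: 0.8, 0: 0.2}  # encoded-class posterior
r_label_map = {2: [("urgent", True), ("promo", False)]}  # encoded class -> label values
enc_class = max(enc_probas, key=enc_probas.get)

result = {lbl: {False: 0.0, True: 0.0} for lbl in ("urgent", "promo")}
for label_id, label_val in r_label_map[enc_class]:
    result[label_id][label_val] = enc_probas[enc_class]
# After normalize_values_in_dict: result["urgent"] == {False: 0.0, True: 1.0}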
def do_naive_bayes_prediction(x, observed_class_distribution: dict, attribute_observers: dict):
    """Perform Naive Bayes prediction.

    Parameters
    ----------
    x
        The feature values.
    observed_class_distribution
        The observed class distribution.
    attribute_observers
        The attribute (feature) observers, indexed by feature.

    Returns
    -------
    votes
        A dictionary with the votes per class, normalized to sum to 1.

    Notes
    -----
    This method is not intended to be used as a stand-alone method.
    """
    total_weight_sum = sum(observed_class_distribution.values())
    if not observed_class_distribution or total_weight_sum == 0:
        # No observed class distribution: there is nothing to predict from
        return None

    votes = {}
    for class_index, class_weight_sum in observed_class_distribution.items():
        # Prior
        votes[class_index] = (
            math.log(class_weight_sum / total_weight_sum) if class_weight_sum > 0 else 0.0
        )
        if attribute_observers:
            for att_idx in attribute_observers:
                if att_idx not in x:
                    continue
                obs = attribute_observers[att_idx]
                # Add the log-likelihood to the prior
                tmp = obs.probability_of_attribute_value_given_class(x[att_idx], class_index)
                votes[class_index] += math.log(tmp) if tmp > 0 else 0.0
        # Revert the log-likelihood back to linear space
        votes[class_index] = math.exp(votes[class_index])

    return normalize_values_in_dict(votes)
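# Usage sketch for `do_naive_bayes_prediction` with a toy, hand-rolled observer.
# The real attribute observers/splitters estimate these conditional
# probabilities from the stream; this stub is hypothetical.
class _DummyObserver:
    def __init__(self, table):
        self.table = table  # {(value, class): P(value | class)}

    def probability_of_attribute_value_given_class(self, value, class_idx):
        return self.table.get((value, class_idx), 0.0)

class_dist = {"spam": 40.0, "ham": 60.0}  # observed class weights
observers = {"has_link": _DummyObserver({(True, "spam"): 0.9, (True, "ham"): 0.2})}
votes = do_naive_bayes_prediction({"has_link": True}, class_dist, observers)
# Posterior is proportional to prior * likelihood:
# spam: 0.4 * 0.9 = 0.36 and ham: 0.6 * 0.2 = 0.12,
# which normalizes to {'spam': 0.75, 'ham': 0.25}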
def predict_proba_one(self, x):
    proba = {c: 0.0 for c in self.classes}
    if self._tree_root is not None:
        found_nodes = self._filter_instance_to_leaves(x, None, -1)
        for fn in found_nodes:
            # parent_branch == -999 means that the node is the root of an alternate tree.
            # In other words, the alternate tree is a single leaf. It is probably not
            # accurate enough to be used to predict, so skip it
            if fn.parent_branch != -999:
                leaf_node = fn.node
                if leaf_node is None:
                    leaf_node = fn.parent
                dist = leaf_node.leaf_prediction(x, tree=self)
                # Option Tree prediction (of sorts): combine the response of all leaves
                # reached by the instance
                proba = add_dict_values(proba, dist, inplace=True)
        proba = normalize_values_in_dict(proba)
    return proba
def draw(self, max_depth: int = None):
    """Draw the tree using the `graphviz` library.

    Since the tree is drawn without passing incoming samples, classification trees
    will show the majority class in their leaves, whereas regression trees will
    use the target mean.

    Parameters
    ----------
    max_depth
        Only the root will be drawn when set to `0`. Every node will be drawn when
        set to `None`.

    Notes
    -----
    Currently, Label Combination Hoeffding Tree Classifier (for multi-label
    classification) is not supported.

    Examples
    --------
    >>> from river import datasets
    >>> from river import tree

    >>> model = tree.HoeffdingTreeClassifier(
    ...    grace_period=5,
    ...    split_confidence=1e-5,
    ...    split_criterion='gini',
    ...    max_depth=10,
    ...    tie_threshold=0.05,
    ... )

    >>> for x, y in datasets.Phishing():
    ...    model = model.learn_one(x, y)

    >>> dot = model.draw()

    .. image:: ../../docs/img/dtree_draw.svg
        :align: center
    """
    def node_prediction(node):
        if isinstance(self, base.Classifier):
            pred = node.stats
            text = str(max(pred, key=pred.get))
            sum_votes = sum(pred.values())
            if sum_votes > 0:
                pred = normalize_values_in_dict(pred, factor=sum_votes, inplace=False)
                probas = '\n'.join(
                    [f'P({c}) = {round_sig_fig(proba)}' for c, proba in pred.items()]
                )
                text = f'{text}\n{probas}'
            return text
        elif isinstance(self, base.Regressor):
            if isinstance(self, base.MultiOutputMixin):  # Multi-target regression
                return ' | '.join(
                    [f'{t} = {round_sig_fig(s.mean.get())}' for t, s in node.stats.items()]
                )
            else:  # Vanilla single-target regression
                pred = node.stats.mean.get()
                return f'{round_sig_fig(pred)}'

    if max_depth is None:
        max_depth = math.inf

    dot = graphviz.Digraph(
        graph_attr={'splines': 'ortho', 'forcelabels': 'true', 'overlap': 'false'},
        node_attr={
            'shape': 'box', 'penwidth': '1.2', 'fontname': 'trebuchet',
            'fontsize': '11', 'margin': '0.1,0.0'
        },
        edge_attr={'penwidth': '0.6', 'center': 'true', 'fontsize': '7'}
    )

    if isinstance(self, base.Classifier):
        n_colors = len(self.classes)  # noqa
    else:
        n_colors = 1

    # Pick a color palette which maps classes to colors
    new_color = functools.partial(next, iter(_color_brew(n_colors)))
    palette = collections.defaultdict(new_color)

    for parent_no, child_no, parent, child, branch_id in self._tree_root.iter_edges():

        if child.depth > max_depth:
            continue

        if not child.is_leaf():
            text = f'{child.split_test.attrs_test_depends_on()[0]}'
            if child.depth == max_depth:
                text = f'{text}\n{node_prediction(child)}'
        else:
            text = f'{node_prediction(child)}\nsamples: {int(child.total_weight)}'

        # Pick a color: the hue depends on the class and the transparency on the
        # distribution
        if isinstance(self, base.Classifier):
            class_proba = {c: 0 for c in self.classes}  # noqa
            class_proba.update(normalize_values_in_dict(child.stats, inplace=False))
            mode = max(class_proba, key=class_proba.get)
            p_mode = class_proba[mode]
            try:
                alpha = (p_mode - 1 / n_colors) / (1 - 1 / n_colors)
                fillcolor = str(transparency_hex(color=palette[mode], alpha=alpha))
            except ZeroDivisionError:
                fillcolor = '#FFFFFF'
        else:
            fillcolor = '#FFFFFF'

        dot.node(f'{child_no}', text, fillcolor=fillcolor, style='filled')

        if parent_no is not None:
            dot.edge(
                f'{parent_no}',
                f'{child_no}',
                xlabel=parent.split_test.describe_condition_for_branch(branch_id, shorten=True)
            )

    return dot
def debug_one(self, x: dict) -> typing.Union[str, None]:
    """Print an explanation of how `x` is predicted.

    Parameters
    ----------
    x
        A dictionary of features.

    Returns
    -------
    A representation of the path followed by the tree to predict `x`;
    `None` if the tree is empty.
    """
    if self._tree_root is None:
        return

    # We'll redirect all the print statements to a buffer and return the
    # buffer's content at the end
    buffer = io.StringIO()
    _print = functools.partial(print, file=buffer)

    for node in self._tree_root.path(x):
        if node.is_leaf():
            pred = node.leaf_prediction(x, tree=self)
            if isinstance(self, base.Classifier):
                class_val = max(pred, key=pred.get)
                _print(f'Class {class_val} | {pred}')
            else:
                if isinstance(self, base.MultiOutputMixin):  # Multi-target regression
                    _print('Predictions:\n{')
                    for t in pred:
                        _print(f'\t{t}: {pred[t]} | {node.stats[t].mean} | {node.stats[t]}')
                    _print('}')
                else:  # Single-target regression
                    _print(f'Prediction {pred} | {node.stats.mean} | {node.stats}')
            break
        else:
            child_index = node.split_test.branch_for_instance(x)

            if child_index >= 0:
                _print(node.split_test.describe_condition_for_branch(child_index))
            else:  # Corner case where an emerging nominal feature value arrives
                _print('Decision node reached as final destination')
                pred = node.stats
                if isinstance(self, base.Classifier):
                    class_val = max(pred, key=pred.get)
                    pred = normalize_values_in_dict(pred, inplace=False)
                    _print(f'Class {class_val} | {pred}')
                else:
                    if isinstance(self, base.MultiOutputMixin):  # Multi-target regression
                        _print('Predictions:\n{')
                        for t in pred:
                            _print(f'\t{t}: {pred[t].mean.get()} | {pred[t]}')
                        _print('}')
                    else:  # Single-target regression
                        _print(f'Prediction {pred} | {node.stats.mean} | {node.stats}')

    return buffer.getvalue()
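# Usage sketch for `debug_one`, mirroring the docstring example used by `draw`
# (the dataset choice, sample count, and hyperparameters are illustrative):
import itertools
from river import datasets, tree

model = tree.HoeffdingTreeClassifier(grace_period=5)
for x, y in itertools.islice(datasets.Phishing(), 200):
    model = model.learn_one(x, y)

x, _ = next(iter(datasets.Phishing()))
print(model.debug_one(x))  # one line per branch condition, then the leaf's prediction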
def draw(self, max_depth: int = None):
    """Draw the tree using the `graphviz` library.

    Since the tree is drawn without passing incoming samples, classification trees
    will show the majority class in their leaves, whereas regression trees will
    use the target mean.

    Parameters
    ----------
    max_depth
        Only the root will be drawn when set to `0`. Every node will be drawn when
        set to `None`.

    Notes
    -----
    Currently, Label Combination Hoeffding Tree Classifier (for multi-label
    classification) is not supported.

    Examples
    --------
    >>> from river import datasets
    >>> from river import tree

    >>> model = tree.HoeffdingTreeClassifier(
    ...    grace_period=5,
    ...    split_confidence=1e-5,
    ...    split_criterion='gini',
    ...    max_depth=10,
    ...    tie_threshold=0.05,
    ... )

    >>> for x, y in datasets.Phishing():
    ...    model = model.learn_one(x, y)

    >>> dot = model.draw()

    .. image:: ../../docs/img/dtree_draw.svg
        :align: center
    """
    counter = 0

    def iterate(node=None):
        nonlocal counter

        if node is None:
            yield None, None, self._root, 0, None
            yield from iterate(self._root)

        parent_no = counter
        if isinstance(node, HTBranch):
            for branch_index, child in enumerate(node.children):
                counter += 1
                yield parent_no, node, child, counter, branch_index
                if isinstance(child, HTBranch):
                    yield from iterate(child)

    if max_depth is None:
        max_depth = math.inf

    dot = graphviz.Digraph(
        graph_attr={"splines": "ortho", "forcelabels": "true", "overlap": "false"},
        node_attr={
            "shape": "box",
            "penwidth": "1.2",
            "fontname": "trebuchet",
            "fontsize": "11",
            "margin": "0.1,0.0",
        },
        edge_attr={"penwidth": "0.6", "center": "true", "fontsize": "7"},
    )

    if isinstance(self, base.Classifier):
        n_colors = len(self.classes)  # noqa
    else:
        n_colors = 1

    # Pick a color palette which maps classes to colors
    new_color = functools.partial(next, iter(_color_brew(n_colors)))
    palette = collections.defaultdict(new_color)

    for parent_no, parent, child, child_no, branch_index in iterate():
        if child.depth > max_depth:
            continue

        if isinstance(child, HTBranch):
            text = f"{child.feature}"  # noqa
        else:
            text = f"{repr(child)}\nsamples: {int(child.total_weight)}"

        # Pick a color: the hue depends on the class and the transparency on the
        # distribution
        if isinstance(self, base.Classifier):
            class_proba = normalize_values_in_dict(child.stats, inplace=False)
            mode = max(class_proba, key=class_proba.get)
            p_mode = class_proba[mode]
            try:
                alpha = (p_mode - 1 / n_colors) / (1 - 1 / n_colors)
                fillcolor = str(transparency_hex(color=palette[mode], alpha=alpha))
            except ZeroDivisionError:
                fillcolor = "#FFFFFF"
        else:
            fillcolor = "#FFFFFF"

        dot.node(f"{child_no}", text, fillcolor=fillcolor, style="filled")

        if parent_no is not None:
            dot.edge(
                f"{parent_no}",
                f"{child_no}",
                xlabel=parent.repr_split(branch_index, shorten=True),
            )

    return dot
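# Both `draw` variants return a `graphviz.Digraph`, so the diagram can be
# written to disk. A usage sketch (the model fitting mirrors the docstring
# example; the output file name is illustrative):
from river import datasets, tree

model = tree.HoeffdingTreeClassifier(grace_period=5, max_depth=10)
for x, y in datasets.Phishing():
    model = model.learn_one(x, y)

dot = model.draw(max_depth=3)
dot.format = "svg"
dot.render("dtree_draw")  # writes dtree_draw.svg alongside the DOT source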
def prediction(self, x, *, tree=None):
    return normalize_values_in_dict(self.stats, inplace=False)
def leaf_prediction(self, x, *, tree=None):
    # In case split nodes end up being used for prediction (e.g., when an emerging
    # categorical feature value appears), fall back to the majority-class (MC) strategy
    return normalize_values_in_dict(self.stats, inplace=False)