コード例 #1
0
ファイル: hatc_nodes.py プロジェクト: ogozuacik/river
    def leaf_prediction(self, x, *, tree=None):
        """Return this leaf's class votes, weighted by its estimated error.

        Parameters
        ----------
        x
            The instance (feature dict) to predict.
        tree
            The tree owning this node; supplies the prediction strategy
            (`leaf_prediction`), the strategy constants and `nb_threshold`.

        Returns
        -------
        dict or None
            The weighted class votes, or `None` when the leaf has no statistics.
        """
        if not self.stats:
            return

        strategy = tree.leaf_prediction
        if not self.is_active() or strategy == tree._MAJORITY_CLASS:
            votes = normalize_values_in_dict(self.stats, inplace=False)
        elif strategy != tree._NAIVE_BAYES:
            # Any remaining strategy (Naive Bayes Adaptive) is delegated to the parent class
            votes = super().leaf_prediction(x, tree=tree)
        elif self.total_weight >= tree.nb_threshold:
            votes = do_naive_bayes_prediction(x, self.stats, self.splitters)
        else:
            # Not enough weight observed for Naive Bayes yet: use the majority class
            votes = normalize_values_in_dict(self.stats, inplace=False)

        # Divide the votes by (their sum * squared error estimate). The original comment
        # explains the intent: when the alternate tree and the main tree are combined,
        # the response of the most accurate one (as monitored by ADWIN) is preferred.
        error_weighted_sum = sum(votes.values()) * self.error_estimation * self.error_estimation
        return normalize_values_in_dict(votes, error_weighted_sum, inplace=False)
コード例 #2
0
ファイル: _base_tree.py プロジェクト: puzzlebird/river
 def node_prediction(node):
     """Build the text label describing the prediction stored in `node`.

     Classifiers get the majority class followed (when votes exist) by one
     `P(class) = value` line per class; regressors get the target mean(s).
     Relies on `self` from the enclosing scope to tell the model type.
     """
     if isinstance(self, base.Classifier):
         votes = node.stats
         label = str(max(votes, key=votes.get))
         total = sum(votes.values())
         if not (total > 0):
             # Nothing to normalise: only the majority class is reported
             return label
         scaled = normalize_values_in_dict(votes, factor=total, inplace=False)
         proba_lines = '\n'.join(
             f'P({c}) = {round_sig_fig(proba)}' for c, proba in scaled.items()
         )
         return f'{label}\n{proba_lines}'

     if isinstance(self, base.Regressor):
         if not isinstance(self, base.MultiOutputMixin):
             # Vanilla single-target regression
             return f'{round_sig_fig(node.stats.mean.get())}'
         # Multi-target regression: one "target = mean" entry per target
         return ' | '.join(
             f'{t} = {round_sig_fig(s.mean.get())}' for t, s in node.stats.items()
         )
コード例 #3
0
    def predict_proba_one(self, x):
        """Combine the members' class probabilities into one weighted vote.

        Each member's (normalised) prediction is scaled by a weight derived from
        its `correct_weight` / `wrong_weight` counters, accumulated in a
        `Counter`, and the accumulated votes are normalised before returning.
        """
        combined = collections.Counter()

        for idx, member in enumerate(self):
            # Smoothed error rate; the 1e-16 terms avoid a division by zero
            error_rate = self.wrong_weight[idx] + 1e-16
            error_rate /= (self.correct_weight[idx] + self.wrong_weight[idx]) + 1e-16

            vote_weight = 1.0
            if not (error_rate == 0 or error_rate > 0.5):
                odds = (1 - error_rate) / error_rate
                vote_weight = math.log(odds) if odds != 0 else 0

            member_proba = member.predict_proba_one(x)
            # NOTE: both helpers work in place on the dict returned by the member
            normalize_values_in_dict(member_proba, inplace=True)
            scale_values_in_dict(member_proba, vote_weight, inplace=True)
            combined.update(member_proba)

        normalize_values_in_dict(combined, inplace=True)
        return combined
コード例 #4
0
ファイル: htc_nodes.py プロジェクト: venuraja79/river
    def __repr__(self):
        """Describe the node as its majority class plus one probability line per class.

        Returns an empty string when the node holds no statistics.
        """
        if not self.stats:
            return ""

        header = f"Class {max(self.stats, key=self.stats.get)}:"
        # Classes are listed in sorted order of (label, probability) pairs
        ordered = sorted(normalize_values_in_dict(self.stats, inplace=False).items())
        rows = [f"\n\tP({label}) = {round_sig_fig(proba)}" for label, proba in ordered]
        return header + "".join(rows)
コード例 #5
0
    def predict_proba_one(self, x):
        """Return a probability for every known class for the instance `x`.

        Every class in `self.classes` starts at 0.0; when a tree exists, the
        entries are overwritten with the prediction of the node reached by `x`.
        """
        proba = dict.fromkeys(self.classes, 0.0)
        if self._tree_root is None:
            return proba

        located = self._tree_root.filter_instance_to_leaf(x, None, -1)
        node = located.node
        if node is None:
            # No node at the end of the path: fall back to its parent
            node = located.parent

        if node.is_leaf():
            update = node.leaf_prediction(x, tree=self)
        else:
            # Corner case where a decision node is reached
            update = normalize_values_in_dict(node.stats, inplace=False)

        proba.update(update)
        return proba
コード例 #6
0
    def predict_proba_one(self, x):
        """Return normalised class probabilities for the instance `x`.

        With no tree, every class in `self.classes` gets 0.0. Otherwise the
        predictions of all nodes reached by `x` are summed and normalised.
        """
        scores = dict.fromkeys(self.classes, 0.0)
        root = self._root
        if root is None:
            return scores

        if isinstance(root, DTBranch):
            reached = root.traverse(x, until_leaf=True)
        else:
            # The root is the only node in the tree
            reached = [root]

        # Option Tree prediction (of sorts): combine the response of all leaves
        # reached by the instance
        for leaf in reached:
            scores = add_dict_values(scores, leaf.prediction(x, tree=self), inplace=True)

        return normalize_values_in_dict(scores)
コード例 #7
0
    def predict_proba_one(self, x):
        """Return per-label `{False: p, True: p}` probabilities for the instance `x`.

        The parent class predicts over encoded classes; the most probable encoded
        class is mapped back through `self._r_label_map` to (label, value) pairs.
        Returns `None` when the tree is empty.
        """
        if self._tree_root is None:
            return None

        encoded = super().predict_proba_one(x)
        winner = max(encoded, key=encoded.get)
        winner_proba = encoded[winner]

        result = {lbl: {False: 0.0, True: 0.0} for lbl in self._labels}

        for label_id, label_val in self._r_label_map[winner]:
            per_label = result[label_id]
            per_label[label_val] = winner_proba
            result[label_id] = normalize_values_in_dict(per_label)

        return result
コード例 #8
0
ファイル: _tree_utils.py プロジェクト: renatacgcastanha/river
def do_naive_bayes_prediction(x, observed_class_distribution: dict,
                              attribute_observers: dict):
    """Perform Naive Bayes prediction.

    The class scores are accumulated and normalised in log space (log-sum-exp),
    so that instances with many attributes do not underflow to an all-zero
    result when the joint likelihood is smaller than the smallest float.

    Parameters
    ----------
    x
        The feature values.

    observed_class_distribution
        Observed class distribution (class -> accumulated weight).

    attribute_observers
        Attribute (features) observers, indexed by feature. Each observer must
        expose `probability_of_attribute_value_given_class(value, class)`.

    Returns
    -------
    votes
        dict mapping each observed class to its normalised probability, or
        `None` when no class weight has been observed.

    Notes
    -----
    This method is not intended to be used as a stand-alone method.
    """
    total_weight_sum = sum(observed_class_distribution.values())
    if not observed_class_distribution or total_weight_sum == 0:
        # No observed class distributions, all classes equal
        return None

    log_votes = {}
    for class_index, class_weight_sum in observed_class_distribution.items():
        # Prior.
        # NOTE(review): a non-positive class weight contributes a log-prior of 0.0
        # (as in the previous implementation) -- confirm this is intended.
        log_vote = (math.log(class_weight_sum / total_weight_sum)
                    if class_weight_sum > 0 else 0.0)
        if attribute_observers:
            for att_idx in attribute_observers:
                if att_idx not in x:
                    # Features missing from the instance are ignored
                    continue
                obs = attribute_observers[att_idx]
                # Prior plus the log likelihood; non-positive likelihoods are skipped
                tmp = obs.probability_of_attribute_value_given_class(
                    x[att_idx], class_index)
                if tmp > 0:
                    log_vote += math.log(tmp)
        log_votes[class_index] = log_vote

    # Revert the log likelihood with the log-sum-exp trick: shifting by the largest
    # score keeps at least one exponent at exp(0) == 1, so the normaliser is never 0.
    max_log_vote = max(log_votes.values())
    log_normalizer = max_log_vote + math.log(
        sum(math.exp(log_vote - max_log_vote) for log_vote in log_votes.values()))

    return {
        class_index: math.exp(log_vote - log_normalizer)
        for class_index, log_vote in log_votes.items()
    }
コード例 #9
0
    def predict_proba_one(self, x):
        """Return normalised class probabilities for the instance `x`.

        With no tree, every class in `self.classes` gets 0.0. Otherwise the
        predictions of all leaves reached by `x` are summed and normalised.
        """
        proba = dict.fromkeys(self.classes, 0.0)
        if self._tree_root is None:
            return proba

        for found in self._filter_instance_to_leaves(x, None, -1):
            # parent_branch == -999 means that the node is the root of an alternate
            # tree, i.e. the alternate tree is a single leaf. It is probably not
            # accurate enough to be used to predict, so skip it
            if found.parent_branch == -999:
                continue

            leaf = found.node
            if leaf is None:
                leaf = found.parent

            # Option Tree prediction (of sorts): combine the response of all leaves
            # reached by the instance
            proba = add_dict_values(proba, leaf.leaf_prediction(x, tree=self), inplace=True)

        return normalize_values_in_dict(proba)
コード例 #10
0
ファイル: _base_tree.py プロジェクト: puzzlebird/river
    def draw(self, max_depth: int = None):
        """Build a `graphviz` representation of the tree.

        No sample is passed through the tree while drawing, so classification
        trees display the majority class in their leaves and regression trees
        display the target mean.

        Parameters
        ----------
        max_depth
            Only the root will be drawn when set to `0`. Every node will be drawn when
            set to `None`.

        Returns
        -------
        The `graphviz.Digraph` holding one node per drawn tree node.

        Notes
        -----
        Currently, Label Combination Hoeffding Tree Classifier (for multi-label
        classification) is not supported.

        Examples
        --------
        >>> from river import datasets
        >>> from river import tree
        >>> model = tree.HoeffdingTreeClassifier(
        ...    grace_period=5,
        ...    split_confidence=1e-5,
        ...    split_criterion='gini',
        ...    max_depth=10,
        ...    tie_threshold=0.05,
        ... )
        >>> for x, y in datasets.Phishing():
        ...    model = model.learn_one(x, y)
        >>> dot = model.draw()

        .. image:: ../../docs/img/dtree_draw.svg
            :align: center
        """
        def describe(node):
            # Text summarising the prediction stored in `node`
            if isinstance(self, base.Classifier):
                votes = node.stats
                label = str(max(votes, key=votes.get))
                total = sum(votes.values())
                if not (total > 0):
                    return label
                scaled = normalize_values_in_dict(votes,
                                                  factor=total,
                                                  inplace=False)
                proba_lines = '\n'.join(
                    f'P({c}) = {round_sig_fig(proba)}'
                    for c, proba in scaled.items())
                return f'{label}\n{proba_lines}'

            if isinstance(self, base.Regressor):
                if not isinstance(self, base.MultiOutputMixin):
                    # Vanilla single-target regression
                    return f'{round_sig_fig(node.stats.mean.get())}'
                # Multi-target regression
                return ' | '.join(
                    f'{t} = {round_sig_fig(s.mean.get())}'
                    for t, s in node.stats.items())

        depth_limit = math.inf if max_depth is None else max_depth

        graph_style = {
            'splines': 'ortho',
            'forcelabels': 'true',
            'overlap': 'false'
        }
        node_style = {
            'shape': 'box',
            'penwidth': '1.2',
            'fontname': 'trebuchet',
            'fontsize': '11',
            'margin': '0.1,0.0'
        }
        edge_style = {
            'penwidth': '0.6',
            'center': 'true',
            'fontsize': '7  '
        }
        dot = graphviz.Digraph(graph_attr=graph_style,
                               node_attr=node_style,
                               edge_attr=edge_style)

        n_colors = len(self.classes) if isinstance(self, base.Classifier) else 1  # noqa

        # Each class seen for the first time takes the next color of the palette
        color_source = iter(_color_brew(n_colors))
        palette = collections.defaultdict(functools.partial(next, color_source))

        for parent_no, child_no, parent, child, branch_id in self._tree_root.iter_edges():
            if child.depth > depth_limit:
                continue

            if child.is_leaf():
                label = f'{describe(child)}\nsamples: {int(child.total_weight)}'
            else:
                label = f'{child.split_test.attrs_test_depends_on()[0]}'
                if child.depth == depth_limit:
                    # Deepest drawn level: also show what the split node would predict
                    label = f'{label}\n{describe(child)}'

            # The hue depends on the class and the transparency on the distribution;
            # white is used for regressors and when the transparency is undefined
            fill = '#FFFFFF'
            if isinstance(self, base.Classifier):
                class_proba = {c: 0 for c in self.classes}  # noqa
                class_proba.update(
                    normalize_values_in_dict(child.stats, inplace=False))
                mode = max(class_proba, key=class_proba.get)
                p_mode = class_proba[mode]
                try:
                    alpha = (p_mode - 1 / n_colors) / (1 - 1 / n_colors)
                    fill = str(
                        transparency_hex(color=palette[mode], alpha=alpha))
                except ZeroDivisionError:
                    fill = '#FFFFFF'

            dot.node(f'{child_no}', label, fillcolor=fill, style='filled')

            if parent_no is not None:
                condition = parent.split_test.describe_condition_for_branch(
                    branch_id, shorten=True)
                dot.edge(f'{parent_no}', f'{child_no}', xlabel=condition)

        return dot
コード例 #11
0
ファイル: _base_tree.py プロジェクト: puzzlebird/river
    def debug_one(self, x: dict) -> typing.Union[str, None]:
        """Explain, as text, how the tree reaches its prediction for `x`.

        Parameters
        ----------
        x
            A dictionary of features.

        Returns
        -------
            A representation of the path followed by the tree to predict `x`; `None` if
            the tree is empty.
        """
        if self._tree_root is None:
            return

        # Everything is written to an in-memory buffer whose content is returned at the end
        buffer = io.StringIO()

        def emit(text):
            print(text, file=buffer)

        for node in self._tree_root.path(x):
            if not node.is_leaf():
                branch = node.split_test.branch_for_instance(x)
                if branch >= 0:
                    emit(node.split_test.describe_condition_for_branch(branch))
                    continue

                # Corner case where an emerging nominal feature value arrives
                emit('Decision node reached as final destination')
                pred = node.stats
                if isinstance(self, base.Classifier):
                    class_val = max(pred, key=pred.get)
                    pred = normalize_values_in_dict(pred, inplace=False)
                    emit(f'Class {class_val} | {pred}')
                elif isinstance(self, base.MultiOutputMixin):
                    # Multi-target regression case
                    emit('Predictions:\n{')
                    for t in pred:
                        emit(f'\t{t}: {pred[t].mean.get()} | {pred[t]}')
                    emit('}')
                else:  # Single-target regression
                    emit(f'Prediction {pred} | {node.stats.mean} | {node.stats}')
                continue

            pred = node.leaf_prediction(x, tree=self)
            if isinstance(self, base.Classifier):
                class_val = max(pred, key=pred.get)
                emit(f'Class {class_val} | {pred}')
            elif isinstance(self, base.MultiOutputMixin):
                # Multi-target regression case
                emit('Predictions:\n{')
                for t, value in pred.items():
                    emit(f'\t{t}: {value} | {node.stats[t].mean} | {node.stats[t]}')
                emit('}')
            else:  # Single-target regression
                emit(f'Prediction {pred} | {node.stats.mean} | {node.stats}')
            # A leaf ends the path
            break

        return buffer.getvalue()
コード例 #12
0
    def draw(self, max_depth: int = None):
        """Draw the tree using the `graphviz` library.

        Since the tree is drawn without passing incoming samples, classification trees
        will show the majority class in their leaves, whereas regression trees will
        use the target mean.

        Parameters
        ----------
        max_depth
            Only the root will be drawn when set to `0`. Every node will be drawn when
            set to `None`.

        Returns
        -------
        The `graphviz.Digraph` built from the tree.

        Notes
        -----
        Currently, Label Combination Hoeffding Tree Classifier (for multi-label
        classification) is not supported.

        Examples
        --------
        >>> from river import datasets
        >>> from river import tree
        >>> model = tree.HoeffdingTreeClassifier(
        ...    grace_period=5,
        ...    split_confidence=1e-5,
        ...    split_criterion='gini',
        ...    max_depth=10,
        ...    tie_threshold=0.05,
        ... )
        >>> for x, y in datasets.Phishing():
        ...    model = model.learn_one(x, y)
        >>> dot = model.draw()

        .. image:: ../../docs/img/dtree_draw.svg
            :align: center
        """
        # Number given to the most recently yielded node; shared by all `iterate` calls
        counter = 0

        def iterate(node=None):
            # Yields (parent_no, parent, child, child_no, branch_index) tuples.
            # Called without argument, it first yields the root (numbered 0, without
            # parent) and then walks the tree starting at the root.
            if node is None:
                yield None, None, self._root, 0, None
                yield from iterate(self._root)
                # Execution falls through here; `node` is None, so the branch test
                # below is False and nothing else is yielded.

            nonlocal counter
            # `counter` currently holds the number assigned to `node`
            parent_no = counter

            if isinstance(node, HTBranch):
                for branch_index, child in enumerate(node.children):
                    # Children are numbered in the order they are yielded (depth-first)
                    counter += 1
                    yield parent_no, node, child, counter, branch_index
                    if isinstance(child, HTBranch):
                        yield from iterate(child)

        if max_depth is None:
            # No limit: every depth compares below infinity
            max_depth = math.inf

        dot = graphviz.Digraph(
            graph_attr={
                "splines": "ortho",
                "forcelabels": "true",
                "overlap": "false"
            },
            node_attr={
                "shape": "box",
                "penwidth": "1.2",
                "fontname": "trebuchet",
                "fontsize": "11",
                "margin": "0.1,0.0",
            },
            edge_attr={
                "penwidth": "0.6",
                "center": "true",
                "fontsize": "7  "
            },
        )

        if isinstance(self, base.Classifier):
            n_colors = len(self.classes)  # noqa
        else:
            n_colors = 1

        # Pick a color palette which maps classes to colors: each class looked up for
        # the first time takes the next color produced by `_color_brew`
        new_color = functools.partial(next, iter(_color_brew(n_colors)))
        palette = collections.defaultdict(new_color)

        for parent_no, parent, child, child_no, branch_index in iterate():
            # Nodes deeper than the limit are not drawn (they are still iterated over)
            if child.depth > max_depth:
                continue

            if isinstance(child, HTBranch):
                # Branch nodes are labelled with the feature they hold
                text = f"{child.feature}"  # noqa
            else:
                text = f"{repr(child)}\nsamples: {int(child.total_weight)}"

            # Pick a color, the hue depends on the class and the transparency on the distribution
            if isinstance(self, base.Classifier):
                class_proba = normalize_values_in_dict(child.stats,
                                                       inplace=False)
                mode = max(class_proba, key=class_proba.get)
                p_mode = class_proba[mode]
                try:
                    # Rescales p_mode so that the uniform value 1 / n_colors maps to 0
                    # and 1 maps to 1
                    alpha = (p_mode - 1 / n_colors) / (1 - 1 / n_colors)
                    fillcolor = str(
                        transparency_hex(color=palette[mode], alpha=alpha))
                except ZeroDivisionError:
                    # Raised by the arithmetic above when n_colors is 0 or 1
                    fillcolor = "#FFFFFF"
            else:
                fillcolor = "#FFFFFF"

            dot.node(f"{child_no}", text, fillcolor=fillcolor, style="filled")

            # The root is yielded with parent_no None and gets no incoming edge
            if parent_no is not None:
                dot.edge(
                    f"{parent_no}",
                    f"{child_no}",
                    xlabel=parent.repr_split(branch_index, shorten=True),
                )

        return dot
コード例 #13
0
ファイル: efdtc_nodes.py プロジェクト: Leo-VK/creme
 def prediction(self, x, *, tree=None):
     """Return a normalised copy of the node's class statistics.

     `x` and `tree` are accepted for interface compatibility and are not used.
     """
     class_votes = self.stats
     return normalize_values_in_dict(class_votes, inplace=False)
コード例 #14
0
 def leaf_prediction(self, x, *, tree=None):
     """Return a normalised copy of the node's class statistics.

     Split nodes may end up being used to predict (for instance when an emerging
     categorical feature value appears); in that case the MC (majority class)
     strategy is applied. `x` and `tree` are not used.
     """
     class_votes = self.stats
     return normalize_values_in_dict(class_votes, inplace=False)