Example #1
    def get_tree_objects(self, tree_models, fields, classes):
        """Convert parsed PMML tree models into scikit-learn tree estimators."""

        trees = list()
        for i, tree_model in enumerate(tree_models):
            # A nested list of tree models becomes a list of regressors.
            if isinstance(tree_model, list):
                tree_inner = list()
                for tree_mod in tree_model:
                    main_node = tree_mod.get_Node()
                    all_node = main_node.get_Node()
                    if len(all_node) == 0:
                        continue
                    operator = all_node[0].get_SimplePredicate().get_operator()
                    tt = Tree(fields, [1], operator)
                    tt.get_node_info(all_node)
                    tt.build_tree()
                    # Build an empty regressor shell; both attribute spellings
                    # are set for compatibility across scikit-learn versions.
                    model = DecisionTreeRegressor()
                    model.n_features = len(fields)
                    model.n_features_ = len(fields)
                    model.n_outputs_ = 1
                    model.n_outputs = 1
                    model.classes_ = np.array(classes)
                    model.tree_ = tt
                    tree_inner.append(model)
                trees.append(tree_inner)
            else:
                main_node = tree_model.get_Node()
                all_node = main_node.get_Node()
                if len(all_node) == 0:
                    continue
                operator = all_node[0].get_SimplePredicate().get_operator()
                tt = Tree(fields, classes, operator)
                tt.get_node_info(all_node)
                tt.build_tree()
                # Build an empty classifier shell with the same
                # compatibility attributes.
                model = DecisionTreeClassifier()
                model.n_features = len(fields)
                model.n_features_ = len(fields)
                model.n_outputs_ = 1
                model.n_outputs = 1
                model.classes_ = np.array(classes)
                model._estimator_type = 'classifier' if len(classes) > 0 else 'regressor'
                model.tree_ = tt
                trees.append(model)
        return trees
Example #2
def deserialize_decision_tree_regressor(model_dict):
    deserialized_decision_tree = DecisionTreeRegressor()

    deserialized_decision_tree.max_features_ = model_dict['max_features_']
    deserialized_decision_tree.n_features_ = model_dict['n_features_']
    deserialized_decision_tree.n_outputs_ = model_dict['n_outputs_']

    tree = deserialize_tree(model_dict['tree_'], model_dict['n_features_'], 1,
                            model_dict['n_outputs_'])
    deserialized_decision_tree.tree_ = tree

    return deserialized_decision_tree
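
The deserialize_tree helper is not shown above. As a minimal sketch of what it
has to reconstruct, scikit-learn's private Tree object can be rebuilt from the
state arrays of a fitted tree (this relies on sklearn.tree._tree, a private
API that may change between versions):

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree._tree import Tree

# Fit a tiny tree, capture its raw state, and rebuild it into a fresh shell.
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.0, 0.0, 1.0, 1.0])
fitted = DecisionTreeRegressor().fit(X, y)
state = fitted.tree_.__getstate__()  # max_depth, node_count, nodes, values

rebuilt = Tree(1, np.array([1], dtype=np.intp), 1)  # n_features, n_classes, n_outputs
rebuilt.__setstate__(state)

clone = DecisionTreeRegressor()
clone.max_features_ = fitted.max_features_
clone.n_outputs_ = fitted.n_outputs_
clone.n_features_in_ = 1
clone.tree_ = rebuilt
print(clone.predict(np.array([[0.5], [2.5]])))  # matches fitted.predict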
Example #3
    def __init__(self, pmml):
        PMMLBaseClassifier.__init__(self, pmml)

        mining_model = self.root.find('MiningModel')
        if mining_model is None:
            raise Exception('PMML model does not contain MiningModel.')

        segmentation = mining_model.find('Segmentation')
        if segmentation is None:
            raise Exception('PMML model does not contain Segmentation.')

        if segmentation.get('multipleModelMethod') not in ['modelChain']:
            raise Exception('PMML model ensemble should use modelChain.')

        # Parse segments
        segments = segmentation.findall('Segment')
        valid_segments = [None] * self.n_classes_

        indices = range(self.n_classes_)
        # For binary classification, only the predictions of the first class
        # need to be described; the second class can be inferred from them.
        # Not all PMML models do this, but we assume the following conditions
        # imply this approach.
        if self.n_classes_ == 2 and len(
                segments) == 2 and segments[-1].find('TreeModel') is None:
            indices = [0]

        for i in indices:
            valid_segments[i] = [
                segment for segment in segments[i].find('MiningModel').find(
                    'Segmentation').findall('Segment')
                if segment.find('True') is not None
                and segment.find('TreeModel') is not None
            ]

        n_estimators = len(valid_segments[0])
        GradientBoostingClassifier.__init__(self, n_estimators=n_estimators)

        clf = DecisionTreeRegressor(random_state=123)
        try:
            clf.n_features_in_ = self.n_features_in_
        except AttributeError:
            clf.n_features_ = self.n_features_
        clf.n_outputs_ = self.n_outputs_
        self.template_estimator = clf

        self._check_params()

        if self.n_classes_ == 2 and len(
                segments) == 3 and segments[-1].find('TreeModel') is None:
            # For binary classification where both sides are specified, we need
            # to force multinomial deviance. MultinomialDeviance refuses binary
            # problems, so construct it with an inflated class count and force
            # K back to two afterwards.
            self.loss_ = _gb_losses.MultinomialDeviance(self.n_classes_ + 1)
            self.loss_.K = 2

        try:
            self.init = None
            self._init_state()

            self.init_.class_prior_ = [
                expit(-float(segments[i].find('MiningModel').find(
                    'Targets').find('Target').get('rescaleConstant')))
                for i in indices
            ]

            if self.n_classes_ == 2:
                self.init_.class_prior_ = [
                    self.init_.class_prior_[0], 1 - self.init_.class_prior_[0]
                ]

            self.init_.classes_ = [i for i, _ in enumerate(self.classes_)]
            self.init_.n_classes_ = self.n_classes_
            self.init_.n_outputs_ = 1
            self.init_._strategy = self.init_.strategy
        except AttributeError:
            self.init = 'zero'
            self._init_state()

        for x, y in np.ndindex(self.estimators_.shape):
            try:
                factor = float(segments[y].find('MiningModel').find(
                    'Targets').find('Target').get('rescaleFactor', 1))
                self.estimators_[x, y] = get_tree(self,
                                                  valid_segments[y][x],
                                                  rescale_factor=factor)
            except AttributeError:
                self.estimators_[x, y] = get_tree(self, valid_segments[y][x])

        # Required after constructing trees, because categories may be inferred in
        # the parsing process
        target = self.target_field.get('name')
        fields = [
            field for name, field in self.fields.items() if name != target
        ]
        for x, y in np.ndindex(self.estimators_.shape):
            clf = self.estimators_[x, y]
            n_categories = np.asarray([
                len(self.field_mapping[field.get('name')][1].categories)
                if field.get('optype') == 'categorical' else -1
                for field in fields if field.tag == 'DataField'
            ],
                                      dtype=np.int32,
                                      order='C')
            clf.n_categories = n_categories
            clf.tree_.set_n_categories(n_categories)

        self.categorical = [
            x != -1 for x in self.estimators_[0, 0].n_categories
        ]
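
The binary class prior above follows directly from the first segment's rescale
constant; a small illustration with a made-up constant value:

from scipy.special import expit

# Illustrative value; in the model above it comes from the
# <Target rescaleConstant="..."> attribute of the first segment.
rescale_constant = 0.4054
p0 = expit(-rescale_constant)  # prior probability of the first class
class_prior = [p0, 1 - p0]     # the second class is the complement
print(class_prior)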
Example #4
def digitize2tree(bins, right=False):
    """
    Builds a decision tree which returns the same result as
    `lambda x: numpy.digitize(x, bins, right=right)`
    (see :epkg:`numpy:digitize`).

    :param bins: array of bins. It has to be 1-dimensional and monotonic.
    :param right: Indicating whether the intervals include the right
        or the left bin edge. Default behavior is (right==False)
        indicating that the interval does not include the right edge.
        The left bin end is open in this case, i.e.,
        `bins[i-1] <= x < bins[i]` is the default behavior for
        monotonically increasing bins.
    :return: decision tree

    .. note::
        The implementation of decision trees in :epkg:`scikit-learn`
        only allows one type of decision (`<=`). That's why the
        function throws an exception when `right=False`. However,
        this could be overcome by using :epkg:`ONNX` where all
        kind of decision rules are implemented. Default value for
        right is still *False* to follow *numpy* API even though
        this value raises an exception in *digitize2tree*.

    The following example shows what the tree looks like.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import export_text
        from mlinsights.mltree import digitize2tree

        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        print("Comparison with numpy:")
        print(expected, pred)
        print("Tree:")
        print(export_text(tree, feature_names=['x']))

    See also example :ref:`l-example-digitize`.

    .. versionadded:: 0.4
    """
    if not right:
        raise RuntimeError(
            "right must be True, got right=%r." % right)
    ascending = len(bins) <= 1 or bins[0] < bins[1]

    if not ascending:
        bins2 = bins[::-1]
        cl = digitize2tree(bins2, right=right)
        n = len(bins)
        for i in range(cl.tree_.value.shape[0]):
            cl.tree_.value[i, 0, 0] = n - cl.tree_.value[i, 0, 0]
        return cl

    tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1)
    values = []
    UNUSED = numpy.nan
    n_nodes = []

    def add_root(index):
        if index < 0 or index >= len(bins):
            raise IndexError(  # pragma: no cover
                "Unexpected index %d / len(bins)=%d." % (
                    index, len(bins)))
        parent = -1
        is_left = False
        is_leaf = False
        threshold = bins[index]
        n = tree_add_node(
            tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.)
        values.append(UNUSED)
        n_nodes.append(n)
        return n

    def add_nodes(parent, i, j, is_left):
        # add for bins[i:j] (j excluded)
        if is_left:
            # it means j is the parent split
            if i == j:
                # leaf
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.)
                n_nodes.append(n)
                values.append(i)
                return n
            if i + 1 == j:
                # split
                values.append(UNUSED)
                th = bins[i]
                n = tree_add_node(tree, parent, is_left,
                                  False, 0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, i, True)
                add_nodes(n, i, j, False)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left,
                                  False, 0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        else:
            # it means i is the parent split
            if i + 1 == j:
                # leaf
                values.append(j)
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.)
                n_nodes.append(n)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left,
                                  False, 0, th, 0, 1, 1.)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        raise NotImplementedError(  # pragma: no cover
            "Unexpected case where i=%r, j=%r, is_left=%r." % (
                i, j, is_left))

    index = len(bins) // 2
    add_root(index)
    add_nodes(0, 0, index, True)
    add_nodes(0, index, len(bins), False)

    cl = DecisionTreeRegressor()
    cl.tree_ = tree
    cl.tree_.value[:, 0, 0] = numpy.array(  # pylint: disable=E1137
        values, dtype=numpy.float64)
    cl.n_outputs = 1
    cl.n_outputs_ = 1
    try:
        # scikit-learn >= 0.24
        cl.n_features_in_ = 1
    except AttributeError:
        # scikit-learn < 0.24
        cl.n_features_ = 1
    try:
        # for scikit-learn<=0.23.2
        cl.n_features_ = 1
    except AttributeError:
        pass
    return cl
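
The descending-bins branch above mirrors numpy.digitize's convention for
monotonically decreasing bins. A sketch analogous to the docstring example,
assuming mlinsights is installed:

import numpy
from mlinsights.mltree import digitize2tree

x = numpy.array([0.2, 6.4, 3.0, 1.6])
bins = numpy.array([7.0, 4.0, 2.5, 1.0, 0.0])  # monotonically decreasing
expected = numpy.digitize(x, bins, right=True)
tree = digitize2tree(bins, right=True)
pred = tree.predict(x.reshape((-1, 1)))
print(expected, pred)  # identical results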
Example #5
    def __init__(self, pmml):
        PMMLBaseRegressor.__init__(self, pmml)

        mining_model = self.root.find('MiningModel')
        if mining_model is None:
            raise Exception('PMML model does not contain MiningModel.')

        segmentation = mining_model.find('Segmentation')
        if segmentation is None:
            raise Exception('PMML model does not contain Segmentation.')

        if segmentation.get('multipleModelMethod') not in ['sum']:
            raise Exception('PMML model ensemble should use sum.')

        # Parse segments
        segments = segmentation.findall('Segment')
        valid_segments = [
            segment for segment in segments if segment.find('True') is not None
            and segment.find('TreeModel') is not None
        ]

        n_estimators = len(valid_segments)
        self.n_outputs_ = 1
        GradientBoostingRegressor.__init__(self, n_estimators=n_estimators)

        clf = DecisionTreeRegressor(random_state=123)
        try:
            clf.n_features_in_ = self.n_features_in_
        except AttributeError:
            clf.n_features_ = self.n_features_
        clf.n_outputs_ = self.n_outputs_
        self.template_estimator = clf

        self._check_params()
        self._init_state()

        # Cast to float: the XML attribute is a string when present.
        mean = float(mining_model.find('Targets').find('Target').get(
            'rescaleConstant', 0))
        self.init_.constant_ = np.array([mean])
        self.init_.n_outputs_ = 1

        for x, y in np.ndindex(self.estimators_.shape):
            factor = float(
                mining_model.find('Targets').find('Target').get(
                    'rescaleFactor', 1))
            self.estimators_[x, y] = get_tree(self,
                                              valid_segments[x],
                                              rescale_factor=factor)

        # Required after constructing trees, because categories may be inferred in
        # the parsing process
        target = self.target_field.get('name')
        fields = [
            field for name, field in self.fields.items() if name != target
        ]
        for x, y in np.ndindex(self.estimators_.shape):
            clf = self.estimators_[x, y]
            n_categories = np.asarray([
                len(self.field_mapping[field.get('name')][1].categories)
                if field.get('optype') == 'categorical' else -1
                for field in fields if field.tag == 'DataField'
            ],
                                      dtype=np.int32,
                                      order='C')
            clf.n_categories = n_categories
            clf.tree_.set_n_categories(n_categories)

        self.categorical = [
            x != -1 for x in self.estimators_[0, 0].n_categories
        ]
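
A small worked illustration of the 'sum' combination implied above (all values
made up; the formula is inferred from how rescaleConstant and rescaleFactor
are used in the parsing code):

import numpy as np

# Illustrative values; in the model above they come from the rescaleConstant
# and rescaleFactor attributes of <Target>.
rescale_constant = 0.5
rescale_factor = 0.1
tree_outputs = np.array([1.2, -0.3, 0.8])  # raw output of each segment's tree

# Each tree is rescaled by the factor when parsed; the ensemble prediction is
# the constant (initial estimate) plus the sum of the rescaled outputs.
prediction = rescale_constant + np.sum(rescale_factor * tree_outputs)
print(prediction)  # 0.67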