def __init__(self, pmml):
    """
    Build the gradient boosting classifier from a PMML model chain.

    The PMML root must contain a ``MiningModel`` whose ``Segmentation``
    uses ``multipleModelMethod="modelChain"``. Each class is described by
    one outer segment holding a nested ``MiningModel``; the nested
    segments that carry both a ``True`` predicate and a ``TreeModel``
    become the boosted trees of that class.

    pmml : forwarded unchanged to ``PMMLBaseClassifier.__init__``
        (presumably a path or file-like object; confirm against the
        base class).

    Raises ``Exception`` when the ``MiningModel`` or ``Segmentation``
    element is missing, or when the ensemble is not a model chain.
    """
    PMMLBaseClassifier.__init__(self, pmml)

    mining_model = self.root.find('MiningModel')
    if mining_model is None:
        raise Exception('PMML model does not contain MiningModel.')

    segmentation = mining_model.find('Segmentation')
    if segmentation is None:
        raise Exception('PMML model does not contain Segmentation.')

    if segmentation.get('multipleModelMethod') not in ['modelChain']:
        raise Exception('PMML model ensemble should use modelChain.')

    def class_target(segment):
        # <Target> element of one per-class segment. Raises AttributeError
        # when MiningModel or Targets is absent; returns None when only
        # Target is absent (the caller's .get() then raises AttributeError).
        return segment.find('MiningModel').find('Targets').find('Target')

    def class_rescale_factor(segment):
        # rescaleFactor of one class, or None when the segment declares
        # no target, in which case trees are built without a factor.
        try:
            return float(class_target(segment).get('rescaleFactor', 1))
        except AttributeError:
            return None

    # Parse segments
    segments = segmentation.findall('Segment')
    valid_segments = [None] * self.n_classes_

    indices = range(self.n_classes_)
    # For binary classification, only the predictions of the first class
    # need to be described, the other can be inferred. Not all PMML models
    # do this, but we assume the following conditions imply this approach.
    if (self.n_classes_ == 2 and len(segments) == 2
            and segments[-1].find('TreeModel') is None):
        indices = [0]

    for i in indices:
        nested = segments[i].find('MiningModel').find('Segmentation')
        valid_segments[i] = [
            segment for segment in nested.findall('Segment')
            if segment.find('True') is not None
            and segment.find('TreeModel') is not None
        ]

    n_estimators = len(valid_segments[0])
    GradientBoostingClassifier.__init__(self, n_estimators=n_estimators)

    clf = DecisionTreeRegressor(random_state=123)
    try:
        clf.n_features_in_ = self.n_features_in_
    except AttributeError:
        clf.n_features_ = self.n_features_
    clf.n_outputs_ = self.n_outputs_
    self.template_estimator = clf

    self._check_params()

    if (self.n_classes_ == 2 and len(segments) == 3
            and segments[-1].find('TreeModel') is None):
        # For binary classification where both sides are specified, we
        # need to force multinomial deviance
        self.loss_ = _gb_losses.MultinomialDeviance(self.n_classes_ + 1)
        self.loss_.K = 2

    try:
        self.init = None
        self._init_state()

        # rescaleConstant defaults to 0 (same default the regressor uses),
        # so a Target without the attribute no longer fails on float(None).
        self.init_.class_prior_ = [
            expit(-float(
                class_target(segments[i]).get('rescaleConstant', 0)))
            for i in indices
        ]
        if self.n_classes_ == 2:
            first_prior = self.init_.class_prior_[0]
            self.init_.class_prior_ = [first_prior, 1 - first_prior]
        self.init_.classes_ = [i for i, _ in enumerate(self.classes_)]
        self.init_.n_classes_ = self.n_classes_
        self.init_.n_outputs_ = 1
        self.init_._strategy = self.init_.strategy
    except AttributeError:
        # No usable target description: start boosting from zero instead.
        self.init = 'zero'
        self._init_state()

    # The rescale factor only depends on the class (column), so resolve it
    # once per class. Keeping get_tree() outside the try block means an
    # AttributeError raised while parsing a tree is reported instead of
    # silently producing an unscaled tree.
    factors = [
        class_rescale_factor(segments[y])
        for y in range(self.estimators_.shape[1])
    ]
    for x, y in np.ndindex(self.estimators_.shape):
        if factors[y] is None:
            self.estimators_[x, y] = get_tree(self, valid_segments[y][x])
        else:
            self.estimators_[x, y] = get_tree(
                self, valid_segments[y][x], rescale_factor=factors[y])

    # Required after constructing trees, because categories may be
    # inferred in the parsing process
    target = self.target_field.get('name')
    fields = [
        field for name, field in self.fields.items() if name != target
    ]
    # Number of categories per DataField, -1 for non-categorical fields.
    # Identical for every tree, so it is computed once.
    category_counts = [
        len(self.field_mapping[field.get('name')][1].categories)
        if field.get('optype') == 'categorical' else -1
        for field in fields
        if field.tag == 'DataField'
    ]
    for x, y in np.ndindex(self.estimators_.shape):
        clf = self.estimators_[x, y]
        # Each tree gets its own array, as before.
        n_categories = np.asarray(
            category_counts, dtype=np.int32, order='C')
        clf.n_categories = n_categories
        clf.tree_.set_n_categories(n_categories)

    self.categorical = [
        count != -1 for count in self.estimators_[0, 0].n_categories
    ]
def digitize2tree(bins, right=False):
    """
    Creates a decision tree whose predictions match
    `lambda x: numpy.digitize(x, bins, right=right)`
    (see :epkg:`numpy:digitize`).

    :param bins: array of bins. It has to be 1-dimensional and monotonic.
    :param right: Indicating whether the intervals include the right
        or the left bin edge. Default behavior is (right==False)
        indicating that the interval does not include the right edge.
        The left bin end is open in this case, i.e.,
        `bins[i-1] <= x < bins[i]` is the default behavior for
        monotonically increasing bins.
    :return: decision tree

    .. note::
        Decision trees in :epkg:`scikit-learn` only support one kind
        of decision (`<=`), which is why an exception is raised when
        `right=False`. :epkg:`ONNX` implements every kind of decision
        rule and could be used to lift that restriction.
        The default value for right remains *False* to mirror the
        *numpy* API even though *digitize2tree* rejects it.

    The following example shows what the tree looks like.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import export_text
        from mlinsights.mltree import digitize2tree

        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        print("Comparison with numpy:")
        print(expected, pred)
        print("Tree:")
        print(export_text(tree, feature_names=['x']))

    See also example :ref:`l-example-digitize`.

    .. versionadded:: 0.4
    """
    if not right:
        raise RuntimeError(
            "right must be True not right=%r" % right)

    if len(bins) > 1 and not bins[0] < bins[1]:
        # Descending bins: build the tree for the reversed bins, then
        # mirror every stored value (internal nodes hold nan and stay nan).
        cl = digitize2tree(bins[::-1], right=right)
        cl.tree_.value[:, 0, 0] = len(bins) - cl.tree_.value[:, 0, 0]
        return cl

    tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1)
    values = []
    UNUSED = numpy.nan
    n_nodes = []

    def new_node(parent, is_left, is_leaf, threshold, value):
        # Adds one node to the tree and records the value it will hold;
        # *values* is indexed in node creation order.
        node_id = tree_add_node(
            tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.)
        values.append(value)
        n_nodes.append(node_id)
        return node_id

    def add_nodes(parent, i, j, is_left):
        # Grows the subtree covering bins[i:j] (j excluded).
        if is_left:
            # j is the split of the parent
            leaf_value = i if i == j else None
            needs_split = i < j
        else:
            # i is the split of the parent
            leaf_value = j if i + 1 == j else None
            needs_split = i + 1 < j

        if leaf_value is not None:
            return new_node(parent, is_left, True, 0, leaf_value)

        if needs_split:
            # When j == i + 1 the middle is i itself.
            middle = (i + j) // 2
            node_id = new_node(
                parent, is_left, False, bins[middle], UNUSED)
            add_nodes(node_id, i, middle, True)
            add_nodes(node_id, middle, j, False)
            return node_id

        raise NotImplementedError(  # pragma: no cover
            "Unexpected case where i=%r, j=%r, is_left=%r." % (
                i, j, is_left))

    # Root node: split on the middle bin.
    index = len(bins) // 2
    if index < 0 or index >= len(bins):
        raise IndexError(  # pragma: no cover
            "Unexpected index %d / len(bins)=%d." % (
                index, len(bins)))
    new_node(-1, False, False, bins[index], UNUSED)

    # The root is expected to be node 0.
    add_nodes(0, 0, index, True)
    add_nodes(0, index, len(bins), False)

    cl = DecisionTreeRegressor()
    cl.tree_ = tree
    cl.tree_.value[:, 0, 0] = numpy.array(  # pylint: disable=E1137
        values, dtype=numpy.float64)
    cl.n_outputs = 1
    cl.n_outputs_ = 1
    try:
        # scikit-learn >= 0.24
        cl.n_features_in_ = 1
    except AttributeError:
        # scikit-learn < 0.24
        cl.n_features_ = 1
    try:
        # for scikit-learn<=0.23.2
        cl.n_features_ = 1
    except AttributeError:
        pass
    return cl
def __init__(self, pmml):
    """
    Build the gradient boosting regressor from a PMML ensemble.

    The PMML root must contain a ``MiningModel`` whose ``Segmentation``
    uses ``multipleModelMethod="sum"``. Every segment carrying both a
    ``True`` predicate and a ``TreeModel`` becomes one boosted tree. The
    ``rescaleConstant`` of the model target is used as the initial
    prediction and ``rescaleFactor`` is passed to every tree.

    pmml : forwarded unchanged to ``PMMLBaseRegressor.__init__``
        (presumably a path or file-like object; confirm against the
        base class).

    Raises ``Exception`` when the ``MiningModel`` or ``Segmentation``
    element is missing, or when the ensemble does not use ``sum``.
    """
    PMMLBaseRegressor.__init__(self, pmml)

    mining_model = self.root.find('MiningModel')
    if mining_model is None:
        raise Exception('PMML model does not contain MiningModel.')

    segmentation = mining_model.find('Segmentation')
    if segmentation is None:
        raise Exception('PMML model does not contain Segmentation.')

    if segmentation.get('multipleModelMethod') not in ['sum']:
        raise Exception('PMML model ensemble should use sum.')

    # Parse segments
    segments = segmentation.findall('Segment')
    valid_segments = [
        segment for segment in segments
        if segment.find('True') is not None
        and segment.find('TreeModel') is not None
    ]

    n_estimators = len(valid_segments)
    self.n_outputs_ = 1
    GradientBoostingRegressor.__init__(self, n_estimators=n_estimators)

    clf = DecisionTreeRegressor(random_state=123)
    try:
        clf.n_features_in_ = self.n_features_in_
    except AttributeError:
        clf.n_features_ = self.n_features_
    clf.n_outputs_ = self.n_outputs_
    self.template_estimator = clf

    self._check_params()
    self._init_state()

    # Targets/Target are optional; without them the PMML defaults apply
    # (rescaleConstant=0, rescaleFactor=1). XML attributes are strings, so
    # both values are converted to float before use.
    targets = mining_model.find('Targets')
    target_element = targets.find('Target') if targets is not None else None
    if target_element is None:
        mean, factor = 0.0, 1.0
    else:
        mean = float(target_element.get('rescaleConstant', 0))
        factor = float(target_element.get('rescaleFactor', 1))

    self.init_.constant_ = np.array([mean])
    self.init_.n_outputs_ = 1

    for x, y in np.ndindex(self.estimators_.shape):
        self.estimators_[x, y] = get_tree(
            self, valid_segments[x], rescale_factor=factor)

    # Required after constructing trees, because categories may be
    # inferred in the parsing process
    target = self.target_field.get('name')
    fields = [
        field for name, field in self.fields.items() if name != target
    ]
    # Number of categories per DataField, -1 for non-categorical fields.
    # Identical for every tree, so it is computed once.
    category_counts = [
        len(self.field_mapping[field.get('name')][1].categories)
        if field.get('optype') == 'categorical' else -1
        for field in fields
        if field.tag == 'DataField'
    ]
    for x, y in np.ndindex(self.estimators_.shape):
        clf = self.estimators_[x, y]
        # Each tree gets its own array, as before.
        n_categories = np.asarray(
            category_counts, dtype=np.int32, order='C')
        clf.n_categories = n_categories
        clf.tree_.set_n_categories(n_categories)

    self.categorical = [
        count != -1 for count in self.estimators_[0, 0].n_categories
    ]