def test_parse_array_unknown(self):
    # An <Array> whose ``type`` attribute is not a known PMML array type
    # must make parse_array raise with a fixed message.
    element = eTree.fromstring(''' <Array type="colors">🔴 🟢 🔵</Array> ''')

    with self.assertRaises(Exception) as raised:
        parse_array(element)

    message = str(raised.exception)
    assert message == "Unknown array type encountered."
def test_parse_sparse_array_int(self):
    # Sparse integer array of length 4 with entries at (1-based) indices
    # 1, 2 and 3; the remaining position is expected to come back as 0.
    document = (
        ' <INT-SparseArray n="4">'
        ' <Indices>1 2 3</Indices>'
        ' <INT-Entries>3 1 4</INT-Entries>'
        ' </INT-SparseArray> '
    )
    element = eTree.fromstring(document)
    expected = [3, 1, 4, 0]

    assert parse_array(element) == expected
def get_vectors(vector_dictionary, s):
    """
    Return the values of one support vector as a numpy array.

    Parameters
    ----------
    vector_dictionary : eTree.Element
        Element that is searched for a ``VectorInstance`` child.

    s : str
        Value of the ``id`` attribute of the requested ``VectorInstance``.

    Returns
    -------
    numpy.ndarray
        The result of ``parse_array`` on the first array-like child found,
        wrapped in ``np.array``.

    Raises
    ------
    Exception
        If no ``VectorInstance`` with the given id exists, or if it has none
        of the supported array children.

    """
    vector_instance = vector_dictionary.find(f"VectorInstance[@id='{s}']")
    if vector_instance is None:
        raise Exception(
            f'PMML model is broken, vector instance (id = {s}) not found.')

    # Candidate child tags, tried in order; the first one present wins.
    for tag in ('Array', 'REAL-Array', 'SparseArray', 'REAL-SparseArray'):
        values = vector_instance.find(tag)
        if values is not None:
            return np.array(parse_array(values))

    raise Exception(
        f'PMML model is broken, vector instance (id = {s}) does not contain (Sparse)Array element.'
    )
def _categorical_set_predicate(predicate, boolean_operator):
    """Build a SimpleSetPredicate element equivalent to a categorical (not)equal SimplePredicate."""
    # Built programmatically rather than by formatting an XML string, so that
    # category values containing XML metacharacters (`&`, `<`) cannot break parsing.
    set_predicate = eTree.Element('SimpleSetPredicate', {
        'field': predicate.get('field'),
        'booleanOperator': boolean_operator,
    })
    array = eTree.SubElement(set_predicate, 'Array', {'type': 'string'})
    category = predicate.get('value')
    # The value is wrapped in double quotes, as in a PMML string array entry.
    array.text = f'"{category}"'
    return set_predicate


def construct_tree(node, classes, field_mapping, i=0, rescale_factor=1):
    """
    Generate nodes and values used for constructing Cython Tree class.

    Parameters
    ----------
    node : eTree.Element
        XML Node element representing the current node.

    classes : list, None
        List of possible target classes. Is `None` for regression trees.

    field_mapping: { str: (int, callable) }
        Dictionary mapping column names to tuples with 1) index of the column and
        2) type of the column.

    i : int
        Index of the node in the result list.

    rescale_factor : float
        Factor to scale the output of every node with. Required for gradient
        boosting trees. Optional, and 1 by default.

    Returns
    -------
    (nodes, values) : tuple

        nodes : [()]
            List of nodes represented by: left child (int), right child (int),
            feature (int), value (int for categorical, float for continuous),
            impurity (float), sample count (int) and weighted sample count (int).

        values : [[]]
            List with training sample distributions at this node in the tree.

    Raises
    ------
    Exception
        If the first child node has neither a SimplePredicate nor a
        SimpleSetPredicate.

    """
    child_nodes = node.findall('Node')
    impurity = 0  # TODO: impurity doesnt affect predictions, but is nice to have
    i += 1

    def votes_for(field):
        # Elements without children are falsy, so the lookups must be chained
        # with explicit `is None` checks instead of `or`; otherwise a matching
        # (childless) ScoreDistribution would be discarded.
        distribution = node.find(f"ScoreDistribution[@value='{field}']")
        # Deal with case where target field is a double, but ScoreDistribution value is an integer.
        if distribution is None and isinstance(field, float) and field.is_integer():
            distribution = node.find(
                f"ScoreDistribution[@value='{int(field)}']")
        return distribution

    if not child_nodes:
        # Leaf node: derive the vote vector from ScoreDistribution counts when
        # available, otherwise from the node's score.
        record_count = node.get('recordCount')

        if record_count is not None and classes is not None:
            node_count_weighted = float(record_count)
            node_count = int(node_count_weighted)
            class_counts = []
            for c in classes:
                distribution = votes_for(c)
                class_counts.append(
                    0.0 if distribution is None
                    else float(distribution.get('recordCount')))
            votes = [[class_counts]]
        else:
            score = node.get('score')
            node_count, node_count_weighted = (0, 0.0)

            if classes is None:
                # FIXME: unsure about `10 x rescale_factor`, but seems required, at least for r2pmml generated models
                votes = [[[float(score) * 10 * rescale_factor]]]
            else:
                votes = [[[1.0 if str(c) == score else 0.0 for c in classes]]]

        return [(TREE_LEAF, TREE_LEAF, TREE_UNDEFINED, SPLIT_UNDEFINED,
                 impurity, node_count, node_count_weighted)], votes

    # The split is described by the predicate of the first child.
    predicate = child_nodes[0].find('SimplePredicate')
    set_predicate = child_nodes[0].find('SimpleSetPredicate')

    # Convert SimplePredicate with equals operator on category to set predicate
    if predicate is not None:
        is_categorical = isinstance(field_mapping[predicate.get('field')][1],
                                    Category)
        set_operator = {
            'equal': 'isIn',
            'notEqual': 'isNotIn',
        }.get(predicate.get('operator'))

        if is_categorical and set_operator is not None:
            set_predicate = _categorical_set_predicate(predicate, set_operator)
            predicate = None

    # Splits are expressed as `<=`, so for `>` / `>=` the first child is the
    # right-hand branch: swap the children.
    if predicate is not None and predicate.get('operator') in [
            'greaterThan', 'greaterOrEqual'
    ]:
        child_nodes.reverse()

    left_node, left_value = construct_tree(child_nodes[0], classes,
                                           field_mapping, i, rescale_factor)
    offset = len(left_node)
    right_node, right_value = construct_tree(child_nodes[1], classes,
                                             field_mapping, i + offset,
                                             rescale_factor)

    children = left_node + right_node
    distributions = left_value + right_value

    if predicate is not None:
        column, _ = field_mapping[predicate.get('field')]
        # We do not use field_mapping type as the Cython tree only supports floats
        value = np.float64(predicate.get('value'))

        # Account for `>=` != `>` and `<` != `<=`. scikit-learn only uses `<=`.
        # Step towards -inf rather than `value - 1`: for large magnitudes
        # `value - 1 == value`, which would make nextafter a no-op.
        if predicate.get('operator') in ('greaterOrEqual', 'lessThan'):
            value = np.nextafter(value, -np.inf)
    elif set_predicate is not None:
        column, field_type = field_mapping[set_predicate.get('field')]

        array = set_predicate.find('Array')
        categories = parse_array(array)

        # Encode the category set as a bitmask, one bit per category index.
        mask = 0

        for category in categories:
            try:
                index = field_type.categories.index(category)
                mask |= 1 << index
            except ValueError:
                warn(
                    'Categorical values are missing in the PMML document, '
                    'attempting to infer from decision tree splits.')
                field_type.categories.append(category)
                mask |= 1 << (len(field_type.categories) - 1)

        value = struct.pack(
            'Q', np.uint64(mask))  # Q = unsigned long long = uint64

        if set_predicate.get('booleanOperator') == 'isNotIn':
            value = struct.pack('Q', ~np.uint64(mask))
    else:
        raise Exception(
            'Unsupported tree format: unknown predicate structure in Node {}'
            .format(child_nodes[0].get('id')))

    if classes is None:
        distribution = [[0]]
        sample_count_weighted = 0
        sample_count = 0
    else:
        # The distribution at this node is the element-wise sum of the
        # distributions at the roots of the left and right subtrees.
        distribution = [
            list(map(add, distributions[0][0], distributions[offset][0]))
        ]
        sample_count_weighted = sum(distribution[0])
        sample_count = int(sample_count_weighted)

    return [(i, i + offset, column, value, impurity, sample_count, sample_count_weighted)] + children, \
           [distribution] + distributions
def test_parse_array_real(self):
    # A generic <Array> typed as "real" should yield Python floats.
    element = eTree.fromstring(''' <Array type="real">1.2 1.3 2.8</Array> ''')
    expected = [1.2, 1.3, 2.8]

    assert parse_array(element) == expected
def test_parse_array_int(self):
    # A generic <Array> typed as "int" should yield Python ints.
    element = eTree.fromstring(''' <Array type="int">3 1 4</Array> ''')
    expected = [3, 1, 4]

    assert parse_array(element) == expected
def test_parse_array_num(self):
    # The <NUM-Array> tag carries no ``type`` attribute; its entries are
    # expected to be parsed as floats.
    element = eTree.fromstring(''' <NUM-Array>1.2 1.3 2.8</NUM-Array> ''')
    expected = [1.2, 1.3, 2.8]

    assert parse_array(element) == expected
def test_parse_array_string(self):
    # In a string array, a double-quoted run of words is one entry, while
    # unquoted words are split on whitespace.
    element = eTree.fromstring(
        ''' <Array type="string">"test and stuff" more tests</Array> ''')
    expected = ['test and stuff', 'more', 'tests']

    assert parse_array(element) == expected