def boosting_last_predict(tree, fields, input_data, path=None):
    """Prediction for boosted trees using the last-prediction strategy.

    Descends the tree following the single branch whose predicate matches
    ``input_data``, collecting the matched rules in ``path``, and builds a
    Prediction from the deepest node reached.
    """
    path = [] if path is None else path
    node = get_node(tree)
    kids = node[OFFSETS["children"]] if node[OFFSETS["children#"]] else []
    for kid in kids:
        operator, field, value, term, missing = get_predicate(kid)
        if apply_predicate(operator, field, value, term, missing,
                           input_data, fields[field]):
            # record the rule that fired and keep descending
            path.append(predicate_to_rule(operator, fields[field], value,
                                          term, missing))
            return boosting_last_predict(kid, fields, input_data, path=path)
    # leaf node (or no predicate matched): emit the node's output
    return Prediction(
        node[OFFSETS["output"]],
        path,
        None,
        distribution=None,
        count=node[OFFSETS["count"]],
        median=None,
        distribution_unit=None,
        children=kids,
        d_min=None,
        d_max=None)
def generate_rules(tree, offsets, objective_id, fields,
                   depth=0, ids_path=None, subtree=True):
    """Translates a tree model into a set of IF-THEN rules.
    """
    node = get_node(tree)
    branches = node[offsets["children"]] if node[offsets["children#"]] != 0 \
        else []
    branches = filter_nodes(branches, offsets, ids=ids_path, subtree=subtree)
    if not branches:
        # leaf: emit the prediction line
        label = fields[objective_id]['slug'] if objective_id else "Prediction"
        return "%s %s = %s\n" % (INDENT * depth, label,
                                 node[offsets["output"]])
    pieces = []
    for branch in branches:
        predicate = get_predicate(branch)
        if isinstance(predicate, list):
            [operator, field, value, term, missing] = predicate
        branch_node = get_node(branch)
        # internal nodes chain with AND, leaves close with THEN
        connector = "AND" if branch_node[offsets["children#"]] > 0 else "THEN"
        pieces.append("%s IF %s %s\n" %
                      (INDENT * depth,
                       predicate_to_rule(operator, fields[field], value,
                                         term, missing, label='slug'),
                       connector))
        pieces.append(generate_rules(branch, offsets, objective_id, fields,
                                     depth + 1, ids_path=ids_path,
                                     subtree=subtree))
    return "".join(pieces)
def build_ids_map(tree, offsets, ids_map, parent_id=None):
    """Fills ``ids_map`` with a node-id to parent-id mapping for the tree.

    The root is mapped to ``parent_id`` (``None`` by default).
    """
    node = get_node(tree)
    current_id = node[offsets["id"]]
    ids_map[current_id] = parent_id
    if node[offsets["children#"]] != 0:
        for subtree_node in node[offsets["children"]]:
            build_ids_map(subtree_node, offsets, ids_map, current_id)
def boosting_proportional_predict(tree, fields, input_data, path=None,
                                  missing_found=False):
    """Makes a prediction based on a number of field values considering all
       the predictions of the leaves that fall in a subtree.

       Each time a splitting field has no value assigned, we consider
       both branches of the split to be true, merging their
       predictions. The function returns the merged distribution and the
       last node reached by a unique path.

       Returns a 4-tuple: (g_sum, h_sum, count, path).
    """
    if path is None:
        path = []
    node = get_node(tree)
    children_number = node[OFFSETS["children#"]]
    children = [] if children_number == 0 else node[OFFSETS["children"]]
    g_sum = node[OFFSETS["g_sum"]]
    h_sum = node[OFFSETS["h_sum"]]
    count = node[OFFSETS["count"]]
    if not children:
        # leaf: return this node's gradient/hessian sums directly
        return (g_sum, h_sum, count, path)
    # text/items fields are always followed down a single branch
    if one_branch(children, input_data) or \
            fields[children[0][FIELD_OFFSET]]["optype"] in \
            ["text", "items"]:
        for child in children:
            [operator, field, value, term, missing] = get_predicate(child)
            if apply_predicate(operator, field, value, term, missing,
                               input_data, fields[field]):
                new_rule = predicate_to_rule(operator, fields[field],
                                             value, term, missing)
                # the unique path is only extended while no missing
                # value has been found upstream
                if new_rule not in path and not missing_found:
                    path.append(new_rule)
                return boosting_proportional_predict(
                    child, fields, input_data, path, missing_found)
        # NOTE(review): if no child predicate applies this falls through
        # and returns None -- presumably one predicate always matches;
        # confirm against apply_predicate's contract
    else:
        # missing value found, the unique path stops
        missing_found = True
        g_sums = 0.0
        h_sums = 0.0
        population = 0
        # merge the sums of every branch of the split
        for child in children:
            g_sum, h_sum, count, _ = \
                boosting_proportional_predict(
                    child, fields, input_data, path, missing_found)
            g_sums += g_sum
            h_sums += h_sum
            population += count
        return (g_sums, h_sums, population, path)
def get_data_distribution(model):
    """Returns training data distribution, sorted by category or bin value.
    """
    if model.boosting:
        # boosted trees do not store a distribution at the root
        raise AttributeError("This method is not available for boosting"
                             " models.")
    root = get_node(model.tree)
    return sorted(root[model.offsets["distribution"]],
                  key=lambda pair: pair[0])
def missing_check_code(tree, offsets, fields, field, depth, input_map, cmv):
    """Builds the code to predict when the field is missing
    """
    node = get_node(tree)
    slug = fields[field]['slug']
    output = value_to_print(node[offsets["output"]], NUMERIC)
    lines = [
        "%sif (%s is None):\n" % (INDENT * depth,
                                  map_data(slug, input_map, True)),
        "%sreturn {\"prediction\":%s" % (INDENT * (depth + 1), output),
        "}\n"]
    # remember this field has already been checked for missing values
    cmv.append(slug)
    return "".join(lines)
def tree_csv(model, file_name=None, leaves_only=False):
    """Outputs the node structure to a CSV file or array
    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    root = get_node(model.tree)
    headers = [model.fields[model.objective_id]['name']]
    if model.regression:
        headers.append("error")
        # one (value, instances) column pair per distribution bin
        for bin_index in range(0, root[model.offsets["max_bins"]]):
            headers.append("bin%s_value" % bin_index)
            headers.append("bin%s_instances" % bin_index)
    else:
        headers.append("confidence")
        headers.append("impurity")
        # one column per predicted category
        headers.extend(category for category, _
                       in root[model.offsets["distribution"]])
    nodes_generator = get_nodes_info(model, headers,
                                     leaves_only=leaves_only)
    if file_name is None:
        # no file requested: return the rows as a list of lists
        return [headers] + list(nodes_generator)
    with UnicodeWriter(file_name) as writer:
        writer.writerow([utf8(header) for header in headers])
        for row in nodes_generator:
            writer.writerow([utf8(item) if isinstance(item, str) else item
                             for item in row])
    return file_name
def __init__(self, tree, offsets):
    """Exposes a flat tree node's slots as attributes of this object.

    The node's predicate is stored as a Predicate built with the
    INVERSE_OP of its operator, unless the predicate is the boolean
    placeholder used at the root.
    """
    predicate = get_predicate(tree)
    if isinstance(predicate, bool):
        self.predicate = predicate
    else:
        [operator, field, value, term, _] = predicate
        self.predicate = Predicate(INVERSE_OP[operator], field, value,
                                   term)
    node = get_node(tree)
    # every offset except the children bookkeeping becomes an attribute
    for attr, position in offsets.items():
        if attr in ("children#", "children"):
            continue
        setattr(self, attr, node[position])
    self.children = node[offsets["children"]] \
        if node[offsets["children#"]] != 0 else []
def missing_check_code(tree, offsets, fields, objective_id, field,
                       depth, input_map, cmv, metric):
    """Builds the code to predict when the field is missing
    """
    slug = fields[field]['slug']
    node = get_node(tree)
    prediction = value_to_print(node[offsets["output"]],
                                fields[objective_id]['optype'])
    check = "%sif (%s is None):\n" % (INDENT * depth,
                                      map_data(slug, input_map, True))
    result = "%sreturn {\"prediction\": %s, \"%s\": %s}\n" % \
        (INDENT * (depth + 1), prediction, metric,
         node[offsets["confidence"]])
    # remember this field has already been checked for missing values
    cmv.append(slug)
    return check + result
def get_tree_nodes_info(tree, offsets, regression, fields,
                        objective_id, headers=None, leaves_only=False):
    """Yields the information associated to each of the tree nodes

    Produces one row per node (or per leaf only, when ``leaves_only`` is
    set), with values positioned according to the ``headers`` names.
    """
    row = []
    node = get_node(tree)
    if not regression:
        # category -> instances map for the per-class columns
        category_dict = dict(node[offsets["distribution"]])
    for header in headers:
        if header == fields[objective_id]['name']:
            row.append(node[offsets["output"]])
            continue
        if header in ['confidence', 'error']:
            row.append(node[offsets["confidence"]])
            continue
        if header == 'impurity':
            row.append(
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))
            continue
        if regression and header.startswith('bin'):
            # all the bin columns are filled at once from the
            # distribution; stop scanning the remaining headers
            for bin_value, bin_instances in node[offsets["distribution"]]:
                row.append(bin_value)
                row.append(bin_instances)
            break
        if not regression:
            row.append(category_dict.get(header))
    # pad with None when this node has fewer bins than the widest node
    while len(row) < len(headers):
        row.append(None)
    # Fix: the flat tree is a list, so the previous ``tree.children``
    # attribute access raised AttributeError when leaves_only was used;
    # the children count must be read through the offsets map instead.
    if not leaves_only or node[offsets["children#"]] == 0:
        yield row
    if node[offsets["children#"]] > 0:
        for child in node[offsets["children"]]:
            for child_row in get_tree_nodes_info(
                    child, offsets, regression, fields, objective_id,
                    headers, leaves_only=leaves_only):
                yield child_row
def get_tree_leaves(tree, fields, path, leaves, filter_function=None):
    """Collects the leaves of the tree into the shared ``leaves`` list.

    Each leaf dict carries the node stats and the lisp-rule path leading
    to it. ``filter_function``, when callable, can veto a leaf.

    NOTE(review): ``offsets`` is a free variable here -- presumably bound
    by an enclosing scope; confirm it is available where this is defined.
    """
    node = get_node(tree)
    predicate = get_predicate(tree)
    if isinstance(predicate, list):
        # reuse the predicate already fetched (was previously calling
        # get_predicate a second time)
        [operator, field, value, term, missing] = predicate
        path.append(
            to_lisp_rule(operator, field, value, term, missing,
                         fields[field]))
    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]
    if children:
        for child in children:
            # Fix: the recursive call mutates and returns the SAME
            # ``leaves`` list; the previous ``leaves += get_tree_leaves(...)``
            # extended the list with itself, duplicating every leaf.
            get_tree_leaves(child, fields, path[:], leaves,
                            filter_function=filter_function)
    else:
        leaf = {
            'id': node[offsets["id"]],
            'confidence': node[offsets["confidence"]],
            'count': node[offsets["count"]],
            'distribution': node[offsets["distribution"]],
            'impurity': gini_impurity(node[offsets["distribution"]],
                                      node[offsets["count"]]),
            'output': node[offsets["output"]],
            'path': path}
        if 'weighted_distribution' in offsets:
            leaf.update(
                {"weighted_distribution":
                     node[offsets["weighted_distribution"]],
                 "weight": node[offsets["weight"]]})
        if (not hasattr(filter_function, '__call__')
                or filter_function(leaf)):
            leaves.append(leaf)
    return leaves
def filter_nodes(trees_list, offsets, ids=None, subtree=True):
    """Filters the contents of a trees_list. If any of the nodes is in the
       ids list, the rest of nodes are removed. If none is in the ids list
       we include or exclude the nodes depending on the subtree flag.
    """
    if not trees_list:
        return None
    candidates = list(trees_list)
    if ids is not None:
        for candidate in candidates:
            # first matching id wins: keep only that node
            if get_node(candidate)[offsets["id"]] in ids:
                return [candidate]
    return candidates if subtree else []
def depth_first_search(tree, path):
    """Search for leafs' values and instances

    Appends one inverted Predicate per traversed split to ``path`` and
    registers every leaf (plus any remainder of instances not covered
    by the children) in ``groups``. Returns the node's instance count.

    NOTE(review): ``model``, ``groups``, ``offsets`` and ``add_to_groups``
    are free variables -- an identical nested version of this function
    exists inside ``group_prediction``, so this one is presumably meant
    to run as a closure; confirm the enclosing scope binds those names.
    """
    node = get_node(tree)
    predicate = get_predicate(tree)
    if isinstance(predicate, list):
        [operation, field, value, term, _] = predicate
        operator = INVERSE_OP[operation]
        path.append(Predicate(operator, field, value, term))
        if term:
            # register the term used by this split in the model's cache
            if field not in model.terms:
                model.terms[field] = []
            if term not in model.terms[field]:
                model.terms[field].append(term)
    if node[offsets["children#"]] == 0:
        # leaf: all its instances belong to one group
        add_to_groups(
            groups, node[offsets["output"]], path,
            node[offsets["count"]], node[offsets["confidence"]],
            gini_impurity(node[offsets["distribution"]],
                          node[offsets["count"]]))
        return node[offsets["count"]]
    children = node[offsets["children"]][:]
    children.reverse()
    children_sum = 0
    for child in children:
        # each branch gets its own copy of the path
        children_sum += depth_first_search(child, path[:])
    if children_sum < node[offsets["count"]]:
        # some instances stopped at this node; record them here
        add_to_groups(
            groups, node[offsets["output"]], path,
            node[offsets["count"]] - children_sum,
            node[offsets["confidence"]],
            gini_impurity(node[offsets["distribution"]],
                          node[offsets["count"]]))
    return node[offsets["count"]]
def tableau_body(tree, offsets, fields, objective_id, body="",
                 conditions=None, cmv=None, ids_path=None, subtree=True,
                 attr=DFT_ATTR):
    """Translate the model into a set of "if" statements in Tableau syntax

    `depth` controls the size of indentation. As soon as a value is missing
    that node is returned without further evaluation.
    """
    if cmv is None:
        cmv = []
    if body:
        alternate = "ELSEIF"
    else:
        if conditions is None:
            conditions = []
        alternate = "IF"
    node = get_node(tree)
    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path,
                            subtree=subtree)
    if children:
        [_, field, _, _, _] = get_predicate(children[0])
        has_missing_branch = (missing_branch(children) or
                              none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        if (not has_missing_branch and
                fields[field]['name'] not in cmv):
            conditions.append("ISNULL([%s])" % fields[field]['name'])
            body += ("%s %s THEN " % (alternate, " AND ".join(conditions)))
            if fields[objective_id]['optype'] == 'numeric':
                value = node[offsets[attr]]
            else:
                value = tableau_string(node[offsets[attr]])
            body += ("%s\n" % value)
            cmv.append(fields[field]['name'])
            alternate = "ELSEIF"
            del conditions[-1]
        for child in children:
            pre_condition = ""
            post_condition = ""
            [operator, field, ch_value, _, missing] = get_predicate(child)
            if has_missing_branch and ch_value is not None:
                negation = "" if missing else "NOT "
                connection = "OR" if missing else "AND"
                pre_condition = (
                    "(%sISNULL([%s]) %s " % (
                        negation, fields[field]['name'], connection))
                if not missing:
                    cmv.append(fields[field]['name'])
                post_condition = ")"
            optype = fields[field]['optype']
            if ch_value is None:
                value = ""
            elif optype in ['text', 'items']:
                # text/items fields cannot be expressed in Tableau syntax
                return ""
            elif optype == 'numeric':
                value = ch_value
            else:
                value = repr(ch_value)
            # Fix: the T_MISSING_OPERATOR lookup must use the raw
            # operator ("="/"!=") and was previously performed AFTER the
            # operator had been blanked out below, raising KeyError for
            # missing-value predicates.
            if ch_value is None:
                pre_condition = (
                    T_MISSING_OPERATOR[operator])
                post_condition = ")"
            operator = ("" if ch_value is None else
                        PYTHON_OPERATOR[operator])
            conditions.append("%s[%s]%s%s%s" % (
                pre_condition,
                fields[field]['name'],
                operator,
                value,
                post_condition))
            body = tableau_body(child, offsets, fields, objective_id,
                                body, conditions[:], cmv=cmv[:],
                                ids_path=ids_path, subtree=subtree,
                                attr=attr)
            del conditions[-1]
    else:
        if fields[objective_id]['optype'] == 'numeric':
            # Fix: the leaf value must be read from the decoded node,
            # not from the raw ``tree`` list (every other access in this
            # function indexes ``node``).
            value = node[offsets[attr]]
        else:
            value = tableau_string(node[offsets[attr]])
        body += ("%s %s THEN" % (alternate, " AND ".join(conditions)))
        body += " %s\n" % value
    return body
def plug_in_body(tree, offsets, fields, objective_id, regression,
                 depth=1, cmv=None, input_map=False, ids_path=None,
                 subtree=True):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. As soon as a value is missing
    that node is returned without further evaluation.

    Returns a 3-tuple: (generated code, term analysis fields,
    item analysis fields).
    """
    # label for the confidence measure and initialization
    metric = "error" if regression else "confidence"
    if cmv is None:
        cmv = []
    body = ""
    term_analysis_fields = []
    item_analysis_fields = []
    node = get_node(tree)
    children = [] if node[offsets["children#"]] == 0 else \
        node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path,
                            subtree=subtree)
    if children:
        # field used in the split
        field = mintree_split(children)
        has_missing_branch = (missing_branch(children) or
                              none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        one_branch = not has_missing_branch or \
            fields[field]['optype'] in COMPOSED_FIELDS
        if (one_branch and
                not fields[field]['slug'] in cmv):
            body += missing_check_code(tree, offsets, fields, objective_id,
                                       field, depth, input_map, cmv, metric)
        for child in children:
            [_, field, value, _, _] = get_predicate(child)
            pre_condition = ""
            # code when missing_splits has been used
            if has_missing_branch and value is not None:
                pre_condition = missing_prefix_code(child, fields, field,
                                                    input_map, cmv)
            # complete split condition code
            body += split_condition_code(
                child, fields, depth, input_map, pre_condition,
                term_analysis_fields, item_analysis_fields, cmv)
            # value to be determined in next node
            next_level = plug_in_body(child, offsets, fields, objective_id,
                                      regression, depth + 1, cmv=cmv[:],
                                      input_map=input_map,
                                      ids_path=ids_path, subtree=subtree)
            body += next_level[0]
            term_analysis_fields.extend(next_level[1])
            item_analysis_fields.extend(next_level[2])
    else:
        # leaf: emit the return statement with prediction and metric
        value = value_to_print(node[offsets["output"]],
                               fields[objective_id]['optype'])
        body = "%sreturn {\"prediction\":%s, \"%s\":%s}\n" % (
            INDENT * depth, value, metric, node[offsets["confidence"]])
    return body, term_analysis_fields, item_analysis_fields
def summarize(model, out=sys.stdout, format=BRIEF):
    """Prints summary grouping distribution as class header and details

    Writes the data distribution, the predicted distribution, the field
    importance and a per-group rules summary to ``out``.
    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    tree = model.tree

    def extract_common_path(groups):
        """Extracts the common segment of the prediction path for a group
        """
        for group in groups:
            details = groups[group]['details']
            common_path = []
            if len(details) > 0:
                # shortest path length bounds the common prefix
                mcd_len = min([len(x[0]) for x in details])
                for i in range(0, mcd_len):
                    test_common_path = details[0][0][i]
                    for subgroup in details:
                        if subgroup[0][i] != test_common_path:
                            # mismatch: force the outer loop to stop
                            i = mcd_len
                            break
                    if i < mcd_len:
                        common_path.append(test_common_path)
            groups[group]['total'][0] = common_path
            if len(details) > 0:
                # most-predicted subgroups first
                groups[group]['details'] = sorted(details,
                                                  key=lambda x: x[1],
                                                  reverse=True)

    def confidence_error(value, impurity=None):
        """Returns confidence for categoric objective fields
           and error for numeric objective fields
        """
        if value is None:
            return ""
        impurity_literal = ""
        if impurity is not None and impurity > 0:
            impurity_literal = "; impurity: %.2f%%" % (round(impurity, 4))
        objective_type = model.fields[model.objective_id]['optype']
        if objective_type == 'numeric':
            return " [Error: %s]" % value
        return " [Confidence: %.2f%%%s]" % (round(value, 4) * 100,
                                            impurity_literal)

    distribution = get_data_distribution(model)
    out.write(utf8("Data distribution:\n"))
    print_distribution(distribution, out=out)
    out.write(utf8("\n\n"))
    groups = group_prediction(model)
    predictions = get_prediction_distribution(model, groups)
    out.write(utf8("Predicted distribution:\n"))
    print_distribution(predictions, out=out)
    out.write(utf8("\n\n"))
    if model.field_importance:
        out.write(utf8("Field importance:\n"))
        print_importance(model, out=out)
    extract_common_path(groups)
    out.write(utf8("\n\nRules summary:"))
    node = get_node(tree)
    # total instance count at the root, used for the percentages below
    count = node[model.offsets["count"]]
    for group in [x[0] for x in predictions]:
        details = groups[group]['details']
        path = Path(groups[group]['total'][0])
        data_per_group = groups[group]['total'][1] * 1.0 / count
        pred_per_group = groups[group]['total'][2] * 1.0 / count
        out.write(
            utf8("\n\n%s : (data %.2f%% / prediction %.2f%%) %s" %
                 (group,
                  round(data_per_group, 4) * 100,
                  round(pred_per_group, 4) * 100,
                  path.to_rules(model.fields, format=format))))
        if len(details) == 0:
            out.write(
                utf8("\n The model will never predict this"
                     " class\n"))
        elif len(details) == 1:
            subgroup = details[0]
            out.write(
                utf8("%s\n" % confidence_error(subgroup[2],
                                               impurity=subgroup[3])))
        else:
            out.write(utf8("\n"))
            for subgroup in details:
                # share of this subgroup within the group's predictions
                pred_per_sgroup = subgroup[1] * 1.0 / \
                    groups[group]['total'][2]
                path = Path(subgroup[0])
                path_chain = path.to_rules(model.fields, format=format) if \
                    path.predicates else "(root node)"
                out.write(
                    utf8(" · %.2f%%: %s%s\n" %
                         (round(pred_per_sgroup, 4) * 100,
                          path_chain,
                          confidence_error(subgroup[2],
                                           impurity=subgroup[3]))))
    out.flush()
def group_prediction(model):
    """Groups in categories or bins the predicted data

    dict - contains a dict grouping counts in 'total' and 'details' lists.
           'total' key contains a 3-element list.
               - common segment of the tree for all instances
               - data count
               - predictions count
           'details' key contains a list of elements. Each element is a
                     3-element list:
               - complete path of the tree from the root to the leaf
               - leaf predictions count
               - confidence
    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    groups = {}
    tree = model.tree
    node = get_node(tree)
    offsets = model.offsets
    distribution = node[offsets["distribution"]]
    # seed one group per category/bin with its training data count
    for group in distribution:
        groups[group[0]] = {'total': [[], group[1], 0],
                            'details': []}
    path = []

    def add_to_groups(groups, output, path, count, confidence,
                      impurity=None):
        """Adds instances to groups array
        """
        group = output
        if output not in groups:
            # output value not present in the root distribution
            groups[group] = {'total': [[], 0, 0],
                             'details': []}
        groups[group]['details'].append([path, count, confidence,
                                         impurity])
        groups[group]['total'][2] += count

    def depth_first_search(tree, path):
        """Search for leafs' values and instances
        """
        node = get_node(tree)
        predicate = get_predicate(tree)
        if isinstance(predicate, list):
            [operation, field, value, term, _] = predicate
            operator = INVERSE_OP[operation]
            path.append(Predicate(operator, field, value, term))
            if term:
                # register the term used by this split in the model
                if field not in model.terms:
                    model.terms[field] = []
                if term not in model.terms[field]:
                    model.terms[field].append(term)
        if node[offsets["children#"]] == 0:
            # leaf: all its instances belong to one group
            add_to_groups(
                groups, node[offsets["output"]], path,
                node[offsets["count"]], node[offsets["confidence"]],
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))
            return node[offsets["count"]]
        children = node[offsets["children"]][:]
        children.reverse()
        children_sum = 0
        for child in children:
            # each branch gets its own copy of the path
            children_sum += depth_first_search(child, path[:])
        if children_sum < node[offsets["count"]]:
            # instances that stopped at this node are recorded here
            add_to_groups(
                groups, node[offsets["output"]], path,
                node[offsets["count"]] - children_sum,
                node[offsets["confidence"]],
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))
        return node[offsets["count"]]

    depth_first_search(tree, path)
    return groups