Example 1
def boosting_last_predict(tree, fields, input_data, path=None):
    """Predict function for boosting and last prediction strategy

    """

    if path is None:
        path = []
    node = get_node(tree)

    children_number = node[OFFSETS["children#"]]
    children = [] if children_number == 0 else node[OFFSETS["children"]]
    count = node[OFFSETS["count"]]

    if children:
        for child in children:
            [operator, field, value, term, missing] = get_predicate(child)
            if apply_predicate(operator, field, value, term, missing,
                               input_data, fields[field]):
                path.append(predicate_to_rule(operator, fields[field],
                                              value, term, missing))
                return boosting_last_predict(
                    child, fields, input_data, path=path)

    return Prediction(
        node[OFFSETS["output"]],
        path,
        None,
        distribution=None,
        count=count,
        median=None,
        distribution_unit=None,
        children=children,
        d_min=None,
        d_max=None)
Example 2
def generate_rules(tree,
                   offsets,
                   objective_id,
                   fields,
                   depth=0,
                   ids_path=None,
                   subtree=True):
    """Translates a tree model into a set of IF-THEN rules.

    """
    rules_str = ""

    node = get_node(tree)
    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path, subtree=subtree)
    if children:
        for child in children:
            predicate = get_predicate(child)
            if isinstance(predicate, list):
                [operator, field, value, term, missing] = predicate
                child_node = get_node(child)
                # emit the rule line only when the child carries a predicate,
                # so the unpacked names are always defined when used
                rules_str += ("%s IF %s %s\n" %
                              (INDENT * depth,
                               predicate_to_rule(operator,
                                                 fields[field],
                                                 value,
                                                 term,
                                                 missing,
                                                 label='slug'),
                               "AND" if child_node[offsets["children#"]] > 0
                               else "THEN"))
            rules_str += generate_rules(child,
                                        offsets,
                                        objective_id,
                                        fields,
                                        depth + 1,
                                        ids_path=ids_path,
                                        subtree=subtree)
    else:
        rules_str += ("%s %s = %s\n" %
                      (INDENT * depth,
                       (fields[objective_id]['slug'] if objective_id else
                        "Prediction"), node[offsets["output"]]))
    return rules_str
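For orientation: each recursion level adds one INDENT, a split line ends in "AND" while its child still branches and in "THEN" just before a leaf, and leaves print the objective slug and output. For a hypothetical two-split tree (slugs and values are illustrative only) the returned string would look roughly like:

 IF petal_length <= 2.45 THEN
     species = Iris-setosa
 IF petal_length > 2.45 AND
     IF petal_width <= 1.75 THEN
         species = Iris-versicolor
     IF petal_width > 1.75 THEN
         species = Iris-virginica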
Example 3
def build_ids_map(tree, offsets, ids_map, parent_id=None):
    """Builds a map for the tree from each node id to its parent

    """
    node = get_node(tree)
    node_id = node[offsets["id"]]
    ids_map[node_id] = parent_id
    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]
    for child in children:
        build_ids_map(child, offsets, ids_map, node_id)
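A minimal usage sketch (the model object and leaf_id below are hypothetical, not defined above): once the map is built, walking it upwards recovers a node's ancestry.

ids_map = {}
build_ids_map(model.tree, model.offsets, ids_map)  # the root's parent stays None

def ancestor_ids(node_id, ids_map):
    """Returns the parent ids from node_id up to the root, nearest first."""
    chain = []
    parent_id = ids_map[node_id]
    while parent_id is not None:
        chain.append(parent_id)
        parent_id = ids_map[parent_id]
    return chain

ancestor_ids(leaf_id, ids_map)  # e.g. [7, 3, 0] for a leaf three levels deep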
Example 4
def boosting_proportional_predict(tree, fields, input_data, path=None,
                                  missing_found=False):
    """Makes a prediction based on a number of field values considering all
       the predictions of the leaves that fall in a subtree.

       Each time a splitting field has no value assigned, we consider
       both branches of the split to be true, merging their
       predictions. The function returns the merged distribution and the
       last node reached by a unique path.

    """

    if path is None:
        path = []

    node = get_node(tree)
    children_number = node[OFFSETS["children#"]]
    children = [] if children_number == 0 else node[OFFSETS["children"]]
    g_sum = node[OFFSETS["g_sum"]]
    h_sum = node[OFFSETS["h_sum"]]
    count = node[OFFSETS["count"]]

    if not children:
        return (g_sum, h_sum, count, path)
    if one_branch(children, input_data) or \
            fields[children[0][FIELD_OFFSET]]["optype"] in \
            ["text", "items"]:
        for child in children:
            [operator, field, value, term, missing] = get_predicate(child)
            if apply_predicate(operator, field, value, term, missing,
                               input_data, fields[field]):
                new_rule = predicate_to_rule(operator, fields[field], value,
                                             term, missing)
                if new_rule not in path and not missing_found:
                    path.append(new_rule)
                return boosting_proportional_predict(
                    child, fields, input_data, path, missing_found)
    else:
        # missing value found, the unique path stops
        missing_found = True
        g_sums = 0.0
        h_sums = 0.0
        population = 0
        for child in children:
            g_sum, h_sum, count, _ = boosting_proportional_predict(
                child, fields, input_data, path, missing_found)
            g_sums += g_sum
            h_sums += h_sum
            population += count
        return (g_sums, h_sums, population, path)
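A usage sketch (the model object and the final combination step are assumptions, not part of the code above): the caller unpacks the merged sums and would typically turn them into the boosted node output.

g_sums, h_sums, population, path = boosting_proportional_predict(
    model.tree, model.fields, input_data)  # hypothetical model object
# assuming the usual second-order boosting update, with lambda_ standing in
# for the regularization constant the real model would provide
prediction = - g_sums / (h_sums + lambda_)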
Example 5
def get_data_distribution(model):
    """Returns training data distribution

    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    node = get_node(model.tree)

    distribution = node[model.offsets["distribution"]]

    return sorted(distribution, key=lambda x: x[0])
Example 6
def missing_check_code(tree, offsets, fields, field, depth, input_map, cmv):
    """Builds the code to predict when the field is missing
    """
    node = get_node(tree)
    code = "%sif (%s is None):\n" % \
           (INDENT * depth,
            map_data(fields[field]['slug'], input_map, True))
    value = value_to_print(node[offsets["output"]], NUMERIC)
    code += "%sreturn {\"prediction\":%s" % (INDENT * (depth + 1), value)
    code += "}\n"
    cmv.append(fields[field]['slug'])
    return code
Example 7
def tree_csv(model, file_name=None, leaves_only=False):
    """Outputs the node structure to a CSV file or array

    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    headers_names = []
    if model.regression:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("error")
        max_bins = get_node(model.tree)[model.offsets["max_bins"]]
        for index in range(0, max_bins):
            headers_names.append("bin%s_value" % index)
            headers_names.append("bin%s_instances" % index)
    else:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("confidence")
        headers_names.append("impurity")
        node = get_node(model.tree)
        for category, _ in node[model.offsets["distribution"]]:
            headers_names.append(category)

    nodes_generator = get_nodes_info(model,
                                     headers_names,
                                     leaves_only=leaves_only)
    if file_name is not None:
        with UnicodeWriter(file_name) as writer:
            writer.writerow([utf8(header) for header in headers_names])
            for row in nodes_generator:
                writer.writerow([
                    item if not isinstance(item, str) else utf8(item)
                    for item in row
                ])
        return file_name
    rows = []
    rows.append(headers_names)
    for row in nodes_generator:
        rows.append(row)
    return rows
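A usage sketch (model and the file name are hypothetical): with a file name the rows are written to disk and the name is returned; without one, the rows come back as a list whose first element is the header row.

tree_csv(model, file_name="nodes.csv")    # writes the CSV, returns "nodes.csv"
rows = tree_csv(model, leaves_only=True)  # in-memory rows, leaves only
headers, data = rows[0], rows[1:]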
Example 8
    def __init__(self, tree, offsets):
        """Maps the offset-based node information to object attributes

        """
        predicate = get_predicate(tree)
        if isinstance(predicate, bool):
            self.predicate = predicate
        else:
            [operator, field, value, term, _] = predicate
            self.predicate = Predicate(INVERSE_OP[operator], field, value,
                                       term)
        node = get_node(tree)
        for attr in offsets:
            if attr not in ["children#", "children"]:
                setattr(self, attr, node[offsets[attr]])
        self.children = [] if node[offsets["children#"]] == 0 else \
            node[offsets["children"]]
Example 9
def missing_check_code(tree, offsets, fields, objective_id, field, depth,
                       input_map, cmv, metric):
    """Builds the code to predict when the field is missing
    """
    code = "%sif (%s is None):\n" % \
           (INDENT * depth,
            map_data(fields[field]['slug'], input_map, True))
    node = get_node(tree)
    value = value_to_print(node[offsets["output"]],
                           fields[objective_id]['optype'])
    code += "%sreturn {\"prediction\": %s," \
        " \"%s\": %s}\n" % \
        (INDENT * (depth + 1), value, metric, node[offsets["confidence"]])
    cmv.append(fields[field]['slug'])
    return code
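With hypothetical inputs (slug petal_width, depth 1, input_map False, a categorical objective and metric "confidence"), the returned string would render as:

    if (petal_width is None):
        return {"prediction": "Iris-setosa", "confidence": 0.92}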
Example 10
    def get_tree_nodes_info(tree,
                            offsets,
                            regression,
                            fields,
                            objective_id,
                            headers=None,
                            leaves_only=False):
        """Yields the information associated to each of the tree nodes

        """
        row = []
        node = get_node(tree)
        if not regression:
            category_dict = dict(node[offsets["distribution"]])
        for header in headers:
            if header == fields[objective_id]['name']:
                row.append(node[offsets["output"]])
                continue
            if header in ['confidence', 'error']:
                row.append(node[offsets["confidence"]])
                continue
            if header == 'impurity':
                row.append(
                    gini_impurity(node[offsets["distribution"]],
                                  node[offsets["count"]]))
                continue
            if regression and header.startswith('bin'):
                for bin_value, bin_instances in node[offsets["distribution"]]:
                    row.append(bin_value)
                    row.append(bin_instances)
                break
            if not regression:
                row.append(category_dict.get(header))
        while len(row) < len(headers):
            row.append(None)
        if not leaves_only or node[offsets["children#"]] == 0:
            yield row

        if node[offsets["children#"]] > 0:
            for child in node[offsets["children"]]:
                for row in get_tree_nodes_info(child,
                                               offsets,
                                               regression,
                                               fields,
                                               objective_id,
                                               headers,
                                               leaves_only=leaves_only):
                    yield row
Example 11
    def get_tree_leaves(tree, fields, path, leaves, filter_function=None):
        """Adds to `leaves` the leaf nodes of the subtree that pass the filter

        """
        node = get_node(tree)
        predicate = get_predicate(tree)
        if isinstance(predicate, list):
            [operator, field, value, term, missing] = predicate
            path.append(
                to_lisp_rule(operator, field, value, term, missing,
                             fields[field]))

        children_number = node[offsets["children#"]]
        children = [] if children_number == 0 else node[offsets["children"]]

        if children:
            for child in children:
                # the recursive call appends to the same `leaves` list and
                # returns it, so reassign instead of extending to avoid
                # duplicating its contents
                leaves = get_tree_leaves(child,
                                         fields,
                                         path[:],
                                         leaves,
                                         filter_function=filter_function)
        else:
            leaf = {
                'id': node[offsets["id"]],
                'confidence': node[offsets["confidence"]],
                'count': node[offsets["count"]],
                'distribution': node[offsets["distribution"]],
                'impurity': gini_impurity(node[offsets["distribution"]],
                                          node[offsets["count"]]),
                'output': node[offsets["output"]],
                'path': path
            }
            if 'weighted_distribution' in offsets:
                leaf.update(
                    {"weighted_distribution":
                         node[offsets["weighted_distribution"]],
                     "weight": node[offsets["weight"]]})
            if (not hasattr(filter_function, '__call__')
                    or filter_function(leaf)):
                leaves += [leaf]
        return leaves
Example 12
def filter_nodes(trees_list, offsets, ids=None, subtree=True):
    """Filters the contents of a trees_list. If any of the nodes is in the
       ids list, the rest of nodes are removed. If none is in the ids list
       we include or exclude the nodes depending on the subtree flag.

    """
    if not trees_list:
        return None
    trees = trees_list[:]
    if ids is not None:
        for tree in trees:
            node = get_node(tree)
            node_id = node[offsets["id"]]
            if node_id in ids:
                trees = [tree]
                return trees
    if not subtree:
        trees = []
    return trees
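A rough illustration of the three outcomes, assuming get_node simply returns the node it is given so that plain [id, children#, children] lists can stand in for real tree structures:

offsets = {"id": 0, "children#": 1, "children": 2}
nodes = [[1, 0, []], [2, 0, []], [3, 0, []]]   # toy nodes

filter_nodes(nodes, offsets, ids=[2])          # -> [[2, 0, []]], only the match
filter_nodes(nodes, offsets)                   # -> all three nodes (subtree=True)
filter_nodes(nodes, offsets, subtree=False)    # -> [] when no ids are given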
Example 13
    def depth_first_search(tree, path):
        """Search for leafs' values and instances

        """
        node = get_node(tree)
        predicate = get_predicate(tree)
        if isinstance(predicate, list):
            [operation, field, value, term, _] = predicate
            operator = INVERSE_OP[operation]
            path.append(Predicate(operator, field, value, term))
            if term:
                if field not in model.terms:
                    model.terms[field] = []
                if term not in model.terms[field]:
                    model.terms[field].append(term)

        if node[offsets["children#"]] == 0:
            add_to_groups(
                groups, node[offsets["output"]], path, node[offsets["count"]],
                node[offsets["confidence"]],
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))
            return node[offsets["count"]]
        children = node[offsets["children"]][:]
        children.reverse()

        children_sum = 0
        for child in children:
            children_sum += depth_first_search(child, path[:])
        if children_sum < node[offsets["count"]]:
            add_to_groups(
                groups, node[offsets["output"]], path,
                node[offsets["count"]] - children_sum,
                node[offsets["confidence"]],
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))
        return node[offsets["count"]]
Example 14
def tableau_body(tree,
                 offsets,
                 fields,
                 objective_id,
                 body="",
                 conditions=None,
                 cmv=None,
                 ids_path=None,
                 subtree=True,
                 attr=DFT_ATTR):
    """Translate the model into a set of "if" statements in Tableau syntax

    `depth` controls the size of indentation. As soon as a value is missing
    that node is returned without further evaluation.

    """

    if cmv is None:
        cmv = []
    if body:
        alternate = "ELSEIF"
    else:
        if conditions is None:
            conditions = []
        alternate = "IF"

    node = get_node(tree)
    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path, subtree=subtree)
    if children:
        [_, field, _, _, _] = get_predicate(children[0])
        has_missing_branch = (missing_branch(children) or none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        if (not has_missing_branch and fields[field]['name'] not in cmv):
            conditions.append("ISNULL([%s])" % fields[field]['name'])
            body += ("%s %s THEN " % (alternate, " AND ".join(conditions)))
            if fields[objective_id]['optype'] == 'numeric':
                value = node[offsets[attr]]
            else:
                value = tableau_string(node[offsets[attr]])
            body += ("%s\n" % value)
            cmv.append(fields[field]['name'])
            alternate = "ELSEIF"
            del conditions[-1]

        for child in children:
            pre_condition = ""
            post_condition = ""
            [operator, field, ch_value, _, missing] = get_predicate(child)
            if has_missing_branch and ch_value is not None:
                negation = "" if missing else "NOT "
                connection = "OR" if missing else "AND"
                pre_condition = ("(%sISNULL([%s]) %s " %
                                 (negation, fields[field]['name'], connection))
                if not missing:
                    cmv.append(fields[field]['name'])
                post_condition = ")"
            optype = fields[field]['optype']
            if ch_value is None:
                value = ""
            elif optype in ['text', 'items']:
                return ""
            elif optype == 'numeric':
                value = ch_value
            else:
                value = repr(ch_value)

            operator = ("" if ch_value is None else PYTHON_OPERATOR[operator])
            if ch_value is None:
                pre_condition = (T_MISSING_OPERATOR[operator])
                post_condition = ")"

            conditions.append("%s[%s]%s%s%s" %
                              (pre_condition, fields[field]['name'], operator,
                               value, post_condition))
            body = tableau_body(child,
                                offsets,
                                fields,
                                objective_id,
                                body,
                                conditions[:],
                                cmv=cmv[:],
                                ids_path=ids_path,
                                subtree=subtree,
                                attr=attr)
            del conditions[-1]
    else:
        if fields[objective_id]['optype'] == 'numeric':
            value = node[offsets[attr]]
        else:
            value = tableau_string(node[offsets[attr]])
        body += ("%s %s THEN" % (alternate, " AND ".join(conditions)))
        body += " %s\n" % value

    return body
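For a hypothetical categorical objective the accumulated conditions chain into a Tableau expression roughly like the following (field names, values and quoting are illustrative; the surrounding export code would normally close the expression with END):

IF ISNULL([petal length]) THEN 'Iris-setosa'
ELSEIF [petal length]<=2.45 THEN 'Iris-setosa'
ELSEIF [petal length]>2.45 AND [petal width]<=1.75 THEN 'Iris-versicolor'
ELSEIF [petal length]>2.45 AND [petal width]>1.75 THEN 'Iris-virginica'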
Example 15
def plug_in_body(tree,
                 offsets,
                 fields,
                 objective_id,
                 regression,
                 depth=1,
                 cmv=None,
                 input_map=False,
                 ids_path=None,
                 subtree=True):
    """Translate the model into a set of "if" python statements.
    `depth` controls the size of indentation. As soon as a value is missing
    that node is returned without further evaluation.
    """
    # label for the confidence measure and initialization
    metric = "error" if regression else "confidence"
    if cmv is None:
        cmv = []
    body = ""
    term_analysis_fields = []
    item_analysis_fields = []

    node = get_node(tree)
    children = [] if node[offsets["children#"]] == 0 else \
        node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path, subtree=subtree)
    if children:

        # field used in the split
        field = mintree_split(children)

        has_missing_branch = (missing_branch(children) or none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        one_branch = not has_missing_branch or \
            fields[field]['optype'] in COMPOSED_FIELDS
        if one_branch and fields[field]['slug'] not in cmv:
            body += missing_check_code(tree, offsets, fields, objective_id,
                                       field, depth, input_map, cmv, metric)

        for child in children:
            [_, field, value, _, _] = get_predicate(child)
            pre_condition = ""
            # code when missing_splits has been used
            if has_missing_branch and value is not None:
                pre_condition = missing_prefix_code(child, fields, field,
                                                    input_map, cmv)

            # complete split condition code
            body += split_condition_code(
                child, fields, depth, input_map, pre_condition,
                term_analysis_fields, item_analysis_fields, cmv)

            # value to be determined in next node
            next_level = plug_in_body(child,
                                      offsets,
                                      fields,
                                      objective_id,
                                      regression,
                                      depth + 1,
                                      cmv=cmv[:],
                                      input_map=input_map,
                                      ids_path=ids_path,
                                      subtree=subtree)

            body += next_level[0]
            term_analysis_fields.extend(next_level[1])
            item_analysis_fields.extend(next_level[2])
    else:
        value = value_to_print(node[offsets["output"]],
                               fields[objective_id]['optype'])
        body = "%sreturn {\"prediction\":%s, \"%s\":%s}\n" % ( \
            INDENT * depth, value, metric, node[offsets["confidence"]])

    return body, term_analysis_fields, item_analysis_fields
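The returned body is meant to be placed inside a generated predict function; for a hypothetical classification tree (illustrative slugs, values and confidences, input_map False) it would look roughly like:

    if (petal_length is None):
        return {"prediction": "Iris-setosa", "confidence": 0.92}
    if (petal_length <= 2.45):
        return {"prediction":"Iris-setosa", "confidence":1.0}
    if (petal_length > 2.45):
        if (petal_width <= 1.75):
            return {"prediction":"Iris-versicolor", "confidence":0.91}
        if (petal_width > 1.75):
            return {"prediction":"Iris-virginica", "confidence":0.89}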
Example 16
def summarize(model, out=sys.stdout, format=BRIEF):
    """Prints summary grouping distribution as class header and details

    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    tree = model.tree

    def extract_common_path(groups):
        """Extracts the common segment of the prediction path for a group

        """
        for group in groups:
            details = groups[group]['details']
            common_path = []
            if len(details) > 0:
                mcd_len = min([len(x[0]) for x in details])
                for i in range(0, mcd_len):
                    test_common_path = details[0][0][i]
                    for subgroup in details:
                        if subgroup[0][i] != test_common_path:
                            i = mcd_len
                            break
                    if i < mcd_len:
                        common_path.append(test_common_path)
            groups[group]['total'][0] = common_path
            if len(details) > 0:
                groups[group]['details'] = sorted(details,
                                                  key=lambda x: x[1],
                                                  reverse=True)

    def confidence_error(value, impurity=None):
        """Returns confidence for categoric objective fields
           and error for numeric objective fields
        """
        if value is None:
            return ""
        impurity_literal = ""
        if impurity is not None and impurity > 0:
            impurity_literal = "; impurity: %.2f%%" % (round(impurity, 4))
        objective_type = model.fields[model.objective_id]['optype']
        if objective_type == 'numeric':
            return " [Error: %s]" % value
        return " [Confidence: %.2f%%%s]" % (round(value, 4) * 100,
                                            impurity_literal)

    distribution = get_data_distribution(model)

    out.write(utf8("Data distribution:\n"))
    print_distribution(distribution, out=out)
    out.write(utf8("\n\n"))

    groups = group_prediction(model)
    predictions = get_prediction_distribution(model, groups)

    out.write(utf8("Predicted distribution:\n"))
    print_distribution(predictions, out=out)
    out.write(utf8("\n\n"))

    if model.field_importance:
        out.write(utf8("Field importance:\n"))
        print_importance(model, out=out)

    extract_common_path(groups)

    out.write(utf8("\n\nRules summary:"))

    node = get_node(tree)
    count = node[model.offsets["count"]]
    for group in [x[0] for x in predictions]:
        details = groups[group]['details']
        path = Path(groups[group]['total'][0])
        data_per_group = groups[group]['total'][1] * 1.0 / count
        pred_per_group = groups[group]['total'][2] * 1.0 / count
        out.write(
            utf8("\n\n%s : (data %.2f%% / prediction %.2f%%) %s" %
                 (group, round(data_per_group, 4) * 100,
                  round(pred_per_group, 4) * 100,
                  path.to_rules(model.fields, format=format))))

        if len(details) == 0:
            out.write(
                utf8("\n    The model will never predict this"
                     " class\n"))
        elif len(details) == 1:
            subgroup = details[0]
            out.write(
                utf8("%s\n" %
                     confidence_error(subgroup[2], impurity=subgroup[3])))
        else:
            out.write(utf8("\n"))
            for subgroup in details:
                pred_per_sgroup = subgroup[1] * 1.0 / \
                    groups[group]['total'][2]
                path = Path(subgroup[0])
                path_chain = path.to_rules(model.fields, format=format) if \
                    path.predicates else "(root node)"
                out.write(
                    utf8(
                        "    · %.2f%%: %s%s\n" %
                        (round(pred_per_sgroup, 4) * 100, path_chain,
                         confidence_error(subgroup[2], impurity=subgroup[3]))))

    out.flush()
Example 17
def group_prediction(model):
    """Groups in categories or bins the predicted data

    dict - contains a dict grouping counts in 'total' and 'details' lists.
            'total' key contains a 3-element list.
                   - common segment of the tree for all instances
                   - data count
                   - predictions count
            'details' key contains a list of elements. Each element is a
                      4-element list:
                   - complete path of the tree from the root to the leaf
                   - leaf predictions count
                   - confidence
                   - impurity
    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    groups = {}
    tree = model.tree
    node = get_node(tree)
    offsets = model.offsets
    distribution = node[offsets["distribution"]]

    for group in distribution:
        groups[group[0]] = {'total': [[], group[1], 0], 'details': []}
    path = []

    def add_to_groups(groups, output, path, count, confidence, impurity=None):
        """Adds instances to groups array

        """
        group = output
        if output not in groups:
            groups[group] = {'total': [[], 0, 0], 'details': []}
        groups[group]['details'].append([path, count, confidence, impurity])
        groups[group]['total'][2] += count

    def depth_first_search(tree, path):
        """Search for leafs' values and instances

        """
        node = get_node(tree)
        predicate = get_predicate(tree)
        if isinstance(predicate, list):
            [operation, field, value, term, _] = predicate
            operator = INVERSE_OP[operation]
            path.append(Predicate(operator, field, value, term))
            if term:
                if field not in model.terms:
                    model.terms[field] = []
                if term not in model.terms[field]:
                    model.terms[field].append(term)

        if node[offsets["children#"]] == 0:
            add_to_groups(
                groups, node[offsets["output"]], path, node[offsets["count"]],
                node[offsets["confidence"]],
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))
            return node[offsets["count"]]
        children = node[offsets["children"]][:]
        children.reverse()

        children_sum = 0
        for child in children:
            children_sum += depth_first_search(child, path[:])
        if children_sum < node[offsets["count"]]:
            add_to_groups(
                groups, node[offsets["output"]], path,
                node[offsets["count"]] - children_sum,
                node[offsets["confidence"]],
                gini_impurity(node[offsets["distribution"]],
                              node[offsets["count"]]))
        return node[offsets["count"]]

    depth_first_search(tree, path)

    return groups
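Following add_to_groups above, the returned dictionary has this shape for a hypothetical classification model (placeholders in angle brackets):

# {
#     "Iris-setosa": {
#         "total": [<common path>, <data count>, <predictions count>],
#         "details": [[<path from root to leaf>, <leaf count>,
#                      <confidence>, <impurity>], ...]
#     },
#     ...
# }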