Esempio n. 1
0
def get_distributions(tree, instances):
    """
    Calculate piecewise distributions of good, bad, and unlabeled instances.

    For every attribute the tree splits on, the observed value range of that
    attribute is cut at the tree's split values, and the instances falling
    into each resulting segment are counted per class.

    Parameters
    ----------
    tree : TreeNode
        The decision tree. Only ``get_internal_nodes()`` is used; each returned
        node must expose ``split_attribute`` and ``split_value``.
    instances : DataFrame
        Labeled instances used to train the decision tree. Must have a column
        for each attribute in the tree, and a 'class' column with values 'good',
        'bad', or NaN (unlabeled).

    Returns
    -------
    dict
        key: parameter name
        value: list of tuples, each of which describes a range of values for the
        parameter, delimited by a pair of split points in the tree:
        (low, high, good_count, bad_count, unlabeled_count)

        The first segment starts at the smallest observed value and the last
        one ends at the largest observed value. Every segment includes its
        upper bound; the first segment has no lower-bound test, so instances at
        the minimum are counted, while later segments exclude their lower
        bound.
    """

    # key: parameter name
    # value: list of split values
    splits = {}
    for node in tree.get_internal_nodes():
        splits.setdefault(node.split_attribute, []).append(node.split_value)

    labels = instances['class']

    # key: parameter name
    # value: list of tuples: (low, high, good_count, bad_count, unlabeled_count)
    distributions = {}
    for param, split_values in splits.items():
        values = instances[param]

        # Segment boundaries: the sorted split points, enclosed by the
        # smallest and largest values observed for this parameter.
        boundaries = [values.min()] + sorted(split_values) + [values.max()]

        segments = []
        for i in range(len(boundaries) - 1):
            low = boundaries[i]
            high = boundaries[i + 1]

            in_segment = values <= high
            if i > 0:
                # Only the first segment includes its lower bound.
                in_segment &= values > low

            segment_labels = labels[in_segment]
            good = (segment_labels == 'good').sum()
            bad = (segment_labels == 'bad').sum()
            # isna() counts every missing-value marker (NaN, None, ...), not
            # just an index entry that happens to compare equal to np.nan.
            unlabeled = segment_labels.isna().sum()
            segments.append((low, high, int(good), int(bad), int(unlabeled)))

        distributions[param] = segments

    return distributions
Esempio n. 2
0
def get_distributions(tree, instances):
    """
    Calculate piecewise distributions of good, bad, and unlabeled instances.

    For every attribute the tree splits on, the observed value range of that
    attribute is cut at the tree's split values, and the instances falling
    into each resulting segment are counted per label.

    Parameters
    ----------
    tree : TreeNode
        The decision tree. Only ``get_internal_nodes()`` is used; each returned
        node must expose ``split_attribute`` and ``split_value``.
    instances : DataFrame
        Labeled instances used to train the decision tree. Must have a column
        for each attribute in the tree, and a 'label' column with values 'good',
        'bad', or NaN (unlabeled).

    Returns
    -------
    dict
        key: parameter name
        value: list of tuples, each of which describes a range of values for the
        parameter, delimited by a pair of split points in the tree:
        (low, high, good_count, bad_count, unlabeled_count)

        The first segment starts at the smallest observed value and the last
        one ends at the largest observed value. Every segment includes its
        upper bound; the first segment has no lower-bound test, so instances at
        the minimum are counted, while later segments exclude their lower
        bound.
    """

    # key: parameter name
    # value: list of split values
    splits = {}
    for node in tree.get_internal_nodes():
        splits.setdefault(node.split_attribute, []).append(node.split_value)

    labels = instances['label']

    # key: parameter name
    # value: list of tuples: (low, high, good_count, bad_count, unlabeled_count)
    distributions = {}
    for param, split_values in splits.items():
        values = instances[param]

        # Segment boundaries: the sorted split points, enclosed by the
        # smallest and largest values observed for this parameter.
        boundaries = [values.min()] + sorted(split_values) + [values.max()]

        segments = []
        for i in range(len(boundaries) - 1):
            low = boundaries[i]
            high = boundaries[i + 1]

            in_segment = values <= high
            if i > 0:
                # Only the first segment includes its lower bound.
                in_segment &= values > low

            segment_labels = labels[in_segment]
            good = (segment_labels == 'good').sum()
            bad = (segment_labels == 'bad').sum()
            # isna() counts every missing-value marker (NaN, None, ...), not
            # just an index entry that happens to compare equal to np.nan.
            unlabeled = segment_labels.isna().sum()
            segments.append((low, high, int(good), int(bad), int(unlabeled)))

        distributions[param] = segments

    return distributions
Esempio n. 3
0
def get_parameter_distributions(tree):
    """
    Calculate parameter distributions based on the given decision tree.

    For each parameter/attribute, a piecewise distribution is computed. Each
    segment of this distribution has two endpoints (on the 'x' axis) and a
    constant weight (on the 'y' axis). The weight of a segment is the sum of
    the number of "good" instances within its range, minus the number of "bad"
    instances. Each leaf of the tree may contribute a segment of this
    distribution, based on the attribute split points along the path from the
    leaf to the root.

    Parameters
    ----------
    tree : TreeNode
        The root node of the tree.

    Returns
    -------
    dict
        key: parameter name
        value: list of non-overlapping weighted segments, each one of which is a
               tuple in the form (low, high, weight)
    """

    # upper and lower limits for parameter ranges
    # key: parameter name
    # value: (low, high)
    limits = {}

    # Assemble the parameter ranges for each leaf into a list of weighted
    # segments for each parameter.
    # key: parameter name
    # value: list of (low, high, weight) segments for the parameter
    segments = {}

    for leaf in tree.get_leaves():
        # Correctly classified instances count positively for 'good' leaves
        # and negatively for 'bad' leaves.
        weight = leaf.instance_count - leaf.misclassified_count
        if leaf.class_label == 'bad':
            weight = -weight

        for param, (low, high) in get_ranges_for_leaf(leaf).items():

            # Ranges can have None for low or high, if there is no lower or
            # upper bound, respectively. Replace None with the lowest or highest
            # valid value for the parameter.
            if param not in limits:
                param_base_name = util.remove_trailing_digits(param)
                limits[param] = valid_param_ranges[param_base_name]

            # Test for None explicitly: a bound of 0 is a real bound and must
            # not be replaced by the valid-range limit.
            if low is None:
                low = limits[param][0]
            if high is None:
                high = limits[param][1]

            segments.setdefault(param, []).append((low, high, weight))

    # For each parameter, combine the weighted segments.
    # key: parameter name
    # value: list of combined segments for the parameter, none overlapping
    combined_segments = {}
    for param, param_segments in segments.items():
        combined_segments[param] = combine_weighted_segments(param_segments)

    return combined_segments
Esempio n. 4
0
def get_parameter_distributions(tree):
    """
    Calculate parameter distributions based on the given decision tree.

    For each parameter/attribute, a piecewise distribution is computed. Each
    segment of this distribution has two endpoints (on the 'x' axis) and a
    constant weight (on the 'y' axis). The weight of a segment is the sum of
    the number of "good" instances within its range, minus the number of "bad"
    instances. Each leaf of the tree may contribute a segment of this
    distribution, based on the attribute split points along the path from the
    leaf to the root.

    Parameters
    ----------
    tree : TreeNode
        The root node of the tree.

    Returns
    -------
    dict
        key: parameter name
        value: list of non-overlapping weighted segments, each one of which is a
               tuple in the form (low, high, weight)
    """

    # upper and lower limits for parameter ranges
    # key: parameter name
    # value: (low, high)
    limits = {}

    # Assemble the parameter ranges for each leaf into a list of weighted
    # segments for each parameter.
    # key: parameter name
    # value: list of (low, high, weight) segments for the parameter
    segments = {}

    for leaf in tree.get_leaves():
        # Correctly classified instances count positively for 'good' leaves
        # and negatively for 'bad' leaves.
        weight = leaf.instance_count - leaf.misclassified_count
        if leaf.class_label == 'bad':
            weight = -weight

        for param, (low, high) in get_ranges_for_leaf(leaf).items():

            # Ranges can have None for low or high, if there is no lower or
            # upper bound, respectively. Replace None with the lowest or highest
            # valid value for the parameter.
            if param not in limits:
                param_base_name = util.remove_trailing_digits(param)
                limits[param] = valid_param_ranges[param_base_name]

            # Test for None explicitly: a bound of 0 is a real bound and must
            # not be replaced by the valid-range limit.
            if low is None:
                low = limits[param][0]
            if high is None:
                high = limits[param][1]

            segments.setdefault(param, []).append((low, high, weight))

    # For each parameter, combine the weighted segments.
    # key: parameter name
    # value: list of combined segments for the parameter, none overlapping
    combined_segments = {}
    for param, param_segments in segments.items():
        combined_segments[param] = combine_weighted_segments(param_segments)

    return combined_segments