Example #1
0
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.

        :param input_data: dict of input field values keyed by field Id
        :param path: list of rules traversed so far (built up recursively)
        :param missing_strategy: LAST_PREDICTION or PROPORTIONAL
        :return: a Prediction object
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            (final_distribution, d_min, d_max, last_node, population,
             parent_node) = self.predict_proportional(input_data, path=path)

            if self.regression:
                # singular case:
                # when the prediction is the one given in a 1-instance node
                if len(final_distribution.items()) == 1:
                    # dict views are not subscriptable in Python 3:
                    # materialize the items before indexing
                    prediction, instances = list(
                        final_distribution.items())[0]
                    if instances == 1:
                        return Prediction(
                            last_node.output,
                            path,
                            last_node.confidence,
                            distribution=(last_node.weighted_distribution
                                          if self.weighted
                                          else last_node.distribution),
                            count=instances,
                            median=last_node.median,
                            distribution_unit=last_node.distribution_unit,
                            children=last_node.children,
                            d_min=last_node.min,
                            d_max=last_node.max)
                # when there's more instances, sort elements by their mean
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: x[0])
                ]
                distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                     else 'counts')
                distribution = merge_bins(distribution, BINS_LIMIT)
                total_instances = sum(
                    [instances for _, instances in distribution])
                if len(distribution) == 1:
                    # where there's only one bin, there will be no error, but
                    # we use a correction derived from the parent's error
                    prediction = distribution[0][0]
                    if total_instances < 2:
                        total_instances = 1
                    try:
                        # some strange models can have nodes with no confidence
                        confidence = round(
                            parent_node.confidence /
                            math.sqrt(total_instances), PRECISION)
                    except AttributeError:
                        confidence = None
                else:
                    prediction = mean(distribution)
                    confidence = round(
                        regression_error(
                            unbiased_sample_variance(distribution, prediction),
                            total_instances), PRECISION)
                return Prediction(prediction,
                                  path,
                                  confidence,
                                  distribution=distribution,
                                  count=total_instances,
                                  median=dist_median(distribution,
                                                     total_instances),
                                  distribution_unit=distribution_unit,
                                  children=last_node.children,
                                  d_min=d_min,
                                  d_max=d_max)
            else:
                # classification: pick the most frequent category, breaking
                # frequency ties by category name
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: (-x[1], x[0]))
                ]
                return Prediction(distribution[0][0],
                                  path,
                                  ws_confidence(distribution[0][0],
                                                final_distribution,
                                                ws_n=population),
                                  distribution=distribution,
                                  count=population,
                                  median=None,
                                  distribution_unit='categorical',
                                  children=last_node.children)

        else:
            # LAST_PREDICTION: descend into the first child whose predicate
            # matches; recursion ends at a leaf or when no predicate applies
            if self.children:
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path=path)

            if self.weighted:
                output_distribution = self.weighted_distribution
                output_unit = self.weighted_distribution_unit
            else:
                output_distribution = self.distribution
                output_unit = self.distribution_unit

            return Prediction(
                self.output,
                path,
                self.confidence,
                distribution=output_distribution,
                count=get_instances(output_distribution),
                median=None if not self.regression else self.median,
                distribution_unit=output_unit,
                children=self.children,
                d_min=None if not self.regression else self.min,
                d_max=None if not self.regression else self.max)
Example #2
0
File: tree.py  Project: david-x-chen/python
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.

        :param input_data: dict of input field values keyed by field Id
        :param path: list of rules traversed so far (built up recursively)
        :param missing_strategy: LAST_PREDICTION or PROPORTIONAL
        :return: a Prediction object
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            (final_distribution,
             last_node) = self.predict_proportional(input_data, path=path)

            if self.regression:
                # singular case:
                # when the prediction is the one given in a 1-instance node
                if len(final_distribution.items()) == 1:
                    # dict views are not subscriptable in Python 3:
                    # materialize the items before indexing
                    prediction, instances = list(
                        final_distribution.items())[0]
                    if instances == 1:
                        return Prediction(
                            last_node.output,
                            path,
                            last_node.confidence,
                            distribution=last_node.distribution,
                            count=instances,
                            median=last_node.median,
                            distribution_unit=last_node.distribution_unit,
                            children=last_node.children)
                # when there's more instances, sort elements by their mean
                distribution = [list(element) for element in
                                sorted(final_distribution.items(),
                                       key=lambda x: x[0])]
                distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                     else 'counts')
                distribution = merge_bins(distribution, BINS_LIMIT)
                total_instances = sum([instances
                                       for _, instances in distribution])
                prediction = mean(distribution)
                confidence = regression_error(
                    unbiased_sample_variance(distribution, prediction),
                    total_instances)
                return Prediction(
                    prediction,
                    path,
                    confidence,
                    distribution=distribution,
                    count=total_instances,
                    median=dist_median(distribution, total_instances),
                    distribution_unit=distribution_unit,
                    children=last_node.children)
            else:
                # classification: pick the most frequent category, breaking
                # frequency ties by category name
                distribution = [list(element) for element in
                                sorted(final_distribution.items(),
                                       key=lambda x: (-x[1], x[0]))]
                return Prediction(
                    distribution[0][0],
                    path,
                    ws_confidence(distribution[0][0], final_distribution),
                    distribution=distribution,
                    count=get_instances(distribution),
                    median=None,
                    distribution_unit='categorical',
                    children=last_node.children)

        else:
            # LAST_PREDICTION: descend into the first child whose predicate
            # matches; recursion ends at a leaf or when no predicate applies
            if self.children:
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path=path)

            return Prediction(
                self.output,
                path,
                self.confidence,
                distribution=self.distribution,
                count=get_instances(self.distribution),
                median=None if not self.regression else self.median,
                distribution_unit=self.distribution_unit,
                children=self.children)
Example #3
0
def regression_proportional_predict(tree, weighted, fields, input_data):
    """Proportional prediction for regressions

    """

    offset = OFFSETS[str(weighted)]
    (final_distribution, d_min, d_max, last_node, population,
     parent_node, path) = proportional_predict(
         tree, offset, fields, input_data, path=None)
    items = list(final_distribution.items())
    # Nodes are flat records indexed via the offsets map; a zero child
    # count marks a leaf.
    node_children = ([] if last_node[offset["children#"]] == 0
                     else last_node[offset["children"]])
    # singular case:
    # when the prediction is the one given in a 1-instance node
    if len(items) == 1:
        prediction, instances = items[0]
        if instances == 1:
            node_distribution = (last_node[offset["wdistribution"]] if weighted
                                 else last_node[offset["distribution"]])
            return Prediction(
                last_node[offset["output"]],
                path,
                last_node[offset["confidence"]],
                distribution=node_distribution,
                count=instances,
                median=last_node[offset["median"]],
                distribution_unit=last_node[offset["distribution_unit"]],
                children=node_children,
                d_min=last_node[offset["min"]],
                d_max=last_node[offset["max"]])
    # when there's more instances, sort elements by their mean
    distribution = sorted((list(pair) for pair in items),
                          key=lambda pair: pair[0])
    distribution_unit = ('counts' if len(distribution) <= BINS_LIMIT
                         else 'bins')
    distribution = merge_bins(distribution, BINS_LIMIT)
    total_instances = sum(count for _, count in distribution)
    if len(distribution) == 1:
        # where there's only one bin, there will be no error, but
        # we use a correction derived from the parent's error
        prediction = distribution[0][0]
        if total_instances < 2:
            total_instances = 1
        try:
            # some strange models can have nodes with no confidence
            confidence = round(
                parent_node[offset["confidence"]] / math.sqrt(total_instances),
                PRECISION)
        except AttributeError:
            confidence = None
    else:
        prediction = mean(distribution)
        # weighted trees use the unweighted population to
        # compute the associated error
        variance = unbiased_sample_variance(distribution, prediction)
        confidence = round(regression_error(variance, population), PRECISION)
    return Prediction(
        prediction,
        path,
        confidence,
        distribution=distribution,
        count=total_instances,
        median=dist_median(distribution, total_instances),
        distribution_unit=distribution_unit,
        children=node_children,
        d_min=d_min,
        d_max=d_max)