Ejemplo n.º 1
0
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Predicts by walking down the tree from this node.

        The input fields must be keyed by Id. The rule of every predicate
        that matches on the way down is appended to `path` (a new list is
        used when `path` is None), and the walk stops at the first node
        where no child predicate applies to the input data.

        Strategies available when the value of the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: all possible outcomes are considered to create
                              an average prediction. The call is delegated
                              to `predict_proportional`.
        """
        rules = [] if path is None else path

        if missing_strategy == PROPORTIONAL:
            return self.predict_proportional(input_data, path=rules)

        # go down through the first branch whose predicate accepts the input
        for branch in self.children or ():
            if branch.predicate.apply(input_data, self.fields):
                rules.append(branch.predicate.to_rule(self.fields))
                return branch.predict(input_data, path=rules)

        # no branch applies (or there are none): this node is the answer
        return Prediction(
            self.output,
            rules,
            None,
            distribution=None,
            count=self.count,
            median=None,
            distribution_unit=None,
            children=self.children,
            d_min=None,
            d_max=None)
Ejemplo n.º 2
0
def boosting_last_predict(tree, fields, input_data, path=None):
    """Last prediction strategy for boosting trees

    Starting at `tree`, moves to the first child whose predicate applies
    to `input_data` and repeats from there, appending the matching rule to
    `path` at every step (a new list is used when `path` is None). The
    Prediction is built from the first node where no child matches.

    """
    rules = [] if path is None else path
    current = tree

    while True:
        node = get_node(current)

        if node[OFFSETS["children#"]] == 0:
            branches = []
        else:
            branches = node[OFFSETS["children"]]
        instances = node[OFFSETS["count"]]

        for branch in branches or ():
            operator, field, value, term, missing = get_predicate(branch)
            if apply_predicate(operator, field, value, term, missing,
                               input_data, fields[field]):
                rules.append(predicate_to_rule(operator, fields[field],
                                               value, term, missing))
                # keep descending from the matching child
                current = branch
                break
        else:
            # no child matched: the current node holds the prediction
            return Prediction(
                node[OFFSETS["output"]],
                rules,
                None,
                distribution=None,
                count=instances,
                median=None,
                distribution_unit=None,
                children=branches,
                d_min=None,
                d_max=None)
Ejemplo n.º 3
0
def last_prediction_predict(tree, offsets, fields, input_data, path=None):
    """ Predictions for last prediction missing strategy

    Follows the first child whose predicate applies to `input_data`,
    recursively, appending the rule of every matching predicate to `path`.
    When no child matches, the Prediction is built from the node reached.

    tree: (sub)tree to predict with; its node is read through `get_node`
    offsets: dict that maps attribute names to their position in a node
    fields: dict of fields, keyed by the field referenced in the predicates
    input_data: input data to be predicted
    path: list of rules collected so far. It is modified in place; a new
          list is created when it is None

    The weighted distribution is used whenever `offsets` has a
    "wdistribution" entry. Median, min and max are None when `offsets`
    has no entry for them.
    """

    if path is None:
        path = []

    node = get_node(tree)

    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]

    for child in children:
        [operator, field, value, term, missing] = get_predicate(child)
        if apply_predicate(operator, field, value, term, missing,
                           input_data, fields[field]):
            new_rule = predicate_to_rule(operator, fields[field], value,
                                         term, missing)
            path.append(new_rule)
            return last_prediction_predict(child,
                                           offsets, fields,
                                           input_data, path=path)

    weighted = "wdistribution" in offsets
    output_distribution = node[
        offsets["wdistribution" if weighted else "distribution"]]

    if "distribution_unit" not in offsets:
        output_unit = 'categories'
    elif weighted and "wdistribution_unit" in offsets:
        output_unit = node[offsets["wdistribution_unit"]]
    else:
        # also reached by weighted nodes that have no weighted unit slot:
        # the guard used to test "distribution_unit" while reading
        # "wdistribution_unit", which raised KeyError in that case
        output_unit = node[offsets["distribution_unit"]]

    return Prediction(
        node[offsets["output"]],
        path,
        node[offsets["confidence"]],
        distribution=output_distribution,
        count=node[offsets["count"]],
        median=None if offsets.get("median") is None else
        node[offsets["median"]],
        distribution_unit=output_unit,
        children=children,
        d_min=None if offsets.get("min") is None else
        node[offsets["min"]],
        d_max=None if offsets.get("max") is None else
        node[offsets["max"]])
Ejemplo n.º 4
0
def classification_proportional_predict(tree, weighted, fields, input_data):
    """Prediction for classification using proportional strategy

    tree: tree to predict with, handed over to `proportional_predict`
    weighted: selects the offsets table to use, `OFFSETS[str(weighted)]`
    fields: fields structure, handed over to `proportional_predict`
    input_data: input data to be predicted

    The distribution returned by `proportional_predict` is sorted by
    decreasing number of instances (ties are broken by category) and its
    first category becomes the prediction.
    """
    offset = OFFSETS[str(weighted)]
    (final_distribution, _, _, last_node, population,
     _, path) = proportional_predict(
         tree, offset, fields, input_data, path=None)

    distribution = [list(element) for element in
                    sorted(final_distribution.items(),
                           key=lambda x: (-x[1], x[0]))]
    prediction = distribution[0][0]
    # Keyword arguments, like every other Prediction call: the positional
    # order of these parameters is not visible from here, so the median
    # and the distribution unit are named to make sure they cannot be
    # swapped.
    return Prediction(
        prediction,
        path,
        ws_confidence(prediction, final_distribution,
                      ws_n=population),
        distribution=distribution,
        count=population,
        median=None,
        distribution_unit='categories',
        children=[] if last_node[offset["children#"]] == 0 else
        last_node[offset["children"]])
Ejemplo n.º 5
0
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.

        input_data: input data to be predicted
        path: list of rules collected so far. It is modified in place and
              a new list is created when it is None
        missing_strategy: LAST_PREDICTION|PROPORTIONAL

        Returns a Prediction object. With the PROPORTIONAL strategy the
        distribution computed by `predict_proportional` is summarized:
        regressions predict the mean of the (merged) distribution, or the
        only value when a single bin is left, and classifications predict
        the most frequent category.
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            (final_distribution, d_min, d_max, last_node, population,
             parent_node) = self.predict_proportional(input_data, path=path)

            if self.regression:
                # Dictionary views cannot be indexed in Python 3, so the
                # items are materialized before reading the first one
                entries = list(final_distribution.items())
                # singular case:
                # when the prediction is the one given in a 1-instance node
                if len(entries) == 1:
                    _, instances = entries[0]
                    if instances == 1:
                        return Prediction(
                            last_node.output,
                            path,
                            last_node.confidence,
                            distribution=(
                                last_node.distribution
                                if not self.weighted else
                                last_node.weighted_distribution),
                            count=instances,
                            median=last_node.median,
                            distribution_unit=last_node.distribution_unit,
                            children=last_node.children,
                            d_min=last_node.min,
                            d_max=last_node.max)
                # when there's more instances, sort elements by their mean
                distribution = [
                    list(element)
                    for element in sorted(entries, key=lambda x: x[0])
                ]
                # the unit is decided before merging, while the original
                # number of elements is still known
                distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                     else 'counts')
                distribution = merge_bins(distribution, BINS_LIMIT)
                total_instances = sum(
                    [instances for _, instances in distribution])
                if len(distribution) == 1:
                    # where there's only one bin, there will be no error, but
                    # we use a correction derived from the parent's error
                    prediction = distribution[0][0]
                    if total_instances < 2:
                        total_instances = 1
                    try:
                        # some strange models can have nodes with no confidence
                        confidence = round(
                            parent_node.confidence /
                            math.sqrt(total_instances), PRECISION)
                    except AttributeError:
                        confidence = None
                else:
                    prediction = mean(distribution)
                    confidence = round(
                        regression_error(
                            unbiased_sample_variance(distribution, prediction),
                            total_instances), PRECISION)
                return Prediction(prediction,
                                  path,
                                  confidence,
                                  distribution=distribution,
                                  count=total_instances,
                                  median=dist_median(distribution,
                                                     total_instances),
                                  distribution_unit=distribution_unit,
                                  children=last_node.children,
                                  d_min=d_min,
                                  d_max=d_max)
            else:
                # categories sorted by decreasing number of instances,
                # ties are broken by category
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: (-x[1], x[0]))
                ]
                return Prediction(distribution[0][0],
                                  path,
                                  ws_confidence(distribution[0][0],
                                                final_distribution,
                                                ws_n=population),
                                  distribution=distribution,
                                  count=population,
                                  median=None,
                                  distribution_unit='categorical',
                                  children=last_node.children)

        else:
            if self.children:
                # go down through the first child whose predicate applies
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path=path)

            if self.weighted:
                output_distribution = self.weighted_distribution
                output_unit = self.weighted_distribution_unit
            else:
                output_distribution = self.distribution
                output_unit = self.distribution_unit

            return Prediction(
                self.output,
                path,
                self.confidence,
                distribution=output_distribution,
                count=get_instances(output_distribution),
                median=None if not self.regression else self.median,
                distribution_unit=output_unit,
                children=self.children,
                d_min=None if not self.regression else self.min,
                d_max=None if not self.regression else self.max)
Ejemplo n.º 6
0
    def _predict(self,
                 input_data,
                 missing_strategy=LAST_PREDICTION,
                 operating_point=None,
                 operating_kind=None,
                 unused_fields=None):
        """Predicts from already checked input data and returns a dict.

        The types of the input provided are not checked here, so calling
        this function directly without prior checking is unsafe.

        The dict is built from the attributes of the Prediction object:
        `output` is renamed to `prediction`, `children` is replaced by
        `next` (the field of the first child, by name when it can be found
        in `self.model_fields`), `probability` is added for models that are
        neither regressions nor boostings, and `unused_fields` is added
        when provided.

        Raises ValueError when `operating_point` or `operating_kind` are
        used in a regression.

        """
        # Operating points and kinds need the probabilities (or
        # confidences) of every class, so they are handled by the
        # `predict_operating` and `predict_operating_kind` methods
        if operating_point:
            if self.regression:
                raise ValueError("The operating_point argument can only be"
                                 " used in classifications.")
            return self.predict_operating(
                input_data,
                missing_strategy=missing_strategy,
                operating_point=operating_point)

        if operating_kind:
            if self.regression:
                raise ValueError("The operating_kind argument can only be"
                                 " used in classifications.")
            return self.predict_operating_kind(
                input_data,
                missing_strategy=missing_strategy,
                operating_kind=operating_kind)

        node_prediction = tree_predict(
            self.tree, self.tree_type, self.weighted, self.fields,
            input_data, missing_strategy=missing_strategy)

        if self.boosting and missing_strategy == PROPORTIONAL:
            # boosting trees return sums here, not a Prediction object,
            # so the output is computed from them
            g_sum, h_sum, population, path = node_prediction
            node_prediction = Prediction(
                -g_sum / (h_sum + self.boosting.get("lambda", 1)),
                path,
                None,
                distribution=None,
                count=population,
                median=None,
                distribution_unit=None)

        # the attributes dict of the object is used as the result itself
        result = vars(node_prediction)
        result['prediction'] = result.pop('output')

        # field used in the next split, if any
        if len(node_prediction.children) == 0:
            next_field = None
        else:
            next_field = node_prediction.children[0][FIELD_OFFSET]
        if next_field is not None and next_field in self.model_fields:
            next_field = self.model_fields[next_field]['name']
        result['next'] = next_field
        del result['children']

        if not (self.regression or self.boosting):
            probabilities = self._probabilities(result['distribution'])
            result['probability'] = probabilities[result['prediction']]

        # fields of the input data not used by the model, if any
        if unused_fields:
            result['unused_fields'] = unused_fields

        return result
Ejemplo n.º 7
0
def regression_proportional_predict(tree, weighted, fields, input_data):
    """Proportional prediction for regressions

    tree: tree to predict with, handed over to `proportional_predict`
    weighted: selects the offsets table to use, `OFFSETS[str(weighted)]`,
              and whether the weighted distribution of the node is used
    fields: fields structure, handed over to `proportional_predict`
    input_data: input data to be predicted

    The distribution returned by `proportional_predict` is summarized in
    one of three ways:
      - a single value seen in a single instance: the information stored
        in the last node is returned as is
      - a single bin after merging: its value is the prediction and the
        confidence is derived from the parent node's confidence (None when
        that confidence is not available)
      - otherwise: the prediction is the mean of the distribution
    """

    offset = OFFSETS[str(weighted)]
    (final_distribution, d_min, d_max, last_node, population,
     parent_node, path) = proportional_predict(
         tree, offset, fields, input_data, path=None)
    children = ([] if last_node[offset["children#"]] == 0 else
                last_node[offset["children"]])
    entries = list(final_distribution.items())
    # singular case:
    # when the prediction is the one given in a 1-instance node
    if len(entries) == 1:
        _, instances = entries[0]
        if instances == 1:
            return Prediction(
                last_node[offset["output"]],
                path,
                last_node[offset["confidence"]],
                distribution=last_node[offset["distribution"]]
                if not weighted else
                last_node[offset["wdistribution"]],
                count=instances,
                median=last_node[offset["median"]],
                distribution_unit=last_node[offset["distribution_unit"]],
                children=children,
                d_min=last_node[offset["min"]],
                d_max=last_node[offset["max"]])
    # when there's more instances, sort elements by their mean
    distribution = [
        list(element) for element in sorted(entries, key=lambda x: x[0])
    ]
    # the unit is decided before merging, while the original number of
    # elements is still known
    distribution_unit = ('bins'
                         if len(distribution) > BINS_LIMIT else 'counts')
    distribution = merge_bins(distribution, BINS_LIMIT)
    total_instances = sum([instances for _, instances in distribution])
    if len(distribution) == 1:
        # where there's only one bin, there will be no error, but
        # we use a correction derived from the parent's error
        prediction = distribution[0][0]
        if total_instances < 2:
            total_instances = 1
        try:
            # some strange models can have nodes with no confidence
            confidence = round(
                parent_node[offset["confidence"]] / math.sqrt(total_instances),
                PRECISION)
        except (AttributeError, TypeError):
            # Nodes are read by position here, so a missing parent node or
            # a None confidence raises TypeError rather than AttributeError
            confidence = None
    else:
        prediction = mean(distribution)
        # weighted trees use the unweighted population to
        # compute the associated error
        confidence = round(
            regression_error(
                unbiased_sample_variance(distribution, prediction),
                population), PRECISION)
    return Prediction(
        prediction,
        path,
        confidence,
        distribution=distribution,
        count=total_instances,
        median=dist_median(distribution, total_instances),
        distribution_unit=distribution_unit,
        children=children,
        d_min=d_min,
        d_max=d_max)
Ejemplo n.º 8
0
    def predict(self,
                input_data,
                by_name=True,
                print_path=False,
                out=sys.stdout,
                with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False,
                add_median=False,
                add_next=False,
                add_min=False,
                add_max=False,
                add_unused_fields=False,
                multiple=None):
        """Returns the prediction of the model for the given input data.

        The input data is filtered with `filter_input_data` and cast to the
        types of the model fields before being handed over to the tree.

        The shape of the returned value depends on the arguments: the bare
        predicted value by default, a list when `with_confidence` is set,
        a list of dicts when `multiple` is used in a classification and
        a dict when any of the `add_*` flags is set. `multiple` takes
        precedence over the `add_*` flags and both take precedence over
        `with_confidence`.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by field name
                 (default), False if it is keyed by field id
        print_path: Boolean, if True the rules that lead to the prediction
                    are written to `out`
        out: output handler used when `print_path` is set
        with_confidence: Boolean, if True the prediction, confidence,
                         distribution, count and median are returned
                         as a list
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if True adds confidence to the dict output
        add_path: Boolean, if True adds path to the dict output
        add_distribution: Boolean, if True adds the distribution and its
                          unit to the dict output
        add_count: Boolean, if True adds the number of instances in the
                   node to the dict output
        add_median: Boolean, if True adds the median of the values in
                    the distribution (non-boosting regressions only)
        add_next: Boolean, if True adds the field that determines next
                  split in the tree
        add_min: Boolean, if True adds the minimum value in the prediction's
                 distribution (non-boosting regressions only)
        add_max: Boolean, if True adds the maximum value in the prediction's
                 distribution (non-boosting regressions only)
        add_unused_fields: Boolean, if True adds the information about the
                           fields in the input_data that are not being used
                           in the model as predictors.
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of dicts:
                    [{'prediction': 'Iris-setosa',
                      'confidence': 0.9154,
                      'probability': 0.97,
                      'count': 97},
                     {'prediction': 'Iris-virginica',
                      'confidence': 0.0103,
                      'probability': 0.03,
                      'count': 3}]
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        Raises ImportError when the proportional missing strategy is used
        in a non-boosting regression and `self.regression_ready` is false.

        """
        # Proportional predictions for regressions need numpy and scipy
        if (not self.boosting and self.regression
                and missing_strategy == PROPORTIONAL
                and not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")

        # Keeps only the fields used in the model. The unused ones are
        # returned too when they have been asked for
        filtered = self.filter_input_data(
            input_data, by_name=by_name,
            add_unused_fields=add_unused_fields)
        if add_unused_fields:
            input_data, unused_fields = filtered
        else:
            input_data = filtered

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction = self.tree.predict(input_data,
                                       missing_strategy=missing_strategy)

        if self.boosting and missing_strategy == PROPORTIONAL:
            # boosting trees return sums here, not a Prediction object,
            # so the output is computed from them
            g_sum, h_sum, population, path = prediction
            prediction = Prediction(
                -g_sum / (h_sum + self.boosting.get("lambda", 1)),
                path,
                None,
                distribution=None,
                count=population,
                median=None,
                distribution_unit=None)

        # Rules followed to reach the prediction
        if print_path:
            rules = u' AND '.join(prediction.path)
            out.write(utf8(rules + u' => %s \n' % prediction.output))
            out.flush()

        if with_confidence:
            output = [
                prediction.output, prediction.confidence,
                prediction.distribution, prediction.count, prediction.median
            ]
        else:
            output = prediction.output

        if multiple is not None and not self.regression:
            # one dict per category of the distribution, up to the limit
            total_instances = float(prediction.count)
            output = [
                {'prediction': category,
                 'confidence': ws_confidence(category,
                                             prediction.distribution),
                 'probability': instances / total_instances,
                 'count': instances}
                for index, [category, instances]
                in enumerate(prediction.distribution)
                if ((isinstance(multiple, basestring) and multiple == 'all')
                    or (isinstance(multiple, int) and index < multiple))
            ]
        elif any((add_confidence, add_path, add_distribution, add_count,
                  add_median, add_next, add_min, add_max,
                  add_unused_fields)):
            output = {'prediction': prediction.output}
            if add_confidence:
                output['confidence'] = prediction.confidence
            if add_path:
                output['path'] = prediction.path
            if add_distribution:
                output['distribution'] = prediction.distribution
                output['distribution_unit'] = prediction.distribution_unit
            if add_count:
                output['count'] = prediction.count
            if add_next:
                next_field = None
                if len(prediction.children) != 0:
                    next_field = prediction.children[0].predicate.field
                # the name of the field is used when it can be found
                if next_field is not None and next_field in self.fields:
                    next_field = self.fields[next_field]['name']
                output['next'] = next_field
            if not self.boosting and self.regression:
                if add_median:
                    output['median'] = prediction.median
                if add_min:
                    output['min'] = prediction.min
                if add_max:
                    output['max'] = prediction.max
            if add_unused_fields:
                output['unused_fields'] = unused_fields
        return output