Example #1
def combine_multivote(multivote, other_label=OTHER):
    """Combine in a global distribution the distribution of predictions
       obtained with models when each one is built on a subset of training
       data that has a subset of categories.

    """
    predictions = multivote.predictions
    global_distribution = []
    for prediction in predictions:
        prediction_category = None
        prediction_instances = 0
        for category, instances in prediction['distribution']:
            if category != other_label:
                if instances > prediction_instances:
                    prediction_category = category
                    prediction_instances = instances
        if prediction_category is not None:
            prediction_confidence = ws_confidence(
                prediction_category, prediction['distribution'])
            global_distribution.append([prediction_category,
                                        prediction_confidence])
    if global_distribution:
        prediction = sorted(global_distribution, key=lambda x: x[1],
                            reverse=True)[0]
    else:
        prediction = [None, None]
    return prediction
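
This snippet, like the others on this page, delegates the confidence computation to ws_confidence. As a hedged, standalone sketch (not the library's own code), that helper is usually described as returning the lower bound of a Wilson score interval over a category's share of the distribution; the function and argument names below are illustrative assumptions:

import math

def wilson_lower_bound(category, distribution, z=1.96, n=None):
    """Lower bound of the Wilson score interval for `category`.

    `distribution` is a list of [category, instances] pairs, as in the
    examples on this page; `n` optionally overrides the total count.
    """
    counts = dict(distribution)
    total = n if n is not None else sum(counts.values())
    if total == 0:
        return 0.0
    p = counts.get(category, 0) / float(total)
    z2 = z * z
    return ((p + z2 / (2.0 * total)
             - z * math.sqrt((p * (1 - p) + z2 / (4.0 * total)) / total))
            / (1 + z2 / total))

For instance, wilson_lower_bound('Iris-setosa', [['Iris-setosa', 97], ['Iris-virginica', 3]]) is roughly 0.915, which lines up with the 0.9154 confidence quoted for a 97-out-of-100 category in the docstrings further down this page.
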
Example #2
def combine_multivote(multivote, other_label=OTHER):
    """Combine in a global distribution the distribution of predictions
       obtained with models when each one is built on a subset of training
       data that has a subset of categories.

    """
    predictions = multivote.predictions
    global_distribution = []
    for prediction in predictions:
        prediction_category = None
        prediction_instances = 0
        for category, instances in prediction['distribution']:
            if category != other_label:
                if instances > prediction_instances:
                    prediction_category = category
                    prediction_instances = instances
        if prediction_category is not None:
            prediction_confidence = ws_confidence(prediction_category,
                                                  prediction['distribution'])
            global_distribution.append(
                [prediction_category, prediction_confidence])
    if global_distribution:
        prediction = sorted(global_distribution,
                            key=lambda x: x[1],
                            reverse=True)[0]
    else:
        prediction = [None, None]
    return prediction
Example #3
    def predict_confidence(self,
                           input_data,
                           missing_strategy=LAST_PREDICTION,
                           compact=False):
        """For classification models, Predicts a one-vs.-rest confidence value
        for each possible output class, based on input values.  This
        confidence value is a lower confidence bound on the predicted
        probability of the given class.  The input fields must be a
        dictionary keyed by field name for field ID.

        For regressions, the output is a single element list
        containing the prediction.

        :param input_data: Input data to be predicted
        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                                 for missing fields
        :param compact: If False, prediction is returned as a list of maps, one
                        per class, with the keys "prediction" and "confidence"
                        mapped to the name of the class and its confidence,
                        respectively.  If True, returns a list of confidences
                        ordered by the sorted order of the class names.

        """
        if self.regression:
            prediction = self.predict(input_data,
                                      missing_strategy=missing_strategy,
                                      full=not compact)

            if compact:
                output = [prediction]
            else:
                output = cast_prediction(prediction,
                                         to=DICTIONARY,
                                         confidence=True)
            return output

        if self.boosting:
            raise AttributeError("This method is available for non-boosting"
                                 " models only.")

        root_dist = self.root_distribution
        category_map = {category[0]: 0.0 for category in root_dist}
        prediction = self.predict(input_data,
                                  missing_strategy=missing_strategy,
                                  full=True)

        distribution = prediction['distribution']
        population = prediction['count']

        for class_info in distribution:
            name = class_info[0]
            category_map[name] = ws_confidence(name,
                                               distribution,
                                               ws_n=population)

        return self._to_output(category_map, compact, "confidence")
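
A hedged usage sketch for the method above; `local_model` and the input field name are illustrative assumptions rather than anything taken from the original source:

# Hypothetical usage; `local_model` and the field name are assumptions.
input_data = {"petal length": 4.2}

# Default output: one dict per class, keyed by "prediction" and "confidence".
per_class = local_model.predict_confidence(input_data)
for entry in per_class:
    print(entry["prediction"], entry["confidence"])

# compact=True: just the confidences, ordered by the sorted class names.
confidences = local_model.predict_confidence(input_data, compact=True)
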
Example #4
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            (final_distribution,
             last_node) = self.predict_proportional(input_data, path=path)

            if self.regression:
                # singular case:
                # when the prediction is the one given in a 1-instance node
                if len(final_distribution.items()) == 1:
                    prediction, instances = final_distribution.items()[0]
                    if instances == 1:
                        return (last_node.output, path, last_node.confidence,
                                last_node.distribution, instances)
                # when there are more instances, sort elements by their mean
                distribution = [list(element) for element in
                                sorted(final_distribution.items(),
                                       key=lambda x: x[0])]
                distribution = merge_bins(distribution, BINS_LIMIT)
                prediction = mean(distribution)
                total_instances = sum([instances
                                       for _, instances in distribution])
                confidence = regression_error(
                    unbiased_sample_variance(distribution, prediction),
                    total_instances)
                return (prediction, path, confidence,
                        distribution, total_instances)
            else:
                distribution = [list(element) for element in
                                sorted(final_distribution.items(),
                                       key=lambda x: (-x[1], x[0]))]
                return (distribution[0][0], path,
                        ws_confidence(distribution[0][0], final_distribution),
                        distribution, get_instances(distribution))

        else:
            if self.children:
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path)
            return (self.output, path, self.confidence,
                    self.distribution, get_instances(self.distribution))
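
A hedged sketch of consuming the 5-tuple returned above; `tree` and the field id are illustrative assumptions:

# Hypothetical usage; `tree` and the input key are assumptions.
# With PROPORTIONAL, every branch under a missing split is followed and the
# leaf distributions are aggregated before one of the returns above fires.
(output, path, confidence,
 distribution, instances) = tree.predict({"000002": 4.2},
                                         missing_strategy=PROPORTIONAL)
print(output, confidence, instances)
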
Example #5
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            final_distribution = self.predict_proportional(input_data,
                                                           path=path)

            if self.regression:
                # sort elements by their mean
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: x[0])
                ]
                distribution = merge_bins(distribution, BINS_LIMIT)
                prediction = mean(distribution)
                total_instances = sum(
                    [instances for _, instances in distribution])
                confidence = regression_error(
                    unbiased_sample_variance(distribution, prediction),
                    total_instances)
                return (prediction, path, confidence, distribution,
                        total_instances)
            else:
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: (-x[1], x[0]))
                ]
                return (distribution[0][0], path,
                        ws_confidence(distribution[0][0],
                                      final_distribution), distribution,
                        get_instances(distribution))

        else:
            if self.children and split(self.children) in input_data:
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path)
            return (self.output, path, self.confidence, self.distribution,
                    get_instances(self.distribution))
Example #6
    def predict_confidence(self, input_data, by_name=True,
                           missing_strategy=LAST_PREDICTION,
                           compact=False):
        """For classification models, Predicts a one-vs.-rest confidence value
        for each possible output class, based on input values.  This
        confidence value is a lower confidence bound on the predicted
        probability of the given class.  The input fields must be a
        dictionary keyed by field name for field ID.

        For regressions, the output is a single element list
        containing the prediction.

        :param input_data: Input data to be predicted
        :param by_name: Boolean that is set to True if field_names (as
                        alternative to field ids) are used in the
                        input_data dict
        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                                 for missing fields
        :param compact: If False, prediction is returned as a list of maps, one
                        per class, with the keys "prediction" and "confidence"
                        mapped to the name of the class and its confidence,
                        respectively.  If True, returns a list of confidences
                        ordered by the sorted order of the class names.

        """
        if self.regression or self.boosting:
            raise AttributeError("This method is available for non-boosting"
                                 " categorization models only.")

        root_dist = self.tree.distribution
        category_map = {category[0]: 0.0 for category in root_dist}
        prediction = self.predict(input_data,
                                  by_name=by_name,
                                  missing_strategy=missing_strategy,
                                  add_distribution=True)

        distribution = prediction['distribution']

        for class_info in distribution:
            name = class_info[0]
            category_map[name] = ws_confidence(name, distribution)

        return self._to_output(category_map, compact, "confidence")
Example #7
def classification_proportional_predict(tree, weighted, fields, input_data):
    """Prediction for classification using proportional strategy

    """
    offset = OFFSETS[str(weighted)]
    (final_distribution, _, _, last_node, population,
     _, path) = proportional_predict( \
        tree, offset, fields, input_data, path=None)

    distribution = [list(element) for element in
                    sorted(list(final_distribution.items()),
                           key=lambda x: (-x[1], x[0]))]
    return Prediction( \
        distribution[0][0],
        path,
        ws_confidence(distribution[0][0], final_distribution,
                      ws_n=population),
        distribution,
        population,
        None,
        'categories',
        [] if last_node[OFFSETS[str(weighted)]["children#"]] == 0 else \
        last_node[OFFSETS[str(weighted)]["children"]])
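
The winning category above is picked by sorting on (-count, category): highest count first, with an alphabetical tie-break. A self-contained illustration of that sort key:

# Standalone illustration of the (-count, category) sort key used above.
final_distribution = {"Iris-virginica": 20, "Iris-setosa": 20,
                      "Iris-versicolor": 5}
distribution = [list(element) for element in
                sorted(final_distribution.items(),
                       key=lambda x: (-x[1], x[0]))]
# distribution == [['Iris-setosa', 20], ['Iris-virginica', 20],
#                  ['Iris-versicolor', 5]]
# distribution[0][0] -> 'Iris-setosa': largest count, ties broken by name.
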
Example #8
                    confidence_list.append(confidence)
            prediction = [label_separator.join(prediction_list),
                          label_separator.join(confidence_list)]
        elif method == COMBINATION:
            predictions = multivote.predictions
            global_distribution = []
            for prediction in predictions:
                prediction_category = None
                prediction_instances = 0
                for category, instances in prediction['distribution']:
                    if category != other_label:
                        if instances > prediction_instances:
                            prediction_category = category
                            prediction_instances = instances
                if prediction_category is not None:
                    prediction_confidence = ws_confidence(
                        prediction_category, prediction['distribution'])
                    global_distribution.append([prediction_category,
                                                prediction_confidence])
            if global_distribution:
                prediction = sorted(global_distribution, key=lambda x: x[1],
                                    reverse=True)[0]
            else:
                prediction = [None, None]
        else:
            prediction = multivote.combine(method=method, with_confidence=True,
                                           options=options)

        write_prediction(prediction, output, prediction_info, input_data,
                         exclude)

Example #9
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False,
                add_median=False,
                add_next=False,
                add_min=False,
                add_max=False,
                multiple=None):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        print_path: Boolean, if True the rules that lead to the prediction
                    are printed
        out: output handler
        with_confidence: Boolean, if True, all the information in the node
                         (prediction, confidence, distribution and count)
                         is returned in a list format
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if True adds confidence to the dict output
        add_path: Boolean, if True adds path to the dict output
        add_distribution: Boolean, if True adds distribution info to the
                          dict output
        add_count: Boolean, if True adds the number of instances in the
                       node to the dict output
        add_median: Boolean, if True adds the median of the values in
                    the distribution
        add_next: Boolean, if True adds the field that determines next
                  split in the tree
        add_min: Boolean, if True adds the minimum value in the prediction's
                 distribution (for regressions only)
        add_max: Boolean, if True adds the maximum value in the prediction's
                 distribution (for regressions only)
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of dicts:
                    [{'prediction': 'Iris-setosa',
                      'confidence': 0.9154,
                      'probability': 0.97,
                      'count': 97},
                     {'prediction': 'Iris-virginica',
                      'confidence': 0.0103,
                      'probability': 0.03,
                      'count': 3}]
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL and
                not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction = self.tree.predict(input_data,
                                       missing_strategy=missing_strategy)

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(prediction.path) + u' => %s \n' %
                           prediction.output))
            out.flush()
        output = prediction.output
        if with_confidence:
            output = [prediction.output,
                      prediction.confidence,
                      prediction.distribution,
                      prediction.count,
                      prediction.median]
        if multiple is not None and not self.tree.regression:
            output = []
            total_instances = float(prediction.count)
            distribution = enumerate(prediction.distribution)
            for index, [category, instances] in distribution:
                if ((isinstance(multiple, basestring) and multiple == 'all') or
                        (isinstance(multiple, int) and index < multiple)):
                    prediction_dict = {
                        'prediction': category,
                        'confidence': ws_confidence(category,
                                                    prediction.distribution),
                        'probability': instances / total_instances,
                        'count': instances}
                    output.append(prediction_dict)
        else:
            if (add_confidence or add_path or add_distribution or add_count or
                    add_median or add_next or add_min or add_max):
                output = {'prediction': prediction.output}
                if add_confidence:
                    output.update({'confidence': prediction.confidence})
                if add_path:
                    output.update({'path': prediction.path})
                if add_distribution:
                    output.update(
                        {'distribution': prediction.distribution,
                         'distribution_unit': prediction.distribution_unit})
                if add_count:
                    output.update({'count': prediction.count})
                if self.tree.regression and add_median:
                    output.update({'median': prediction.median})
                if add_next:
                    field = (None if len(prediction.children) == 0 else
                             prediction.children[0].predicate.field)
                    if field is not None and field in self.fields:
                        field = self.fields[field]['name']
                    output.update({'next': field})
                if self.tree.regression and add_min:
                    output.update({'min': prediction.min})
                if self.tree.regression and add_max:
                    output.update({'max': prediction.max})

        return output
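
A hedged usage sketch for the method above; `local_model` and the input values are illustrative assumptions:

# Hypothetical usage; `local_model` and the input values are assumptions.
input_data = {"petal length": 2.5}

# Plain prediction: a single output value.
prediction = local_model.predict(input_data)

# Per-category breakdown of the predicted node (classification models only).
candidates = local_model.predict(input_data, multiple='all')
# -> list of dicts with 'prediction', 'confidence', 'probability', 'count'

# Dict output enriched with the confidence and the decision path.
detailed = local_model.predict(input_data,
                               add_confidence=True, add_path=True)
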
Example #10
    def predict(self,
                input_data,
                by_name=True,
                print_path=False,
                out=sys.stdout,
                with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False,
                add_median=False,
                add_next=False,
                multiple=None):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        print_path: Boolean, if True the rules that lead to the prediction
                    are printed
        out: output handler
        with_confidence: Boolean, if True, all the information in the node
                         (prediction, confidence, distribution and count)
                         is returned in a list format
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if True adds confidence to the dict output
        add_path: Boolean, if True adds path to the dict output
        add_distribution: Boolean, if True adds distribution info to the
                          dict output
        add_count: Boolean, if True adds the number of instances in the
                       node to the dict output
        add_median: Boolean, if True adds the median of the values in
                    the distribution
        add_next: Boolean, if True adds the field that determines next
                  split in the tree
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of dicts:
                    [{'prediction': 'Iris-setosa',
                      'confidence': 0.9154,
                      'probability': 0.97,
                      'count': 97},
                     {'prediction': 'Iris-virginica',
                      'confidence': 0.0103,
                      'probability': 0.03,
                      'count': 3}]
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL
                and not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction = self.tree.predict(input_data,
                                       missing_strategy=missing_strategy)

        # Prediction path
        if print_path:
            out.write(
                utf8(u' AND '.join(prediction.path) +
                     u' => %s \n' % prediction.output))
            out.flush()
        output = prediction.output
        if with_confidence:
            output = [
                prediction.output, prediction.confidence,
                prediction.distribution, prediction.count, prediction.median
            ]
        if multiple is not None and not self.tree.regression:
            output = []
            total_instances = float(prediction.count)
            distribution = enumerate(prediction.distribution)
            for index, [category, instances] in distribution:
                if ((isinstance(multiple, basestring) and multiple == 'all')
                        or (isinstance(multiple, int) and index < multiple)):
                    prediction_dict = {
                        'prediction':
                        category,
                        'confidence':
                        ws_confidence(category, prediction.distribution),
                        'probability':
                        instances / total_instances,
                        'count':
                        instances
                    }
                    output.append(prediction_dict)
        else:
            if (add_confidence or add_path or add_distribution or add_count
                    or add_median or add_next):
                output = {'prediction': prediction.output}
                if add_confidence:
                    output.update({'confidence': prediction.confidence})
                if add_path:
                    output.update({'path': prediction.path})
                if add_distribution:
                    output.update({
                        'distribution':
                        prediction.distribution,
                        'distribution_unit':
                        prediction.distribution_unit
                    })
                if add_count:
                    output.update({'count': prediction.count})
                if self.tree.regression and add_median:
                    output.update({'median': prediction.median})
                if add_next:
                    field = (None if len(prediction.children) == 0 else
                             prediction.children[0].predicate.field)
                    if field is not None and field in self.fields:
                        field = self.fields[field]['name']
                    output.update({'next': field})

        return output
Example #11
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            (final_distribution, d_min, d_max, last_node, population,
             parent_node) = self.predict_proportional(input_data, path=path)

            if self.regression:
                # singular case:
                # when the prediction is the one given in a 1-instance node
                if len(final_distribution.items()) == 1:
                    prediction, instances = final_distribution.items()[0]
                    if instances == 1:
                        return Prediction(
                            last_node.output,
                            path,
                            last_node.confidence,
                            distribution=(last_node.distribution if not  \
                                self.weighted else \
                                last_node.weighted_distribution),
                            count=instances,
                            median=last_node.median,
                            distribution_unit=last_node.distribution_unit,
                            children=last_node.children,
                            d_min=last_node.min,
                            d_max=last_node.max)
                # when there are more instances, sort elements by their mean
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: x[0])
                ]
                distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                     else 'counts')
                distribution = merge_bins(distribution, BINS_LIMIT)
                total_instances = sum(
                    [instances for _, instances in distribution])
                if len(distribution) == 1:
                    # when there's only one bin, there will be no error, but
                    # we use a correction derived from the parent's error
                    prediction = distribution[0][0]
                    if total_instances < 2:
                        total_instances = 1
                    try:
                        # some strange models can have nodes with no confidence
                        confidence = round(
                            parent_node.confidence /
                            math.sqrt(total_instances), PRECISION)
                    except AttributeError:
                        confidence = None
                else:
                    prediction = mean(distribution)
                    confidence = round(
                        regression_error(
                            unbiased_sample_variance(distribution, prediction),
                            total_instances), PRECISION)
                return Prediction(prediction,
                                  path,
                                  confidence,
                                  distribution=distribution,
                                  count=total_instances,
                                  median=dist_median(distribution,
                                                     total_instances),
                                  distribution_unit=distribution_unit,
                                  children=last_node.children,
                                  d_min=d_min,
                                  d_max=d_max)
            else:
                distribution = [
                    list(element)
                    for element in sorted(final_distribution.items(),
                                          key=lambda x: (-x[1], x[0]))
                ]
                return Prediction(distribution[0][0],
                                  path,
                                  ws_confidence(distribution[0][0],
                                                final_distribution,
                                                ws_n=population),
                                  distribution=distribution,
                                  count=population,
                                  median=None,
                                  distribution_unit='categorical',
                                  children=last_node.children)

        else:
            if self.children:
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path=path)

            if self.weighted:
                output_distribution = self.weighted_distribution
                output_unit = self.weighted_distribution_unit
            else:
                output_distribution = self.distribution
                output_unit = self.distribution_unit

            return Prediction(
                self.output,
                path,
                self.confidence,
                distribution=output_distribution,
                count=get_instances(output_distribution),
                median=None if not self.regression else self.median,
                distribution_unit=output_unit,
                children=self.children,
                d_min=None if not self.regression else self.min,
                d_max=None if not self.regression else self.max)
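
A hedged sketch of reading back the Prediction object returned above, assuming it exposes attributes matching the arguments it is built with (as the earlier Model.predict examples suggest); `node` is an illustrative stand-in for a tree node:

# Hypothetical usage; `node` and the input are assumptions.
result = node.predict({"000002": 4.2}, missing_strategy=PROPORTIONAL)
print(result.output, result.confidence, result.count)
print(result.distribution, result.distribution_unit)
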
Example #12
    def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
            0 - LAST_PREDICTION: the last issued prediction is returned.
            1 - PROPORTIONAL: as we cannot choose between the two branches
                in the tree that stem from this split, we consider both. The
                algorithm goes on until the final leaves are reached and
                all their predictions are used to decide the final prediction.
        """

        if path is None:
            path = []
        if missing_strategy == PROPORTIONAL:
            (final_distribution,
             d_min,
             d_max,
             last_node,
             population,
             parent_node) = self.predict_proportional(input_data, path=path)

            if self.regression:
                # singular case:
                # when the prediction is the one given in a 1-instance node
                if len(final_distribution.items()) == 1:
                    prediction, instances = final_distribution.items()[0]
                    if instances == 1:
                        return Prediction(
                            last_node.output,
                            path,
                            last_node.confidence,
                            distribution=(last_node.distribution if not  \
                                self.weighted else \
                                last_node.weighted_distribution),
                            count=instances,
                            median=last_node.median,
                            distribution_unit=last_node.distribution_unit,
                            children=last_node.children,
                            d_min=last_node.min,
                            d_max=last_node.max)
                # when there are more instances, sort elements by their mean
                distribution = [list(element) for element in
                                sorted(final_distribution.items(),
                                       key=lambda x: x[0])]
                distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                     else 'counts')
                distribution = merge_bins(distribution, BINS_LIMIT)
                total_instances = sum([instances
                                       for _, instances in distribution])
                if len(distribution) == 1:
                    # when there's only one bin, there will be no error, but
                    # we use a correction derived from the parent's error
                    prediction = distribution[0][0]
                    if total_instances < 2:
                        total_instances = 1
                    try:
                        # some strange models can have nodes with no confidence
                        confidence = round(parent_node.confidence /
                                           math.sqrt(total_instances),
                                           PRECISION)
                    except AttributeError:
                        confidence = None
                else:
                    prediction = mean(distribution)
                    confidence = round(regression_error(
                        unbiased_sample_variance(distribution, prediction),
                        total_instances), PRECISION)
                return Prediction(
                    prediction,
                    path,
                    confidence,
                    distribution=distribution,
                    count=total_instances,
                    median=dist_median(distribution, total_instances),
                    distribution_unit=distribution_unit,
                    children=last_node.children,
                    d_min=d_min,
                    d_max=d_max)
            else:
                distribution = [list(element) for element in
                                sorted(final_distribution.items(),
                                       key=lambda x: (-x[1], x[0]))]
                return Prediction(
                    distribution[0][0],
                    path,
                    ws_confidence(distribution[0][0], final_distribution,
                                  ws_n=population),
                    distribution=distribution,
                    count=population,
                    median=None,
                    distribution_unit='categorical',
                    children=last_node.children)

        else:
            if self.children:
                for child in self.children:
                    if child.predicate.apply(input_data, self.fields):
                        path.append(child.predicate.to_rule(self.fields))
                        return child.predict(input_data, path=path)

            if self.weighted:
                output_distribution = self.weighted_distribution
                output_unit = self.weighted_distribution_unit
            else:
                output_distribution = self.distribution
                output_unit = self.distribution_unit

            return Prediction(
                self.output,
                path,
                self.confidence,
                distribution=output_distribution,
                count=get_instances(output_distribution),
                median=None if not self.regression else self.median,
                distribution_unit=output_unit,
                children=self.children,
                d_min=None if not self.regression else self.min,
                d_max=None if not self.regression else self.max)
Example #13
                label_separator.join(prediction_list),
                label_separator.join(confidence_list)
            ]
        elif method == COMBINATION:
            predictions = multivote.predictions
            global_distribution = []
            for prediction in predictions:
                prediction_category = None
                prediction_instances = 0
                for category, instances in prediction['distribution']:
                    if category != other_label:
                        if instances > prediction_instances:
                            prediction_category = category
                            prediction_instances = instances
                if prediction_category is not None:
                    prediction_confidence = ws_confidence(
                        prediction_category, prediction['distribution'])
                    global_distribution.append(
                        [prediction_category, prediction_confidence])
            if global_distribution:
                prediction = sorted(global_distribution,
                                    key=lambda x: x[1],
                                    reverse=True)[0]
            else:
                prediction = [None, None]
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)

        write_prediction(prediction, output, args.prediction_info, input_data,
                         exclude)