def combine_multivote(multivote, other_label=OTHER):
    """Pick the most confident prediction out of a set of partial models.

    Each prediction in `multivote.predictions` comes from a model built on
    a subset of the training data that only holds a subset of categories.
    For every prediction, the category with the most instances in its
    distribution (ignoring `other_label`) is kept together with the
    confidence that `ws_confidence` computes for it.

    Returns the `[category, confidence]` pair with the highest confidence
    (the earliest one on ties), or `[None, None]` when no prediction
    contributes a category other than `other_label` with a positive
    number of instances.
    """
    candidates = []
    for vote in multivote.predictions:
        distribution = vote['distribution']
        top_category, top_instances = None, 0
        for category, instances in distribution:
            # the catch-all label never competes, and only a strictly
            # larger count replaces the current leader
            if category == other_label or not instances > top_instances:
                continue
            top_category, top_instances = category, instances
        if top_category is None:
            continue
        candidates.append(
            [top_category, ws_confidence(top_category, distribution)])
    if not candidates:
        return [None, None]
    # max keeps the first of several equally confident candidates
    return max(candidates, key=lambda pair: pair[1])
def combine_multivote(multivote, other_label=OTHER):
    """Combine the predictions of models trained on category subsets.

    Every entry of `multivote.predictions` carries a 'distribution' of
    `[category, instances]` pairs. The dominant category of each entry
    (the one with most instances, `other_label` excluded) is scored with
    `ws_confidence`, and the best scored `[category, confidence]` pair is
    returned. When no entry yields a dominant category the result is
    `[None, None]`.
    """
    ranked = []
    for model_prediction in multivote.predictions:
        distribution = model_prediction['distribution']
        winner = None
        winner_count = 0
        for label, count in distribution:
            if label != other_label and count > winner_count:
                winner, winner_count = label, count
        if winner is not None:
            ranked.append([winner, ws_confidence(winner, distribution)])
    # stable descending sort: ties keep the order of the predictions
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked[0] if ranked else [None, None]
def predict_confidence(self, input_data, missing_strategy=LAST_PREDICTION,
                       compact=False):
    """Predicts a one-vs.-rest confidence value for each output class.

    For classification models, the confidence of every possible output
    class is computed from the distribution of the predicted node. This
    confidence value is a lower confidence bound on the predicted
    probability of the given class. The input fields must be a
    dictionary keyed by field name or field ID.

    For regressions, the output is a single element list
    containing the prediction.

    :param input_data: Input data to be predicted
    :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                             for missing fields
    :param compact: If False, prediction is returned as a list of maps,
                    one per class, with the keys "prediction" and
                    "confidence" mapped to the name of the class and its
                    confidence, respectively. If True, returns a list of
                    confidences ordered by the sorted order of the class
                    names.
    :raises AttributeError: for boosting models that are not regressions
    """
    if self.regression:
        regression_prediction = self.predict(
            input_data, missing_strategy=missing_strategy,
            full=not compact)
        if compact:
            return [regression_prediction]
        return cast_prediction(regression_prediction, to=DICTIONARY,
                               confidence=True)

    if self.boosting:
        raise AttributeError("This method is available for non-boosting"
                             " models only.")

    # classes missing from the predicted node keep a 0.0 confidence
    category_map = dict.fromkeys(
        (category[0] for category in self.root_distribution), 0.0)
    full_prediction = self.predict(input_data,
                                   missing_strategy=missing_strategy,
                                   full=True)
    distribution = full_prediction['distribution']
    population = full_prediction['count']
    for class_info in distribution:
        class_name = class_info[0]
        category_map[class_name] = ws_confidence(class_name, distribution,
                                                 ws_n=population)
    return self._to_output(category_map, compact, "confidence")
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both. The
            algorithm goes on until the final leaves are reached and
            all their predictions are used to decide the final prediction.

    Returns a tuple
    `(prediction, path, confidence, distribution, instances)`.
    When following the tree with LAST_PREDICTION, the rule of every
    matching child predicate is appended to `path`.
    """
    if path is None:
        path = []
    if missing_strategy == PROPORTIONAL:
        (final_distribution, last_node) = self.predict_proportional(
            input_data, path=path)
        if self.regression:
            # Materialise the items once: dict views cannot be indexed
            # on Python 3, while a list works on both major versions.
            distribution_items = list(final_distribution.items())
            # singular case:
            # when the prediction is the one given in a 1-instance node
            if len(distribution_items) == 1:
                _, instances = distribution_items[0]
                if instances == 1:
                    return (last_node.output, path, last_node.confidence,
                            last_node.distribution, instances)
            # when there's more instances, sort elements by their mean
            distribution = [list(element) for element in
                            sorted(distribution_items,
                                   key=lambda x: x[0])]
            distribution = merge_bins(distribution, BINS_LIMIT)
            prediction = mean(distribution)
            total_instances = sum(
                instances for _, instances in distribution)
            confidence = regression_error(
                unbiased_sample_variance(distribution, prediction),
                total_instances)
            return (prediction, path, confidence,
                    distribution, total_instances)
        else:
            # most populated category first, ties broken by category
            distribution = [list(element) for element in
                            sorted(final_distribution.items(),
                                   key=lambda x: (-x[1], x[0]))]
            return (distribution[0][0], path,
                    ws_confidence(distribution[0][0], final_distribution),
                    distribution, get_instances(distribution))
    else:
        if self.children:
            for child in self.children:
                if child.predicate.apply(input_data, self.fields):
                    path.append(child.predicate.to_rule(self.fields))
                    return child.predict(input_data, path)
        # no children, or no child predicate matched: answer from this node
        return (self.output, path, self.confidence,
                self.distribution, get_instances(self.distribution))
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Predicts the output for `input_data`, whose fields are keyed by Id.

    Two strategies are available when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: both branches that stem from the split are
            followed down to the final leaves, and all their predictions
            are used to decide the final prediction.

    Returns a tuple
    `(prediction, path, confidence, distribution, instances)`.
    """
    if path is None:
        path = []

    if missing_strategy != PROPORTIONAL:
        # Descend only when the value returned by `split` for the
        # children is present in the input (presumably the id of the
        # field they split on; confirm against `split`).
        if self.children and split(self.children) in input_data:
            for child in self.children:
                if child.predicate.apply(input_data, self.fields):
                    path.append(child.predicate.to_rule(self.fields))
                    return child.predict(input_data, path)
        return (self.output, path, self.confidence, self.distribution,
                get_instances(self.distribution))

    final_distribution = self.predict_proportional(input_data, path=path)

    if not self.regression:
        # most populated category first, ties broken by category
        ranked = sorted(final_distribution.items(),
                        key=lambda x: (-x[1], x[0]))
        distribution = [list(pair) for pair in ranked]
        top_category = distribution[0][0]
        return (top_category, path,
                ws_confidence(top_category, final_distribution),
                distribution, get_instances(distribution))

    # regression: sort elements by their mean
    by_value = sorted(final_distribution.items(), key=lambda x: x[0])
    distribution = merge_bins([list(pair) for pair in by_value],
                              BINS_LIMIT)
    prediction = mean(distribution)
    total_instances = sum(count for _, count in distribution)
    variance = unbiased_sample_variance(distribution, prediction)
    confidence = regression_error(variance, total_instances)
    return (prediction, path, confidence, distribution, total_instances)
def predict_confidence(self, input_data, by_name=True,
                       missing_strategy=LAST_PREDICTION, compact=False):
    """Predicts a one-vs.-rest confidence value for each output class.

    Only available for classification models that are not boosted. The
    confidence of every possible output class is computed from the
    distribution of the predicted node. This confidence value is a lower
    confidence bound on the predicted probability of the given class.
    The input fields must be a dictionary keyed by field name or
    field ID.

    :param input_data: Input data to be predicted
    :param by_name: Boolean that is set to True if field_names (as
                    alternative to field ids) are used in the
                    input_data dict
    :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                             for missing fields
    :param compact: If False, prediction is returned as a list of maps,
                    one per class, with the keys "prediction" and
                    "confidence" mapped to the name of the class and its
                    confidence, respectively. If True, returns a list of
                    confidences ordered by the sorted order of the class
                    names.
    :raises AttributeError: for regression or boosting models
    """
    if self.regression or self.boosting:
        raise AttributeError("This method is available for non-boosting"
                             " categorization models only.")

    # classes missing from the predicted node keep a 0.0 confidence
    category_map = dict.fromkeys(
        (category[0] for category in self.tree.distribution), 0.0)
    node_prediction = self.predict(input_data, by_name=by_name,
                                   missing_strategy=missing_strategy,
                                   add_distribution=True)
    distribution = node_prediction['distribution']
    for class_info in distribution:
        class_name = class_info[0]
        category_map[class_name] = ws_confidence(class_name, distribution)
    return self._to_output(category_map, compact, "confidence")
def classification_proportional_predict(tree, weighted, fields, input_data):
    """Prediction for classification using proportional strategy.

    Delegates the tree traversal to `proportional_predict` and wraps the
    result in a `Prediction` whose output is the category with the most
    instances in the final distribution (ties broken by category order).
    """
    offset = OFFSETS[str(weighted)]
    (final_distribution, _, _, last_node, population, _, path) = \
        proportional_predict(tree, offset, fields, input_data, path=None)

    # most populated category first, ties broken by category
    ranked = sorted(final_distribution.items(),
                    key=lambda x: (-x[1], x[0]))
    distribution = [list(pair) for pair in ranked]
    top_category = distribution[0][0]
    confidence = ws_confidence(top_category, final_distribution,
                               ws_n=population)

    # a node whose children counter is 0 exposes no children
    if last_node[offset["children#"]] == 0:
        children = []
    else:
        children = last_node[offset["children"]]

    return Prediction(top_category, path, confidence, distribution,
                      population, None, 'categories', children)
confidence_list.append(confidence) prediction = [label_separator.join(prediction_list), label_separator.join(confidence_list)] elif method == COMBINATION: predictions = multivote.predictions global_distribution = [] for prediction in predictions: prediction_category = None prediction_instances = 0 for category, instances in prediction['distribution']: if category != other_label: if instances > prediction_instances: prediction_category = category prediction_instances = instances if prediction_category is not None: prediction_confidence = ws_confidence( prediction_category, prediction['distribution']) global_distribution.append([prediction_category, prediction_confidence]) if global_distribution: prediction = sorted(global_distribution, key=lambda x: x[1], reverse=True)[0] else: prediction = [None, None] else: prediction = multivote.combine(method=method, with_confidence=True, options=options) write_prediction(prediction, output, prediction_info, input_data, exclude)
def predict(self, input_data, by_name=True,
            print_path=False, out=sys.stdout, with_confidence=False,
            missing_strategy=LAST_PREDICTION,
            add_confidence=False,
            add_path=False,
            add_distribution=False,
            add_count=False,
            add_median=False,
            add_next=False,
            add_min=False,
            add_max=False,
            multiple=None):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can use
    `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    print_path: Boolean, if True the rules that lead to the prediction
                are printed
    out: output handler
    with_confidence: Boolean, if True, all the information in the node
                     (prediction, confidence, distribution, count and
                     median) is returned in a list format
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    add_confidence: Boolean, if True adds confidence to the dict output
    add_path: Boolean, if True adds path to the dict output
    add_distribution: Boolean, if True adds distribution info to the
                      dict output
    add_count: Boolean, if True adds the number of instances in the
               node to the dict output
    add_median: Boolean, if True adds the median of the values in
                the distribution (for regressions only)
    add_next: Boolean, if True adds the field that determines next
              split in the tree
    add_min: Boolean, if True adds the minimum value in the prediction's
             distribution (for regressions only)
    add_max: Boolean, if True adds the maximum value in the prediction's
             distribution (for regressions only)
    multiple: For categorical fields, it will return the categories
              in the distribution of the predicted node as a
              list of dicts:
                [{'prediction': 'Iris-setosa',
                  'confidence': 0.9154,
                  'probability': 0.97,
                  'count': 97},
                 {'prediction': 'Iris-virginica',
                  'confidence': 0.0103,
                  'probability': 0.03,
                  'count': 3}]
              The value of this argument can either be an integer
              (maximum number of categories to be returned), or the
              literal 'all', that will cause the entire distribution
              in the node to be returned.

    Raises ImportError when a regression is predicted with the
    PROPORTIONAL strategy and `self.regression_ready` is false.
    """
    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    if (self.tree.regression and missing_strategy == PROPORTIONAL and
            not self.regression_ready):
        raise ImportError("Failed to find the numpy and scipy libraries,"
                          " needed to use proportional missing strategy"
                          " for regressions. Please install them before"
                          " using local predictions for the model.")
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    prediction = self.tree.predict(input_data,
                                   missing_strategy=missing_strategy)

    # Prediction path
    if print_path:
        out.write(utf8(u' AND '.join(prediction.path) + u' => %s \n' %
                       prediction.output))
        out.flush()
    output = prediction.output
    if with_confidence:
        output = [prediction.output,
                  prediction.confidence,
                  prediction.distribution,
                  prediction.count,
                  prediction.median]
    if multiple is not None and not self.tree.regression:
        output = []
        total_instances = float(prediction.count)
        for index, [category, instances] in enumerate(
                prediction.distribution):
            # `multiple == 'all'` replaces the former `basestring` check:
            # that name does not exist on Python 3, and only a string can
            # compare equal to 'all' anyway.
            if (multiple == 'all' or
                    (isinstance(multiple, int) and index < multiple)):
                output.append({
                    'prediction': category,
                    'confidence': ws_confidence(category,
                                                prediction.distribution),
                    'probability': instances / total_instances,
                    'count': instances})
    else:
        if (add_confidence or add_path or add_distribution or add_count or
                add_median or add_next or add_min or add_max):
            output = {'prediction': prediction.output}
            if add_confidence:
                output.update({'confidence': prediction.confidence})
            if add_path:
                output.update({'path': prediction.path})
            if add_distribution:
                output.update(
                    {'distribution': prediction.distribution,
                     'distribution_unit': prediction.distribution_unit})
            if add_count:
                output.update({'count': prediction.count})
            if self.tree.regression and add_median:
                output.update({'median': prediction.median})
            if add_next:
                field = (None if len(prediction.children) == 0 else
                         prediction.children[0].predicate.field)
                # report the field by name when it is a known field
                if field is not None and field in self.fields:
                    field = self.fields[field]['name']
                output.update({'next': field})
            if self.tree.regression and add_min:
                output.update({'min': prediction.min})
            if self.tree.regression and add_max:
                output.update({'max': prediction.max})

    return output
def predict(self, input_data, by_name=True,
            print_path=False, out=sys.stdout, with_confidence=False,
            missing_strategy=LAST_PREDICTION,
            add_confidence=False,
            add_path=False,
            add_distribution=False,
            add_count=False,
            add_median=False,
            add_next=False,
            multiple=None):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can use
    `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    print_path: Boolean, if True the rules that lead to the prediction
                are printed
    out: output handler
    with_confidence: Boolean, if True, all the information in the node
                     (prediction, confidence, distribution, count and
                     median) is returned in a list format
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    add_confidence: Boolean, if True adds confidence to the dict output
    add_path: Boolean, if True adds path to the dict output
    add_distribution: Boolean, if True adds distribution info to the
                      dict output
    add_count: Boolean, if True adds the number of instances in the
               node to the dict output
    add_median: Boolean, if True adds the median of the values in
                the distribution (for regressions only)
    add_next: Boolean, if True adds the field that determines next
              split in the tree
    multiple: For categorical fields, it will return the categories
              in the distribution of the predicted node as a
              list of dicts:
                [{'prediction': 'Iris-setosa',
                  'confidence': 0.9154,
                  'probability': 0.97,
                  'count': 97},
                 {'prediction': 'Iris-virginica',
                  'confidence': 0.0103,
                  'probability': 0.03,
                  'count': 3}]
              The value of this argument can either be an integer
              (maximum number of categories to be returned), or the
              literal 'all', that will cause the entire distribution
              in the node to be returned.

    Raises ImportError when a regression is predicted with the
    PROPORTIONAL strategy and `self.regression_ready` is false.
    """
    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    if (self.tree.regression and missing_strategy == PROPORTIONAL and
            not self.regression_ready):
        raise ImportError("Failed to find the numpy and scipy libraries,"
                          " needed to use proportional missing strategy"
                          " for regressions. Please install them before"
                          " using local predictions for the model.")
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    prediction = self.tree.predict(input_data,
                                   missing_strategy=missing_strategy)

    # Prediction path
    if print_path:
        out.write(
            utf8(u' AND '.join(prediction.path) + u' => %s \n' %
                 prediction.output))
        out.flush()
    output = prediction.output
    if with_confidence:
        output = [
            prediction.output, prediction.confidence,
            prediction.distribution, prediction.count, prediction.median
        ]
    if multiple is not None and not self.tree.regression:
        output = []
        total_instances = float(prediction.count)
        for index, [category, instances] in enumerate(
                prediction.distribution):
            # Comparing against 'all' directly avoids `basestring`, which
            # is undefined on Python 3; the accepted values are unchanged.
            if (multiple == 'all' or
                    (isinstance(multiple, int) and index < multiple)):
                prediction_dict = {
                    'prediction': category,
                    'confidence': ws_confidence(category,
                                                prediction.distribution),
                    'probability': instances / total_instances,
                    'count': instances
                }
                output.append(prediction_dict)
    else:
        if (add_confidence or add_path or add_distribution or add_count
                or add_median or add_next):
            output = {'prediction': prediction.output}
            if add_confidence:
                output.update({'confidence': prediction.confidence})
            if add_path:
                output.update({'path': prediction.path})
            if add_distribution:
                output.update({
                    'distribution': prediction.distribution,
                    'distribution_unit': prediction.distribution_unit
                })
            if add_count:
                output.update({'count': prediction.count})
            if self.tree.regression and add_median:
                output.update({'median': prediction.median})
            if add_next:
                field = (None if len(prediction.children) == 0 else
                         prediction.children[0].predicate.field)
                # report the field by name when it is a known field
                if field is not None and field in self.fields:
                    field = self.fields[field]['name']
                output.update({'next': field})

    return output
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both. The
            algorithm goes on until the final leaves are reached and
            all their predictions are used to decide the final prediction.

    Returns a `Prediction` object. When following the tree with
    LAST_PREDICTION, the rule of every matching child predicate is
    appended to `path`.
    """

    if path is None:
        path = []
    if missing_strategy == PROPORTIONAL:
        (final_distribution, d_min, d_max, last_node, population,
         parent_node) = self.predict_proportional(input_data, path=path)

        if self.regression:
            # Materialise the items once: dict views cannot be indexed
            # on Python 3, while a list works on both major versions.
            distribution_items = list(final_distribution.items())
            # singular case:
            # when the prediction is the one given in a 1-instance node
            if len(distribution_items) == 1:
                _, instances = distribution_items[0]
                if instances == 1:
                    return Prediction(
                        last_node.output,
                        path,
                        last_node.confidence,
                        distribution=(last_node.distribution
                                      if not self.weighted else
                                      last_node.weighted_distribution),
                        count=instances,
                        median=last_node.median,
                        distribution_unit=last_node.distribution_unit,
                        children=last_node.children,
                        d_min=last_node.min,
                        d_max=last_node.max)
            # when there's more instances, sort elements by their mean
            distribution = [
                list(element)
                for element in sorted(distribution_items,
                                      key=lambda x: x[0])
            ]
            # the unit is decided before merging shrinks the list
            distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                 else 'counts')
            distribution = merge_bins(distribution, BINS_LIMIT)
            total_instances = sum(
                instances for _, instances in distribution)
            if len(distribution) == 1:
                # where there's only one bin, there will be no error, but
                # we use a correction derived from the parent's error
                prediction = distribution[0][0]
                if total_instances < 2:
                    total_instances = 1
                try:
                    # some strange models can have nodes with no confidence
                    confidence = round(
                        parent_node.confidence /
                        math.sqrt(total_instances), PRECISION)
                except AttributeError:
                    confidence = None
            else:
                prediction = mean(distribution)
                confidence = round(
                    regression_error(
                        unbiased_sample_variance(distribution, prediction),
                        total_instances), PRECISION)
            return Prediction(prediction,
                              path,
                              confidence,
                              distribution=distribution,
                              count=total_instances,
                              median=dist_median(distribution,
                                                 total_instances),
                              distribution_unit=distribution_unit,
                              children=last_node.children,
                              d_min=d_min,
                              d_max=d_max)
        else:
            # most populated category first, ties broken by category
            distribution = [
                list(element)
                for element in sorted(final_distribution.items(),
                                      key=lambda x: (-x[1], x[0]))
            ]
            return Prediction(distribution[0][0],
                              path,
                              ws_confidence(distribution[0][0],
                                            final_distribution,
                                            ws_n=population),
                              distribution=distribution,
                              count=population,
                              median=None,
                              distribution_unit='categorical',
                              children=last_node.children)

    else:
        if self.children:
            for child in self.children:
                if child.predicate.apply(input_data, self.fields):
                    path.append(child.predicate.to_rule(self.fields))
                    return child.predict(input_data, path=path)

        # no children, or no child predicate matched: answer from this node
        if self.weighted:
            output_distribution = self.weighted_distribution
            output_unit = self.weighted_distribution_unit
        else:
            output_distribution = self.distribution
            output_unit = self.distribution_unit

        return Prediction(
            self.output,
            path,
            self.confidence,
            distribution=output_distribution,
            count=get_instances(output_distribution),
            median=None if not self.regression else self.median,
            distribution_unit=output_unit,
            children=self.children,
            d_min=None if not self.regression else self.min,
            d_max=None if not self.regression else self.max)
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both. The
            algorithm goes on until the final leaves are reached and
            all their predictions are used to decide the final prediction.

    Returns a `Prediction` object. When following the tree with
    LAST_PREDICTION, the rule of every matching child predicate is
    appended to `path`.
    """

    if path is None:
        path = []
    if missing_strategy == PROPORTIONAL:
        (final_distribution, d_min, d_max, last_node, population,
         parent_node) = self.predict_proportional(input_data, path=path)

        if self.regression:
            # `dict.items()` is a non-indexable view on Python 3, so the
            # pairs are copied to a list before the first one is read.
            pairs = list(final_distribution.items())
            # singular case:
            # when the prediction is the one given in a 1-instance node
            if len(pairs) == 1:
                instances = pairs[0][1]
                if instances == 1:
                    return Prediction(
                        last_node.output, path, last_node.confidence,
                        distribution=(last_node.distribution if not
                                      self.weighted else
                                      last_node.weighted_distribution),
                        count=instances,
                        median=last_node.median,
                        distribution_unit=last_node.distribution_unit,
                        children=last_node.children,
                        d_min=last_node.min,
                        d_max=last_node.max)
            # when there's more instances, sort elements by their mean
            distribution = [list(element) for element in
                            sorted(pairs, key=lambda x: x[0])]
            # the unit is decided before merging shrinks the list
            distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                 else 'counts')
            distribution = merge_bins(distribution, BINS_LIMIT)
            total_instances = sum(
                instances for _, instances in distribution)
            if len(distribution) == 1:
                # where there's only one bin, there will be no error, but
                # we use a correction derived from the parent's error
                prediction = distribution[0][0]
                if total_instances < 2:
                    total_instances = 1
                try:
                    # some strange models can have nodes with no confidence
                    confidence = round(parent_node.confidence /
                                       math.sqrt(total_instances),
                                       PRECISION)
                except AttributeError:
                    confidence = None
            else:
                prediction = mean(distribution)
                confidence = round(regression_error(
                    unbiased_sample_variance(distribution, prediction),
                    total_instances), PRECISION)
            return Prediction(
                prediction, path, confidence,
                distribution=distribution,
                count=total_instances,
                median=dist_median(distribution, total_instances),
                distribution_unit=distribution_unit,
                children=last_node.children,
                d_min=d_min,
                d_max=d_max)
        else:
            # most populated category first, ties broken by category
            distribution = [list(element) for element in
                            sorted(final_distribution.items(),
                                   key=lambda x: (-x[1], x[0]))]
            return Prediction(
                distribution[0][0], path,
                ws_confidence(distribution[0][0], final_distribution,
                              ws_n=population),
                distribution=distribution,
                count=population,
                median=None,
                distribution_unit='categorical',
                children=last_node.children)

    else:
        if self.children:
            for child in self.children:
                if child.predicate.apply(input_data, self.fields):
                    path.append(child.predicate.to_rule(self.fields))
                    return child.predict(input_data, path=path)

        # no children, or no child predicate matched: answer from this node
        if self.weighted:
            output_distribution = self.weighted_distribution
            output_unit = self.weighted_distribution_unit
        else:
            output_distribution = self.distribution
            output_unit = self.distribution_unit

        return Prediction(
            self.output, path, self.confidence,
            distribution=output_distribution,
            count=get_instances(output_distribution),
            median=None if not self.regression else self.median,
            distribution_unit=output_unit,
            children=self.children,
            d_min=None if not self.regression else self.min,
            d_max=None if not self.regression else self.max)
label_separator.join(prediction_list), label_separator.join(confidence_list) ] elif method == COMBINATION: predictions = multivote.predictions global_distribution = [] for prediction in predictions: prediction_category = None prediction_instances = 0 for category, instances in prediction['distribution']: if category != other_label: if instances > prediction_instances: prediction_category = category prediction_instances = instances if prediction_category is not None: prediction_confidence = ws_confidence( prediction_category, prediction['distribution']) global_distribution.append( [prediction_category, prediction_confidence]) if global_distribution: prediction = sorted(global_distribution, key=lambda x: x[1], reverse=True)[0] else: prediction = [None, None] else: prediction = multivote.combine(method=method, with_confidence=True, options=options) write_prediction(prediction, output, args.prediction_info, input_data, exclude)