def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Predict from this node using the given field values.

    The input fields must be keyed by field id. Two strategies are
    available when the value of a splitting field is missing:

        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: all possible outcomes are considered and an
            average prediction is built.

    `path` collects the rules of the branches that were followed; a new
    list is created when the caller does not provide one.
    """
    path = [] if path is None else path

    # The proportional strategy is fully delegated.
    if missing_strategy == PROPORTIONAL:
        return self.predict_proportional(input_data, path=path)

    # Last prediction strategy: follow the first branch whose predicate
    # accepts the input data.
    for branch in self.children or ():
        rule = branch.predicate
        if rule.apply(input_data, self.fields):
            path.append(rule.to_rule(self.fields))
            return branch.predict(input_data, path=path)

    # No branch matched (or there are none): this node is the answer.
    return Prediction(
        self.output,
        path,
        None,
        distribution=None,
        count=self.count,
        median=None,
        distribution_unit=None,
        children=self.children,
        d_min=None,
        d_max=None)
def boosting_last_predict(tree, fields, input_data, path=None):
    """Last prediction strategy for boosted trees.

    Starting at `tree`, moves down through the first child whose
    predicate accepts `input_data`, appending the matching rule to
    `path` at every step. The descent stops at the first node where no
    child matches, and the prediction is built from that node's output
    and count.
    """
    if path is None:
        path = []

    current = tree
    while True:
        node = get_node(current)
        has_children = node[OFFSETS["children#"]] != 0
        children = node[OFFSETS["children"]] if has_children else []
        count = node[OFFSETS["count"]]

        for child in children or ():
            operator, field, value, term, missing = get_predicate(child)
            field_info = fields[field]
            if apply_predicate(operator, field, value, term, missing,
                               input_data, field_info):
                path.append(predicate_to_rule(operator, fields[field],
                                              value, term, missing))
                # keep descending from the matching child
                current = child
                break
        else:
            # no child accepted the input: predict with the current node
            return Prediction(
                node[OFFSETS["output"]],
                path,
                None,
                distribution=None,
                count=count,
                median=None,
                distribution_unit=None,
                children=children,
                d_min=None,
                d_max=None)
def last_prediction_predict(tree, offsets, fields, input_data, path=None):
    """Predictions for the last prediction missing strategy.

    Descends from `tree` through the first child whose predicate accepts
    `input_data`, appending each matching rule to `path`. When no child
    matches, the prediction is built from the node that was reached.

    tree: node structure, decoded with `get_node`
    offsets: dict mapping attribute names ("output", "confidence",
             "count", "children#", "children", ...) to positions in
             the node
    fields: dict of field information keyed by the field used in the
            predicates
    input_data: input data to be predicted
    path: list of rules followed so far (a new list when None)

    Returns a `Prediction` built from the final node.
    """
    if path is None:
        path = []

    node = get_node(tree)
    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]

    for child in children:
        operator, field, value, term, missing = get_predicate(child)
        if apply_predicate(operator, field, value, term, missing,
                           input_data, fields[field]):
            new_rule = predicate_to_rule(
                operator, fields[field], value, term, missing)
            path.append(new_rule)
            return last_prediction_predict(
                child, offsets, fields, input_data, path=path)

    # Weighted layouts expose "wdistribution" (and its unit); otherwise
    # the plain distribution attributes are used.
    if "wdistribution" in offsets:
        distribution_key, unit_key = "wdistribution", "wdistribution_unit"
    else:
        distribution_key, unit_key = "distribution", "distribution_unit"
    output_distribution = node[offsets[distribution_key]]

    # The unit is only read when the key that is actually looked up
    # exists. The original guard tested "distribution_unit" even when
    # reading "wdistribution_unit", which raised KeyError for layouts
    # that define only the former. That guard is kept so every layout
    # that worked before yields the same unit.
    if "distribution_unit" in offsets and unit_key in offsets:
        output_unit = node[offsets[unit_key]]
    else:
        output_unit = 'categories'

    return Prediction(
        node[offsets["output"]],
        path,
        node[offsets["confidence"]],
        distribution=output_distribution,
        count=node[offsets["count"]],
        median=(None if offsets.get("median") is None
                else node[offsets["median"]]),
        distribution_unit=output_unit,
        children=children,
        d_min=(None if offsets.get("min") is None
               else node[offsets["min"]]),
        d_max=(None if offsets.get("max") is None
               else node[offsets["max"]]))
def classification_proportional_predict(tree, weighted, fields, input_data):
    """Classification prediction using the proportional strategy.

    The distribution returned by `proportional_predict` is ranked by
    decreasing number of instances (ties broken by category) and the
    first category becomes the prediction.
    """
    offsets = OFFSETS[str(weighted)]
    (final_distribution, _d_min, _d_max, last_node, population,
     _parent_node, path) = proportional_predict(
         tree, offsets, fields, input_data, path=None)

    ranked = sorted(final_distribution.items(),
                    key=lambda pair: (-pair[1], pair[0]))
    distribution = [list(pair) for pair in ranked]
    top_category = distribution[0][0]

    confidence = ws_confidence(top_category, final_distribution,
                               ws_n=population)
    if last_node[offsets["children#"]] == 0:
        children = []
    else:
        children = last_node[offsets["children"]]

    # NOTE(review): the arguments after the confidence are positional,
    # in the same order as the original call; their meaning depends on
    # the Prediction signature, which is not visible here.
    return Prediction(
        top_category,
        path,
        confidence,
        distribution,
        population,
        None,
        'categories',
        children)
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:

        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both.
            The algorithm goes on until the final leaves are reached
            and all their predictions are used to decide the final
            prediction.

    input_data: input data to be predicted
    path: list of rules followed so far (a new list when None)
    missing_strategy: LAST_PREDICTION|PROPORTIONAL

    Returns a `Prediction` object.
    """
    if path is None:
        path = []

    if missing_strategy == PROPORTIONAL:
        (final_distribution, d_min, d_max, last_node, population,
         parent_node) = self.predict_proportional(input_data, path=path)

        if self.regression:
            # Materialise the items once: dict views cannot be
            # subscripted in Python 3, lists work in both 2 and 3.
            distribution_items = list(final_distribution.items())

            # singular case:
            # when the prediction is the one given in a 1-instance node
            if len(distribution_items) == 1:
                prediction, instances = distribution_items[0]
                if instances == 1:
                    return Prediction(
                        last_node.output,
                        path,
                        last_node.confidence,
                        distribution=(last_node.distribution
                                      if not self.weighted
                                      else last_node.weighted_distribution),
                        count=instances,
                        median=last_node.median,
                        distribution_unit=last_node.distribution_unit,
                        children=last_node.children,
                        d_min=last_node.min,
                        d_max=last_node.max)

            # when there's more instances, sort elements by their mean
            distribution = [
                list(element)
                for element in sorted(distribution_items,
                                      key=lambda x: x[0])]
            distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                 else 'counts')
            distribution = merge_bins(distribution, BINS_LIMIT)
            total_instances = sum(
                [instances for _, instances in distribution])

            if len(distribution) == 1:
                # where there's only one bin, there will be no error, but
                # we use a correction derived from the parent's error
                prediction = distribution[0][0]
                if total_instances < 2:
                    total_instances = 1
                try:
                    # some strange models can have nodes with no confidence
                    confidence = round(
                        parent_node.confidence /
                        math.sqrt(total_instances),
                        PRECISION)
                except AttributeError:
                    confidence = None
            else:
                prediction = mean(distribution)
                confidence = round(
                    regression_error(
                        unbiased_sample_variance(distribution, prediction),
                        total_instances),
                    PRECISION)

            return Prediction(
                prediction,
                path,
                confidence,
                distribution=distribution,
                count=total_instances,
                median=dist_median(distribution, total_instances),
                distribution_unit=distribution_unit,
                children=last_node.children,
                d_min=d_min,
                d_max=d_max)
        else:
            # classification: rank categories by decreasing number of
            # instances, breaking ties by category
            distribution = [
                list(element)
                for element in sorted(final_distribution.items(),
                                      key=lambda x: (-x[1], x[0]))]
            return Prediction(
                distribution[0][0],
                path,
                ws_confidence(distribution[0][0], final_distribution,
                              ws_n=population),
                distribution=distribution,
                count=population,
                median=None,
                distribution_unit='categorical',
                children=last_node.children)
    else:
        # last prediction strategy: follow the first child whose
        # predicate accepts the input data
        if self.children:
            for child in self.children:
                if child.predicate.apply(input_data, self.fields):
                    path.append(child.predicate.to_rule(self.fields))
                    return child.predict(input_data, path=path)

        if self.weighted:
            output_distribution = self.weighted_distribution
            output_unit = self.weighted_distribution_unit
        else:
            output_distribution = self.distribution
            output_unit = self.distribution_unit

        return Prediction(
            self.output,
            path,
            self.confidence,
            distribution=output_distribution,
            count=get_instances(output_distribution),
            median=None if not self.regression else self.median,
            distribution_unit=output_unit,
            children=self.children,
            d_min=None if not self.regression else self.min,
            d_max=None if not self.regression else self.max)
def _predict(self, input_data, missing_strategy=LAST_PREDICTION,
             operating_point=None, operating_kind=None, unused_fields=None):
    """Build the prediction dict for the given field values.

    This function does not check the types of the input provided, so
    it is unsafe to use it directly without prior checking.

    When `operating_point` or `operating_kind` is given (classifications
    only) the prediction is delegated to `predict_operating` or
    `predict_operating_kind` respectively. Otherwise the tree is
    evaluated and the resulting prediction attributes are returned as a
    dict, with `output` renamed to `prediction`, `children` replaced by
    the `next` split field and, for non-boosted classifications, the
    `probability` of the predicted class added.
    """
    # Operating points and kinds need the probabilities (or confidences)
    # of every class, so dedicated methods handle them.
    if operating_point:
        if self.regression:
            raise ValueError("The operating_point argument can only be"
                             " used in classifications.")
        return self.predict_operating(
            input_data,
            missing_strategy=missing_strategy,
            operating_point=operating_point)

    if operating_kind:
        if self.regression:
            raise ValueError("The operating_kind argument can only be"
                             " used in classifications.")
        return self.predict_operating_kind(
            input_data,
            missing_strategy=missing_strategy,
            operating_kind=operating_kind)

    prediction = tree_predict(
        self.tree, self.tree_type, self.weighted, self.fields,
        input_data, missing_strategy=missing_strategy)

    if self.boosting and missing_strategy == PROPORTIONAL:
        # the output comes in a different format and must be recomputed
        g_sum, h_sum, population, path = prediction
        regularization = self.boosting.get("lambda", 1)
        prediction = Prediction(
            - g_sum / (h_sum + regularization),
            path,
            None,
            distribution=None,
            count=population,
            median=None,
            distribution_unit=None)

    # vars() exposes the attribute dict of the prediction itself, so the
    # edits below also act on `prediction`: children must be read
    # before its key is removed.
    result = vars(prediction)

    # the output is published under the "prediction" key
    result['prediction'] = result.pop('output')

    # field used by the next split, by name when the model knows it
    children = prediction.children
    if len(children) == 0:
        next_field = None
    else:
        next_field = children[0][FIELD_OFFSET]
    if next_field is not None and next_field in self.model_fields:
        next_field = self.model_fields[next_field]['name']
    result['next'] = next_field
    del result['children']

    if not (self.regression or self.boosting):
        probabilities = self._probabilities(result['distribution'])
        result['probability'] = probabilities[result['prediction']]

    # adding unused fields, if any
    if unused_fields:
        result['unused_fields'] = unused_fields

    return result
def regression_proportional_predict(tree, weighted, fields, input_data):
    """Proportional prediction for regressions.

    tree: node structure to start the prediction from
    weighted: selects (through `OFFSETS[str(weighted)]`) the node layout
              and whether the weighted distribution is used
    fields: fields information passed on to `proportional_predict`
    input_data: input data to be predicted

    Returns a `Prediction` built either from the single 1-instance node
    that was reached or from the merged distribution of every node
    considered.
    """
    offset = OFFSETS[str(weighted)]
    (final_distribution, d_min, d_max, last_node, population,
     parent_node, path) = proportional_predict(
         tree, offset, fields, input_data, path=None)

    distribution_items = list(final_distribution.items())

    # singular case:
    # when the prediction is the one given in a 1-instance node
    if len(distribution_items) == 1:
        prediction, instances = distribution_items[0]
        if instances == 1:
            return Prediction(
                last_node[offset["output"]],
                path,
                last_node[offset["confidence"]],
                distribution=(last_node[offset["distribution"]]
                              if not weighted
                              else last_node[offset["wdistribution"]]),
                count=instances,
                median=last_node[offset["median"]],
                distribution_unit=last_node[offset["distribution_unit"]],
                children=([] if last_node[offset["children#"]] == 0
                          else last_node[offset["children"]]),
                d_min=last_node[offset["min"]],
                d_max=last_node[offset["max"]])

    # when there's more instances, sort elements by their mean
    distribution = [list(element)
                    for element in sorted(distribution_items,
                                          key=lambda x: x[0])]
    distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                         else 'counts')
    distribution = merge_bins(distribution, BINS_LIMIT)
    total_instances = sum([instances for _, instances in distribution])

    if len(distribution) == 1:
        # where there's only one bin, there will be no error, but
        # we use a correction derived from the parent's error
        prediction = distribution[0][0]
        if total_instances < 2:
            total_instances = 1
        try:
            # some strange models can have nodes with no confidence
            confidence = round(
                parent_node[offset["confidence"]] /
                math.sqrt(total_instances),
                PRECISION)
        except (AttributeError, TypeError, KeyError, IndexError):
            # Nodes are accessed by subscript here, so a missing parent
            # or a missing/None confidence surfaces as TypeError,
            # KeyError or IndexError rather than AttributeError.
            confidence = None
    else:
        prediction = mean(distribution)
        # weighted trees use the unweighted population to
        # compute the associated error
        confidence = round(
            regression_error(
                unbiased_sample_variance(distribution, prediction),
                population),
            PRECISION)

    return Prediction(
        prediction,
        path,
        confidence,
        distribution=distribution,
        count=total_instances,
        median=dist_median(distribution, total_instances),
        distribution_unit=distribution_unit,
        children=([] if last_node[offset["children#"]] == 0
                  else last_node[offset["children"]]),
        d_min=d_min,
        d_max=d_max)
def predict(self, input_data, by_name=True, print_path=False,
            out=sys.stdout, with_confidence=False,
            missing_strategy=LAST_PREDICTION, add_confidence=False,
            add_path=False, add_distribution=False, add_count=False,
            add_median=False, add_next=False, add_min=False,
            add_max=False, add_unused_fields=False, multiple=None):
    """Predict the objective field for the given field values.

    Input fields are keyed by field name by default; pass
    `by_name=False` to key them directly by field id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    print_path: Boolean, if True the rules that lead to the prediction
                are printed
    out: output handler
    with_confidence: Boolean, if True, all the information in the node
                     (prediction, confidence, distribution and count)
                     is returned in a list format
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    add_confidence: Boolean, if True adds confidence to the dict output
    add_path: Boolean, if True adds path to the dict output
    add_distribution: Boolean, if True adds distribution info to the
                      dict output
    add_count: Boolean, if True adds the number of instances in the
               node to the dict output
    add_median: Boolean, if True adds the median of the values in
                the distribution
    add_next: Boolean, if True adds the field that determines next
              split in the tree
    add_min: Boolean, if True adds the minimum value in the prediction's
             distribution (for regressions only)
    add_max: Boolean, if True adds the maximum value in the prediction's
             distribution (for regressions only)
    add_unused_fields: Boolean, if True adds the information about the
                       fields in the input_data that are not being used
                       in the model as predictors.
    multiple: For categorical fields, it will return the categories
              in the distribution of the predicted node as a
              list of dicts:
                [{'prediction': 'Iris-setosa',
                  'confidence': 0.9154
                  'probability': 0.97
                  'count': 97},
                 {'prediction': 'Iris-virginica',
                  'confidence': 0.0103
                  'probability': 0.03,
                  'count': 3}]
              The value of this argument can either be an integer
              (maximum number of categories to be returned), or the
              literal 'all', that will cause the entire distribution
              in the node to be returned.
    """
    # Regressions with the PROPORTIONAL strategy are refused when the
    # model is not flagged as regression_ready.
    if (not self.boosting and self.regression
            and missing_strategy == PROPORTIONAL
            and not self.regression_ready):
        raise ImportError("Failed to find the numpy and scipy libraries,"
                          " needed to use proportional missing strategy"
                          " for regressions. Please install them before"
                          " using local predictions for the model.")

    # Keep only the fields used in the model (and, on request, the list
    # of the ones that were dropped).
    filtered = self.filter_input_data(
        input_data, by_name=by_name, add_unused_fields=add_unused_fields)
    if add_unused_fields:
        input_data, unused_fields = filtered
    else:
        input_data = filtered

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    prediction = self.tree.predict(input_data,
                                   missing_strategy=missing_strategy)

    if self.boosting and missing_strategy == PROPORTIONAL:
        # the output comes in a different format and must be recomputed
        g_sum, h_sum, population, path = prediction
        regularization = self.boosting.get("lambda", 1)
        prediction = Prediction(
            -g_sum / (h_sum + regularization),
            path,
            None,
            distribution=None,
            count=population,
            median=None,
            distribution_unit=None)

    # Prediction path
    if print_path:
        rules = u' AND '.join(prediction.path)
        out.write(utf8(rules + u' => %s \n' % prediction.output))
        out.flush()

    output = prediction.output
    if with_confidence:
        output = [prediction.output,
                  prediction.confidence,
                  prediction.distribution,
                  prediction.count,
                  prediction.median]

    wants_details = any((add_confidence, add_path, add_distribution,
                         add_count, add_median, add_next, add_min,
                         add_max, add_unused_fields))

    if multiple is not None and not self.regression:

        def _selected(index):
            """True when the category at `index` must be reported."""
            if isinstance(multiple, basestring) and multiple == 'all':
                return True
            return isinstance(multiple, int) and index < multiple

        output = []
        total_instances = float(prediction.count)
        for index, [category, instances] in enumerate(
                prediction.distribution):
            if not _selected(index):
                continue
            output.append({
                'prediction': category,
                'confidence': ws_confidence(category,
                                            prediction.distribution),
                'probability': instances / total_instances,
                'count': instances})
    elif wants_details:
        output = {'prediction': prediction.output}
        if add_confidence:
            output['confidence'] = prediction.confidence
        if add_path:
            output['path'] = prediction.path
        if add_distribution:
            output['distribution'] = prediction.distribution
            output['distribution_unit'] = prediction.distribution_unit
        if add_count:
            output['count'] = prediction.count
        if add_next:
            if len(prediction.children) == 0:
                field = None
            else:
                field = prediction.children[0].predicate.field
            if field is not None and field in self.fields:
                field = self.fields[field]['name']
            output['next'] = field
        if not self.boosting and self.regression:
            if add_median:
                output['median'] = prediction.median
            if add_min:
                output['min'] = prediction.min
            if add_max:
                output['max'] = prediction.max
        if add_unused_fields:
            output['unused_fields'] = unused_fields

    return output