def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both.
            The algorithm goes on until the final leaves are reached
            and all their predictions are used to decide the final
            prediction.

    :param input_data: dict of input field values keyed by field Id
    :param path: list of rule strings accumulated while descending
        the tree (mutated in place; a new list is used when None)
    :param missing_strategy: LAST_PREDICTION or PROPORTIONAL
    :return: a Prediction object
    """
    if path is None:
        path = []
    if missing_strategy == PROPORTIONAL:
        (final_distribution, d_min, d_max, last_node,
         population, parent_node) = self.predict_proportional(
             input_data, path=path)
        if self.regression:
            # Materialize the items once: dict views are not
            # subscriptable in Python 3, and this also avoids
            # building the view twice.
            items = list(final_distribution.items())
            # singular case:
            # when the prediction is the one given in a 1-instance node
            if len(items) == 1:
                prediction, instances = items[0]
                if instances == 1:
                    return Prediction(
                        last_node.output,
                        path,
                        last_node.confidence,
                        distribution=(last_node.distribution if not
                                      self.weighted else
                                      last_node.weighted_distribution),
                        count=instances,
                        median=last_node.median,
                        distribution_unit=last_node.distribution_unit,
                        children=last_node.children,
                        d_min=last_node.min,
                        d_max=last_node.max)
            # when there's more instances, sort elements by their mean
            distribution = [
                list(element) for element in
                sorted(items, key=lambda x: x[0])]
            distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                 else 'counts')
            distribution = merge_bins(distribution, BINS_LIMIT)
            total_instances = sum(
                instances for _, instances in distribution)
            if len(distribution) == 1:
                # where there's only one bin, there will be no error, but
                # we use a correction derived from the parent's error
                prediction = distribution[0][0]
                if total_instances < 2:
                    total_instances = 1
                try:
                    # some strange models can have nodes with no confidence
                    confidence = round(
                        parent_node.confidence /
                        math.sqrt(total_instances), PRECISION)
                except AttributeError:
                    confidence = None
            else:
                prediction = mean(distribution)
                confidence = round(
                    regression_error(
                        unbiased_sample_variance(distribution, prediction),
                        total_instances), PRECISION)
            return Prediction(
                prediction, path, confidence,
                distribution=distribution,
                count=total_instances,
                median=dist_median(distribution, total_instances),
                distribution_unit=distribution_unit,
                children=last_node.children,
                d_min=d_min,
                d_max=d_max)
        # categorical: pick the majority class; ties broken by the
        # lexically smaller category (sort by -count, then category)
        distribution = [
            list(element) for element in
            sorted(final_distribution.items(),
                   key=lambda x: (-x[1], x[0]))]
        return Prediction(
            distribution[0][0],
            path,
            ws_confidence(distribution[0][0], final_distribution,
                          ws_n=population),
            distribution=distribution,
            count=population,
            median=None,
            distribution_unit='categorical',
            children=last_node.children)
    # LAST_PREDICTION: descend while a child's predicate matches,
    # recording the rule taken at each step.
    if self.children:
        for child in self.children:
            if child.predicate.apply(input_data, self.fields):
                path.append(child.predicate.to_rule(self.fields))
                return child.predict(input_data, path=path)
    # leaf (or no matching child): answer with this node's own stats
    if self.weighted:
        output_distribution = self.weighted_distribution
        output_unit = self.weighted_distribution_unit
    else:
        output_distribution = self.distribution
        output_unit = self.distribution_unit
    return Prediction(
        self.output, path, self.confidence,
        distribution=output_distribution,
        count=get_instances(output_distribution),
        median=None if not self.regression else self.median,
        distribution_unit=output_unit,
        children=self.children,
        d_min=None if not self.regression else self.min,
        d_max=None if not self.regression else self.max)
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both.
            The algorithm goes on until the final leaves are reached
            and all their predictions are used to decide the final
            prediction.

    :param input_data: dict of input field values keyed by field Id
    :param path: list of rule strings accumulated while descending
        the tree (mutated in place; a new list is used when None)
    :param missing_strategy: LAST_PREDICTION or PROPORTIONAL
    :return: a Prediction object
    """
    if path is None:
        path = []
    if missing_strategy == PROPORTIONAL:
        (final_distribution,
         last_node) = self.predict_proportional(input_data, path=path)
        if self.regression:
            # Materialize the items once: dict views are not
            # subscriptable in Python 3, and this also avoids
            # building the view twice.
            items = list(final_distribution.items())
            # singular case:
            # when the prediction is the one given in a 1-instance node
            if len(items) == 1:
                prediction, instances = items[0]
                if instances == 1:
                    return Prediction(
                        last_node.output,
                        path,
                        last_node.confidence,
                        distribution=last_node.distribution,
                        count=instances,
                        median=last_node.median,
                        distribution_unit=last_node.distribution_unit,
                        children=last_node.children)
            # when there's more instances, sort elements by their mean
            distribution = [list(element) for element in
                            sorted(items, key=lambda x: x[0])]
            distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                                 else 'counts')
            distribution = merge_bins(distribution, BINS_LIMIT)
            total_instances = sum(instances
                                  for _, instances in distribution)
            prediction = mean(distribution)
            confidence = regression_error(
                unbiased_sample_variance(distribution, prediction),
                total_instances)
            return Prediction(
                prediction, path, confidence,
                distribution=distribution,
                count=total_instances,
                median=dist_median(distribution, total_instances),
                distribution_unit=distribution_unit,
                children=last_node.children)
        # categorical: pick the majority class; ties broken by the
        # lexically smaller category (sort by -count, then category)
        distribution = [list(element) for element in
                        sorted(final_distribution.items(),
                               key=lambda x: (-x[1], x[0]))]
        return Prediction(
            distribution[0][0],
            path,
            ws_confidence(distribution[0][0], final_distribution),
            distribution=distribution,
            count=get_instances(distribution),
            median=None,
            distribution_unit='categorical',
            children=last_node.children)
    # LAST_PREDICTION: descend while a child's predicate matches,
    # recording the rule taken at each step.
    if self.children:
        for child in self.children:
            if child.predicate.apply(input_data, self.fields):
                path.append(child.predicate.to_rule(self.fields))
                return child.predict(input_data, path=path)
    # leaf (or no matching child): answer with this node's own stats
    return Prediction(
        self.output, path, self.confidence,
        distribution=self.distribution,
        count=get_instances(self.distribution),
        median=None if not self.regression else self.median,
        distribution_unit=self.distribution_unit,
        children=self.children)
def regression_proportional_predict(tree, weighted, fields, input_data):
    """Proportional prediction for regressions

    Aggregates the distributions of every reachable leaf (via
    ``proportional_predict``) into a single regression Prediction.

    :param tree: node structure indexed by the OFFSETS layout
        (nodes appear to be positional sequences — attributes are read
        as ``node[offset["..."]]``)
    :param weighted: truthy for weighted trees; selects the offset
        layout and the weighted distribution field
    :param fields: fields structure used to evaluate predicates
    :param input_data: dict of input field values
    :return: a Prediction object
    """
    # offset table maps attribute names to positions in the node tuples;
    # keyed by the string form of the weighted flag
    offset = OFFSETS[str(weighted)]
    (final_distribution, d_min, d_max, last_node, population,
     parent_node, path) = proportional_predict( \
        tree, offset, fields, input_data, path=None)
    # singular case:
    # when the prediction is the one given in a 1-instance node
    if len(list(final_distribution.items())) == 1:
        prediction, instances = list(final_distribution.items())[0]
        if instances == 1:
            return Prediction( \
                last_node[offset["output"]],
                path,
                last_node[offset["confidence"]],
                distribution=last_node[offset["distribution"]] \
                    if not weighted else \
                    last_node[offset["wdistribution"]],
                count=instances,
                median=last_node[offset["median"]],
                distribution_unit=last_node[offset["distribution_unit"]],
                # "children#" holds the child count; 0 means a leaf
                children=[] if last_node[offset["children#"]] == 0 else \
                    last_node[offset["children"]],
                d_min=last_node[offset["min"]],
                d_max=last_node[offset["max"]])
    # when there's more instances, sort elements by their mean
    distribution = [
        list(element) for element in
        sorted(list(final_distribution.items()), key=lambda x: x[0])]
    distribution_unit = ('bins' if len(distribution) > BINS_LIMIT
                         else 'counts')
    distribution = merge_bins(distribution, BINS_LIMIT)
    total_instances = sum([instances for _, instances in distribution])
    if len(distribution) == 1:
        # where there's only one bin, there will be no error, but
        # we use a correction derived from the parent's error
        prediction = distribution[0][0]
        if total_instances < 2:
            total_instances = 1
        try:
            # some strange models can have nodes with no confidence
            confidence = round(
                parent_node[offset["confidence"]] /
                math.sqrt(total_instances),
                PRECISION)
        except AttributeError:
            confidence = None
    else:
        prediction = mean(distribution)
        # weighted trees use the unweighted population to
        # compute the associated error
        confidence = round(
            regression_error(
                unbiased_sample_variance(distribution, prediction),
                population), PRECISION)
    return Prediction( \
        prediction,
        path,
        confidence,
        distribution=distribution,
        count=total_instances,
        median=dist_median(distribution, total_instances),
        distribution_unit=distribution_unit,
        children=[] if last_node[offset["children#"]] == 0 else \
            last_node[offset["children"]],
        d_min=d_min,
        d_max=d_max)