def centroid(self, input_data, by_name=True): """Returns the id of the nearest centroid """ # Checks and cleans input_data leaving the fields used in the model clean_input_data = self.filter_input_data(input_data, by_name=by_name) # Checks that all numeric fields are present in input data and # fills them with the default average (if given) when otherwise try: self.fill_numeric_defaults(clean_input_data, self.default_numeric_value) except ValueError: raise Exception("Failed to predict a centroid. Input" " data must contain values for all " "numeric fields to find a centroid.") # Strips affixes for numeric values and casts to the final field type cast(clean_input_data, self.fields) unique_terms = self.get_unique_terms(clean_input_data) nearest = {'centroid_id': None, 'centroid_name': None, 'distance': float('inf')} for centroid in self.centroids: distance2 = centroid.distance2(clean_input_data, unique_terms, self.scales, stop_distance2=nearest['distance']) if distance2 is not None: nearest = {'centroid_id': centroid.centroid_id, 'centroid_name': centroid.name, 'distance': distance2} nearest['distance'] = math.sqrt(nearest['distance']) return nearest
def predict(self, input_data, by_name=True, print_path=False, out=sys.stdout, with_confidence=False, missing_strategy=LAST_PREDICTION): """Makes a prediction based on a number of field values. By default the input fields must be keyed by field name but you can use `by_name` to input them directly keyed by id. """ # Checks and cleans input_data leaving the fields used in the model input_data = self.filter_input_data(input_data, by_name=by_name) # Strips affixes for numeric values and casts to the final field type cast(input_data, self.fields) prediction_info = self.tree.predict(input_data, missing_strategy=missing_strategy) prediction, path, confidence, distribution, instances = prediction_info # Prediction path if print_path: out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction)) out.flush() if with_confidence: return [prediction, confidence, distribution, instances] return prediction
def anomaly_score(self, input_data, by_name=True):
    """Returns the anomaly score given by the iforest

    Each tree in the iforest is evaluated for its depth result (see the
    depth method in the AnomalyTree object for details). The average of
    these depths is the `observed_mean_depth`. An `expected_mean_depth`
    is derived from the forest's `sample_size` and `mean_depth`
    parameters, and both are combined below into a score between 0 and 1.
    """
    # Keep only the fields the model uses and cast them to field types
    input_data = self.filter_input_data(input_data, by_name=by_name)
    cast(input_data, self.fields)
    if self.iforest is None:
        raise Exception("We could not find the iforest information to "
                        "compute the anomaly score. Please, rebuild your "
                        "Anomaly object from a complete anomaly detector "
                        "resource.")
    # Average observed depth across every tree in the forest
    depth_sum = sum(tree.depth(input_data)[0] for tree in self.iforest)
    observed_mean_depth = float(depth_sum) / len(self.iforest)
    return math.pow(2, - observed_mean_depth / self.expected_mean_depth)
def centroid(self, input_data, by_name=True):
    """Returns the id of the nearest centroid
    """
    # Restrict the input to the fields the model actually uses
    input_data = self.filter_input_data(input_data, by_name=by_name)
    # Every non-optional (numeric) field must be present in the input
    for field_id, field in self.fields.items():
        if (field['optype'] not in OPTIONAL_FIELDS
                and field_id not in input_data):
            raise Exception("Failed to predict a centroid. Input"
                            " data must contain values for all "
                            "numeric fields to find a centroid.")
    # Strip affixes from numeric values and cast to final field types
    cast(input_data, self.fields)
    unique_terms = self.get_unique_terms(input_data)
    # `distance` tracks the best squared distance while scanning
    best = {'centroid_id': None, 'centroid_name': None,
            'distance': float('inf')}
    for candidate in self.centroids:
        # distance2 short-circuits (returns None) past the current best
        squared = candidate.distance2(input_data, unique_terms,
                                      self.scales,
                                      stop_distance2=best['distance'])
        if squared is not None:
            best = {'centroid_id': candidate.centroid_id,
                    'centroid_name': candidate.name,
                    'distance': squared}
    # Convert the squared distance into the reported distance
    best['distance'] = math.sqrt(best['distance'])
    return best
def predict(self, input_data, missing_strategy=LAST_PREDICTION,
            operating_point=None, full=False):
    """Makes a prediction based on a number of field values.

    input_data: Input data to be predicted
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    operating_point: In classification models, the point of the ROC
                     curve where the model will be used at. It is a map
                     with the positive_class (the class that is important
                     to predict accurately), the threshold (minimum value
                     for the positive_class to be predicted) and the kind
                     of measure the threshold applies to (probability or
                     confidence, if available), e.g.:
                       {"positive_class": "Iris-setosa",
                        "threshold": 0.5,
                        "kind": "probability"}
    full: Boolean; when True a dictionary with the prediction and its
          attributes is returned (prediction, probability,
          unused_fields...); otherwise only the prediction value.
    """
    # Clean the input, optionally collecting the fields it contains
    # that the model does not use
    unused_fields = []
    filtered = self.filter_input_data(input_data, add_unused_fields=full)
    if full:
        input_data, unused_fields = filtered
    else:
        input_data = filtered
    # Models built without missing numerics require all numeric inputs
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.model_fields)
    # Strip affixes from numeric values and cast to final field types
    cast(input_data, self.fields)
    full_prediction = self._predict(
        input_data, missing_strategy=missing_strategy,
        operating_point=operating_point, unused_fields=unused_fields)
    if not full:
        return full_prediction['prediction']
    # Drop the attributes that carry no information
    return {key: value for key, value in full_prediction.items()
            if value is not None}
def anomaly_score(self, input_data):
    """Returns the anomaly score given by the iforest

    Each tree in the iforest is evaluated for its depth result (see the
    depth method in the AnomalyTree object for details). The average of
    these depths gives the `observed_mean_depth`, which is combined with
    the `expected_mean_depth` (computed from the forest's `sample_size`
    and `mean_depth` parameters) into a score between 0 and 1.
    """
    # Keep only the fields the model uses and cast them to field types
    input_data = self.filter_input_data(input_data)
    cast(input_data, self.fields)
    if self.iforest is None:
        raise Exception("We could not find the iforest information to "
                        "compute the anomaly score. Please, rebuild your "
                        "Anomaly object from a complete anomaly detector "
                        "resource.")
    # Average observed depth across every tree in the forest
    depth_sum = sum(tree.depth(input_data)[0] for tree in self.iforest)
    observed_mean_depth = float(depth_sum) / len(self.iforest)
    return math.pow(2, - observed_mean_depth / self.expected_mean_depth)
def predict(self, input_data, by_name=True, print_path=False, out=sys.stdout, with_confidence=False, missing_strategy=LAST_PREDICTION): """Makes a prediction based on a number of field values. By default the input fields must be keyed by field name but you can use `by_name` to input them directly keyed by id. """ # Checks if this is a regression model, using PROPORTIONAL # missing_strategy if (self.tree.regression and missing_strategy == PROPORTIONAL and not self.regression_ready): raise ImportError("Failed to find the numpy and scipy libraries," " needed to use proportional missing strategy" " for regressions. Please install them before" " using local predictions for the model.") # Checks and cleans input_data leaving the fields used in the model input_data = self.filter_input_data(input_data, by_name=by_name) # Strips affixes for numeric values and casts to the final field type cast(input_data, self.fields) prediction_info = self.tree.predict(input_data, missing_strategy=missing_strategy) prediction, path, confidence, distribution, instances = prediction_info # Prediction path if print_path: out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction)) out.flush() if with_confidence: return [prediction, confidence, distribution, instances] return prediction
def predict(self, input_data, by_name=True, add_unused_fields=False):
    """Makes a prediction based on a number of field values.
    """
    # Keep only the fields used in the model, optionally collecting the
    # fields in the input that the model does not use
    filtered = self.filter_input_data(
        input_data, by_name=by_name, add_unused_fields=add_unused_fields)
    if add_unused_fields:
        input_data, unused_fields = filtered
    else:
        input_data = filtered
    # Strip affixes from numeric values and cast to final field types
    cast(input_data, self.fields)
    # Expand text and categorical fields into the model's input vector
    unique_terms = self.get_unique_terms(input_data)
    input_array = self.fill_array(input_data, unique_terms)
    # A network list means an ensemble of networks is evaluated
    if self.networks:
        return self.predict_list(input_array)
    return self.predict_single(input_array)
def centroid(self, input_data, by_name=True):
    """Returns the id of the nearest centroid

    Returns a dict with `centroid_id`, `centroid_name` and the
    Euclidean `distance` to the nearest centroid.
    """
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)
    # Checks that all numeric fields are present in input data
    for field_id, field in self.fields.items():
        # PEP 8 idiom: `x not in y` instead of `not x in y`
        if (field['optype'] not in OPTIONAL_FIELDS and
                field_id not in input_data):
            raise Exception("Failed to predict a centroid. Input"
                            " data must contain values for all "
                            "numeric fields to find a centroid.")
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)
    unique_terms = self.get_unique_terms(input_data)
    # `distance` holds the best squared distance while scanning
    nearest = {'centroid_id': None, 'centroid_name': None,
               'distance': float('inf')}
    for centroid in self.centroids:
        # distance2 returns None when it exceeds the current best bound
        distance2 = centroid.distance2(input_data, unique_terms,
                                       self.scales,
                                       stop_distance2=nearest['distance'])
        if distance2 is not None:
            nearest = {'centroid_id': centroid.centroid_id,
                       'centroid_name': centroid.name,
                       'distance': distance2}
    # Convert the squared distance into the reported distance
    nearest['distance'] = math.sqrt(nearest['distance'])
    return nearest
def predict(self, input_data, by_name=True, add_unused_fields=False, operating_point=None): """Makes a prediction based on a number of field values. input_data: Input data to be predicted by_name: Boolean, True if input_data is keyed by names add_unused_fields: Boolean, if True adds the information about the fields in the input_data that are not being used in the model as predictors. operating_point: In classification models, this is the point of the ROC curve where the model will be used at. The operating point can be defined in terms of: - the positive_class, the class that is important to predict accurately - the probability_threshold, the probability that is stablished as minimum for the positive_class to be predicted. The operating_point is then defined as a map with two attributes, e.g.: {"positive_class": "Iris-setosa", "probability_threshold": 0.5} """ # Checks and cleans input_data leaving the fields used in the model new_data = self.filter_input_data( \ input_data, by_name=by_name, add_unused_fields=add_unused_fields) if add_unused_fields: input_data, unused_fields = new_data else: input_data = new_data # Strips affixes for numeric values and casts to the final field type cast(input_data, self.fields) # When operating_point is used, we need the probabilities # of all possible classes to decide, so se use # the `predict_probability` method if operating_point: if self.regression: raise ValueError("The operating_point argument can only be" " used in classifications.") prediction = self.predict_operating( \ input_data, by_name=False, operating_point=operating_point) return prediction # Computes text and categorical field expansion unique_terms = self.get_unique_terms(input_data) input_array = self.fill_array(input_data, unique_terms) if self.networks: prediction = self.predict_list(input_array) else: prediction = self.predict_single(input_array) if add_unused_fields: prediction.update({"unused_fields": unused_fields}) return prediction
def predict(self, input_data, by_name=True):
    """Returns the class prediction and the probability distribution

    The result is a dict with `prediction`, its `probability` and the
    full `distribution` of per-category probabilities.
    """
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)
    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        for field_id, field in self.fields.items():
            # PEP 8 idiom: `x not in y` instead of `not x in y`
            if (field['optype'] not in OPTIONAL_FIELDS and
                    field_id not in input_data):
                raise Exception("Failed to predict. Input"
                                " data must contain values for all numeric"
                                " fields to get a logistic regression"
                                " prediction.")
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)
    if self.balance_fields:
        # Standardizes numeric inputs with the training mean and stddev
        for field in input_data:
            if self.fields[field]['optype'] == 'numeric':
                mean = self.fields[field]['summary']['mean']
                stddev = self.fields[field]['summary'][ \
                    'standard_deviation']
                input_data[field] = (input_data[field] - mean) / stddev
    # Compute text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)
    probabilities = {}
    total = 0
    # Iterating the dict directly; `.keys()` is redundant
    for category in self.coefficients:
        probability = self.category_probability( \
            input_data, unique_terms, category)
        order = self.categories[self.objective_id].index(category)
        probabilities[category] = {"category": category,
                                   "probability": probability,
                                   "order": order}
        total += probabilities[category]["probability"]
    # Normalizes probabilities so that they add up to 1
    for category in probabilities:
        probabilities[category]["probability"] /= total
    # Sorts by descending probability; ties broken by category order
    predictions = sorted(probabilities.items(),
                         key=lambda x: (x[1]["probability"],
                                        - x[1]["order"]),
                         reverse=True)
    # `order` is internal bookkeeping; strip it from the output
    for prediction, probability in predictions:
        del probability['order']
    prediction, probability = predictions[0]
    return {
        "prediction": prediction,
        "probability": probability["probability"],
        "distribution": [{"category": category,
                          "probability": probability["probability"]}
                         for category, probability in predictions]}
def predict(self, input_data, by_name=True, print_path=False, out=sys.stdout, with_confidence=False, missing_strategy=LAST_PREDICTION, add_confidence=False, add_path=False, add_distribution=False, add_count=False): """Makes a prediction based on a number of field values. By default the input fields must be keyed by field name but you can use `by_name` to input them directly keyed by id. """ # Checks if this is a regression model, using PROPORTIONAL # missing_strategy if (self.tree.regression and missing_strategy == PROPORTIONAL and not self.regression_ready): raise ImportError("Failed to find the numpy and scipy libraries," " needed to use proportional missing strategy" " for regressions. Please install them before" " using local predictions for the model.") # Checks and cleans input_data leaving the fields used in the model input_data = self.filter_input_data(input_data, by_name=by_name) # Strips affixes for numeric values and casts to the final field type cast(input_data, self.fields) prediction_info = self.tree.predict(input_data, missing_strategy=missing_strategy) prediction, path, confidence, distribution, instances = prediction_info # Prediction path if print_path: out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction)) out.flush() output = prediction if with_confidence: output = [prediction, confidence, distribution, instances] if add_confidence or add_path or add_distribution or add_count: output = {'prediction': prediction} if add_confidence: output.update({'confidence': confidence}) if add_path: rules = path output.update({'path': rules}) if add_distribution: output.update({'distribution': distribution}) if add_count: output.update({'count': instances}) return output
def predict(self, input_data, missing_strategy=LAST_PREDICTION,
            operating_point=None, full=False):
    """Makes a prediction based on a number of field values.

    input_data: Input data to be predicted
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    operating_point: In classification models, this is the point of the
                     ROC curve where the model will be used at. The
                     operating point can be defined in terms of:
                     - the positive_class, the class that is important to
                       predict accurately
                     - the probability_threshold, the probability that is
                       stablished as minimum for the positive_class to be
                       predicted.
                     The operating_point is then defined as a map with
                     two attributes, e.g.:
                       {"positive_class": "Iris-setosa",
                        "probability_threshold": 0.5}
    full: Boolean that controls whether to include the prediction's
          attributes. By default, only the prediction is produced. If set
          to True, the rest of available information is added in a
          dictionary format. The dictionary keys can be:
          - prediction: the prediction value
          - probability: prediction's probability
          - unused_fields: list of fields in the input data that are not
            being used in the model
    """
    # Checks and cleans input_data leaving the fields used in the model
    unused_fields = []
    new_data = self.filter_input_data( \
        input_data, add_unused_fields=full)
    if full:
        input_data, unused_fields = new_data
    else:
        input_data = new_data
    # Models built without missing numerics require all numeric inputs
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.fields)
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)
    full_prediction = self._predict( \
        input_data, missing_strategy=missing_strategy,
        operating_point=operating_point, unused_fields=unused_fields)
    if full:
        # `items()` works on both Python 2 and 3; the previous
        # `iteritems()` was removed in Python 3 and raised AttributeError
        return dict((key, value) for key, value in \
                    full_prediction.items() if value is not None)
    return full_prediction['prediction']
def predict(self, input_data, by_name=True):
    """Returns the class prediction and the probability distribution

    The result is a dict with `prediction`, its `probability` and the
    full `distribution` of per-category probabilities.
    """
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)
    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        for field_id, field in self.fields.items():
            # PEP 8 idiom: `x not in y` instead of `not x in y`
            if (field['optype'] not in OPTIONAL_FIELDS and
                    field_id not in input_data):
                raise Exception("Failed to predict. Input"
                                " data must contain values for all numeric"
                                " fields to get a logistic regression"
                                " prediction.")
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)
    # Compute text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)
    probabilities = {}
    total = 0
    # Iterating the dict directly; `.keys()` is redundant
    for category in self.coefficients:
        probability = self.category_probability( \
            input_data, unique_terms, category)
        order = self.categories[self.objective_id].index(category)
        probabilities[category] = {
            "category": category,
            "probability": probability,
            "order": order
        }
        total += probabilities[category]["probability"]
    # Normalizes probabilities so that they add up to 1
    for category in probabilities:
        probabilities[category]["probability"] /= total
    # Sorts by descending probability; ties broken by category order
    predictions = sorted(probabilities.items(),
                         key=lambda x: (x[1]["probability"],
                                        -x[1]["order"]),
                         reverse=True)
    # `order` is internal bookkeeping; strip it from the output
    for prediction, probability in predictions:
        del probability['order']
    prediction, probability = predictions[0]
    return {
        "prediction": prediction,
        "probability": probability["probability"],
        "distribution": [{
            "category": category,
            "probability": probability["probability"]
        } for category, probability in predictions]
    }
def predict(self, input_data, full=False): """Returns the prediction and the confidence intervals input_data: Input data to be predicted full: Boolean that controls whether to include the prediction's attributes. By default, only the prediction is produced. If set to True, the rest of available information is added in a dictionary format. The dictionary keys can be: - prediction: the prediction value - unused_fields: list of fields in the input data that are not being used in the model """ # Checks and cleans input_data leaving the fields used in the model unused_fields = [] norm_input_data = self.filter_input_data( \ input_data, add_unused_fields=full) if full: norm_input_data, unused_fields = norm_input_data # Strips affixes for numeric values and casts to the final field type cast(norm_input_data, self.fields) # In case that the training data has no missings, input data shouldn't check_no_training_missings(norm_input_data, self.model_fields, self.weight_field, self.objective_id) # Computes text and categorical field expansion unique_terms = self.get_unique_terms(norm_input_data) # Creates an input vector with the values for all expanded fields. input_array = self.expand_input(norm_input_data, unique_terms) compact_input_array = self.expand_input(norm_input_data, unique_terms, True) prediction = dot([flatten(self.coefficients)], [input_array])[0][0] result = { "prediction": prediction} if self.xtx_inverse: result.update({"confidence_bounds": self.confidence_bounds( \ compact_input_array)}) if full: result.update({"unused_fields": unused_fields}) else: result = result["prediction"] return result
def _prepare_for_distance(self, input_data):
    """Prepares the fields to be able to compute the distance2
    """
    # Keep only the fields the model uses (default numeric values are
    # applied by the filtering step when configured)
    norm_input_data = self.filter_input_data(input_data)
    # Strip affixes from numeric values and cast to final field types
    cast(norm_input_data, self.fields)
    # Return the cleaned input plus its text/categorical term expansion
    return norm_input_data, self.get_unique_terms(norm_input_data)
def predict(self, input_data, full=False): """Returns the prediction and the confidence intervals input_data: Input data to be predicted full: Boolean that controls whether to include the prediction's attributes. By default, only the prediction is produced. If set to True, the rest of available information is added in a dictionary format. The dictionary keys can be: - prediction: the prediction value - unused_fields: list of fields in the input data that are not being used in the model """ # Checks and cleans input_data leaving the fields used in the model unused_fields = [] new_data = self.filter_input_data( \ input_data, add_unused_fields=full) if full: new_data, unused_fields = new_data # Strips affixes for numeric values and casts to the final field type cast(new_data, self.fields) # In case that the training data has no missings, input data shouldn't check_no_training_missings(new_data, self.fields, self.weight_field, self.objective_id) # Computes text and categorical field expansion unique_terms = self.get_unique_terms(new_data) # Creates an input vector with the values for all expanded fields. input_array = self.expand_input(new_data, unique_terms) compact_input_array = self.expand_input(new_data, unique_terms, True) prediction = dot([flatten(self.coefficients)], [input_array])[0][0] result = { "prediction": prediction} if self.xtx_inverse is not None: result.update({"confidence_bounds": self.confidence_bounds( \ compact_input_array)}) if full: result.update({"unused_fields": unused_fields}) else: result = result["prediction"] return result
def centroid(self, input_data, by_name=True):
    """Returns the id of the nearest centroid

    Returns a dict with `centroid_id`, `centroid_name` and the
    Euclidean `distance` to the nearest centroid.
    """
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)
    # Checks that all numeric fields are present in input data
    for field_id, field in self.fields.items():
        # PEP 8 idiom: `x not in y` instead of `not x in y`
        if (field['optype'] not in ['categorical', 'text'] and
                field_id not in input_data):
            raise Exception("Failed to predict a centroid. Input"
                            " data must contain values for all "
                            "numeric fields to find a centroid.")
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)
    # Expands text fields into their unique terms and removes them from
    # the input data, as distance2 expects them separately
    unique_terms = {}
    for field_id in self.term_forms:
        if field_id in input_data:
            case_sensitive = self.term_analysis[field_id].get(
                'case_sensitive', True)
            token_mode = self.term_analysis[field_id].get(
                'token_mode', 'all')
            input_data_field = input_data.get(field_id, '')
            if token_mode != TM_FULL_TERM:
                terms = parse_terms(input_data_field,
                                    case_sensitive=case_sensitive)
            else:
                terms = []
            # Unless restricted to tokens, the full field content counts
            # as a term too
            if token_mode != TM_TOKENS:
                terms.append(input_data_field if case_sensitive
                             else input_data_field.lower())
            unique_terms[field_id] = get_unique_terms(
                terms, self.term_forms[field_id],
                self.tag_clouds.get(field_id, []))
            del input_data[field_id]
    # `distance` holds the best squared distance while scanning
    nearest = {'centroid_id': None, 'centroid_name': None,
               'distance': float('inf')}
    for centroid in self.centroids:
        # distance2 returns None when it exceeds the current best bound
        distance2 = centroid.distance2(input_data, unique_terms,
                                       self.scales,
                                       stop_distance2=nearest['distance'])
        if distance2 is not None:
            nearest = {'centroid_id': centroid.centroid_id,
                       'centroid_name': centroid.name,
                       'distance': distance2}
    # Convert the squared distance into the reported distance
    nearest['distance'] = math.sqrt(nearest['distance'])
    return nearest
def projection(self, input_data, max_components=None,
               variance_threshold=None, full=False):
    """Returns the projection of input data in the new components

    input_data: Input data to be projected
    """
    new_data = self.filter_input_data( \
        input_data, add_unused_fields=False)
    # Strips affixes for numeric values and casts to the final field type
    cast(new_data, self.fields)
    # Computes text and categorical field expansion into an input array of
    # terms and frequencies
    unique_terms = self.get_unique_terms(new_data)
    # Creates an input vector with the values for all expanded fields.
    # The input mask marks the non-missing or categorical fields
    # The `missings` variable is a boolean indicating whether there's
    # non-categorical fields missing
    input_array, missings, input_mask = self.expand_input(
        new_data, unique_terms)
    components = self.eigenvectors[:]
    # Truncate to the requested number of components, if any
    if max_components is not None:
        components = components[0:max_components]
    # Truncate at the first component whose cumulative variance exceeds
    # the threshold (cumulative variance is non-decreasing)
    if variance_threshold is not None:
        for index, cumulative in enumerate(self.cumulative_variance):
            if cumulative > variance_threshold:
                components = components[0:index + 1]
    result = [value[0] for value in dot(components, [input_array])]
    # if non-categorical fields values are missing in input data
    # there's an additional normalization
    if missings:
        missing_sums = self.missing_factors(input_mask)
        for index, value in enumerate(result):
            result[index] = value / missing_sums[index] \
                if missing_sums[index] > 0 else value
    # When `full` is set, label each value with its component name
    if full:
        result = dict(zip(["PC%s" % index \
            for index in range(1, len(components) + 1)], result))
    return result
def projection(self, input_data, max_components=None,
               variance_threshold=None, full=False):
    """Returns the projection of input data in the new components

    input_data: Input data to be projected
    """
    # Keep only the fields the model uses
    new_data = self.filter_input_data(input_data, add_unused_fields=False)
    # Cast the values to their final field types
    cast(new_data, self.fields)
    # Text/categorical expansion into terms and frequencies
    unique_terms = self.get_unique_terms(new_data)
    # `missings` flags missing non-categorical inputs; `input_mask` marks
    # the non-missing or categorical positions
    input_array, missings, input_mask = self.expand_input(new_data,
                                                          unique_terms)
    components = self.eigenvectors[:]
    if max_components is not None:
        components = components[0: max_components]
    if variance_threshold is not None:
        # Cumulative variance is non-decreasing, so the net effect is to
        # truncate at the first component exceeding the threshold
        for position, accumulated in enumerate(self.cumulative_variance):
            if accumulated > variance_threshold:
                components = components[0: position + 1]
    result = [row[0] for row in dot(components, [input_array])]
    # Re-normalize when non-categorical inputs were missing
    if missings:
        missing_sums = self.missing_factors(input_mask)
        result = [value / missing_sums[position]
                  if missing_sums[position] > 0 else value
                  for position, value in enumerate(result)]
    if full:
        # Label each projected value with its component name
        names = ["PC%s" % position
                 for position in range(1, len(components) + 1)]
        result = dict(zip(names, result))
    return result
def predict(self, input_data, by_name=True, print_path=False,
            out=sys.stdout, with_confidence=False):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.
    """
    # Strips None values (note: mutates the caller's dict, as before)
    empty_fields = [(key, value) for (key, value) in input_data.items()
                    if value is None]
    for (key, value) in empty_fields:
        del input_data[key]
    # Checks input_data keys against field names and filters the ones
    # used in the model
    if by_name:
        # PEP 8 idiom: `key not in y` instead of `not key in y`
        wrong_keys = [key for key in input_data.keys()
                      if key not in self.all_inverted_fields]
        if wrong_keys:
            # Lazy %-args: the message is only built if the record is
            # actually emitted
            LOGGER.error("Wrong field names in input data: %s",
                         ", ".join(wrong_keys))
        input_data = {self.inverted_fields[key]: value
                      for key, value in input_data.items()
                      if key in self.inverted_fields}
    else:
        input_data = {key: value
                      for key, value in input_data.items()
                      if key in self.tree.fields}
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.tree.fields)
    prediction_info = self.tree.predict(input_data)
    prediction, path, confidence, distribution, instances = prediction_info
    # Prediction path
    if print_path:
        out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
        out.flush()
    # Legacy list output: [prediction, confidence, distribution, instances]
    if with_confidence:
        return [prediction, confidence, distribution, instances]
    return prediction
def predict(self, input_data, by_name=True):
    """Returns the class prediction and the probability distribution

    The result is a dict with `prediction`, its `probability` and the
    full `distribution` of per-category probabilities.
    """
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)
    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        for field_id, field in self.fields.items():
            # PEP 8 idiom: `x not in y` instead of `not x in y`
            if (field['optype'] not in OPTIONAL_FIELDS and
                    field_id not in input_data):
                raise Exception("Failed to predict. Input"
                                " data must contain values for all numeric"
                                " fields to get a logistic regression"
                                " prediction.")
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)
    # Compute text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)
    probabilities = {}
    total = 0
    # Iterating the dict directly; `.keys()` is redundant
    for category in self.coefficients:
        coefficients = self.coefficients[category]
        probabilities[category] = self.category_probability(
            input_data, unique_terms, coefficients)
        total += probabilities[category]
    # Normalizes probabilities so that they add up to 1
    for category in probabilities:
        probabilities[category] /= total
    predictions = sorted(probabilities.items(), key=lambda x: x[1],
                         reverse=True)
    prediction, probability = predictions[0]
    return {
        "prediction": prediction,
        "probability": probability,
        "distribution": [{"category": category,
                          "probability": probability}
                         for category, probability in predictions]}
def _prepare_for_distance(self, input_data, by_name=True):
    """Prepares the fields to be able to compute the distance2
    """
    # Keep only the fields the model uses
    clean_input_data = self.filter_input_data(input_data, by_name=by_name)
    # Numeric fields must all have a value; fall back to the stored
    # default average, and fail when no default is available
    try:
        self.fill_numeric_defaults(clean_input_data,
                                   self.default_numeric_value)
    except ValueError:
        raise Exception("Missing values in input data. Input"
                        " data must contain values for all "
                        "numeric fields to compute a distance.")
    # Strip affixes from numeric values and cast to final field types
    cast(clean_input_data, self.fields)
    # Return the cleaned input plus its text/categorical term expansion
    return clean_input_data, self.get_unique_terms(clean_input_data)
def _prepare_for_distance(self, input_data):
    """Prepares the fields to be able to compute the distance2
    """
    # Keep only the fields the model uses
    clean_input_data = self.filter_input_data(input_data)
    # Numeric fields must all have a value; fall back to the stored
    # default average, and fail when no default is available
    try:
        self.fill_numeric_defaults(clean_input_data,
                                   self.default_numeric_value)
    except ValueError:
        raise Exception("Missing values in input data. Input"
                        " data must contain values for all "
                        "numeric fields to compute a distance.")
    # Strip affixes from numeric values and cast to final field types
    cast(clean_input_data, self.fields)
    # Return the cleaned input plus its text/categorical term expansion
    return clean_input_data, self.get_unique_terms(clean_input_data)
def predict(self, input_data, by_name=True, print_path=False,
            out=sys.stdout, with_confidence=False):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.
    """
    # Restrict the input to the fields the model knows about.
    input_data = self.filter_input_data(input_data, by_name=by_name)

    # Normalize numeric values (strip affixes, cast to the field type).
    cast(input_data, self.tree.fields)

    (prediction, path, confidence,
     distribution, instances) = self.tree.predict(input_data)

    if print_path:
        # Dump the rule path that produced this prediction.
        out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
        out.flush()

    if not with_confidence:
        return prediction
    return [prediction, confidence, distribution, instances]
def predict(self, input_data, by_name=True, add_unused_fields=False):
    """Returns the class prediction and the probability distribution

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    add_unused_fields: Boolean, if True adds the information about the
                       fields in the input_data that are not being used
                       in the model as predictors.
    """
    # Checks and cleans input_data leaving the fields used in the model.
    # When add_unused_fields is set, filter_input_data returns a
    # (data, unused_fields) pair instead of just the data dict.
    new_data = self.filter_input_data(
        input_data, by_name=by_name,
        add_unused_fields=add_unused_fields)
    if add_unused_fields:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        for field_id, field in self.fields.items():
            if (not field['optype'] in OPTIONAL_FIELDS and
                    not field_id in input_data):
                raise Exception("Failed to predict. Input"
                                " data must contain values for all numeric"
                                " fields to get a logistic regression"
                                " prediction.")
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    if self.balance_fields:
        # Standardize numeric inputs to zero mean / unit variance using
        # the training summary statistics.
        # NOTE(review): assumes 'standard_deviation' is non-zero for
        # every balanced numeric field — confirm upstream guarantees.
        for field in input_data:
            if self.fields[field]['optype'] == 'numeric':
                mean = self.fields[field]['summary']['mean']
                stddev = self.fields[field]['summary'][
                    'standard_deviation']
                input_data[field] = (input_data[field] - mean) / stddev

    # Compute text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)

    probabilities = {}
    total = 0
    for category in self.coefficients:
        probability = self.category_probability(
            input_data, unique_terms, category)
        # 'order' records the category's position in the objective
        # field's category list; it is used below to break probability
        # ties deterministically (earlier categories win).
        order = self.categories[self.objective_id].index(category)
        probabilities[category] = {"category": category,
                                   "probability": probability,
                                   "order": order}
        total += probabilities[category]["probability"]
    # Normalizes the contributions to get a probability
    for category in probabilities:
        probabilities[category]["probability"] /= total
    # Sort by probability (descending); ties broken by lower 'order'.
    predictions = sorted(probabilities.items(),
                         key=lambda x: (x[1]["probability"],
                                        - x[1]["order"]),
                         reverse=True)
    # 'order' is internal bookkeeping only; strip it before returning.
    for prediction, probability in predictions:
        del probability['order']
    prediction, probability = predictions[0]
    result = {
        "prediction": prediction,
        "probability": probability["probability"],
        "distribution": [{"category": category,
                          "probability": probability["probability"]}
                         for category, probability in predictions]}
    if add_unused_fields:
        result.update({'unused_fields': unused_fields})
    return result
def predict(self, input_data, by_name=True, add_unused_fields=False,
            operating_point=None, operating_kind=None):
    """Returns the class prediction and the probability distribution

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    add_unused_fields: Boolean, if True adds the information about the
                       fields in the input_data that are not being used
                       in the model as predictors.
    operating_point: In classification models, this is the point of the
                     ROC curve where the model will be used at. The
                     operating point can be defined in terms of:
                     - the positive_class, the class that is important to
                       predict accurately
                     - the probability_threshold, the probability that is
                       stablished as minimum for the positive_class to be
                       predicted.
                     The operating_point is then defined as a map with
                     two attributes, e.g.:
                       {"positive_class": "Iris-setosa",
                        "probability_threshold": 0.5}
    operating_kind: "probability". Sets the property that decides the
                    prediction. Used only if no operating_point is used
    """
    # Checks and cleans input_data leaving the fields used in the model
    new_data = self.filter_input_data(
        input_data, by_name=by_name,
        add_unused_fields=add_unused_fields)
    if add_unused_fields:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    # When operating_point is used, we need the probabilities of all
    # possible classes to decide, so we use the `predict_probability`
    # method
    if operating_point:
        return self.predict_operating(
            input_data, by_name=False, operating_point=operating_point)
    if operating_kind:
        return self.predict_operating_kind(
            input_data, by_name=False, operating_kind=operating_kind)

    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        for field_id, field in self.fields.items():
            if (not field['optype'] in OPTIONAL_FIELDS and
                    not field_id in input_data):
                raise Exception("Failed to predict. Input"
                                " data must contain values for all numeric"
                                " fields to get a logistic regression"
                                " prediction.")

    if self.balance_fields:
        balance_input(input_data, self.fields)

    # Computes text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)

    probabilities = {}
    total = 0
    # Computes the contributions for each category
    for category in self.coefficients:
        probability = self.category_probability(
            input_data, unique_terms, category)
        try:
            order = self.categories[self.objective_id].index(category)
        except ValueError:
            if category == u'':
                # The empty category (used for missings) sorts last.
                order = len(self.categories[self.objective_id])
            else:
                # BUG FIX: previously a non-empty unknown category left
                # `order` unbound, crashing below with a NameError.
                # Re-raise the original ValueError instead so the real
                # problem (category missing from the objective field's
                # category list) is reported.
                raise
        probabilities[category] = {"category": category,
                                   "probability": probability,
                                   "order": order}
        total += probabilities[category]["probability"]
    # Normalizes the contributions to get a probability
    for category in probabilities:
        probabilities[category]["probability"] /= total
        probabilities[category]["probability"] = round(
            probabilities[category]["probability"], PRECISION)
    # Chooses the most probable category as prediction; ties are broken
    # by the category's position in the objective field (lower wins).
    predictions = sorted(probabilities.items(),
                         key=lambda x: (x[1]["probability"],
                                        - x[1]["order"]),
                         reverse=True)
    # 'order' is internal bookkeeping only; strip it before returning.
    for prediction, probability in predictions:
        del probability['order']
    prediction, probability = predictions[0]
    result = {
        "prediction": prediction,
        "probability": probability["probability"],
        "distribution": [{"category": category,
                          "probability": probability["probability"]}
                         for category, probability in predictions]}
    if add_unused_fields:
        result.update({'unused_fields': unused_fields})
    return result
def predict(self, input_data, missing_strategy=LAST_PREDICTION,
            operating_point=None, operating_kind=None, full=False):
    """Makes a prediction based on a number of field values.

    input_data: Input data to be predicted
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    operating_point: In classification models, the point of the ROC
                     curve where the model will be used at, given as a
                     map with a positive_class and either a
                     probability_threshold or a confidence_threshold,
                     e.g.:
                       {"positive_class": "Iris-setosa",
                        "probability_threshold": 0.5}
    operating_kind: "probability" or "confidence". Sets the property
                    that decides the prediction. Used only if no
                    operating_point is used
    full: Boolean that controls whether to include the prediction's
          attributes. By default, only the prediction is produced. If
          set to True, the rest of available information is returned in
          a dictionary format (prediction, confidence, probability,
          path, count, next, min, max, median, unused_fields).
    """
    # Keep only the fields the model uses; optionally track the rest.
    unused_fields = []
    filtered = self.filter_input_data(input_data, add_unused_fields=full)
    if full:
        input_data, unused_fields = filtered
    else:
        input_data = filtered

    # Normalize numeric values (strip affixes, cast to the field type).
    cast(input_data, self.fields)

    full_prediction = self._predict(
        input_data, missing_strategy=missing_strategy,
        operating_point=operating_point, operating_kind=operating_kind,
        unused_fields=unused_fields)
    if not full:
        return full_prediction['prediction']
    # Drop the attributes that are unset for this particular prediction.
    return {key: value for key, value in full_prediction.items()
            if value is not None}
def predict(self, input_data, by_name=True, add_unused_fields=False):
    """Returns the class prediction and the probability distribution

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    add_unused_fields: Boolean, if True adds the information about the
                       fields in the input_data that are not being used
                       in the model as predictors.
    """
    # Checks and cleans input_data leaving the fields used in the model.
    # When add_unused_fields is set, filter_input_data returns a
    # (data, unused_fields) pair instead of just the data dict.
    new_data = self.filter_input_data(
        input_data, by_name=by_name,
        add_unused_fields=add_unused_fields)
    if add_unused_fields:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        for field_id, field in self.fields.items():
            if (not field['optype'] in OPTIONAL_FIELDS and
                    not field_id in input_data):
                raise Exception("Failed to predict. Input"
                                " data must contain values for all numeric"
                                " fields to get a logistic regression"
                                " prediction.")
    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    if self.balance_fields:
        # Standardize numeric inputs using training summary statistics.
        balance_input(input_data, self.fields)

    # Computes text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)

    probabilities = {}
    total = 0
    # Computes the contributions for each category
    for category in self.coefficients:
        probability = self.category_probability(
            input_data, unique_terms, category)
        # 'order' is the category's position in the objective field's
        # category list, used below as a deterministic tie-breaker.
        order = self.categories[self.objective_id].index(category)
        probabilities[category] = {"category": category,
                                   "probability": probability,
                                   "order": order}
        total += probabilities[category]["probability"]
    # Normalizes the contributions to get a probability
    for category in probabilities:
        probabilities[category]["probability"] /= total
    # Chooses the most probable category as prediction; probability ties
    # are broken by lower 'order'.
    predictions = sorted(probabilities.items(),
                         key=lambda x: (x[1]["probability"],
                                        - x[1]["order"]),
                         reverse=True)
    # 'order' is internal bookkeeping only; strip it before returning.
    for prediction, probability in predictions:
        del probability['order']
    prediction, probability = predictions[0]
    result = {
        "prediction": prediction,
        "probability": probability["probability"],
        "distribution": [{"category": category,
                          "probability": probability["probability"]}
                         for category, probability in predictions]}
    if add_unused_fields:
        result.update({'unused_fields': unused_fields})
    return result
def predict(self, input_data, operating_point=None, operating_kind=None,
            full=False):
    """Makes a prediction based on a number of field values.

    input_data: Input data to be predicted
    operating_point: In classification models, this is the point of the
                     ROC curve where the model will be used at. The
                     operating point can be defined in terms of:
                     - the positive_class, the class that is important to
                       predict accurately
                     - the probability_threshold, the probability that is
                       stablished as minimum for the positive_class to be
                       predicted.
                     The operating_point is then defined as a map with
                     two attributes, e.g.:
                       {"positive_class": "Iris-setosa",
                        "probability_threshold": 0.5}
    operating_kind: "probability". Sets the property that decides the
                    prediction. Used only if no operating_point is used
    full: Boolean that controls whether to include the prediction's
          attributes. By default, only the prediction is produced. If set
          to True, the rest of available information is added in a
          dictionary format. The dictionary keys can be:
              - prediction: the prediction value
              - probability: prediction's probability
              - unused_fields: list of fields in the input data that
                               are not being used in the model
    """
    # Checks and cleans input_data leaving the fields used in the model
    unused_fields = []
    new_data = self.filter_input_data(
        input_data, add_unused_fields=full)
    if full:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    # When operating_point is used, we need the probabilities of all
    # possible classes to decide, so we use the `predict_probability`
    # method. Both operating modes apply to classifications only.
    if operating_point:
        if self.regression:
            raise ValueError("The operating_point argument can only be"
                             " used in classifications.")
        return self.predict_operating(
            input_data, operating_point=operating_point)
    if operating_kind:
        if self.regression:
            raise ValueError("The operating_point argument can only be"
                             " used in classifications.")
        return self.predict_operating_kind(
            input_data, operating_kind=operating_kind)

    # Computes text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)

    # Flatten the input into the numeric array the network expects.
    input_array = self.fill_array(input_data, unique_terms)

    # Use the list of networks when available (ensembled deepnets),
    # otherwise the single network.
    if self.networks:
        prediction = self.predict_list(input_array)
    else:
        prediction = self.predict_single(input_array)
    if full:
        # Always hand back a dict when full output is requested.
        if not isinstance(prediction, dict):
            prediction = {"prediction": prediction}
        prediction.update({"unused_fields": unused_fields})
    else:
        # Collapse a dict result to the bare prediction value.
        if isinstance(prediction, dict):
            prediction = prediction["prediction"]
    return prediction
def predict(self, input_data, by_name=True, print_path=False,
            out=sys.stdout, with_confidence=False,
            missing_strategy=LAST_PREDICTION,
            add_confidence=False, add_path=False,
            add_distribution=False, add_count=False,
            add_median=False, add_next=False, multiple=None):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    print_path: Boolean, if True the rules that lead to the prediction
                are printed
    out: output handler
    with_confidence: Boolean, if True, all the information in the node
                     (prediction, confidence, distribution and count)
                     is returned in a list format
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    add_confidence: Boolean, if True adds confidence to the dict output
    add_path: Boolean, if True adds path to the dict output
    add_distribution: Boolean, if True adds distribution info to the
                      dict output
    add_count: Boolean, if True adds the number of instances in the
               node to the dict output
    add_median: Boolean, if True adds the median of the values in the
                distribution
    add_next: Boolean, if True adds the field that determines next
              split in the tree
    multiple: For categorical fields, it will return the categories in
              the distribution of the predicted node as a list of dicts,
              e.g.:
                [{'prediction': 'Iris-setosa',
                  'confidence': 0.9154, 'probability': 0.97,
                  'count': 97},
                 {'prediction': 'Iris-virginica',
                  'confidence': 0.0103, 'probability': 0.03,
                  'count': 3}]
              The value of this argument can either be an integer
              (maximum number of categories to be returned), or the
              literal 'all', that will cause the entire distribution
              in the node to be returned.
    """
    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    if (self.tree.regression and
            missing_strategy == PROPORTIONAL and
            not self.regression_ready):
        raise ImportError("Failed to find the numpy and scipy libraries,"
                          " needed to use proportional missing strategy"
                          " for regressions. Please install them before"
                          " using local predictions for the model.")
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    prediction = self.tree.predict(input_data,
                                   missing_strategy=missing_strategy)

    # Prediction path
    if print_path:
        out.write(
            utf8(u' AND '.join(prediction.path) +
                 u' => %s \n' % prediction.output))
        out.flush()
    output = prediction.output
    # Legacy list output: [output, confidence, distribution, count,
    # median]. Note it can be overridden by `multiple` below.
    if with_confidence:
        output = [
            prediction.output,
            prediction.confidence,
            prediction.distribution,
            prediction.count,
            prediction.median
        ]
    if multiple is not None and not self.tree.regression:
        # Return up to `multiple` categories (or all of them) from the
        # predicted node's distribution, with per-category confidence.
        output = []
        total_instances = float(prediction.count)
        distribution = enumerate(prediction.distribution)
        for index, [category, instances] in distribution:
            # NOTE(review): `basestring` makes this Python 2 only.
            if ((isinstance(multiple, basestring) and multiple == 'all')
                    or (isinstance(multiple, int) and index < multiple)):
                prediction_dict = {
                    'prediction': category,
                    'confidence': ws_confidence(category,
                                                prediction.distribution),
                    'probability': instances / total_instances,
                    'count': instances
                }
                output.append(prediction_dict)
    else:
        # Build a dict output only when at least one add_* flag asks
        # for extra node information.
        if (add_confidence or add_path or add_distribution or
                add_count or add_median or add_next):
            output = {'prediction': prediction.output}
            if add_confidence:
                output.update({'confidence': prediction.confidence})
            if add_path:
                output.update({'path': prediction.path})
            if add_distribution:
                output.update({
                    'distribution': prediction.distribution,
                    'distribution_unit': prediction.distribution_unit
                })
            if add_count:
                output.update({'count': prediction.count})
            # Median only makes sense for regressions.
            if self.tree.regression and add_median:
                output.update({'median': prediction.median})
            if add_next:
                field = (None if len(prediction.children) == 0 else
                         prediction.children[0].predicate.field)
                if field is not None and field in self.fields:
                    field = self.fields[field]['name']
                output.update({'next': field})
    return output
def predict(self, input_data, method=None,
            options=None, missing_strategy=LAST_PREDICTION,
            operating_point=None, operating_kind=None, median=False,
            full=False):
    """Makes a prediction based on the prediction made by every model.

    :param input_data: Test data to be used as input
    :param method: **deprecated**. Please check the `operating_kind`
                   attribute. Numeric key code for the following
                   combination methods in classifications/regressions:
          0 - majority vote (plurality)/ average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
          3 - threshold filtered vote / doesn't apply: THRESHOLD_CODE
    :param options: Options to be used in threshold filtered votes.
    :param missing_strategy: numeric key for the individual model's
                             prediction method. See the model predict
                             method.
    :param operating_point: In classification models, the point of the
                            ROC curve where the model will be used at,
                            defined as a map with three attributes:
                            positive_class, kind (probability,
                            confidence or voting) and threshold, e.g.:
                              {"positive_class": "Iris-setosa",
                               "kind": "probability",
                               "threshold": 0.5}
    :param operating_kind: "probability", "confidence" or "votes". Sets
                           the property that decides the prediction.
                           Used only if no operating_point is used
    :param median: Uses the median of each individual model's predicted
                   node as individual prediction for the specified
                   combination method.
    :param full: Boolean that controls whether to include the
                 prediction's attributes. By default, only the
                 prediction is produced. If set to True, the rest of
                 available information is added in a dictionary format
                 (prediction, confidence, probability, path, count,
                 next, min, max, median, unused_fields).
    """
    # Checks and cleans input_data leaving the fields used in the model
    new_data = self.filter_input_data(
        input_data, add_unused_fields=full)
    unused_fields = None
    if full:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    if median and method is None:
        # predictions with median are only available with old combiners
        method = PLURALITY_CODE

    if method is None and operating_point is None and \
            operating_kind is None and not median:
        # operating_point has precedence over operating_kind. If no
        # combiner is set, default operating kind is "probability"
        operating_kind = "probability"

    if operating_point:
        if self.regression:
            raise ValueError("The operating_point argument can only be"
                             " used in classifications.")
        prediction = self.predict_operating(
            input_data,
            missing_strategy=missing_strategy,
            operating_point=operating_point)
        if full:
            return prediction
        return prediction["prediction"]

    if operating_kind:
        if self.regression:
            # for regressions, operating_kind defaults to the old
            # combiners
            method = 1 if operating_kind == "confidence" else 0
            return self.predict(
                input_data, method=method, options=options,
                missing_strategy=missing_strategy, operating_point=None,
                operating_kind=None, full=full)
        prediction = self.predict_operating_kind(
            input_data,
            missing_strategy=missing_strategy,
            operating_kind=operating_kind)
        return prediction

    if len(self.models_splits) > 1:
        # If there's more than one chunk of models, they must be
        # sequentially used to generate the votes for the prediction
        votes = MultiVote([], boosting_offsets=self.boosting_offsets)

        for models_split in self.models_splits:
            models = self._get_models(models_split)
            multi_model = MultiModel(models,
                                     api=self.api,
                                     fields=self.fields)

            votes_split = multi_model._generate_votes(
                input_data,
                missing_strategy=missing_strategy,
                unused_fields=unused_fields)
            if median:
                # Replace each vote's prediction with the node's median.
                for prediction in votes_split.predictions:
                    prediction['prediction'] = prediction['median']
            votes.extend(votes_split.predictions)
    else:
        # When only one group of models is found you use the
        # corresponding multimodel to predict
        votes_split = self.multi_model._generate_votes(
            input_data, missing_strategy=missing_strategy,
            unused_fields=unused_fields)

        votes = MultiVote(votes_split.predictions,
                          boosting_offsets=self.boosting_offsets)
        if median:
            for prediction in votes.predictions:
                prediction['prediction'] = prediction['median']

    if self.boosting is not None and not self.regression:
        # Boosted classifications need the full category list to
        # combine the per-class boosting outputs.
        categories = [
            d[0] for d in
            self.fields[self.objective_id]["summary"]["categories"]]
        options = {"categories": categories}
    result = votes.combine(method=method, options=options,
                           full=full)
    if full:
        # A field counts as unused only if every individual model
        # ignored it, hence the intersection across all votes.
        unused_fields = set(input_data.keys())
        for prediction in votes.predictions:
            unused_fields = unused_fields.intersection(
                set(prediction.get("unused_fields", [])))
        if not isinstance(result, dict):
            result = {"prediction": result}
        result['unused_fields'] = list(unused_fields)

    return result
def predict(self, input_data, operating_point=None, operating_kind=None,
            full=False):
    """Returns the class prediction and the probability distribution

    input_data: Input data to be predicted
    operating_point: In classification models, this is the point of the
                     ROC curve where the model will be used at. The
                     operating point can be defined in terms of:
                     - the positive_class, the class that is important to
                       predict accurately
                     - the probability_threshold, the probability that is
                       stablished as minimum for the positive_class to be
                       predicted.
                     The operating_point is then defined as a map with
                     two attributes, e.g.:
                       {"positive_class": "Iris-setosa",
                        "probability_threshold": 0.5}
    operating_kind: "probability". Sets the property that decides the
                    prediction. Used only if no operating_point is used
    full: Boolean that controls whether to include the prediction's
          attributes. By default, only the prediction is produced. If set
          to True, the rest of available information is added in a
          dictionary format. The dictionary keys can be:
              - prediction: the prediction value
              - probability: prediction's probability
              - distribution: distribution of probabilities for each
                              of the objective field classes
              - unused_fields: list of fields in the input data that
                               are not being used in the model
    """
    # Checks and cleans input_data leaving the fields used in the model
    unused_fields = []
    new_data = self.filter_input_data(
        input_data, add_unused_fields=full)
    if full:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    # When operating_point is used, we need the probabilities of all
    # possible classes to decide, so we use the `predict_probability`
    # method
    if operating_point:
        return self.predict_operating(
            input_data, operating_point=operating_point)
    if operating_kind:
        return self.predict_operating_kind(
            input_data, operating_kind=operating_kind)

    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.model_fields,
                                  self.weight_field)

    if self.balance_fields:
        balance_input(input_data, self.fields)

    # Computes text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)

    probabilities = {}
    total = 0
    # Computes the contributions for each category
    for category in self.coefficients:
        probability = self.category_probability(
            input_data, unique_terms, category)
        try:
            order = self.categories[self.objective_id].index(category)
        except ValueError:
            if category == '':
                # The empty category (used for missings) sorts last.
                order = len(self.categories[self.objective_id])
            else:
                # BUG FIX: previously a non-empty unknown category left
                # `order` unbound, crashing below with a NameError.
                # Re-raise the original ValueError instead so the real
                # problem (category missing from the objective field's
                # category list) is reported.
                raise
        probabilities[category] = {
            "category": category,
            "probability": probability,
            "order": order
        }
        total += probabilities[category]["probability"]
    # Normalizes the contributions to get a probability
    for category in probabilities:
        probabilities[category]["probability"] /= total
        probabilities[category]["probability"] = round(
            probabilities[category]["probability"], PRECISION)
    # Chooses the most probable category as prediction; ties are broken
    # by the category's position in the objective field (lower wins).
    predictions = sorted(list(probabilities.items()),
                         key=lambda x: (x[1]["probability"],
                                        -x[1]["order"]),
                         reverse=True)
    # 'order' is internal bookkeeping only; strip it before returning.
    for prediction, probability in predictions:
        del probability['order']
    prediction, probability = predictions[0]
    result = {
        "prediction": prediction,
        "probability": probability["probability"],
        "distribution": [{
            "category": category,
            "probability": probability["probability"]
        } for category, probability in predictions]
    }
    if full:
        result.update({'unused_fields': unused_fields})
    else:
        result = result["prediction"]
    return result
def predict(self, input_data, by_name=True, print_path=False,
            out=sys.stdout, with_confidence=False,
            missing_strategy=LAST_PREDICTION,
            add_confidence=False, add_path=False,
            add_distribution=False, add_count=False,
            add_median=False, add_next=False, add_min=False,
            add_max=False, multiple=None):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    print_path: Boolean, if True the rules that lead to the prediction
                are printed
    out: output handler
    with_confidence: Boolean, if True, all the information in the node
                     (prediction, confidence, distribution and count)
                     is returned in a list format
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    add_confidence: Boolean, if True adds confidence to the dict output
    add_path: Boolean, if True adds path to the dict output
    add_distribution: Boolean, if True adds distribution info to the
                      dict output
    add_count: Boolean, if True adds the number of instances in the
               node to the dict output
    add_median: Boolean, if True adds the median of the values in the
                distribution
    add_next: Boolean, if True adds the field that determines next
              split in the tree
    add_min: Boolean, if True adds the minimum value in the prediction's
             distribution (for regressions only)
    add_max: Boolean, if True adds the maximum value in the prediction's
             distribution (for regressions only)
    multiple: For categorical fields, it will return the categories in
              the distribution of the predicted node as a list of dicts,
              e.g.:
                [{'prediction': 'Iris-setosa',
                  'confidence': 0.9154, 'probability': 0.97,
                  'count': 97},
                 {'prediction': 'Iris-virginica',
                  'confidence': 0.0103, 'probability': 0.03,
                  'count': 3}]
              The value of this argument can either be an integer
              (maximum number of categories to be returned), or the
              literal 'all', that will cause the entire distribution
              in the node to be returned.
    """
    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    if (self.tree.regression and
            missing_strategy == PROPORTIONAL and
            not self.regression_ready):
        raise ImportError("Failed to find the numpy and scipy libraries,"
                          " needed to use proportional missing strategy"
                          " for regressions. Please install them before"
                          " using local predictions for the model.")
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    prediction = self.tree.predict(input_data,
                                   missing_strategy=missing_strategy)

    # Prediction path
    if print_path:
        out.write(utf8(u' AND '.join(prediction.path) +
                       u' => %s \n' % prediction.output))
        out.flush()
    output = prediction.output
    # Legacy list output: [output, confidence, distribution, count,
    # median]. Note it can be overridden by `multiple` below.
    if with_confidence:
        output = [prediction.output,
                  prediction.confidence,
                  prediction.distribution,
                  prediction.count,
                  prediction.median]
    if multiple is not None and not self.tree.regression:
        # Return up to `multiple` categories (or all of them) from the
        # predicted node's distribution, with per-category confidence.
        output = []
        total_instances = float(prediction.count)
        distribution = enumerate(prediction.distribution)
        for index, [category, instances] in distribution:
            # NOTE(review): `basestring` makes this Python 2 only.
            if ((isinstance(multiple, basestring) and multiple == 'all')
                    or (isinstance(multiple, int) and index < multiple)):
                prediction_dict = {
                    'prediction': category,
                    'confidence': ws_confidence(category,
                                                prediction.distribution),
                    'probability': instances / total_instances,
                    'count': instances}
                output.append(prediction_dict)
    else:
        # Build a dict output only when at least one add_* flag asks
        # for extra node information.
        if (add_confidence or add_path or add_distribution or add_count
                or add_median or add_next or add_min or add_max):
            output = {'prediction': prediction.output}
            if add_confidence:
                output.update({'confidence': prediction.confidence})
            if add_path:
                output.update({'path': prediction.path})
            if add_distribution:
                output.update(
                    {'distribution': prediction.distribution,
                     'distribution_unit': prediction.distribution_unit})
            if add_count:
                output.update({'count': prediction.count})
            # Median/min/max only make sense for regressions.
            if self.tree.regression and add_median:
                output.update({'median': prediction.median})
            if add_next:
                field = (None if len(prediction.children) == 0 else
                         prediction.children[0].predicate.field)
                if field is not None and field in self.fields:
                    field = self.fields[field]['name']
                output.update({'next': field})
            if self.tree.regression and add_min:
                output.update({'min': prediction.min})
            if self.tree.regression and add_max:
                output.update({'max': prediction.max})
    return output