def predict(self, input_data, missing_strategy=LAST_PREDICTION,
            operating_point=None, full=False):
    """Makes a prediction based on a number of field values.

    input_data: Input data to be predicted
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    operating_point: In classification models, this is the point of the
                     ROC curve where the model will be used at. The
                     operating point can be defined in terms of:
                       - the positive_class, the class that is
                         important to predict accurately
                       - the threshold, the value that is established
                         as minimum for the positive_class to be
                         predicted
                       - the kind of measure used to set a threshold:
                         probability or confidence (if available)
                     The operating_point is then defined as a map with
                     those attributes, e.g.:
                       {"positive_class": "Iris-setosa",
                        "threshold": 0.5,
                        "kind": "probability"}
    full: Boolean that controls whether to include the prediction's
          attributes. By default, only the prediction is produced. If
          set to True, the rest of available information is added in a
          dictionary format. The dictionary keys can be:
            - prediction: the prediction value
            - probability: prediction's probability
            - unused_fields: list of fields in the input data that
                             are not being used in the model
    """
    # Keep only the fields used in the model. When the full output is
    # requested, filter_input_data also hands back the discarded fields.
    filtered = self.filter_input_data(input_data, add_unused_fields=full)
    if full:
        input_data, unused_fields = filtered
    else:
        input_data, unused_fields = filtered, []

    # Models flagged as not accepting missing numerics validate the
    # input before predicting
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.model_fields)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    full_prediction = self._predict(
        input_data,
        missing_strategy=missing_strategy,
        operating_point=operating_point,
        unused_fields=unused_fields)

    if not full:
        return full_prediction['prediction']

    # Attributes that carry no value are left out of the full output
    return {key: value for key, value in full_prediction.items()
            if value is not None}
def predict(self, input_data, missing_strategy=LAST_PREDICTION,
            operating_point=None, full=False):
    """Makes a prediction based on a number of field values.

    input_data: Input data to be predicted
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    operating_point: In classification models, this is the point of the
                     ROC curve where the model will be used at. The
                     operating point can be defined in terms of:
                       - the positive_class, the class that is
                         important to predict accurately
                       - the probability_threshold, the probability
                         that is established as minimum for the
                         positive_class to be predicted.
                     The operating_point is then defined as a map with
                     two attributes, e.g.:
                       {"positive_class": "Iris-setosa",
                        "probability_threshold": 0.5}
    full: Boolean that controls whether to include the prediction's
          attributes. By default, only the prediction is produced. If
          set to True, the rest of available information is added in a
          dictionary format. The dictionary keys can be:
            - prediction: the prediction value
            - probability: prediction's probability
            - unused_fields: list of fields in the input data that
                             are not being used in the model

    Returns the bare prediction value, or, when `full` is True, a
    dictionary with every attribute of the prediction whose value is
    not None.
    """
    # Checks and cleans input_data leaving the fields used in the model
    unused_fields = []
    new_data = self.filter_input_data(input_data, add_unused_fields=full)
    if full:
        # with add_unused_fields=True the filter returns a pair
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Models flagged as not accepting missing numerics validate the
    # input before predicting
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.fields)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    full_prediction = self._predict(
        input_data,
        missing_strategy=missing_strategy,
        operating_point=operating_point,
        unused_fields=unused_fields)
    if full:
        # `items()` works on both Python 2 and Python 3 dictionaries,
        # whereas `iteritems()` only exists in Python 2
        return dict((key, value)
                    for key, value in full_prediction.items()
                    if value is not None)

    return full_prediction['prediction']
def predict(self, input_data, operating_point=None, operating_kind=None,
            full=False):
    """Returns the class prediction and the probability distribution

    input_data: Input data to be predicted
    operating_point: In classification models, this is the point of the
                     ROC curve where the model will be used at. The
                     operating point can be defined in terms of:
                       - the positive_class, the class that is
                         important to predict accurately
                       - the probability_threshold, the probability
                         that is established as minimum for the
                         positive_class to be predicted.
                     The operating_point is then defined as a map with
                     two attributes, e.g.:
                       {"positive_class": "Iris-setosa",
                        "probability_threshold": 0.5}
    operating_kind: "probability". Sets the property that decides the
                    prediction. Used only if no operating_point is used
    full: Boolean that controls whether to include the prediction's
          attributes. By default, only the prediction is produced. If
          set to True, the rest of available information is added in a
          dictionary format. The dictionary keys can be:
            - prediction: the prediction value
            - probability: prediction's probability
            - distribution: distribution of probabilities for each
                            of the objective field classes
            - unused_fields: list of fields in the input data that
                             are not being used in the model

    When `operating_point` or `operating_kind` is given, the value
    returned is whatever `predict_operating` or
    `predict_operating_kind` produces for the cleaned input data.
    """
    # Checks and cleans input_data leaving the fields used in the model
    unused_fields = []
    new_data = self.filter_input_data(input_data, add_unused_fields=full)
    if full:
        # with add_unused_fields=True the filter returns a pair
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    # When operating_point (or operating_kind) is used, the decision is
    # delegated to the corresponding `predict_operating*` method
    if operating_point:
        return self.predict_operating(
            input_data, operating_point=operating_point)
    if operating_kind:
        return self.predict_operating_kind(
            input_data, operating_kind=operating_kind)

    # In case that missing_numerics is False, checks that all numeric
    # fields are present in input data.
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.model_fields,
                                  self.weight_field)

    if self.balance_fields:
        balance_input(input_data, self.fields)

    # Computes text and categorical field expansion
    unique_terms = self.get_unique_terms(input_data)

    probabilities = {}
    total = 0
    # Computes the contributions for each category
    for category in self.coefficients:
        probability = self.category_probability(
            input_data, unique_terms, category)
        objective_categories = self.categories[self.objective_id]
        try:
            order = objective_categories.index(category)
        except ValueError:
            # Categories that are not listed for the objective field
            # (e.g. the '' category) are ranked after the listed ones.
            # Always assigning here keeps `order` from being unbound
            # or carried over from the previous category.
            order = len(objective_categories)
        probabilities[category] = {
            "category": category,
            "probability": probability,
            "order": order
        }
        total += probability

    # Normalizes the contributions to get a probability
    for info in probabilities.values():
        info["probability"] = round(info["probability"] / total,
                                    PRECISION)

    # Chooses the most probable category as prediction. Ties are broken
    # by the lowest `order` (negated because the sort is reversed)
    predictions = sorted(
        probabilities.items(),
        key=lambda x: (x[1]["probability"], -x[1]["order"]),
        reverse=True)
    # `order` is only needed for sorting: it is not part of the output
    for _, info in predictions:
        del info['order']

    prediction, probability = predictions[0]
    result = {
        "prediction": prediction,
        "probability": probability["probability"],
        "distribution": [{
            "category": category,
            "probability": info["probability"]
        } for category, info in predictions]
    }

    if full:
        result.update({'unused_fields': unused_fields})
    else:
        result = result["prediction"]

    return result
def predict_probability(self, input_data,
                        missing_strategy=LAST_PREDICTION,
                        compact=False):
    """Combines the predictions of the component models.

    For classification models, predicts a probability for each
    possible output class, based on input values. The input fields
    must be a dictionary keyed by field name or field ID.

    For regressions, the output holds the single combined prediction.

    :param input_data: Input data to be predicted
    :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing
                             strategy for missing fields
    :param compact: For classifications: if False, the prediction is
                    returned as a list of maps, one per class, with the
                    keys "category" and "probability" mapped to the
                    name of the class and its probability,
                    respectively. If True, the combined distribution
                    is returned as produced by the votes.
                    For regressions: if True, a one-element list with
                    the prediction is returned; if False, a map with
                    the "prediction" key.
    """
    votes = MultiVoteList([])
    if not self.missing_numerics:
        check_no_missing_numerics(input_data, self.fields)

    for models_split in self.models_splits:
        # Every component of the split is instantiated before any of
        # them is asked for a prediction
        models = [
            Fusion(component, api=self.api)
            if get_resource_type(component) == "fusion"
            else SupervisedModel(component, api=self.api)
            for component in models_split]

        split_votes = []
        for model in models:
            try:
                prediction = model.predict_probability(
                    input_data,
                    missing_strategy=missing_strategy,
                    compact=True)
            except ValueError:
                # logistic regressions can raise this error if they
                # have missing_numerics=False and some numeric missings
                # are found
                continue

            if self.regression:
                # compact regression output is a one-element list
                prediction = prediction[0]
            if self.weights is not None:
                prediction = self.weigh(prediction, model.resource_id)
            # we need to check that all classes in the fusion
            # are also in the composing model
            if not self.regression and \
                    self.class_names != model.class_names:
                try:
                    prediction = rearrange_prediction(
                        model.class_names,
                        self.class_names,
                        prediction)
                except AttributeError:
                    # class_names should be defined, but just in case
                    pass
            split_votes.append(prediction)

        votes.extend(split_votes)

    if not self.regression:
        distribution = votes.combine_to_distribution(normalize=True)
        if compact:
            return distribution
        return [{'category': class_name, 'probability': probability}
                for class_name, probability
                in zip(self.class_names, distribution)]

    # Regression: average of the votes, using the sum of the weights
    # as denominator when weights are set
    if self.weights is None:
        total_weight = len(votes.predictions)
    else:
        total_weight = sum(self.weights)
    prediction = sum(votes.predictions) / float(total_weight)
    if compact:
        return [prediction]
    return {"prediction": prediction}