def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
            with_confidence=False, options=None):
    """Makes a prediction based on the prediction made by every model.

    The method parameter is a numeric key to the following combination
    methods in classifications/regressions:
        0 - majority vote (plurality) / average: PLURALITY_CODE
        1 - confidence weighted majority vote / error weighted:
            CONFIDENCE_CODE
        2 - probability weighted majority vote / average: PROBABILITY_CODE
        3 - threshold filtered vote / doesn't apply: THRESHOLD_CODE
    """
    if len(self.models_splits) > 1:
        # If there's more than one chunk of models, they must be
        # sequentially used to generate the votes for the prediction
        votes = MultiVote([])
        for models_split in self.models_splits:
            models = [retrieve_resource(self.api, model_id,
                                        query_string=ONLY_MODEL)
                      for model_id in models_split]
            multi_model = MultiModel(models, api=self.api)
            votes_split = multi_model.generate_votes(input_data,
                                                     by_name=by_name)
            votes.extend(votes_split.predictions)
    else:
        # When only one group of models is found, use the corresponding
        # multimodel to predict
        votes_split = self.multi_model.generate_votes(input_data,
                                                      by_name=by_name)
        votes = MultiVote(votes_split.predictions)
    return votes.combine(method=method, with_confidence=with_confidence,
                         options=options)
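
# A minimal usage sketch for the combiner codes listed above, assuming an
# existing ensemble id and valid BigML credentials; the id and the input
# values are hypothetical.
from bigml.ensemble import Ensemble
from bigml.multivote import PLURALITY_CODE, CONFIDENCE_CODE

local_ensemble = Ensemble("ensemble/526fc344035d071ea3031d70")
# 0: plurality vote (classification) / plain average (regression)
print(local_ensemble.predict({"petal length": 3}, method=PLURALITY_CODE))
# 1: confidence weighted vote; with_confidence adds the combined confidence
print(local_ensemble.predict({"petal length": 3}, method=CONFIDENCE_CODE,
                             with_confidence=True))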
def __init__(self, ensemble, api=None, max_models=None):
    if api is None:
        self.api = BigML(storage=STORAGE)
    else:
        self.api = api
    self.ensemble_id = None
    if isinstance(ensemble, list):
        try:
            models = [get_model_id(model) for model in ensemble]
        except ValueError:
            raise ValueError('Failed to verify the list of models. Check '
                             'your model id values.')
        self.distributions = None
    else:
        self.ensemble_id = get_ensemble_id(ensemble)
        ensemble = check_resource(ensemble, self.api.get_ensemble)
        models = ensemble['object']['models']
        self.distributions = ensemble['object'].get('distributions', None)
    self.model_ids = models
    self.fields = self.all_model_fields()
    number_of_models = len(models)
    if max_models is None:
        self.models_splits = [models]
    else:
        self.models_splits = [models[index:(index + max_models)]
                              for index in
                              range(0, number_of_models, max_models)]
    if len(self.models_splits) == 1:
        models = [retrieve_resource(self.api, model_id,
                                    query_string=ONLY_MODEL)
                  for model_id in self.models_splits[0]]
        self.multi_model = MultiModel(models, self.api)
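
# Worked example of the max_models chunking above: with five model ids and
# max_models=2 the comprehension splits them into slots of at most two
# models, so only one slot needs to be held in memory at a time. The ids
# are placeholders.
models = ['model/a', 'model/b', 'model/c', 'model/d', 'model/e']
max_models = 2
models_splits = [models[index:(index + max_models)]
                 for index in range(0, len(models), max_models)]
# -> [['model/a', 'model/b'], ['model/c', 'model/d'], ['model/e']]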
def _combine_distributions(self, input_data, missing_strategy,
                           method=PROBABILITY_CODE):
    """Computes the predicted distributions and combines them to give the
    final predicted distribution. Depending on the method parameter
    probability, votes or the confidence are used to weight the models.
    """
    if len(self.models_splits) > 1:
        # If there's more than one chunk of models, they must be
        # sequentially used to generate the votes for the prediction
        votes = MultiVoteList([])
        for models_split in self.models_splits:
            models = self._get_models(models_split)
            multi_model = MultiModel(models, api=self.api,
                                     fields=self.fields,
                                     class_names=self.class_names)
            votes_split = multi_model.generate_votes_distribution(
                input_data, missing_strategy=missing_strategy,
                method=method)
            votes.extend(votes_split)
    else:
        # When only one group of models is found, use the corresponding
        # multimodel to predict
        votes = self.multi_model.generate_votes_distribution(
            input_data, missing_strategy=missing_strategy, method=method)
    return votes.combine_to_distribution(normalize=False)
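
# A hedged sketch of the combining idea above: each model contributes a
# row of per-class values (probabilities, votes or confidences depending
# on the method) and the rows are added column-wise; with normalize=False
# the sum is left unnormalized. The class names and numbers are made up.
class_names = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
votes = [[0.7, 0.2, 0.1],   # model 1
         [0.6, 0.3, 0.1]]   # model 2
combined = [sum(row[i] for row in votes) for i in range(len(class_names))]
# -> [1.3, 0.5, 0.2]; dividing by len(votes) would normalize it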
def local_predict(models, test_reader, output, method, prediction_info=None):
    """Get local predictions and combine them to get a final prediction
    """
    local_model = MultiModel(models)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        prediction = local_model.predict(input_data,
                                         by_name=test_set_header,
                                         method=method,
                                         with_confidence=True)
        u.write_prediction(prediction, output, prediction_info)
def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
            with_confidence=False):
    """Makes a prediction based on the prediction made by every model.

    The method parameter is a numeric key to the following combination
    methods in classifications/regressions:
        0 - majority vote (plurality) / average: PLURALITY_CODE
        1 - confidence weighted majority vote / error weighted:
            CONFIDENCE_CODE
        2 - probability weighted majority vote / average: PROBABILITY_CODE
    """
    votes = MultiVote([])
    for models_split in self.models_splits:
        models = [retrieve_model(self.api, model_id)
                  for model_id in models_split]
        multi_model = MultiModel(models)
        votes_split = multi_model.generate_votes(input_data,
                                                 by_name=by_name)
        votes.extend(votes_split.predictions)
    return votes.combine(method=method, with_confidence=with_confidence)
def local_predict(models, test_reader, output, args, options=None,
                  exclude=None):
    """Get local predictions and combine them to get a final prediction
    """
    local_model = MultiModel(models)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        input_data_dict = test_reader.dict(input_data)
        prediction = local_model.predict(
            input_data_dict, by_name=test_set_header,
            method=args.method, with_confidence=True, options=options,
            missing_strategy=args.missing_strategy)
        write_prediction(prediction, output, args.prediction_info,
                         input_data, exclude)
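
# A minimal sketch of the MultiModel flow wrapped by local_predict above,
# assuming two finished model ids and valid credentials; the ids and the
# input values are hypothetical.
from bigml.api import BigML
from bigml.multimodel import MultiModel
from bigml.multivote import PLURALITY_CODE

api = BigML()
model_ids = ["model/5143a51a37203f2cf7000972",
             "model/5143a51a37203f2cf7000985"]
local_multimodel = MultiModel([api.get_model(mid) for mid in model_ids],
                              api=api)
print(local_multimodel.predict({"petal length": 3},
                               method=PLURALITY_CODE))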
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from partial MultiModels, combine and save to
    file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct), reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be
        # used in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split, api, query_string=query_string, labels=labels,
            multi_label_data=multi_label_data, ordered=ordered,
            models_order=models_order)
        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                votes = local_model.batch_predict(
                    raw_input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers,
                    to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries"
                         " needed to use proportional missing strategy"
                         " for regressions. Please install them manually.")
            # extending the votes for each input data with the new
            # model-slot predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes

    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)

    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]
        if single_model:
            # single model predictions need no combination
            prediction = [multivote.predictions[0]['prediction'],
                          multivote.predictions[0]['confidence']]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote, options, labels, models_per_label, ordered,
                models_order, label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote,
                                           other_label=other_label)
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)
        write_prediction(prediction, output, args.prediction_info,
                         input_data, exclude)
def i_create_a_local_multi_model(step):
    world.local_model = MultiModel(world.list_of_models)
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS, resume=False,
                        output_path=None, output=None, verbosity=True,
                        method=PLURALITY_CODE, session_file=None,
                        debug=False, prediction_info=None):
    """Get local predictions from partial MultiModels, combine and save to
    file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             FIELDS_QS)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            complete_models.append(model)
        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list, output_path,
                                  by_name=test_set_header, reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
def __init__(self, ensemble, api=None, max_models=None, cache_get=None):
    self.models_splits = []
    self.multi_model = None
    self.api = get_api_connection(api)
    self.fields = None
    self.class_names = None

    if use_cache(cache_get):
        # using a cache to store the model attributes
        self.__dict__ = load(get_ensemble_id(ensemble), cache_get)
        self.api = get_api_connection(api)
        if len(self.models_splits) == 1:
            # retrieve the models from a cache get function
            try:
                models = [Model(model_id, cache_get=cache_get)
                          for model_id in self.models_splits[0]]
            except Exception as exc:
                raise Exception('Error while calling the user-given'
                                ' function %s: %s' %
                                (cache_get.__name__, str(exc)))
            self.multi_model = MultiModel(models,
                                          self.api,
                                          fields=self.fields,
                                          class_names=self.class_names,
                                          cache_get=cache_get)
        return

    self.resource_id = None
    self.objective_id = None
    self.distributions = None
    self.distribution = None
    self.boosting = None
    self.boosting_offsets = None
    self.cache_get = None
    self.regression = False
    self.importance = {}
    query_string = ONLY_MODEL
    no_check_fields = False
    self.input_fields = []

    if isinstance(ensemble, list):
        if all([isinstance(model, Model) for model in ensemble]):
            models = ensemble
            self.model_ids = [local_model.resource_id
                              for local_model in models]
        else:
            try:
                models = [get_model_id(model) for model in ensemble]
                self.model_ids = models
            except ValueError as exc:
                raise ValueError('Failed to verify the list of models.'
                                 ' Check your model id values: %s' %
                                 str(exc))
    else:
        ensemble = self.get_ensemble_resource(ensemble)
        self.resource_id = get_ensemble_id(ensemble)
        if not check_local_but_fields(ensemble):
            # avoid checking fields because of old ensembles
            ensemble = retrieve_resource(self.api, self.resource_id,
                                         no_check_fields=True)
        if ensemble['object'].get('type') == BOOSTING:
            self.boosting = ensemble['object'].get('boosting')
        models = ensemble['object']['models']
        self.distributions = ensemble['object'].get('distributions', [])
        self.importance = ensemble['object'].get('importance', [])
        self.model_ids = models
        # new ensembles have the fields structure
        if ensemble['object'].get('ensemble'):
            self.fields = ensemble['object'].get(
                'ensemble', {}).get("fields")
            self.objective_id = ensemble['object'].get("objective_field")
            query_string = EXCLUDE_FIELDS
            no_check_fields = True
        self.input_fields = ensemble['object'].get('input_fields')

    number_of_models = len(models)
    if max_models is None:
        self.models_splits = [models]
    else:
        self.models_splits = [models[index:(index + max_models)]
                              for index in
                              range(0, number_of_models, max_models)]

    if len(self.models_splits) == 1:
        if not isinstance(models[0], Model):
            if use_cache(cache_get):
                # retrieve the models from a cache get function
                try:
                    models = [Model(model_id, cache_get=cache_get)
                              for model_id in self.models_splits[0]]
                    self.cache_get = cache_get
                except Exception as exc:
                    raise Exception('Error while calling the user-given'
                                    ' function %s: %s' %
                                    (cache_get.__name__, str(exc)))
            else:
                models = [retrieve_resource(
                    self.api, model_id, query_string=query_string,
                    no_check_fields=no_check_fields)
                    for model_id in self.models_splits[0]]
        model = models[0]
    else:
        # only retrieving the first model
        self.cache_get = cache_get
        if not isinstance(models[0], Model):
            if use_cache(cache_get):
                # retrieve the model from a cache get function
                try:
                    model = Model(self.models_splits[0][0],
                                  cache_get=cache_get)
                    self.cache_get = cache_get
                except Exception as exc:
                    raise Exception('Error while calling the user-given'
                                    ' function %s: %s' %
                                    (cache_get.__name__, str(exc)))
            else:
                model = retrieve_resource(
                    self.api, self.models_splits[0][0],
                    query_string=query_string,
                    no_check_fields=no_check_fields)
            models = [model]

    if self.distributions is None:
        try:
            self.distributions = []
            for model in models:
                self.distributions.append(
                    {'training': model.root_distribution})
        except AttributeError:
            self.distributions = [
                model['object']['model']['distribution']
                for model in models]

    if self.boosting is None:
        self._add_models_attrs(model, max_models)

    if self.fields is None:
        self.fields, self.objective_id = self.all_model_fields(
            max_models=max_models)

    if self.fields:
        add_distribution(self)
    self.regression = \
        self.fields[self.objective_id].get('optype') == NUMERIC
    if self.boosting:
        self.boosting_offsets = \
            ensemble['object'].get('initial_offset', 0) \
            if self.regression else \
            dict(ensemble['object'].get('initial_offsets', []))

    if not self.regression:
        try:
            objective_field = self.fields[self.objective_id]
            categories = objective_field['summary']['categories']
            classes = [category[0] for category in categories]
        except (AttributeError, KeyError):
            classes = set()
            for distribution in self.distributions:
                for category in distribution['training']['categories']:
                    classes.add(category[0])
        self.class_names = sorted(classes)
        self.objective_categories = [
            category for category, _ in
            self.fields[self.objective_id]["summary"]["categories"]]

    ModelFields.__init__(self, self.fields,
                         objective_id=self.objective_id)

    if len(self.models_splits) == 1:
        self.multi_model = MultiModel(models,
                                      self.api,
                                      fields=self.fields,
                                      class_names=self.class_names)
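
# A minimal sketch of the cache flow handled above: dump() stores the
# local ensemble's attributes through a cache_set callable and cache_get
# restores them, skipping API retrieval on the next build. Redis as the
# store and the ensemble id are assumptions; any get/set pair honoring the
# same contract should work.
import redis
from bigml.ensemble import Ensemble

r = redis.Redis()
ensemble = Ensemble("ensemble/5126965515526876630001b2")
ensemble.dump(cache_set=r.set)   # serialize the local ensemble
ensemble = Ensemble("ensemble/5126965515526876630001b2", cache_get=r.get)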
def predict(self, input_data, method=None, options=None,
            missing_strategy=LAST_PREDICTION, operating_point=None,
            operating_kind=None, median=False, full=False):
    """Makes a prediction based on the prediction made by every model.

    :param input_data: Test data to be used as input
    :param method: **deprecated**. Please check the `operating_kind`
                   attribute. Numeric key code for the following
                   combination methods in classifications/regressions:
          0 - majority vote (plurality) / average: PLURALITY_CODE
          1 - confidence weighted majority vote / error weighted:
              CONFIDENCE_CODE
          2 - probability weighted majority vote / average:
              PROBABILITY_CODE
          3 - threshold filtered vote / doesn't apply: THRESHOLD_CODE
    :param options: Options to be used in threshold filtered votes.
    :param missing_strategy: numeric key for the individual model's
                             prediction method. See the model predict
                             method.
    :param operating_point: In classification models, this is the point
                            of the ROC curve where the model will be used
                            at. The operating point can be defined in
                            terms of:
                            - the positive_class, the class that is
                              important to predict accurately
                            - its kind: probability, confidence or voting
                            - its threshold: the minimum established for
                              the positive_class to be predicted.
                            The operating_point is then defined as a map
                            with three attributes, e.g.:
                               {"positive_class": "Iris-setosa",
                                "kind": "probability",
                                "threshold": 0.5}
    :param operating_kind: "probability", "confidence" or "votes". Sets
                           the property that decides the prediction.
                           Used only if no operating_point is used.
    :param median: Uses the median of each individual model's predicted
                   node as individual prediction for the specified
                   combination method.
    :param full: Boolean that controls whether to include the prediction's
                 attributes. By default, only the prediction is produced.
                 If set to True, the rest of available information is
                 added in a dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - confidence: prediction's confidence
                  - probability: prediction's probability
                  - path: rules that lead to the prediction
                  - count: number of training instances supporting the
                    prediction
                  - next: field to check in the next split
                  - min: minimum value of the training instances in the
                    predicted node
                  - max: maximum value of the training instances in the
                    predicted node
                  - median: median of the values of the training instances
                    in the predicted node
                  - unused_fields: list of fields in the input data that
                    are not being used in the model
    """
    # Checks and cleans input_data leaving the fields used in the model
    new_data = self.filter_input_data(input_data, add_unused_fields=full)
    unused_fields = None
    if full:
        input_data, unused_fields = new_data
    else:
        input_data = new_data

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    if median and method is None:
        # predictions with median are only available with old combiners
        method = PLURALITY_CODE

    if method is None and operating_point is None and \
            operating_kind is None and not median:
        # operating_point has precedence over operating_kind. If no
        # combiner is set, the default operating kind is "probability"
        operating_kind = "probability"

    if operating_point:
        if self.regression:
            raise ValueError("The operating_point argument can only be"
                             " used in classifications.")
        prediction = self.predict_operating(
            input_data, missing_strategy=missing_strategy,
            operating_point=operating_point)
        if full:
            return prediction
        return prediction["prediction"]

    if operating_kind:
        if self.regression:
            # for regressions, operating_kind defaults to the old
            # combiners
            method = 1 if operating_kind == "confidence" else 0
            return self.predict(
                input_data, method=method, options=options,
                missing_strategy=missing_strategy, operating_point=None,
                operating_kind=None, full=full)
        prediction = self.predict_operating_kind(
            input_data, missing_strategy=missing_strategy,
            operating_kind=operating_kind)
        return prediction

    if len(self.models_splits) > 1:
        # If there's more than one chunk of models, they must be
        # sequentially used to generate the votes for the prediction
        votes = MultiVote([], boosting_offsets=self.boosting_offsets)
        for models_split in self.models_splits:
            models = self._get_models(models_split)
            multi_model = MultiModel(models, api=self.api,
                                     fields=self.fields)
            votes_split = multi_model._generate_votes(
                input_data, missing_strategy=missing_strategy,
                unused_fields=unused_fields)
            if median:
                for prediction in votes_split.predictions:
                    prediction['prediction'] = prediction['median']
            votes.extend(votes_split.predictions)
    else:
        # When only one group of models is found, use the corresponding
        # multimodel to predict
        votes_split = self.multi_model._generate_votes(
            input_data, missing_strategy=missing_strategy,
            unused_fields=unused_fields)
        votes = MultiVote(votes_split.predictions,
                          boosting_offsets=self.boosting_offsets)
        if median:
            for prediction in votes.predictions:
                prediction['prediction'] = prediction['median']

    if self.boosting is not None and not self.regression:
        categories = [
            d[0] for d in
            self.fields[self.objective_id]["summary"]["categories"]]
        options = {"categories": categories}
    result = votes.combine(method=method, options=options, full=full)

    if full:
        unused_fields = set(input_data.keys())
        for prediction in votes.predictions:
            unused_fields = unused_fields.intersection(
                set(prediction.get("unused_fields", [])))
        if not isinstance(result, dict):
            result = {"prediction": result}
        result['unused_fields'] = list(unused_fields)

    return result
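
# A minimal sketch of the operating_point path above, assuming a local
# classification Ensemble named local_ensemble already exists; the class
# name, threshold and input values are hypothetical.
operating_point = {"positive_class": "Iris-setosa",
                   "kind": "probability",
                   "threshold": 0.8}
prediction = local_ensemble.predict(
    {"petal length": 3, "petal width": 1},
    operating_point=operating_point, full=True)
# With full=True the result is a dict that includes the prediction along
# with attributes such as its probability.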
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from partial MultiModels, combine and save to
    file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    max_models = args.max_batch_models
    label_separator = args.label_separator
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When the user selects the labels in multi-label predictions,
            # we must filter the models that will be used to predict
            if labels:
                objective_column = str(
                    multi_label_data['objective_column'])
                labels_info = multi_label_data['generated_fields'][
                    objective_column]
                labels_columns = [label_info[1] for label_info in
                                  labels_info if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if model_column in labels_columns:
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        if complete_models:
            local_model = MultiModel(complete_models)
            try:
                local_model.batch_predict(
                    input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries"
                         " needed to use proportional missing strategy"
                         " for regressions. Please install them manually.")
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes