def batch_votes(self, predictions_file_path, data_locale=None):
    """Adds the votes for predictions generated by the models.

       Returns a list of prediction groups. A prediction group is a dict
       whose key is the prediction and whose value is the list of
       confidences with which the prediction has been issued.
    """
    predictions_files = []
    for model in self.models:
        predictions_files.append((model, csv.reader(
            open(get_predictions_file_name(model.resource_id,
                                           predictions_file_path), "U"),
            lineterminator="\n")))
    votes = []
    predictions = {}
    prediction = True
    while prediction:
        if predictions:
            votes.append(predictions)
            predictions = {}
        for (model, handler) in predictions_files:
            try:
                row = handler.next()
                prediction = row[0]
                confidence = float(row[1])
            except StopIteration:
                prediction = False
                break
            prediction = model.to_prediction(prediction, data_locale)
            if prediction not in predictions:
                predictions[prediction] = []
            predictions[prediction].append(confidence)
    return votes
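# A toy sketch (all values made up) of the structure this batch_votes
# returns: one dict per input row, mapping each distinct prediction to the
# list of confidences the models issued for it. A winner per row can then
# be picked by summed confidence (confidence-weighted plurality).
votes = [
    {"Iris-setosa": [0.91, 0.87], "Iris-versicolor": [0.62]},
    {"Iris-virginica": [0.78, 0.80, 0.75]},
]
for group in votes:
    winner = max(group, key=lambda prediction: sum(group[prediction]))
    print winner, sum(group[winner]) / len(group[winner])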
def batch_predict(self, input_data_list, output_file_path,
                  by_name=True, reuse=False):
    """Makes predictions for a list of input data.

       The predictions generated for each model are stored in an output
       file. The name of the file will use the following syntax:
           model_[id of the model]__predictions.csv
       For instance, when using model/50c0de043b563519830001c2 to predict,
       the output file name will be
           model_50c0de043b563519830001c2__predictions.csv
    """
    for model in self.models:
        output_file = get_predictions_file_name(model.resource_id,
                                                output_file_path)
        if reuse:
            try:
                predictions_file = open(output_file)
                predictions_file.close()
                continue
            except IOError:
                pass
        try:
            predictions_file = csv.writer(open(output_file, 'w', 0),
                                          lineterminator="\n")
        except IOError:
            raise Exception("Cannot find %s directory." % output_file_path)
        for input_data in input_data_list:
            prediction = model.predict(input_data, by_name=by_name,
                                       with_confidence=True)
            if isinstance(prediction[0], basestring):
                prediction[0] = prediction[0].encode("utf-8")
            predictions_file.writerow(prediction)
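# A hypothetical usage sketch of the two methods above; the model ids,
# output directory and input data are placeholders, and BigML() assumes
# credentials are available in the environment.
from bigml.api import BigML
from bigml.multimodel import MultiModel

api = BigML()
multi_model = MultiModel([api.get_model("model/50c0de043b563519830001c2"),
                          api.get_model("model/50c0de043b563519830001c3")])
# one model_[id]__predictions.csv file per model is written under
# ./predictions, then the vote groups are read back from those files
multi_model.batch_predict([{"petal length": 4.2, "sepal width": 3.1}],
                          "./predictions")
votes = multi_model.batch_votes("./predictions")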
def remote_predict_models(models, test_reader, prediction_file, api, args,
                          resume=False, output_path=None, session_file=None,
                          log=None, exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to
       file
    """
    predictions_files = []
    prediction_args = {
        "tags": args.tag
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(
                        model, input_data_dict, by_name=test_set_header,
                        wait_time=0, args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2],
                                         prediction_file,
                                         args.prediction_info, input_data,
                                         exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files, Model(models[0]).to_prediction,
                      prediction_file, args.method, args.prediction_info,
                      raw_input_data_list, exclude)
def local_batch_predict(models, headers, test_reader, exclude, fields, resume,
                        output_path, max_models, number_of_tests, api, output,
                        verbosity, method, objective_field, session_file,
                        debug):
    """Get local predictions from a partial MultiModel, combine and save to
       file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    input_data_list = []
    for row in test_reader:
        for index in exclude:
            del row[index]
        input_data_list.append(fields.pair(row, headers, objective_field))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                u.checkpoint(u.are_predictions_created, pred_file,
                             number_of_tests, debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            complete_models.append(api.check_resource(
                models_split[index], api.get_model))
        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list, output_path, reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
    message = u.dated("Combining predictions.\n")
    u.log_message(message, log_file=session_file, console=verbosity)
    for multivote in total_votes:
        u.write_prediction(multivote.combine(method), output)
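# A quick illustration of the models_splits slicing used above: the model
# list is chunked into groups of at most max_models, so each MultiModel
# slot stays within memory bounds (toy values).
models = ["m1", "m2", "m3", "m4", "m5"]
max_models = 2
models_splits = [models[index:(index + max_models)]
                 for index in range(0, len(models), max_models)]
assert models_splits == [["m1", "m2"], ["m3", "m4"], ["m5"]]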
def batch_votes(self, predictions_file_path, data_locale=None):
    """Adds the votes for predictions generated by the models.

       Returns a list of MultiVote objects, each of which contains a list
       of predictions.
    """
    votes_files = []
    for model in self.models:
        votes_files.append(get_predictions_file_name(model.resource_id,
                                                     predictions_file_path))
    return read_votes(votes_files, self.models[0].to_prediction,
                      data_locale=data_locale)
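# Each MultiVote returned above can be reduced to a single prediction; a
# minimal sketch, assuming bigml.multivote's combine method and that the
# predictions files written by batch_predict already live under
# ./predictions (paths and setup are placeholders).
from bigml.api import BigML
from bigml.multimodel import MultiModel
from bigml.multivote import PLURALITY_CODE

api = BigML()
multi_model = MultiModel([api.get_model("model/50c0de043b563519830001c2")])
for multivote in multi_model.batch_votes("./predictions"):
    print multivote.combine(method=PLURALITY_CODE, with_confidence=True)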
def remote_predict(models, test_reader, prediction_file, api, resume=False,
                   verbosity=True, output_path=None, method=PLURALITY_CODE,
                   tags="", session_file=None, log=None, debug=False,
                   prediction_info=None):
    """Retrieve predictions remotely, combine them and save predictions to
       file
    """
    predictions_files = []
    prediction_args = {
        "tags": tags
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False
    raw_input_data_list = []
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(), debug=debug)):
            if not message_logged:
                message = u.dated("Creating remote predictions.")
                u.log_message(message, log_file=session_file,
                              console=verbosity)
                message_logged = True
            predictions_file = csv.writer(open(predictions_file, 'w', 0),
                                          lineterminator="\n")
            for input_data in test_reader:
                raw_input_data_list.append(input_data)
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(model, input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction)
                predictions_file.writerow(prediction_row)
    combine_votes(predictions_files, Model(models[0]).to_prediction,
                  prediction_file, method, prediction_info,
                  raw_input_data_list)
def remote_predict(models, headers, output_path, number_of_tests, resume,
                   verbosity, test_reader, exclude, fields, api,
                   prediction_file, method, tags, objective_field,
                   session_file, test_set_header, log, debug):
    """Retrieve predictions remotely, combine them and save predictions to
       file
    """
    predictions_files = []
    prediction_args = {
        "tags": tags
    }
    for model in models:
        if not isinstance(model, basestring) and 'resource' in model:
            model = model['resource']
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not u.checkpoint(u.are_predictions_created, predictions_file,
                                 number_of_tests, debug=debug)):
            message = u.dated("Creating remote predictions.\n")
            u.log_message(message, log_file=session_file, console=verbosity)
            predictions_file = csv.writer(open(predictions_file, 'w', 0))
            for row in test_reader:
                for index in exclude:
                    del row[index]
                input_data = fields.pair(row, headers, objective_field)
                prediction = api.create_prediction(model, input_data,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = u.prediction_to_row(prediction)
                predictions_file.writerow(prediction_row)
    u.combine_votes(predictions_files, Model(models[0]).to_prediction,
                    prediction_file, method)
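# A minimal remote-prediction sketch with the BigML API client, mirroring
# the create_prediction call above; the model id and input data are
# placeholders, and BigML() assumes credentials in the environment.
from bigml.api import BigML

api = BigML()
prediction = api.create_prediction("model/50c0de043b563519830001c2",
                                   {"petal length": 4.2},
                                   args={"tags": ["bigmler"]})
print prediction['resource']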
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS, resume=False,
                        output_path=None, output=None, verbosity=True,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, debug=False,
                        prediction_info=NORMAL_FORMAT, labels=None,
                        label_separator=None, ordered=True, exclude=None,
                        models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from a partial MultiModel, combine and save to
       file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When the user selects labels in multi-label predictions, we
            # must filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data[
                    'generated_fields'][objective_column]
                labels_columns = [label_info[1] for label_info in labels_info
                                  if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if model_column in labels_columns:
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)
        if complete_models:
            local_model = MultiModel(complete_models)
            local_model.batch_predict(input_data_list, output_path,
                                      by_name=test_set_header, reuse=True)
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
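# A toy sketch of the multi_label_data shapes assumed by the filtering
# above (all values made up): the objective column maps to generated
# per-label fields as (label, column) pairs, and only models whose
# objective field sits in one of the selected columns are kept.
multi_label_data = {
    "objective_column": 3,
    "generated_fields": {"3": [["high", 4], ["medium", 5], ["low", 6]]}}
labels = ["high", "low"]
labels_info = multi_label_data["generated_fields"]["3"]
labels_columns = [info[1] for info in labels_info if info[0] in labels]
assert labels_columns == [4, 6]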
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from a partial MultiModel, combine and save to
       file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct), reset=True)
    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be used
        # in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split, api, query_string=query_string, labels=labels,
            multi_label_data=multi_label_data, ordered=ordered,
            models_order=models_order)
        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                votes = local_model.batch_predict(
                    raw_input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers,
                    to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries"
                         " needed to use proportional missing strategy for"
                         " regressions. Please, install them manually.")
            # extending the votes for each input data with the new model-slot
            # predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]
        if single_model:
            # single model predictions need no combination
            prediction = [multivote.predictions[0]['prediction'],
                          multivote.predictions[0]['confidence']]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote, options, labels, models_per_label, ordered,
                models_order, label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote,
                                           other_label=other_label)
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)
        write_prediction(prediction, output, args.prediction_info,
                         input_data, exclude)
def batch_predict(self, input_data_list, output_file_path=None,
                  by_name=True, reuse=False,
                  missing_strategy=LAST_PREDICTION, headers=None,
                  to_file=True):
    """Makes predictions for a list of input data.

       When the to_file argument is set to True, the predictions
       generated for each model are stored in an output file. The name of
       the file will use the following syntax:
           model_[id of the model]__predictions.csv
       For instance, when using model/50c0de043b563519830001c2 to predict,
       the output file name will be
           model_50c0de043b563519830001c2__predictions.csv
       On the contrary, if it is False, the function returns a list of
       MultiVote objects with the model's predictions.
    """
    add_headers = (isinstance(input_data_list[0], list) and
                   headers is not None and
                   len(headers) == len(input_data_list[0]))
    if not add_headers and not isinstance(input_data_list[0], dict):
        raise ValueError("Input data list is not a dictionary or the"
                         " headers and input data information are not"
                         " consistent.")
    order = 0
    if not to_file:
        votes = []
    for model in self.models:
        order += 1
        if to_file:
            output_file = get_predictions_file_name(model.resource_id,
                                                    output_file_path)
            if reuse:
                try:
                    predictions_file = open(output_file)
                    predictions_file.close()
                    continue
                except IOError:
                    pass
            try:
                predictions_file = csv.writer(open(output_file, 'w', 0),
                                              lineterminator="\n")
            except IOError:
                raise Exception("Cannot find %s directory." %
                                output_file_path)
        for index, input_data in enumerate(input_data_list):
            if add_headers:
                input_data = dict(zip(headers, input_data))
            prediction = model.predict(input_data, by_name=by_name,
                                       with_confidence=True,
                                       missing_strategy=missing_strategy)
            if to_file:
                if isinstance(prediction[0], basestring):
                    prediction[0] = prediction[0].encode("utf-8")
                predictions_file.writerow(prediction)
            else:
                prediction, confidence, distribution, instances = prediction
                prediction_row = [prediction, confidence, order,
                                  distribution, instances]
                if len(votes) <= index:
                    votes.append(MultiVote([]))
                votes[index].append_row(prediction_row)
    if not to_file:
        return votes
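# A hypothetical call of the in-memory path above (to_file=False): no CSV
# files are written; one MultiVote per input row comes back, each holding
# [prediction, confidence, model order, distribution, instances] rows.
# Model id and inputs are placeholders.
from bigml.api import BigML
from bigml.multimodel import MultiModel

api = BigML()
multi_model = MultiModel([api.get_model("model/50c0de043b563519830001c2")])
votes = multi_model.batch_predict([{"petal length": 4.2},
                                   {"petal length": 1.3}],
                                  to_file=False)
for multivote in votes:
    print multivote.predictions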
def batch_predict(self, input_data_list, output_file_path=None,
                  by_name=True, reuse=False,
                  missing_strategy=LAST_PREDICTION, headers=None,
                  to_file=True, use_median=False):
    """Makes predictions for a list of input data.

       When the to_file argument is set to True, the predictions
       generated for each model are stored in an output file. The name of
       the file will use the following syntax:
           model_[id of the model]__predictions.csv
       For instance, when using model/50c0de043b563519830001c2 to predict,
       the output file name will be
           model_50c0de043b563519830001c2__predictions.csv
       On the contrary, if it is False, the function returns a list of
       MultiVote objects with the model's predictions.
    """
    add_headers = (isinstance(input_data_list[0], list) and
                   headers is not None and
                   len(headers) == len(input_data_list[0]))
    if not add_headers and not isinstance(input_data_list[0], dict):
        raise ValueError("Input data list is not a dictionary or the"
                         " headers and input data information are not"
                         " consistent.")
    order = 0
    if not to_file:
        votes = []
    for model in self.models:
        order += 1
        out = None
        if to_file:
            output_file = get_predictions_file_name(model.resource_id,
                                                    output_file_path)
            if reuse:
                try:
                    predictions_file = open(output_file)
                    predictions_file.close()
                    continue
                except IOError:
                    pass
            try:
                out = UnicodeWriter(output_file)
            except IOError:
                raise Exception("Cannot find %s directory." %
                                output_file_path)
        if out:
            out.open_writer()
        for index, input_data in enumerate(input_data_list):
            if add_headers:
                input_data = dict(zip(headers, input_data))
            prediction = model.predict(input_data, by_name=by_name,
                                       with_confidence=True,
                                       missing_strategy=missing_strategy)
            if use_median and model.tree.regression:
                # if median is to be used, we just place it as prediction
                # starting the list
                prediction[0] = prediction[-1]
                prediction = prediction[:-1]
            if to_file:
                out.writerow(prediction)
            else:
                # prediction is a row that contains prediction, confidence,
                # distribution, instances
                prediction_row = prediction[0:2]
                prediction_row.append(order)
                prediction_row.extend(prediction[2:])
                if len(votes) <= index:
                    votes.append(MultiVote([]))
                votes[index].append_row(prediction_row)
        if out:
            out.close_writer()
    if not to_file:
        return votes
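# A sketch of the use_median flag above for regression models: the median
# of the training instances at the reached leaf is moved to the front of
# the prediction row, replacing the mean as the issued prediction. Model
# id and the numeric input are placeholders.
from bigml.api import BigML
from bigml.multimodel import MultiModel

api = BigML()
multi_model = MultiModel([api.get_model("model/50c0de043b563519830001c2")])
votes = multi_model.batch_predict([{"000001": 3.5}],
                                  to_file=False, use_median=True)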
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from a partial MultiModel, combine and save to
       file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))
    max_models = args.max_batch_models
    label_separator = args.label_separator
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=args.debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When the user selects labels in multi-label predictions, we
            # must filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data['generated_fields'][
                    objective_column]
                labels_columns = [label_info[1] for label_info in labels_info
                                  if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if model_column in labels_columns:
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)
        if complete_models:
            local_model = MultiModel(complete_models)
            try:
                local_model.batch_predict(
                    input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries"
                         " needed to use proportional missing strategy for"
                         " regressions. Please, install them manually.")
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS, resume=False,
                        output_path=None, output=None, verbosity=True,
                        method=PLURALITY_CODE, session_file=None,
                        debug=False, prediction_info=None):
    """Get local predictions from a partial MultiModel, combine and save to
       file
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, models_total, max_models)]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model, FIELDS_QS)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            complete_models.append(model)
        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list, output_path,
                                  by_name=test_set_header, reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes