def __init__(self, models, api=None):
    """Wrap the given model reference(s) in local `Model` objects.

    `models` is either a single model reference or a list of them. Each
    reference is passed to `Model` together with `api`, and the resulting
    objects are stored, in order, in `self.models`.
    """
    self.models = []
    # Normalize the single-reference case so both cases share one code path.
    references = models if isinstance(models, list) else [models]
    self.models.extend(Model(reference, api=api) for reference in references)
def __init__(self, data, epsilon=0.01, rounding=None, black_box=False):
    """Load the model registered under `data` and set up the extractor.

    With `black_box` set, the model id is looked up in `black_box_models`
    and an extra `BigML()` connection is kept in `self.connection`.
    Otherwise the id comes from `models` and the leaves of the model tree
    are stored in `self.leaves`. In both cases a `ValueError` raised for
    the 'model/<id>' reference triggers a second attempt with
    'public/model/<id>'. `epsilon` and `rounding` are forwarded to
    `TreeExtractor.__init__`.
    """
    self.black_box = black_box
    if self.black_box:
        logging.info('Extracting a Black Box Model')
        self.model_id = black_box_models[data]
        # get the black-box model with the real credentials for sanity
        # checks
        try:
            self.model = Model(
                'model/{}'.format(self.model_id),
                api=BigML(username='******', api_key=BB_KEY))
        except ValueError:
            self.model = Model(
                'public/model/{}'.format(self.model_id),
                api=BigML(storage=STORAGE))
        self.connection = BigML()
    else:
        model_id = models[data]
        # retrieve a model from local storage or from bigml.io
        # (only works for public models)
        try:
            self.model = Model(
                'model/{}'.format(model_id),
                api=BigML(storage=STORAGE))
        except ValueError:
            self.model = Model(
                'public/model/{}'.format(model_id),
                api=BigML(storage=STORAGE))
        self.leaves = self.model.tree.get_leaves()

    TreeExtractor.__init__(self, epsilon, rounding)
def __init__(self, models):
    """Wrap the given model reference(s) in local `Model` objects.

    `models` is either a single model reference or a list of them; the
    resulting `Model` objects are stored, in order, in `self.models`.
    """
    self.models = []
    # Normalize the single-reference case so both cases share one code path.
    references = models if isinstance(models, list) else [models]
    self.models.extend(Model(reference) for reference in references)
def __init__(self, models, api=None, fields=None, class_names=None):
    """Store the given models as local `Model` objects.

    `models` may be a single model reference, a list of references, or a
    list made only of `Model` instances. A list of `Model` instances is
    stored as-is (the same list object); anything else is wrapped with
    `Model(..., api=api, fields=fields)`. `class_names` is kept untouched
    in `self.class_names`.
    """
    self.models = []
    self.class_names = class_names

    if not isinstance(models, list):
        self.models.append(Model(models, api=api, fields=fields))
        return

    if all(isinstance(candidate, Model) for candidate in models):
        # Already local models: reuse the caller's list directly.
        self.models = models
        return

    self.models.extend(
        Model(candidate, api=api, fields=fields) for candidate in models)
def train_model(self, inputs, outputs, train):
    """Train a BigML model on the selected rows and wrap it locally.

    The rows `inputs[train]` / `outputs[train]` are dumped to
    './data_train.csv' (features first, target value(s) last) and that
    file is uploaded to create a source, a dataset and a model.

    Returns the list `[source, dataset, model, local_model]`, where
    `local_model` is the local `Model` built from the remote model.
    """
    training_file = "./data_train.csv"

    # Create a file with the training data. The context manager closes
    # the file even if a row fails to serialize.
    with open(training_file, "w") as handler:
        for x0, y0 in zip(inputs[train], outputs[train]):
            # Each value is converted to text on its own: joining a numeric
            # array raises TypeError, and np.insert would cast the target
            # to the features' dtype (truncating floats or long strings).
            values = list(np.ravel(x0)) + list(np.ravel(y0))
            handler.write(",".join(str(value) for value in values) + "\n")

    # Use the training file created previously to train a BigML model
    source_args = {
        'term_analysis': {"enabled": False},
        'source_parser': {"locale": "en-US"},
    }
    source = check_resource(
        self.api.create_source(training_file, source_args),
        self.api.get_source)
    dataset = check_resource(self.api.create_dataset(source),
                             self.api.get_dataset)
    model = check_resource(self.api.create_model(dataset),
                           self.api.get_model)
    local_model = Model(model)
    return [source, dataset, model, local_model]
def local_predict(models, test_reader, output, args, options=None,
                  exclude=None):
    """Predict every test row locally and write the predictions.

    One model is used through a local `Model`; several models are combined
    through a local `Ensemble`, which additionally receives the
    combination `method`, `options` and `median` settings. Each row read
    from `test_reader` is turned into a dict keyed by
    `test_reader.raw_headers`, predicted, and handed to
    `write_prediction`.
    """
    single_model = len(models) == 1
    kwargs = {"full": True, "missing_strategy": args.missing_strategy}

    if single_model:
        local_model = Model(models[0], api=args.retrieve_api_)
    else:
        local_model = Ensemble(models,
                               max_models=args.max_batch_models,
                               api=args.retrieve_api_)
        kwargs["method"] = args.method
        kwargs["options"] = options
        kwargs["median"] = args.median

    if args.operating_point_:
        kwargs["operating_point"] = args.operating_point_

    for input_data in test_reader:
        row = dict(zip(test_reader.raw_headers, input_data))
        prediction = local_model.predict(row, **kwargs)
        if single_model and args.median and local_model.tree.regression:
            # only single models' predictions can be based on the median
            # value predict
            prediction["prediction"] = prediction["median"]
        write_prediction(prediction, output, args.prediction_info,
                         input_data, exclude)
def local_predict(models, test_reader, output, args, options=None,
                  exclude=None):
    """Predict every test row locally and write the predictions.

    One model is used through a local `Model`; several models are combined
    through a local `Ensemble`, which additionally receives the
    combination `method`, `options` and `median` settings. Only the first
    two elements of each prediction are passed to `write_prediction`.
    """
    single_model = len(models) == 1
    test_set_header = test_reader.has_headers()
    kwargs = {
        "by_name": test_set_header,
        "with_confidence": True,
        "missing_strategy": args.missing_strategy,
    }

    if single_model:
        local_model = Model(models[0])
    else:
        local_model = Ensemble(models, max_models=args.max_batch_models)
        kwargs["method"] = args.method
        kwargs["options"] = options
        kwargs["median"] = args.median

    for input_data in test_reader:
        row = dict(zip(test_reader.raw_headers, input_data))
        prediction = local_model.predict(row, **kwargs)
        if single_model and args.median and local_model.tree.regression:
            # only single models' predictions can be based on the median
            # value predict: the last element replaces the first one
            prediction[0] = prediction[-1]
        write_prediction(prediction[0:2], output, args.prediction_info,
                         input_data, exclude)
def remote_predict(models, test_reader, prediction_file, api, resume=False,
                   verbosity=True, output_path=None, method=PLURALITY_CODE,
                   tags="", session_file=None, log=None, debug=False,
                   prediction_info=None):
    """Retrieve predictions remotely, combine them and save predictions to
       file

    For every model a per-model predictions file is written under
    `output_path` (unless `resume` is set and the checkpoint reports the
    file as already complete). The per-model files are then merged into
    `prediction_file` by `combine_votes` using `method`.
    """
    predictions_files = []
    prediction_args = {"tags": tags}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    # Read the test rows once, before looping over the models. Collecting
    # them inside the per-model loop duplicated every row once per model
    # and left the list empty when all predictions were resumed.
    raw_input_data_list = [input_data for input_data in test_reader]

    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(), debug=debug)):
            if not message_logged:
                message = u.dated("Creating remote predictions.")
                u.log_message(message, log_file=session_file,
                              console=verbosity)
                message_logged = True
            # The context manager guarantees the votes file is closed; the
            # explicit flush keeps each row on disk as soon as it is
            # created, so an interrupted run can still be resumed.
            with open(predictions_file, 'w') as handler:
                writer = csv.writer(handler, lineterminator="\n")
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(
                        model, input_data_dict, by_name=test_set_header,
                        wait_time=0, args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    writer.writerow(prediction_to_row(prediction))
                    handler.flush()

    combine_votes(predictions_files, Model(models[0]).to_prediction,
                  prediction_file, method, prediction_info,
                  raw_input_data_list)
def all_model_fields(self):
    """Retrieves the fields used as predictors in all the ensemble models

    Builds a local `Model` for every id in `self.model_ids` and merges
    their `fields` dictionaries; later models override earlier ones on
    repeated keys.
    """
    merged_fields = {}
    for model_id in self.model_ids:
        merged_fields.update(Model(model_id, self.api).fields)
    return merged_fields
def generate_models(self, directory=STORAGE):
    """Generates the functions for the models in the ensemble

    When `directory` does not exist it is created together with an empty
    `__init__.py`. One `<model id with '/' replaced by '_'>.py` file is
    then written per model in `self.model_ids`.
    """
    already_there = os.path.isfile(directory) or os.path.exists(directory)
    if not already_there:
        os.makedirs(directory)
        open(os.path.join(directory, "__init__.py"), "w").close()

    for model_id in self.model_ids:
        local_model = Model(model_id, api=self.api, fields=self.fields)
        flat_tree = FlatTree(local_model.tree,
                             local_model.offsets,
                             local_model.fields,
                             local_model.objective_id,
                             local_model.boosting)
        module_path = os.path.join(
            directory, "%s.py" % model_id.replace("/", "_"))
        with open(module_path, "w") as handler:
            flat_tree.python(out=handler, docstring="Model %s" % model_id)
def all_model_fields(self, max_models=None):
    """Retrieves the fields used as predictors in all the ensemble models

    Returns a `(fields, objective_id)` tuple. `fields` merges the fields
    of every model; `objective_id` is the models' objective field id, or
    None when a model's objective id differs from the one seen so far.
    Models are taken from `self.models_splits` when they are already
    `Model` objects, otherwise they are built from `self.model_ids`
    (through `self.cache_get` when it is set).
    """
    if isinstance(self.models_splits[0][0], Model):
        candidates = []
        for split in self.models_splits:
            candidates.extend(split)
    else:
        candidates = self.model_ids

    merged_fields = {}
    objective_id = None
    objective_mismatch = False

    for position, candidate in enumerate(candidates):
        if isinstance(candidate, Model):
            local_model = candidate
        elif self.cache_get is not None:
            local_model = self.cache_get(candidate)
        else:
            local_model = Model(candidate, self.api)

        # Collect garbage each time a batch of `max_models` has been seen.
        if (max_models is not None and position > 0 and
                position % max_models == 0):
            gc.collect()

        merged_fields.update(local_model.fields)

        if (objective_id is not None and
                objective_id != local_model.objective_id):
            # the models' objective field have different ids, no global id
            objective_mismatch = True
        else:
            objective_id = local_model.objective_id

    gc.collect()
    return merged_fields, (None if objective_mismatch else objective_id)
def test_local_model(model_name):
    """Predict every row of the model's test CSV with a local model.

    The first file matching `model_*` under MODEL_STORAGE/<model_name> is
    loaded as a local `Model`. Each row of
    DATASET_STORAGE/<model_name>/<model_name>_test.csv is predicted,
    printed, and appended once to
    PREDICT_STORAGE/<model_name>/local_model_result/PREDICT.txt.

    Raises IndexError when no model file is found.
    """
    # Create local_model object
    print("Creating local model from file .... ")
    model_files = glob.glob(
        os.path.join(MODEL_STORAGE, model_name, "model_*"))
    if not model_files:
        # Same exception type as indexing the empty list, with a message
        # that says what is missing.
        raise IndexError(
            "No model file matching 'model_*' found for %s" % model_name)
    local_model = Model(model_files[0])

    predict_storage = os.path.join(PREDICT_STORAGE, model_name)
    if not os.path.exists(predict_storage):
        print("Creating predict directory .... ")
        os.makedirs(predict_storage)
    predict_storage_local = os.path.join(predict_storage,
                                         "local_model_result")
    if not os.path.exists(predict_storage_local):
        print("Creating predict directory .... ")
        os.makedirs(predict_storage_local)

    print("Start predicting .... ")
    print(" Opening testing data")
    test_data_path = os.path.join(
        DATASET_STORAGE, model_name, model_name) + "_test.csv"
    result_path = os.path.join(predict_storage_local, "PREDICT.txt")
    separator = "================================="

    with open(test_data_path, 'r') as test_handler, \
            open(result_path, 'w') as fh:
        reader = csv.DictReader(test_handler)
        for counter, input_data in enumerate(reader, 1):
            print(separator)
            print("===== Prediction ", counter, " ========")
            print(separator)
            print("Input testing data : ", input_data)
            predict_result = local_model.predict(input_data)
            print(">> Prediction : ", predict_result, "\n")

            # Only this row's report is written. The report used to be
            # accumulated across rows and rewritten in full on every
            # iteration, repeating all earlier predictions in the file.
            fh.write("".join([
                separator + "\n",
                "===== Prediction " + str(counter) + " ========\n",
                separator + "\n",
                "Input testing data : " + str(input_data) + "\n",
                ">> Prediction : " + str(predict_result) + "\n\n",
            ]))
def create_local_supervised_ensemble(step):
    # Builds a local SupervisedModel from world.ensemble_id and a local
    # Model from the first of its model ids; both are stored on `world`.
    supervised = SupervisedModel(world.ensemble_id, world.api)
    world.local_ensemble = supervised
    first_model_id = world.local_ensemble.model_ids[0]
    world.local_model = Model(first_model_id, world.api)
def create_local_ensemble(step):
    # Builds a local Ensemble from world.ensemble_id and a local Model
    # from the first of its model ids; both are stored on `world`.
    ensemble = Ensemble(world.ensemble_id, world.api)
    world.local_ensemble = ensemble
    first_model_id = world.local_ensemble.model_ids[0]
    world.local_model = Model(first_model_id, world.api)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`.

    Stages, in order: source, dataset, optional train/test split, models,
    optional publishing of the first model, predictions (when a test set
    is given and `args.evaluate` is not set) and, when `votes_files` is
    given, combination of previously stored votes.
    """
    model = None
    models = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and
            (args.black_box or args.white_box or args.public_dataset)):
        raise Exception("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}

    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                open(log, 'w', 0).close()
            except IOError:
                pass

    source, resume, csv_properties, fields = source_processing(
        training_set, test_set, training_set_header, test_set_header,
        name, description, api, args, resume,
        csv_properties=csv_properties, field_attributes=field_attributes,
        types=types, session_file=session_file, path=path, log=log)

    dataset, resume, csv_properties, fields = dataset_processing(
        source, training_set, test_set, model_ids, name, description,
        fields, dataset_fields, api, args, resume,
        csv_properties=csv_properties, session_file=session_file,
        path=path, log=log)

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = split_processing(
            dataset, name, description, api, args, resume,
            session_file=session_file, path=path, log=log)

    models, model_ids, resume = models_processing(
        dataset, models, model_ids, name, description, test_set,
        objective_field, fields, model_fields, api, args, resume,
        session_file=session_file, path=path, log=log)
    if models:
        model = models[0]

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box or
                                        args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        fields, objective_field = get_model_fields(
            model, model_fields, csv_properties, args)

    # If predicting
    if models and test_set and not args.evaluate:
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args.remote, api, log,
                args.max_batch_models, args.method, resume, args.tag,
                args.verbosity, session_file, args.debug, args.ensemble,
                args.prediction_info)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # The model id is recovered from the first votes file name.
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = bigml.api.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.combine_votes(votes_files, local_model.to_prediction,
                        output, args.method)
def remote_predict_models(models, test_reader, prediction_file, api, args,
                          resume=False, output_path=None,
                          session_file=None, log=None, exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to
       file

    All test rows are read first. For each model, one remote prediction
    per row is created and stored in that model's predictions file,
    unless `resume` is set and the checkpoint reports the file as done.
    With a single model the predictions are also written directly to
    `prediction_file`; with several models the per-model files are merged
    by `combine_votes`.
    """
    predictions_files = []
    prediction_args = {"tags": args.tag}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = [input_data for input_data in test_reader]

    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()

    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)

        # Skip models whose predictions were completed in a previous run.
        if resume and c.checkpoint(c.are_predictions_created,
                                   predictions_file,
                                   test_reader.number_of_tests(),
                                   debug=args.debug)[0]:
            continue

        if not message_logged:
            message = u.dated("Creating remote predictions.\n")
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            message_logged = True

        with UnicodeWriter(predictions_file) as votes_writer:
            for input_data in raw_input_data_list:
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(model, input_data_dict,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'],
                              log_file=log)
                prediction_row = prediction_to_row(prediction)
                votes_writer.writerow(prediction_row)
                if single_model:
                    write_prediction(prediction_row[0:2], prediction_file,
                                     args.prediction_info, input_data,
                                     exclude)

    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files, Model(models[0]).to_prediction,
                      prediction_file, args.method, args.prediction_info,
                      raw_input_data_list, exclude)
def i_create_a_local_model(step):
    # Wraps the model held in world.model in a local Model and stores it
    # in world.local_model.
    local_model = Model(world.model)
    world.local_model = local_model
# compute_output(api, args): driver that goes from the command-line `args`
# to models and, optionally, predictions. Stages, in the order they appear:
#   1. Set-up: reads resume / model ids / output / dataset fields from
#      `args`, runs check_args_coherence, resolves the output directory,
#      the session log and the optional log file, and clears the log.
#   2. Multi-label: when args.multi_label is set and a training set is
#      given, the training file is expanded with ps.multi_label_expansion
#      and the objective field is taken from the resulting metadata.
#   3. Source: read from args.source_file, otherwise produced by
#      ps.source_processing.
#   4. Datasets: read from args.dataset_file / args.datasets, otherwise
#      produced by pd.dataset_processing. Then optionally exported
#      (args.to_csv), split (args.test_split), divided per category
#      (args.max_categories), merged (args.multi_dataset) or regenerated
#      (new fields, sampling without model, lisp/json filters).
#   5. Models: read from args.model_file / args.ensemble_file, otherwise
#      produced by pm.models_processing. The first model is re-fetched when
#      it is only an id or not FINISHED, and its shared / published state
#      is updated when requested.
#   6. Predictions: when there is test data and args.evaluate is not set,
#      either a remote batch prediction (remote_predict) or predict(...).
#   7. Votes: when args.votes_files_ is set, the stored votes are merged
#      with combine_votes using a local Model.
# NOTE(review): relies on Python 2 constructs (`except ValueError,
#   exception`, `basestring`, `/` used for an integer count in
#   models_per_label) -- confirm the target interpreter.
# NOTE(review): `all_labels` is only assigned inside multi-label branches
#   but is read in `if args.multi_label and not all_labels`; it looks like
#   it can be unbound on some multi-label paths -- TODO confirm.
# NOTE(review): `ensemble_id`, `test_set` and `test_set_header` are
#   assigned below but not read again in this function as shown -- verify
#   whether they are leftovers.
def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ source = None dataset = None model = None models = None fields = None other_label = OTHER ensemble_ids = [] multi_label_data = None multi_label_fields = [] #local_ensemble = None test_dataset = None datasets = None # variables from command-line options resume = args.resume_ model_ids = args.model_ids_ output = args.predictions dataset_fields = args.dataset_fields_ check_args_coherence(args) path = u.check_dir(output) session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # labels to be used in multi-label expansion labels = (None if args.labels is None else [label.strip() for label in args.labels.split(args.args_separator)]) if labels is not None: labels = sorted([label for label in labels]) # multi_label file must be preprocessed to obtain a new extended file if args.multi_label and args.training_set is not None: (args.training_set, multi_label_data) = ps.multi_label_expansion( args.training_set, args.train_header, args, path, labels=labels, session_file=session_file) args.train_header = True args.objective_field = multi_label_data["objective_name"] all_labels = l.get_all_labels(multi_label_data) if not labels: labels = all_labels else: all_labels = labels if args.objective_field: csv_properties.update({'objective_field': args.objective_field}) if args.source_file: # source is retrieved from the contents of the given local JSON file source, csv_properties, fields = u.read_local_resource( args.source_file, csv_properties=csv_properties) else: # source is retrieved from the remote object source, resume, csv_properties, fields = ps.source_processing( api, args, resume, 
csv_properties=csv_properties, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) if args.multi_label and source: multi_label_data = l.get_multi_label_data(source) (args.objective_field, labels, all_labels, multi_label_fields) = l.multi_label_sync(args.objective_field, labels, multi_label_data, fields, multi_label_fields) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) if args.dataset_file: # dataset is retrieved from the contents of the given local JSON file model_dataset, csv_properties, fields = u.read_local_resource( args.dataset_file, csv_properties=csv_properties) if not args.datasets: datasets = [model_dataset] dataset = model_dataset else: datasets = u.read_datasets(args.datasets) if not datasets: # dataset is retrieved from the remote object datasets, resume, csv_properties, fields = pd.dataset_processing( source, api, args, resume, fields=fields, csv_properties=csv_properties, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) if datasets: dataset = datasets[0] if args.to_csv is not None: resume = pd.export_dataset(dataset, api, args, resume, session_file=session_file, path=path) # Now we have a dataset, let's check if there's an objective_field # given by the user and update it in the fields structure args.objective_id_ = get_objective_id(args, fields) # If test_split is used, split the dataset in a training and a test dataset # according to the given split if args.test_split > 0: dataset, test_dataset, resume = pd.split_processing( dataset, api, args, resume, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) datasets[0] = dataset # Check if the dataset has a categorical objective field and it # has a max_categories limit for categories if args.max_categories > 0 and len(datasets) == 1: if pd.check_max_categories(fields.fields[args.objective_id_]): distribution = pd.get_categories_distribution(dataset, 
args.objective_id_) if distribution and len(distribution) > args.max_categories: categories = [element[0] for element in distribution] other_label = pd.create_other_label(categories, other_label) datasets, resume = pd.create_categories_datasets( dataset, distribution, fields, args, api, resume, session_file=session_file, path=path, log=log, other_label=other_label) else: sys.exit("The provided objective field is not categorical nor " "a full terms only text field. " "Only these fields can be used with" " --max-categories") # If multi-dataset flag is on, generate a new dataset from the given # list of datasets if args.multi_dataset: dataset, resume = pd.create_new_dataset( datasets, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) datasets = [dataset] # Check if the dataset has a generators file associated with it, and # generate a new dataset with the specified field structure. Also # if the --to-dataset flag is used to clone or sample the original dataset if args.new_fields or (args.sample_rate != 1 and args.no_model) or \ (args.lisp_filter or args.json_filter) and not has_source(args): if fields is None: if isinstance(dataset, basestring): dataset = u.check_resource(dataset, api=api) fields = Fields(dataset, csv_properties) args.objective_id_ = get_objective_id(args, fields) args.objective_name_ = fields.field_name(args.objective_id_) dataset, resume = pd.create_new_dataset( dataset, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) datasets[0] = dataset # rebuild fields structure for new ids and fields csv_properties.update({'objective_field': args.objective_name_, 'objective_field_present': True}) fields = pd.get_fields_structure(dataset, csv_properties) args.objective_id_ = get_objective_id(args, fields) if args.multi_label and dataset and multi_label_data is None: multi_label_data = l.get_multi_label_data(dataset) (args.objective_field, labels, all_labels, multi_label_fields) = 
l.multi_label_sync(args.objective_field, labels, multi_label_data, fields, multi_label_fields) if dataset: # retrieves max_categories data, if any args.max_categories = get_metadata(dataset, 'max_categories', args.max_categories) other_label = get_metadata(dataset, 'other_label', other_label) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) if args.model_file: # model is retrieved from the contents of the given local JSON file model, csv_properties, fields = u.read_local_resource( args.model_file, csv_properties=csv_properties) models = [model] model_ids = [model['resource']] ensemble_ids = [] elif args.ensemble_file: # model is retrieved from the contents of the given local JSON file ensemble, csv_properties, fields = u.read_local_resource( args.ensemble_file, csv_properties=csv_properties) model_ids = ensemble['object']['models'][:] ensemble_ids = [ensemble['resource']] models = model_ids[:] model = retrieve_resource(bigml.api.BigML(storage='./storage'), models[0], query_string=r.ALL_FIELDS_QS) models[0] = model else: # model is retrieved from the remote object models, model_ids, ensemble_ids, resume = pm.models_processing( datasets, models, model_ids, api, args, resume, fields=fields, session_file=session_file, path=path, log=log, labels=labels, multi_label_data=multi_label_data, other_label=other_label) if models: model = models[0] single_model = len(models) == 1 # If multi-label flag is set and no training_set was provided, label # info is extracted from the user_metadata. If models belong to an # ensemble, the ensemble must be retrieved to get the user_metadata. 
if model and args.multi_label and multi_label_data is None: if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict): resource = ensemble_ids[0] elif belongs_to_ensemble(model): ensemble_id = get_ensemble_id(model) resource = r.get_ensemble(ensemble_id, api=api, verbosity=args.verbosity, session_file=session_file) else: resource = model multi_label_data = l.get_multi_label_data(resource) # We update the model's public state if needed if model: if (isinstance(model, basestring) or bigml.api.get_status(model)['code'] != bigml.api.FINISHED): if not args.evaluate and not a.has_train(args) and \ not a.has_test(args) : query_string = MINIMUM_MODEL elif not args.test_header: query_string = r.ALL_FIELDS_QS else: query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS) model = u.check_resource(model, api.get_model, query_string=query_string) models[0] = model if (args.black_box or args.white_box or (args.shared_flag and r.shared_changed(args.shared, model))): model_args = {} if args.shared_flag and r.shared_changed(args.shared, model): model_args.update(shared=args.shared) if args.black_box or args.white_box: model_args.update(r.set_publish_model_args(args)) if model_args: model = r.update_model(model, model_args, args, api=api, path=path, session_file=session_file) models[0] = model # We get the fields of the model if we haven't got # them yet and need them if model and not args.evaluate and (a.has_test(args) or args.export_fields): # If more than one model, use the full field structure if (not single_model and not args.multi_label and belongs_to_ensemble(model)): if len(ensemble_ids) > 0: ensemble_id = ensemble_ids[0] args.ensemble_ids_ = ensemble_ids else: ensemble_id = get_ensemble_id(model) fields = pm.get_model_fields( model, csv_properties, args, single_model=single_model, multi_label_data=multi_label_data) # Free memory after getting fields # local_ensemble = None gc.collect() # Fills in all_labels from user_metadata if args.multi_label and not all_labels: 
(args.objective_field, labels, all_labels, multi_label_fields) = l.multi_label_sync(args.objective_field, labels, multi_label_data, fields, multi_label_fields) if model: # retrieves max_categories data, if any args.max_categories = get_metadata(model, 'max_categories', args.max_categories) other_label = get_metadata(model, 'other_label', other_label) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) # If predicting if (models and (a.has_test(args) or (test_dataset and args.remote)) and not args.evaluate): models_per_label = 1 if test_dataset is None: test_dataset = get_test_dataset(args) if args.multi_label: # When prediction starts from existing models, the # multi_label_fields can be retrieved from the user_metadata # in the models if args.multi_label_fields is None and multi_label_fields: multi_label_field_names = [field[1] for field in multi_label_fields] args.multi_label_fields = ",".join(multi_label_field_names) test_set = ps.multi_label_expansion( args.test_set, args.test_header, args, path, labels=labels, session_file=session_file, input_flag=True)[0] test_set_header = True # Remote predictions: predictions are computed as batch predictions # in bigml.com except when --no-batch flag is set on or multi-label # or max-categories are used if (args.remote and not args.no_batch and not args.multi_label and not args.method in [THRESHOLD_CODE, COMBINATION]): # create test source from file test_name = "%s - test" % args.name if args.test_source is None: test_properties = ps.test_source_processing( api, args, resume, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( 
test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) csv_properties.update(objective_field=None, objective_field_present=False) test_fields = pd.get_fields_structure(test_dataset, csv_properties) if args.to_dataset and args.dataset_off: model = api.check_resource(model['resource'], query_string=r.ALL_FIELDS_QS) model_fields = Fields(model) objective_field_name = model_fields.field_name( \ model_fields.objective_field) if objective_field_name in test_fields.fields_by_name.keys(): args.prediction_name = "%s (predicted)" % \ objective_field_name batch_prediction_args = r.set_batch_prediction_args( args, fields=fields, dataset_fields=test_fields) remote_predict(model, test_dataset, batch_prediction_args, args, api, resume, prediction_file=output, session_file=session_file, path=path, log=log) else: models_per_label = args.number_of_models if (args.multi_label and len(ensemble_ids) > 0 and args.number_of_models == 1): # use case where ensembles are read from a file models_per_label = len(models) / len(ensemble_ids) predict(models, fields, args, api=api, log=log, resume=resume, session_file=session_file, labels=labels, models_per_label=models_per_label, other_label=other_label, multi_label_data=multi_label_data) # When combine_votes flag is used, retrieve the predictions files saved # in the comma separated list of directories and combine them if args.votes_files_: model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$', r'\1', args.votes_files_[0]).replace("_", "/") try: model = u.check_resource(model_id, api.get_model) except ValueError, exception: sys.exit("Failed to get model %s: %s" % (model_id, str(exception))) local_model = Model(model) message = u.dated("Combining votes.\n") u.log_message(message, log_file=session_file, console=args.verbosity) combine_votes(args.votes_files_, 
local_model.to_prediction, output, method=args.method)
# Example script: trains a BigML model on the public diabetes CSV and runs
# one local prediction with it.
import os

from bigml.api import BigML
from bigml.model import Model

# Credentials are read from the environment instead of being stored in the
# source. A KeyError here means BIGML_USERNAME / BIGML_API_KEY are not set.
# NOTE(review): the API key that used to be hard-coded on this line should
# be treated as leaked and rotated.
api = BigML(os.environ["BIGML_USERNAME"], os.environ["BIGML_API_KEY"])

# Each remote resource is created and then waited for with api.ok().
source = api.create_source('https://static.bigml.com/csv/diabetes.csv')
api.ok(source)
dataset = api.create_dataset(source)
api.ok(dataset)
model = api.create_model(dataset)
api.ok(model)

# Predict locally with the downloaded model.
local_model = Model(model)
input_data = {
    "age": 65,
    "bmi": 36,
    "plasma glucose": 180,
    "pregnancies": 3,
}
local_model.predict(input_data, add_confidence=True)
# Constructor of a local ensemble (the class header is not visible here).
# `ensemble` is handled in three shapes: a list made only of Model objects,
# a list of model references (validated with get_model_id, ValueError on
# failure), or anything else, which is resolved through
# self.get_ensemble_resource / get_ensemble_id.
#
# Cache path: when use_cache(cache_get) is true, the instance attributes
# are loaded with load(get_ensemble_id(ensemble), cache_get) into
# self.__dict__, the API connection is re-created, a MultiModel is built
# from cached Models when there is a single split, and the constructor
# returns early (the exact nesting of that `return` is not recoverable
# from this flattened text).
#
# Regular path, in order:
#   - resets the instance attributes and resolves the model list / ids;
#   - for an ensemble resource, reads boosting info, distributions,
#     importance, fields, objective field and input fields from
#     ensemble['object'];
#   - splits the models in chunks of `max_models` (one chunk when None);
#   - with one split retrieves every model, otherwise only the first one
#     (from the cache function or with retrieve_resource);
#   - fills self.distributions when still None, calls
#     self._add_models_attrs when not boosting, and computes fields /
#     objective id with self.all_model_fields when fields are missing;
#   - derives self.regression, boosting offsets and class names, calls
#     ModelFields.__init__, and builds self.multi_model for one split.
# NOTE(review): `self.model_splits = []` is set on the first statement but
#   every later access uses `self.models_splits`; looks like a leftover or
#   a typo -- TODO confirm.
# NOTE(review): errors from the user-given cache function are re-raised as
#   a plain Exception that embeds cache_get.__name__ and the original
#   message.
def __init__(self, ensemble, api=None, max_models=None, cache_get=None): self.model_splits = [] self.multi_model = None self.api = get_api_connection(api) self.fields = None self.class_names = None if use_cache(cache_get): # using a cache to store the model attributes self.__dict__ = load(get_ensemble_id(ensemble), cache_get) self.api = get_api_connection(api) if len(self.models_splits) == 1: # retrieve the models from a cache get function try: models = [ Model(model_id, cache_get=cache_get) for model_id in self.models_splits[0] ] except Exception as exc: raise Exception('Error while calling the user-given' ' function %s: %s' % (cache_get.__name__, str(exc))) self.multi_model = MultiModel(models, self.api, fields=self.fields, class_names=self.class_names, cache_get=cache_get) return self.resource_id = None self.objective_id = None self.distributions = None self.distribution = None self.boosting = None self.boosting_offsets = None self.cache_get = None self.regression = False self.importance = {} query_string = ONLY_MODEL no_check_fields = False self.input_fields = [] if isinstance(ensemble, list): if all([isinstance(model, Model) for model in ensemble]): models = ensemble self.model_ids = [ local_model.resource_id for local_model in models ] else: try: models = [get_model_id(model) for model in ensemble] self.model_ids = models except ValueError as exc: raise ValueError('Failed to verify the list of models.' 
' Check your model id values: %s' % str(exc)) else: ensemble = self.get_ensemble_resource(ensemble) self.resource_id = get_ensemble_id(ensemble) if not check_local_but_fields(ensemble): # avoid checking fields because of old ensembles ensemble = retrieve_resource(self.api, self.resource_id, no_check_fields=True) if ensemble['object'].get('type') == BOOSTING: self.boosting = ensemble['object'].get('boosting') models = ensemble['object']['models'] self.distributions = ensemble['object'].get('distributions', []) self.importance = ensemble['object'].get('importance', []) self.model_ids = models # new ensembles have the fields structure if ensemble['object'].get('ensemble'): self.fields = ensemble['object'].get( \ 'ensemble', {}).get("fields") self.objective_id = ensemble['object'].get("objective_field") query_string = EXCLUDE_FIELDS no_check_fields = True self.input_fields = ensemble['object'].get('input_fields') number_of_models = len(models) if max_models is None: self.models_splits = [models] else: self.models_splits = [ models[index:(index + max_models)] for index in range(0, number_of_models, max_models) ] if len(self.models_splits) == 1: if not isinstance(models[0], Model): if use_cache(cache_get): # retrieve the models from a cache get function try: models = [ Model(model_id, cache_get=cache_get) for model_id in self.models_splits[0] ] self.cache_get = cache_get except Exception as exc: raise Exception('Error while calling the user-given' ' function %s: %s' % (cache_get.__name__, str(exc))) else: models = [retrieve_resource( \ self.api, model_id, query_string=query_string, no_check_fields=no_check_fields) for model_id in self.models_splits[0]] model = models[0] else: # only retrieving first model self.cache_get = cache_get if not isinstance(models[0], Model): if use_cache(cache_get): # retrieve the models from a cache get function try: model = Model(self.models_splits[0][0], cache_get=cache_get) self.cache_get = cache_get except Exception as exc: raise 
Exception('Error while calling the user-given' ' function %s: %s' % (cache_get.__name__, str(exc))) else: model = retrieve_resource( \ self.api, self.models_splits[0][0], query_string=query_string, no_check_fields=no_check_fields) models = [model] if self.distributions is None: try: self.distributions = [] for model in models: self.distributions.append( {'training': model.root_distribution}) except AttributeError: self.distributions = [ model['object']['model']['distribution'] for model in models ] if self.boosting is None: self._add_models_attrs(model, max_models) if self.fields is None: self.fields, self.objective_id = self.all_model_fields( max_models=max_models) if self.fields: add_distribution(self) self.regression = \ self.fields[self.objective_id].get('optype') == NUMERIC if self.boosting: self.boosting_offsets = ensemble['object'].get('initial_offset', 0) \ if self.regression else dict(ensemble['object'].get( \ 'initial_offsets', [])) if not self.regression: try: objective_field = self.fields[self.objective_id] categories = objective_field['summary']['categories'] classes = [category[0] for category in categories] except (AttributeError, KeyError): classes = set() for distribution in self.distributions: for category in distribution['training']['categories']: classes.add(category[0]) self.class_names = sorted(classes) self.objective_categories = [category for \ category, _ in self.fields[self.objective_id][ \ "summary"]["categories"]] ModelFields.__init__( \ self, self.fields, objective_id=self.objective_id) if len(self.models_splits) == 1: self.multi_model = MultiModel(models, self.api, fields=self.fields, class_names=self.class_names)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None,
                   description=None,
                   field_attributes=None,
                   types=None,
                   dataset_fields=None,
                   model_fields=None,
                   name=None, training_set_header=True,
                   test_set_header=True, model_ids=None,
                   votes_files=None, resume=False, fields_map=None,
                   test_field_attributes=None,
                   test_types=None):
    """Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`.

    The function chains the processing helpers in order: source, dataset
    (including the optional split, max-categories, multi-dataset and
    new-fields variants), models, and finally either remote batch
    predictions, local predictions, or the combination of previously
    stored votes files.

    Arguments:
        api: connection object handed to every processing helper and used
            directly to fetch models, sources and datasets.
        args: options object; its attributes are read throughout and
            `args.max_categories` / `args.multi_label_fields` may be
            overwritten here.
        training_set, test_set: training and test data handed to the
            source/dataset helpers.
        output: predictions file; its directory is used for the session
            log.
        objective_field: objective field name or column number.
        description, name: texts forwarded to the resource creation calls.
        field_attributes, types, dataset_fields, model_fields, fields_map,
        test_field_attributes, test_types: forwarded untouched to the
            helpers that consume them.
        training_set_header, test_set_header: header flags for the files.
        model_ids: ids of existing models, forwarded to model processing.
        votes_files: predictions files to be combined, if any.
        resume: resume flag threaded through (and updated by) the helpers.

    The process is terminated with `sys.exit` when the options are
    inconsistent or when the model needed to combine votes cannot be
    retrieved.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    local_ensemble = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and
            (args.black_box or args.white_box or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --max-categories, it is compulsory to specify also the
    # objective_field
    if args.max_categories > 0 and objective_field is None:
        sys.exit("When --max-categories is used, you must also provide the"
                 " --objective field name or column number")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                # Opening for writing truncates the file; nothing is
                # written, so no buffering argument is needed.
                open(log, 'w').close()
            except IOError:
                # best effort: a log that cannot be cleared is not fatal
                pass

    # labels to be used in multi-label expansion
    labels = (map(str.strip, args.labels.split(','))
              if args.labels is not None else None)
    if labels is not None:
        labels = sorted([label.decode("utf-8") for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and training_set is not None:
        (training_set, multi_label_data) = ps.multi_label_expansion(
            training_set, training_set_header, objective_field, args, path,
            labels=labels, session_file=session_file)
        training_set_header = True
        objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels

    source, resume, csv_properties, fields = ps.source_processing(
        training_set, test_set, training_set_header, test_set_header,
        api, args, resume, name=name, description=description,
        csv_properties=csv_properties, field_attributes=field_attributes,
        types=types, multi_label_data=multi_label_data,
        session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field, labels,
                                                  multi_label_data, fields,
                                                  multi_label_fields)

    datasets, resume, csv_properties, fields = pd.dataset_processing(
        source, training_set, test_set, fields, objective_field,
        api, args, resume, name=name, description=description,
        dataset_fields=dataset_fields, multi_label_data=multi_label_data,
        csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume, name=name, description=description,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        objective_id = fields.field_id(fields.objective_field)
        if pd.check_max_categories(fields.fields[objective_id]):
            distribution = pd.get_categories_distribution(dataset,
                                                          objective_id)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path,
                    log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure
    if args.new_fields:
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field, labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label',
                                   other_label)
    models, model_ids, ensemble_ids, resume = pm.models_processing(
        datasets, models, model_ids,
        objective_field, fields, api, args, resume,
        name=name, description=description, model_fields=model_fields,
        session_file=session_file, path=path, log=log, labels=labels,
        multi_label_data=multi_label_data, other_label=other_label)
    if models:
        model = models[0]
        single_model = len(models) == 1
    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        # a bare id string is expanded into the model resource first
        if isinstance(model, basestring):
            if not args.evaluate:
                query_string = MINIMUM_MODEL
            else:
                query_string = r.FIELDS_QS
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
        if (args.black_box or args.white_box or
                r.shared_changed(args.shared, model)):
            model_args = {}
            if r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and test_set:
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
            local_ensemble = Ensemble(ensemble_id, api=api)
        fields, objective_field = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data, local_ensemble=local_ensemble)

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field, labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label',
                                   other_label)
    # If predicting
    if models and has_test(args) and not args.evaluate:
        models_per_label = 1
        test_dataset = None
        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                test_set, test_set_header, objective_field, args, path,
                labels=labels, session_file=session_file, input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method not in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % name
            if args.test_source is None:
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     test_set, test_set_header,
                     api, args, resume, name=test_name,
                     description=description,
                     field_attributes=test_field_attributes,
                     types=test_types,
                     session_file=session_file, path=path, log=log)
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id,
                                                 api.get_source)
            if args.test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(test_name,
                                                        description, args)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(args.test_dataset)
                test_dataset = api.check_resource(test_dataset_id,
                                                  api.get_dataset)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                name, description, args, fields=fields,
                dataset_fields=test_fields, fields_map=fields_map)

            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                # NOTE(review): relies on `/` being integer division for
                # ints; confirm no true-division import is active here.
                models_per_label = len(models) / len(ensemble_ids)
            predict(test_set, test_set_header, models, fields, output,
                    objective_field, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # the model id is rebuilt from the first votes file name:
        # model_<24 hex chars>__predictions.csv -> model/<24 hex chars>
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id, str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(votes_files, local_model.to_prediction,
                      output, args.method)
def i_create_a_local_model_from_file(step, model_file):
    """Build a local Model from a stored file and keep it in `world`.

    `model_file` is resolved through `res_filename` and the resulting
    Model is stored as `world.local_model`. `step` is not used.
    """
    model_path = res_filename(model_file)
    local_model = Model(model_path)
    world.local_model = local_model
def localSinglePred(model, vals1, vals2):
    """Compute a single local prediction and return it.

    Arguments:
        model: value handed to `Model(...)` (a model id, per the original
            inline note).
        vals1: value used for the "features1" input field.
        vals2: value used for the "features2" input field.

    Returns:
        Whatever `Model.predict` returns for the input data. The original
        code discarded this value, so the function always returned None.
    """
    input_data2 = {"features1": vals1, "features2": vals2}
    local_model = Model(model)  # model Id
    return local_model.predict(input_data2)
def i_create_local_model_from_file(step, export_file):
    """Build a local Model from an exported file and keep it in `world`.

    The Model is created with a connection carrying placeholder
    credentials; presumably this checks that loading from a local file
    does not need a valid account -- confirm against the scenario.
    `step` is not used.
    """
    invalid_api = BigML("wrong-user", "wrong-api-key")
    export_path = res_filename(export_file)
    world.local_model = Model(export_path, api=invalid_api)
# Run the script that builds a model or an ensemble from the training set
# and wait for the execution to finish.
EXECUTION_model_or_enemble = API.create_execution(
    SCRIPT_model_or_ensemble['resource'],
    {'inputs': [["ts-id", training_set]]})
API.ok(EXECUTION_model_or_enemble)
model_or_ensemble = EXECUTION_model_or_enemble["object"]["execution"]["result"]

# Locally store the model or ensemble. The result is a resource id; an id
# starting with 'e' is treated as an ensemble, anything else as a model.
if model_or_ensemble[:1] == 'e':
    global local_ensemble
    local_ensemble = Ensemble(model_or_ensemble)
    picklEoR = local_ensemble
else:
    global local_model
    local_model = Model(model_or_ensemble)
    picklEoR = local_model

# Batch prediction to check if the model is accurate
batch_prediction = API.create_batch_prediction(model_or_ensemble,
                                               testing_set,
                                               {"all_fields": True})
API.ok(batch_prediction)
# filename[:-4] drops the last four characters (presumably the extension)
API.download_batch_prediction(
    batch_prediction,
    filename=(filename[:-4] + "-Model-or-Ensemble-Check.csv"))

# Store the data that has been created from this python file. The context
# manager guarantees the file is closed even if pickling fails.
with open(pickle_store, 'wb') as f:
    pickle.dump([feature_names, model_or_ensemble, picklEoR], f)
def predict(models, fields, args, api=None, log=None,
            resume=False, session_file=None, labels=None, models_per_label=1,
            other_label=OTHER, multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions computed locally using MultiModels on subgroups of models.
       Choosing a max_batch_models value not bigger than the number_of_models
       flag will lead to the last case, where memory usage is bounded and each
       model predictions are saved for further use.

       Arguments:
           models: list of models used to predict.
           fields: fields structure handed to the test reader.
           args: options object; `args.threshold_class` may be filled in
               here when the threshold method is used without a class.
           api, log, resume, session_file: forwarded to the prediction
               helpers.
           labels, models_per_label, other_label, multi_label_data:
               forwarded to the batched local prediction helper.

       The test reader is always closed, including when the remote
       (no-batch) branch returns early or a helper raises.
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field,
                             test_separator=args.test_separator)
    try:
        prediction_file = output
        output_path = u.check_dir(output)
        with UnicodeWriter(output) as output:
            # columns to exclude if input_data is added to the prediction
            # field
            exclude = use_prediction_headers(
                args.prediction_header, output, test_reader, fields, args,
                objective_field)

            # Remote predictions: predictions are computed in bigml.com and
            # stored in a file named after the model in the following
            # syntax:
            #     model_[id of the model]__predictions.csv
            # For instance,
            #     model_50c0de043b563519830001c2_predictions.csv
            # Predictions are computed individually only if no_batch flag
            # is set
            if args.remote and args.no_batch and not args.multi_label:
                if args.ensemble is not None:
                    remote_predict_ensemble(args.ensemble, test_reader,
                                            prediction_file, api, args,
                                            resume, output_path,
                                            session_file, log, exclude)
                else:
                    remote_predict_models(models, test_reader,
                                          prediction_file, api, args,
                                          resume, output_path,
                                          session_file, log, exclude)
                return
            # Local predictions: Predictions are computed locally using
            # models' rules with MultiModel's predict method
            message = u.dated("Creating local predictions.\n")
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            options = {}
            if args.method == THRESHOLD_CODE:
                options.update(threshold=args.threshold)
                if args.threshold_class is None:
                    local_model = Model(models[0])
                    # default class is the first class that appears in the
                    # dataset objective field summary, which might be
                    # different from the objective summary of each model
                    # because models are built with sampling
                    objective_field = local_model.objective_id
                    distribution = local_model.tree.fields[objective_field][
                        "summary"]["categories"]
                    args.threshold_class = distribution[0][0]
                options.update(category=args.threshold_class)
            # For a model we build a Model and for a small number of models,
            # we build a MultiModel using all of
            # the given models and issue a combined prediction
            if (len(models) <= args.max_batch_models
                    and args.fast and
                    not args.multi_label and args.max_categories == 0
                    and args.method != COMBINATION):
                local_predict(models, test_reader, output, args, options,
                              exclude)
            elif args.boosting:
                local_predict(args.ensemble, test_reader, output, args,
                              options, exclude)
            # For large numbers of models, we split the list of models in
            # chunks and build a MultiModel for each chunk, issue and store
            # predictions for each model and combine all of them eventually.
            else:
                # Local predictions: predictions are computed locally using
                # models' rules with MultiModel's predict method and
                # combined using aggregation if the objective field is a
                # multi-labelled field or one of the available combination
                # methods: plurality, confidence weighted and probability
                # weighted
                if args.multi_label:
                    method = AGGREGATION
                elif args.max_categories > 0:
                    method = COMBINATION
                else:
                    method = args.method

                # For multi-labelled models, the --models flag keeps the
                # order of the labels and the models but the --model-tag
                # flag retrieves the models with no order, so the
                # correspondence with each label must be restored.
                ordered = True
                if args.multi_label and (args.model_tag is not None
                                         or models_per_label > 1):
                    ordered = False
                local_batch_predict(models, test_reader, prediction_file,
                                    api, args, resume=resume,
                                    output_path=output_path,
                                    output=output, method=method,
                                    options=options,
                                    session_file=session_file, labels=labels,
                                    ordered=ordered, exclude=exclude,
                                    models_per_label=models_per_label,
                                    other_label=other_label,
                                    multi_label_data=multi_label_data)
    finally:
        # previously skipped by the early return of the remote branch
        test_reader.close()
def create_local_ensemble_with_list_of_local_models(step, number_of_models):
    """Build a local Ensemble out of local Models and keep it in `world`.

    A local Model is created for each entry of the
    `world.models[-int(number_of_models):]` slice; the resulting list and
    `world.api` are used to build the Ensemble stored as
    `world.local_ensemble`. `step` is not used.
    """
    how_many = int(number_of_models)
    local_models = []
    for remote_model in world.models[-how_many:]:
        local_models.append(Model(remote_model))
    world.local_ensemble = Ensemble(local_models, world.api)
def i_create_local_model_from_file(step, export_file):
    """Build a local Model from an exported file and keep it in `world`.

    `export_file` is resolved through `res_filename` and the resulting
    Model is stored as `world.local_model`. `step` is not used.
    """
    export_path = res_filename(export_file)
    local_model = Model(export_path)
    world.local_model = local_model
# Requires BigML Python bindings
#
# Install via: pip install bigml
#
# or clone it:
#   git clone https://github.com/bigmlcom/python.git
from bigml.model import Model
from bigml.api import BigML

# Credentials are deliberately not embedded in the source: when no
# username/api_key is passed, the BigML connection reads them from the
# BIGML_USERNAME and BIGML_API_KEY environment variables.
api = BigML(dev_mode=True, domain="bigml.io")

# Downloads and generates a local version of the model, if it
# hasn't been downloaded previously.
model = Model('model/5900dbaf014404467d000811', api=api)

# To make predictions fill the desired input_data
# (e.g. {"petal length": 1, "sepal length": 3})
# as first parameter in next line.
model.predict({}, with_confidence=True)
# The result is a list of three elements: prediction, confidence and
# distribution