def compute_output(api, args):
    """Create or retrieve sources, datasets and models and compute predictions.

    Creates one or more models using the training set described in `args`,
    or uses previously created BigML models (given by id, by a local model
    file or by a local ensemble file) to make predictions for the test set.
    When `args.votes_files_` is set, the prediction files it lists are
    combined at the end.

    :param api: API connection object; it is handed to every processing
                helper and its `get_model` and `check_resource` methods
                are used directly here
    :param args: parsed command-line options. Several attributes are
                 updated in place (e.g. `training_set`, `train_header`,
                 `objective_field`, `objective_id_`, `objective_name_`,
                 `max_categories`, `multi_label_fields`)

    Calls `sys.exit` when `--max-categories` is used with an objective field
    that `pd.check_max_categories` rejects, or when the model needed to
    combine votes cannot be retrieved.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    test_dataset = None
    datasets = None

    # variables from command-line options
    resume = args.resume_
    model_ids = args.model_ids_
    output = args.predictions

    check_args_coherence(args)
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}

    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = None
    if args.labels is not None:
        labels = sorted(label.strip() for label
                        in args.labels.split(args.args_separator))

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set, args.train_header, args, path,
            labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        # keeps `all_labels` bound on every path; it is tested further down
        all_labels = labels

    if args.objective_field:
        csv_properties.update({"objective_field": args.objective_field})

    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)

    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data, fields,
             multi_label_fields)

    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)

    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)

    if datasets:
        dataset = datasets[0]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume,
                    session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    # (parentheses only spell out the original `and`-before-`or` precedence)
    if (args.new_fields
            or (args.sample_rate != 1 and args.no_model)
            or ((args.lisp_filter or args.json_filter)
                and not has_source(args))):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
            args.objective_id_ = get_objective_id(args, fields)
            args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({"objective_field": args.objective_name_,
                               "objective_field_present": True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)

    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data, fields,
             multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, "max_categories",
                                           args.max_categories)
        other_label = get_metadata(dataset, "other_label", other_label)

    if args.model_file:
        # model is retrieved from the contents of the given local JSON file
        model, csv_properties, fields = u.read_local_resource(
            args.model_file, csv_properties=csv_properties)
        models = [model]
        model_ids = [model["resource"]]
        ensemble_ids = []
    elif args.ensemble_file:
        # model is retrieved from the contents of the given local JSON file
        ensemble, csv_properties, fields = u.read_local_resource(
            args.ensemble_file, csv_properties=csv_properties)
        model_ids = ensemble["object"]["models"][:]
        ensemble_ids = [ensemble["resource"]]
        models = model_ids[:]
        model = retrieve_resource(bigml.api.BigML(storage="./storage"),
                                  models[0], query_string=r.ALL_FIELDS_QS)
        models[0] = model
    else:
        # model is retrieved from the remote object
        models, model_ids, ensemble_ids, resume = pm.models_processing(
            datasets, models, model_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log, labels=labels,
            multi_label_data=multi_label_data, other_label=other_label)

    if models:
        model = models[0]
        single_model = len(models) == 1

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)["code"] != bigml.api.FINISHED):
            # choose how much of the model to download
            if not args.evaluate and not a.has_train(args):
                query_string = MINIMUM_MODEL
            elif not args.test_header:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS)
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
            models[0] = model
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and args.test_set:
        # If more than one model, use the full field structure
        # NOTE(review): `ensemble_id` is not read after this point in the
        # visible code (the local ensemble that used it is gone); kept as-is
        # because the helper calls are opaque from here -- confirm and drop.
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
        fields = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data)
        # Free memory after getting fields
        gc.collect()

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (args.objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data, fields,
             multi_label_fields)

    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, "max_categories",
                                           args.max_categories)
        other_label = get_metadata(model, "other_label", other_label)

    # If predicting
    if (models and (a.has_test(args) or (test_dataset and args.remote))
            and not args.evaluate):
        models_per_label = 1
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            # NOTE(review): the expanded file name and header flag are bound
            # to locals that nothing below reads, while `predict` only
            # receives `args`. Presumably these should update
            # `args.test_set` / `args.test_header` -- confirm against
            # `predict` before changing.
            test_set = ps.multi_label_expansion(
                args.test_set, args.test_header, args, path,
                labels=labels, session_file=session_file,
                input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method not in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume,
                    session_file=session_file, path=path, log=log)
                (test_source, resume, csv_properties,
                 test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields, dataset_fields=test_fields)
            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file;
                # floor division keeps the per-label count an integer
                models_per_label = len(models) // len(ensemble_ids)
            predict(models, fields, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if args.votes_files_:
        # the model id is embedded in the name of the first predictions file
        model_id = re.sub(r".*(model_[a-f0-9]{24})__predictions\.csv$",
                          r"\1", args.votes_files_[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(args.votes_files_, local_model.to_prediction,
                      output, method=args.method)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None,
                   test_field_attributes=None, test_types=None):
    """Create or retrieve sources, datasets and models and compute predictions.

    Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the
    `test_set`. When `votes_files` is given, the prediction files it lists
    are combined at the end.

    :param api: API connection object handed to the processing helpers
    :param args: parsed command-line options (`max_categories` and
                 `multi_label_fields` may be updated in place)
    :param training_set: training data, passed to the multi-label expansion
                         and to source/dataset processing
    :param test_set: test data used for predictions
    :param output: predictions file; its directory hosts the session log
    :param objective_field: objective field name or column number
    :param description: description used for created resources; compulsory
                        when publishing models or datasets
    :param field_attributes: forwarded to `ps.source_processing`
    :param types: forwarded to `ps.source_processing`
    :param dataset_fields: forwarded to dataset creation helpers
    :param model_fields: forwarded to `pm.models_processing`
    :param name: name used for created resources
    :param training_set_header: header flag for the training set
    :param test_set_header: header flag for the test set
    :param model_ids: ids of previously created models
    :param votes_files: prediction files to be combined
    :param resume: resume flag threaded through every processing helper
    :param fields_map: forwarded to `r.set_batch_prediction_args`
    :param test_field_attributes: forwarded to `ps.test_source_processing`
    :param test_types: forwarded to `ps.test_source_processing`

    Calls `sys.exit` on incoherent options (publishing without description,
    `--max-categories` without objective field or with an unsuitable one,
    `--new-fields` without dataset) and when the model needed to combine
    votes cannot be retrieved.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    local_ensemble = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and
            (args.black_box or args.white_box or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --max-categories, it is compulsory to specify also the
    # objective_field
    if args.max_categories > 0 and objective_field is None:
        sys.exit("When --max-categories is used, you must also provide the"
                 " --objective field name or column number")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}

    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (map(str.strip, args.labels.split(','))
              if args.labels is not None else None)
    if labels is not None:
        labels = sorted([label.decode("utf-8") for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and training_set is not None:
        (training_set, multi_label_data) = ps.multi_label_expansion(
            training_set, training_set_header, objective_field, args, path,
            labels=labels, session_file=session_file)
        training_set_header = True
        objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        # keeps `all_labels` bound on every path; it is tested further down
        all_labels = labels

    source, resume, csv_properties, fields = ps.source_processing(
        training_set, test_set, training_set_header, test_set_header,
        api, args, resume, name=name, description=description,
        csv_properties=csv_properties,
        field_attributes=field_attributes, types=types,
        multi_label_data=multi_label_data,
        session_file=session_file, path=path, log=log)

    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data, fields,
             multi_label_fields)

    datasets, resume, csv_properties, fields = pd.dataset_processing(
        source, training_set, test_set, fields, objective_field,
        api, args, resume, name=name, description=description,
        dataset_fields=dataset_fields, multi_label_data=multi_label_data,
        csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume, name=name, description=description,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        objective_id = fields.field_id(fields.objective_field)
        if pd.check_max_categories(fields.fields[objective_id]):
            distribution = pd.get_categories_distribution(dataset,
                                                          objective_id)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume,
                    session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure
    if args.new_fields:
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data, fields,
             multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)

    models, model_ids, ensemble_ids, resume = pm.models_processing(
        datasets, models, model_ids,
        objective_field, fields, api, args, resume,
        name=name, description=description, model_fields=model_fields,
        session_file=session_file, path=path, log=log, labels=labels,
        multi_label_data=multi_label_data, other_label=other_label)
    if models:
        model = models[0]
        single_model = len(models) == 1

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if isinstance(model, basestring):
            # choose how much of the model to download
            if not args.evaluate:
                query_string = MINIMUM_MODEL
            else:
                query_string = r.FIELDS_QS
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and test_set:
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
            local_ensemble = Ensemble(ensemble_id, api=api)
        fields, objective_field = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data,
            local_ensemble=local_ensemble)

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data, fields,
             multi_label_fields)

    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label', other_label)

    # If predicting
    if models and has_test(args) and not args.evaluate:
        models_per_label = 1
        test_dataset = None

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                test_set, test_set_header, objective_field, args, path,
                labels=labels, session_file=session_file,
                input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method not in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % name
            if args.test_source is None:
                (test_source, resume, csv_properties,
                 test_fields) = ps.test_source_processing(
                     test_set, test_set_header,
                     api, args, resume, name=test_name,
                     description=description,
                     field_attributes=test_field_attributes,
                     types=test_types,
                     session_file=session_file, path=path, log=log)
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id,
                                                 api.get_source)
            if args.test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(test_name,
                                                        description, args)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(args.test_dataset)
                test_dataset = api.check_resource(test_dataset_id,
                                                  api.get_dataset)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                name, description, args, fields=fields,
                dataset_fields=test_fields, fields_map=fields_map)
            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file;
                # floor division keeps the per-label count an integer
                models_per_label = len(models) // len(ensemble_ids)
            predict(test_set, test_set_header, models, fields, output,
                    objective_field, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # the model id is embedded in the name of the first predictions file
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(votes_files, local_model.to_prediction,
                      output, args.method)
def compute_output(api, args):
    """Run the logistic regression workflow described by `args`.

    Creates a logistic regression from the training data, or retrieves a
    previously created one (by id or from a local JSON file), and uses it
    to make predictions for the test set and/or to build evaluations.
    Finally the generated files are listed and reports are handled.

    :param api: API connection object handed to the processing helpers
    :param args: parsed command-line options; some attributes are
                 overwritten here (`max_parallel_logistic_regressions`,
                 `public_logistic_regression`, `objective_id_`,
                 `test_dataset_ids`)
    """
    # no multi-label support at present
    logistic_regression = None
    logistic_regressions = None

    # values coming from the command-line options
    resume = args.resume_
    logistic_regression_ids = args.logistic_regression_ids_
    output = args.predictions
    # a single logistic regression is generated at present
    args.max_parallel_logistic_regressions = 1
    # logistic regressions cannot be published yet
    args.public_logistic_regression = False

    # Publishing datasets or logistic regressions requires a description
    if not args.description_:
        if args.public_logistic_regression or args.public_dataset:
            sys.exit("You should provide a description to publish.")

    # --new-fields needs a dataset id to build the new dataset from
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties['objective_field'] = args.objective_field

    # Logging file, when required
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # pre-model step: create or retrieve the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # pre-model step: create or retrieve the dataset related info
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = pms.get_dataset_info(
         api, args, resume, source,
         csv_properties, fields, session_file, path, log)

    if datasets:
        # With a dataset at hand, sync the user-given objective field
        # with the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    if args.logistic_file:
        # the logistic regression comes from a local JSON file
        logistic_regression, csv_properties, fields = u.read_local_resource(
            args.logistic_file, csv_properties=csv_properties)
        logistic_regressions = [logistic_regression]
        logistic_regression_ids = [logistic_regression['resource']]
    else:
        # the logistic regression comes from the remote object
        (logistic_regressions, logistic_regression_ids,
         resume) = plr.logistic_regressions_processing(
             datasets, logistic_regressions, logistic_regression_ids,
             api, args, resume, fields=fields,
             session_file=session_file, path=path, log=log)

    if logistic_regressions:
        logistic_regression = logistic_regressions[0]

    # Update the public/shared state of the logistic regression if needed
    if logistic_regression:
        if isinstance(logistic_regression, basestring):
            # pick how much of the resource must be downloaded
            fetch_qs = (MINIMUM_MODEL if not a.has_test(args)
                        else r.ALL_FIELDS_QS if args.export_fields
                        else '')
            logistic_regression = u.check_resource(
                logistic_regression, api.get_logistic_regression,
                query_string=fetch_qs)
        logistic_regressions[0] = logistic_regression
        if (args.public_logistic_regression or
                (args.shared_flag and
                 r.shared_changed(args.shared, logistic_regression))):
            changes = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     logistic_regression):
                changes["shared"] = args.shared
            if args.public_logistic_regression:
                changes.update(r.set_publish_logistic_regression_args(args))
            if changes:
                logistic_regression = r.update_logistic_regression(
                    logistic_regression, changes, args,
                    api=api, path=path, session_file=session_file)
                logistic_regressions[0] = logistic_regression

    # Retrieve the fields of the logistic regression when they are needed
    if logistic_regression and (args.test_set or args.export_fields):
        fields = plr.get_logistic_fields(
            logistic_regression, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # Prediction step
    if logistic_regressions and (a.has_test(args) or
                                 (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if not (args.remote and not args.no_batch):
            # local predictions
            lr_prediction(logistic_regressions, fields, args,
                          session_file=session_file)
        else:
            # Remote predictions: computed as batch predictions in
            # bigml.com unless the --no-batch flag is set
            test_name = "%s - test" % args.name
            if args.test_source is not None:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            else:
                # create test source from file
                (test_source, resume, csv_properties,
                 test_fields) = ps.test_source_processing(
                     api, args, resume, name=test_name,
                     session_file=session_file, path=path, log=log)
            if test_dataset is not None:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            else:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields, dataset_fields=test_fields)
            remote_lr_prediction(
                logistic_regression, test_dataset,
                batch_prediction_args, args, api, resume,
                prediction_file=output,
                session_file=session_file, path=path, log=log)

    # Evaluation step: create the remote evaluation and save the results
    # in json and human-readable format
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(
                logistic_regressions, args.test_dataset_ids, api,
                args, resume,
                fields=fields, dataset_fields=test_fields,
                session_file=session_file, path=path, log=log,
                objective_field=args.objective_field)
        else:
            eval_dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                eval_dataset = test_dataset
            eval_dataset = u.check_resource(eval_dataset, api=api,
                                            query_string=r.ALL_FIELDS_QS)
            eval_fields = pd.get_fields_structure(eval_dataset, None)
            resume = evaluate(
                logistic_regressions, [eval_dataset], api,
                args, resume,
                fields=fields, dataset_fields=eval_fields,
                session_file=session_file, path=path, log=log,
                objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates a cluster using the `training_set` or uses the ids
        of previously created BigML clusters to compute the centroids
        for the `test_set`. Optionally generates the datasets associated
        to the centroids named in `--cluster-datasets`.

        :param api: BigML API connection used to retrieve and create
                    resources
        :param args: parsed command-line options. Some of its attributes
                     (`max_parallel_clusters`, `public_cluster`) are
                     overwritten here.

        Exits through `sys.exit` when the options are inconsistent.

    """

    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_cluster or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file,
            csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            # only the id is available: the resource is downloaded, with
            # the reduced query string when neither centroid datasets nor
            # test data are going to be used
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and args.test_set:
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        centroids_info = cluster['object']['clusters']['clusters']
        # maps every centroid name to its id
        centroids = {info['name']: info['id'] for info in centroids_info}
        # kept in its own name so that the `datasets` list is not clobbered
        cluster_datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            # Only the centroids that have no dataset yet are selected.
            # A centroid that is missing from the map is handled like one
            # whose dataset is still empty instead of raising a KeyError.
            centroid_ids = [
                centroids[cluster_name]
                for cluster_name in args.cluster_datasets_
                if cluster_datasets.get(centroids[cluster_name], '') == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api,
                             path=path, session_file=session_file,
                             log=log, dataset_type='cluster')

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def _centroid_ids_for(cluster, args, option):
    """ Selects the centroids that a per-centroid resource is created for.

        `option` is the name of the command-line attribute
        (`cluster_datasets` or `cluster_models`); the cluster object is read
        under that same key. An empty string in the option selects every
        centroid. Otherwise only the centroid names listed in the
        `<option>_` attribute whose entry in the cluster is missing or
        empty are selected.

    """
    cluster_info = cluster['object']
    ids_by_name = {}
    for info in cluster_info['clusters']['clusters']:
        ids_by_name[info['name']] = info['id']
    generated = cluster_info[option]
    if getattr(args, option) == '':
        return ids_by_name.values()
    selected = []
    for name in getattr(args, "%s_" % option):
        if generated.get(ids_by_name[name], '') == '':
            selected.append(ids_by_name[name])
    return selected


def compute_output(api, args):
    """ Builds a cluster from the training data, or retrieves an existing
        one, and uses it to compute the centroids for the test data, to
        create the datasets and models associated to its centroids and to
        export the fields summary, depending on the command-line options.

    """

    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # a single cluster is generated at present
    args.max_parallel_clusters = 1
    # publishing clusters is not available yet
    args.public_cluster = False

    # Publishing datasets or clusters requires a description
    if not args.description_:
        if args.public_cluster or args.public_dataset:
            sys.exit("You should provide a description to publish.")

    # --new-fields needs the dataset id the new dataset is generated from
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # The log file, when requested, is prepared and cleared
    log = args.log_file or None
    if log:
        u.check_dir(log)
        clear_log_files([log])

    # pre-model steps: source and dataset related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = pms.get_dataset_info(
         api, args, resume, source,
         csv_properties, fields, session_file, path, log)

    if not args.cluster_file:
        # the cluster is created or retrieved remotely
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]
    else:
        # the cluster is read from the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file, csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]

    # Downloading the cluster and updating its sharing state if needed
    if cluster:
        if isinstance(cluster, basestring):
            minimal = args.cluster_datasets is None and not a.has_test(args)
            cluster = u.check_resource(
                cluster, api.get_cluster,
                query_string=MINIMUM_MODEL if minimal else '')
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and r.shared_changed(args.shared, cluster))):
            changes = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                changes.update(shared=args.shared)
            if args.public_cluster:
                changes.update(r.set_publish_cluster_args(args))
            if changes:
                cluster = r.update_cluster(cluster, changes, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # The complete fields structure is needed for tests and exports
    if cluster and (args.test_set or args.export_fields):
        cluster_id = cluster['resource'] if isinstance(cluster, dict) \
            else cluster
        cluster = u.check_resource(cluster_id, api.get_cluster,
                                   query_string=r.ALL_FIELDS_QS)
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # Computing centroids
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if not args.remote or args.no_batch:
            # local centroids
            centroid(clusters, fields, args, session_file=session_file)
        else:
            # Remote centroids: a batch centroid is computed in bigml.com
            test_name = "%s - test" % args.name
            if args.test_source is not None:
                test_source = api.check_resource(
                    bigml.api.get_source_id(args.test_source))
            else:
                # the test source is created from the test file
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     api, args, resume, name=test_name,
                     session_file=session_file, path=path, log=log)
            if test_dataset is not None:
                test_dataset = api.check_resource(
                    bigml.api.get_dataset_id(test_dataset))
            else:
                # the test dataset is created from the test source
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test",
                    r.set_basic_dataset_args(args, name=test_name),
                    api, args, resume,
                    session_file=session_file, path=path, log=log)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields, dataset_fields=test_fields)
            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

    # Datasets associated to the centroids
    if cluster and args.cluster_datasets is not None:
        cluster = api.check_resource(cluster)
        for centroid_id in _centroid_ids_for(cluster, args,
                                             'cluster_datasets'):
            r.create_dataset(cluster, {'centroid': centroid_id}, args,
                             api=api, path=path, session_file=session_file,
                             log=log, dataset_type='cluster')

    # Models associated to the centroids
    if cluster and args.cluster_models is not None:
        cluster = api.check_resource(cluster)
        for centroid_id in _centroid_ids_for(cluster, args,
                                             'cluster_models'):
            r.create_model(cluster, {'centroid': centroid_id}, args,
                           api=api, path=path, session_file=session_file,
                           log=log, model_type='cluster')

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None,
                   description=None,
                   field_attributes=None,
                   types=None,
                   dataset_fields=None,
                   model_fields=None,
                   name=None, training_set_header=True,
                   test_set_header=True, model_ids=None,
                   votes_files=None, resume=False, fields_map=None,
                   test_field_attributes=None,
                   test_types=None):
    """ Creates one or more models using the `training_set` or uses the ids
        of previously created BigML models to make predictions for the
        `test_set`.

        :param api: BigML API connection used to retrieve and create
                    resources
        :param args: parsed command-line options (`max_categories` may be
                     overwritten with the value stored in the resources)
        :param training_set: training data, handed to the source processing
        :param test_set: test data used for predictions
        :param output: predictions file; its directory holds the session
                       files
        :param objective_field: objective field name or column number
        :param votes_files: predictions files to be combined, if any
        :param resume: whether a previous execution is being resumed

        The remaining arguments are forwarded untouched to the source,
        dataset, model and prediction steps. Exits through `sys.exit` when
        the options are inconsistent or the votes model cannot be retrieved.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    local_ensemble = None
    # Only the multi-label branches assign all_labels, but it is tested
    # before the user_metadata sync: without a default, multi-label
    # predictions from existing models failed with an unbound variable.
    all_labels = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and
            (args.black_box or args.white_box or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --max-categories, it is compulsory to specify also the
    # objective_field
    if args.max_categories > 0 and objective_field is None:
        sys.exit("When --max-categories is used, you must also provide the"
                 " --objective field name or column number")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                open(log, 'w', 0).close()
            except IOError:
                # clearing the log is best-effort
                pass

    # labels to be used in multi-label expansion
    labels = (map(str.strip, args.labels.split(','))
              if args.labels is not None else None)
    if labels is not None:
        labels = sorted([label.decode("utf-8") for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and training_set is not None:
        (training_set, multi_label_data) = ps.multi_label_expansion(
            training_set, training_set_header, objective_field, args, path,
            labels=labels, session_file=session_file)
        training_set_header = True
        objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels

    source, resume, csv_properties, fields = ps.source_processing(
        training_set, test_set, training_set_header, test_set_header,
        api, args, resume, name=name, description=description,
        csv_properties=csv_properties,
        field_attributes=field_attributes, types=types,
        multi_label_data=multi_label_data,
        session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field, labels,
                                                  multi_label_data, fields,
                                                  multi_label_fields)

    datasets, resume, csv_properties, fields = pd.dataset_processing(
        source, training_set, test_set, fields, objective_field,
        api, args, resume, name=name, description=description,
        dataset_fields=dataset_fields, multi_label_data=multi_label_data,
        csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume, name=name, description=description,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        objective_id = fields.field_id(fields.objective_field)
        if pd.check_max_categories(fields.fields[objective_id]):
            distribution = pd.get_categories_distribution(dataset,
                                                          objective_id)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path,
                    log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure
    if args.new_fields:
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field, labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label',
                                   other_label)

    models, model_ids, ensemble_ids, resume = pm.models_processing(
        datasets, models, model_ids,
        objective_field, fields, api, args, resume,
        name=name, description=description, model_fields=model_fields,
        session_file=session_file, path=path, log=log, labels=labels,
        multi_label_data=multi_label_data, other_label=other_label)
    if models:
        model = models[0]
        single_model = len(models) == 1

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if isinstance(model, basestring):
            if not args.evaluate:
                query_string = MINIMUM_MODEL
            else:
                query_string = r.FIELDS_QS
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
        if (args.black_box or args.white_box or
                r.shared_changed(args.shared, model)):
            model_args = {}
            if r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and test_set:
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
            local_ensemble = Ensemble(ensemble_id, api=api)
        fields, objective_field = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data, local_ensemble=local_ensemble)

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field, labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label',
                                   other_label)

    # If predicting
    if models and has_test(args) and not args.evaluate:
        models_per_label = 1
        test_dataset = None
        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                test_set, test_set_header, objective_field, args, path,
                labels=labels, session_file=session_file,
                input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method not in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % name
            if args.test_source is None:
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     test_set, test_set_header,
                     api, args, resume, name=test_name,
                     description=description,
                     field_attributes=test_field_attributes,
                     types=test_types,
                     session_file=session_file, path=path, log=log)
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id,
                                                 api.get_source)
            if args.test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(test_name,
                                                        description, args)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(args.test_dataset)
                test_dataset = api.check_resource(test_dataset_id,
                                                  api.get_dataset)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                name, description, args, fields=fields,
                dataset_fields=test_fields, fields_map=fields_map)

            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file;
                # explicit integer division keeps the count an int
                models_per_label = len(models) // len(ensemble_ids)
            predict(test_set, test_set_header, models, fields, output,
                    objective_field, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # the model id is rebuilt from the name of the first votes file
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(votes_files, local_model.to_prediction,
                      output, args.method)
and not args.method in [THRESHOLD_CODE, COMBINATION]): # create test source from file test_name = "%s - test" % args.name if args.test_source is None: (test_source, resume, csv_properties, test_fields) = ps.test_source_processing( api, args, resume, session_file=session_file, path=path, log=log) else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) csv_properties.update(objective_field=None, objective_field_present=False) test_fields = pd.get_fields_structure(test_dataset, csv_properties) batch_prediction_args = r.set_batch_prediction_args( args, fields=fields, dataset_fields=test_fields)
def compute_output(api, args):
    """ Creates a fusion using the `models` list or uses the ids
        of a previously created BigML fusion to make predictions for the
        `test_set`, and evaluates the fusion when `--evaluate` is set.

        :param api: BigML API connection used to retrieve and create
                    resources
        :param args: parsed command-line options. Some of its attributes
                     (`max_parallel_fusions`, `public_fusion` and, when
                     evaluating, the evaluation settings) are overwritten
                     here.

        Exits through `sys.exit` when the options are inconsistent.

    """

    fusion = None
    # Both are assigned only in some of the branches below but read later
    # on, so they need a default to avoid unbound local variables.
    fields = None
    test_dataset = None

    # variables from command-line options
    resume = args.resume_
    fusion_ids = args.fusion_ids_
    output = args.predictions
    # there's only one fusion to be generated at present
    args.max_parallel_fusions = 1
    # fusion cannot be published yet.
    args.public_fusion = False

    # It is compulsory to have a description to publish either datasets or
    # fusions
    if (not args.description_ and args.public_fusion):
        sys.exit("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    if args.fusion_file:
        # fusion is retrieved from the contents of the given local
        # JSON file
        fusion, csv_properties, fields = u.read_local_resource(
            args.fusion_file,
            csv_properties=csv_properties)
        fusion_ids = [fusion]
    else:
        # fusion is retrieved from the remote object or created
        fusion, resume = pf.fusion_processing(
            fusion, fusion_ids,
            api, args, resume,
            session_file=session_file, path=path, log=log)

    # We update the fusion public state if needed
    if fusion:
        if isinstance(fusion, basestring):
            # only the id is available: the query string decides how much
            # of the resource is downloaded
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            fusion = u.check_resource(fusion, api.get_fusion,
                                      query_string=query_string)
        if (args.public_fusion or
                (args.shared_flag and r.shared_changed(args.shared, fusion))):
            fusion_args = {}
            if args.shared_flag and r.shared_changed(args.shared, fusion):
                fusion_args.update(shared=args.shared)
            if args.public_fusion:
                fusion_args.update(r.set_publish_fusion_args(args))
            if fusion_args:
                fusion = r.update_fusion(
                    fusion, fusion_args, args,
                    api=api, path=path,
                    session_file=session_file)

    # We get the fields of the fusion if we haven't got
    # them yet and need them
    if fusion and (args.test_set or args.evaluate):
        fields = pf.get_fusion_fields(fusion, csv_properties, args)

    # If predicting
    if fusion and (a.has_test(args) or args.remote):
        test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            # when evaluating, the test dataset built here is used by the
            # evaluation step below and no batch prediction is created
            if not args.evaluate:
                batch_prediction_args = r.set_batch_prediction_args(
                    args, fields=fields,
                    dataset_fields=test_fields)

                remote_prediction(fusion, test_dataset,
                                  batch_prediction_args, args,
                                  api, resume, prediction_file=output,
                                  session_file=session_file, path=path,
                                  log=log)

        else:
            prediction([fusion], fields, args,
                       session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        args.max_parallel_evaluations = 1  # only one evaluation at present
        args.cross_validation_rate = 0  # no cross-validation
        args.number_of_evaluations = 1  # only one evaluation
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        dataset = test_dataset
        dataset = u.check_resource(dataset, api=api,
                                   query_string=r.ALL_FIELDS_QS)
        dataset_fields = pd.get_fields_structure(dataset, None)
        resume = evaluate([fusion], [dataset], api,
                          args, resume,
                          fields=fields, dataset_fields=dataset_fields,
                          session_file=session_file, path=path,
                          log=log,
                          objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """Run the anomaly detector workflow described by `args`.

    The steps are executed in order, threading the `resume` flag through
    each of them:

    - source and dataset information is created or retrieved,
    - the anomaly detector is read from `args.anomaly_file` or
      created/retrieved remotely,
    - its shared state is updated when required,
    - optionally a dataset including/excluding the top anomalies is built,
    - anomaly scores are computed (remotely as a batch anomaly score or
      locally) when test data is available,
    - the fields summary is exported and the generated files are reported.

    :param api: BigML API connection used for every remote call
    :param args: parsed command-line options (modified in place)
    """
    anomaly = None
    anomalies = None
    # no multi-label support at present

    # values taken from the command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # a single anomaly detector is generated at present
    args.max_parallel_anomalies = 1
    # anomaly detectors cannot be published yet
    args.public_anomaly = False

    # Publishing datasets or anomaly detectors requires a description
    if not args.description_:
        if args.public_anomaly or args.public_dataset:
            sys.exit("You should provide a description to publish.")

    # --new-fields only makes sense together with a dataset id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}

    # Set up the log file when logging is required
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)

    # basic pre-model step: creating or retrieving the dataset related info
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = pms.get_dataset_info(
         api, args, resume, source,
         csv_properties, fields, session_file, path, log)

    if args.anomaly_file:
        # the anomaly detector comes from a local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file, csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # the anomaly detector comes from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # Refresh the anomaly detector and update its shared/public state
    if anomaly:
        # retrieve only as much of the resource as the next steps need
        if a.has_test(args):
            query_string = r.ALL_FIELDS_QS
        elif args.anomalies_dataset:
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = MINIMUM_MODEL
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            # not a dict-like resource: it is used as the id itself
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id, query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly

        changes = {}
        if args.shared_flag and r.shared_changed(args.shared, anomaly):
            changes.update(shared=args.shared)
        if args.public_anomaly:
            changes.update(r.set_publish_anomaly_args(args))
        if changes:
            anomaly = r.update_anomaly(anomaly, changes, args,
                                       api=api, path=path,
                                       session_file=session_file)
            anomalies[0] = anomaly

    # Get the anomaly detector fields when testing or exporting needs them
    if anomaly and (args.test_set or args.export_fields):
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # Build the dataset that includes/excludes the top anomalies
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     " dataset.")
        include = args.anomalies_dataset == ANOMALIES_IN
        args.anomaly_filter_ = Anomaly(anomaly).anomalies_filter(
            include=include)
        _, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        # --score: the training dataset itself is the one to be scored
        args.test_dataset = anomaly['object']['dataset']

    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if not args.remote or args.no_batch:
            # local anomaly scores
            anomaly_score(anomalies, fields, args, session_file=session_file)
        else:
            # Remote anomaly scores: computed as batch anomaly scores
            # in bigml.com except when --no-batch flag is set on
            test_name = "%s - test" % args.name
            if args.test_source is not None:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            else:
                # create test source from file
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     api, args, resume, name=test_name,
                     session_file=session_file, path=path, log=log)

            if test_dataset is not None:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            else:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields, dataset_fields=test_fields)
            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """Run the logistic regression workflow described by `args`.

    Creates or retrieves the source, dataset and logistic regression,
    updates the logistic regression shared state when needed, computes
    predictions for the test data (remotely as a batch prediction or
    locally) and, when `args.evaluate` is set, evaluates the logistic
    regressions against the test datasets.

    :param api: BigML API connection used for every remote call
    :param args: parsed command-line options (modified in place)
    """

    logistic_regression = None
    logistic_regressions = None
    # no multi-label support at present: there are no label lists to pass
    # on to the evaluation step. They are bound here because the evaluate
    # call below references them and would otherwise raise a NameError.
    labels = None
    all_labels = None

    # variables from command-line options
    resume = args.resume_
    logistic_regression_ids = args.logistic_regression_ids_
    output = args.predictions
    # fields structure for the test datasets, if the options provide one
    # NOTE(review): assumes `dataset_fields_` is the attribute holding it,
    # as in the options used by the other subcommands -- confirm
    dataset_fields = getattr(args, "dataset_fields_", None)
    # there's only one logistic regression to be generated at present
    args.max_parallel_logistic_regressions = 1
    # logistic regressions cannot be published yet.
    args.public_logistic_regression = False

    # It is compulsory to have a description to publish either datasets or
    # logistic regressions
    if (not args.description_ and
            (args.public_logistic_regression or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)
    if args.logistic_file:
        # logistic regression is retrieved from the contents of the given
        # local JSON file
        logistic_regression, csv_properties, fields = \
            u.read_local_resource(
                args.logistic_file,
                csv_properties=csv_properties)
        logistic_regressions = [logistic_regression]
        logistic_regression_ids = [logistic_regression['resource']]
    else:
        # logistic regression is retrieved from the remote object
        logistic_regressions, logistic_regression_ids, resume = \
            plr.logistic_regressions_processing(
                datasets, logistic_regressions, logistic_regression_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
    if logistic_regressions:
        logistic_regression = logistic_regressions[0]

    # We update the logistic regression's public state if needed
    if logistic_regression:
        if isinstance(logistic_regression, basestring):
            # only an id is available: retrieve as much of the resource
            # as the following steps need
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            logistic_regression = u.check_resource(
                logistic_regression, api.get_logistic_regression,
                query_string=query_string)
        logistic_regressions[0] = logistic_regression
        if (args.public_logistic_regression or
                (args.shared_flag and
                 r.shared_changed(args.shared, logistic_regression))):
            logistic_regression_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     logistic_regression):
                logistic_regression_args.update(shared=args.shared)
            if args.public_logistic_regression:
                logistic_regression_args.update(
                    r.set_publish_logistic_regression_args(args))
            if logistic_regression_args:
                logistic_regression = r.update_logistic_regression(
                    logistic_regression, logistic_regression_args, args,
                    api=api, path=path,
                    session_file=session_file)
                logistic_regressions[0] = logistic_regression

    # We get the fields of the logistic_regression if we haven't got
    # them yet and need them
    if logistic_regression and (args.test_set or args.export_fields):
        fields = plr.get_logistic_fields(
            logistic_regression, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if logistic_regressions and (a.has_test(args) or
                                 (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            # the fields structure of the test dataset is built with the
            # objective field settings cleared
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_lr_prediction(logistic_regression, test_dataset,
                                 batch_prediction_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            lr_prediction(logistic_regressions, fields, args,
                          session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            resume = evaluate(logistic_regressions, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log, labels=labels, all_labels=all_labels,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """Run the PCA workflow described by `args`.

    Creates or retrieves the source, dataset and PCA, updates the PCA
    shared state when needed and computes projections for the test data,
    either remotely as a batch projection or locally.

    :param api: BigML API connection used for every remote call
    :param args: parsed command-line options (modified in place)
    """

    pca = None

    # variables from command-line options
    resume = args.resume_
    pca_ids = args.pca_ids_
    output = args.projections
    # there's only one pca to be generated at present
    args.max_parallel_pcas = 1
    # pca cannot be published yet.
    args.public_pca = False

    # It is compulsory to have a description to publish either datasets or
    # pcas
    if (not args.description_ and (args.public_pca or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.pca_file:
        # pca is retrieved from the contents of the given local
        # JSON file
        pca, csv_properties, fields = u.read_local_resource(
            args.pca_file,
            csv_properties=csv_properties)
        # keep the list of pcas in sync with the one just loaded (the
        # original assigned to a misspelled `pac_ids` name, leaving
        # `pca_ids` stale)
        pca_ids = [pca]
    else:
        # pca is retrieved from the remote object or created
        pca, resume = pc.pca_processing(
            datasets, pca, pca_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # We update the pca public state if needed
    if pca:
        if isinstance(pca, basestring):
            # only an id is available: retrieve as much of the resource
            # as the following steps need
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            pca = u.check_resource(pca, api.get_pca,
                                   query_string=query_string)
        if (args.public_pca or
                (args.shared_flag and r.shared_changed(args.shared, pca))):
            pca_args = {}
            if args.shared_flag and r.shared_changed(args.shared, pca):
                pca_args.update(shared=args.shared)
            if args.public_pca:
                pca_args.update(r.set_publish_pca_args(args))
            if pca_args:
                pca = r.update_pca(
                    pca, pca_args, args,
                    api=api, path=path,
                    session_file=session_file)

    # We get the fields of the pca if we haven't got
    # them yet and need them
    if pca and (args.test_set or args.export_fields):
        fields = pc.get_pca_fields(pca, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if pca and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote projections: projections are computed as batch projections
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            # the fields structure of the test dataset is built with the
            # objective field settings cleared
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_projection_args = r.set_batch_projection_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_projection(pca, test_dataset,
                              batch_projection_args, args,
                              api, resume, projection_file=output,
                              session_file=session_file, path=path, log=log)
        else:
            projection(pca, fields, args, session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """Run the model/ensemble workflow described by `args`.

    Creates or retrieves the source, datasets and models (or ensembles),
    handling multi-label expansion, test splits, --max-categories datasets
    and derived datasets on the way. It then computes predictions for the
    test data (remotely as a batch prediction or locally) and, when
    `args.votes_files_` is set, combines previously stored predictions
    files.

    :param api: BigML API connection used for every remote call
    :param args: parsed command-line options (modified in place)
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    test_dataset = None
    datasets = None

    # variables from command-line options
    resume = args.resume_
    model_ids = args.model_ids_
    output = args.predictions
    dataset_fields = args.dataset_fields_

    check_args_coherence(args)
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else
              [label.strip() for label in
               args.labels.split(args.args_separator)])
    if labels is not None:
        labels = sorted([label for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set, args.train_header, args, path,
            labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        # no expansion: the user-given labels (if any) are all we know
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file,
            csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume,
            csv_properties=csv_properties, multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file,
            csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(dataset,
                                                          args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path,
                    log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    # (`and` binds tighter than `or`: the parentheses only make the
    # original grouping explicit)
    if (args.new_fields or
            (args.sample_rate != 1 and args.no_model) or
            ((args.lisp_filter or args.json_filter) and
             not has_source(args))):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
            args.objective_id_ = get_objective_id(args, fields)
            args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({'objective_field': args.objective_name_,
                               'objective_field_present': True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label',
                                   other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.model_file:
        # model is retrieved from the contents of the given local JSON file
        model, csv_properties, fields = u.read_local_resource(
            args.model_file,
            csv_properties=csv_properties)
        models = [model]
        model_ids = [model['resource']]
        ensemble_ids = []
    elif args.ensemble_file:
        # ensemble is retrieved from the contents of the given local JSON
        # file; its first model is retrieved with all of its fields
        ensemble, csv_properties, fields = u.read_local_resource(
            args.ensemble_file,
            csv_properties=csv_properties)
        model_ids = ensemble['object']['models'][:]
        ensemble_ids = [ensemble['resource']]
        models = model_ids[:]
        model = retrieve_resource(bigml.api.BigML(storage='./storage'),
                                  models[0],
                                  query_string=r.ALL_FIELDS_QS)
        models[0] = model
    else:
        # model is retrieved from the remote object
        models, model_ids, ensemble_ids, resume = pm.models_processing(
            datasets, models, model_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log, labels=labels,
            multi_label_data=multi_label_data, other_label=other_label)

    if models:
        model = models[0]
        single_model = len(models) == 1
    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
            # only an id or an unfinished model is available: retrieve as
            # much of the resource as the following steps need
            if (not args.evaluate and not a.has_train(args) and
                    not a.has_test(args)):
                query_string = MINIMUM_MODEL
            elif not args.test_header:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS)
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
            models[0] = model
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and (a.has_test(args) or
                                        args.export_fields):
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
                args.ensemble_ids_ = ensemble_ids
            else:
                ensemble_id = get_ensemble_id(model)
        fields = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data)
        # Free memory after getting fields
        gc.collect()

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label',
                                   other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    # If predicting
    if (models and (a.has_test(args) or (test_dataset and args.remote))
            and not args.evaluate):
        models_per_label = 1
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            # NOTE(review): the expanded test set and header flag are not
            # read again in this function -- confirm whether the
            # prediction step should receive them
            test_set = ps.multi_label_expansion(
                args.test_set, args.test_header, args, path,
                labels=labels, session_file=session_file, input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method not in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, session_file=session_file,
                    path=path, log=log)
                (test_source, resume, csv_properties,
                 test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            # the fields structure of the test dataset is built with the
            # objective field settings cleared
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)

            if args.to_dataset and args.dataset_off:
                # rename the prediction column when the test dataset
                # already has a field named like the objective field
                model = api.check_resource(model['resource'],
                                           query_string=r.ALL_FIELDS_QS)
                model_fields = Fields(model)
                objective_field_name = model_fields.field_name(
                    model_fields.objective_field)
                if objective_field_name in test_fields.fields_by_name.keys():
                    args.prediction_name = "%s (predicted)" % \
                        objective_field_name
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                # (floor division keeps the count an integer whatever the
                # division semantics in force)
                models_per_label = len(models) // len(ensemble_ids)
            predict(models, fields, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label, other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if args.votes_files_:
        # the model id is embedded in the name of the first predictions file
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', args.votes_files_[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)

        combine_votes(args.votes_files_, local_model.to_prediction,
                      output, method=args.method)
def compute_output(api, args):
    """Drive the anomaly detector workflow selected by the command-line args.

    Retrieves or creates the source, the dataset and the anomaly detector
    (read from a local JSON file when `args.anomaly_file` is set, obtained
    through `pa.anomalies_processing` otherwise), optionally builds the
    dataset of top anomalies and finally computes anomaly scores, either
    as a remote batch anomaly score or locally.

    `args` is modified in place: `max_parallel_anomalies` and
    `public_anomaly` are forced, and `_anomaly_filter` / `test_dataset`
    may be set along the way. `sys.exit` is called when the options are
    inconsistent or when the detector's origin dataset is missing.

    """
    anomaly = None
    anomalies = None

    # Values taken from the command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # A single detector is generated per run and it is never published
    args.max_parallel_anomalies = 1
    args.public_anomaly = False

    # Publishing datasets or anomalies requires a description
    if not args.description_:
        if args.public_anomaly or args.public_dataset:
            sys.exit("You should provide a description to publish.")

    # --new-fields needs a dataset id to build the new dataset from
    if args.new_fields:
        if not args.dataset:
            sys.exit("To use --new-fields you must also provide a dataset id"
                     " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}

    # Logging file, if requested
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # Keyword arguments repeated in most of the processing calls below
    tracking = {"session_file": session_file, "path": path, "log": log}

    # Pre-model steps: source and dataset related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = pms.get_dataset_info(
         api, args, resume, source,
         csv_properties, fields, session_file, path, log)

    if args.anomaly_file:
        # The detector comes from a local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file, csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # The detector is retrieved from (or created in) the remote service
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, **tracking)
        if anomalies:
            anomaly = anomalies[0]

    if anomaly:
        # How much of the resource is downloaded depends on its later use
        if not a.has_test(args) and not args.anomalies_dataset:
            qs = MINIMUM_MODEL
        elif not a.has_test(args):
            qs = ";".join((EXCLUDE_TREES, r.ALL_FIELDS_QS))
        else:
            qs = r.ALL_FIELDS_QS
        # `anomaly` can be a resource structure or a bare id
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id, query_string=qs, api=api)
        anomalies[0] = anomaly

        # Update the public/shared state of the detector if needed
        if (args.public_anomaly or
                (args.shared_flag and
                 r.shared_changed(args.shared, anomaly))):
            changes = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                changes.update(shared=args.shared)
            if args.public_anomaly:
                changes.update(r.set_publish_anomaly_args(args))
            if changes:
                anomaly = r.update_anomaly(
                    anomaly, changes, args, api=api, path=path,
                    session_file=session_file)
                anomalies[0] = anomaly

    # Fields of the detector are needed when a test set is given
    if anomaly and args.test_set:
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # Dataset that includes/excludes the top anomalies
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     " dataset.")
        args._anomaly_filter = Anomaly(anomaly).anomalies_filter(
            include=(args.anomalies_dataset == ANOMALIES_IN))
        _, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields, **tracking)

    # With --score the detector's own dataset is used as test dataset
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']

    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # The test dataset can be defined by --test-split, --test-dataset
        # or --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if not args.remote or args.no_batch:
            # Local scoring
            anomaly_score(anomalies, fields, args, session_file=session_file)
        else:
            # Remote scoring: a batch anomaly score is created in bigml.com
            test_label = "%s - test" % args.name
            if args.test_source is not None:
                test_source = api.check_resource(
                    bigml.api.get_source_id(args.test_source))
            else:
                # The test source is created from the test file
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     api, args, resume, name=test_label, **tracking)

            if test_dataset is not None:
                test_dataset = api.check_resource(
                    bigml.api.get_dataset_id(test_dataset))
            else:
                # The test dataset is created from the test source
                dataset_args = r.set_basic_dataset_args(args, name=test_label)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, **tracking)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_args = r.set_batch_anomaly_score_args(
                args, fields=fields, dataset_fields=test_fields)
            remote_anomaly_score(
                anomaly, test_dataset, batch_args, args, api, resume,
                prediction_file=output, **tracking)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates a PCA using the `training_set` or uses the id of a
        previously created BigML PCA to compute projections for the
        `test_set`.

        `api` is the connection object used for the remote calls and
        `args` holds the parsed command-line options. `args` is modified
        in place (`max_parallel_pcas` and `public_pca` are forced).

        Projections are computed remotely as a batch projection when
        `args.remote` is set and `args.no_batch` is not, locally otherwise.
        The function calls `sys.exit` when the publishing or --new-fields
        options are inconsistent.

    """

    pca = None

    # variables from command-line options
    resume = args.resume_
    pca_ids = args.pca_ids_
    output = args.projections
    # there's only one pca to be generated at present
    args.max_parallel_pcas = 1
    # pca cannot be published yet.
    args.public_pca = False

    # It is compulsory to have a description to publish either datasets or
    # pcas
    if (not args.description_ and
            (args.public_pca or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.pca_file:
        # pca is retrieved from the contents of the given local JSON file
        pca, csv_properties, fields = u.read_local_resource(
            args.pca_file,
            csv_properties=csv_properties)
        # Fixed: this used to assign to a misspelled `pac_ids`, leaving
        # `pca_ids` untouched and creating a dead local.
        pca_ids = [pca]
    else:
        # pca is retrieved from the remote object or created
        pca, resume = pc.pca_processing(
            datasets, pca, pca_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # We update the pca public state if needed
    if pca:
        if isinstance(pca, basestring):
            # only an id is available: download the resource, asking for
            # no more information than the rest of the workflow needs
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            pca = u.check_resource(pca, api.get_pca,
                                   query_string=query_string)
        if (args.public_pca or
                (args.shared_flag and r.shared_changed(args.shared, pca))):
            pca_args = {}
            if args.shared_flag and r.shared_changed(args.shared, pca):
                pca_args.update(shared=args.shared)
            if args.public_pca:
                pca_args.update(r.set_publish_pca_args(args))
            if pca_args:
                pca = r.update_pca(
                    pca, pca_args, args,
                    api=api, path=path,
                    session_file=session_file)

    # We get the fields of the pca if we haven't got
    # them yet and need them
    if pca and (args.test_set or args.export_fields):
        fields = pc.get_pca_fields(pca, csv_properties, args)

    # The fields summary is stored in the output directory when requested
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if pca and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote projections: projections are computed as batch projections
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            # the objective field settings are cleared before reading the
            # test dataset fields structure
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_projection_args = r.set_batch_projection_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_projection(pca, test_dataset,
                              batch_projection_args, args,
                              api, resume, projection_file=output,
                              session_file=session_file, path=path, log=log)

        else:
            projection(pca, fields, args, session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates a topic model using the `training_set` or uses the id of a
        previously created BigML topic model to compute topic
        distributions for the `test_set`.

        `api` is the connection object used for the remote calls and
        `args` holds the parsed command-line options. `args` is modified
        in place (`max_parallel_topic_models` and `public_topic_model` are
        forced).

        Topic distributions are computed remotely as a batch topic
        distribution when `args.remote` is set and `args.no_batch` is not,
        locally otherwise. The function calls `sys.exit` when the
        publishing or --new-fields options are inconsistent.

    """

    topic_model = None
    topic_models = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    topic_model_ids = args.topic_model_ids_
    output = args.predictions
    # there's only one topic model resource to be generated at present
    args.max_parallel_topic_models = 1
    # topic models cannot be published yet.
    args.public_topic_model = False

    # It is compulsory to have a description to publish either datasets or
    # topic models
    if (not args.description_ and
            (args.public_topic_model or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.topic_model_file:
        # topic model is retrieved from the contents of the given local JSON
        # file
        topic_model, csv_properties, fields = u.read_local_resource(
            args.topic_model_file,
            csv_properties=csv_properties)
        topic_models = [topic_model]
        topic_model_ids = [topic_model['resource']]
    else:
        # topic model is retrieved from the remote object
        topic_models, topic_model_ids, resume = pt.topic_model_processing(
            datasets, topic_models, topic_model_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
        if topic_models:
            topic_model = topic_models[0]

    # We update the topic model's public state if needed
    if topic_model:
        if isinstance(topic_model, basestring):
            # only an id is available: download the resource, asking for
            # the minimum information when no test data will be used
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            # Fixed: the getter handed to check_resource was
            # `api.topic_model`; the retrieval method is
            # `api.get_topic_model`, in line with the `api.get_model` /
            # `api.get_pca` getters used for other resource types.
            topic_model = u.check_resource(topic_model,
                                           api.get_topic_model,
                                           query_string=query_string)
        topic_models[0] = topic_model
        if (args.public_topic_model or
                (args.shared_flag and
                 r.shared_changed(args.shared, topic_model))):
            topic_model_args = {}
            if args.shared_flag and \
                    r.shared_changed(args.shared, topic_model):
                topic_model_args.update(shared=args.shared)
            if args.public_topic_model:
                topic_model_args.update(r.set_publish_topic_model_args(args))
            if topic_model_args:
                topic_model = r.update_topic_model(
                    topic_model, topic_model_args, args,
                    api=api, path=path, session_file=session_file)
                topic_models[0] = topic_model

    # We get the fields of the topic model if we haven't got
    # them yet and need them
    if topic_model and args.test_set:
        # the objective field settings are cleared before reading the
        # topic model fields
        csv_properties.update({'objective_field_present': False,
                               'objective_field': None})
        fields = pt.get_topic_model_fields(topic_model, csv_properties, args)

    # If predicting
    if topic_models and (a.has_test(args) or
                         (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote topic distributions: topic distributions are computed as
        # batch topic distributions in bigml.com except when the
        # --no-batch flag is set.
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_topic_distribution_args = \
                r.set_batch_topic_distribution_args(
                    args, fields=fields,
                    dataset_fields=test_fields)

            remote_topic_distribution(
                topic_model, test_dataset,
                batch_topic_distribution_args, args,
                api, resume, prediction_file=output,
                session_file=session_file, path=path, log=log)

        else:
            topic_distribution(topic_models, fields, args,
                               session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)