def create_models(dataset, model_ids, model_args, args, api=None, path=None,
                  session_file=None, log=None):
    """Create remote models

    Creates ``args.number_of_models`` models from ``dataset``. New model
    ids are appended to ``model_ids`` (mutated in place) and logged both
    to ``log`` and to the resources file under ``path``.

    ``model_args`` may be a single dict of creation arguments or a list
    with one dict per model to create.
    """
    if api is None:
        api = bigml.api.BigML()
    # local working copy; ``model_ids`` itself is extended as models appear
    models = model_ids[:]
    existing_models = len(models)
    model_args_list = []
    if isinstance(model_args, list):
        model_args_list = model_args
    if args.number_of_models > 0:
        message = dated("Creating %s.\n" %
                        plural("model", args.number_of_models))
        log_message(message, log_file=session_file, console=args.verbosity)
        single_model = args.number_of_models == 1 and existing_models == 0
        # if there's more than one model the first one must contain
        # the entire field structure to be used as reference.
        query_string = (FIELDS_QS if single_model else ALL_FIELDS_QS)
        for i in range(0, args.number_of_models):
            # throttle: after each batch of max_parallel_models creations,
            # wait for the previous model to finish before continuing
            if i % args.max_parallel_models == 0 and i > 0:
                try:
                    models[i - 1] = check_resource(
                        models[i - 1], api.get_model,
                        query_string=query_string)
                except ValueError, exception:
                    sys.exit("Failed to get a finished model: %s" %
                             str(exception))
            if model_args_list:
                model_args = model_args_list[i]
            if args.cross_validation_rate > 0:
                # each cross-validation model gets its own sampling seed
                new_seed = get_basic_seed(i + existing_models)
                model_args.update(seed=new_seed)
            model = api.create_model(dataset, model_args)
            model_id = check_resource_error(model,
                                            "Failed to create model: ")
            log_message("%s\n" % model_id, log_file=log)
            model_ids.append(model_id)
            models.append(model)
            log_created_resources("models", path, model_id, open_mode='a')
        if args.number_of_models < 2 and args.verbosity:
            # single-model run: wait for it and report its URL
            if bigml.api.get_status(model)['code'] != bigml.api.FINISHED:
                try:
                    model = check_resource(model, api.get_model,
                                           query_string=query_string)
                except ValueError, exception:
                    sys.exit("Failed to get a finished model: %s" %
                             str(exception))
                models[0] = model
            message = dated("Model created: %s.\n" % get_url(model))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
def create_models(dataset, model_ids, model_args, args, api=None, path=None, session_file=None, log=None): """Create remote models """ if api is None: api = bigml.api.BigML() models = model_ids[:] existing_models = len(models) last_model = None if args.number_of_models > 0: message = dated("Creating %s.\n" % plural("model", args.number_of_models)) log_message(message, log_file=session_file, console=args.verbosity) for i in range(0, args.number_of_models): if i % args.max_parallel_models == 0 and i > 0: try: models[i - 1] = check_resource(models[i - 1], api.get_model, query_string=FIELDS_QS) except ValueError, exception: sys.exit("Failed to get a finished model: %s" % str(exception)) if args.cross_validation_rate > 0: new_seed = get_basic_seed(i + existing_models) model_args.update(seed=new_seed) model = api.create_model(dataset, model_args) log_message("%s\n" % model['resource'], log_file=log) model_ids.append(model['resource']) models.append(model) log_created_resources("models", path, bigml.api.get_model_id(model), open_mode='a') check_resource_error( model, "Failed to create model %s:" % model['resource']) if args.number_of_models < 2 and args.verbosity: if bigml.api.get_status(model)['code'] != bigml.api.FINISHED: try: model = check_resource(model, api.get_model, query_string=FIELDS_QS) except ValueError, exception: sys.exit("Failed to get a finished model: %s" % str(exception)) models[0] = model message = dated("Model created: %s.\n" % get_url(model)) log_message(message, log_file=session_file, console=args.verbosity)
def create_models(dataset, model_ids, model_args, args, api=None, path=None, session_file=None, log=None): """Create remote models """ if api is None: api = bigml.api.BigML() models = model_ids[:] existing_models = len(models) last_model = None if args.number_of_models > 0: message = dated("Creating %s.\n" % plural("model", args.number_of_models)) log_message(message, log_file=session_file, console=args.verbosity) for i in range(0, args.number_of_models): if i % args.max_parallel_models == 0 and i > 0: try: models[i - 1] = check_resource( models[i - 1], api.get_model, query_string=FIELDS_QS) except ValueError, exception: sys.exit("Failed to get a finished model: %s" % str(exception)) if args.cross_validation_rate > 0: new_seed = get_basic_seed(i + existing_models) model_args.update(seed=new_seed) model = api.create_model(dataset, model_args) log_message("%s\n" % model['resource'], log_file=log) model_ids.append(model['resource']) models.append(model) log_created_resources("models", path, bigml.api.get_model_id(model), open_mode='a') check_resource_error(model, "Failed to create model %s:" % model['resource']) if args.number_of_models < 2 and args.verbosity: if bigml.api.get_status(model)['code'] != bigml.api.FINISHED: try: model = check_resource(model, api.get_model, query_string=FIELDS_QS) except ValueError, exception: sys.exit("Failed to get a finished model: %s" % str(exception)) models[0] = model message = dated("Model created: %s.\n" % get_url(model)) log_message(message, log_file=session_file, console=args.verbosity)
def get_samples(sample_ids, args, api=None, session_file=None, query_string=''): """Retrieves remote samples in its actual status """ if api is None: api = bigml.api.BigML() sample_id = "" samples = sample_ids sample_id = sample_ids[0] message = dated("Retrieving %s. %s\n" % (plural("sample", len(sample_ids)), get_url(sample_id))) log_message(message, log_file=session_file, console=args.verbosity) # only one sample to predict at present try: sample = api.get_sample(sample_ids[0], query_string=query_string) check_resource_error( sample, "Failed to create sample: %s" % sample['resource']) sample = check_resource(sample, api=api, query_string=query_string, raise_on_error=True) except Exception, exception: sys.exit("Failed to get a finished sample: %s" % str(exception))
def update_sample(sample, sample_args, args,
                  api=None, path=None, session_file=None):
    """Update the properties of a remote sample and wait until ready.

    Returns the updated (and finished) sample resource.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Updating sample. %s\n" % get_url(sample)),
                log_file=session_file, console=args.verbosity)
    sample = api.update_sample(sample, sample_args)
    check_resource_error(sample,
                         "Failed to update sample: %s" % sample['resource'])
    # block until the update has been applied remotely
    sample = check_resource(sample, api.get_sample, raise_on_error=True)
    if is_shared(sample):
        shared_message = dated("Shared sample link. %s\n"
                               % get_url(sample, shared=True))
        log_message(shared_message, log_file=session_file,
                    console=args.verbosity)
        if args.reports:
            report(args.reports, path, sample)
    return sample
def get_models(model_ids, args, api=None, session_file=None):
    """Retrieves remote models in its actual status

    Fetches the models in ``model_ids`` (only when fewer than
    ``args.max_batch_models``), waiting for each to finish.
    """
    if api is None:
        api = bigml.api.BigML()
    model_id = ""
    models = model_ids
    single_model = len(model_ids) == 1
    if single_model:
        model_id = model_ids[0]
    message = dated("Retrieving %s. %s\n" %
                    (plural("model", len(model_ids)),
                     get_url(model_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    if len(model_ids) < args.max_batch_models:
        models = []
        for model in model_ids:
            try:
                # if there's more than one model the first one must contain
                # the entire field structure to be used as reference.
                # The full structure is also needed for every model in
                # multi-label runs or when the test set has no header.
                query_string = (
                    ALL_FIELDS_QS if (
                        (not single_model and
                         (not models or args.multi_label)) or
                        not args.test_header)
                    else FIELDS_QS)
                model = check_resource(model, api.get_model,
                                       query_string=query_string,
                                       raise_on_error=True)
            except Exception, exception:
                sys.exit("Failed to get a finished model: %s" %
                         str(exception))
            models.append(model)
        model = models[0]
def create_dataset(source_or_dataset, dataset_args, args, api=None,
                   path=None, session_file=None, log=None,
                   dataset_type=None):
    """Create a remote dataset from a source, a dataset or a dataset list.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Creating dataset.\n"),
                log_file=session_file, console=args.verbosity)
    dataset = api.create_dataset(source_or_dataset, dataset_args)
    # datasets of a given type get a suffix in the resources log
    suffix = "" if not dataset_type else "_" + dataset_type
    log_created_resources("dataset%s" % suffix, path,
                          bigml.api.get_dataset_id(dataset), open_mode='a')
    dataset_id = check_resource_error(dataset, "Failed to create dataset: ")
    try:
        dataset = check_resource(dataset, api.get_dataset,
                                 query_string=ALL_FIELDS_QS)
    except ValueError as exception:
        sys.exit("Failed to get a finished dataset: %s" % str(exception))
def update_time_series(time_series, time_series_args, args,
                       api=None, path=None, session_file=None):
    """Update the properties of a remote time-series and wait until ready.

    Returns the updated (and finished) time-series resource.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Updating time-series. %s\n" % get_url(time_series)),
                log_file=session_file, console=args.verbosity)
    time_series = api.update_time_series(time_series, time_series_args)
    check_resource_error(
        time_series,
        "Failed to update time-series: %s" % time_series['resource'])
    # wait for the update to be applied remotely
    time_series = check_resource(time_series, api.get_time_series,
                                 query_string=FIELDS_QS,
                                 raise_on_error=True)
    if is_shared(time_series):
        shared_message = dated("Shared time-series link. %s\n"
                               % get_url(time_series, shared=True))
        log_message(shared_message, log_file=session_file,
                    console=args.verbosity)
        if args.reports:
            report(args.reports, path, time_series)
    return time_series
def get_time_series(time_series_ids, args, api=None, session_file=None):
    """Retrieve remote time-series in their current status.
    """
    if api is None:
        api = bigml.api.BigML()
    time_series_id = ""
    time_series_set = time_series_ids
    time_series_id = time_series_ids[0]
    message = dated(
        "Retrieving %s. %s\n" % (plural("time-series",
                                        len(time_series_ids)),
                                 get_url(time_series_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    # only one time-series to predict at present
    try:
        # the whole fields structure is needed when exporting fields
        query_string = ALL_FIELDS_QS if args.export_fields else FIELDS_QS
        time_series = check_resource(time_series_ids[0],
                                     api.get_time_series,
                                     query_string=query_string,
                                     raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished time-series: %s" %
                 str(exception))
def update_deepnet(deepnet, deepnet_args, args,
                   api=None, path=None, session_file=None):
    """Update the properties of a remote deepnet and wait until ready.

    Returns the updated (and finished) deepnet resource.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Updating deepnet. %s\n" % get_url(deepnet)),
                log_file=session_file, console=args.verbosity)
    deepnet = api.update_deepnet(deepnet, deepnet_args)
    check_resource_error(
        deepnet, "Failed to update deepnet: %s" % deepnet['resource'])
    # wait for the update to be applied remotely
    deepnet = check_resource(deepnet, api.get_deepnet,
                             query_string=FIELDS_QS, raise_on_error=True)
    if is_shared(deepnet):
        shared_message = dated("Shared deepnet link. %s\n"
                               % get_url(deepnet, shared=True))
        log_message(shared_message, log_file=session_file,
                    console=args.verbosity)
        if args.reports:
            report(args.reports, path, deepnet)
    return deepnet
def update_topic_model(topic_model, topic_model_args, args,
                       api=None, path=None, session_file=None):
    """Update the properties of a remote topic model and wait until ready.

    Returns the updated (and finished) topic model resource.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Updating topic model. %s\n" % get_url(topic_model)),
                log_file=session_file, console=args.verbosity)
    topic_model = api.update_topic_model(topic_model, topic_model_args)
    check_resource_error(
        topic_model,
        "Failed to update topic model: %s" % topic_model['resource'])
    # wait for the update to be applied remotely
    topic_model = check_resource(topic_model, api.get_topic_model,
                                 query_string=FIELDS_QS,
                                 raise_on_error=True)
    if is_shared(topic_model):
        shared_message = dated("Shared topic model link. %s\n"
                               % get_url(topic_model, shared=True))
        log_message(shared_message, log_file=session_file,
                    console=args.verbosity)
        if args.reports:
            report(args.reports, path, topic_model)
    return topic_model
def create_source(data_set, source_args, args, api=None, path=None,
                  session_file=None, log=None, source_type=None):
    """Create a remote source and wait until it is finished.
    """
    if api is None:
        api = bigml.api.BigML()
    type_prefix = "" if source_type is None else "%s " % source_type
    log_message(dated("Creating %ssource.\n" % type_prefix),
                log_file=session_file, console=args.verbosity)
    check_fields_struct(source_args, "source")
    source = api.create_source(data_set, source_args,
                               progress_bar=args.progress_bar)
    if path is not None:
        # sources of a given type get a suffix in the resources log
        suffix = "_" + source_type if source_type else ""
        log_created_resources(
            "source%s" % suffix, path, source['resource'], mode='ab',
            comment=(u"%s\n" % source['object']['name']))
    source_id = check_resource_error(source, "Failed to create source: ")
    try:
        source = check_resource(source, api.get_source,
                                query_string=ALL_FIELDS_QS,
                                raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished source: %s" % str(exception))
def create_execution(execution_args, args, api=None, path=None, session_file=None, log=None): """Creates remote execution """ message = dated("Creating execution.\n") log_message(message, log_file=session_file, console=args.verbosity) scripts = args.script_ids if args.script_ids else args.script execution = api.create_execution(scripts, execution_args) log_created_resources("execution", path, bigml.api.get_execution_id(execution), mode='a') execution_id = check_resource_error(execution, "Failed to create execution: ") try: execution = check_resource(execution, api.get_execution, raise_on_error=True) except Exception, exception: sys.exit("Failed to get a finished execution: %s" % str(exception))
def wait_for_available_tasks(inprogress, max_parallel, api,
                             resource_type, wait_step=2):
    """Block until a slot is free in the in-progress resources list.

    While ``inprogress`` holds ``max_parallel`` resources, poll them; as
    soon as one is FINISHED, drop it from the list and return so that a
    new resource can be created. A FAULTY resource aborts the process.
    """
    check_kwargs = {"retries": 0, "query_string": "full=false", "api": api}
    while len(inprogress) == max_parallel:
        for index, resource in enumerate(inprogress):
            try:
                ready = check_resource(resource, **check_kwargs)
                status = bigml.api.get_status(ready)
                if status['code'] == bigml.api.FINISHED:
                    # free the slot and let the caller create a new task
                    del inprogress[index]
                    return
                if status['code'] == bigml.api.FAULTY:
                    raise ValueError(status['message'])
            except ValueError as exception:
                sys.exit("Failed to get a finished %s: %s" %
                         (resource_type, str(exception)))
        time.sleep(max_parallel * wait_step)
def update_anomaly(anomaly, anomaly_args, args,
                   api=None, path=None, session_file=None):
    """Update the properties of a remote anomaly detector.

    Waits until the update is applied and returns the finished resource.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Updating anomaly detector. %s\n" % get_url(anomaly)),
                log_file=session_file, console=args.verbosity)
    anomaly = api.update_anomaly(anomaly, anomaly_args)
    check_resource_error(
        anomaly, "Failed to update anomaly: %s" % anomaly['resource'])
    # wait for the update to be applied remotely
    anomaly = check_resource(anomaly, api.get_anomaly,
                             query_string=FIELDS_QS, raise_on_error=True)
    if is_shared(anomaly):
        shared_message = dated("Shared anomaly link. %s\n"
                               % get_url(anomaly, shared=True))
        log_message(shared_message, log_file=session_file,
                    console=args.verbosity)
        if args.reports:
            report(args.reports, path, anomaly)
    return anomaly
def get_logistic_regressions(logistic_regression_ids, args,
                             api=None, session_file=None):
    """Retrieve remote logistic regressions in their current status.
    """
    if api is None:
        api = bigml.api.BigML()
    logistic_regression_id = ""
    logistic_regressions = logistic_regression_ids
    logistic_regression_id = logistic_regression_ids[0]
    message = dated(
        "Retrieving %s. %s\n" %
        (plural("logistic regression", len(logistic_regression_ids)),
         get_url(logistic_regression_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    # only one logistic regression to predict at present
    try:
        # the whole fields structure is needed when exporting fields
        query_string = ALL_FIELDS_QS if args.export_fields else FIELDS_QS
        logistic_regression = check_resource(
            logistic_regression_ids[0], api.get_logistic_regression,
            query_string=query_string, raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished logistic regression: %s" %
                 str(exception))
def update_logistic_regression(logistic_regression,
                               logistic_regression_args, args,
                               api=None, path=None, session_file=None):
    """Update the properties of a remote logistic regression.

    Waits until the update is applied and returns the finished resource.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Updating logistic regression. %s\n" %
                      get_url(logistic_regression)),
                log_file=session_file, console=args.verbosity)
    logistic_regression = api.update_logistic_regression(
        logistic_regression, logistic_regression_args)
    check_resource_error(
        logistic_regression,
        "Failed to update logistic regression: %s" %
        logistic_regression['resource'])
    # wait for the update to be applied remotely
    logistic_regression = check_resource(logistic_regression,
                                         api.get_logistic_regression,
                                         query_string=FIELDS_QS,
                                         raise_on_error=True)
    if is_shared(logistic_regression):
        shared_message = dated("Shared logistic regression link. %s\n" %
                               get_url(logistic_regression, shared=True))
        log_message(shared_message, log_file=session_file,
                    console=args.verbosity)
        if args.reports:
            report(args.reports, path, logistic_regression)
    return logistic_regression
def create_library(source_code, library_args, args, api=None, path=None,
                   session_file=None, log=None):
    """Create a remote library and wait until it is compiled.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Creating library \"%s\".\n" % library_args["name"]),
                log_file=session_file, console=args.verbosity)
    library = api.create_library(source_code, library_args)
    log_created_resources("library", path,
                          bigml.api.get_library_id(library), mode='a')
    library_id = check_resource_error(library,
                                      "Failed to create library: ")
    try:
        library = check_resource(library, api.get_library,
                                 raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a compiled library: %s" % str(exception))
def create_external_connector(external_connector_args, args, api=None,
                              session_file=None, path=None, log=None):
    """Create a remote external connector and wait until it is finished.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Creating external connector.\n"),
                log_file=session_file, console=args.verbosity)
    external_connector = api.create_external_connector(
        args.connection_info, external_connector_args)
    log_created_resources(
        "external_connector", path,
        bigml.api.get_external_connector_id(external_connector), mode='a')
    external_connector_id = check_resource_error(
        external_connector, "Failed to create external connector: ")
    try:
        external_connector = check_resource(
            external_connector, api=api, raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished external connector: %s" %
                 str(exception))
def create_forecast(time_series, input_data, forecast_args, args,
                    api=None, session_file=None, path=None, log=None):
    """Create a remote forecast from a time-series and wait for it.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Creating remote forecast.\n"),
                log_file=session_file, console=args.verbosity)
    forecast = api.create_forecast(time_series, input_data,
                                   forecast_args, retries=None)
    log_created_resources("forecast", path,
                          bigml.api.get_forecast_id(forecast), mode='a')
    forecast_id = check_resource_error(forecast,
                                       "Failed to create forecast: ")
    try:
        forecast = check_resource(forecast, api.get_forecast,
                                  raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished forecast: %s" % str(exception))
def get_models(model_ids, args, api=None, session_file=None):
    """Retrieve remote models in their current status.
    """
    if api is None:
        api = bigml.api.BigML()
    model_id = ""
    models = model_ids
    single_model = len(model_ids) == 1
    if single_model:
        model_id = model_ids[0]
    message = dated("Retrieving %s. %s\n" %
                    (plural("model", len(model_ids)), get_url(model_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    if len(model_ids) < args.max_batch_models:
        models = []
        for model in model_ids:
            # if there's more than one model the first one (or every one
            # in multi-label runs) must contain the entire field
            # structure to be used as reference.
            needs_all_fields = (not single_model and
                                (not models or args.multi_label))
            query_string = ALL_FIELDS_QS if needs_all_fields else FIELDS_QS
            try:
                model = check_resource(model, api.get_model,
                                       query_string=query_string)
            except ValueError as exception:
                sys.exit("Failed to get a finished model: %s" %
                         str(exception))
            models.append(model)
        model = models[0]
def get_models(model_ids, args, api=None, session_file=None):
    """Retrieve remote models in their current status.
    """
    if api is None:
        api = bigml.api.BigML()
    model_id = ""
    models = model_ids
    if len(model_ids) == 1:
        model_id = model_ids[0]
    log_message(dated("Retrieving %s. %s\n" %
                      (plural("model", len(model_ids)),
                       get_url(model_id))),
                log_file=session_file, console=args.verbosity)
    if len(model_ids) < args.max_batch_models:
        models = []
        for model_ref in model_ids:
            try:
                finished = check_resource(model_ref, api.get_model,
                                          query_string=FIELDS_QS)
            except ValueError as exception:
                sys.exit("Failed to get a finished model: %s" %
                         str(exception))
            models.append(finished)
        model = models[0]
def wait_for_available_tasks(inprogress, max_parallel, get_function,
                             resource_type, query_string=None, wait_step=2):
    """Block until a slot is free in the in-progress resources list.

    While ``inprogress`` is full, poll its resources via ``get_function``;
    when one is FINISHED, remove it and return so another can be created.
    A FAULTY resource aborts the process.
    """
    check_kwargs = {"retries": 0}
    if query_string:
        check_kwargs.update(query_string=query_string)
    while len(inprogress) == max_parallel:
        for index in range(len(inprogress)):
            try:
                ready = check_resource(inprogress[index], get_function,
                                       **check_kwargs)
                status = bigml.api.get_status(ready)
                if status['code'] == bigml.api.FINISHED:
                    # free the slot and let the caller create a new task
                    del inprogress[index]
                    return
                if status['code'] == bigml.api.FAULTY:
                    raise ValueError(status['message'])
            except ValueError as exception:
                sys.exit("Failed to get a finished %s: %s" %
                         (resource_type, str(exception)))
        time.sleep(max_parallel * wait_step)
def create_batch_prediction(model_or_ensemble, test_dataset,
                            batch_prediction_args, verbosity,
                            api=None, session_file=None, path=None,
                            log=None):
    """Create a remote batch prediction and wait until it is finished.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Creating batch prediction.\n"),
                log_file=session_file, console=verbosity)
    batch_prediction = api.create_batch_prediction(model_or_ensemble,
                                                   test_dataset,
                                                   batch_prediction_args)
    log_created_resources(
        "batch_prediction", path,
        bigml.api.get_batch_prediction_id(batch_prediction),
        open_mode='a')
    batch_prediction_id = check_resource_error(
        batch_prediction, "Failed to create batch prediction: ")
    try:
        batch_prediction = check_resource(batch_prediction,
                                          api.get_batch_prediction)
    except ValueError as exception:
        sys.exit("Failed to get a finished batch prediction: %s" %
                 str(exception))
def create_batch_prediction(model_or_ensemble, test_dataset,
                            batch_prediction_args, args,
                            api=None, session_file=None,
                            path=None, log=None):
    """Create a remote batch prediction and wait until it is finished.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Creating batch prediction.\n"),
                log_file=session_file, console=args.verbosity)
    batch_prediction = api.create_batch_prediction(model_or_ensemble,
                                                   test_dataset,
                                                   batch_prediction_args,
                                                   retries=None)
    log_created_resources(
        "batch_prediction", path,
        bigml.api.get_batch_prediction_id(batch_prediction), mode='a')
    batch_prediction_id = check_resource_error(
        batch_prediction, "Failed to create batch prediction: ")
    try:
        batch_prediction = check_resource(batch_prediction,
                                          api.get_batch_prediction,
                                          raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished batch prediction: %s" %
                 str(exception))
def create_samples(datasets, sample_ids, sample_args, args,
                   api=None, path=None, session_file=None, log=None):
    """Create remote samples

    Creates one sample per dataset (only one per command at present),
    appending new ids to ``sample_ids`` (mutated in place) and logging
    them. On resume, datasets whose samples already exist are skipped.
    """
    if api is None:
        api = bigml.api.BigML()
    samples = sample_ids[:]
    existing_samples = len(samples)
    sample_args_list = []
    # skip the datasets whose samples were already created in a
    # previous (resumed) run
    datasets = datasets[existing_samples:]
    # if resuming and all samples were created, there will be no datasets left
    if datasets:
        if isinstance(sample_args, list):
            sample_args_list = sample_args
        # Only one sample per command, at present
        number_of_samples = 1
        max_parallel_samples = 1
        message = dated("Creating %s.\n" %
                        plural("sample", number_of_samples))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        inprogress = []
        for i in range(0, number_of_samples):
            # block until a parallel-creation slot is available
            wait_for_available_tasks(inprogress, max_parallel_samples,
                                     api, "sample")
            if sample_args_list:
                sample_args = sample_args_list[i]
            sample = api.create_sample(datasets[i], sample_args,
                                       retries=None)
            sample_id = check_resource_error(sample,
                                             "Failed to create sample: ")
            log_message("%s\n" % sample_id, log_file=log)
            sample_ids.append(sample_id)
            inprogress.append(sample_id)
            samples.append(sample)
            log_created_resources("samples", path, sample_id, mode='a')
        if args.verbosity:
            # wait for the (single) sample and report its URL
            if bigml.api.get_status(sample)['code'] != bigml.api.FINISHED:
                try:
                    sample = check_resource(sample, api.get_sample,
                                            raise_on_error=True)
                except Exception, exception:
                    sys.exit("Failed to get a finished sample: %s" %
                             str(exception))
                samples[0] = sample
            message = dated("Sample created: %s\n" % get_url(sample))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, sample)
def create_clusters(datasets, cluster_ids, cluster_args, args,
                    api=None, path=None, session_file=None, log=None):
    """Create remote clusters

    Creates clusters (only one per command at present), appending new
    ids to ``cluster_ids`` (mutated in place) and logging them. On
    resume, already-created clusters are skipped.
    """
    if api is None:
        api = bigml.api.BigML()
    clusters = cluster_ids[:]
    existing_clusters = len(clusters)
    cluster_args_list = []
    # skip the datasets whose clusters were already created in a
    # previous (resumed) run
    datasets = datasets[existing_clusters:]
    # if resuming and all clusters were created, there will be no datasets left
    if datasets:
        if isinstance(cluster_args, list):
            cluster_args_list = cluster_args
        # Only one cluster per command, at present
        number_of_clusters = 1
        message = dated("Creating %s.\n" %
                        plural("cluster", number_of_clusters))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_clusters):
            # block until a parallel-creation slot is available
            wait_for_available_tasks(inprogress,
                                     args.max_parallel_clusters,
                                     api, "cluster")
            if cluster_args_list:
                cluster_args = cluster_args_list[i]
            # NOTE(review): the whole ``datasets`` list is passed here,
            # not ``datasets[i]`` as in create_samples — presumably the
            # cluster is built from all remaining datasets; confirm.
            cluster = api.create_cluster(datasets, cluster_args,
                                         retries=None)
            cluster_id = check_resource_error(
                cluster, "Failed to create cluster: ")
            log_message("%s\n" % cluster_id, log_file=log)
            cluster_ids.append(cluster_id)
            inprogress.append(cluster_id)
            clusters.append(cluster)
            log_created_resources("clusters", path, cluster_id, mode='a')
        if args.verbosity:
            # wait for the (single) cluster and report its URL
            if bigml.api.get_status(cluster)['code'] != bigml.api.FINISHED:
                try:
                    cluster = check_resource(cluster, api.get_cluster,
                                             query_string=query_string,
                                             raise_on_error=True)
                except Exception, exception:
                    sys.exit("Failed to get a finished cluster: %s" %
                             str(exception))
                clusters[0] = cluster
            message = dated("Cluster created: %s\n" % get_url(cluster))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, cluster)
def create_fusion(models, fusion, fusion_args, args, api=None, path=None,
                  session_file=None, log=None):
    """Create remote fusion

    Builds a single fusion from the list of ``models``. If ``fusion``
    is not None, a fusion already exists (resumed run) and is kept.
    """
    if api is None:
        api = bigml.api.BigML()
    fusions = []
    fusion_ids = []
    if fusion is not None:
        # resuming: reuse the existing fusion
        # NOTE(review): ``fusion_ids`` is seeded with the resource dict,
        # not its id string — confirm callers expect this.
        fusions = [fusion]
        fusion_ids = [fusion]
    # if resuming and all fusions were created
    if models:
        # Only one fusion per command, at present
        message = dated("Creating fusion.\n")
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        query_string = FIELDS_QS
        inprogress = []
        # block until a parallel-creation slot is available
        wait_for_available_tasks(inprogress, args.max_parallel_fusions,
                                 api, "fusion")
        fusion = api.create_fusion(models, fusion_args, retries=None)
        fusion_id = check_resource_error(
            fusion, "Failed to create fusion: ")
        log_message("%s\n" % fusion_id, log_file=log)
        fusion_ids.append(fusion_id)
        inprogress.append(fusion_id)
        fusions.append(fusion)
        log_created_resources("fusions", path, fusion_id, mode='a')
        if args.verbosity:
            # wait for the fusion and report its URL
            if bigml.api.get_status(fusion)['code'] != bigml.api.FINISHED:
                try:
                    fusion = check_resource(
                        fusion, api.get_fusion,
                        query_string=query_string,
                        raise_on_error=True)
                except Exception, exception:
                    sys.exit("Failed to get a finished fusion: %s" %
                             str(exception))
                fusions[0] = fusion
            message = dated("Fusion created: %s\n" % get_url(fusion))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, fusion)
def get_evaluation(evaluation, api=None, verbosity=True, session_file=None):
    """Retrieve an evaluation in its current state.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Retrieving evaluation. %s\n" % get_url(evaluation)),
                log_file=session_file, console=verbosity)
    try:
        evaluation = check_resource(evaluation, api.get_evaluation)
    except ValueError as exception:
        sys.exit("Failed to get a finished evaluation: %s" % str(exception))
def create_dataset(origin_resource, dataset_args, args, api=None, path=None, session_file=None, log=None, dataset_type=None): """Creates remote dataset from source, dataset, cluster or datasets list """ if api is None: api = bigml.api.BigML() message = dated("Creating dataset.\n") log_message(message, log_file=session_file, console=args.verbosity) check_fields_struct(dataset_args, "dataset") # if --json-query or --sql-query are used and no names are set for # the datasets, we create default naming to A, B, C, etc. for the datasets # to be used as origin if ((hasattr(args, 'sql_query') and args.sql_query) or \ (hasattr(args, 'json_query') and args.sql_query)) and \ isinstance(origin_resource, list) and \ ((not isinstance(origin_resource[0], dict)) or \ origin_resource[0].get("name") is None): for index, element in enumerate(origin_resource): if index < len(DS_NAMES): if isinstance(element, dict): if element.get("resource") is not None: element = {"id": element["resource"]} element.update({"name": DS_NAMES[index]}) origin_resource[index] = element elif isinstance(element, basestring): origin_resource[index] = { "id": element, "name": DS_NAMES[index] } dataset = api.create_dataset(origin_resource, dataset_args, retries=None) suffix = "_" + dataset_type if dataset_type else "" log_created_resources("dataset%s" % suffix, path, bigml.api.get_dataset_id(dataset), mode='a') dataset_id = check_resource_error(dataset, "Failed to create dataset: ") try: dataset = check_resource(dataset, api.get_dataset, query_string=ALL_FIELDS_QS, raise_on_error=True) except Exception, exception: sys.exit("Failed to get a finished dataset: %s" % str(exception))
def update_source(source, source_args, args, api=None, session_file=None):
    """Update source properties and wait for the update to be applied.

    Returns the updated source.
    """
    if api is None:
        api = bigml.api.BigML()
    log_message(dated("Updating source. %s\n" % get_url(source)),
                log_file=session_file, console=args.verbosity)
    source = api.update_source(source, source_args)
    check_resource_error(source, "Failed to update source: ")
    source = check_resource(source, api.get_source)
    return source
def get_ensemble(ensemble, api=None, verbosity=True, session_file=None):
    """Retrieve a remote ensemble in its current status.
    """
    if api is None:
        api = bigml.api.BigML()
    # retrieve only when given an id string or an unfinished resource
    not_finished = (isinstance(ensemble, basestring) or
                    bigml.api.get_status(ensemble)['code'] !=
                    bigml.api.FINISHED)
    if not_finished:
        log_message(dated("Retrieving ensemble. %s\n" % get_url(ensemble)),
                    log_file=session_file, console=verbosity)
        ensemble = check_resource(ensemble, api.get_ensemble)
        check_resource_error(ensemble, "Failed to get ensemble: ")
    return ensemble
def get_dataset(dataset, api=None, verbosity=True, session_file=None):
    """Retrieve the dataset in its current state.
    """
    if api is None:
        api = bigml.api.BigML()
    # retrieve only when given an id string or an unfinished resource
    needs_retrieval = (isinstance(dataset, basestring) or
                       bigml.api.get_status(dataset)['code'] !=
                       bigml.api.FINISHED)
    if needs_retrieval:
        log_message(dated("Retrieving dataset. %s\n" % get_url(dataset)),
                    log_file=session_file, console=verbosity)
        dataset = check_resource(dataset, api.get_dataset)
        check_resource_error(dataset, "Failed to get dataset: ")
    return dataset
def retrieve_models_split(models_split, api, query_string=FIELDS_QS,
                          labels=None, multi_label_data=None, ordered=True,
                          models_order=None):
    """Returns a list of full model structures ready to be fed to the
       MultiModel object to produce predictions. Models are also stored
       locally in the output directory when the --store flag is used.

       When ``labels`` and ``multi_label_data`` are given, only models
       whose objective corresponds to one of the selected labels are
       kept; ``models_order`` (mutated in place) records the column of
       each kept model when ``ordered`` is False.
    """
    complete_models = []
    if models_order is None:
        models_order = []
    for index in range(len(models_split)):
        model = models_split[index]
        # fetch the model when only an id string or an unfinished
        # resource is available
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
            try:
                # NOTE(review): query_string is passed positionally here,
                # unlike the keyword usage elsewhere — confirm it maps to
                # the query_string parameter of u.check_resource.
                model = u.check_resource(model, api.get_model,
                                         query_string)
            except ValueError, exception:
                sys.exit("Failed to get model: %s. %s" % (model,
                                                          str(exception)))
        # When user selects the labels in multi-label predictions, we must
        # filter the models that will be used to predict
        if labels and multi_label_data:
            objective_column = str(multi_label_data['objective_column'])
            labels_info = multi_label_data['generated_fields'][
                objective_column]
            # columns of the generated fields for the selected labels
            labels_columns = [label_info[1] for label_info in labels_info
                              if label_info[0] in labels]
            model_objective_id = model['object']['objective_fields'][0]
            model_fields = model['object']['model']['fields']
            model_objective = model_fields[model_objective_id]
            model_column = model_objective['column_number']
            if model_column in labels_columns:
                # When the list of models comes from a --model-tag
                # selection, the models are not retrieved in the same
                # order they were created. We must keep track of the
                # label they are associated with to label their
                # predictions properly
                if not ordered:
                    models_order.append(model_column)
                complete_models.append(model)
        else:
            complete_models.append(model)
def get_source(source, api=None, verbosity=True, session_file=None): """Retrieves the source in its actual state and its field info """ if api is None: api = bigml.api.BigML() if (isinstance(source, basestring) or bigml.api.get_status(source)['code'] != bigml.api.FINISHED): message = dated("Retrieving source. %s\n" % get_url(source)) log_message(message, log_file=session_file, console=verbosity) try: source = check_resource(source, api.get_source) except ValueError, exception: sys.exit("Failed to get a finished source: %s" % str(exception))
def create_evaluations(model_ids, datasets, evaluation_args, args, api=None,
                       path=None, session_file=None, log=None,
                       existing_evaluations=0):
    """Create evaluations for a list of models

       ``model_ids``: list of model ids to create an evaluation of
       ``datasets``: dataset objects or ids to evaluate with
       ``evaluation_args``: arguments for the ``create_evaluation`` call
                            (a dict, or a list of dicts — one per model)
       ``args``: input values for bigmler flags
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file
       ``existing_evaluations``: evaluations found when attempting resume

       NOTE(review): the visible body accumulates ``evaluations`` but
       never returns it — the function looks truncated here; confirm
       the trailing return against callers.
    """
    evaluations = []
    # Only the first dataset is evaluated against.
    dataset = datasets[0]
    evaluation_args_list = []
    if isinstance(evaluation_args, list):
        evaluation_args_list = evaluation_args
    if api is None:
        api = bigml.api.BigML()
    # Skip the evaluations already created in a previous (resumed) run.
    remaining_ids = model_ids[existing_evaluations:]
    number_of_evaluations = len(remaining_ids)
    message = dated("Creating evaluations.\n")
    log_message(message, log_file=session_file,
                console=args.verbosity)
    for i in range(0, number_of_evaluations):
        model = remaining_ids[i]
        # Throttle: every max_parallel_evaluations requests, wait for an
        # earlier evaluation to finish before issuing more.
        if i % args.max_parallel_evaluations == 0 and i > 0:
            try:
                evaluations[i - 1] = check_resource(
                    evaluations[i - 1], api.get_evaluation)
            except ValueError, exception:
                sys.exit("Failed to get a finished evaluation: %s" %
                         str(exception))
        # Per-model argument dicts take precedence when provided.
        if evaluation_args_list != []:
            evaluation_args = evaluation_args_list[i]
        # Cross-validation: reseed the sampling deterministically per index.
        if args.cross_validation_rate > 0:
            new_seed = get_basic_seed(i + existing_evaluations)
            evaluation_args.update(seed=new_seed)
        evaluation = api.create_evaluation(model, dataset, evaluation_args)
        evaluation_id = check_resource_error(evaluation,
                                             "Failed to create evaluation: ")
        log_created_resources("evaluations", path, evaluation_id,
                              open_mode='a')
        evaluations.append(evaluation)
        log_message("%s\n" % evaluation['resource'], log_file=log)
def create_evaluations(model_ids, dataset, evaluation_args, args, api=None,
                       path=None, session_file=None, log=None,
                       existing_evaluations=0):
    """Create evaluations for a list of models

       ``model_ids``: list of model ids to create an evaluation of
       ``dataset``: dataset object or id to evaluate with
       ``evaluation_args``: arguments for the ``create_evaluation`` call
       ``args``: input values for bigmler flags
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file
       ``existing_evaluations``: evaluations found when attempting resume
                                 (offsets the cross-validation seed)

       NOTE(review): the visible body accumulates ``evaluations`` but
       never returns it — the function looks truncated here; confirm
       the trailing return against callers.
    """
    evaluations = []
    if api is None:
        api = bigml.api.BigML()
    number_of_evaluations = len(model_ids)
    message = dated("Creating evaluations.\n")
    log_message(message, log_file=session_file,
                console=args.verbosity)
    for i in range(0, number_of_evaluations):
        model = model_ids[i]
        # Throttle: every max_parallel_evaluations requests, wait for an
        # earlier evaluation to finish before issuing more.
        if i % args.max_parallel_evaluations == 0 and i > 0:
            try:
                evaluations[i - 1] = check_resource(evaluations[i - 1],
                                                    api.get_evaluation)
            except ValueError, exception:
                sys.exit("Failed to get a finished evaluation: %s" %
                         str(exception))
        # Cross-validation: reseed the sampling deterministically per index.
        if args.cross_validation_rate > 0:
            new_seed = get_basic_seed(i + existing_evaluations)
            evaluation_args.update(seed=new_seed)
        evaluation = api.create_evaluation(model, dataset, evaluation_args)
        log_created_resources("evaluations", path,
                              bigml.api.get_evaluation_id(evaluation),
                              open_mode='a')
        check_resource_error(evaluation, "Failed to create evaluation: ")
        evaluations.append(evaluation)
        log_message("%s\n" % evaluation['resource'], log_file=log)
def retrieve_ensembles_models(ensembles, api, path=None):
    """Retrieves the models associated to a list of ensembles

       ``ensembles``: list of ensemble ids or structures (finished
                      structures are written back into this list)
       ``api``: api for remote objects in BigML
       ``path``: directory for locally stored models when applicable

       NOTE(review): in the visible body ``models`` is created but never
       populated or returned, and ``path`` is unused — the function looks
       truncated here (the model-retrieval tail is missing); confirm
       against the original module before relying on it.
    """
    models = []
    model_ids = []
    for index in range(0, len(ensembles)):
        ensemble = ensembles[index]
        # Download the full ensemble when we only hold an id or an
        # unfinished resource; the refreshed structure replaces the
        # original entry in the caller's list.
        if (isinstance(ensemble, basestring) or
                bigml.api.get_status(ensemble)['code'] != bigml.api.FINISHED):
            try:
                ensemble = check_resource(ensemble, api.get_ensemble)
                ensembles[index] = ensemble
            except ValueError, exception:
                sys.exit("Failed to get a finished ensemble: %s" %
                         str(exception))
        model_ids.extend(ensemble['object']['models'])
def retrieve_models_split(models_split, api, query_string=FIELDS_QS,
                          labels=None, multi_label_data=None, ordered=True,
                          models_order=None):
    """Returns a list of full model structures ready to be fed to the
       MultiModel object to produce predictions. Models are also stored
       locally in the output directory when the --store flag is used.

       ``models_split``: list of model ids or structures to complete
       ``api``: api for remote objects in BigML
       ``query_string``: query string used when downloading each model
       ``labels``: user-selected labels (multi-label predictions only)
       ``multi_label_data``: multi-label metadata from the extended source
       ``ordered``: whether models already come in creation order
       ``models_order``: accumulator of label columns (mutated in place)

       NOTE(review): the visible body builds ``complete_models`` and
       ``models_order`` but never returns them — the function looks
       truncated here; confirm the trailing return against callers.
    """
    complete_models = []
    if models_order is None:
        models_order = []
    for index in range(len(models_split)):
        model = models_split[index]
        # Download the full model when we only hold an id or an
        # unfinished resource.
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
            try:
                # NOTE(review): query_string is passed positionally —
                # confirm the third positional parameter of
                # u.check_resource is indeed the query string.
                model = u.check_resource(model, api.get_model, query_string)
            except ValueError, exception:
                sys.exit("Failed to get model: %s. %s" % (model,
                                                          str(exception)))
        # When user selects the labels in multi-label predictions, we must
        # filter the models that will be used to predict
        if labels and multi_label_data:
            objective_column = str(multi_label_data['objective_column'])
            labels_info = multi_label_data[
                'generated_fields'][objective_column]
            labels_columns = [label_info[1] for label_info in labels_info
                              if label_info[0] in labels]
            model_objective_id = model['object']['objective_fields'][0]
            model_fields = model['object']['model']['fields']
            model_objective = model_fields[model_objective_id]
            model_column = model_objective['column_number']
            if model_column in labels_columns:
                # When the list of models comes from a --model-tag
                # selection, the models are not retrieved in the same
                # order they were created. We must keep track of the
                # label they are associated with to label their
                # predictions properly
                if not ordered:
                    models_order.append(model_column)
                complete_models.append(model)
        else:
            complete_models.append(model)
def create_ensembles(dataset, ensemble_ids, ensemble_args, args,
                     number_of_ensembles=1, api=None, path=None,
                     session_file=None, log=None):
    """Create ensembles from input data

       ``dataset``: dataset id or structure to build the ensembles from
       ``ensemble_ids``: ids of ensembles already created (mutated in
                         place: new ids are appended for the caller)
       ``ensemble_args``: dict of create arguments, or a list of dicts
                          (one per ensemble)
       ``args``: input values for bigmler flags
       ``number_of_ensembles``: how many new ensembles to create
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file

       NOTE(review): the visible body builds ``ensembles``/``models``
       but never returns them — the function looks truncated here;
       confirm the trailing return against callers.
    """
    if api is None:
        api = bigml.api.BigML()
    # Start from the ids found on resume; new resources are appended.
    ensembles = ensemble_ids[:]
    model_ids = []
    ensemble_args_list = []
    if isinstance(ensemble_args, list):
        ensemble_args_list = ensemble_args
    if number_of_ensembles > 0:
        message = dated("Creating %s.\n" %
                        plural("ensemble", number_of_ensembles))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        query_string = ALL_FIELDS_QS
        for i in range(0, number_of_ensembles):
            # Throttle: every max_parallel_ensembles requests, wait for
            # an earlier ensemble to finish before issuing more.
            if i % args.max_parallel_ensembles == 0 and i > 0:
                try:
                    ensembles[i - 1] = check_resource(
                        ensembles[i - 1], api.get_ensemble,
                        query_string=query_string)
                except ValueError, exception:
                    sys.exit("Failed to get a finished ensemble: %s" %
                             str(exception))
            # Per-ensemble argument dicts take precedence when provided.
            if ensemble_args_list:
                ensemble_args = ensemble_args_list[i]
            ensemble = api.create_ensemble(dataset, ensemble_args)
            ensemble_id = check_resource_error(
                ensemble, "Failed to create ensemble: ")
            log_message("%s\n" % ensemble_id, log_file=log)
            ensemble_ids.append(ensemble_id)
            ensembles.append(ensemble)
            log_created_resources("ensembles", path, ensemble_id,
                                  open_mode='a')
        models, model_ids = retrieve_ensembles_models(ensembles, api, path)
        if number_of_ensembles < 2 and args.verbosity:
            message = dated("Ensemble created: %s.\n" % get_url(ensemble))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
def create_dataset(source_or_dataset, dataset_args, args, api=None, path=None, session_file=None, log=None, dataset_type=None): """Creates remote dataset """ if api is None: api = bigml.api.BigML() message = dated("Creating dataset.\n") log_message(message, log_file=session_file, console=args.verbosity) dataset = api.create_dataset(source_or_dataset, dataset_args) suffix = "_" + dataset_type if dataset_type else "" log_created_resources("dataset%s" % suffix, path, bigml.api.get_dataset_id(dataset)) check_resource_error(dataset, "Failed to create dataset: ") try: dataset = check_resource(dataset, api.get_dataset) except ValueError, exception: sys.exit("Failed to get a finished dataset: %s" % str(exception))
def remote_predict_ensemble(ensemble_id, test_reader, prediction_file, api,
                            resume=False, verbosity=True, output_path=None,
                            method=PLURALITY_CODE, tags="",
                            session_file=None, log=None, debug=False,
                            prediction_info=None):
    """Retrieve predictions remotely and save predictions to file

       Creates one remote prediction per test input using the given
       ensemble (combined with ``method``) and writes each result row
       to ``prediction_file``.
    """
    prediction_args = {
        "tags": tags,
        "combiner": method
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    # On --resume, skip the whole step if the predictions file already
    # holds one row per test input.
    if (not resume or
            not c.checkpoint(c.are_predictions_created, prediction_file,
                             test_reader.number_of_tests(), debug=debug)):
        message = u.dated("Creating remote predictions.")
        u.log_message(message, log_file=session_file, console=verbosity)
        # Fix: the output handle was opened and never closed (resource
        # leak). The context manager guarantees it is closed even when a
        # prediction call exits; buffering stays at 0 (unbuffered) so
        # partial results survive an interruption.
        with open(prediction_file, 'w', 0) as predictions_handle:
            predictions_file = csv.writer(predictions_handle,
                                          lineterminator="\n")
            for input_data in test_reader:
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(ensemble_id,
                                                   input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                # Wait until the prediction finishes and bail out loudly
                # on API errors.
                prediction = u.check_resource(prediction,
                                              api.get_prediction)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction,
                                                   prediction_info)
                write_prediction(prediction_row, predictions_file,
                                 prediction_info, input_data)
def remote_predict_ensemble(ensemble_id, test_reader, prediction_file, api,
                            args, resume=False, output_path=None,
                            session_file=None, log=None, exclude=None):
    """Retrieve predictions remotely and save predictions to file

       One remote prediction is requested per test input, combined with
       ``args.method``, and appended as a row to ``prediction_file``.
       On --resume the step is skipped entirely when the checkpoint says
       every prediction is already on disk.
    """
    create_args = {
        "tags": args.tag,
        "combiner": args.method
    }
    headers = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    # Guard clause: nothing to do when resuming over a complete file.
    already_created = resume and c.checkpoint(
        c.are_predictions_created, prediction_file,
        test_reader.number_of_tests(), debug=args.debug)[0]
    if already_created:
        return
    u.log_message(u.dated("Creating remote predictions."),
                  log_file=session_file, console=args.verbosity)
    with UnicodeWriter(prediction_file) as writer:
        for row in test_reader:
            prediction = api.create_prediction(
                ensemble_id, test_reader.dict(row), by_name=headers,
                wait_time=0, args=create_args)
            # Wait for completion and fail loudly on API errors.
            prediction = u.check_resource(prediction, api.get_prediction)
            u.check_resource_error(prediction,
                                   "Failed to create prediction: ")
            u.log_message("%s\n" % prediction['resource'], log_file=log)
            write_prediction(
                prediction_to_row(prediction, args.prediction_info),
                writer, args.prediction_info, row, exclude)
def compute_output(api, args):
    """ Creates a sample based on a `train_set`, source or dataset.

        Orchestrates the whole bigmler sample subcommand: source and
        dataset processing, sample creation/update, optional fields
        export and report generation.
    """
    samples = None
    # Fix: `sample` was only bound inside the two branches below; when no
    # sample file is given and no samples are produced, the later
    # `if sample:` test raised NameError. Initialize it explicitly.
    sample = None
    # variables from command-line options
    resume = args.resume_
    sample_ids = args.sample_ids_
    output = args.predictions
    # there's only one sample to be generated at present
    args.max_parallel_clusters = 1
    # sample cannot be published yet.
    args.public_sample = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_sample or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-sample step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-sample step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, _, resume,
     csv_properties, fields) = dataset_properties
    if args.sample_file:
        # sample is retrieved from the contents of the given local JSON file
        sample, csv_properties, fields = u.read_local_resource(
            args.sample_file,
            csv_properties=csv_properties)
        samples = [sample]
        sample_ids = [sample['resource']]
    else:
        # sample is retrieved from the remote object
        samples, sample_ids, resume = psa.samples_processing(
            datasets, samples, sample_ids, api, args, resume,
            session_file=session_file, path=path, log=log)
        if samples:
            sample = samples[0]

    # We update the sample's public state if needed
    if sample:
        if isinstance(sample, basestring):
            # build the query string from the sample options
            sample = u.check_resource(sample, api.get_sample)
        samples[0] = sample
        if (args.public_sample or
                (args.shared_flag and
                 r.shared_changed(args.shared, sample))):
            sample_args = {}
            if args.shared_flag and r.shared_changed(args.shared, sample):
                sample_args.update(shared=args.shared)
            if args.public_sample:
                sample_args.update(r.set_publish_sample_args(args))
            if sample_args:
                sample = r.update_sample(sample, sample_args, args,
                                         api=api, path=path,
                                         session_file=session_file)
                samples[0] = sample

    # We get the fields of the sample if we haven't got
    # them yet and need them
    if sample and psa.needs_sample_fields(args):
        fields = psa.get_sample_fields(sample, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    sample_file(samples[0], fields, args, api, path=path,
                session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
if api is None: api = bigml.api.BigML() message = dated("Creating source.\n") log_message(message, log_file=session_file, console=args.verbosity) source = api.create_source(data_set, source_args, progress_bar=args.progress_bar) if path is not None: try: with open(path + '/source', 'w', 0) as source_file: source_file.write("%s\n" % source['resource']) source_file.write("%s\n" % source['object']['name']) except IOError, exc: raise IOError("%s: Failed to write %s/source" % (str(exc), path)) check_resource_error(source, "Failed to create source: ") try: source = check_resource(source, api.get_source) except ValueError, exception: sys.exit("Failed to get a finished source: %s" % str(exception)) message = dated("Source created: %s\n" % get_url(source)) log_message(message, log_file=session_file, console=args.verbosity) log_message("%s\n" % source['resource'], log_file=log) return source def data_to_source(training_set, test_set, training_set_header, test_set_header, args): """Extracts the flags info to create a source object """ data_set = None
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

        Orchestrates the whole bigmler anomaly subcommand: source and
        dataset processing, anomaly detector creation/update, optional
        anomalies dataset generation and (batch) anomaly scoring.
    """
    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and
            (args.public_anomaly or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset, resume,
     csv_properties, fields) = dataset_properties
    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        # Choose how much of the model to download: the minimal version
        # when no scoring/dataset generation is needed, the full version
        # otherwise.
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        # `anomaly` may be a bare id string (no .get) or a structure.
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and
                 r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and (args.test_set or args.export_fields):
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args.anomaly_filter_ = local_anomaly.anomalies_filter(
            include=include)
        _, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        # --score: score the very dataset the detector was built from.
        args.test_dataset = anomaly['object']['dataset']

    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args,
                                                        name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields,
                dataset_fields=test_fields)
            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            # Local scoring path.
            anomaly_score(anomalies, fields, args,
                          session_file=session_file)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS,
                        resume=False, output_path=None, output=None,
                        verbosity=True, method=PLURALITY_CODE, options=None,
                        session_file=None, debug=False,
                        prediction_info=NORMAL_FORMAT, labels=None,
                        label_separator=None, ordered=True, exclude=None,
                        models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to file

       Models are processed in splits of at most ``max_models`` to bound
       memory use; the votes of each split are accumulated into
       ``total_votes``.

       NOTE(review): when ``output`` is opened here it is never closed in
       the visible body, and the visible body has no return — the
       function looks truncated; confirm against the original module.
    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.
        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            # Unbuffered handle so partial results survive interruptions.
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    # Partition the model list into chunks of at most max_models.
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    # Read the whole test set once; it is replayed for every split.
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    # A single model must be downloaded with its full field structure.
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" % (model,
                                                              str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            # NOTE(review): unlike retrieve_models_split, this branch does
            # not also test multi_label_data — confirm it can never be
            # None when labels is non-empty.
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data[
                    'generated_fields'][objective_column]
                labels_columns = [label_info[1] for label_info in labels_info
                                  if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if (model_column in labels_columns):
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        if complete_models:
            local_model = MultiModel(complete_models)
            local_model.batch_predict(input_data_list,
                                      output_path,
                                      by_name=test_set_header,
                                      reuse=True)
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if verbosity:
                draw_progress_bar(models_count, models_total)
            # Merge this split's votes into the running per-input totals.
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
message = dated("Creating %ssource.\n" % suffix) log_message(message, log_file=session_file, console=args.verbosity) source = api.create_source(data_set, source_args, progress_bar=args.progress_bar) if path is not None: try: suffix = "_" + source_type if source_type else "" with open("%s/source%s" % (path, suffix), 'w', 0) as source_file: source_file.write("%s\n" % source['resource']) source_file.write("%s\n" % source['object']['name']) except IOError, exc: sys.exit("%s: Failed to write %s/source" % (str(exc), path)) source_id = check_resource_error(source, "Failed to create source: ") try: source = check_resource(source, api.get_source, query_string=ALL_FIELDS_QS) except ValueError, exception: sys.exit("Failed to get a finished source: %s" % str(exception)) message = dated("Source created: %s\n" % get_url(source)) log_message(message, log_file=session_file, console=args.verbosity) log_message("%s\n" % source_id, log_file=log) return source def data_to_source(training_set, test_set, training_set_header, test_set_header, args): """Extracts the flags info to create a source object """ data_set = None
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

        Orchestrates the bigmler association subcommand: source and
        dataset processing and association creation/update. Prediction
        (association sets) is not supported yet and exits explicitly.
    """
    association = None
    associations = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    association_ids = args.association_ids_
    output = args.predictions
    # there's only one association resource to be generated at present
    args.max_parallel_associations = 1
    # associations cannot be published yet.
    args.public_association = False

    # It is compulsory to have a description to publish either datasets or
    # associations
    if (not args.description_ and
            (args.public_association or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset, resume,
     csv_properties, fields) = dataset_properties
    if args.association_file:
        # association is retrieved from the contents of the given local JSON
        # file
        association, csv_properties, fields = u.read_local_resource(
            args.association_file,
            csv_properties=csv_properties)
        associations = [association]
        association_ids = [association['resource']]
    else:
        # association is retrieved from the remote object
        associations, association_ids, resume = pa.associations_processing(
            datasets, associations, association_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if associations:
        association = associations[0]

    # We update the association's public state if needed
    if association:
        if isinstance(association, basestring):
            # Minimal download unless a test set requires the full model.
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            association = u.check_resource(association,
                                           api.get_association,
                                           query_string=query_string)
        associations[0] = association
        if (args.public_association or
                (args.shared_flag and
                 r.shared_changed(args.shared, association))):
            association_args = {}
            if args.shared_flag and \
                    r.shared_changed(args.shared, association):
                association_args.update(shared=args.shared)
            if args.public_association:
                association_args.update(
                    r.set_publish_association_args(args))
            if association_args:
                association = r.update_association( \
                    association, association_args, args,
                    api=api, path=path,
                    session_file=session_file)
                associations[0] = association

    # We get the fields of the association if we haven't got
    # them yet and need them
    if association and args.test_set:
        fields = pa.get_association_fields(association,
                                           csv_properties, args)

    # If predicting
    if associations and (a.has_test(args) or
                         (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote association sets: association sets are computed as
        # batch association sets
        # in bigml.com except when --no-batch flag is set. They are currently
        # not supported yet
        if args.remote and not args.no_batch:
            sys.exit("Batch association sets are currently not supported.")
            # The code below is the intended batch flow, kept disabled
            # (unreachable after sys.exit) until the feature ships.
            """
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_association_args = r.set_batch_association_args(
                args, fields=fields,
                dataset_fields=test_fields)
            remote_association( \
                association, test_dataset, batch_association_args,
                args, api, resume, prediction_file=output,
                session_file=session_file, path=path, log=log)
            """
        else:
            sys.exit("Local prediction of association sets is currently"
                     " not supported.")
            # Disabled local flow, kept for when local association sets
            # are implemented (unreachable after sys.exit).
            """
            association_set(associations, fields, args,
                            session_file=session_file)
            """
    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    Orchestrates the full supervised-model pipeline driven by the parsed
    command-line options in `args`:
    source -> dataset(s) -> model(s)/ensemble(s) -> (batch) predictions,
    with optional multi-label expansion, --max-categories splitting,
    multi-dataset merging and --votes-files combination.

    :param api: authenticated bigml.api.BigML connection
    :param args: argparse-style options object (mutated in place: e.g.
        objective_id_, max_categories, train_header are updated as the
        pipeline learns more about the data)
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    # local_ensemble = None
    test_dataset = None
    datasets = None

    # variables from command-line options
    resume = args.resume_
    model_ids = args.model_ids_
    output = args.predictions
    dataset_fields = args.dataset_fields_

    check_args_coherence(args)
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = None if args.labels is None else [
        label.strip() for label in args.labels.split(args.args_separator)]
    if labels is not None:
        labels = sorted([label for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set, args.train_header, args, path,
            labels=labels, session_file=session_file)
        # the expanded file always carries a header row
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({"objective_field": args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data, fields,
             multi_label_fields)
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path,
                    log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    if (args.new_fields or (args.sample_rate != 1 and args.no_model) or
            (args.lisp_filter or args.json_filter) and not has_source(args)):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
        args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({"objective_field": args.objective_name_,
                               "objective_field_present": True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data,
             fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, "max_categories",
                                           args.max_categories)
        other_label = get_metadata(dataset, "other_label", other_label)
    if args.model_file:
        # model is retrieved from the contents of the given local JSON file
        model, csv_properties, fields = u.read_local_resource(
            args.model_file, csv_properties=csv_properties)
        models = [model]
        model_ids = [model["resource"]]
        ensemble_ids = []
    elif args.ensemble_file:
        # model is retrieved from the contents of the given local JSON file
        ensemble, csv_properties, fields = u.read_local_resource(
            args.ensemble_file, csv_properties=csv_properties)
        model_ids = ensemble["object"]["models"][:]
        ensemble_ids = [ensemble["resource"]]
        models = model_ids[:]
        # first component model is fetched (with local ./storage caching)
        # to provide the reference field structure
        model = retrieve_resource(bigml.api.BigML(storage="./storage"),
                                  models[0],
                                  query_string=r.ALL_FIELDS_QS)
        models[0] = model
    else:
        # model is retrieved from the remote object
        models, model_ids, ensemble_ids, resume = pm.models_processing(
            datasets, models, model_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log, labels=labels,
            multi_label_data=multi_label_data, other_label=other_label)

    if models:
        model = models[0]
        single_model = len(models) == 1
    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)["code"] != bigml.api.FINISHED):
            # choose the lightest query string that still brings the
            # fields the rest of the pipeline needs
            if not args.evaluate and not a.has_train(args):
                query_string = MINIMUM_MODEL
            elif not args.test_header:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS)
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
            models[0] = model
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and args.test_set:
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            # NOTE(review): ensemble_id computed here is not used below
            # (the local_ensemble code that consumed it is commented out)
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
        fields = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data)
        # Free memory after getting fields
        # local_ensemble = None
        gc.collect()

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(
             args.objective_field, labels, multi_label_data, fields,
             multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, "max_categories",
                                           args.max_categories)
        other_label = get_metadata(model, "other_label", other_label)
    # If predicting
    if (models and (a.has_test(args) or (test_dataset and args.remote))
            and not args.evaluate):
        models_per_label = 1
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field in
                                           multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                args.test_set, args.test_header, args, path,
                labels=labels, session_file=session_file,
                input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and not args.method in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, session_file=session_file,
                    path=path, log=log)
                (test_source, resume, csv_properties,
                 test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            # the test dataset has no objective field
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)
            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                models_per_label = len(models) / len(ensemble_ids)
            predict(models, fields, args, api=api, log=log,
                    resume=resume, session_file=session_file,
                    labels=labels, models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if args.votes_files_:
        # the model id is rebuilt from the predictions file name pattern
        model_id = re.sub(r".*(model_[a-f0-9]{24})__predictions\.csv$",
                          r"\1", args.votes_files_[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(args.votes_files_, local_model.to_prediction,
                      output, method=args.method)
def local_batch_predict(models, test_reader, prediction_file, api, max_models=MAX_MODELS, resume=False, output_path=None, output=None, verbosity=True, method=PLURALITY_CODE, session_file=None, debug=False, prediction_info=None): """Get local predictions form partial Multimodel, combine and save to file """ def draw_progress_bar(current, total): """Draws a text based progress report. """ pct = 100 - ((total - current) * 100) / (total) console_log("Predicted on %s out of %s models [%s%%]" % ( localize(current), localize(total), pct)) test_set_header = test_reader.has_headers() if output_path is None: output_path = u.check_dir(prediction_file) if output is None: try: output = open(prediction_file, 'w', 0) except IOError: raise IOError("Failed to write in %s" % prediction_file) models_total = len(models) models_splits = [models[index:(index + max_models)] for index in range(0, models_total, max_models)] input_data_list = [] raw_input_data_list = [] for input_data in test_reader: raw_input_data_list.append(input_data) input_data_list.append(test_reader.dict(input_data)) total_votes = [] models_count = 0 for models_split in models_splits: if resume: for model in models_split: pred_file = get_predictions_file_name(model, output_path) c.checkpoint(c.are_predictions_created, pred_file, test_reader.number_of_tests(), debug=debug) complete_models = [] for index in range(len(models_split)): model = models_split[index] if (isinstance(model, basestring) or bigml.api.get_status(model)['code'] != bigml.api.FINISHED): try: model = u.check_resource(model, api.get_model, FIELDS_QS) except ValueError, exception: sys.exit("Failed to get model: %s" % (model, str(exception))) complete_models.append(model) local_model = MultiModel(complete_models) local_model.batch_predict(input_data_list, output_path, by_name=test_set_header, reuse=True) votes = local_model.batch_votes(output_path) models_count += max_models if models_count > models_total: models_count = models_total if verbosity: 
draw_progress_bar(models_count, models_total) if total_votes: for index in range(0, len(votes)): predictions = total_votes[index].predictions predictions.extend(votes[index].predictions) else: total_votes = votes
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    Time-series pipeline: source -> dataset -> time-series resource ->
    forecast and/or evaluation, driven by the parsed command-line options
    in `args` (which is mutated in place to pin single-resource limits).

    :param api: authenticated bigml.api.BigML connection
    :param args: argparse-style options object
    """

    time_series = None
    time_series_set = None

    # variables from command-line options
    resume = args.resume_
    time_series_ids = args.time_series_ids_
    output = args.predictions
    # there's only one time_series to be generated at present
    args.max_parallel_time_series = 1
    args.max_parallel_evaluations = 1
    # time_series cannot be published yet.
    args.public_time_series = False
    # no cross-validations
    args.dataset_off = False
    args.cross_validation_rate = 0
    args.number_of_evaluations = 1

    # It is compulsory to have a description to publish either datasets or
    # time_series
    if (not args.description_ and (args.public_time_series or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)
        # if the time series is going to be evaluated, and we don't have
        # test data, we need to divide the rows using ranges, so we'll need
        # max rows
        args.max_rows = datasets[0]["object"]["rows"]
    if args.time_series_file:
        # time-series is retrieved from the contents of the given local
        # JSON file
        time_series, csv_properties, fields = u.read_local_resource(
            args.time_series_file,
            csv_properties=csv_properties)
        time_series_set = [time_series]
        time_series_ids = [time_series['resource']]
    else:
        # time-series is retrieved from the remote object
        time_series_set, time_series_ids, resume = \
            pts.time_series_processing( \
            datasets, time_series_set, time_series_ids, \
            api, args, resume, fields=fields, \
            session_file=session_file, path=path, log=log)
        if time_series_set:
            time_series = time_series_set[0]

    # We update the time-series' public state if needed
    if time_series:
        if isinstance(time_series, basestring):
            # only an id is available: fetch the full resource
            query_string = r.ALL_FIELDS_QS
            time_series = u.check_resource(time_series,
                                           api.get_time_series,
                                           query_string=query_string)
        time_series_set[0] = time_series
        if (args.public_time_series or
                (args.shared_flag and
                 r.shared_changed(args.shared, time_series))):
            time_series_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     time_series):
                time_series_args.update(shared=args.shared)
            if args.public_time_series:
                time_series_args.update( \
                    r.set_publish_time_series_args(args))
            if time_series_args:
                # NOTE(review): `r.time_series` looks like it should be an
                # update call (cf. r.update_model / r.update_cluster in the
                # sibling pipelines) — confirm against the `r` module
                time_series = r.time_series( \
                    time_series, time_series_args, args, api=api,
                    path=path, \
                    session_file=session_file)
                time_series_set[0] = time_series

    """
    # We get the fields of the time-series if we haven't got
    # them yet and need them
    if time_series and (args.test_set or args.export_fields):
        fields = pts.get_time_series_fields( \
            time_series, csv_properties, args)
    """

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If forecasting
    if time_series_set and a.has_ts_test(args):
        if args.remote:
            forecast_args = r.set_forecast_args(
                args, fields=fields)

            remote_forecast(time_series, forecast_args, args, \
                api, resume, prediction_file=output, \
                session_file=session_file, path=path, log=log)
        else:
            forecast(time_series, args, session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id( \
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            # the test dataset has no objective field
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(time_series_set, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            else:
                # no test data: evaluate on the trailing row range of the
                # training dataset
                args.range_ = [int(args.max_rows * r.EVALUATE_SAMPLE_RATE),
                               args.max_rows]
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(time_series_set, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
) else: if args.multi_label and args.test_set is not None: # When evaluation starts from existing models, the # multi_label_fields can be retrieved from the user_metadata # in the models if args.multi_label_fields is None and multi_label_fields: args.multi_label_fields = multi_label_fields test_set = ps.multi_label_expansion( test_set, test_set_header, args, path, labels=labels, session_file=session_file )[0] test_set_header = True if args.test_split > 0 or args.has_test_datasets_: dataset = test_dataset dataset = u.check_resource(dataset, api=api, query_string=r.ALL_FIELDS_QS) dataset_fields = pd.get_fields_structure(dataset, None) models_or_ensembles = ensemble_ids if ensemble_ids != [] else models resume = evaluate( models_or_ensembles, [dataset], api, args, resume, fields=fields, dataset_fields=dataset_fields, session_file=session_file, path=path, log=log, labels=labels, all_labels=all_labels,
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    Cluster pipeline: source -> dataset -> cluster -> (batch) centroids,
    plus optional per-centroid dataset generation, driven by the parsed
    command-line options in `args` (mutated in place to pin
    single-resource limits).

    :param api: authenticated bigml.api.BigML connection
    :param args: argparse-style options object
    """
    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and (args.public_cluster or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file,
            csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume,
            fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            # only an id is available: fetch the resource, with the
            # lightest query string that still serves the requested actions
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and
                 r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and args.test_set:
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields,
                dataset_fields=test_fields)
            remote_centroid(cluster, test_dataset, batch_centroid_args,
                            args, api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)
        else:
            # local centroid computation
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        # generate a dataset per requested centroid (all of them when
        # --cluster-datasets is given empty), skipping centroids whose
        # dataset already exists
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name in
                            args.cluster_datasets_
                            if datasets[centroids[cluster_name]] == '']
        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api,
                             path=path, session_file=session_file,
                             log=log, dataset_type='cluster')

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None, description=None,
                   field_attributes=None, types=None, dataset_fields=None,
                   model_fields=None, name=None, training_set_header=True,
                   test_set_header=True, model_ids=None, votes_files=None,
                   resume=False, fields_map=None, test_field_attributes=None,
                   test_types=None):
    """ Creates one or more models using the `training_set` or uses the ids
        of previously created BigML models to make predictions for the
        `test_set`.

        Orchestrates the full pipeline visible below:
            source -> dataset(s) -> model(s) -> predictions / evaluations,
        delegating each stage to the processing helpers (`ps` for sources,
        `pd` for datasets, `pm` for models) and threading the `resume` flag
        through them so previously created resources can be reused.

        Side effects: writes a session log under the `output` directory,
        optionally appends resource ids to `args.log_file`, may mutate
        `args` (e.g. `args.max_categories`, `args.multi_label_fields`) from
        resource metadata, and calls `sys.exit()` on invalid option
        combinations or unrecoverable API errors.

        NOTE(review): this is Python 2 code (`except ValueError, exception`,
        `basestring`, `str.decode`, integer `/`); keep that in mind before
        modernizing.
    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER          # label that aggregates the overflow categories
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    local_ensemble = None

    # --- option validation: fail fast on incompatible flag combinations ---

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and (args.black_box or args.white_box
                             or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --max-categories, it is compulsory to specify also the
    # objective_field
    if args.max_categories > 0 and objective_field is None:
        sys.exit("When --max-categories is used, you must also provide the"
                 " --objective field name or column number")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (map(str.strip, args.labels.split(','))
              if args.labels is not None else None)
    if labels is not None:
        # sorted unicode labels so expansion column order is deterministic
        labels = sorted([label.decode("utf-8") for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and training_set is not None:
        (training_set, multi_label_data) = ps.multi_label_expansion(
            training_set, training_set_header, objective_field, args, path,
            labels=labels, session_file=session_file)
        # the expanded file always carries a header and a new objective name
        training_set_header = True
        objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels

    # --- source stage ---
    source, resume, csv_properties, fields = ps.source_processing(
        training_set, test_set, training_set_header, test_set_header,
        api, args, resume, name=name, description=description,
        csv_properties=csv_properties, field_attributes=field_attributes,
        types=types, multi_label_data=multi_label_data,
        session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        # re-sync label info against what the created source actually stored
        multi_label_data = l.get_multi_label_data(source)
        (objective_field, labels,
         all_labels, multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data, fields,
             multi_label_fields)

    # --- dataset stage ---
    datasets, resume, csv_properties, fields = pd.dataset_processing(
        source, training_set, test_set, fields, objective_field,
        api, args, resume, name=name, description=description,
        dataset_fields=dataset_fields, multi_label_data=multi_label_data,
        csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume, name=name, description=description,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        objective_id = fields.field_id(fields.objective_field)
        if pd.check_max_categories(fields.fields[objective_id]):
            distribution = pd.get_categories_distribution(dataset,
                                                          objective_id)
            if distribution and len(distribution) > args.max_categories:
                # split the dataset into per-category-group datasets,
                # grouping the tail categories under `other_label`
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path,
                    log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure
    if args.new_fields:
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, name=name, description=description,
            fields=fields, dataset_fields=dataset_fields,
            objective_field=objective_field, session_file=session_file,
            path=path, log=log)
        datasets[0] = dataset
    if args.multi_label and dataset and multi_label_data is None:
        # label info was not obtained earlier (e.g. resumed run): read it
        # back from the dataset's metadata
        multi_label_data = l.get_multi_label_data(dataset)
        (objective_field, labels,
         all_labels, multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data,
             fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)

    # --- model stage ---
    models, model_ids, ensemble_ids, resume = pm.models_processing(
        datasets, models, model_ids,
        objective_field, fields, api, args, resume,
        name=name, description=description, model_fields=model_fields,
        session_file=session_file, path=path, log=log,
        labels=labels, multi_label_data=multi_label_data,
        other_label=other_label)
    if models:
        model = models[0]
        single_model = len(models) == 1

    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if isinstance(model, basestring):
            # only a model id so far: fetch it. A minimal model is enough
            # unless evaluation needs the field structure.
            if not args.evaluate:
                query_string = MINIMUM_MODEL
            else:
                query_string = r.FIELDS_QS
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and test_set:
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label
                and belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
            local_ensemble = Ensemble(ensemble_id, api=api)
        fields, objective_field = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data,
            local_ensemble=local_ensemble)
    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (objective_field, labels,
         all_labels, multi_label_fields) = l.multi_label_sync(
             objective_field, labels, multi_label_data, fields,
             multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label', other_label)

    # If predicting
    if models and has_test(args) and not args.evaluate:
        models_per_label = 1
        test_dataset = None
        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            # the test input must be expanded the same way the training
            # set was; only the expanded file path ([0]) is needed here
            test_set = ps.multi_label_expansion(
                test_set, test_set_header, objective_field, args, path,
                labels=labels, session_file=session_file, input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and not args.method in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % name
            if args.test_source is None:
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     test_set, test_set_header,
                     api, args, resume, name=test_name,
                     description=description,
                     field_attributes=test_field_attributes,
                     types=test_types,
                     session_file=session_file, path=path, log=log)
            else:
                # reuse an already existing test source by id
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id,
                                                 api.get_source)
            if args.test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(test_name,
                                                        description, args)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(args.test_dataset)
                test_dataset = api.check_resource(test_dataset_id,
                                                  api.get_dataset)
            # the test dataset has no objective column to map
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                name, description, args, fields=fields,
                dataset_fields=test_fields, fields_map=fields_map)
            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            # local predictions, combining the models' votes client-side
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                # NOTE(review): Python 2 integer division — assumes
                # len(models) is a multiple of len(ensemble_ids)
                models_per_label = len(models) / len(ensemble_ids)
            predict(test_set, test_set_header, models, fields, output,
                    objective_field, args, api=api, log=log,
                    resume=resume, session_file=session_file,
                    labels=labels, models_per_label=models_per_label,
                    other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # recover the model id embedded in the first votes file name
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))
        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(votes_files, local_model.to_prediction,
                      output, args.method)
verbosity=args.verbosity, session_file=session_file) else: resource = model multi_label_data = l.get_multi_label_data(resource) # We update the model's public state if needed if model: if isinstance(model, basestring): if not args.evaluate: query_string = MINIMUM_MODEL elif not args.test_header: query_string = r.ALL_FIELDS_QS else: query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS) model = u.check_resource(model, api.get_model, query_string=query_string) if (args.black_box or args.white_box or (args.shared_flag and r.shared_changed(args.shared, model))): model_args = {} if args.shared_flag and r.shared_changed(args.shared, model): model_args.update(shared=args.shared) if args.black_box or args.white_box: model_args.update(r.set_publish_model_args(args)) if model_args: model = r.update_model(model, model_args, args, api=api, path=path, session_file=session_file) models[0] = model # We get the fields of the model if we haven't got # them yet and need them