def create_batch_prediction(model_or_ensemble, test_dataset,
                            batch_prediction_args, args,
                            api=None, session_file=None,
                            path=None, log=None):
    """Creates remote batch_prediction

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating batch prediction.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    batch_prediction = api.create_batch_prediction(model_or_ensemble,
                                                   test_dataset,
                                                   batch_prediction_args,
                                                   retries=None)
    log_created_resources("batch_prediction", path,
                          bigml.api.get_batch_prediction_id(batch_prediction),
                          mode='a')
    batch_prediction_id = check_resource_error(
        batch_prediction, "Failed to create batch prediction: ")
    try:
        batch_prediction = check_resource(batch_prediction,
                                          api.get_batch_prediction,
                                          raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished batch prediction: %s"
                 % str(exception))

def remote_anomaly_score(anomaly, test_dataset, batch_anomaly_score_args, args,
                         api, resume, prediction_file=None, session_file=None,
                         path=None, log=None):
    """Computes an anomaly score for each entry in the `test_set`.

    Predictions are computed remotely using the batch anomaly score call.
    """

    anomaly_id = bigml.api.get_anomaly_id(anomaly)
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch anomaly score not found. Resuming.\n")
        resume, batch_anomaly_score = c.checkpoint(
            c.is_batch_anomaly_score_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        batch_anomaly_score = create_batch_anomaly_score(
            anomaly_id, test_dataset, batch_anomaly_score_args,
            args, api, session_file=session_file, path=path, log=log)
    if not args.no_csv:
        api.download_batch_anomaly_score(batch_anomaly_score,
                                         prediction_file)
    if args.to_dataset:
        batch_anomaly_score = bigml.api.check_resource(batch_anomaly_score,
                                                       api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_anomaly_score['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch anomaly score dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_anomaly_score_dataset", path,
                                    new_dataset, open_mode='a')

def create_source(data_set, source_args, args, api=None, path=None,
                  session_file=None, log=None, source_type=None):
    """Creates remote source

    """
    if api is None:
        api = bigml.api.BigML()
    suffix = "" if source_type is None else "%s " % source_type
    message = dated("Creating %ssource.\n" % suffix)
    log_message(message, log_file=session_file,
                console=args.verbosity)
    check_fields_struct(source_args, "source")
    source = api.create_source(data_set, source_args,
                               progress_bar=args.progress_bar)
    if path is not None:
        suffix = "_" + source_type if source_type else ""
        log_created_resources(
            "source%s" % suffix, path,
            source['resource'], mode='ab',
            comment=(u"%s\n" % source['object']['name']))
    source_id = check_resource_error(source, "Failed to create source: ")
    try:
        source = check_resource(source, api.get_source,
                                query_string=ALL_FIELDS_QS,
                                raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished source: %s" % str(exception))

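# Usage sketch (illustrative only, not part of the original module): how a
# caller might upload a local CSV with create_source. The file name, output
# paths and the Namespace attributes below are assumptions for the example;
# real BigMLer builds `args` from its command-line flags.
def _example_create_source(api):
    """Hedged example: create a remote source from a local CSV file."""
    from argparse import Namespace
    args = Namespace(verbosity=1, progress_bar=False)
    source_args = {"name": "example source"}
    create_source("data/example.csv", source_args, args, api=api,
                  path="./example_output", session_file="session.log")
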
def create_library(source_code, library_args, args, api=None, path=None,
                   session_file=None, log=None):
    """Creates remote library

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating library \"%s\".\n" % library_args["name"])
    log_message(message, log_file=session_file, console=args.verbosity)
    library = api.create_library(source_code, library_args)
    log_created_resources("library", path,
                          bigml.api.get_library_id(library), mode='a')
    library_id = check_resource_error(library, "Failed to create library: ")
    try:
        library = check_resource(library, api.get_library,
                                 raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a compiled library: %s" % str(exception))

def create_external_connector(external_connector_args, args, api=None,
                              session_file=None, path=None, log=None):
    """Creates remote external connector

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating external connector.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    external_connector = api.create_external_connector(
        args.connection_info, external_connector_args)
    log_created_resources(
        "external_connector", path,
        bigml.api.get_external_connector_id(external_connector), mode='a')
    external_connector_id = check_resource_error(
        external_connector, "Failed to create external connector: ")
    try:
        external_connector = check_resource(
            external_connector, api=api, raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished external connector: %s" %
                 str(exception))

def create_dataset(source_or_dataset, dataset_args, args, api=None,
                   path=None, session_file=None, log=None, dataset_type=None):
    """Creates remote dataset from source, dataset or datasets list

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating dataset.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    dataset = api.create_dataset(source_or_dataset, dataset_args)
    suffix = "_" + dataset_type if dataset_type else ""
    log_created_resources("dataset%s" % suffix, path,
                          bigml.api.get_dataset_id(dataset), open_mode='a')
    dataset_id = check_resource_error(dataset, "Failed to create dataset: ")
    try:
        dataset = check_resource(dataset, api.get_dataset,
                                 query_string=ALL_FIELDS_QS)
    except ValueError, exception:
        sys.exit("Failed to get a finished dataset: %s" % str(exception))

def create_execution(execution_args, args, api=None, path=None,
                     session_file=None, log=None):
    """Creates remote execution

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating execution.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    scripts = args.script_ids if args.script_ids else args.script
    execution = api.create_execution(scripts, execution_args)
    log_created_resources("execution", path,
                          bigml.api.get_execution_id(execution), mode='a')
    execution_id = check_resource_error(execution,
                                        "Failed to create execution: ")
    try:
        execution = check_resource(execution, api.get_execution,
                                   raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished execution: %s" % str(exception))

def create_evaluation(model_or_ensemble, dataset, evaluation_args, args,
                      api=None, path=None, session_file=None, log=None,
                      seed=SEED):
    """Create evaluation

       ``model_or_ensemble``: resource object or id for the model or ensemble
                              that should be evaluated
       ``dataset``: dataset object or id to evaluate with
       ``evaluation_args``: arguments for the ``create_evaluation`` call
       ``args``: input values for bigmler flags
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file
       ``seed``: seed for the dataset sampling (when needed)

    """
    if api is None:
        api = bigml.api.BigML()
    if args.cross_validation_rate > 0:
        evaluation_args.update(seed=seed)
    message = dated("Creating evaluation.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    evaluation = api.create_evaluation(model_or_ensemble, dataset,
                                       evaluation_args)
    log_created_resources("evaluation", path,
                          bigml.api.get_evaluation_id(evaluation))
    check_resource_error(evaluation, "Failed to create evaluation: ")
    log_message("%s\n" % evaluation['resource'], log_file=log)
    return evaluation

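# Usage sketch (illustrative only, not part of the original module): evaluating
# an existing model against a holdout dataset with create_evaluation. The
# resource ids, paths and Namespace attributes are assumptions for the example.
def _example_create_evaluation(api, model_id, dataset_id):
    """Hedged example: evaluate `model_id` on `dataset_id`."""
    from argparse import Namespace
    args = Namespace(verbosity=1, cross_validation_rate=0)
    evaluation_args = {"name": "example evaluation"}
    return create_evaluation(model_id, dataset_id, evaluation_args, args,
                             api=api, path="./example_output",
                             session_file="session.log")
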
def remote_predict(model, test_dataset, batch_prediction_args, args,
                   api, resume, prediction_file=None, session_file=None,
                   path=None, log=None):
    """Computes a prediction for each entry in the `test_set`.

    Predictions are computed remotely using the batch predictions call.
    """

    if args.ensemble is not None:
        model_or_ensemble = args.ensemble
    else:
        model_or_ensemble = bigml.api.get_model_id(model)
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch prediction not found. Resuming.\n")
        resume, batch_prediction = c.checkpoint(
            c.is_batch_prediction_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        batch_prediction = create_batch_prediction(
            model_or_ensemble, test_dataset, batch_prediction_args,
            args, api, session_file=session_file, path=path, log=log)
    if not args.no_csv:
        api.download_batch_prediction(batch_prediction, prediction_file)
    if args.to_dataset:
        batch_prediction = bigml.api.check_resource(batch_prediction, api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_prediction['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch prediction dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_prediction_dataset", path,
                                    new_dataset, mode='a')

def remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                    api, resume, prediction_file=None, session_file=None,
                    path=None, log=None):
    """Computes a centroid for each entry in the `test_set`.

    Predictions are computed remotely using the batch centroid call.
    """

    cluster_id = bigml.api.get_cluster_id(cluster)
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch centroid not found. Resuming.\n")
        resume, batch_centroid = c.checkpoint(
            c.is_batch_centroid_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        batch_centroid = create_batch_centroid(
            cluster_id, test_dataset, batch_centroid_args,
            args, api, session_file=session_file, path=path, log=log)
    if not args.no_csv:
        file_name = api.download_batch_centroid(batch_centroid,
                                                prediction_file)
        if file_name is None:
            sys.exit("Failed downloading CSV.")
    if args.to_dataset:
        batch_centroid = bigml.api.check_resource(batch_centroid, api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_centroid['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch centroid dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_centroid_dataset", path,
                                    new_dataset, mode='a')

def create_batch_prediction(model_or_ensemble, test_dataset,
                            batch_prediction_args, verbosity,
                            api=None, session_file=None,
                            path=None, log=None):
    """Creates remote batch_prediction

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating batch prediction.\n")
    log_message(message, log_file=session_file, console=verbosity)
    batch_prediction = api.create_batch_prediction(model_or_ensemble,
                                                   test_dataset,
                                                   batch_prediction_args)
    log_created_resources("batch_prediction", path,
                          bigml.api.get_batch_prediction_id(batch_prediction),
                          open_mode='a')
    batch_prediction_id = check_resource_error(
        batch_prediction, "Failed to create batch prediction: ")
    try:
        batch_prediction = check_resource(batch_prediction,
                                          api.get_batch_prediction)
    except ValueError, exception:
        sys.exit("Failed to get a finished batch prediction: %s"
                 % str(exception))

def create_ensembles(datasets, ensemble_ids, ensemble_args, args,
                     number_of_ensembles=1,
                     api=None, path=None, session_file=None, log=None):
    """Create ensembles from input data

    """
    if api is None:
        api = bigml.api.BigML()
    ensembles = ensemble_ids[:]
    existing_ensembles = len(ensembles)
    model_ids = []
    ensemble_args_list = []
    if isinstance(ensemble_args, list):
        ensemble_args_list = ensemble_args
    if args.dataset_off and args.evaluate:
        args.test_dataset_ids = datasets[:]
    if not args.multi_label:
        datasets = datasets[existing_ensembles:]
    if number_of_ensembles > 0:
        message = dated("Creating %s.\n" %
                        plural("ensemble", number_of_ensembles))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        inprogress = []
        for i in range(0, number_of_ensembles):
            wait_for_available_tasks(inprogress,
                                     args.max_parallel_ensembles,
                                     api, "ensemble",
                                     wait_step=args.number_of_models)
            if ensemble_args_list:
                ensemble_args = ensemble_args_list[i]
            if args.dataset_off and args.evaluate:
                multi_dataset = args.test_dataset_ids[:]
                del multi_dataset[i + existing_ensembles]
                ensemble = api.create_ensemble(multi_dataset,
                                               ensemble_args,
                                               retries=None)
            else:
                ensemble = api.create_ensemble(datasets, ensemble_args,
                                               retries=None)
            ensemble_id = check_resource_error(ensemble,
                                               "Failed to create ensemble: ")
            log_message("%s\n" % ensemble_id, log_file=log)
            ensemble_ids.append(ensemble_id)
            inprogress.append(ensemble_id)
            ensembles.append(ensemble)
            log_created_resources("ensembles", path, ensemble_id,
                                  mode='a')
        models, model_ids = retrieve_ensembles_models(ensembles, api, path)
        if number_of_ensembles < 2 and args.verbosity:
            message = dated("Ensemble created: %s\n" %
                            get_url(ensemble))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, ensemble)
    return ensembles, ensemble_ids, models, model_ids

def remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                    api, resume, prediction_file=None, session_file=None,
                    path=None, log=None):
    """Computes a centroid for each entry in the `test_set`.

    Predictions are computed remotely using the batch centroid call.
    """

    cluster_id = bigml.api.get_cluster_id(cluster)
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch centroid not found. Resuming.\n")
        resume, batch_centroid = c.checkpoint(
            c.is_batch_centroid_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        batch_centroid = create_batch_centroid(
            cluster_id, test_dataset, batch_centroid_args,
            args, api, session_file=session_file, path=path, log=log)
    if not args.no_csv:
        api.download_batch_centroid(batch_centroid, prediction_file)
    if args.to_dataset:
        batch_centroid = bigml.api.check_resource(batch_centroid, api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_centroid['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch centroid dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_centroid_dataset", path,
                                    new_dataset, mode='a')

def create_forecast(time_series, input_data, forecast_args, args,
                    api=None, session_file=None, path=None, log=None):
    """Creates remote forecast

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating remote forecast.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    forecast = api.create_forecast(time_series, input_data, forecast_args,
                                   retries=None)
    log_created_resources("forecast", path,
                          bigml.api.get_forecast_id(forecast), mode='a')
    forecast_id = check_resource_error(forecast,
                                       "Failed to create forecast: ")
    try:
        forecast = check_resource(forecast, api.get_forecast,
                                  raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished forecast: %s" % str(exception))

def create_models(dataset, model_ids, model_args, args, api=None, path=None,
                  session_file=None, log=None):
    """Create remote models

    """
    if api is None:
        api = bigml.api.BigML()
    models = model_ids[:]
    existing_models = len(models)
    model_args_list = []
    if isinstance(model_args, list):
        model_args_list = model_args
    if args.number_of_models > 0:
        message = dated("Creating %s.\n" %
                        plural("model", args.number_of_models))
        log_message(message, log_file=session_file,
                    console=args.verbosity)

        single_model = args.number_of_models == 1 and existing_models == 0
        # if there's more than one model the first one must contain
        # the entire field structure to be used as reference.
        query_string = (FIELDS_QS if single_model
                        else ALL_FIELDS_QS)
        for i in range(0, args.number_of_models):
            if i % args.max_parallel_models == 0 and i > 0:
                try:
                    models[i - 1] = check_resource(
                        models[i - 1], api.get_model,
                        query_string=query_string)
                except ValueError, exception:
                    sys.exit("Failed to get a finished model: %s" %
                             str(exception))
            if model_args_list:
                model_args = model_args_list[i]
            if args.cross_validation_rate > 0:
                new_seed = get_basic_seed(i + existing_models)
                model_args.update(seed=new_seed)
            model = api.create_model(dataset, model_args)
            model_id = check_resource_error(model,
                                            "Failed to create model: ")
            log_message("%s\n" % model_id, log_file=log)
            model_ids.append(model_id)
            models.append(model)
            log_created_resources("models", path, model_id,
                                  open_mode='a')

        if args.number_of_models < 2 and args.verbosity:
            if bigml.api.get_status(model)['code'] != bigml.api.FINISHED:
                try:
                    model = check_resource(model, api.get_model,
                                           query_string=query_string)
                except ValueError, exception:
                    sys.exit("Failed to get a finished model: %s" %
                             str(exception))
                models[0] = model
            message = dated("Model created: %s.\n" %
                            get_url(model))
            log_message(message, log_file=session_file,
                        console=args.verbosity)

def create_samples(datasets, sample_ids, sample_args, args,
                   api=None, path=None, session_file=None, log=None):
    """Create remote samples

    """
    if api is None:
        api = bigml.api.BigML()
    samples = sample_ids[:]
    existing_samples = len(samples)
    sample_args_list = []
    datasets = datasets[existing_samples:]
    # if resuming and all samples were created, there will be no datasets left
    if datasets:
        if isinstance(sample_args, list):
            sample_args_list = sample_args
        # Only one sample per command, at present
        number_of_samples = 1
        max_parallel_samples = 1
        message = dated("Creating %s.\n" %
                        plural("sample", number_of_samples))
        log_message(message, log_file=session_file,
                    console=args.verbosity)

        inprogress = []
        for i in range(0, number_of_samples):
            wait_for_available_tasks(inprogress, max_parallel_samples,
                                     api, "sample")
            if sample_args_list:
                sample_args = sample_args_list[i]

            sample = api.create_sample(datasets[i], sample_args,
                                       retries=None)
            sample_id = check_resource_error(sample,
                                             "Failed to create sample: ")
            log_message("%s\n" % sample_id, log_file=log)
            sample_ids.append(sample_id)
            inprogress.append(sample_id)
            samples.append(sample)
            log_created_resources("samples", path, sample_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(sample)['code'] != bigml.api.FINISHED:
                try:
                    sample = check_resource(sample, api.get_sample,
                                            raise_on_error=True)
                except Exception, exception:
                    sys.exit("Failed to get a finished sample: %s" %
                             str(exception))
                samples[0] = sample
            message = dated("Sample created: %s\n" %
                            get_url(sample))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, sample)

def create_models(dataset, model_ids, model_args, args, api=None, path=None,
                  session_file=None, log=None):
    """Create remote models

    """
    if api is None:
        api = bigml.api.BigML()
    models = model_ids[:]
    existing_models = len(models)
    last_model = None
    if args.number_of_models > 0:
        message = dated("Creating %s.\n" %
                        plural("model", args.number_of_models))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        for i in range(0, args.number_of_models):
            if i % args.max_parallel_models == 0 and i > 0:
                try:
                    models[i - 1] = check_resource(models[i - 1],
                                                   api.get_model,
                                                   query_string=FIELDS_QS)
                except ValueError, exception:
                    sys.exit("Failed to get a finished model: %s" %
                             str(exception))
            if args.cross_validation_rate > 0:
                new_seed = get_basic_seed(i + existing_models)
                model_args.update(seed=new_seed)
            model = api.create_model(dataset, model_args)
            log_message("%s\n" % model['resource'], log_file=log)
            model_ids.append(model['resource'])
            models.append(model)
            log_created_resources("models", path,
                                  bigml.api.get_model_id(model),
                                  open_mode='a')
            check_resource_error(
                model, "Failed to create model %s:" % model['resource'])
        if args.number_of_models < 2 and args.verbosity:
            if bigml.api.get_status(model)['code'] != bigml.api.FINISHED:
                try:
                    model = check_resource(model, api.get_model,
                                           query_string=FIELDS_QS)
                except ValueError, exception:
                    sys.exit("Failed to get a finished model: %s" %
                             str(exception))
                models[0] = model
            message = dated("Model created: %s.\n" % get_url(model))
            log_message(message, log_file=session_file,
                        console=args.verbosity)

def remote_anomaly_score(anomaly, test_dataset, batch_anomaly_score_args, args,
                         api, resume, prediction_file=None, session_file=None,
                         path=None, log=None):
    """Computes an anomaly score for each entry in the `test_set`.

    Predictions are computed remotely using the batch anomaly score call.
    """

    anomaly_id = bigml.api.get_anomaly_id(anomaly)
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch anomaly score not found. Resuming.\n")
        resume, batch_anomaly_score = c.checkpoint(
            c.is_batch_anomaly_score_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        batch_anomaly_score = create_batch_anomaly_score(
            anomaly_id, test_dataset, batch_anomaly_score_args,
            args, api, session_file=session_file, path=path, log=log)
    if not args.no_csv:
        file_name = api.download_batch_anomaly_score(batch_anomaly_score,
                                                     prediction_file)
        if file_name is None:
            sys.exit("Failed downloading CSV.")
    if args.to_dataset:
        batch_anomaly_score = bigml.api.check_resource(batch_anomaly_score,
                                                       api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_anomaly_score['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch anomaly score dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_anomaly_score_dataset", path,
                                    new_dataset, mode='a')

def create_clusters(datasets, cluster_ids, cluster_args, args, api=None,
                    path=None, session_file=None, log=None):
    """Create remote clusters

    """
    if api is None:
        api = bigml.api.BigML()
    clusters = cluster_ids[:]
    existing_clusters = len(clusters)
    cluster_args_list = []
    datasets = datasets[existing_clusters:]
    # if resuming and all clusters were created, there will be no datasets left
    if datasets:
        if isinstance(cluster_args, list):
            cluster_args_list = cluster_args
        # Only one cluster per command, at present
        number_of_clusters = 1
        message = dated("Creating %s.\n" %
                        plural("cluster", number_of_clusters))
        log_message(message, log_file=session_file,
                    console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_clusters):
            wait_for_available_tasks(inprogress, args.max_parallel_clusters,
                                     api, "cluster")
            if cluster_args_list:
                cluster_args = cluster_args_list[i]

            cluster = api.create_cluster(datasets, cluster_args,
                                         retries=None)
            cluster_id = check_resource_error(cluster,
                                              "Failed to create cluster: ")
            log_message("%s\n" % cluster_id, log_file=log)
            cluster_ids.append(cluster_id)
            inprogress.append(cluster_id)
            clusters.append(cluster)
            log_created_resources("clusters", path, cluster_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(cluster)['code'] != bigml.api.FINISHED:
                try:
                    cluster = check_resource(cluster, api.get_cluster,
                                             query_string=query_string,
                                             raise_on_error=True)
                except Exception, exception:
                    sys.exit("Failed to get a finished cluster: %s" %
                             str(exception))
                clusters[0] = cluster
            message = dated("Cluster created: %s\n" %
                            get_url(cluster))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, cluster)

def remote_prediction(model, test_dataset, batch_prediction_args, args,
                      api, resume, prediction_file=None, session_file=None,
                      path=None, log=None):
    """Computes a prediction for each entry in the `test_set`.

    Predictions are computed remotely using the batch prediction call.
    """

    model_id = bigml.api.get_resource_id(model)
    batch_prediction_args.update({"probability": True,
                                  "confidence": False})
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch prediction not found. Resuming.\n")
        resume, batch_prediction = c.checkpoint(
            c.is_batch_prediction_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        batch_prediction = create_batch_prediction(
            model_id, test_dataset, batch_prediction_args,
            args, api, session_file=session_file, path=path, log=log)
    if not args.no_csv:
        file_name = api.download_batch_prediction(batch_prediction,
                                                  prediction_file)
        if file_name is None:
            sys.exit("Failed downloading CSV.")
    if args.to_dataset:
        batch_prediction = bigml.api.check_resource(batch_prediction, api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_prediction['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch prediction dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_prediction_dataset", path,
                                    new_dataset, mode='a')

def create_fusion(models, fusion, fusion_args, args, api=None,
                  path=None, session_file=None, log=None):
    """Create remote fusion

    """
    if api is None:
        api = bigml.api.BigML()
    fusions = []
    fusion_ids = []
    if fusion is not None:
        fusions = [fusion]
        fusion_ids = [fusion]
    # if resuming and all fusions were created
    if models:
        # Only one fusion per command, at present
        message = dated("Creating fusion.\n")
        log_message(message, log_file=session_file,
                    console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        wait_for_available_tasks(inprogress, args.max_parallel_fusions,
                                 api, "fusion")

        fusion = api.create_fusion(models, fusion_args, retries=None)
        fusion_id = check_resource_error(fusion,
                                         "Failed to create fusion: ")
        log_message("%s\n" % fusion_id, log_file=log)
        fusion_ids.append(fusion_id)
        inprogress.append(fusion_id)
        fusions.append(fusion)
        log_created_resources("fusions", path, fusion_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(fusion)['code'] != bigml.api.FINISHED:
                try:
                    fusion = check_resource(fusion, api.get_fusion,
                                            query_string=query_string,
                                            raise_on_error=True)
                except Exception, exception:
                    sys.exit("Failed to get a finished fusion: %s" %
                             str(exception))
                fusions[0] = fusion
            message = dated("Fusion created: %s\n" %
                            get_url(fusion))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, fusion)

def create_ensembles(datasets, ensemble_ids, ensemble_args, args,
                     number_of_ensembles=1,
                     api=None, path=None, session_file=None, log=None):
    """Create ensembles from input data

    """
    if api is None:
        api = bigml.api.BigML()
    ensembles = ensemble_ids[:]
    model_ids = []
    ensemble_args_list = []
    if isinstance(ensemble_args, list):
        ensemble_args_list = ensemble_args
    if number_of_ensembles > 0:
        message = dated("Creating %s.\n" %
                        plural("ensemble", number_of_ensembles))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        query_string = ALL_FIELDS_QS
        inprogress = []
        for i in range(0, number_of_ensembles):
            wait_for_available_tasks(inprogress,
                                     args.max_parallel_ensembles,
                                     api.get_ensemble, "ensemble",
                                     query_string=query_string,
                                     wait_step=args.number_of_models)
            if ensemble_args_list:
                ensemble_args = ensemble_args_list[i]
            ensemble = api.create_ensemble(datasets, ensemble_args)
            ensemble_id = check_resource_error(ensemble,
                                               "Failed to create ensemble: ")
            log_message("%s\n" % ensemble_id, log_file=log)
            ensemble_ids.append(ensemble_id)
            inprogress.append(ensemble_id)
            ensembles.append(ensemble)
            log_created_resources("ensembles", path, ensemble_id,
                                  open_mode='a')
        models, model_ids = retrieve_ensembles_models(ensembles, api, path)
        if number_of_ensembles < 2 and args.verbosity:
            message = dated("Ensemble created: %s.\n" %
                            get_url(ensemble))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, ensemble)
    return ensembles, ensemble_ids, models, model_ids

def create_dataset(origin_resource, dataset_args, args, api=None,
                   path=None, session_file=None, log=None, dataset_type=None):
    """Creates remote dataset from source, dataset, cluster or datasets list

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating dataset.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    check_fields_struct(dataset_args, "dataset")

    # if --json-query or --sql-query are used and no names are set for
    # the datasets, we create default naming to A, B, C, etc. for the datasets
    # to be used as origin
    if ((hasattr(args, 'sql_query') and args.sql_query) or \
            (hasattr(args, 'json_query') and args.json_query)) and \
            isinstance(origin_resource, list) and \
            ((not isinstance(origin_resource[0], dict)) or \
            origin_resource[0].get("name") is None):
        for index, element in enumerate(origin_resource):
            if index < len(DS_NAMES):
                if isinstance(element, dict):
                    if element.get("resource") is not None:
                        element = {"id": element["resource"]}
                    element.update({"name": DS_NAMES[index]})
                    origin_resource[index] = element
                elif isinstance(element, basestring):
                    origin_resource[index] = {"id": element,
                                              "name": DS_NAMES[index]}

    dataset = api.create_dataset(origin_resource, dataset_args, retries=None)
    suffix = "_" + dataset_type if dataset_type else ""
    log_created_resources("dataset%s" % suffix, path,
                          bigml.api.get_dataset_id(dataset), mode='a')
    dataset_id = check_resource_error(dataset, "Failed to create dataset: ")
    try:
        dataset = check_resource(dataset, api.get_dataset,
                                 query_string=ALL_FIELDS_QS,
                                 raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished dataset: %s" % str(exception))

def create_evaluations(model_ids, datasets, evaluation_args, args, api=None,
                       path=None, session_file=None, log=None,
                       existing_evaluations=0):
    """Create evaluations for a list of models

       ``model_ids``: list of model ids to create an evaluation of
       ``datasets``: dataset objects or ids to evaluate with
       ``evaluation_args``: arguments for the ``create_evaluation`` call
       ``args``: input values for bigmler flags
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file
       ``existing_evaluations``: evaluations found when attempting resume

    """
    evaluations = []
    dataset = datasets[0]
    evaluation_args_list = []
    if isinstance(evaluation_args, list):
        evaluation_args_list = evaluation_args
    if api is None:
        api = bigml.api.BigML()
    remaining_ids = model_ids[existing_evaluations:]
    number_of_evaluations = len(remaining_ids)
    message = dated("Creating evaluations.\n")
    log_message(message, log_file=session_file,
                console=args.verbosity)

    inprogress = []
    for i in range(0, number_of_evaluations):
        model = remaining_ids[i]
        wait_for_available_tasks(inprogress, args.max_parallel_evaluations,
                                 api.get_evaluation, "evaluation")

        if evaluation_args_list != []:
            evaluation_args = evaluation_args_list[i]
        if args.cross_validation_rate > 0:
            new_seed = get_basic_seed(i + existing_evaluations)
            evaluation_args.update(seed=new_seed)
        evaluation = api.create_evaluation(model, dataset, evaluation_args)
        evaluation_id = check_resource_error(evaluation,
                                             "Failed to create evaluation: ")
        inprogress.append(evaluation_id)
        log_created_resources("evaluations", path, evaluation_id,
                              open_mode='a')
        evaluations.append(evaluation)
        log_message("%s\n" % evaluation['resource'], log_file=log)
    return evaluations

def create_evaluations(model_ids, dataset, evaluation_args, args, api=None,
                       path=None, session_file=None, log=None,
                       existing_evaluations=0):
    """Create evaluations for a list of models

       ``model_ids``: list of model ids to create an evaluation of
       ``dataset``: dataset object or id to evaluate with
       ``evaluation_args``: arguments for the ``create_evaluation`` call
       ``args``: input values for bigmler flags
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file
       ``existing_evaluations``: evaluations found when attempting resume

    """
    evaluations = []
    if api is None:
        api = bigml.api.BigML()
    number_of_evaluations = len(model_ids)
    message = dated("Creating evaluations.\n")
    log_message(message, log_file=session_file,
                console=args.verbosity)
    for i in range(0, number_of_evaluations):
        model = model_ids[i]
        if i % args.max_parallel_evaluations == 0 and i > 0:
            try:
                evaluations[i - 1] = check_resource(evaluations[i - 1],
                                                    api.get_evaluation)
            except ValueError, exception:
                sys.exit("Failed to get a finished evaluation: %s" %
                         str(exception))
        if args.cross_validation_rate > 0:
            new_seed = get_basic_seed(i + existing_evaluations)
            evaluation_args.update(seed=new_seed)
        evaluation = api.create_evaluation(model, dataset, evaluation_args)
        log_created_resources("evaluations", path,
                              bigml.api.get_evaluation_id(evaluation),
                              open_mode='a')
        check_resource_error(evaluation, "Failed to create evaluation: ")
        evaluations.append(evaluation)
        log_message("%s\n" % evaluation['resource'], log_file=log)

def create_ensemble(dataset, ensemble_args, args, api=None, path=None,
                    session_file=None, log=None):
    """Create ensemble from input data

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating ensemble.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    ensemble = api.create_ensemble(dataset, ensemble_args)
    log_created_resources("ensemble", path,
                          bigml.api.get_ensemble_id(ensemble))
    check_resource_error(ensemble, "Failed to create ensemble: ")
    log_message("%s\n" % ensemble['resource'], log_file=log)
    return ensemble

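# Usage sketch (illustrative only, not part of the original module): building
# an ensemble from an existing dataset id with create_ensemble. The dataset id,
# paths and Namespace attributes are assumptions for the example.
def _example_create_ensemble(api, dataset_id):
    """Hedged example: create a 10-model ensemble from `dataset_id`."""
    from argparse import Namespace
    args = Namespace(verbosity=1)
    ensemble_args = {"number_of_models": 10}
    return create_ensemble(dataset_id, ensemble_args, args, api=api,
                           path="./example_output",
                           session_file="session.log")
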
def ensemble_per_label(labels, dataset, api, args, resume, fields=None,
                       multi_label_data=None, session_file=None,
                       path=None, log=None):
    """Creates an ensemble per label for multi-label datasets

    """
    ensemble_ids = []
    ensembles = []
    model_ids = []
    models = []
    number_of_ensembles = len(labels)
    if resume:
        resume, ensemble_ids = c.checkpoint(
            c.are_ensembles_created, path, number_of_ensembles,
            debug=args.debug)
        ensembles = ensemble_ids
        if not resume:
            message = u.dated("Found %s ensembles out of %s."
                              " Resuming.\n"
                              % (len(ensemble_ids),
                                 number_of_ensembles))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            # erase models' info that will be rebuilt
            u.log_created_resources("models", path, None,
                                    mode='w')
    number_of_ensembles = len(labels) - len(ensemble_ids)
    ensemble_args_list = r.set_label_ensemble_args(
        args, labels, multi_label_data,
        number_of_ensembles, fields)

    # create ensembles changing the input_field to select
    # only one label at a time
    (ensembles, ensemble_ids,
     models, model_ids) = r.create_ensembles(
         dataset, ensemble_ids, ensemble_args_list, args,
         number_of_ensembles, api, path, session_file, log)
    return ensembles, ensemble_ids, models, model_ids, resume

def create_ensembles(dataset, ensemble_ids, ensemble_args, args,
                     number_of_ensembles=1,
                     api=None, path=None, session_file=None, log=None):
    """Create ensembles from input data

    """
    if api is None:
        api = bigml.api.BigML()
    ensembles = ensemble_ids[:]
    model_ids = []
    ensemble_args_list = []
    if isinstance(ensemble_args, list):
        ensemble_args_list = ensemble_args
    if number_of_ensembles > 0:
        message = dated("Creating %s.\n" %
                        plural("ensemble", number_of_ensembles))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        query_string = ALL_FIELDS_QS
        for i in range(0, number_of_ensembles):
            if i % args.max_parallel_ensembles == 0 and i > 0:
                try:
                    ensembles[i - 1] = check_resource(
                        ensembles[i - 1], api.get_ensemble,
                        query_string=query_string)
                except ValueError, exception:
                    sys.exit("Failed to get a finished ensemble: %s" %
                             str(exception))
            if ensemble_args_list:
                ensemble_args = ensemble_args_list[i]
            ensemble = api.create_ensemble(dataset, ensemble_args)
            ensemble_id = check_resource_error(ensemble,
                                               "Failed to create ensemble: ")
            log_message("%s\n" % ensemble_id, log_file=log)
            ensemble_ids.append(ensemble_id)
            ensembles.append(ensemble)
            log_created_resources("ensembles", path, ensemble_id,
                                  open_mode='a')
        models, model_ids = retrieve_ensembles_models(ensembles, api, path)
        if number_of_ensembles < 2 and args.verbosity:
            message = dated("Ensemble created: %s.\n" %
                            get_url(ensemble))
            log_message(message, log_file=session_file,
                        console=args.verbosity)

def create_script(source_code, script_args, args, api=None, path=None,
                  session_file=None, log=None):
    """Creates remote script

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating script \"%s\".\n" % script_args["name"])
    log_message(message, log_file=session_file, console=args.verbosity)
    script = api.create_script(source_code, script_args)
    log_created_resources("scripts", path,
                          bigml.api.get_script_id(script), mode='a')
    script_id = check_resource_error(script, "Failed to create script: ")
    try:
        script = check_resource(script, api.get_script,
                                raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a compiled script: %s" % str(exception))

def create_dataset(source_or_dataset, dataset_args, args, api=None,
                   path=None, session_file=None, log=None, dataset_type=None):
    """Creates remote dataset

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating dataset.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    dataset = api.create_dataset(source_or_dataset, dataset_args)
    suffix = "_" + dataset_type if dataset_type else ""
    log_created_resources("dataset%s" % suffix, path,
                          bigml.api.get_dataset_id(dataset))
    check_resource_error(dataset, "Failed to create dataset: ")
    try:
        dataset = check_resource(dataset, api.get_dataset)
    except ValueError, exception:
        sys.exit("Failed to get a finished dataset: %s" % str(exception))

def ensemble_per_label(labels, dataset, fields, objective_field, api, args,
                       resume, name=None, description=None, model_fields=None,
                       multi_label_data=None, session_file=None,
                       path=None, log=None):
    """Creates an ensemble per label for multi-label datasets

    """
    ensemble_ids = []
    ensembles = []
    model_ids = []
    models = []
    number_of_ensembles = len(labels)
    if resume:
        resume, ensemble_ids = c.checkpoint(
            c.are_ensembles_created, path, number_of_ensembles,
            debug=args.debug)
        ensembles = ensemble_ids
        if not resume:
            message = u.dated("Found %s ensembles out of %s."
                              " Resuming.\n"
                              % (len(ensemble_ids),
                                 number_of_ensembles))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            # erase models' info that will be rebuilt
            u.log_created_resources("models", path, None,
                                    open_mode='w')
    number_of_ensembles = len(labels) - len(ensemble_ids)
    ensemble_args_list = r.set_label_ensemble_args(
        name, description, args, labels, multi_label_data,
        number_of_ensembles, fields, model_fields, objective_field)

    # create ensembles changing the input_field to select
    # only one label at a time
    (ensembles, ensemble_ids,
     models, model_ids) = r.create_ensembles(
         dataset, ensemble_ids, ensemble_args_list, args,
         number_of_ensembles, api, path, session_file, log)
    return ensembles, ensemble_ids, models, model_ids, resume

def create_model(cluster, model_args, args, api=None,
                 path=None, session_file=None, log=None, model_type=None):
    """Creates remote model from cluster and centroid

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating model.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    model = api.create_model(cluster, model_args, retries=None)
    suffix = "" if model_type is None else "_%s" % model_type
    log_created_resources("models%s" % suffix, path,
                          bigml.api.get_model_id(model), mode='a')
    model_id = check_resource_error(model, "Failed to create model: ")
    try:
        model = check_resource(model, api.get_model,
                               query_string=ALL_FIELDS_QS,
                               raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished model: %s" % str(exception))

def create_project(project_args, args, api=None,
                   session_file=None, path=None, log=None):
    """Creates remote project

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating project.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    project = api.create_project(project_args)
    log_created_resources("project", path,
                          bigml.api.get_project_id(project), mode='a')
    project_id = check_resource_error(project, "Failed to create project: ")
    try:
        project = check_resource(project, api=api, raise_on_error=True)
    except Exception, exception:
        sys.exit("Failed to get a finished project: %s" % str(exception))

def remote_predict(model, test_dataset, batch_prediction_args, args,
                   api, resume, prediction_file=None, session_file=None,
                   path=None, log=None):
    """Computes a prediction for each entry in the `test_set`.

    Predictions are computed remotely using the batch predictions call.
    """

    if args.ensemble is not None and not args.dataset_off:
        model_or_ensemble = args.ensemble
    elif args.dataset_off:
        if hasattr(args, "ensemble_ids_") and args.ensemble_ids_:
            models = args.ensemble_ids_
        else:
            models = args.model_ids_
        test_datasets = args.test_dataset_ids
    else:
        model_or_ensemble = bigml.api.get_model_id(model)
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch prediction not found. Resuming.\n")
        resume, batch_prediction = c.checkpoint(
            c.is_batch_prediction_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        if not args.dataset_off:
            batch_prediction = create_batch_prediction(
                model_or_ensemble, test_dataset, batch_prediction_args,
                args, api, session_file=session_file, path=path, log=log)
        else:
            batch_predictions = []
            for index, test_dataset_n in enumerate(test_datasets):
                batch_predictions.append(create_batch_prediction(
                    models[index], test_dataset_n, batch_prediction_args,
                    args, api, session_file=session_file, path=path,
                    log=log))
    if not args.no_csv and not args.dataset_off:
        file_name = api.download_batch_prediction(batch_prediction,
                                                  prediction_file)
        if file_name is None:
            sys.exit("Failed downloading CSV.")
    if args.to_dataset and not args.dataset_off:
        batch_prediction = bigml.api.check_resource(batch_prediction, api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_prediction['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch prediction dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_prediction_dataset", path,
                                    new_dataset, mode='a')
    elif args.to_dataset and args.dataset_off:
        predictions_datasets = []
        for batch_prediction in batch_predictions:
            batch_prediction = bigml.api.check_resource(batch_prediction,
                                                        api=api)
            new_dataset = bigml.api.get_dataset_id(
                batch_prediction['object']['output_dataset_resource'])
            if new_dataset is not None:
                predictions_datasets.append(new_dataset)
                message = u.dated("Batch prediction dataset created: %s\n" %
                                  u.get_url(new_dataset))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                u.log_created_resources("batch_prediction_dataset", path,
                                        new_dataset, mode='a')
        multi_dataset = api.create_dataset(predictions_datasets)
        log_created_resources("dataset_pred", path,
                              bigml.api.get_dataset_id(multi_dataset),
                              mode='a')
        dataset_id = check_resource_error(multi_dataset,
                                          "Failed to create dataset: ")
        try:
            multi_dataset = api.check_resource(multi_dataset)
        except ValueError, exception:
            sys.exit("Failed to get a finished dataset: %s" %
                     str(exception))
        message = dated("Predictions dataset created: %s\n" %
                        get_url(multi_dataset))
        log_message(message, log_file=session_file, console=args.verbosity)
        log_message("%s\n" % dataset_id, log_file=log)
        if not args.no_csv:
            file_name = api.download_dataset(dataset_id, prediction_file)
            if file_name is None:
                sys.exit("Failed downloading CSV.")

def create_package(args, api, command_obj, resume=False):
    """Creates the package whizzml resources as referred in the metadata.json
    file.

    """
    common_options = command_obj.common_options
    set_subcommand_file(args.output_dir)
    if resume:
        retrieve_subcommands()
    # read the metadata.json information
    message = ('Reading the metadata.json files.........\n')
    u.log_message(message, log_file=session_file,
                  console=args.verbosity)
    package_dir = args.package_dir
    output_dir = args.output_dir
    metadata_file = os.path.join(package_dir, METADATA_FILE)
    metadata = None

    with open(metadata_file) as metadata_handler:
        metadata = json.load(metadata_handler)
    # recurse into components/directories, if any
    if metadata.get("kind") == "package" and 'components' in metadata:
        components = metadata.get("components")
        for component in components:
            message = ('Inspecting component %s.........\n' % component)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            args.package_dir = os.path.join(package_dir, component)
            create_package(args, api, command_obj, resume=resume)
            args.package_dir = package_dir
    else:
        # create libraries or scripts
        imports = []
        if metadata.get("imports") is not None:
            lib_imports = metadata.get("imports")
            for lib_import in lib_imports:
                args.package_dir = os.path.join(package_dir, lib_import)
                if args.embed_libs:
                    library_ref = create_package(
                        args, api, command_obj, resume=resume)
                    u.log_created_resources("imports", output_dir,
                                            library_ref)
                else:
                    try:
                        # try to read the library id, if it is already there
                        library_ref = read_library_id(os.path.join(
                            output_dir,
                            os.path.basename(args.package_dir)))
                    except IOError:
                        library_ref = create_package(
                            args, api, command_obj, resume=resume)
                        library_ref = read_library_id(os.path.join(
                            output_dir,
                            os.path.basename(args.package_dir)))
                    imports.append(library_ref)
                args.package_dir = package_dir
        # read the metadata.json information
        message = ('Creating the %s.........\n' % metadata.get("kind"))
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        if metadata.get("kind") in WHIZZML_RESOURCES:
            whizzml_code = os.path.normpath(os.path.join(
                args.package_dir,
                metadata.get("source_code", "%s.whizzml" %
                             metadata.get("kind"))))
            if args.embed_libs and metadata.get("kind") == WHIZZML_LIBRARY:
                return whizzml_code

            args.output_dir = os.path.join(output_dir,
                                           os.path.basename(package_dir))
            # creating command to create the resource
            command = COMMANDS[metadata.get("kind")] % (whizzml_code,
                                                        args.output_dir)
            command_args = command.split()
            bigml.util.check_dir(args.output_dir)

            # getting inputs and outputs for the script from metadata
            if "inputs" in metadata:
                inputs_file = os.path.join(args.output_dir, "inputs.json")
                with open(inputs_file, "w") as inputs_handler:
                    json.dump(metadata.get("inputs"), inputs_handler)
                command_args.extend(["--declare-inputs", inputs_file])
            if "outputs" in metadata:
                outputs_file = os.path.join(args.output_dir, "outputs.json")
                with open(outputs_file, "w") as outputs_handler:
                    json.dump(metadata.get("outputs"), outputs_handler)
                command_args.extend(["--declare-outputs", outputs_file])
            if "description" in metadata:
                desc_file = os.path.join(args.output_dir, "description.txt")
                with open(desc_file, "w") as desc_handler:
                    desc_handler.write(metadata.get("description"))
                command_args.extend(["--description", desc_file])
            if metadata.get("name"):
                command_args.extend(["--name", metadata.get("name")])
            if args.tag:
                for tag in args.tag:
                    command_args.extend(["--tag", tag])

            # adding imports, if any
            if imports:
                if args.embed_libs:
                    # imports to be embedded are in the same output directory
                    command_args.extend(
                        ["--embedded-imports",
                         os.path.join(output_dir, "imports")])
                else:
                    # imports to be referenced by ID
                    command_args.extend(["--imports", ",".join(imports)])
            command_args.extend(["--verbosity", str(args.verbosity)])
            command_obj.propagate(command_args)
            # u.add_api_context(command_args, args)
            if args.upgrade:
                command_args.extend(["--upgrade"])

            if resume:
                next_command = subcommand_list.pop()
                if different_command(next_command, command):
                    resume = False
                    u.sys_log_message(command, log_file=subcommand_file)
                    execute_dispatcher(args=command_args)
                elif not subcommand_list:
                    execute_dispatcher(args=['execute', '--resume'])
                    resume = False
            else:
                u.sys_log_message(command, log_file=subcommand_file)
                execute_dispatcher(args=command_args)
            args.output_dir = output_dir
            return whizzml_code

def models_processing(dataset, models, model_ids, name, description, test_set,
                      objective_field, fields, model_fields, api, args, resume,
                      session_file=None, path=None, log=None):
    """Creates or retrieves models from the input data

    """
    log_models = False
    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set up.
    if (dataset and not args.model and not model_ids and not args.no_model
            and not args.ensemble):
        model_ids = []
        models = []
        if args.number_of_models > 1:
            # Ensemble of models
            ensemble, resume = ensemble_processing(
                dataset, name, description, objective_field, fields,
                api, args, resume,
                session_file=session_file, path=path, log=log)
            args.ensemble = bigml.api.get_ensemble_id(ensemble)
            log_models = True
        else:
            # Cross-validation case: we create 2 * n models to be validated
            # holding out an n% of data
            if args.cross_validation_rate > 0:
                if args.number_of_evaluations > 0:
                    args.number_of_models = args.number_of_evaluations
                else:
                    args.number_of_models = int(MONTECARLO_FACTOR *
                                                args.cross_validation_rate)
            if resume:
                resume, model_ids = c.checkpoint(
                    c.are_models_created, path, args.number_of_models,
                    debug=args.debug)
                if not resume:
                    message = u.dated("Found %s models out of %s. Resuming.\n"
                                      % (len(model_ids),
                                         args.number_of_models))
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)

                models = model_ids
                args.number_of_models -= len(model_ids)

            model_args = r.set_model_args(name, description, args,
                                          objective_field, fields,
                                          model_fields)
            models, model_ids = r.create_models(dataset, models,
                                                model_args, args, api,
                                                path, session_file, log)
    # If a model is provided, we use it.
    elif args.model:
        model_ids = [args.model]
        models = model_ids[:]

    elif args.models or args.model_tag:
        models = model_ids[:]

    if args.ensemble:
        ensemble = r.get_ensemble(args.ensemble, api, args.verbosity,
                                  session_file)
        model_ids = ensemble['object']['models']
        if log_models:
            for model_id in model_ids:
                u.log_created_resources("models", path, model_id,
                                        open_mode='a')

        models = model_ids[:]

    # If we are going to predict we must retrieve the models
    if model_ids and test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, resume

def retrieve_ensembles_models(ensembles, api, path=None):
    """Retrieves the models associated to a list of ensembles

    """
    models = []
    model_ids = []
    for index in range(0, len(ensembles)):
        ensemble = ensembles[index]
        if (isinstance(ensemble, basestring) or
                bigml.api.get_status(ensemble)['code'] != bigml.api.FINISHED):
            try:
                ensemble = check_resource(ensemble, api.get_ensemble)
                ensembles[index] = ensemble
            except ValueError, exception:
                sys.exit("Failed to get a finished ensemble: %s" %
                         str(exception))
        model_ids.extend(ensemble['object']['models'])
    if path is not None:
        for model_id in model_ids:
            log_created_resources("models", path, model_id, open_mode='a')
    models = model_ids[:]
    models[0] = check_resource(models[0], api.get_model,
                               query_string=ALL_FIELDS_QS)
    return models, model_ids


def get_ensemble(ensemble, api=None, verbosity=True, session_file=None):
    """Retrieves remote ensemble in its actual status

    """
    if api is None:
        api = bigml.api.BigML()
    if (isinstance(ensemble, basestring) or
            bigml.api.get_status(ensemble)['code'] != bigml.api.FINISHED):
        message = dated("Retrieving ensemble. %s\n" %

def create_package(args, api, command_obj, resume=False): """Creates the package whizzml resources as referred in the metadata.json file. """ set_subcommand_file(args.output_dir) if resume: retrieve_subcommands() # read the metadata.json information message = ('Reading the metadata.json files.........\n') u.log_message(message, log_file=session_file, console=args.verbosity) package_dir = args.package_dir output_dir = args.output_dir metadata_file = os.path.join(package_dir, METADATA_FILE) metadata = None with open(metadata_file) as metadata_handler: metadata = json.load(metadata_handler) # recurse into components/directories, if any if metadata.get("kind") == "package" and 'components' in metadata: components = metadata.get("components") for component in components: message = ('Inspecting component %s.........\n' % component) u.log_message(message, log_file=session_file, console=args.verbosity) args.package_dir = os.path.join(package_dir, component) create_package(args, api, command_obj, resume=resume) args.package_dir = package_dir else: # create libraries or scripts imports = [] category = str(metadata.get("category", DFT_CATEGORY)) if metadata.get("imports") is not None: lib_imports = metadata.get("imports") for lib_import in lib_imports: args.package_dir = os.path.join(package_dir, lib_import) if args.embed_libs: library_ref = create_package( \ args, api, command_obj, resume=resume) u.log_created_resources("imports", output_dir, library_ref) else: try: # try to read the library id, if it is already there library_ref = read_library_id(os.path.join( \ output_dir, os.path.basename(args.package_dir))) except IOError: library_ref = create_package( \ args, api, command_obj, resume=resume) library_ref = read_library_id(os.path.join( \ output_dir, os.path.basename(args.package_dir))) imports.append(library_ref) args.package_dir = package_dir # read the metadata.json information message = ('Creating the %s.........\n' % metadata.get("kind")) u.log_message(message, log_file=session_file, console=args.verbosity) if metadata.get("kind") in WHIZZML_RESOURCES: whizzml_code = os.path.normpath(os.path.join(args.package_dir, \ metadata.get("source_code", "%s.whizzml" % \ metadata.get("kind")))) if args.embed_libs and metadata.get("kind") == WHIZZML_LIBRARY: return whizzml_code args.output_dir = os.path.join(output_dir, \ os.path.basename(package_dir)) # creating command to create the resource command = COMMANDS[metadata.get("kind")] % (whizzml_code, args.output_dir) command_args = command.split() bigml.util.check_dir(args.output_dir) # getting inputs and outputs for the script from metadata if "inputs" in metadata: inputs_file = os.path.join(args.output_dir, "inputs.json") u.write_to_utf8(inputs_file, json.dumps(metadata.get("inputs"))) command_args.extend(["--declare-inputs", inputs_file]) if "outputs" in metadata: outputs_file = os.path.join(args.output_dir, "outputs.json") u.write_to_utf8(outputs_file, json.dumps(metadata.get("outputs"))) command_args.extend(["--declare-outputs", outputs_file]) if "description" in metadata: desc_file = os.path.join(args.output_dir, "description.txt") u.write_to_utf8(desc_file, metadata.get("description")) command_args.extend(["--description", desc_file]) if metadata.get("name"): command_args.extend(["--name", metadata.get("name")]) if args.tag: for tag in args.tag: command_args.extend(["--tag", tag]) command_args.extend(["--category", category]) # adding imports, if any if imports: if args.embed_libs: # imports to be embedded are in the same output directory 
                    command_args.extend( \
                        ["--embedded-imports",
                         os.path.join(output_dir, "imports")])
                else:
                    # imports to be referenced by ID
                    command_args.extend(["--imports", ",".join(imports)])
            command_args.extend(["--verbosity", str(args.verbosity)])
            command_obj.propagate(command_args)
            # u.add_api_context(command_args, args)
            if args.upgrade:
                command_args.extend(["--upgrade"])
            if resume:
                next_command = subcommand_list.pop()
                if different_command(next_command, command):
                    resume = False
                    u.sys_log_message(command, log_file=subcommand_file)
                    execute_dispatcher(args=command_args)
                elif not subcommand_list:
                    execute_dispatcher(args=['execute', '--resume'])
                    resume = False
            else:
                u.sys_log_message(command, log_file=subcommand_file)
                execute_dispatcher(args=command_args)
            args.output_dir = output_dir
            return whizzml_code
    return ""

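# Illustrative usage sketch (paths are hypothetical): a package is built by
# pointing `args.package_dir` at the directory holding metadata.json and
# calling create_package with the command context, mirroring the recursive
# calls above:
#
#     args.package_dir = "./my_package"   # hypothetical path
#     args.output_dir = "./output"        # hypothetical path
#     whizzml_code = create_package(args, api, command_obj, resume=False)
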
def evaluations_report(args):
    """Analyze cross-validations in directory and create evaluations report

    """
    metrics = []
    evaluations_json = []
    path = os.path.join(args.from_dir, ANALYZE_PATH)
    for _, directories, _ in os.walk(path):
        for directory in directories:
            file_name = os.path.join(path, directory, CROSS_VALIDATION_FILE)
            kfold_evaluation = json.load(open(file_name))
            kfold_evaluation['name'] = directory.replace('kfold', '#')
            evaluation = kfold_evaluation
            command = get_command_line(os.path.join(path, directory))
            feature, value = parse_test_feature(command)
            evaluation_json = {feature: value,
                               "directory": directory,
                               "time": os.path.getmtime(file_name)}
            kfold_evaluation[feature] = value
            evaluation = evaluation.get(MODEL_KEY, {})
            # read the applicable metrics and add the kfold number info
            for option in OPTIMIZE_OPTIONS:
                new_eval = copy.copy(evaluation_json)
                new_eval["measure"] = option
                if directory.startswith("node_th"):
                    new_eval["kfold"] = int(directory.replace("node_th", ""))
                elif directory.startswith("kfold"):
                    new_eval["kfold"] = int(directory.replace("kfold", ""))
                elif directory.startswith("random"):
                    new_eval["kfold"] = int(directory.replace("random", ""))
                if option in evaluation:
                    new_eval["value"] = evaluation[option]
                    metrics.append(new_eval)
                # check for averaged values too
                else:
                    option_pref = "%s%s" % (PREFIX, option)
                    if option_pref in evaluation:
                        new_eval["value"] = evaluation[option_pref]
                        metrics.append(new_eval)
            evaluations_json.append(kfold_evaluation)
    check_subdir(args.from_dir, REPORTS_DIR)
    check_subdir(os.path.join(args.from_dir, REPORTS_DIR), ANALYZE_DIR)
    # generate summary of metrics values
    json.dump(sorted(metrics, key=lambda x: x['time']),
              open(os.path.join(args.from_dir, REPORTS_DIR, ANALYZE_DIR,
                                METRICS_FILE), "w"))
    # generate list of evaluations
    json.dump(evaluations_json,
              open(os.path.join(args.from_dir, REPORTS_DIR, ANALYZE_DIR,
                                EVALUATIONS_JSON_FILE), "w"))
    # checks the global server directories
    check_subdir(HOME, SERVER_DIRECTORY.split(os.sep)[0])
    check_subdir(HOME, SERVER_DIRECTORY)
    # copy templates to directory
    basename = os.path.basename(ANALYZE_TEMPLATE)
    base_destination_dir = os.path.join(os.getcwd(), args.from_dir,
                                        REPORTS_DIR)
    destination_dir = os.path.join(base_destination_dir, ANALYZE_DIR)
    destination_file = os.path.join(destination_dir, basename)
    shutil.copyfile(ANALYZE_TEMPLATE, destination_file)
    dirname = os.path.join(HOME, SERVER_DIRECTORY)
    # current_directory = os.getcwd()
    os.chdir(dirname)
    symlink = tempfile.NamedTemporaryFile(dir=dirname).name
    try:
        os.symlink(base_destination_dir, symlink)
    except (AttributeError, OSError):
        os.mkdir(os.path.basename(symlink))
        shutil.copytree(destination_dir, os.path.join(symlink, ANALYZE_DIR))
    # saves the symlink file name in the current reports directory
    log_created_resources("symlink", base_destination_dir, symlink, mode='a')
    # returns the link address relative to the server folder
    return os.path.join(os.path.basename(symlink), ANALYZE_DIR, basename)

def models_processing(dataset, models, model_ids, name, description, test_set,
                      objective_field, fields, model_fields, api, args,
                      resume, session_file=None, path=None, log=None):
    """Creates or retrieves models from the input data

    """
    log_models = False
    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set up.
    if (dataset and not args.model and not model_ids and not args.no_model
            and not args.ensemble):
        model_ids = []
        models = []
        if args.number_of_models > 1:
            # Ensemble of models
            ensemble, resume = ensemble_processing(
                dataset, name, description, objective_field, fields,
                api, args, resume, session_file=session_file,
                path=path, log=log)
            args.ensemble = bigml.api.get_ensemble_id(ensemble)
            log_models = True
        else:
            # Cross-validation case: we create 2 * n models to be validated
            # holding out an n% of data
            if args.cross_validation_rate > 0:
                if args.number_of_evaluations > 0:
                    args.number_of_models = args.number_of_evaluations
                else:
                    args.number_of_models = int(MONTECARLO_FACTOR *
                                                args.cross_validation_rate)
            if resume:
                resume, model_ids = c.checkpoint(c.are_models_created, path,
                                                 args.number_of_models,
                                                 debug=args.debug)
                if not resume:
                    message = u.dated(
                        "Found %s models out of %s. Resuming.\n" %
                        (len(model_ids), args.number_of_models))
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)
                    models = model_ids
                    args.number_of_models -= len(model_ids)
            model_args = r.set_model_args(name, description, args,
                                          objective_field, fields,
                                          model_fields)
            models, model_ids = r.create_models(dataset, models, model_args,
                                                args, api, path, session_file,
                                                log)
    # If a model is provided, we use it.
    elif args.model:
        model_ids = [args.model]
        models = model_ids[:]
    elif args.models or args.model_tag:
        models = model_ids[:]
    if args.ensemble:
        ensemble = r.get_ensemble(args.ensemble, api, args.verbosity,
                                  session_file)
        model_ids = ensemble['object']['models']
        if log_models:
            for model_id in model_ids:
                u.log_created_resources("models", path, model_id,
                                        open_mode='a')
        models = model_ids[:]
    # If we are going to predict we must retrieve the models
    if model_ids and test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, resume

def remote_predict(model, test_dataset, batch_prediction_args, args,
                   api, resume, prediction_file=None, session_file=None,
                   path=None, log=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions are computed remotely using the batch predictions call.
    """
    if args.ensemble is not None and not args.dataset_off:
        model_or_ensemble = args.ensemble
    elif args.dataset_off:
        if hasattr(args, "ensemble_ids_") and args.ensemble_ids_:
            models = args.ensemble_ids_
        else:
            models = args.model_ids_
        test_datasets = args.test_dataset_ids
    else:
        model_or_ensemble = bigml.api.get_model_id(model)
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Batch prediction not found. Resuming.\n")
        resume, batch_prediction = c.checkpoint(
            c.is_batch_prediction_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        if not args.dataset_off:
            batch_prediction = create_batch_prediction(
                model_or_ensemble, test_dataset, batch_prediction_args,
                args, api, session_file=session_file, path=path, log=log)
        else:
            batch_predictions = []
            for index, test_dataset_n in enumerate(test_datasets):
                batch_predictions.append(create_batch_prediction( \
                    models[index], test_dataset_n, batch_prediction_args,
                    args, api, session_file=session_file, path=path,
                    log=log))
    if not args.no_csv and not args.dataset_off:
        file_name = api.download_batch_prediction(batch_prediction,
                                                  prediction_file)
        if file_name is None:
            sys.exit("Failed downloading CSV.")
    if args.to_dataset and not args.dataset_off:
        batch_prediction = bigml.api.check_resource(batch_prediction, api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_prediction['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch prediction dataset created: %s\n" %
                              u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_prediction_dataset", path,
                                    new_dataset, mode='a')
    elif args.to_dataset and args.dataset_off:
        predictions_datasets = []
        for batch_prediction in batch_predictions:
            batch_prediction = bigml.api.check_resource(batch_prediction,
                                                        api=api)
            new_dataset = bigml.api.get_dataset_id(
                batch_prediction['object']['output_dataset_resource'])
            if new_dataset is not None:
                predictions_datasets.append(new_dataset)
                message = u.dated("Batch prediction dataset created: %s\n" %
                                  u.get_url(new_dataset))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                u.log_created_resources("batch_prediction_dataset", path,
                                        new_dataset, mode='a')
        multi_dataset = api.create_dataset(predictions_datasets)
        u.log_created_resources("dataset_pred", path,
                                bigml.api.get_dataset_id(multi_dataset),
                                mode='a')
        dataset_id = u.check_resource_error(multi_dataset,
                                            "Failed to create dataset: ")
        try:
            multi_dataset = api.check_resource(multi_dataset)
        except ValueError, exception:
            sys.exit("Failed to get a finished dataset: %s" % str(exception))
        message = u.dated("Predictions dataset created: %s\n" %
                          u.get_url(multi_dataset))
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.log_message("%s\n" % dataset_id, log_file=log)
        if not args.no_csv:
            file_name = api.download_dataset(dataset_id, prediction_file)
            if file_name is None:
                sys.exit("Failed downloading CSV.")

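# Illustrative usage sketch: with --dataset-off, remote_predict expects the
# per-dataset model/ensemble ids and test dataset ids to be preset on `args`
# (attribute names as read above; the ids shown are hypothetical):
#
#     args.dataset_off = True
#     args.ensemble_ids_ = ["ensemble/a1", "ensemble/a2"]   # hypothetical
#     args.test_dataset_ids = ["dataset/b1", "dataset/b2"]  # hypothetical
#     remote_predict(None, None, {}, args, api, resume=False,
#                    prediction_file="predictions.csv",
#                    session_file=session_file, path=path, log=log)
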
def create_models(datasets, model_ids, model_args,
                  args, api=None, path=None,
                  session_file=None, log=None):
    """Create remote models

    """
    if api is None:
        api = bigml.api.BigML()

    models = model_ids[:]
    existing_models = len(models)
    model_args_list = []
    if args.dataset_off and args.evaluate:
        args.test_dataset_ids = datasets[:]
    if not args.multi_label:
        datasets = datasets[existing_models:]
    # if resuming and all models were created, there will be no datasets left
    if datasets:
        dataset = datasets[0]
        if isinstance(model_args, list):
            model_args_list = model_args
        if args.number_of_models > 0:
            message = dated("Creating %s.\n" %
                            plural("model", args.number_of_models))
            log_message(message, log_file=session_file,
                        console=args.verbosity)

            single_model = args.number_of_models == 1 and existing_models == 0
            # if there's more than one model the first one must contain
            # the entire field structure to be used as reference.
            query_string = (FIELDS_QS if single_model else ALL_FIELDS_QS)
            inprogress = []
            for i in range(0, args.number_of_models):
                wait_for_available_tasks(inprogress,
                                         args.max_parallel_models,
                                         api.get_model, "model",
                                         query_string=query_string)
                if model_args_list:
                    model_args = model_args_list[i]
                if args.cross_validation_rate > 0:
                    new_seed = get_basic_seed(i + existing_models)
                    model_args.update(seed=new_seed)
                # one model per dataset (--max-categories or single model)
                if args.max_categories > 0 or (args.test_datasets and
                                               args.evaluate):
                    dataset = datasets[i]
                    model = api.create_model(dataset, model_args)
                elif args.dataset_off and args.evaluate:
                    multi_dataset = args.test_dataset_ids[:]
                    del multi_dataset[i + existing_models]
                    model = api.create_model(multi_dataset, model_args)
                else:
                    model = api.create_model(datasets, model_args)
                model_id = check_resource_error(model,
                                                "Failed to create model: ")
                log_message("%s\n" % model_id, log_file=log)
                model_ids.append(model_id)
                inprogress.append(model_id)
                models.append(model)
                log_created_resources("models", path, model_id, open_mode='a')

        if args.number_of_models < 2 and args.verbosity:
            if bigml.api.get_status(model)['code'] != bigml.api.FINISHED:
                try:
                    model = check_resource(model, api.get_model,
                                           query_string=query_string)
                except ValueError, exception:
                    sys.exit("Failed to get a finished model: %s" %
                             str(exception))
                models[0] = model
            message = dated("Model created: %s.\n" % get_url(model))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, model)

    return models, model_ids

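# Illustrative only, mirroring the call made from models_processing below:
#
#     models, model_ids = r.create_models(dataset, models, model_args,
#                                         args, api, path, session_file, log)
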
def create_anomalies(datasets, anomaly_ids, anomaly_args,
                     args, api=None, path=None,
                     session_file=None, log=None):
    """Create remote anomalies

    """
    if api is None:
        api = bigml.api.BigML()

    anomalies = anomaly_ids[:]
    existing_anomalies = len(anomalies)
    anomaly_args_list = []
    datasets = datasets[existing_anomalies:]
    # if resuming and all anomalies were created,
    # there will be no datasets left
    if datasets:
        if isinstance(anomaly_args, list):
            anomaly_args_list = anomaly_args
        # Only one anomaly per command, at present
        number_of_anomalies = 1
        message = dated("Creating %s.\n" %
                        plural("anomaly detector", number_of_anomalies))
        log_message(message, log_file=session_file,
                    console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_anomalies):
            wait_for_available_tasks(inprogress, args.max_parallel_anomalies,
                                     api, "anomaly")
            if anomaly_args_list:
                anomaly_args = anomaly_args_list[i]

            anomaly = api.create_anomaly(datasets, anomaly_args, retries=None)
            anomaly_id = check_resource_error(anomaly,
                                              "Failed to create anomaly: ")
            log_message("%s\n" % anomaly_id, log_file=log)
            anomaly_ids.append(anomaly_id)
            inprogress.append(anomaly_id)
            anomalies.append(anomaly)
            log_created_resources("anomalies", path, anomaly_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(anomaly)['code'] != bigml.api.FINISHED:
                try:
                    anomaly = check_resource(anomaly, api.get_anomaly,
                                             query_string=query_string,
                                             raise_on_error=True)
                except Exception, exception:
                    sys.exit("Failed to get a finished anomaly: %s" %
                             str(exception))
                anomalies[0] = anomaly
            message = dated("Anomaly created: %s\n" % get_url(anomaly))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, anomaly)

    return anomalies, anomaly_ids

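# Illustrative only (assumed symmetrical to create_models; no call site
# appears in this excerpt):
#
#     anomalies, anomaly_ids = create_anomalies([dataset], [], anomaly_args,
#                                               args, api=api, path=path,
#                                               session_file=session_file,
#                                               log=log)
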
def models_processing(dataset, models, model_ids, objective_field, fields,
                      api, args, resume, name=None, description=None,
                      model_fields=None, session_file=None, path=None,
                      log=None, labels=None, all_labels=None):
    """Creates or retrieves models from the input data

    """
    log_models = False
    ensemble_ids = []
    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set up.
    if dataset and not (has_models(args) or args.no_model):
        model_ids = []
        models = []
        if args.multi_label:
            # Create one model per column choosing only the label column
            if args.training_set is None:
                all_labels, labels = l.retrieve_labels(fields.fields, labels)
            # If --number-of-models is not set or is 1, create one model per
            # label. Otherwise, create one ensemble per label with the
            # required number of models
            if args.number_of_models < 2:
                models, model_ids, resume = model_per_label(
                    labels, all_labels, dataset, fields, objective_field,
                    api, args, resume, name, description, model_fields,
                    session_file, path, log)
            else:
                (ensembles, ensemble_ids,
                 models, model_ids, resume) = ensemble_per_label(
                     labels, all_labels, dataset, fields, objective_field,
                     api, args, resume, name, description, model_fields,
                     session_file, path, log)
        elif args.number_of_models > 1:
            ensembles = []
            # Ensemble of models
            (ensembles, ensemble_ids,
             models, model_ids, resume) = ensemble_processing(
                 dataset, objective_field, fields, api, args, resume,
                 name=name, description=description,
                 model_fields=model_fields, session_file=session_file,
                 path=path, log=log)
            ensemble = ensembles[0]
            args.ensemble = bigml.api.get_ensemble_id(ensemble)
            log_models = True
        else:
            # Cross-validation case: we create 2 * n models to be validated
            # holding out an n% of data
            if args.cross_validation_rate > 0:
                if args.number_of_evaluations > 0:
                    args.number_of_models = args.number_of_evaluations
                else:
                    args.number_of_models = int(MONTECARLO_FACTOR *
                                                args.cross_validation_rate)
            if resume:
                resume, model_ids = c.checkpoint(
                    c.are_models_created, path, args.number_of_models,
                    debug=args.debug)
                if not resume:
                    message = u.dated(
                        "Found %s models out of %s. Resuming.\n" %
                        (len(model_ids), args.number_of_models))
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)
                    models = model_ids
                    args.number_of_models -= len(model_ids)
            model_args = r.set_model_args(name, description, args,
                                          objective_field, fields,
                                          model_fields)
            models, model_ids = r.create_models(dataset, models, model_args,
                                                args, api, path, session_file,
                                                log)
    # If a model is provided, we use it.
    elif args.model:
        model_ids = [args.model]
        models = model_ids[:]
    elif args.models or args.model_tag:
        models = model_ids[:]
    if args.ensemble:
        ensemble = r.get_ensemble(args.ensemble, api, args.verbosity,
                                  session_file)
        ensemble_ids = [ensemble]
        model_ids = ensemble['object']['models']
        if log_models and args.number_of_models > 1:
            for model_id in model_ids:
                u.log_created_resources("models", path, model_id,
                                        open_mode='a')
        models = model_ids[:]
    if args.ensembles or args.ensemble_tag:
        model_ids = []
        ensemble_ids = []
        # Parses ensemble/ids if provided.
        if args.ensemble_tag:
            ensemble_ids = (ensemble_ids +
                            u.list_ids(api.list_ensembles,
                                       "tags__in=%s" % args.ensemble_tag))
        else:
            ensemble_ids = u.read_resources(args.ensembles)
        for ensemble_id in ensemble_ids:
            ensemble = r.get_ensemble(ensemble_id, api)
            if args.ensemble is None:
                args.ensemble = ensemble_id
            model_ids.extend(ensemble['object']['models'])
        models = model_ids[:]
    # If we are going to predict we must retrieve the models
    if model_ids and args.test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, ensemble_ids, resume

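# Illustrative only: callers unpack the four values returned above, e.g.
#
#     models, model_ids, ensemble_ids, resume = models_processing(
#         dataset, models, model_ids, objective_field, fields, api, args,
#         resume, name=name, description=description,
#         model_fields=model_fields, session_file=session_file, path=path,
#         log=log)
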