def ensemble_processing(datasets, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates an ensemble of models from the input data """ ensembles = [] ensemble_ids = [] models = [] model_ids = [] number_of_ensembles = len(datasets) if resume: resume, ensemble_ids = c.checkpoint(c.are_ensembles_created, path, number_of_ensembles, debug=args.debug) if args.number_of_models > 1: _, model_ids = c.checkpoint(c.are_models_created, path, \ number_of_ensembles * args.number_of_models) models = model_ids if not resume: message = u.dated("Found %s ensembles out of %s. Resuming.\n" % (len(ensemble_ids), number_of_ensembles)) u.log_message(message, log_file=session_file, console=args.verbosity) ensembles = ensemble_ids number_of_ensembles -= len(ensemble_ids) if number_of_ensembles > 0: ensemble_args = r.set_ensemble_args(args, fields=fields) ensembles, ensemble_ids, models, model_ids = r.create_ensembles( datasets, ensembles, ensemble_args, args, api=api, path=path, number_of_ensembles=number_of_ensembles, session_file=session_file, log=log) return ensembles, ensemble_ids, models, model_ids, resume
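# The resume logic above follows a checkpoint pattern that recurs in most of
# these processing functions: count how many resources a previous run already
# created, log a "Found X out of Y" message, and only create the remainder.
# A minimal, self-contained sketch of that idea using a hypothetical
# one-id-per-line log file; the real c.checkpoint / c.are_ensembles_created
# helpers are BigMLer internals with more options.
import os

def checkpoint_ids(log_path, expected):
    """Hypothetical helper: returns (all_created, ids_found_so_far)."""
    ids = []
    if os.path.exists(log_path):
        with open(log_path) as log_file:
            ids = [line.strip() for line in log_file if line.strip()]
    return len(ids) >= expected, ids

# Usage sketch: resume an interrupted run that needed 5 ensembles.
resume, ensemble_ids = checkpoint_ids("ensembles", 5)
if not resume:
    print("Found %s ensembles out of %s. Resuming." % (len(ensemble_ids), 5))
    remaining = 5 - len(ensemble_ids)
    # ... create only the `remaining` ensembles, appending each new id to the log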
def evaluations_process(time_series_set, datasets, fields, dataset_fields, api, args, resume, session_file=None, path=None, log=None): """Evaluates time-series against datasets """ existing_evaluations = 0 evaluations = [] number_of_evaluations = len(time_series_set) if resume: resume, evaluations = c.checkpoint(c.are_evaluations_created, path, number_of_evaluations, debug=args.debug) if not resume: existing_evaluations = len(evaluations) message = u.dated("Found %s evaluations from %s. Resuming.\n" % (existing_evaluations, number_of_evaluations)) number_of_evaluations -= existing_evaluations u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: evaluation_args = r.set_evaluation_args(args, fields, dataset_fields) evaluations.extend(r.create_evaluations( time_series_set, datasets, evaluation_args, args, api, path=path, session_file=session_file, log=log, existing_evaluations=existing_evaluations)) return evaluations, resume
def execution_processing(api, args,
                         session_file=None, path=None, log=None):
    """Creating or retrieving an execution
    """
    execution = None
    resume = args.resume
    if args.script or args.scripts:
        # If resuming, try to extract args.execution from the log files
        if resume:
            message = u.dated("Execution not found. Resuming.\n")
            resume, args.execution = c.checkpoint(
                c.is_execution_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)

        if not resume:
            args.resume = resume
            # Check if there's a created project for it
            args.project_id = pp.project_processing(
                api, args, resume, session_file=session_file, path=path,
                log=log)
            execution_args = r.set_execution_args(args)
            execution = r.create_execution(execution_args, args, api, path,
                                           session_file, log)

    # If an execution is provided either through the command line or in
    # resume steps, we use it.
    elif args.execution:
        execution = bigml.api.get_execution_id(args.execution)

    return execution
def project_processing(api, args, resume, session_file=None, path=None, log=None, create=False): """Creating or retrieving a project from input arguments """ # if no project info given by the user, we skip project processing and no # project will be assigned if args.project is None and args.project_id is None: return None project_id = None if args.project: # If resuming, try to extract args.project_id form log files if resume: message = u.dated("Project not found. Resuming.\n") resume, project_id = c.checkpoint( c.is_project_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) elif not create: project_id = r.get_project_by_name( args.project, api=api, verbosity=args.verbosity, session_file=session_file) elif args.project_id: project_id = bigml.api.get_project_id(args.project_id) # If no project is found by that name, we create a new one. if project_id is None: project_args = r.set_project_args(args, name=args.project) project = r.create_project( project_args, args, api, session_file, path, log) project_id = project['resource'] return project_id
def alternative_dataset_processing(dataset_or_source, suffix, dataset_args, api, args, resume, session_file=None, path=None, log=None): """Creates a dataset. Used in splits to generate train and test datasets """ alternative_dataset = None # if resuming, try to extract dataset form log files if resume: message = u.dated("Dataset not found. Resuming.\n") resume, alternative_dataset = c.checkpoint(c.is_dataset_created, path, "_%s" % suffix, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if alternative_dataset is None: alternative_dataset = r.create_dataset(dataset_or_source, dataset_args, args, api, path, session_file, log, suffix) if alternative_dataset: alternative_dataset = r.get_dataset(alternative_dataset, api, args.verbosity, session_file) return alternative_dataset, resume
def ensemble_processing(datasets, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates an ensemble of models from the input data """ ensembles = [] ensemble_ids = [] number_of_ensembles = len(datasets) if resume: resume, ensemble_ids = c.checkpoint( c.are_ensembles_created, path, number_of_ensembles, debug=args.debug) if not resume: message = u.dated("Found %s ensembles out of %s. Resuming.\n" % (len(ensemble_ids), number_of_ensembles)) u.log_message(message, log_file=session_file, console=args.verbosity) ensembles = ensemble_ids number_of_ensembles -= len(ensemble_ids) if number_of_ensembles > 0: ensemble_args = r.set_ensemble_args(args, fields=fields) ensembles, ensemble_ids, models, model_ids = r.create_ensembles( datasets, ensembles, ensemble_args, args, api=api, path=path, number_of_ensembles=number_of_ensembles, session_file=session_file, log=log) return ensembles, ensemble_ids, models, model_ids, resume
def update_external_connector(args, api, resume, session_file=None, path=None, log=None): """Updating external connector attributes according to input arguments """ # if no external connector info given by the user, we skip processing and # no update will be performed if args.external_connector_id is None: return None # If resuming, try to extract args.external_connector_id form log files if resume: message = u.dated("External connector not found. Resuming.\n") resume, external_connector_id = c.checkpoint( c.is_external_connector_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) elif args.external_connector_id: external_connector_id = bigml.api.get_external_connector_id( \ args.external_connector_id) if external_connector_id is not None: external_connector_args = r.set_basic_args(args, args.name) external_connector = r.update_external_connector( external_connector_args, args, api, session_file, log) external_connector_id = external_connector['resource'] return external_connector_id
def ensemble_processing(dataset, name, description, objective_field, fields, api, args, resume, session_file=None, path=None, log=None): """Creates an ensemble of models from the input data """ ensemble = None if resume: message = u.dated("Ensemble not found. Resuming.\n") resume, ensemble = c.checkpoint(c.is_ensemble_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if ensemble is None: ensemble_args = r.set_ensemble_args(name, description, args, objective_field, fields) ensemble = r.create_ensemble(dataset, ensemble_args, args, api, path, session_file, log) return ensemble, resume
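# Behind r.create_ensemble, the eventual call is the BigML Python bindings'
# ensemble creation. A minimal sketch; the dataset id and options are
# placeholders, and the BigMLer wrappers add logging, waiting and retries on
# top of this.
from bigml.api import BigML

api = BigML()  # credentials taken from BIGML_USERNAME / BIGML_API_KEY

dataset = "dataset/5143a51a37203f2cf7000974"   # placeholder id
ensemble = api.create_ensemble(dataset, {"number_of_models": 10})
api.ok(ensemble)  # block until the ensemble is finished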
def evaluate(model, dataset, name, description, fields, fields_map, output, api, args, resume, session_file=None, path=None, log=None): """Evaluates a model or an ensemble with the given dataset """ if resume: message = u.dated("Evaluation not found. Resuming.\n") resume, evaluation = c.checkpoint( c.is_evaluation_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: evaluation_args = r.set_evaluation_args(name, description, args, fields, fields_map) if args.ensemble: model_or_ensemble = args.ensemble else: model_or_ensemble = model evaluation = r.create_evaluation(model_or_ensemble, dataset, evaluation_args, args, api, path, session_file, log) evaluation = r.get_evaluation(evaluation, api, args.verbosity, session_file) r.save_evaluation(evaluation, output, api) return resume
def remote_centroid(cluster, test_dataset, batch_centroid_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes a centroid for each entry in the `test_set`. Predictions are computed remotely using the batch centroid call. """ cluster_id = bigml.api.get_cluster_id(cluster) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch centroid not found. Resuming.\n") resume, batch_centroid = c.checkpoint( c.is_batch_centroid_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_centroid = create_batch_centroid( cluster_id, test_dataset, batch_centroid_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: api.download_batch_centroid(batch_centroid, prediction_file) if args.to_dataset: batch_centroid = bigml.api.check_resource(batch_centroid, api=api) new_dataset = bigml.api.get_dataset_id( batch_centroid['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch centroid dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_centroid_dataset", path, new_dataset, mode='a')
def test_source_processing(api, args, resume, name=None, csv_properties=None, session_file=None, path=None, log=None): """Creating or retrieving a test data source from input arguments """ test_source = None fields = None if csv_properties is None: csv_properties = {} if args.test_set and args.remote: # If resuming, try to extract args.source form log files if resume: message = u.dated("Test source not found. Resuming.\n") resume, args.test_source = c.checkpoint( c.is_source_created, path, suffix="_test", debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: source_args = r.set_source_args(args, name=name, data_set_header=args.test_header) test_source = r.create_source(args.test_set, source_args, args, api, path, session_file, log, source_type="test") # If a source is provided either through the command line or in resume # steps, we use it. elif args.test_source: test_source = bigml.api.get_source_id(args.test_source) # If we already have source, we check that is finished, extract the # fields, and update them if needed. if test_source: test_source = r.get_source(test_source, api, args.verbosity, session_file) if 'source_parser' in test_source['object']: source_parser = test_source['object']['source_parser'] if 'missing_tokens' in source_parser: csv_properties['missing_tokens'] = ( source_parser['missing_tokens']) if 'locale' in source_parser: csv_properties['data_locale'] = source_parser['locale'] if (args.user_locale is not None and bigml_locale(args.user_locale) == source_parser['locale']): args.user_locale = None fields = Fields(test_source['object']['fields'], **csv_properties) if (args.field_attributes_ or args.types_ or args.user_locale or args.json_args.get('source')): # avoid updating project_id in source project_id, args.project_id = args.project_id, None test_source_args = r.set_source_args(args, fields=fields) test_source = r.update_source(test_source, test_source_args, args, api, session_file) args.project_id = project_id fields = Fields(test_source['object']['fields'], **csv_properties) return test_source, resume, csv_properties, fields
def remote_predict(model, test_dataset, batch_prediction_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes a prediction for each entry in the `test_set`. Predictions are computed remotely using the batch predictions call. """ if args.ensemble is not None: model_or_ensemble = args.ensemble else: model_or_ensemble = bigml.api.get_model_id(model) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch prediction not found. Resuming.\n") resume, batch_prediction = c.checkpoint( c.is_batch_prediction_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_prediction = create_batch_prediction( model_or_ensemble, test_dataset, batch_prediction_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: api.download_batch_prediction(batch_prediction, prediction_file) if args.to_dataset: batch_prediction = bigml.api.check_resource(batch_prediction, api=api) new_dataset = bigml.api.get_dataset_id( batch_prediction['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch prediction dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_prediction_dataset", path, new_dataset, mode='a')
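# The remote flow above (create the batch prediction, optionally download its
# CSV, optionally materialize an output dataset) maps onto the public bindings
# roughly as follows. A minimal sketch; the ids are placeholders and
# "output_dataset" is only needed for the --to-dataset branch.
import bigml.api
from bigml.api import BigML

api = BigML()
batch_prediction = api.create_batch_prediction(
    "ensemble/5143a51a37203f2cf7000972",    # model or ensemble id (placeholder)
    "dataset/5143a51a37203f2cf7000974",     # test dataset id (placeholder)
    {"all_fields": True, "output_dataset": True})
# Wait until the batch job is finished, as check_resource does above.
batch_prediction = bigml.api.check_resource(batch_prediction, api=api)
api.download_batch_prediction(batch_prediction, filename="predictions.csv")
# Equivalent of the --to-dataset branch: the dataset built from the
# predictions is referenced by output_dataset_resource.
new_dataset = batch_prediction['object']['output_dataset_resource']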
def library_processing(api, args, session_file=None, path=None, log=None): """Creating or retrieving a library """ library = None resume = args.resume if args.code_file or args.code: # If resuming, try to extract args.library form log files if resume: message = u.dated("Library not found. Resuming.\n") resume, library = c.checkpoint( c.is_library_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: args.resume = resume if args.code_file: try: with open(args.code_file) as code_file: source_code = code_file.read() except IOError: sys.exit("Failed to find the source code file: %s" % args.code_file) else: source_code = args.code # Check if there's a created project for it args.project_id = pp.project_processing( api, args, resume, session_file=session_file, path=path, log=log) library_args = r.set_library_args(args) library = r.create_library(source_code, library_args, args, api, path, session_file, log) return library
def update_project(args, api, resume, session_file=None, path=None, log=None): """Updating project attributes according to input arguments """ # if no project info given by the user, we skip project processing and no # project will be assigned if args.project_id is None: return None # If resuming, try to extract args.project_id form log files if resume: message = u.dated("Project not found. Resuming.\n") resume, project_id = c.checkpoint( c.is_project_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) elif args.project_id: project_id = bigml.api.get_project_id(args.project_id) if project_id is not None: project_args = r.set_project_args(args, name=args.project) project = r.update_project( project_args, args, api, session_file, log) project_id = project['resource'] return project_id
def pca_processing(datasets, pca, \ pca_ids, api, args, resume, fields=None, \ session_file=None, path=None, log=None): """Creates or retrieves pca from the input data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_pca(args) or \ args.no_pca): pca_ids = [] pcas = [] # Only 1 pca per bigmler command at present number_of_pcas = 1 if resume: resume, pca_ids = c.checkpoint( \ c.are_pcas_created, path, \ number_of_pcas, debug=args.debug) if not resume: message = u.dated("Found %s pcas out of %s." " Resuming.\n" % (len(pca_ids), number_of_pcas)) u.log_message(message, log_file=session_file, console=args.verbosity) pcas = pca_ids number_of_pcas -= len(pca_ids) args.exclude_fields = [] if args.exclude_objective: dataset = datasets[0] fields = Fields(dataset) objective_id = \ fields.fields_by_column_number[fields.objective_field] args.exclude_fields = [objective_id] pca_args = r.set_pca_args( \ args, fields=fields, \ pca_fields=args.pca_fields_) pca = \ r.create_pca( \ datasets, pca, pca_args, \ args, api, path, session_file, log) # If a pca is provided, we use it. elif args.pca: pca_ids = [args.pca] pca = pca_ids[0] elif args.pca or args.pca_tag: pca = pca_ids[0] # If we are going to create projections, we must retrieve the pca if pca_ids and (args.test_set or args.export_fields): pca = \ r.get_pca(pca, args, api, session_file) return pca, resume
def model_per_label(labels, datasets, fields, objective_field, api, args, resume, name=None, description=None, model_fields=None, multi_label_data=None, session_file=None, path=None, log=None): """Creates a model per label for multi-label datasets """ model_ids = [] models = [] args.number_of_models = len(labels) if resume: resume, model_ids = c.checkpoint( c.are_models_created, path, args.number_of_models, debug=args.debug) if not resume: message = u.dated("Found %s models out of %s." " Resuming.\n" % (len(model_ids), args.number_of_models)) u.log_message(message, log_file=session_file, console=args.verbosity) models = model_ids args.number_of_models = len(labels) - len(model_ids) model_args_list = r.set_label_model_args( name, description, args, labels, multi_label_data, fields, model_fields, objective_field) # create models changing the input_field to select # only one label at a time models, model_ids = r.create_models( datasets, models, model_args_list, args, api, path, session_file, log) args.number_of_models = 1 return models, model_ids, resume
def ensemble_processing(dataset, objective_field, fields, api, args, resume, name=None, description=None, model_fields=None, session_file=None, path=None, log=None): """Creates an ensemble of models from the input data """ ensembles = [] number_of_ensembles = 1 if resume: message = u.dated("Ensemble not found. Resuming.\n") resume, ensembles = c.checkpoint( c.are_ensembles_created, path, number_of_ensembles, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) try: ensemble = ensembles[0] except IndexError: ensemble = None if ensemble is None: ensemble_args = r.set_ensemble_args(name, description, args, model_fields, objective_field, fields) ensembles, ensemble_ids, models, model_ids = r.create_ensembles( dataset, ensembles, ensemble_args, args, api=api, path=path, session_file=session_file, log=log) return ensembles, ensemble_ids, models, model_ids, resume
def connector_processing(api, args, resume, session_file=None, path=None, log=None): """Creating or retrieving an external connector from input arguments """ # if no external connection info given by the user, we skip # processing and no connector will be created if not u.has_connection_info(args) and args.external_connector_id is None: return None external_connector_id = None if u.has_connection_info(args): # If resuming, try to extract args.external_connector_id form log files if resume: message = u.dated("External connector ID not found. Resuming.\n") resume, external_connector_id = c.checkpoint( c.is_external_connector_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) else: external_connector_id = bigml.api.get_external_connector_id( \ args.external_connector_id) # If no external connector is found, we create a new one. if external_connector_id is None: connector_args = r.set_external_connector_args(\ args, name=args.name) connector = r.create_external_connector( connector_args, args, api, session_file, path, log) external_connector_id = connector['resource'] return external_connector_id
def ensemble_processing(datasets, objective_field, fields, api, args, resume, name=None, description=None, model_fields=None, session_file=None, path=None, log=None): """Creates an ensemble of models from the input data """ ensembles = [] number_of_ensembles = 1 if resume: message = u.dated("Ensemble not found. Resuming.\n") resume, ensembles = c.checkpoint( c.are_ensembles_created, path, number_of_ensembles, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) try: ensemble = ensembles[0] except IndexError: ensemble = None if ensemble is None: ensemble_args = r.set_ensemble_args(name, description, args, model_fields, objective_field, fields) ensembles, ensemble_ids, models, model_ids = r.create_ensembles( datasets, ensembles, ensemble_args, args, api=api, path=path, session_file=session_file, log=log) return ensembles, ensemble_ids, models, model_ids, resume
def update_project(args, api, resume, session_file=None, path=None, log=None): """Updating project attributes according to input arguments """ # if no project info given by the user, we skip project processing and no # project will be assigned if args.project_id is None: return None # If resuming, try to extract args.project_id form log files if resume: message = u.dated("Project not found. Resuming.\n") resume, project_id = c.checkpoint(c.is_project_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) elif args.project_id: project_id = bigml.api.get_project_id(args.project_id) if project_id is not None: project_args = r.set_project_args(args, name=args.project) project = r.update_project(project_args, args, api, session_file) project_id = project['resource'] return project_id
def evaluations_process(time_series_set, datasets, fields, dataset_fields, api, args, resume, session_file=None, path=None, log=None, objective_field=None): """Evaluates time-series against datasets """ existing_evaluations = 0 evaluations = [] number_of_evaluations = len(time_series_set) if resume: resume, evaluations = c.checkpoint(c.are_evaluations_created, path, number_of_evaluations, debug=args.debug) if not resume: existing_evaluations = len(evaluations) message = u.dated("Found %s evaluations from %s. Resuming.\n" % (existing_evaluations, number_of_evaluations)) number_of_evaluations -= existing_evaluations u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: evaluation_args = r.set_evaluation_args(args, fields, dataset_fields) evaluations.extend(r.create_evaluations( time_series_set, datasets, evaluation_args, args, api, path=path, session_file=session_file, log=log, existing_evaluations=existing_evaluations)) return evaluations, resume
def alternative_dataset_processing( dataset_or_source, suffix, dataset_args, api, args, resume, session_file=None, path=None, log=None ): """Creates a dataset. Used in splits to generate train and test datasets """ alternative_dataset = None # if resuming, try to extract dataset form log files if resume: message = u.dated("Dataset not found. Resuming.\n") resume, alternative_dataset = c.checkpoint( c.is_dataset_created, path, "_%s" % suffix, debug=args.debug, message=message, log_file=session_file, console=args.verbosity, ) if alternative_dataset is None: alternative_dataset = r.create_dataset( dataset_or_source, dataset_args, args, api, path, session_file, log, suffix ) if alternative_dataset: alternative_dataset = r.get_dataset(alternative_dataset, api, args.verbosity, session_file) return alternative_dataset, resume
def remote_centroid(cluster, test_dataset, batch_centroid_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes a centroid for each entry in the `test_set`. Predictions are computed remotely using the batch centroid call. """ cluster_id = bigml.api.get_cluster_id(cluster) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch centroid not found. Resuming.\n") resume, batch_centroid = c.checkpoint( c.is_batch_centroid_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_centroid = create_batch_centroid( cluster_id, test_dataset, batch_centroid_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: file_name = api.download_batch_centroid(batch_centroid, prediction_file) if file_name is None: sys.exit("Failed downloading CSV.") if args.to_dataset: batch_centroid = bigml.api.check_resource(batch_centroid, api=api) new_dataset = bigml.api.get_dataset_id( batch_centroid['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch centroid dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_centroid_dataset", path, new_dataset, mode='a')
def export_dataset(dataset, api, args, resume, session_file=None, path=None): """Exports the dataset to a CSV file given by the user or a filename based on the dataset id by default. """ filename = csv_name(args.to_csv, path, dataset) if resume: resume = c.checkpoint( c.is_dataset_exported, filename, debug=args.debug) if not resume: message = u.dated("No dataset exported. Resuming.\n") u.log_message(message, log_file=session_file, console=args.verbosity) else: message = u.dated("Exporting dataset to CSV file: %s\n" % filename) u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: file_name = api.download_dataset(dataset, filename=filename) if file_name is None: sys.exit("Failed downloading CSV.") return resume
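# The export relies on the bindings' dataset download call, which returns the
# local filename on success and None on failure -- exactly what export_dataset
# checks before exiting. A minimal sketch; the id and filename are placeholders.
from bigml.api import BigML

api = BigML()
dataset = "dataset/5143a51a37203f2cf7000974"   # placeholder id
file_name = api.download_dataset(dataset, filename="my_dataset.csv")
if file_name is None:
    raise RuntimeError("Failed downloading CSV.")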
def remote_anomaly_score(anomaly, test_dataset, batch_anomaly_score_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes an anomaly score for each entry in the `test_set`. Predictions are computed remotely using the batch anomaly score call. """ anomaly_id = bigml.api.get_anomaly_id(anomaly) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch anomaly score not found. Resuming.\n") resume, batch_anomaly_score = c.checkpoint( c.is_batch_anomaly_score_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_anomaly_score = create_batch_anomaly_score( anomaly_id, test_dataset, batch_anomaly_score_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: api.download_batch_anomaly_score(batch_anomaly_score, prediction_file) if args.to_dataset: batch_anomaly_score = bigml.api.check_resource(batch_anomaly_score, api=api) new_dataset = bigml.api.get_dataset_id( batch_anomaly_score['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch anomaly score dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_anomaly_score_dataset", path, new_dataset, open_mode='a')
def remote_forecast(time_series, forecast_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes a remote forecast. """ time_series_id = bigml.api.get_time_series_id( \ time_series) # if resuming, try to extract dataset form log files if resume: message = u.dated("Forecast not found. Resuming.\n") resume, forecast = c.checkpoint( c.is_forecast_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: local_time_series = TimeSeries(time_series, api=args.retrieve_api_) output = args.predictions if args.test_set is not None: input_data = u.read_json(args.test_set) elif args.horizon is not None: input_data = {local_time_series.objective_id: { \ "horizon": args.horizon}} forecast = create_forecast( time_series_id, input_data, forecast_args, args, api, session_file=session_file, path=path, log=log) write_forecasts(forecast["object"]["forecast"]["result"], output)
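# The forecast input built above is a mapping from the objective field id to
# forecast options. A minimal sketch of the --horizon branch with the
# bindings; the time-series id, field id and horizon are placeholders.
from bigml.api import BigML

api = BigML()
time_series = "timeseries/5143a51a37203f2cf7000999"   # placeholder id
input_data = {"000005": {"horizon": 10}}              # next 10 points of field 000005

forecast = api.create_forecast(time_series, input_data)
api.ok(forecast)
# The per-field forecast values live under object/forecast/result, which is
# what write_forecasts consumes.
result = forecast["object"]["forecast"]["result"]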
def source_processing(training_set, test_set, training_set_header,
                      test_set_header, api, args, resume,
                      name=None, description=None, csv_properties=None,
                      field_attributes=None, types=None,
                      session_file=None, path=None, log=None):
    """Creating or retrieving a data source from input arguments
    """
    source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from the log files
        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(
                c.is_source_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

        # If neither a previous source, dataset nor model is provided,
        # we create a new one. Also, if --evaluate and test data are
        # provided, we create a new dataset to test with.
        data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                     training_set_header,
                                                     test_set_header, args)
        if data_set is not None:
            source_args = r.set_source_args(data_set_header, name,
                                            description, args)
            source = r.create_source(data_set, source_args, args, api,
                                     path, session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']

        fields = Fields(source['object']['fields'], **csv_properties)

        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity,
                                            session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)
        if field_attributes or types:
            fields = Fields(source['object']['fields'], **csv_properties)

    return source, resume, csv_properties, fields
def anomalies_processing(datasets, anomalies, anomaly_ids, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates or retrieves anomalies from the command data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_anomalies(args) or args.no_anomaly): anomaly_ids = [] anomalies = [] # Only 1 anomaly detector per bigmler command at present number_of_anomalies = 1 if resume: resume, anomaly_ids = c.checkpoint(c.are_anomalies_created, path, number_of_anomalies, debug=args.debug) if not resume: message = u.dated("Found %s anomaly detectors out of %s." " Resuming.\n" % (len(anomaly_ids), number_of_anomalies)) u.log_message(message, log_file=session_file, console=args.verbosity) anomalies = anomaly_ids number_of_anomalies -= len(anomaly_ids) anomaly_args = r.set_anomaly_args(args, fields=fields, anomaly_fields=args.anomaly_fields_) anomalies, anomaly_ids = r.create_anomalies(datasets, anomalies, anomaly_args, args, api, path, session_file, log) # If an anomaly detector is provided, we use it. elif args.anomaly: anomaly_ids = [args.anomaly] anomalies = anomaly_ids[:] elif args.anomalies or args.anomaly_tag: anomalies = anomaly_ids[:] # If we are going to predict we must retrieve the anomalies if anomaly_ids and args.test_set: anomalies, anomaly_ids = r.get_anomalies(anomalies, args, api, session_file) return anomalies, anomaly_ids, resume
def remote_predict_models(models, test_reader, prediction_file, api, args, resume=False, output_path=None, session_file=None, log=None, exclude=None): """Retrieve predictions remotely, combine them and save predictions to file """ predictions_files = [] prediction_args = { "tags": args.tag } test_set_header = test_reader.has_headers() if output_path is None: output_path = u.check_dir(prediction_file) message_logged = False raw_input_data_list = [] for input_data in test_reader: raw_input_data_list.append(input_data) single_model = len(models) == 1 if single_model: prediction_file = UnicodeWriter(prediction_file).open_writer() for model in models: model = bigml.api.get_model_id(model) predictions_file = get_predictions_file_name(model, output_path) predictions_files.append(predictions_file) if (not resume or not c.checkpoint(c.are_predictions_created, predictions_file, test_reader.number_of_tests(), debug=args.debug)[0]): if not message_logged: message = u.dated("Creating remote predictions.\n") u.log_message(message, log_file=session_file, console=args.verbosity) message_logged = True with UnicodeWriter(predictions_file) as predictions_file: for input_data in raw_input_data_list: input_data_dict = test_reader.dict(input_data) prediction = api.create_prediction(model, input_data_dict, by_name=test_set_header, wait_time=0, args=prediction_args) u.check_resource_error(prediction, "Failed to create prediction: ") u.log_message("%s\n" % prediction['resource'], log_file=log) prediction_row = prediction_to_row(prediction) predictions_file.writerow(prediction_row) if single_model: write_prediction(prediction_row[0:2], prediction_file, args.prediction_info, input_data, exclude) if single_model: prediction_file.close_writer() else: combine_votes(predictions_files, Model(models[0]).to_prediction, prediction_file, args.method, args.prediction_info, raw_input_data_list, exclude)
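# The per-row remote path above reduces to repeated create_prediction calls,
# one per test row. A minimal sketch with the bindings; the model id, input
# fields and tag are placeholders.
from bigml.api import BigML

api = BigML()
model = "model/5143a51a37203f2cf7000973"   # placeholder id
rows = [{"petal length": 4.2, "sepal width": 3.1},
        {"petal length": 1.4, "sepal width": 3.5}]

for input_data in rows:
    prediction = api.create_prediction(model, input_data,
                                       args={"tags": ["bigmler-example"]})
    api.ok(prediction)
    print(prediction["object"]["output"])   # the predicted value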
def evaluations_process(models_or_ensembles, datasets, name, description, fields, dataset_fields, fields_map, api, args, resume, session_file=None, path=None, log=None, labels=None, all_labels=None, objective_field=None): """Evaluates models or ensembles against datasets """ existing_evaluations = 0 evaluations = [] number_of_evaluations = len(models_or_ensembles) if resume: resume, evaluations = c.checkpoint(c.are_evaluations_created, path, number_of_evaluations, debug=args.debug) if not resume: existing_evaluations = len(evaluations) message = u.dated("Found %s evaluations from %s. Resuming.\n" % (existing_evaluations, number_of_evaluations)) number_of_evaluations -= existing_evaluations u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: if args.multi_label: evaluation_args = r.set_label_evaluation_args( name, description, args, labels, all_labels, number_of_evaluations, fields, dataset_fields, fields_map, objective_field) else: evaluation_args = r.set_evaluation_args(name, description, args, fields, dataset_fields, fields_map) evaluations.extend( r.create_evaluations(models_or_ensembles, datasets, evaluation_args, args, api, path=path, session_file=session_file, log=log, existing_evaluations=existing_evaluations)) return evaluations, resume
def remote_predict(models, test_reader, prediction_file, api, resume=False, verbosity=True, output_path=None, method=PLURALITY_CODE, tags="", session_file=None, log=None, debug=False, prediction_info=None): """Retrieve predictions remotely, combine them and save predictions to file """ predictions_files = [] prediction_args = {"tags": tags} test_set_header = test_reader.has_headers() if output_path is None: output_path = u.check_dir(prediction_file) message_logged = False raw_input_data_list = [] for model in models: model = bigml.api.get_model_id(model) predictions_file = get_predictions_file_name(model, output_path) predictions_files.append(predictions_file) if (not resume or not c.checkpoint(c.are_predictions_created, predictions_file, test_reader.number_of_tests(), debug=debug)): if not message_logged: message = u.dated("Creating remote predictions.") u.log_message(message, log_file=session_file, console=verbosity) message_logged = True predictions_file = csv.writer(open(predictions_file, 'w', 0), lineterminator="\n") for input_data in test_reader: raw_input_data_list.append(input_data) input_data_dict = test_reader.dict(input_data) prediction = api.create_prediction(model, input_data_dict, by_name=test_set_header, wait_time=0, args=prediction_args) u.check_resource_error(prediction, "Failed to create prediction: ") u.log_message("%s\n" % prediction['resource'], log_file=log) prediction_row = prediction_to_row(prediction) predictions_file.writerow(prediction_row) combine_votes(predictions_files, Model(models[0]).to_prediction, prediction_file, method, prediction_info, raw_input_data_list)
def fusion_processing(fusion, \ fusion_ids, api, args, resume, fields=None, \ session_file=None, path=None, log=None): """Creates or retrieves fusion from the input data """ # If we have a models' list but not a fusion, # we create the model if the no_model # flag hasn't been set up. if args.fusion_models_ is not None and not has_fusion(args): fusion_ids = [] # Only 1 fusion per bigmler command at present number_of_fusions = 1 if resume: resume, fusion_ids = c.checkpoint( \ c.are_fusions_created, path, \ number_of_fusions, debug=args.debug) if not resume: message = u.dated("Found %s fusions out of %s." " Resuming.\n" % (len(fusion_ids), number_of_fusions)) u.log_message(message, log_file=session_file, console=args.verbosity) fusion = fusion_ids[0] first_model_id = api.get_fusion(fusion)[ \ "object"]["fusion"]["models"][0]["id"] first_model_kind = api.get_fusion(fusion)[ \ "object"]["fusion"]["models"][0]["kind"] first_model = api.getters[first_model_kind](first_model_id) fields = Fields(first_model) number_of_fusions -= len(fusion_ids) fusion_args = r.set_fusion_args( \ args, fields) fusion = \ r.create_fusion( \ args.fusion_models_, fusion, fusion_args, \ args, api, path, session_file, log) # If a fusion is provided, we use it. elif args.fusion: fusion_ids = [args.fusion] fusion = fusion_ids[0] elif args.fusion or args.fusion_tag: fusion = fusion_ids[0] # If we are going to create predictions, we must retrieve the fusion if fusion_ids and args.test_set: fusion = \ r.get_fusion(fusion, args, api, session_file) args.objective_field = fusion['object']['objective_field_name'] return fusion, resume
def remote_anomaly_score(anomaly, test_dataset, batch_anomaly_score_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes an anomaly score for each entry in the `test_set`. Predictions are computed remotely using the batch anomaly score call. """ anomaly_id = bigml.api.get_anomaly_id(anomaly) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch anomaly score not found. Resuming.\n") resume, batch_anomaly_score = c.checkpoint( c.is_batch_anomaly_score_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_anomaly_score = create_batch_anomaly_score( anomaly_id, test_dataset, batch_anomaly_score_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: file_name = api.download_batch_anomaly_score(batch_anomaly_score, prediction_file) if file_name is None: sys.exit("Failed downloading CSV.") if args.to_dataset: batch_anomaly_score = bigml.api.check_resource(batch_anomaly_score, api=api) new_dataset = bigml.api.get_dataset_id( batch_anomaly_score['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch anomaly score dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_anomaly_score_dataset", path, new_dataset, mode='a')
def split_processing(dataset, name, description, api, args, resume, session_file=None, path=None, log=None): """Splits a dataset into train and test datasets """ train_dataset = None test_dataset = None sample_rate = 1 - args.test_split # if resuming, try to extract train dataset form log files if resume: message = u.dated("Dataset not found. Resuming.\n") resume, train_dataset = c.checkpoint( c.is_dataset_created, path, "_train", debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if train_dataset is None: dataset_split_args = r.set_dataset_split_args( "%s - train (%s %%)" % (name, int(sample_rate * 100)), description, args, sample_rate, out_of_bag=False) train_dataset = r.create_dataset( dataset, dataset_split_args, args, api, path, session_file, log, "train") if train_dataset: train_dataset = r.get_dataset(train_dataset, api, args.verbosity, session_file) # if resuming, try to extract test dataset form log files if resume: message = u.dated("Dataset not found. Resuming.\n") resume, test_dataset = c.checkpoint( c.is_dataset_created, path, "_test", debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if test_dataset is None: dataset_split_args = r.set_dataset_split_args( "%s - test (%s %%)" % (name, int(args.test_split * 100)), description, args, sample_rate, out_of_bag=True) test_dataset = r.create_dataset( dataset, dataset_split_args, args, api, path, session_file, log, "test") if test_dataset: test_dataset = r.get_dataset(test_dataset, api, args.verbosity, session_file) return train_dataset, test_dataset, resume
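# The complementary train/test split uses the same sample_rate with opposite
# out_of_bag flags and a shared seed, so both datasets sample the same rows.
# A plausible shape of the two argument dictionaries that
# set_dataset_split_args builds (names and seed are placeholders; the exact
# extra keys are BigMLer internals).
sample_rate = 1 - 0.2                       # e.g. --test-split 0.2

train_args = {"name": "iris - train (80 %)",
              "sample_rate": sample_rate,
              "out_of_bag": False,          # keep the sampled 80%
              "seed": "bigmler example seed"}
test_args = {"name": "iris - test (20 %)",
             "sample_rate": sample_rate,
             "out_of_bag": True,            # keep the complementary 20%
             "seed": "bigmler example seed"}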
def clusters_processing(datasets, clusters, cluster_ids, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates or retrieves clusters from the input data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_clusters(args) or args.no_cluster): cluster_ids = [] clusters = [] # Only 1 cluster per bigmler command at present number_of_clusters = 1 if resume: resume, cluster_ids = c.checkpoint(c.are_clusters_created, path, number_of_clusters, debug=args.debug) if not resume: message = u.dated("Found %s clusters out of %s. Resuming.\n" % (len(cluster_ids), number_of_clusters)) u.log_message(message, log_file=session_file, console=args.verbosity) clusters = cluster_ids number_of_clusters -= len(cluster_ids) cluster_args = r.set_cluster_args(args, fields=fields, cluster_fields=args.cluster_fields_) clusters, cluster_ids = r.create_clusters(datasets, clusters, cluster_args, args, api, path, session_file, log) # If a cluster is provided, we use it. elif args.cluster: cluster_ids = [args.cluster] clusters = cluster_ids[:] elif args.clusters or args.cluster_tag: clusters = cluster_ids[:] # If we are going to predict we must retrieve the clusters if cluster_ids and args.test_set: clusters, cluster_ids = r.get_clusters(clusters, args, api, session_file) return clusters, cluster_ids, resume
def remote_prediction(model, test_dataset, batch_prediction_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes a prediction for each entry in the `test_set`. Predictions are computed remotely using the batch prediction call. """ model_id = bigml.api.get_resource_id( \ model) batch_prediction_args.update({"probability": True, "confidence": False}) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch prediction not found. Resuming.\n") resume, batch_prediction = c.checkpoint(c.is_batch_prediction_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_prediction = create_batch_prediction(model_id, test_dataset, batch_prediction_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: file_name = api.download_batch_prediction(batch_prediction, prediction_file) if file_name is None: sys.exit("Failed downloading CSV.") if args.to_dataset: batch_prediction = bigml.api.check_resource(batch_prediction, api=api) new_dataset = bigml.api.get_dataset_id( batch_prediction['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch prediction dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_prediction_dataset", path, new_dataset, mode='a')
def samples_processing(datasets, samples, sample_ids, api, args, resume, session_file=None, path=None, log=None): """Creates or retrieves samples from the input data """ # If we have a dataset but not a sample, we create the sample if the # no_sample flag hasn't been set up. if datasets and not (has_samples(args) or args.no_sample): sample_ids = [] samples = [] # Only 1 sample per bigmler command at present number_of_samples = 1 if resume: resume, sample_ids = c.checkpoint(c.are_samples_created, path, number_of_samples, debug=args.debug) if not resume: message = u.dated("Found %s samples out of %s. Resuming.\n" % (len(sample_ids), number_of_samples)) u.log_message(message, log_file=session_file, console=args.verbosity) samples = sample_ids number_of_samples -= len(sample_ids) sample_args = r.set_sample_args(args) samples, sample_ids = r.create_samples(datasets, samples, sample_args, args, api, path, session_file, log) # If a sample is provided, we use it. elif args.sample: sample_ids = [args.sample] samples = sample_ids[:] elif args.samples or args.sample_tag: samples = sample_ids[:] # We must retrieve the samples' output to store them as CSV files if sample_ids and needs_sample_fields(args): samples, sample_ids = r.get_samples(samples, args, api, session_file=session_file) return samples, sample_ids, resume
def source_processing( api, args, resume, csv_properties=None, multi_label_data=None, session_file=None, path=None, log=None ): """Creating or retrieving a data source from input arguments """ source = None fields = None if args.training_set or (hasattr(args, "evaluate") and args.evaluate and args.test_set): # If resuming, try to extract args.source form log files if resume: message = u.dated("Source not found. Resuming.\n") resume, args.source = c.checkpoint( c.is_source_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity, ) # If neither a previous source, dataset or model are provided. # we create a new one. Also if --evaluate and test data are provided # we create a new dataset to test with. data_set, data_set_header = r.data_to_source(args) if data_set is not None: source_args = r.set_source_args(args, multi_label_data=multi_label_data, data_set_header=data_set_header) source = r.create_source(data_set, source_args, args, api, path, session_file, log) # If a source is provided either through the command line or in resume # steps, we use it. elif args.source: source = bigml.api.get_source_id(args.source) # If we already have source, we check that is finished, extract the # fields, and update them if needed. if source: source = r.get_source(source, api, args.verbosity, session_file) if "source_parser" in source["object"]: source_parser = source["object"]["source_parser"] if "missing_tokens" in source_parser: csv_properties["missing_tokens"] = source_parser["missing_tokens"] if "locale" in source_parser: csv_properties["data_locale"] = source_parser["locale"] # No changes if user locale is the one in the source. if args.user_locale is not None and bigml_locale(args.user_locale) == source_parser["locale"]: args.user_locale = None fields = Fields(source["object"]["fields"], **csv_properties) if args.field_attributes_ or args.types_ or args.user_locale or args.json_args.get("source"): source_args = r.set_source_args(args, fields=fields) source = r.update_source(source, source_args, args, api, session_file) fields = Fields(source["object"]["fields"], **csv_properties) return source, resume, csv_properties, fields
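# The source update step above sends field-level changes back to the source,
# as --types or field attribute options would. A minimal sketch of the kind of
# payload the bindings accept; the source and field ids are placeholders, and
# set_source_args builds the equivalent dictionary in BigMLer.
from bigml.api import BigML

api = BigML()
source = "source/5143a51a37203f2cf7000970"   # placeholder id
source = api.update_source(source, {
    "fields": {"000000": {"optype": "categorical"}}})
api.ok(source)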
def topic_model_processing(datasets, topic_models, topic_model_ids, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates or retrieves topic models from the input data """ # If we have a dataset but not a topic model, we create the topic model # if the no_topic_model # flag hasn't been set up. if datasets and not (has_topic_models(args) or args.no_topic_model): topic_model_ids = [] topic_models = [] # Only 1 topic model per bigmler command at present number_of_topic_models = 1 if resume: resume, topic_model_ids = c.checkpoint( c.are_topic_models_created, path, number_of_topic_models, debug=args.debug) if not resume: message = u.dated( "Found %s topic models out of %s. Resuming.\n" % (len(topic_model_ids), number_of_topic_models)) u.log_message(message, log_file=session_file, console=args.verbosity) topic_models = topic_model_ids number_of_topic_models -= len(topic_model_ids) topic_model_args = r.set_topic_model_args( \ args, fields=fields, topic_model_fields=args.topic_model_fields_) topic_models, topic_model_ids = r.create_topic_models( \ datasets, topic_models, topic_model_args, args, api, path, session_file, log) # If a topic model is provided, we use it. elif args.topic_model: topic_model_ids = [args.topic_model] topic_models = topic_model_ids[:] elif args.topic_models or args.topic_model_tag: topic_models = topic_model_ids[:] # If we are going to predict we must retrieve the topic models if topic_model_ids and args.test_set: topic_models, topic_model_ids = r.get_topic_models( topic_models, args, api, session_file) return topic_models, topic_model_ids, resume
def dataset_processing(source, training_set, test_set, model_ids, name, description, fields, dataset_fields, api, args, resume, csv_properties=None, session_file=None, path=None, log=None): """Creating or retrieving dataset from input arguments """ dataset = None if (training_set or args.source or (args.evaluate and test_set)): # if resuming, try to extract args.dataset form log files if resume: message = u.dated("Dataset not found. Resuming.\n") resume, args.dataset = c.checkpoint(c.is_dataset_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) # If we have a source but no dataset or model has been provided, we # create a new dataset if the no_dataset option isn't set up. Also # if evaluate is set and test_set has been provided. if ((source and not args.dataset and not args.model and not model_ids and not args.no_dataset) or (args.evaluate and args.test_set and not args.dataset)): dataset_args = r.set_dataset_args(name, description, args, fields, dataset_fields) dataset = r.create_dataset(source, dataset_args, args, api, path, session_file, log) # If a dataset is provided, let's retrieve it. elif args.dataset: dataset = bigml.api.get_dataset_id(args.dataset) # If we already have a dataset, we check the status and get the fields if # we hadn't them yet. if dataset: dataset = r.get_dataset(dataset, api, args.verbosity, session_file) if not csv_properties and 'locale' in dataset['object']: csv_properties = {'data_locale': dataset['object']['locale']} fields = Fields(dataset['object']['fields'], **csv_properties) if args.public_dataset: r.publish_dataset(dataset, api, args, session_file) return dataset, resume, csv_properties, fields
def fusion_processing(fusion, fusion_ids,
                      api, args, resume, fields=None,
                      session_file=None, path=None, log=None):
    """Creates or retrieves a fusion from the input data
    """
    # If we have a models' list but not a fusion, we create the fusion if
    # the no_model flag hasn't been set up.
    if args.fusion_models_ is not None and not has_fusion(args):
        fusion_ids = []
        fusions = []

        # Only 1 fusion per bigmler command at present
        number_of_fusions = 1
        if resume:
            resume, fusion_ids = c.checkpoint(
                c.are_fusions_created, path,
                number_of_fusions, debug=args.debug)
            if not resume:
                message = u.dated("Found %s fusions out of %s."
                                  " Resuming.\n"
                                  % (len(fusion_ids),
                                     number_of_fusions))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)

                fusion = fusion_ids[0]
                first_model_id = api.get_fusion(fusion)[
                    "object"]["fusion"]["models"][0]["id"]
                first_model_kind = api.get_fusion(fusion)[
                    "object"]["fusion"]["models"][0]["kind"]
                first_model = api.getters[first_model_kind](first_model_id)
                fields = Fields(first_model)

                fusions = fusion_ids
                number_of_fusions -= len(fusion_ids)

        fusion_args = r.set_fusion_args(args, fields)

        fusion = r.create_fusion(
            args.fusion_models_, fusion, fusion_args,
            args, api, path, session_file, log)

    # If a fusion is provided, we use it.
    elif args.fusion:
        fusion_ids = [args.fusion]
        fusion = fusion_ids[0]

    elif args.fusion or args.fusion_tag:
        fusion = fusion_ids[0]

    # If we are going to create predictions, we must retrieve the fusion
    if fusion_ids and args.test_set:
        fusion = r.get_fusion(fusion, args, api, session_file)
        args.objective_field = fusion['object']['objective_field_name']

    return fusion, resume
def logistic_regressions_processing(datasets, logistic_regressions, \ logistic_regression_ids, api, args, resume, fields=None, \ session_file=None, path=None, log=None): """Creates or retrieves logistic regression from the input data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_logistic_regression(args) or \ args.no_logistic_regression): logistic_regression_ids = [] logistic_regressions = [] # Only 1 logistic regression per bigmler command at present number_of_logistic_regressions = 1 if resume: resume, logistic_regression_ids = c.checkpoint( \ c.are_logistic_regressions_created, path, \ number_of_logistic_regressions, debug=args.debug) if not resume: message = u.dated("Found %s logistic regressions out of %s." " Resuming.\n" % (len(logistic_regression_ids), number_of_logistic_regressions)) u.log_message(message, log_file=session_file, console=args.verbosity) logistic_regressions = logistic_regression_ids number_of_logistic_regressions -= len(logistic_regression_ids) logistic_regression_args = r.set_logistic_regression_args( \ args, fields=fields, \ logistic_regression_fields=args.logistic_fields_, objective_id=args.objective_id_) logistic_regressions, logistic_regression_ids = \ r.create_logistic_regressions( \ datasets, logistic_regressions, logistic_regression_args, \ args, api, path, session_file, log) # If a logistic regression is provided, we use it. elif args.logistic_regression: logistic_regression_ids = [args.logistic_regression] logistic_regressions = logistic_regression_ids[:] elif args.logistic_regressions or args.logistic_regression_tag: logistic_regressions = logistic_regression_ids[:] # If we are going to predict we must retrieve the logistic regressions if logistic_regression_ids and (args.test_set or args.export_fields): logistic_regressions, logistic_regression_ids = \ r.get_logistic_regressions(logistic_regressions, args, api, \ session_file) return logistic_regressions, logistic_regression_ids, resume
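# As with the other model-type helpers, the creation step ultimately calls the
# bindings. A minimal sketch; the dataset id and objective field id are
# placeholders.
from bigml.api import BigML

api = BigML()
dataset = "dataset/5143a51a37203f2cf7000974"          # placeholder id
logistic_regression = api.create_logistic_regression(
    dataset, {"objective_field": "000004"})           # placeholder objective id
api.ok(logistic_regression)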
def deepnets_processing(datasets, deepnets, deepnet_ids,
                        api, args, resume, fields=None,
                        session_file=None, path=None, log=None):
    """Creates or retrieves deepnets from the input data
    """
    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set up.
    if datasets and not (has_deepnet(args) or args.no_deepnet):
        deepnet_ids = []
        deepnets = []

        # Only 1 deepnet per bigmler command at present
        number_of_deepnets = 1
        if resume:
            resume, deepnet_ids = c.checkpoint(
                c.are_deepnets_created, path,
                number_of_deepnets, debug=args.debug)
            if not resume:
                message = u.dated("Found %s deepnets out of %s."
                                  " Resuming.\n"
                                  % (len(deepnet_ids),
                                     number_of_deepnets))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)

                deepnets = deepnet_ids
                number_of_deepnets -= len(deepnet_ids)

        deepnet_args = r.set_deepnet_args(
            args, fields=fields,
            deepnet_fields=args.deepnet_fields_,
            objective_id=args.objective_id_)

        deepnets, deepnet_ids = r.create_deepnets(
            datasets, deepnets, deepnet_args,
            args, api, path, session_file, log)

    # If a deepnet is provided, we use it.
    elif args.deepnet:
        deepnet_ids = [args.deepnet]
        deepnets = deepnet_ids[:]

    elif args.deepnets or args.deepnet_tag:
        deepnets = deepnet_ids[:]

    # If we are going to predict we must retrieve the deepnets
    if deepnet_ids and (args.test_set or args.export_fields):
        deepnets, deepnet_ids = r.get_deepnets(
            deepnets, args, api, session_file)

    return deepnets, deepnet_ids, resume
def time_series_processing(datasets, time_series_set, \ time_series_ids, api, args, resume, fields=None, \ session_file=None, path=None, log=None): """Creates or retrieves time_series from the input data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_time_series(args) or \ args.no_time_series): time_series_ids = [] time_series_set = [] # Only 1 time-series per bigmler command at present number_of_time_series = 1 if resume: resume, time_series_ids = c.checkpoint( \ c.are_time_series_created, path, \ number_of_time_series, debug=args.debug) if not resume: message = u.dated( "Found %s time-series out of %s." " Resuming.\n" % (len(time_series_ids), number_of_time_series)) u.log_message(message, log_file=session_file, console=args.verbosity) time_series_set = time_series_ids number_of_time_series -= len(time_series_ids) time_series_args = r.set_time_series_args( \ args, fields=fields, objective_id=args.objective_id_) time_series_set, time_series_ids = \ r.create_time_series( \ datasets, time_series_set, time_series_args, \ args, api, path, session_file, log) # If a time_series is provided, we use it. elif args.time_series: time_series_ids = [args.time_series] time_series_set = time_series_ids[:] elif args.time_series_set or args.time_series_tag: time_series_set = time_series_ids[:] # If we are going to predict we must retrieve the time-series if time_series_ids and args.export_fields: time_series_set, time_series_ids = \ r.get_time_series(time_series_set, args, api, \ session_file) return time_series_set, time_series_ids, resume
def create_new_dataset(datasets, api, args, resume, fields=None,
                       session_file=None, path=None, log=None):
    """Generates a new dataset using the generators given in a generators file
       or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug, suffix=suffix)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        dataset_args = r.set_dataset_args(args, fields)
        if args.multi_dataset and args.multi_dataset_json:
            dataset_args.update(args.multi_dataset_json)
        elif hasattr(args, 'anomalies_dataset') and args.anomalies_dataset:
            dataset_args.update({'lisp_filter': args.anomaly_filter_})
        elif hasattr(args, 'lisp_filter') and args.lisp_filter:
            dataset_args.update({'lisp_filter': args.lisp_filter})
        elif hasattr(args, 'json_filter') and args.json_filter:
            dataset_args.update({'json_filter': args.json_filter})
        else:
            dataset_args.update(args.dataset_json_generators)
        new_dataset = r.create_dataset(origin_resource, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file, log=log,
                                       dataset_type=suffix)
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
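The filter branches above are mutually exclusive: a multi-dataset JSON spec wins, then an anomalies-based lisp filter, then an explicit lisp or JSON filter, and only otherwise the generators from the generators file. A dependency-free sketch of that precedence, where `FakeArgs` and its attribute values are invented for illustration and only mirror the attribute names used above:

# Sketch of the dataset_args precedence applied in create_new_dataset.
# FakeArgs and its values are made up; nothing here touches the API.
class FakeArgs(object):
    multi_dataset = False
    multi_dataset_json = None
    anomalies_dataset = None
    anomaly_filter_ = None
    lisp_filter = '(> (f "age") 18)'
    json_filter = None
    dataset_json_generators = {"new_fields": []}


def build_dataset_args(args, base=None):
    dataset_args = dict(base or {})
    if args.multi_dataset and args.multi_dataset_json:
        dataset_args.update(args.multi_dataset_json)
    elif getattr(args, 'anomalies_dataset', None):
        dataset_args.update({'lisp_filter': args.anomaly_filter_})
    elif getattr(args, 'lisp_filter', None):
        dataset_args.update({'lisp_filter': args.lisp_filter})
    elif getattr(args, 'json_filter', None):
        dataset_args.update({'json_filter': args.json_filter})
    else:
        dataset_args.update(args.dataset_json_generators)
    return dataset_args


print(build_dataset_args(FakeArgs()))
# -> {'lisp_filter': '(> (f "age") 18)'}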
def library_processing(api, args, session_file=None, path=None, log=None):
    """Creating or retrieving a library

    """
    library = None
    resume = args.resume
    if args.code_file or args.code:
        # If resuming, try to extract args.library from the log files
        if resume:
            message = u.dated("Library not found. Resuming.\n")
            resume, library = c.checkpoint(
                c.is_library_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)

        if not resume:
            args.resume = resume
            if args.code_file:
                try:
                    with open(args.code_file) as code_file:
                        source_code = code_file.read()
                except IOError:
                    sys.exit("Failed to find the source code file: %s" %
                             args.code_file)
            else:
                source_code = args.code
            # Check if there's a created project for it
            args.project_id = pp.project_processing(
                api, args, resume, session_file=session_file, path=path,
                log=log)
            # Check if we are upgrading
            if args.upgrade:
                library = u.get_last_resource(
                    "library", api, build_query_string(args))
                log_created_resources("library", path, library, mode='a')
                message = u.dated("Library found: %s\n"
                                  "    (library ID: %s)\n" %
                                  (args.name, library))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            if library is None:
                library_args = rl.set_library_args(args)
                add_version_tag(library_args, args.name)
                library = rl.create_library(source_code, library_args, args,
                                            api, path, session_file, log)
    return library
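Outside bigmler, creating a WhizzML library boils down to a single bindings call on the source code string. A minimal sketch, assuming BIGML_USERNAME and BIGML_API_KEY are set in the environment; the library name and the WhizzML snippet are invented examples:

# Minimal sketch of WhizzML library creation with the BigML Python bindings.
# Credentials come from the environment; the source code string and name are
# illustrative only.
from bigml.api import BigML

api = BigML()
source_code = '(define (add-tag res tag) (update res {"tags" [tag]}))'
library = api.create_library(source_code, {"name": "my helpers"})
api.ok(library)
print(library["resource"])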
def create_categories_datasets(dataset, distribution, fields, args, api,
                               resume, session_file=None, path=None, log=None,
                               other_label=OTHER):
    """Generates a new dataset using a subset of categories of the original one

    """
    if args.max_categories < 1:
        sys.exit("--max-categories can only be a positive number.")
    datasets = []
    categories_splits = [distribution[i: i + args.max_categories] for i
                         in range(0, len(distribution), args.max_categories)]
    number_of_datasets = len(categories_splits)

    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        for i in range(len(datasets), number_of_datasets):
            split = categories_splits[i]
            # Builds a Flatline expression that keeps the categories in the
            # split and maps any other category to `other_label`.
            category_selector = "(if (or"
            for element in split:
                category = element[0]
                category_selector += " (= v \"%s\")" % category
            category_selector += ") v \"%s\")" % other_label
            category_generator = "(let (v (f %s)) %s)" % (
                fields.objective_field, category_selector)
            try:
                dataset_args = {
                    "all_but": [fields.objective_field],
                    "new_fields": [
                        {"name": fields.field_name(fields.objective_field),
                         "field": category_generator,
                         "label": "max_categories: %s" % args.max_categories}],
                    "user_metadata": {"max_categories": args.max_categories,
                                      "other_label": other_label}}
            except ValueError as exc:
                sys.exit(exc)
            new_dataset = r.create_dataset(
                dataset, dataset_args, args, api=api, path=path,
                session_file=session_file, log=log, dataset_type="parts")
            new_dataset = bigml.api.check_resource(new_dataset,
                                                   api.get_dataset)
            datasets.append(new_dataset)
    return datasets, resume
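The Flatline expression built in the loop above is just string concatenation. A standalone illustration of the same construction, with a made-up field id, made-up categories, and a placeholder other label (in the real code the split elements are `(category, count)` pairs and the default label is the OTHER constant):

# Standalone illustration of the Flatline generator built above: categories
# in the split keep their value, anything else becomes `other_label`.
def category_generator(objective_field, split, other_label="***other***"):
    selector = "(if (or"
    for category in split:
        selector += " (= v \"%s\")" % category
    selector += ") v \"%s\")" % other_label
    return "(let (v (f %s)) %s)" % (objective_field, selector)


print(category_generator("000004", ["setosa", "versicolor"]))
# -> (let (v (f 000004)) (if (or (= v "setosa") (= v "versicolor")) v "***other***"))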
def create_new_dataset(datasets, api, args, resume, name=None,
                       description=None, fields=None, dataset_fields=None,
                       objective_field=None, session_file=None, path=None,
                       log=None):
    """Generates a new dataset using the generators given in a generators file
       or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug, suffix=suffix)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        dataset_args = r.set_dataset_args(
            name, description, args, fields, dataset_fields,
            objective_field=objective_field)
        if args.multi_dataset and args.multi_dataset_json:
            dataset_args.update(args.multi_dataset_json)
        else:
            dataset_args.update(args.dataset_json_generators)
        new_dataset = r.create_dataset(origin_resource, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file, log=log,
                                       dataset_type=suffix)
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
def remote_predict(model, test_dataset, batch_prediction_args, args, api,
                   resume, prediction_file=None, session_file=None,
                   path=None, log=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions are computed remotely using the batch predictions call.
    """
    if args.ensemble is not None:
        model_or_ensemble = args.ensemble
    else:
        model_or_ensemble = bigml.api.get_model_id(model)
    # If resuming, try to extract the batch prediction from the log files
    if resume:
        message = u.dated("Batch prediction not found. Resuming.\n")
        resume, batch_prediction = c.checkpoint(
            c.is_batch_prediction_created, path, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if not resume:
        batch_prediction = create_batch_prediction(
            model_or_ensemble, test_dataset, batch_prediction_args, args,
            api, session_file=session_file, path=path, log=log)
    if not args.no_csv:
        api.download_batch_prediction(batch_prediction, prediction_file)
    if args.to_dataset:
        batch_prediction = bigml.api.check_resource(batch_prediction, api=api)
        new_dataset = bigml.api.get_dataset_id(
            batch_prediction['object']['output_dataset_resource'])
        if new_dataset is not None:
            message = u.dated("Batch prediction dataset created: %s\n"
                              % u.get_url(new_dataset))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            u.log_created_resources("batch_prediction_dataset", path,
                                    new_dataset, mode='a')
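remote_predict wraps the standard batch prediction flow of the BigML Python bindings. A minimal sketch of that flow outside bigmler, assuming BIGML_USERNAME and BIGML_API_KEY are set in the environment and that the CSV paths exist; file names and the extra argument are illustrative:

# Minimal sketch of the remote batch prediction flow that remote_predict
# wraps, using the BigML Python bindings directly. Credentials are read
# from the environment; the file names are invented examples.
from bigml.api import BigML

api = BigML()
source = api.create_source("training.csv")
dataset = api.create_dataset(source)
model = api.create_model(dataset)

test_source = api.create_source("test.csv")
test_dataset = api.create_dataset(test_source)

batch_prediction = api.create_batch_prediction(model, test_dataset,
                                               {"all_fields": True})
api.ok(batch_prediction)
api.download_batch_prediction(batch_prediction, filename="predictions.csv")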
def remote_predict(models, test_reader, prediction_file, api, resume=False,
                   verbosity=True, output_path=None, method=PLURALITY_CODE,
                   tags="", session_file=None, log=None, debug=False,
                   prediction_info=None):
    """Retrieves predictions remotely, combines them and saves them to file

    """
    predictions_files = []
    prediction_args = {
        "tags": tags
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False
    raw_input_data_list = []
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(), debug=debug)):
            if not message_logged:
                message = u.dated("Creating remote predictions.")
                u.log_message(message, log_file=session_file,
                              console=verbosity)
                message_logged = True
            # One unbuffered CSV writer per model
            predictions_file = csv.writer(open(predictions_file, 'w', 0),
                                          lineterminator="\n")
            for input_data in test_reader:
                raw_input_data_list.append(input_data)
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(model, input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction)
                predictions_file.writerow(prediction_row)
    combine_votes(predictions_files, Model(models[0]).to_prediction,
                  prediction_file, method, prediction_info,
                  raw_input_data_list)
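This per-model variant writes one prediction file per model and then merges them with combine_votes; for the default PLURALITY_CODE method that merge is simply a majority vote per row. A dependency-free illustration of that combination step (the real code reads the per-model CSV files and also weighs confidences for other methods):

# Dependency-free illustration of plurality vote combination, the default
# method used when per-model predictions are merged.
from collections import Counter


def plurality(row_predictions):
    """Returns the most frequent prediction in a row of per-model votes."""
    return Counter(row_predictions).most_common(1)[0][0]


rows = [["Iris-setosa", "Iris-setosa", "Iris-versicolor"],
        ["Iris-virginica", "Iris-versicolor", "Iris-versicolor"]]
print([plurality(row) for row in rows])
# -> ['Iris-setosa', 'Iris-versicolor']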
def anomalies_processing(datasets, anomalies, anomaly_ids, api, args, resume,
                         fields=None, session_file=None, path=None, log=None):
    """Creates or retrieves anomaly detectors from the command data

    """
    # If we have a dataset but no anomaly detector, we create one unless the
    # --no-anomaly flag has been set.
    if datasets and not (has_anomalies(args) or args.no_anomaly):
        anomaly_ids = []
        anomalies = []

        # Only 1 anomaly detector per bigmler command at present
        number_of_anomalies = 1
        if resume:
            resume, anomaly_ids = c.checkpoint(
                c.are_anomalies_created, path, number_of_anomalies,
                debug=args.debug)
            if not resume:
                message = u.dated("Found %s anomaly detectors out of %s."
                                  " Resuming.\n"
                                  % (len(anomaly_ids), number_of_anomalies))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)

        anomalies = anomaly_ids
        number_of_anomalies -= len(anomaly_ids)

        anomaly_args = r.set_anomaly_args(args, fields=fields,
                                          anomaly_fields=args.anomaly_fields_)
        anomalies, anomaly_ids = r.create_anomalies(datasets, anomalies,
                                                    anomaly_args, args, api,
                                                    path, session_file, log)
    # If an anomaly detector is provided, we use it.
    elif args.anomaly:
        anomaly_ids = [args.anomaly]
        anomalies = anomaly_ids[:]
    elif args.anomalies or args.anomaly_tag:
        anomalies = anomaly_ids[:]

    # If we are going to predict we must retrieve the anomalies
    if anomaly_ids and args.test_set:
        anomalies, anomaly_ids = r.get_anomalies(anomalies, args, api,
                                                 session_file)

    return anomalies, anomaly_ids, resume
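Once anomalies_processing has an anomaly detector, scoring goes through the bindings' anomaly score calls. A minimal sketch, assuming credentials in the environment; the training file, field names and values are invented for illustration:

# Minimal sketch of scoring one input row against an anomaly detector with
# the BigML Python bindings. Credentials come from the environment; the
# input field names and values are made up.
from bigml.api import BigML

api = BigML()
source = api.create_source("training.csv")
dataset = api.create_dataset(source)
anomaly = api.create_anomaly(dataset)
api.ok(anomaly)

score = api.create_anomaly_score(anomaly, {"sepal length": 4.2,
                                           "petal width": 9.9})
api.ok(score)
print(score["object"]["score"])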