def connector_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ command_args, _, api, session_file, _ = get_context(args, SETTINGS) path = u.check_dir(command_args.output) log = None if command_args.log_file: u.check_dir(command_args.log_file) log = command_args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) if not command_args.external_connector_id and \ u.has_connection_info(command_args): # create connector pec.connector_processing(api, command_args, command_args.resume, session_file=session_file, path=path, log=log) if command_args.external_connector_id and ( command_args.connector_attributes or command_args.name or command_args.tag or command_args.description or command_args.category): # update connector's attributes pec.update_external_connector(command_args, api, command_args.resume, \ session_file=session_file) u.log_message("_" * 80 + "\n", log_file=session_file) u.print_generated_files(command_args.output_dir, log_file=session_file, verbosity=command_args.verbosity)
def set_source_args(data_set_header, name, description, args):
    """Returns a source arguments dict

    """
    source_args = {
        "name": name,
        "description": description,
        "category": args.category,
        "tags": args.tag,
        "source_parser": {"header": data_set_header}}
    # If user has given an OS locale, try to add the locale used in bigml.com
    if args.user_locale is not None:
        source_locale = bigml_locale(args.user_locale)
        if source_locale is None:
            log_message("WARNING: %s locale equivalence not found."
                        " Using %s instead.\n" % (args.user_locale,
                                                  LOCALE_DEFAULT),
                        log_file=None, console=True)
            source_locale = LOCALE_DEFAULT
        source_args["source_parser"].update({'locale': source_locale})
    return source_args
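# Usage sketch for the helper above: build the source arguments and upload a
# local CSV with them. This is a hypothetical wrapper, not part of the module;
# `args` is assumed to be BigMLer's parsed command-line namespace (providing
# `category`, `tag` and `user_locale`), credentials are assumed to be in the
# environment, and the file name and metadata values are illustrative only.
def _example_create_source(args):
    source_args = set_source_args(data_set_header=True,
                                  name="iris source",
                                  description="uploaded by bigmler",
                                  args=args)
    api = bigml.api.BigML()
    # same call shape as create_source() below
    return api.create_source("./iris.csv", source_args)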
def project_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ command_args, command, api, session_file, resume = get_context(args, SETTINGS) path = u.check_dir(command_args.output) log = None if command_args.log_file: u.check_dir(command_args.log_file) log = command_args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) if not command_args.project_id and command_args.name: command_args.project = command_args.name if command_args.project: # create project pp.project_processing( api, command_args, command_args.resume, session_file=session_file, path=path, log=log, create=True) if command_args.project_id and ( command_args.project_attributes or command_args.name or command_args.tag or command_args.description or command_args.category): # update project's attributes pp.update_project(command_args, api, command_args.resume, \ session_file=session_file) u.log_message("_" * 80 + "\n", log_file=session_file) u.print_generated_files(command_args.output_dir, log_file=session_file, verbosity=command_args.verbosity)
def prediction(models, fields, args, session_file=None):
    """Computes a supervised model prediction
    for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args,
            args.objective_field, quality="probability")

        # Local predictions: Predictions are computed locally
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_prediction(models, test_reader, output, args, exclude=exclude)
    test_reader.close()
def delete_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) if command_args.resume: command_args, session_file, _ = get_stored_command( args, command_args.debug, command_log=COMMAND_LOG, dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG) else: if command_args.output_dir is None: command_args.output_dir = a.NOW directory = u.check_dir(os.path.join(command_args.output_dir, "tmp")) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE)) except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) # If --clear-logs the log files are cleared if "--clear-logs" in args: clear_log_files(LOG_FILES) # Creates the corresponding api instance api = a.get_api_instance(command_args, u.check_dir(session_file)) delete_resources(command_args, api) u.log_message("_" * 80 + "\n", log_file=session_file)
def update_deepnet(deepnet, deepnet_args, args,
                   api=None, path=None, session_file=None):
    """Updates deepnet properties

    """
    if api is None:
        api = bigml.api.BigML()

    message = dated("Updating deepnet. %s\n" %
                    get_url(deepnet))
    log_message(message, log_file=session_file,
                console=args.verbosity)
    deepnet = api.update_deepnet(deepnet, deepnet_args)
    check_resource_error(deepnet,
                         "Failed to update deepnet: %s"
                         % deepnet['resource'])
    deepnet = check_resource(deepnet,
                             api.get_deepnet,
                             query_string=FIELDS_QS,
                             raise_on_error=True)
    if is_shared(deepnet):
        message = dated("Shared deepnet link. %s\n" %
                        get_url(deepnet, shared=True))
        log_message(message, log_file=session_file, console=args.verbosity)
        if args.reports:
            report(args.reports, path, deepnet)

    return deepnet
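# Usage sketch: a hypothetical call to `update_deepnet` that renames an
# existing deepnet. The resource id, paths and metadata values are
# placeholders; `args` is assumed to be the parsed command-line namespace
# (providing `verbosity` and `reports`). Only generic update keys
# (`name`, `description`) are used.
def _example_update_deepnet(args):
    api = bigml.api.BigML()
    return update_deepnet("deepnet/5af06df84e17277501000000",
                          {"name": "renamed deepnet",
                           "description": "updated by bigmler"},
                          args, api=api, path="./output",
                          session_file="./output/session_log")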
def remote_centroid(cluster, test_dataset, batch_centroid_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes a centroid for each entry in the `test_set`. Predictions are computed remotely using the batch centroid call. """ cluster_id = bigml.api.get_cluster_id(cluster) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch centroid not found. Resuming.\n") resume, batch_centroid = c.checkpoint( c.is_batch_centroid_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_centroid = create_batch_centroid( cluster_id, test_dataset, batch_centroid_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: file_name = api.download_batch_centroid(batch_centroid, prediction_file) if file_name is None: sys.exit("Failed downloading CSV.") if args.to_dataset: batch_centroid = bigml.api.check_resource(batch_centroid, api=api) new_dataset = bigml.api.get_dataset_id( batch_centroid['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch centroid dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_centroid_dataset", path, new_dataset, mode='a')
def create_batch_prediction(model_or_ensemble, test_dataset,
                            batch_prediction_args, args,
                            api=None, session_file=None,
                            path=None, log=None):
    """Creates remote batch_prediction

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating batch prediction.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    batch_prediction = api.create_batch_prediction(model_or_ensemble,
                                                   test_dataset,
                                                   batch_prediction_args)
    log_created_resources("batch_prediction", path,
                          bigml.api.get_batch_prediction_id(batch_prediction),
                          open_mode='a')
    batch_prediction_id = check_resource_error(
        batch_prediction, "Failed to create batch prediction: ")
    try:
        batch_prediction = check_resource(batch_prediction,
                                          api.get_batch_prediction)
    except ValueError as exception:
        sys.exit("Failed to get a finished batch prediction: %s"
                 % str(exception))
    return batch_prediction
def model_per_label(labels, datasets, fields, objective_field, api, args, resume, name=None, description=None, model_fields=None, multi_label_data=None, session_file=None, path=None, log=None): """Creates a model per label for multi-label datasets """ model_ids = [] models = [] args.number_of_models = len(labels) if resume: resume, model_ids = c.checkpoint( c.are_models_created, path, args.number_of_models, debug=args.debug) if not resume: message = u.dated("Found %s models out of %s." " Resuming.\n" % (len(model_ids), args.number_of_models)) u.log_message(message, log_file=session_file, console=args.verbosity) models = model_ids args.number_of_models = len(labels) - len(model_ids) model_args_list = r.set_label_model_args( name, description, args, labels, multi_label_data, fields, model_fields, objective_field) # create models changing the input_field to select # only one label at a time models, model_ids = r.create_models( datasets, models, model_args_list, args, api, path, session_file, log) args.number_of_models = 1 return models, model_ids, resume
def create_source(data_set, source_args, args, api=None, path=None,
                  session_file=None, log=None, source_type=None):
    """Creates remote source

    """
    if api is None:
        api = bigml.api.BigML()
    suffix = "" if source_type is None else "%s " % source_type
    message = dated("Creating %ssource.\n" % suffix)
    log_message(message, log_file=session_file, console=args.verbosity)

    source = api.create_source(data_set, source_args,
                               progress_bar=args.progress_bar)
    if path is not None:
        try:
            suffix = "_" + source_type if source_type else ""
            with open("%s/source%s" % (path, suffix), 'w', 0) as source_file:
                source_file.write("%s\n" % source['resource'])
                source_file.write("%s\n" % source['object']['name'])
        except IOError as exc:
            sys.exit("%s: Failed to write %s/source" % (str(exc), path))
def update_topic_model(topic_model, topic_model_args, args,
                       api=None, path=None, session_file=None):
    """Updates topic model properties

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Updating topic model. %s\n" %
                    get_url(topic_model))
    log_message(message, log_file=session_file,
                console=args.verbosity)
    topic_model = api.update_topic_model(topic_model, topic_model_args)
    check_resource_error(topic_model,
                         "Failed to update topic model: %s"
                         % topic_model['resource'])
    topic_model = check_resource(topic_model,
                                 api.get_topic_model,
                                 query_string=FIELDS_QS,
                                 raise_on_error=True)
    if is_shared(topic_model):
        message = dated("Shared topic model link. %s\n" %
                        get_url(topic_model, shared=True))
        log_message(message, log_file=session_file, console=args.verbosity)
        if args.reports:
            report(args.reports, path, topic_model)

    return topic_model
def create_batch_prediction(model_or_ensemble, test_dataset,
                            batch_prediction_args, verbosity,
                            api=None, session_file=None,
                            path=None, log=None):
    """Creates remote batch_prediction

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating batch prediction.\n")
    log_message(message, log_file=session_file, console=verbosity)
    batch_prediction = api.create_batch_prediction(model_or_ensemble,
                                                   test_dataset,
                                                   batch_prediction_args)
    log_created_resources("batch_prediction", path,
                          bigml.api.get_batch_prediction_id(batch_prediction),
                          open_mode='a')
    batch_prediction_id = check_resource_error(
        batch_prediction, "Failed to create batch prediction: ")
    try:
        batch_prediction = check_resource(batch_prediction,
                                          api.get_batch_prediction)
    except ValueError as exception:
        sys.exit("Failed to get a finished batch prediction: %s"
                 % str(exception))
    return batch_prediction
def update_association(association, association_args, args,
                       api=None, path=None, session_file=None):
    """Updates association properties

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Updating association. %s\n" %
                    get_url(association))
    log_message(message, log_file=session_file,
                console=args.verbosity)
    association = api.update_association(association, association_args)
    check_resource_error(association,
                         "Failed to update association: %s"
                         % association['resource'])
    association = check_resource(association,
                                 api.get_association,
                                 query_string=FIELDS_QS,
                                 raise_on_error=True)
    if is_shared(association):
        message = dated("Shared association link. %s\n" %
                        get_url(association, shared=True))
        log_message(message, log_file=session_file, console=args.verbosity)
        if args.reports:
            report(args.reports, path, association)

    return association
def remote_anomaly_score(anomaly, test_dataset, batch_anomaly_score_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes an anomaly score for each entry in the `test_set`. Predictions are computed remotely using the batch anomaly score call. """ anomaly_id = bigml.api.get_anomaly_id(anomaly) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch anomaly score not found. Resuming.\n") resume, batch_anomaly_score = c.checkpoint( c.is_batch_anomaly_score_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_anomaly_score = create_batch_anomaly_score( anomaly_id, test_dataset, batch_anomaly_score_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: api.download_batch_anomaly_score(batch_anomaly_score, prediction_file) if args.to_dataset: batch_anomaly_score = bigml.api.check_resource(batch_anomaly_score, api=api) new_dataset = bigml.api.get_dataset_id( batch_anomaly_score['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch anomaly score dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_anomaly_score_dataset", path, new_dataset, open_mode='a')
def create_evaluation(model_or_ensemble, dataset, evaluation_args, args,
                      api=None, path=None, session_file=None, log=None,
                      seed=SEED):
    """Create evaluation

       ``model_or_ensemble``: resource object or id for the model or ensemble
                              that should be evaluated
       ``dataset``: dataset object or id to evaluate with
       ``evaluation_args``: arguments for the ``create_evaluation`` call
       ``args``: input values for bigmler flags
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file
       ``seed``: seed for the dataset sampling (when needed)

    """
    if api is None:
        api = bigml.api.BigML()
    if args.cross_validation_rate > 0:
        evaluation_args.update(seed=seed)
    message = dated("Creating evaluation.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    evaluation = api.create_evaluation(model_or_ensemble, dataset,
                                       evaluation_args)
    log_created_resources("evaluation", path,
                          bigml.api.get_evaluation_id(evaluation))
    check_resource_error(evaluation, "Failed to create evaluation: ")
    log_message("%s\n" % evaluation['resource'], log_file=log)

    return evaluation
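# Usage sketch: evaluating an existing model against a held-out dataset with
# `create_evaluation`. This is a hypothetical helper; the resource ids and
# paths are placeholders and `args` is assumed to be the parsed command-line
# namespace (providing `cross_validation_rate` and `verbosity`).
def _example_create_evaluation(args):
    api = bigml.api.BigML()
    return create_evaluation("model/5af06df84e17277501000001",
                             "dataset/5af06df84e17277501000002",
                             {"name": "bigmler evaluation"},
                             args, api=api, path="./output",
                             session_file="./output/session_log")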
def main_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ # If --clear-logs the log files are cleared if "--clear-logs" in args: clear_log_files(LOG_FILES) command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) default_output = ('evaluation' if command_args.evaluate else 'predictions.csv') resume = command_args.resume if command_args.resume: command_args, session_file, output_dir = get_stored_command( args, command_args.debug, command_log=COMMAND_LOG, dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG) default_output = ('evaluation' if command_args.evaluate else 'predictions.csv') if command_args.predictions is None: command_args.predictions = os.path.join(output_dir, default_output) else: if command_args.output_dir is None: command_args.output_dir = a.NOW if command_args.predictions is None: command_args.predictions = os.path.join(command_args.output_dir, default_output) if len(os.path.dirname(command_args.predictions).strip()) == 0: command_args.predictions = os.path.join(command_args.output_dir, command_args.predictions) directory = u.check_dir(command_args.predictions) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: defaults_file = open(DEFAULTS_FILE, 'r') contents = defaults_file.read() defaults_file.close() defaults_copy = open(os.path.join(directory, DEFAULTS_FILE), 'w', 0) defaults_copy.write(contents) defaults_copy.close() except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) # Creates the corresponding api instance api = a.get_api_instance(command_args, u.check_dir(session_file)) if (a.has_train(command_args) or a.has_test(command_args) or command_args.votes_dirs): output_args = a.get_output_args(api, command_args, resume) a.transform_args(command_args, command.flags, api, command.user_defaults) compute_output(**output_args) u.log_message("_" * 80 + "\n", log_file=session_file)
def anomaly_score(anomalies, fields, args, session_file=None): """Computes an anomaly score for each entry in the `test_set`. """ test_set = args.test_set test_set_header = args.test_header output = args.predictions test_reader = TestReader(test_set, test_set_header, fields, None, test_separator=args.test_separator) with UnicodeWriter(output, lineterminator="\n") as output: # columns to exclude if input_data is added to the prediction field exclude = use_prediction_headers(args.prediction_header, output, test_reader, fields, args) # Local anomaly scores: Anomaly scores are computed locally using # the local anomaly detector method message = u.dated("Creating local anomaly scores.\n") u.log_message(message, log_file=session_file, console=args.verbosity) local_anomaly_score(anomalies, test_reader, output, args, exclude=exclude) test_reader.close()
def ensemble_processing(datasets, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates an ensemble of models from the input data """ ensembles = [] ensemble_ids = [] number_of_ensembles = len(datasets) if resume: resume, ensemble_ids = c.checkpoint( c.are_ensembles_created, path, number_of_ensembles, debug=args.debug) if not resume: message = u.dated("Found %s ensembles out of %s. Resuming.\n" % (len(ensemble_ids), number_of_ensembles)) u.log_message(message, log_file=session_file, console=args.verbosity) ensembles = ensemble_ids number_of_ensembles -= len(ensemble_ids) if number_of_ensembles > 0: ensemble_args = r.set_ensemble_args(args, fields=fields) ensembles, ensemble_ids, models, model_ids = r.create_ensembles( datasets, ensembles, ensemble_args, args, api=api, path=path, number_of_ensembles=number_of_ensembles, session_file=session_file, log=log) return ensembles, ensemble_ids, models, model_ids, resume
def evaluations_process(time_series_set, datasets, fields, dataset_fields, api, args, resume, session_file=None, path=None, log=None): """Evaluates time-series against datasets """ existing_evaluations = 0 evaluations = [] number_of_evaluations = len(time_series_set) if resume: resume, evaluations = c.checkpoint(c.are_evaluations_created, path, number_of_evaluations, debug=args.debug) if not resume: existing_evaluations = len(evaluations) message = u.dated("Found %s evaluations from %s. Resuming.\n" % (existing_evaluations, number_of_evaluations)) number_of_evaluations -= existing_evaluations u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: evaluation_args = r.set_evaluation_args(args, fields, dataset_fields) evaluations.extend(r.create_evaluations( time_series_set, datasets, evaluation_args, args, api, path=path, session_file=session_file, log=log, existing_evaluations=existing_evaluations)) return evaluations, resume
def create_kfold_datasets_file(args, api, common_options, resume=False): """Create the kfold dataset resources and store their ids in a file one per line """ message = ('Creating the kfold datasets............\n') u.log_message(message, log_file=session_file, console=args.verbosity) if args.output_dir is None: args.output_dir = a.NOW # retrieve dataset dataset_id = bigml.api.get_dataset_id(args.dataset) if dataset_id: dataset = api.check_resource(dataset_id, api.get_dataset) # check that kfold_field is unique fields = Fields(dataset, {"objective_field": args.objective_field, "objective_field_present": True}) objective_id = fields.field_id(fields.objective_field) kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields) # create jsons to generate partial datasets selecting_file_list, resume = create_kfold_json(args, kfold_field_name, objective_id, resume=resume) # generate test datasets datasets_file, resume = create_kfold_datasets(dataset_id, args, selecting_file_list, fields.objective_field, kfold_field_name, common_options, resume=resume) return datasets_file, fields.field_column_number(objective_id), resume return None, None, None
def create_batch_anomaly_score(anomaly, test_dataset,
                               batch_anomaly_score_args, args,
                               api=None, session_file=None,
                               path=None, log=None):
    """Creates remote batch anomaly score

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating batch anomaly score.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    batch_anomaly_score = api.create_batch_anomaly_score(
        anomaly, test_dataset, batch_anomaly_score_args, retries=None)
    log_created_resources(
        "batch_anomaly_score", path,
        bigml.api.get_batch_anomaly_score_id(batch_anomaly_score), mode='a')
    batch_anomaly_score_id = check_resource_error(
        batch_anomaly_score, "Failed to create batch anomaly score: ")
    try:
        batch_anomaly_score = check_resource(batch_anomaly_score,
                                             api.get_batch_anomaly_score,
                                             raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished batch anomaly score: %s"
                 % str(exception))
    return batch_anomaly_score
def get_time_series(time_series_ids, args, api=None, session_file=None):
    """Retrieves remote time-series in its actual status

    """
    if api is None:
        api = bigml.api.BigML()

    time_series_id = ""
    time_series_set = time_series_ids
    time_series_id = time_series_ids[0]
    message = dated("Retrieving %s. %s\n" %
                    (plural("time-series", len(time_series_ids)),
                     get_url(time_series_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    # only one time-series to predict at present
    try:
        # we need the whole fields structure when exporting fields
        query_string = FIELDS_QS if not args.export_fields else ALL_FIELDS_QS
        time_series = check_resource(time_series_ids[0],
                                     api.get_time_series,
                                     query_string=query_string,
                                     raise_on_error=True)
    except Exception as exception:
        sys.exit("Failed to get a finished time-series: %s" %
                 str(exception))
def get_models(model_ids, args, api=None, session_file=None):
    """Retrieves remote models in its actual status

    """
    if api is None:
        api = bigml.api.BigML()
    model_id = ""
    models = model_ids
    single_model = len(model_ids) == 1
    if single_model:
        model_id = model_ids[0]
    message = dated("Retrieving %s. %s\n" %
                    (plural("model", len(model_ids)),
                     get_url(model_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    if len(model_ids) < args.max_batch_models:
        models = []
        for model in model_ids:
            try:
                # if there's more than one model the first one must contain
                # the entire field structure to be used as reference.
                query_string = (ALL_FIELDS_QS if not single_model
                                and (len(models) == 0 or args.multi_label)
                                else FIELDS_QS)
                model = check_resource(model, api.get_model,
                                       query_string=query_string)
            except ValueError as exception:
                sys.exit("Failed to get a finished model: %s" %
                         str(exception))
            models.append(model)
        model = models[0]
def update_time_series(time_series, time_series_args, args,
                       api=None, path=None, session_file=None):
    """Updates time-series properties

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Updating time-series. %s\n" %
                    get_url(time_series))
    log_message(message, log_file=session_file,
                console=args.verbosity)
    time_series = api.update_time_series(time_series, time_series_args)
    check_resource_error(
        time_series,
        "Failed to update time-series: %s" % time_series['resource'])
    time_series = check_resource(time_series,
                                 api.get_time_series,
                                 query_string=FIELDS_QS,
                                 raise_on_error=True)
    if is_shared(time_series):
        message = dated("Shared time-series link. %s\n" %
                        get_url(time_series, shared=True))
        log_message(message, log_file=session_file, console=args.verbosity)
        if args.reports:
            report(args.reports, path, time_series)

    return time_series
def get_models(model_ids, args, api=None, session_file=None):
    """Retrieves remote models in its actual status

    """
    if api is None:
        api = bigml.api.BigML()
    model_id = ""
    models = model_ids
    single_model = len(model_ids) == 1
    if single_model:
        model_id = model_ids[0]
    message = dated("Retrieving %s. %s\n" %
                    (plural("model", len(model_ids)),
                     get_url(model_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    if len(model_ids) < args.max_batch_models:
        models = []
        for model in model_ids:
            try:
                # if there's more than one model the first one must contain
                # the entire field structure to be used as reference.
                query_string = (
                    ALL_FIELDS_QS if (
                        (not single_model and
                         (not models or args.multi_label)) or
                        not args.test_header)
                    else FIELDS_QS)
                model = check_resource(model, api.get_model,
                                       query_string=query_string,
                                       raise_on_error=True)
            except Exception as exception:
                sys.exit("Failed to get a finished model: %s" %
                         str(exception))
            models.append(model)
        model = models[0]
def get_models(model_ids, args, api=None, session_file=None):
    """Retrieves remote models in its actual status

    """
    if api is None:
        api = bigml.api.BigML()
    model_id = ""
    models = model_ids
    if len(model_ids) == 1:
        model_id = model_ids[0]
    message = dated("Retrieving %s. %s\n" %
                    (plural("model", len(model_ids)),
                     get_url(model_id)))
    log_message(message, log_file=session_file, console=args.verbosity)
    if len(model_ids) < args.max_batch_models:
        models = []
        for model in model_ids:
            try:
                model = check_resource(model, api.get_model,
                                       query_string=FIELDS_QS)
            except ValueError as exception:
                sys.exit("Failed to get a finished model: %s" %
                         str(exception))
            models.append(model)
        model = models[0]
def checkpoint(function, *args, **kwargs):
    """Redirects to each checkpoint function

    """
    common_parms = ['debug', 'message', 'log_file', 'console']
    debug = kwargs.get('debug', False)
    message = kwargs.get('message', None)
    log_file = kwargs.get('log_file', None)
    console = kwargs.get('console', False)
    f_kwargs = {key: value for key, value in kwargs.items()
                if key not in common_parms}
    result = function(*args, **f_kwargs)
    if debug:
        console_log("Checkpoint: checking %s with args:\n%s\n\nResult:\n%s\n" %
                    (function.__name__,
                     "\n".join([repr(arg) for arg in args]),
                     repr(result)))
    # resume is the first element in the result tuple
    if not result[0] and message is not None:
        log_message(message, log_file=log_file, console=console)
    return result
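# Usage sketch: how the generic `checkpoint` dispatcher above is typically
# driven, mirroring the `c.checkpoint(c.are_models_created, ...)` calls in the
# processing functions. This is a hypothetical helper; it assumes
# `are_models_created` lives in the same checkpoint module and returns a
# (resume_ok, model_ids) tuple, and the paths and counts are illustrative.
# The `message` is only logged when the checkpoint does not allow resuming.
def _example_checkpoint(args, path, number_of_models, session_file):
    resume, model_ids = checkpoint(
        are_models_created, path, number_of_models,
        debug=args.debug,
        message="Models not found. Resuming.\n",
        log_file=session_file, console=args.verbosity)
    return resume, model_ids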
def topic_distribution(topic_models, fields, args, session_file=None): """Computes a topic distribution for each entry in the `test_set`. """ test_set = args.test_set test_set_header = args.test_header output = args.predictions test_reader = TestReader(test_set, test_set_header, fields, None, test_separator=args.test_separator) with UnicodeWriter(output, lineterminator="\n") as output: # columns to exclude if input_data is added to the prediction field exclude, headers = use_prediction_headers(test_reader, fields, args) # Local topic distributions: Topic distributions are computed # locally using topic models' # method message = u.dated("Creating local topic distributions.\n") u.log_message(message, log_file=session_file, console=args.verbosity) local_topic_distribution(topic_models, test_reader, output, args, exclude=exclude, headers=headers) test_reader.close()
def create_dataset(source_or_dataset, dataset_args, args, api=None,
                   path=None, session_file=None, log=None,
                   dataset_type=None):
    """Creates remote dataset from source, dataset or datasets list

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating dataset.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
    dataset = api.create_dataset(source_or_dataset, dataset_args)
    suffix = "_" + dataset_type if dataset_type else ""
    log_created_resources("dataset%s" % suffix, path,
                          bigml.api.get_dataset_id(dataset), open_mode='a')
    dataset_id = check_resource_error(dataset, "Failed to create dataset: ")
    try:
        dataset = check_resource(dataset, api.get_dataset,
                                 query_string=ALL_FIELDS_QS)
    except ValueError as exception:
        sys.exit("Failed to get a finished dataset: %s" % str(exception))
def set_source_args(data_set_header, name, description, args,
                    multi_label_data=None):
    """Returns a source arguments dict

    """
    source_args = {
        "name": name,
        "description": description,
        "category": args.category,
        "tags": args.tag,
        "source_parser": {"header": data_set_header}}
    # If user has given an OS locale, try to add the locale used in bigml.com
    if args.user_locale is not None:
        source_locale = bigml_locale(args.user_locale)
        if source_locale is None:
            log_message("WARNING: %s locale equivalence not found."
                        " Using %s instead.\n" % (args.user_locale,
                                                  LOCALE_DEFAULT),
                        log_file=None, console=True)
            source_locale = LOCALE_DEFAULT
        source_args["source_parser"].update({'locale': source_locale})
    # If user has set a training separator, use it.
    if args.training_separator is not None:
        training_separator = args.training_separator.decode("string_escape")
        source_args["source_parser"].update({'separator': training_separator})
    # If uploading a multi-label file, add the user_metadata info needed to
    # manage the multi-label fields
    if args.multi_label and multi_label_data is not None:
        source_args.update(
            {"user_metadata": {"multi_label_data": multi_label_data}})
    if args.json_args['source']:
        source_args.update(args.json_args['source'])
    return source_args
def export_dataset(dataset, api, args, resume,
                   session_file=None, path=None):
    """Exports the dataset to a CSV file given by the user or a filename
       based on the dataset id by default.

    """
    filename = csv_name(args.to_csv, path, dataset)
    if resume:
        resume = c.checkpoint(
            c.is_dataset_exported, filename,
            debug=args.debug)
        if not resume:
            message = u.dated("No dataset exported. Resuming.\n")
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    else:
        message = u.dated("Exporting dataset to CSV file: %s\n" % filename)
        u.log_message(message, log_file=session_file, console=args.verbosity)
    if not resume:
        file_name = api.download_dataset(dataset, filename=filename)
        if file_name is None:
            sys.exit("Failed downloading CSV.")
    return resume
def evaluations_process(time_series_set, datasets, fields, dataset_fields, api, args, resume, session_file=None, path=None, log=None, objective_field=None): """Evaluates time-series against datasets """ existing_evaluations = 0 evaluations = [] number_of_evaluations = len(time_series_set) if resume: resume, evaluations = c.checkpoint(c.are_evaluations_created, path, number_of_evaluations, debug=args.debug) if not resume: existing_evaluations = len(evaluations) message = u.dated("Found %s evaluations from %s. Resuming.\n" % (existing_evaluations, number_of_evaluations)) number_of_evaluations -= existing_evaluations u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: evaluation_args = r.set_evaluation_args(args, fields, dataset_fields) evaluations.extend(r.create_evaluations( time_series_set, datasets, evaluation_args, args, api, path=path, session_file=session_file, log=log, existing_evaluations=existing_evaluations)) return evaluations, resume
def pca_processing(datasets, pca, \ pca_ids, api, args, resume, fields=None, \ session_file=None, path=None, log=None): """Creates or retrieves pca from the input data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_pca(args) or \ args.no_pca): pca_ids = [] pcas = [] # Only 1 pca per bigmler command at present number_of_pcas = 1 if resume: resume, pca_ids = c.checkpoint( \ c.are_pcas_created, path, \ number_of_pcas, debug=args.debug) if not resume: message = u.dated("Found %s pcas out of %s." " Resuming.\n" % (len(pca_ids), number_of_pcas)) u.log_message(message, log_file=session_file, console=args.verbosity) pcas = pca_ids number_of_pcas -= len(pca_ids) args.exclude_fields = [] if args.exclude_objective: dataset = datasets[0] fields = Fields(dataset) objective_id = \ fields.fields_by_column_number[fields.objective_field] args.exclude_fields = [objective_id] pca_args = r.set_pca_args( \ args, fields=fields, \ pca_fields=args.pca_fields_) pca = \ r.create_pca( \ datasets, pca, pca_args, \ args, api, path, session_file, log) # If a pca is provided, we use it. elif args.pca: pca_ids = [args.pca] pca = pca_ids[0] elif args.pca or args.pca_tag: pca = pca_ids[0] # If we are going to create projections, we must retrieve the pca if pca_ids and (args.test_set or args.export_fields): pca = \ r.get_pca(pca, args, api, session_file) return pca, resume
def remote_predict(model, test_dataset, batch_prediction_args, args, api, resume, prediction_file=None, session_file=None, path=None, log=None): """Computes a prediction for each entry in the `test_set`. Predictions are computed remotely using the batch predictions call. """ if args.ensemble is not None: model_or_ensemble = args.ensemble else: model_or_ensemble = bigml.api.get_model_id(model) # if resuming, try to extract dataset form log files if resume: message = u.dated("Batch prediction not found. Resuming.\n") resume, batch_prediction = c.checkpoint( c.is_batch_prediction_created, path, debug=args.debug, message=message, log_file=session_file, console=args.verbosity) if not resume: batch_prediction = create_batch_prediction( model_or_ensemble, test_dataset, batch_prediction_args, args, api, session_file=session_file, path=path, log=log) if not args.no_csv: api.download_batch_prediction(batch_prediction, prediction_file) if args.to_dataset: batch_prediction = bigml.api.check_resource(batch_prediction, api=api) new_dataset = bigml.api.get_dataset_id( batch_prediction['object']['output_dataset_resource']) if new_dataset is not None: message = u.dated("Batch prediction dataset created: %s\n" % u.get_url(new_dataset)) u.log_message(message, log_file=session_file, console=args.verbosity) u.log_created_resources("batch_prediction_dataset", path, new_dataset, mode='a')
def cluster_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ # If --clear-logs the log files are cleared if "--clear-logs" in args: clear_log_files(LOG_FILES) command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) resume = command_args.resume if command_args.resume: # Keep the debug option if set debug = command_args.debug # Restore the args of the call to resume from the command log file stored_command = StoredCommand(args, COMMAND_LOG, DIRS_LOG) command = Command(None, stored_command=stored_command) # Logs the issued command and the resumed command session_file = os.path.join(stored_command.output_dir, SESSIONS_LOG) stored_command.log_command(session_file=session_file) # Parses resumed arguments. command_args = a.parse_and_check(command) if command_args.predictions is None: command_args.predictions = os.path.join(stored_command.output_dir, DEFAULT_OUTPUT) else: if command_args.output_dir is None: command_args.output_dir = a.NOW if command_args.predictions is None: command_args.predictions = os.path.join(command_args.output_dir, DEFAULT_OUTPUT) if len(os.path.dirname(command_args.predictions).strip()) == 0: command_args.predictions = os.path.join(command_args.output_dir, command_args.predictions) directory = u.check_dir(command_args.predictions) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: defaults_file = open(DEFAULTS_FILE, "r") contents = defaults_file.read() defaults_file.close() defaults_copy = open(os.path.join(directory, DEFAULTS_FILE), "w", 0) defaults_copy.write(contents) defaults_copy.close() except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) # Creates the corresponding api instance if resume and debug: command_args.debug = True api = a.get_api_instance(command_args, u.check_dir(session_file)) # Selects the action to perform if has_train(command_args) or has_test(command_args) or command_args.cluster_datasets is not None: output_args = a.get_output_args(api, command_args, resume) a.transform_args(command_args, command.flags, api, command.user_defaults) compute_output(**output_args) u.log_message("_" * 80 + "\n", log_file=session_file)
def anomalies_processing(datasets, anomalies, anomaly_ids, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates or retrieves anomalies from the command data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_anomalies(args) or args.no_anomaly): anomaly_ids = [] anomalies = [] # Only 1 anomaly detector per bigmler command at present number_of_anomalies = 1 if resume: resume, anomaly_ids = c.checkpoint(c.are_anomalies_created, path, number_of_anomalies, debug=args.debug) if not resume: message = u.dated("Found %s anomaly detectors out of %s." " Resuming.\n" % (len(anomaly_ids), number_of_anomalies)) u.log_message(message, log_file=session_file, console=args.verbosity) anomalies = anomaly_ids number_of_anomalies -= len(anomaly_ids) anomaly_args = r.set_anomaly_args(args, fields=fields, anomaly_fields=args.anomaly_fields_) anomalies, anomaly_ids = r.create_anomalies(datasets, anomalies, anomaly_args, args, api, path, session_file, log) # If an anomaly detector is provided, we use it. elif args.anomaly: anomaly_ids = [args.anomaly] anomalies = anomaly_ids[:] elif args.anomalies or args.anomaly_tag: anomalies = anomaly_ids[:] # If we are going to predict we must retrieve the anomalies if anomaly_ids and args.test_set: anomalies, anomaly_ids = r.get_anomalies(anomalies, args, api, session_file) return anomalies, anomaly_ids, resume
def remote_predict_models(models, test_reader, prediction_file, api, args, resume=False, output_path=None, session_file=None, log=None, exclude=None): """Retrieve predictions remotely, combine them and save predictions to file """ predictions_files = [] prediction_args = { "tags": args.tag } test_set_header = test_reader.has_headers() if output_path is None: output_path = u.check_dir(prediction_file) message_logged = False raw_input_data_list = [] for input_data in test_reader: raw_input_data_list.append(input_data) single_model = len(models) == 1 if single_model: prediction_file = UnicodeWriter(prediction_file).open_writer() for model in models: model = bigml.api.get_model_id(model) predictions_file = get_predictions_file_name(model, output_path) predictions_files.append(predictions_file) if (not resume or not c.checkpoint(c.are_predictions_created, predictions_file, test_reader.number_of_tests(), debug=args.debug)[0]): if not message_logged: message = u.dated("Creating remote predictions.\n") u.log_message(message, log_file=session_file, console=args.verbosity) message_logged = True with UnicodeWriter(predictions_file) as predictions_file: for input_data in raw_input_data_list: input_data_dict = test_reader.dict(input_data) prediction = api.create_prediction(model, input_data_dict, by_name=test_set_header, wait_time=0, args=prediction_args) u.check_resource_error(prediction, "Failed to create prediction: ") u.log_message("%s\n" % prediction['resource'], log_file=log) prediction_row = prediction_to_row(prediction) predictions_file.writerow(prediction_row) if single_model: write_prediction(prediction_row[0:2], prediction_file, args.prediction_info, input_data, exclude) if single_model: prediction_file.close_writer() else: combine_votes(predictions_files, Model(models[0]).to_prediction, prediction_file, args.method, args.prediction_info, raw_input_data_list, exclude)
def cluster_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ # If --clear-logs the log files are cleared if "--clear-logs" in args: clear_log_files(LOG_FILES) command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) resume = command_args.resume if command_args.resume: command_args, session_file, output_dir = get_stored_command( args, command_args.debug, command_log=COMMAND_LOG, dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG) if command_args.predictions is None: command_args.predictions = os.path.join(output_dir, DEFAULT_OUTPUT) else: if command_args.output_dir is None: command_args.output_dir = a.NOW if command_args.predictions is None: command_args.predictions = os.path.join(command_args.output_dir, DEFAULT_OUTPUT) if len(os.path.dirname(command_args.predictions).strip()) == 0: command_args.predictions = os.path.join(command_args.output_dir, command_args.predictions) directory = u.check_dir(command_args.predictions) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: defaults_file = open(DEFAULTS_FILE, 'r') contents = defaults_file.read() defaults_file.close() defaults_copy = open(os.path.join(directory, DEFAULTS_FILE), 'w', 0) defaults_copy.write(contents) defaults_copy.close() except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) # Creates the corresponding api instance api = a.get_api_instance(command_args, u.check_dir(session_file)) # Selects the action to perform if (a.has_train(command_args) or a.has_test(command_args) or command_args.cluster_datasets is not None): output_args = a.get_output_args(api, command_args, resume) a.transform_args(command_args, command.flags, api, command.user_defaults) compute_output(**output_args) u.log_message("_" * 80 + "\n", log_file=session_file)
def pca_processing(datasets, pca, \ pca_ids, api, args, resume, fields=None, \ session_file=None, path=None, log=None): """Creates or retrieves pca from the input data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_pca(args) or \ args.no_pca): pca_ids = [] # Only 1 pca per bigmler command at present number_of_pcas = 1 if resume: resume, pca_ids = c.checkpoint( \ c.are_pcas_created, path, \ number_of_pcas, debug=args.debug) if not resume: message = u.dated("Found %s pcas out of %s." " Resuming.\n" % (len(pca_ids), number_of_pcas)) u.log_message(message, log_file=session_file, console=args.verbosity) number_of_pcas -= len(pca_ids) args.exclude_fields = [] if args.exclude_objective: dataset = datasets[0] fields = Fields(dataset) objective_id = \ fields.fields_by_column_number[fields.objective_field] args.exclude_fields = [objective_id] pca_args = r.set_pca_args( \ args, fields=fields, \ pca_fields=args.pca_fields_) pca = \ r.create_pca( \ datasets, pca, pca_args, \ args, api, path, session_file, log) # If a pca is provided, we use it. elif args.pca: pca_ids = [args.pca] pca = pca_ids[0] elif args.pca or args.pca_tag: pca = pca_ids[0] # If we are going to create projections, we must retrieve the pca if pca_ids and (args.test_set or args.export_fields): pca = \ r.get_pca(pca, args, api, session_file) return pca, resume
def fusion_processing(fusion, \ fusion_ids, api, args, resume, fields=None, \ session_file=None, path=None, log=None): """Creates or retrieves fusion from the input data """ # If we have a models' list but not a fusion, # we create the model if the no_model # flag hasn't been set up. if args.fusion_models_ is not None and not has_fusion(args): fusion_ids = [] # Only 1 fusion per bigmler command at present number_of_fusions = 1 if resume: resume, fusion_ids = c.checkpoint( \ c.are_fusions_created, path, \ number_of_fusions, debug=args.debug) if not resume: message = u.dated("Found %s fusions out of %s." " Resuming.\n" % (len(fusion_ids), number_of_fusions)) u.log_message(message, log_file=session_file, console=args.verbosity) fusion = fusion_ids[0] first_model_id = api.get_fusion(fusion)[ \ "object"]["fusion"]["models"][0]["id"] first_model_kind = api.get_fusion(fusion)[ \ "object"]["fusion"]["models"][0]["kind"] first_model = api.getters[first_model_kind](first_model_id) fields = Fields(first_model) number_of_fusions -= len(fusion_ids) fusion_args = r.set_fusion_args( \ args, fields) fusion = \ r.create_fusion( \ args.fusion_models_, fusion, fusion_args, \ args, api, path, session_file, log) # If a fusion is provided, we use it. elif args.fusion: fusion_ids = [args.fusion] fusion = fusion_ids[0] elif args.fusion or args.fusion_tag: fusion = fusion_ids[0] # If we are going to create predictions, we must retrieve the fusion if fusion_ids and args.test_set: fusion = \ r.get_fusion(fusion, args, api, session_file) args.objective_field = fusion['object']['objective_field_name'] return fusion, resume
def evaluations_process(models_or_ensembles, datasets, name, description, fields, dataset_fields, fields_map, api, args, resume, session_file=None, path=None, log=None, labels=None, all_labels=None, objective_field=None): """Evaluates models or ensembles against datasets """ existing_evaluations = 0 evaluations = [] number_of_evaluations = len(models_or_ensembles) if resume: resume, evaluations = c.checkpoint(c.are_evaluations_created, path, number_of_evaluations, debug=args.debug) if not resume: existing_evaluations = len(evaluations) message = u.dated("Found %s evaluations from %s. Resuming.\n" % (existing_evaluations, number_of_evaluations)) number_of_evaluations -= existing_evaluations u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: if args.multi_label: evaluation_args = r.set_label_evaluation_args( name, description, args, labels, all_labels, number_of_evaluations, fields, dataset_fields, fields_map, objective_field) else: evaluation_args = r.set_evaluation_args(name, description, args, fields, dataset_fields, fields_map) evaluations.extend( r.create_evaluations(models_or_ensembles, datasets, evaluation_args, args, api, path=path, session_file=session_file, log=log, existing_evaluations=existing_evaluations)) return evaluations, resume
def local_batch_predict(models, headers, test_reader, exclude, fields, resume, output_path, max_models, number_of_tests, api, output, verbosity, method, objective_field, session_file, debug): """Get local predictions form partial Multimodel, combine and save to file """ def draw_progress_bar(current, total): """Draws a text based progress report. """ pct = 100 - ((total - current) * 100) / (total) console_log("Predicted on %s out of %s models [%s%%]" % ( localize(current), localize(total), pct)) models_total = len(models) models_splits = [models[index:(index + max_models)] for index in range(0, models_total, max_models)] input_data_list = [] for row in test_reader: for index in exclude: del row[index] input_data_list.append(fields.pair(row, headers, objective_field)) total_votes = [] models_count = 0 for models_split in models_splits: if resume: for model in models_split: pred_file = get_predictions_file_name(model, output_path) u.checkpoint(u.are_predictions_created, pred_file, number_of_tests, debug=debug) complete_models = [] for index in range(len(models_split)): complete_models.append(api.check_resource( models_split[index], api.get_model)) local_model = MultiModel(complete_models) local_model.batch_predict(input_data_list, output_path, reuse=True) votes = local_model.batch_votes(output_path) models_count += max_models if models_count > models_total: models_count = models_total if verbosity: draw_progress_bar(models_count, models_total) if total_votes: for index in range(0, len(votes)): predictions = total_votes[index].predictions predictions.extend(votes[index].predictions) else: total_votes = votes message = u.dated("Combining predictions.\n") u.log_message(message, log_file=session_file, console=verbosity) for multivote in total_votes: u.write_prediction(multivote.combine(method), output)
def multi_label_expansion(training_set, training_set_header, args, output_path, labels=None, session_file=None, input_flag=False): """Splitting the labels in a multi-label objective field to create a source with column per label """ objective_field = args.objective_field input_reader = TrainReader(training_set, training_set_header, objective_field, multi_label=True, labels=labels, label_separator=args.label_separator, training_separator=args.training_separator, multi_label_fields=args.multi_label_fields_list, label_aggregates=args.label_aggregates_list, objective=not input_flag) # read file to get all the different labels if no --labels flag is given # or use labels given in --labels and generate the new field names new_headers = input_reader.get_label_headers() try: file_name = os.path.basename(training_set) except AttributeError: file_name = "test_set.csv" if input_flag else "training_set.csv" output_file = "%s%sextended_%s" % (output_path, os.sep, file_name) message = u.dated("Transforming to extended source.\n") u.log_message(message, log_file=session_file, console=args.verbosity) with open(output_file, u.open_mode('w')) as output_handler: output = csv.writer(output_handler, lineterminator="\n") output.writerow(new_headers) # read to write new source file with column per label input_reader.reset() if training_set_header: input_reader.get_next() while True: try: row = input_reader.get_next(extended=True) output.writerow(row) except StopIteration: break # training sources are zipped to minimize upload time and resources if not input_flag: output_file_zip = "%s%sextended_%s.zip" % (output_path, os.sep, file_name) with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file: output_zipped_file.write(output_file, file_name) output_file = output_file_zip objective_field = input_reader.headers[input_reader.objective_column] input_reader.close() return (output_file, input_reader.get_multi_label_data())
def remote_predict(models, test_reader, prediction_file, api,
                   resume=False,
                   verbosity=True, output_path=None,
                   method=PLURALITY_CODE, tags="",
                   session_file=None, log=None, debug=False,
                   prediction_info=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """
    predictions_files = []
    prediction_args = {"tags": tags}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False
    raw_input_data_list = []
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(),
                                 debug=debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message, log_file=session_file,
                              console=verbosity)
                message_logged = True
            predictions_file = csv.writer(open(predictions_file, 'w', 0),
                                          lineterminator="\n")
            for input_data in test_reader:
                raw_input_data_list.append(input_data)
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(model, input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction)
                predictions_file.writerow(prediction_row)
    combine_votes(predictions_files,
                  Model(models[0]).to_prediction,
                  prediction_file, method,
                  prediction_info, raw_input_data_list)
def logistic_regression_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ # If --clear-logs the log files are cleared if "--clear-logs" in args: clear_log_files(LOG_FILES) command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) default_output = ('evaluation' if command_args.evaluate else 'predictions.csv') resume = command_args.resume if command_args.resume: command_args, session_file, output_dir = get_stored_command( args, command_args.debug, command_log=COMMAND_LOG, dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG) default_output = ('evaluation' if command_args.evaluate else 'predictions.csv') if command_args.predictions is None: command_args.predictions = os.path.join(output_dir, default_output) else: if command_args.output_dir is None: command_args.output_dir = a.NOW if command_args.predictions is None: command_args.predictions = os.path.join(command_args.output_dir, default_output) if len(os.path.dirname(command_args.predictions).strip()) == 0: command_args.predictions = os.path.join(command_args.output_dir, command_args.predictions) directory = u.check_dir(command_args.predictions) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE)) except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) # Creates the corresponding api instance api = a.get_api_instance(command_args, u.check_dir(session_file)) # Selects the action to perform if (a.has_train(command_args) or a.has_test(command_args) or command_args.export_fields): output_args = a.get_output_args(api, command_args, resume) a.transform_args(command_args, command.flags, api, command.user_defaults) compute_output(**output_args) u.log_message("_" * 80 + "\n", log_file=session_file)
def clusters_processing(datasets, clusters, cluster_ids, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates or retrieves clusters from the input data """ # If we have a dataset but not a model, we create the model if the no_model # flag hasn't been set up. if datasets and not (has_clusters(args) or args.no_cluster): cluster_ids = [] clusters = [] # Only 1 cluster per bigmler command at present number_of_clusters = 1 if resume: resume, cluster_ids = c.checkpoint(c.are_clusters_created, path, number_of_clusters, debug=args.debug) if not resume: message = u.dated("Found %s clusters out of %s. Resuming.\n" % (len(cluster_ids), number_of_clusters)) u.log_message(message, log_file=session_file, console=args.verbosity) clusters = cluster_ids number_of_clusters -= len(cluster_ids) cluster_args = r.set_cluster_args(args, fields=fields, cluster_fields=args.cluster_fields_) clusters, cluster_ids = r.create_clusters(datasets, clusters, cluster_args, args, api, path, session_file, log) # If a cluster is provided, we use it. elif args.cluster: cluster_ids = [args.cluster] clusters = cluster_ids[:] elif args.clusters or args.cluster_tag: clusters = cluster_ids[:] # If we are going to predict we must retrieve the clusters if cluster_ids and args.test_set: clusters, cluster_ids = r.get_clusters(clusters, args, api, session_file) return clusters, cluster_ids, resume
def samples_processing(datasets, samples, sample_ids, api, args, resume, session_file=None, path=None, log=None): """Creates or retrieves samples from the input data """ # If we have a dataset but not a sample, we create the sample if the # no_sample flag hasn't been set up. if datasets and not (has_samples(args) or args.no_sample): sample_ids = [] samples = [] # Only 1 sample per bigmler command at present number_of_samples = 1 if resume: resume, sample_ids = c.checkpoint(c.are_samples_created, path, number_of_samples, debug=args.debug) if not resume: message = u.dated("Found %s samples out of %s. Resuming.\n" % (len(sample_ids), number_of_samples)) u.log_message(message, log_file=session_file, console=args.verbosity) samples = sample_ids number_of_samples -= len(sample_ids) sample_args = r.set_sample_args(args) samples, sample_ids = r.create_samples(datasets, samples, sample_args, args, api, path, session_file, log) # If a sample is provided, we use it. elif args.sample: sample_ids = [args.sample] samples = sample_ids[:] elif args.samples or args.sample_tag: samples = sample_ids[:] # We must retrieve the samples' output to store them as CSV files if sample_ids and needs_sample_fields(args): samples, sample_ids = r.get_samples(samples, args, api, session_file=session_file) return samples, sample_ids, resume
def topic_model_processing(datasets, topic_models, topic_model_ids, api, args, resume, fields=None, session_file=None, path=None, log=None): """Creates or retrieves topic models from the input data """ # If we have a dataset but not a topic model, we create the topic model # if the no_topic_model # flag hasn't been set up. if datasets and not (has_topic_models(args) or args.no_topic_model): topic_model_ids = [] topic_models = [] # Only 1 topic model per bigmler command at present number_of_topic_models = 1 if resume: resume, topic_model_ids = c.checkpoint( c.are_topic_models_created, path, number_of_topic_models, debug=args.debug) if not resume: message = u.dated( "Found %s topic models out of %s. Resuming.\n" % (len(topic_model_ids), number_of_topic_models)) u.log_message(message, log_file=session_file, console=args.verbosity) topic_models = topic_model_ids number_of_topic_models -= len(topic_model_ids) topic_model_args = r.set_topic_model_args( \ args, fields=fields, topic_model_fields=args.topic_model_fields_) topic_models, topic_model_ids = r.create_topic_models( \ datasets, topic_models, topic_model_args, args, api, path, session_file, log) # If a topic model is provided, we use it. elif args.topic_model: topic_model_ids = [args.topic_model] topic_models = topic_model_ids[:] elif args.topic_models or args.topic_model_tag: topic_models = topic_model_ids[:] # If we are going to predict we must retrieve the topic models if topic_model_ids and args.test_set: topic_models, topic_model_ids = r.get_topic_models( topic_models, args, api, session_file) return topic_models, topic_model_ids, resume
def fusion_processing(fusion, \
    fusion_ids, api, args, resume, fields=None, \
    session_file=None, path=None, log=None):
    """Creates or retrieves a fusion from the input data

    """
    # If we have a models' list but not a fusion, we create the fusion.
    if args.fusion_models_ is not None and not has_fusion(args):
        fusion_ids = []
        fusions = []

        # Only 1 fusion per bigmler command at present
        number_of_fusions = 1
        if resume:
            resume, fusion_ids = c.checkpoint( \
                c.are_fusions_created, path, \
                number_of_fusions, debug=args.debug)
            if not resume:
                message = u.dated("Found %s fusions out of %s."
                                  " Resuming.\n"
                                  % (len(fusion_ids),
                                     number_of_fusions))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                # The fields structure is read from the first model in the
                # fusion list; the getters dict is keyed by resource type.
                from bigml.api import get_resource_type
                first_id = args.fusion_models_[0]
                first_model = api.getters[get_resource_type(first_id)]( \
                    first_id)
                fields = Fields(first_model)
            fusions = fusion_ids
            number_of_fusions -= len(fusion_ids)

        fusion_args = r.set_fusion_args( \
            args, fields)
        fusion = \
            r.create_fusion( \
            args.fusion_models_, fusion, fusion_args, \
            args, api, path, session_file, log)

    # If a fusion is provided, we use it.
    elif args.fusion:
        fusion_ids = [args.fusion]
        fusion = fusion_ids[0]

    elif args.fusion_tag:
        fusion = fusion_ids[0]

    # If we are going to create predictions, we must retrieve the fusion
    if fusion_ids and args.test_set:
        fusion = \
            r.get_fusion(fusion, args, api, session_file)
        args.objective_field = fusion['object']['objective_field_name']

    return fusion, resume
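# --- Example (illustration only): the getter lookup assumed in the fix above.
# In the bigml Python bindings, api.getters maps a resource type (e.g.
# "model") to the corresponding get_* method, and bigml.api.get_resource_type
# extracts the type from an id. The id below is hypothetical and the call
# needs valid BigML credentials, so this is a sketch rather than a test.
def _example_getter_by_resource_type():
    """Retrieves the first fusion model through the type-keyed getters dict."""
    from bigml.api import BigML, get_resource_type

    api = BigML()  # reads BIGML_USERNAME / BIGML_API_KEY from the environment
    first_id = "model/5143a51a37203f2cf7000972"
    return api.getters[get_resource_type(first_id)](first_id)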
def multi_label_expansion( training_set, training_set_header, args, output_path, labels=None, session_file=None, input_flag=False ): """Splitting the labels in a multi-label objective field to create a source with column per label """ objective_field = args.objective_field input_reader = TrainReader( training_set, training_set_header, objective_field, multi_label=True, labels=labels, label_separator=args.label_separator, training_separator=args.training_separator, multi_label_fields=args.multi_label_fields_list, label_aggregates=args.label_aggregates_list, objective=not input_flag, ) # read file to get all the different labels if no --labels flag is given # or use labels given in --labels and generate the new field names new_headers = input_reader.get_label_headers() try: file_name = os.path.basename(training_set) except AttributeError: file_name = "test_set.csv" if input_flag else "training_set.csv" output_file = "%s%sextended_%s" % (output_path, os.sep, file_name) message = u.dated("Transforming to extended source.\n") u.log_message(message, log_file=session_file, console=args.verbosity) with open(output_file, "w", 0) as output_handler: output = csv.writer(output_handler, lineterminator="\n") output.writerow(new_headers) # read to write new source file with column per label input_reader.reset() if training_set_header: input_reader.next() while True: try: row = input_reader.next(extended=True) output.writerow(row) except StopIteration: break # training sources are zipped to minimize upload time and resources if not input_flag: output_file_zip = "%s%sextended_%s.zip" % (output_path, os.sep, file_name) with ZipFile(output_file_zip, "w", ZIP_DEFLATED) as output_zipped_file: output_zipped_file.write(output_file, file_name) output_file = output_file_zip objective_field = input_reader.headers[input_reader.objective_column] return (output_file, input_reader.get_multi_label_data())
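# --- Example (illustration only, not bigmler's TrainReader): what the
# "column per label" expansion does to a multi-label objective field. The
# rows, labels and the True/False encoding below are hypothetical.
def _example_label_expansion(rows, labels, label_separator=":"):
    """Replaces the trailing objective column with one binary column
       per label.
    """
    expanded = []
    for row in rows:
        features, objective = row[:-1], row[-1]
        present = set(objective.split(label_separator))
        expanded.append(features + ["True" if label in present else "False"
                                    for label in labels])
    return expanded

# _example_label_expansion([["doc1", "sports:news"]],
#                          ["economy", "news", "sports"])
# => [["doc1", "False", "True", "True"]]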
def deepnets_processing(datasets, deepnets, \
    deepnet_ids, api, args, resume, fields=None, \
    session_file=None, path=None, log=None):
    """Creates or retrieves deepnets from the input data

    """
    # If we have a dataset but not a deepnet, we create the deepnet if the
    # no_deepnet flag hasn't been set.
    if datasets and not (has_deepnet(args) or \
            args.no_deepnet):
        deepnet_ids = []
        deepnets = []

        # Only 1 deepnet per bigmler command at present
        number_of_deepnets = 1
        if resume:
            resume, deepnet_ids = c.checkpoint( \
                c.are_deepnets_created, path, \
                number_of_deepnets, debug=args.debug)
            if not resume:
                message = u.dated("Found %s deepnets out of %s."
                                  " Resuming.\n"
                                  % (len(deepnet_ids),
                                     number_of_deepnets))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            deepnets = deepnet_ids
            number_of_deepnets -= len(deepnet_ids)

        deepnet_args = r.set_deepnet_args( \
            args, fields=fields, \
            deepnet_fields=args.deepnet_fields_,
            objective_id=args.objective_id_)
        deepnets, deepnet_ids = \
            r.create_deepnets( \
            datasets, deepnets, deepnet_args, \
            args, api, path, session_file, log)

    # If a deepnet is provided, we use it.
    elif args.deepnet:
        deepnet_ids = [args.deepnet]
        deepnets = deepnet_ids[:]

    elif args.deepnets or args.deepnet_tag:
        deepnets = deepnet_ids[:]

    # If we are going to predict we must retrieve the deepnets
    if deepnet_ids and (args.test_set or args.export_fields):
        deepnets, deepnet_ids = \
            r.get_deepnets(deepnets, args, api, \
            session_file)

    return deepnets, deepnet_ids, resume
def logistic_regressions_processing(datasets, logistic_regressions, \
    logistic_regression_ids, api, args, resume, fields=None, \
    session_file=None, path=None, log=None):
    """Creates or retrieves logistic regressions from the input data

    """
    # If we have a dataset but not a logistic regression, we create it if
    # the no_logistic_regression flag hasn't been set.
    if datasets and not (has_logistic_regression(args) or \
            args.no_logistic_regression):
        logistic_regression_ids = []
        logistic_regressions = []

        # Only 1 logistic regression per bigmler command at present
        number_of_logistic_regressions = 1
        if resume:
            resume, logistic_regression_ids = c.checkpoint( \
                c.are_logistic_regressions_created, path, \
                number_of_logistic_regressions, debug=args.debug)
            if not resume:
                message = u.dated("Found %s logistic regressions out of %s."
                                  " Resuming.\n"
                                  % (len(logistic_regression_ids),
                                     number_of_logistic_regressions))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            logistic_regressions = logistic_regression_ids
            number_of_logistic_regressions -= len(logistic_regression_ids)

        logistic_regression_args = r.set_logistic_regression_args( \
            args, fields=fields, \
            logistic_regression_fields=args.logistic_fields_,
            objective_id=args.objective_id_)
        logistic_regressions, logistic_regression_ids = \
            r.create_logistic_regressions( \
            datasets, logistic_regressions, logistic_regression_args, \
            args, api, path, session_file, log)

    # If a logistic regression is provided, we use it.
    elif args.logistic_regression:
        logistic_regression_ids = [args.logistic_regression]
        logistic_regressions = logistic_regression_ids[:]

    elif args.logistic_regressions or args.logistic_regression_tag:
        logistic_regressions = logistic_regression_ids[:]

    # If we are going to predict we must retrieve the logistic regressions
    if logistic_regression_ids and (args.test_set or args.export_fields):
        logistic_regressions, logistic_regression_ids = \
            r.get_logistic_regressions(logistic_regressions, args, api, \
            session_file)

    return logistic_regressions, logistic_regression_ids, resume
def time_series_processing(datasets, time_series, \
    time_series_ids, api, args, resume, fields=None, \
    session_file=None, path=None, log=None):
    """Creates or retrieves time-series from the input data

    """
    # If we have a dataset but not a time-series, we create the time-series
    # if the no_time_series flag hasn't been set.
    if datasets and not (has_time_series(args) or \
            args.no_time_series):
        time_series_ids = []
        time_series_set = []

        # Only 1 time-series per bigmler command at present
        number_of_time_series = 1
        if resume:
            resume, time_series_ids = c.checkpoint( \
                c.are_time_series_created, path, \
                number_of_time_series, debug=args.debug)
            if not resume:
                message = u.dated("Found %s time-series out of %s."
                                  " Resuming.\n"
                                  % (len(time_series_ids),
                                     number_of_time_series))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            time_series_set = time_series_ids
            number_of_time_series -= len(time_series_ids)

        time_series_args = r.set_time_series_args( \
            args, fields=fields,
            objective_id=args.objective_id_)
        time_series_set, time_series_ids = \
            r.create_time_series( \
            datasets, time_series_set, time_series_args, \
            args, api, path, session_file, log)

    # If a time-series is provided, we use it.
    elif args.time_series:
        time_series_ids = [args.time_series]
        time_series_set = time_series_ids[:]

    elif args.time_series_set or args.time_series_tag:
        time_series_set = time_series_ids[:]

    # If we need the fields information we must retrieve the time-series
    if time_series_ids and args.export_fields:
        time_series_set, time_series_ids = \
            r.get_time_series(time_series_set, args, api, \
            session_file)

    return time_series_set, time_series_ids, resume
def multi_label_expansion(training_set, training_set_header, objective_field,
                          args, output_path, field_attributes=None,
                          labels=None, session_file=None):
    """Splits the labels in a multi-label objective field to create a source
       with a column per label

    """
    # find out the column number corresponding to the objective field
    training_reader = TrainReader(training_set, training_set_header,
                                  objective_field, multi_label=True,
                                  labels=labels,
                                  label_separator=args.label_separator,
                                  training_separator=args.training_separator)
    # read the file to get all the different labels if no --labels flag is
    # given, or use the labels given in --labels, and generate the new field
    # names
    new_headers = training_reader.get_headers(objective_field=False)
    new_field_names = [l.get_label_field(training_reader.objective_name,
                                         label)
                       for label in training_reader.labels]
    new_headers.extend(new_field_names)
    new_headers.append(training_reader.objective_name)
    new_headers = [header.encode("utf-8") for header in new_headers]
    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file,
                  console=args.verbosity)
    with open(output_file, 'w', 0) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write the new source file with a column per label
        training_reader.reset()
        if training_set_header:
            training_reader.next()
        while True:
            try:
                row = training_reader.next(extended=True)
                output.writerow(row)
            except StopIteration:
                break
    objective_field = training_reader.headers[
        training_reader.objective_column]
    if field_attributes is None:
        field_attributes = {}
    # Set a label on the generated fields to mark the label fields and the
    # objective field (just in case it was not set previously and other
    # derived fields are added during source construction after the real
    # last field).
    for label_column, label in training_reader.labels_columns():
        field_attributes.update({label_column: {
            "label": "%s%s" % (l.MULTI_LABEL_LABEL, label)}})

    return (output_file, training_reader.labels,
            field_attributes, training_reader.objective_name)
def create_new_dataset( datasets, api, args, resume, name=None, description=None, fields=None, dataset_fields=None, objective_field=None, session_file=None, path=None, log=None, ): """Generates a new dataset using the generators given in a generators file or a multi-dataset from a list of datasets """ origin_resource = datasets if not isinstance(datasets, basestring) and args.multi_dataset: suffix = "multi" else: datasets = [] suffix = "gen" number_of_datasets = 1 if resume: resume, datasets = c.checkpoint( c.are_datasets_created, path, number_of_datasets, debug=args.debug, suffix=suffix ) if not resume: message = u.dated("Found %s datasets out of %s. Resuming.\n" % (len(datasets), number_of_datasets)) u.log_message(message, log_file=session_file, console=args.verbosity) if not resume: dataset_args = r.set_dataset_args( name, description, args, fields, dataset_fields, objective_field=objective_field ) if args.multi_dataset and args.multi_dataset_json: dataset_args.update(args.multi_dataset_json) else: dataset_args.update(args.dataset_json_generators) new_dataset = r.create_dataset( origin_resource, dataset_args, args, api=api, path=path, session_file=session_file, log=log, dataset_type=suffix, ) else: new_dataset = datasets[0] return new_dataset, resume
def create_categories_datasets(dataset, distribution,
                               fields, args, api, resume,
                               session_file=None, path=None, log=None,
                               other_label=OTHER):
    """Generates new datasets, one per subset of categories of the
       original one

    """
    if args.max_categories < 1:
        sys.exit("--max-categories can only be a positive number.")
    datasets = []
    categories_splits = [distribution[i: i + args.max_categories] for i
                         in range(0, len(distribution),
                                  args.max_categories)]
    number_of_datasets = len(categories_splits)

    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets),
                                 number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        for i in range(len(datasets), number_of_datasets):
            split = categories_splits[i]
            category_selector = "(if (or"
            for element in split:
                category = element[0]
                category_selector += " (= v \"%s\")" % category
            category_selector += ") v \"%s\")" % other_label
            category_generator = "(let (v (f %s)) %s)" % (
                fields.objective_field, category_selector)
            try:
                dataset_args = {
                    "all_but": [fields.objective_field],
                    "new_fields": [
                        {"name": fields.field_name(fields.objective_field),
                         "field": category_generator,
                         "label": "max_categories: %s" % args.max_categories}],
                    "user_metadata": {"max_categories": args.max_categories,
                                      "other_label": other_label}}
            except ValueError as exc:
                sys.exit(exc)
            new_dataset = r.create_dataset(
                dataset, dataset_args, args, api=api, path=path,
                session_file=session_file, log=log, dataset_type="parts")
            new_dataset = bigml.api.check_resource(new_dataset,
                                                   api.get_dataset)
            datasets.append(new_dataset)

    return datasets, resume
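# --- Example (illustration only): the Flatline expression built by the loop
# above for a single split of categories. The field id, the categories and
# the "other" label shown here are hypothetical.
def _example_category_generator():
    """Builds the selector that maps unlisted categories to the other
       label.
    """
    split = [("Iris-setosa", 50), ("Iris-versicolor", 50)]
    other_label = "other"
    selector = "(if (or"
    for category, _ in split:
        selector += " (= v \"%s\")" % category
    selector += ") v \"%s\")" % other_label
    return "(let (v (f %s)) %s)" % ("000004", selector)

# _example_category_generator() returns (wrapped here for readability):
# (let (v (f 000004)) (if (or (= v "Iris-setosa") (= v "Iris-versicolor"))
#  v "other"))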
def predict(test_set, test_set_header, models, fields, output,
            objective_field, remote=False, api=None, log=None,
            max_models=MAX_MODELS, method=0, resume=False,
            tags=None, verbosity=1, session_file=None, debug=False,
            ensemble_id=None, prediction_info=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions can be computed remotely, locally using MultiModels built
       on all the models, or locally using MultiModels on subgroups of
       models. Choosing a max_batch_models value not greater than the
       number_of_models flag will lead to the last case, where memory usage
       is bounded and each model's predictions are saved for further use.
    """
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field)
    prediction_file = output
    output_path = u.check_dir(output)
    output = csv.writer(open(output, 'w', 0), lineterminator="\n")
    # Remote predictions: predictions are computed in bigml.com and stored
    # in a file named after the model with the following syntax:
    #     model_[id of the model]_predictions.csv
    # For instance,
    #     model_50c0de043b563519830001c2_predictions.csv
    if remote:
        if ensemble_id is not None:
            remote_predict_ensemble(ensemble_id, test_reader, prediction_file,
                                    api, resume, verbosity, output_path,
                                    method, tags, session_file, log, debug,
                                    prediction_info)
        else:
            remote_predict(models, test_reader, prediction_file, api,
                           resume, verbosity, output_path, method, tags,
                           session_file, log, debug, prediction_info)
    # Local predictions: Predictions are computed locally using models' rules
    # with MultiModel's predict method
    else:
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=verbosity)
        # For a small number of models, we build a MultiModel using all of
        # the given models and issue a combined prediction
        if len(models) < max_models:
            local_predict(models, test_reader, output, method,
                          prediction_info)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            local_batch_predict(models, test_reader, prediction_file, api,
                                max_models, resume, output_path, output,
                                verbosity, method, session_file, debug,
                                prediction_info)
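# --- Example (illustration only): how a list of model ids can be split into
# groups of at most max_models so that memory stays bounded, which is the
# strategy the local batch prediction branch above relies on. The helper and
# the ids below are hypothetical.
def _example_model_chunks(model_ids, max_models):
    """Yields successive slices of model_ids with at most max_models items."""
    for start in range(0, len(model_ids), max_models):
        yield model_ids[start:start + max_models]

# [len(chunk) for chunk in
#  _example_model_chunks(["model/%s" % i for i in range(7)], 3)]
# => [3, 3, 1]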