def project_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    command_args, command, api, session_file, resume = get_context(
        args, SETTINGS)

    path = u.check_dir(command_args.output)
    log = None
    if command_args.log_file:
        u.check_dir(command_args.log_file)
        log = command_args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    if not command_args.project_id and command_args.name:
        command_args.project = command_args.name
    if command_args.project:
        # create project
        pp.project_processing(
            api, command_args, command_args.resume, session_file=session_file,
            path=path, log=log, create=True)
    if command_args.project_id and (
            command_args.project_attributes or
            command_args.name or command_args.tag or
            command_args.description or command_args.category):
        # update project's attributes
        pp.update_project(command_args, api, command_args.resume,
                          session_file=session_file)

    u.log_message("_" * 80 + "\n", log_file=session_file)
    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)
def main_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    default_output = ('evaluation' if command_args.evaluate
                      else 'predictions.csv')
    resume = command_args.resume
    if command_args.resume:
        command_args, session_file, output_dir = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
        default_output = ('evaluation' if command_args.evaluate
                          else 'predictions.csv')
        if command_args.predictions is None:
            command_args.predictions = os.path.join(output_dir,
                                                    default_output)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        if command_args.predictions is None:
            command_args.predictions = os.path.join(command_args.output_dir,
                                                    default_output)
        if len(os.path.dirname(command_args.predictions).strip()) == 0:
            command_args.predictions = os.path.join(command_args.output_dir,
                                                    command_args.predictions)
        directory = u.check_dir(command_args.predictions)
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            defaults_file = open(DEFAULTS_FILE, 'r')
            contents = defaults_file.read()
            defaults_file.close()
            defaults_copy = open(os.path.join(directory, DEFAULTS_FILE),
                                 'w', 0)
            defaults_copy.write(contents)
            defaults_copy.close()
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)

    # Creates the corresponding api instance
    api = a.get_api_instance(command_args, u.check_dir(session_file))

    if (a.has_train(command_args) or a.has_test(command_args)
            or command_args.votes_dirs):
        output_args = a.get_output_args(api, command_args, resume)
        a.transform_args(command_args, command.flags, api,
                         command.user_defaults)
        compute_output(**output_args)
    u.log_message("_" * 80 + "\n", log_file=session_file)
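
# Illustrative sketch (not part of the original module): the dispatcher
# receives the raw CLI tokens, so it can be driven programmatically the
# same way `bigmler` is driven from the shell. The file paths below are
# hypothetical:
#
#     main_dispatcher(args=["--train", "data/iris.csv",
#                           "--test", "data/test.csv",
#                           "--output", "my_dir/predictions.csv"])
#
# which is roughly equivalent to running:
#
#     bigmler --train data/iris.csv --test data/test.csv \
#             --output my_dir/predictions.csv
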
def delete_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    if command_args.resume:
        command_args, session_file, _ = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        directory = u.check_dir(os.path.join(command_args.output_dir, "tmp"))
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE))
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)

    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    # Creates the corresponding api instance
    api = a.get_api_instance(command_args, u.check_dir(session_file))
    delete_resources(command_args, api)
    u.log_message("_" * 80 + "\n", log_file=session_file)
def cluster_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    resume = command_args.resume
    if command_args.resume:
        # Keep the debug option if set
        debug = command_args.debug
        # Restore the args of the call to resume from the command log file
        stored_command = StoredCommand(args, COMMAND_LOG, DIRS_LOG)
        command = Command(None, stored_command=stored_command)
        # Logs the issued command and the resumed command
        session_file = os.path.join(stored_command.output_dir, SESSIONS_LOG)
        stored_command.log_command(session_file=session_file)
        # Parses resumed arguments.
        command_args = a.parse_and_check(command)
        if command_args.predictions is None:
            command_args.predictions = os.path.join(
                stored_command.output_dir, DEFAULT_OUTPUT)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        if command_args.predictions is None:
            command_args.predictions = os.path.join(command_args.output_dir,
                                                    DEFAULT_OUTPUT)
        if len(os.path.dirname(command_args.predictions).strip()) == 0:
            command_args.predictions = os.path.join(command_args.output_dir,
                                                    command_args.predictions)
        directory = u.check_dir(command_args.predictions)
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            defaults_file = open(DEFAULTS_FILE, "r")
            contents = defaults_file.read()
            defaults_file.close()
            defaults_copy = open(os.path.join(directory, DEFAULTS_FILE),
                                 "w", 0)
            defaults_copy.write(contents)
            defaults_copy.close()
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)

    # Creates the corresponding api instance
    if resume and debug:
        command_args.debug = True
    api = a.get_api_instance(command_args, u.check_dir(session_file))

    # Selects the action to perform
    if (has_train(command_args) or has_test(command_args)
            or command_args.cluster_datasets is not None):
        output_args = a.get_output_args(api, command_args, resume)
        a.transform_args(command_args, command.flags, api,
                         command.user_defaults)
        compute_output(**output_args)
    u.log_message("_" * 80 + "\n", log_file=session_file)
def create_kfold_evaluations(datasets_file, args, command_obj,
                             resume=False, counter=0):
    """ Create k-fold cross-validation from a datasets file

    """
    global subcommand_list
    output_dir = os.path.normpath(
        u.check_dir(os.path.join(u"%s%s" % (args.output_dir, counter),
                                 u"evaluation.json")))
    model_fields = args.model_fields
    name_suffix = "_subset_%s" % counter
    name_max_length = NAME_MAX_LENGTH - len(name_suffix)
    name = "%s%s" % (args.name[0: name_max_length], name_suffix)
    dataset_id = u.read_datasets(datasets_file)[0]
    model_dataset = os.path.normpath(
        os.path.join(u.check_dir(datasets_file),
                     dataset_id.replace("/", "_")))
    command = COMMANDS["create_cv"] % (datasets_file, output_dir, name,
                                       model_dataset)
    command_args = command.split()
    if model_fields:
        command_args.append("--model-fields")
        command_args.append(model_fields)
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)
    """
    common_options_list = u.get_options_list(args, command_obj.common_options,
                                             prioritary=command_args)
    command_args.extend(common_options_list)
    """
    command_obj.propagate(command_args, exclude=["--dataset",
                                                 "--datasets",
                                                 "--dataset-file"])
    command = rebuild_command(command_args)
    if resume:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)

    evaluation_file = os.path.normpath(os.path.join(output_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def whizzml_dispatcher(args=sys.argv[1:]):
    """Main processing of the parsed options for BigMLer whizzml

    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = command.parser.parse_args(command.args)
    resume = command_args.resume
    if resume:
        command_args, session_file, _ = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        session_file = os.path.join(command_args.output_dir, SESSIONS_LOG)

    # If logging is required, open the file for logging
    log = None
    if command_args.log_file:
        u.check_dir(command_args.log_file)
        log = command_args.log_file
        # If --clear_logs the log files are cleared
        if command_args.clear_logs:
            clear_log_files([log])

    u.sys_log_message(u"%s\n" % os.path.abspath(command_args.output_dir),
                      log_file=DIRS_LOG)

    # Creates the corresponding api instance from the args
    api = a.get_api_instance(command_args, u.check_dir(session_file))
    a.transform_dataset_options(command_args, api)

    # package_dir
    if command_args.package_dir is not None:
        create_package(command_args, api, command.common_options,
                       resume=resume)
    else:
        sys.exit("You must use the --package-dir flag pointing to the"
                 " directory where the metadata.json file is. Type\n"
                 "    bigmler whizzml --help\n"
                 " to see all the available options.")
def cluster_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    resume = command_args.resume
    if command_args.resume:
        command_args, session_file, output_dir = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
        if command_args.predictions is None:
            command_args.predictions = os.path.join(output_dir,
                                                    DEFAULT_OUTPUT)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        if command_args.predictions is None:
            command_args.predictions = os.path.join(command_args.output_dir,
                                                    DEFAULT_OUTPUT)
        if len(os.path.dirname(command_args.predictions).strip()) == 0:
            command_args.predictions = os.path.join(command_args.output_dir,
                                                    command_args.predictions)
        directory = u.check_dir(command_args.predictions)
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE))
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)

    # Creates the corresponding api instance
    api = a.get_api_instance(command_args, u.check_dir(session_file))

    # Selects the action to perform
    if (a.has_train(command_args) or a.has_test(command_args)
            or command_args.cluster_datasets is not None
            or command_args.export_fields is not None):
        output_args = a.get_output_args(api, command_args, resume)
        a.transform_args(command_args, command.flags, api,
                         command.user_defaults)
        compute_output(**output_args)
    u.log_message("_" * 80 + "\n", log_file=session_file)
def delete_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    resume = command_args.resume
    if command_args.resume:
        # Keep the debug option if set
        debug = command_args.debug
        # Restore the args of the call to resume from the command log file
        stored_command = StoredCommand(args, COMMAND_LOG, DIRS_LOG)
        command = Command(None, stored_command=stored_command)
        # Logs the issued command and the resumed command
        session_file = os.path.join(stored_command.output_dir, SESSIONS_LOG)
        stored_command.log_command(session_file=session_file)
        # Parses resumed arguments.
        command_args = a.parse_and_check(command)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        directory = u.check_dir(os.path.join(command_args.output_dir, "tmp"))
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            defaults_file = open(DEFAULTS_FILE, 'r')
            contents = defaults_file.read()
            defaults_file.close()
            defaults_copy = open(os.path.join(directory, DEFAULTS_FILE),
                                 'w', 0)
            defaults_copy.write(contents)
            defaults_copy.close()
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)

    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    # Creates the corresponding api instance
    if resume and debug:
        command_args.debug = True
    api = a.get_api_instance(command_args, u.check_dir(session_file))
    delete_resources(command_args, api)
    u.log_message("_" * 80 + "\n", log_file=session_file)
def get_cmd_context(args, settings):
    """Parses the args array to create an args object storing the defaults
       and user-given values. It also sets the output directory and the log
       files.

    """
    command = command_handling(args, settings['command_log'])

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    resume = command_args.resume
    if command_args.resume:
        command_args, session_file, output_dir = get_stored_command(
            args, command_args.debug, command_log=settings['command_log'],
            dirs_log=settings["dirs_log"],
            sessions_log=settings['sessions_log'])
        if settings.get('default_output') is None:
            settings['default_output'] = "tmp.txt"
        if not hasattr(command_args, "output") or command_args.output is None:
            command_args.output = os.path.join(output_dir,
                                               settings['default_output'])
    else:
        if hasattr(command_args, "output") and \
                command_args.output is not None:
            command_args.output_dir = u.check_dir(command_args.output)
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        if settings.get('default_output') is None:
            settings['default_output'] = "tmp.txt"
        if not hasattr(command_args, "output") or command_args.output is None:
            command_args.output = os.path.join(command_args.output_dir,
                                               settings['default_output'])
        if len(os.path.dirname(command_args.output).strip()) == 0:
            command_args.output = os.path.join(command_args.output_dir,
                                               command_args.output)
        directory = u.check_dir(command_args.output)
        session_file = os.path.join(directory, settings['sessions_log'])
        u.log_message(command.command + "\n", log_file=session_file)
        if settings.get('defaults_file') is not None:
            try:
                shutil.copy(settings['defaults_file'],
                            os.path.join(directory,
                                         settings['defaults_file']))
            except IOError:
                pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=settings['dirs_log'])
    return command_args, command, session_file, resume
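
# Illustrative sketch (not part of the original module): the `settings`
# dict supplies the per-subcommand constants read above. The values here
# are hypothetical placeholders:
#
#     SETTINGS = {
#         "command_log": "bigmler_sessions.cmd",
#         "dirs_log": "bigmler_dirs",
#         "sessions_log": "bigmler_sessions",
#         "default_output": "tmp.txt",
#         "defaults_file": "defaults.json",
#     }
#     command_args, command, session_file, resume = get_cmd_context(
#         sys.argv[1:], SETTINGS)
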
def kfold_evaluate(datasets_file, api, args, counter, common_options,
                   penalty=DEFAULT_PENALTY,
                   metric=ACCURACY, resume=False):
    """Scoring k-fold cross-validation using the given feature subset

    """
    # create evaluation with input_fields
    args.output_dir = os.path.join(u.check_dir(datasets_file), "kfold")
    evaluation, resume = create_kfold_evaluations(datasets_file, args,
                                                  common_options,
                                                  resume=resume,
                                                  counter=counter)

    evaluation = evaluation.get('model', {})
    avg_metric = AVG_PREFIX % metric
    metric_literal = metric
    if avg_metric not in evaluation:
        avg_metric = AVG_PREFIX % R_SQUARED
        metric_literal = R_SQUARED
        if avg_metric not in evaluation:
            sys.exit("Failed to find %s or r-squared in the evaluation"
                     % metric)
    return (evaluation[avg_metric] -
            penalty * len(args.model_fields.split(args.args_separator)),
            metric_literal, resume)
def get_api_instance(command_args, storage_path):
    """Returns an api instance using the given parameters

    """
    api_command_args = {
        'username': command_args.username,
        'api_key': command_args.api_key,
        'dev_mode': command_args.dev_mode,
        'debug': command_args.debug}

    if command_args.store:
        api_command_args.update({'storage': storage_path})

    command_args.api_ = bigml.api.BigML(**api_command_args)

    # if locally stored models are used, local predicting objects should use
    # this directory to look for the model information first. Otherwise,
    # use the storage_path and if not set the ./storage directory
    retrieve_dir = None
    for stored_model in STORED_MODELS:
        if hasattr(command_args, stored_model) and \
                getattr(command_args, stored_model) is not None:
            retrieve_dir = check_dir(getattr(command_args, stored_model))
            break
    if retrieve_dir is None:
        retrieve_dir = storage_path if command_args.store else './storage'
    command_args.retrieve_api_ = bigml.api.BigML(**{
        'username': command_args.username,
        'api_key': command_args.api_key,
        'dev_mode': command_args.dev_mode,
        'debug': command_args.debug,
        'storage': retrieve_dir})
    return command_args.api_
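
# Illustrative sketch (not part of the original module): `command_args`
# is the argparse Namespace built by the dispatchers. A minimal stand-in
# with hypothetical credentials would be:
#
#     from argparse import Namespace
#     args = Namespace(username="my_user", api_key="my_api_key",
#                      dev_mode=False, debug=False, store=True)
#     api = get_api_instance(args, "./storage")
#
# Attributes named in STORED_MODELS are looked up with hasattr, so a
# Namespace that lacks them simply falls back to the storage_path.
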
def add_gazibit_links(resource, output_dir=None, shared=False):
    """ Adds the link to the resource in the corresponding section of the
        report template

    """
    try:
        gazibit_tmp = GAZIBIT_SHARED if shared else GAZIBIT_PRIVATE
        path = check_dir(os.path.join(output_dir, REPORTS_DIR,
                                      os.path.basename(gazibit_tmp)))
        input_file = os.path.join(path, os.path.basename(gazibit_tmp))
        output_file = tempfile.NamedTemporaryFile(
            mode="w", dir=output_dir, delete=False)

        if not os.path.isfile(input_file):
            shutil.copyfile(gazibit_tmp, input_file)
        with open(input_file, "r") as report_template:
            with output_file as report_output:
                content = report_template.read()
                resource_type = bigml.api.get_resource_type(resource)
                resource_type = resource_type.upper()
                url_template = URL_TEMPLATE % resource_type
                content = content.replace(url_template,
                                          get_url(resource, shared=shared))
                section_template = SECTION_START % resource_type
                content = content.replace(section_template, "")
                section_template = SECTION_END % resource_type
                content = content.replace(section_template, "")
                report_output.write(content)
        os.remove(input_file)
        os.rename(output_file.name, input_file)
    except IOError as exc:
        os.remove(output_file.name)
        sys.exit("Failed to generate the gazibit output report. %s"
                 % str(exc))
def kfold_evaluate(datasets_file, args, counter, common_options,
                   penalty=DEFAULT_PENALTY,
                   metric=ACCURACY, resume=False):
    """Scoring k-fold cross-validation using the given feature subset

    """
    # create evaluation with input_fields
    args.output_dir = os.path.normpath(os.path.join(
        u.check_dir(datasets_file), "kfold"))
    evaluation, resume = create_kfold_evaluations(datasets_file, args,
                                                  common_options,
                                                  resume=resume,
                                                  counter=counter)

    evaluation = extract_evaluation_info(
        evaluation, args.optimize_category)
    avg_metric = AVG_PREFIX % metric
    metric_literal = metric
    if avg_metric not in evaluation:
        avg_metric = AVG_PREFIX % R_SQUARED
        metric_literal = R_SQUARED
        if avg_metric not in evaluation:
            sys.exit("Failed to find %s or r-squared in the evaluation"
                     % metric)
    invert = -1 if metric in MINIMIZE_OPTIONS else 1
    return (invert * (evaluation[avg_metric] -
                      invert * penalty *
                      len(args.model_fields.split(args.args_separator))),
            evaluation[avg_metric], metric_literal, resume)
def clear_reports(output_dir):
    """Clears the useless sections of the reports

    """
    # read report files
    path = check_dir(os.path.join(output_dir, REPORTS_DIR, GAZIBIT_TOKEN))
    for report_file in os.listdir(path):
        input_file = os.path.join(path, report_file)
        output_file = tempfile.NamedTemporaryFile(
            mode="w", dir=output_dir, delete=False)
        try:
            with open(input_file, "r") as report_template:
                with output_file as report_output:
                    content = report_template.read()
                    while content.find(SECTION_START_PREFIX) > 0:
                        start = content.find(SECTION_START_PREFIX)
                        end = content.find("\n",
                                           content.find(SECTION_END_PREFIX))
                        content = "%s%s" % (content[0: start], content[end:])
                    report_output.write(content)
            os.remove(input_file)
            os.rename(output_file.name, input_file)
        except IOError as exc:
            os.remove(output_file.name)
            sys.exit("Failed to generate the output report. %s" % str(exc))
def execute_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    command = command_handling(args, COMMAND_LOG)
    default_output = 'whizzml_results'

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    resume = command_args.resume
    if command_args.resume:
        command_args, session_file, output_dir = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
        if command_args.output is None:
            command_args.output = os.path.join(output_dir, default_output)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        if command_args.output is None:
            command_args.output = os.path.join(command_args.output_dir,
                                               default_output)
        if len(os.path.dirname(command_args.output).strip()) == 0:
            command_args.output = os.path.join(command_args.output_dir,
                                               command_args.output)
        directory = u.check_dir(command_args.output)
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE))
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)

    # Creates the corresponding api instance
    api = a.get_api_instance(command_args, u.check_dir(session_file))
    _ = a.get_output_args(api, command_args, resume)
    a.transform_args(command_args, command.flags, api, command.user_defaults)
    execute_whizzml(command_args, api, session_file)
    u.log_message("_" * 80 + "\n", log_file=session_file)
def remote_predict_models(models, test_reader, prediction_file, api, args,
                          resume=False, output_path=None, session_file=None,
                          log=None, exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to
       file

    """
    predictions_files = []
    prediction_args = {
        "tags": args.tag
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(
                        model, input_data_dict, by_name=test_set_header,
                        wait_time=0, args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2], prediction_file,
                                         args.prediction_info, input_data,
                                         exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files,
                      Model(models[0]).to_prediction,
                      prediction_file, args.method,
                      args.prediction_info, raw_input_data_list, exclude)
def execute_whizzml(args, api, session_file):
    """executes the code in a script or a source code file

    """
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])
    path = args.output_dir
    if args.to_library:
        pw.library_processing(
            api, args, session_file=session_file, path=path, log=log)
    else:
        if args.script_file:
            # script is retrieved from the contents of the given local file
            script, _, _ = u.read_local_resource(args.script_file)
            args.script = script['resource']
            args.script_ids = [args.script]
        elif args.code_file or args.code:
            script, scripts = pw.script_processing(
                api, args, session_file=session_file, path=path, log=log)
            args.script = script if isinstance(script, basestring) else \
                script.get('resource')
            args.script_ids = scripts

        if (args.script or args.scripts) and not args.no_execute:
            execution = pw.execution_processing(
                api, args, session_file=session_file, path=path, log=log)
            execution = r.get_execution(
                execution, api, args.verbosity, session_file)
            r.save_txt_and_json(execution['object']['execution'],
                                args.output, api=api)
            args.execution = execution['resource']

    u.log_message("_" * 80 + "\n", log_file=session_file)
    u.print_generated_files(args.output_dir, log_file=session_file,
                            verbosity=args.verbosity)
def delete_resources(command_args, api):
    """Deletes the resources selected by the user given options

    """
    if command_args.predictions is None:
        path = a.NOW
    else:
        path = u.check_dir(command_args.predictions)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    message = u.dated("Retrieving objects to delete.\n")
    u.log_message(message, log_file=session_file,
                  console=command_args.verbosity)
    delete_list = []
    if command_args.delete_list:
        delete_list = map(str.strip, command_args.delete_list.split(','))
    if command_args.delete_file:
        if not os.path.exists(command_args.delete_file):
            sys.exit("File %s not found" % command_args.delete_file)
        delete_list.extend([line for line
                            in open(command_args.delete_file, "r")])
    resource_selectors = [
        (command_args.source_tag, api.list_sources),
        (command_args.dataset_tag, api.list_datasets),
        (command_args.model_tag, api.list_models),
        (command_args.prediction_tag, api.list_predictions),
        (command_args.evaluation_tag, api.list_evaluations),
        (command_args.ensemble_tag, api.list_ensembles),
        (command_args.batch_prediction_tag, api.list_batch_predictions)]
    for selector, api_call in resource_selectors:
        query_string = None
        if command_args.all_tag:
            query_string = "tags__in=%s" % command_args.all_tag
        elif selector:
            query_string = "tags__in=%s" % selector
        if query_string:
            delete_list.extend(u.list_ids(api_call, query_string))
    message = u.dated("Deleting objects.\n")
    u.log_message(message, log_file=session_file,
                  console=command_args.verbosity)
    message = "\n".join(delete_list)
    u.log_message(message, log_file=session_file)
    u.delete(api, delete_list)
    if sys.platform == "win32" and sys.stdout.isatty():
        message = (u"\nGenerated files:\n\n" +
                   unicode(u.print_tree(path, " "), "utf-8") + u"\n")
    else:
        message = "\nGenerated files:\n\n" + u.print_tree(path, " ") + "\n"
    u.log_message(message, log_file=session_file,
                  console=command_args.verbosity)
def predict(test_set, test_set_header, models, fields, output,
            objective_field, remote=False, api=None, log=None,
            max_models=MAX_MODELS, method=0, resume=False,
            tags=None, verbosity=1, session_file=None, debug=False,
            ensemble_id=None, prediction_info=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions can be computed remotely, locally using MultiModels built
       on all the models or locally using MultiModels on subgroups of models.
       Choosing a max_batch_models value not bigger than the number_of_models
       flag will lead to the last case, where memory usage is bounded and
       each model's predictions are saved for further use.
    """
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field)
    prediction_file = output
    output_path = u.check_dir(output)
    output = csv.writer(open(output, 'w', 0), lineterminator="\n")
    # Remote predictions: predictions are computed in bigml.com and stored
    # in a file named after the model in the following syntax:
    #     model_[id of the model]__predictions.csv
    # For instance,
    #     model_50c0de043b563519830001c2_predictions.csv
    if remote:
        if ensemble_id is not None:
            remote_predict_ensemble(ensemble_id, test_reader, prediction_file,
                                    api, resume, verbosity, output_path,
                                    method, tags, session_file, log, debug,
                                    prediction_info)
        else:
            remote_predict(models, test_reader, prediction_file, api,
                           resume, verbosity, output_path, method, tags,
                           session_file, log, debug, prediction_info)
    # Local predictions: Predictions are computed locally using models' rules
    # with MultiModel's predict method
    else:
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=verbosity)
        # For a small number of models, we build a MultiModel using all of
        # the given models and issue a combined prediction
        if len(models) < max_models:
            local_predict(models, test_reader, output, method,
                          prediction_info)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            local_batch_predict(models, test_reader, prediction_file, api,
                                max_models, resume, output_path, output,
                                verbosity, method, session_file, debug,
                                prediction_info)
def remote_predict(models, test_reader, prediction_file, api,
                   resume=False, verbosity=True, output_path=None,
                   method=PLURALITY_CODE, tags="",
                   session_file=None, log=None, debug=False,
                   prediction_info=None):
    """Retrieve predictions remotely, combine them and save predictions to
       file

    """
    predictions_files = []
    prediction_args = {
        "tags": tags
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False
    raw_input_data_list = []
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(), debug=debug)):
            if not message_logged:
                message = u.dated("Creating remote predictions.")
                u.log_message(message, log_file=session_file,
                              console=verbosity)
                message_logged = True
            predictions_file = csv.writer(open(predictions_file, 'w', 0),
                                          lineterminator="\n")
            for input_data in test_reader:
                raw_input_data_list.append(input_data)
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(model, input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction)
                predictions_file.writerow(prediction_row)
    combine_votes(predictions_files,
                  Model(models[0]).to_prediction,
                  prediction_file, method,
                  prediction_info, raw_input_data_list)
def fill_remote_context(command_args, command, session_file,
                        resume=False, api=None):
    """Fills the part of the context that needs to be retrieved from the
       remote server. Creates a connection to the API and manages the
       requests that retrieve the resource ids to be used. Transforms
       arguments from the command-line-friendly format to the required
       structure.

    """
    if api is None:
        api = a.get_api_instance(command_args, u.check_dir(session_file))
    a.get_output_args(api, command_args, resume)
    a.transform_args(command_args, command.flags, api)
    return command_args, api
def combine_votes(votes_files, to_prediction, to_file, method=0,
                  prediction_info=None, input_data_list=None):
    """Combines the votes found in the votes' files and stores predictions.

       votes_files: should contain the list of file names
       to_prediction: is the Model method that casts prediction to numeric
                      type if needed
       to_file: is the name of the final output file.
    """
    votes = read_votes(votes_files, to_prediction)

    u.check_dir(to_file)
    output = csv.writer(open(to_file, 'w', 0), lineterminator="\n")
    number_of_tests = len(votes)
    if input_data_list is None or len(input_data_list) != number_of_tests:
        input_data_list = None
    for index in range(0, number_of_tests):
        multivote = votes[index]
        input_data = (None if input_data_list is None
                      else input_data_list[index])
        write_prediction(multivote.combine(method, True), output,
                         prediction_info, input_data)
def combine_votes(votes_files, to_prediction, to_file, method=0,
                  prediction_info=NORMAL_FORMAT, input_data_list=None,
                  exclude=None):
    """Combines the votes found in the votes' files and stores predictions.

       votes_files: should contain the list of file names
       to_prediction: is the Model method that casts prediction to numeric
                      type if needed
       to_file: is the name of the final output file.
    """
    votes = read_votes(votes_files, to_prediction)

    u.check_dir(to_file)
    with UnicodeWriter(to_file) as output:
        number_of_tests = len(votes)
        if input_data_list is None or \
                len(input_data_list) != number_of_tests:
            input_data_list = None
        for index in range(0, number_of_tests):
            multivote = votes[index]
            input_data = (None if input_data_list is None
                          else input_data_list[index])
            write_prediction(multivote.combine(method, full=True), output,
                             prediction_info, input_data, exclude)
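
# Illustrative sketch (not part of the original module): callers such as
# remote_predict_models above pass the per-model vote files together with
# the first model's to_prediction method. The file names and model id
# below are hypothetical:
#
#     from bigml.model import Model
#     model_id = "model/50c0de043b563519830001c2"  # hypothetical id
#     votes_files = ["model_50c0de043b563519830001c2__predictions.csv",
#                    "model_50c0de043b563519830001c3__predictions.csv"]
#     combine_votes(votes_files, Model(model_id).to_prediction,
#                   "predictions.csv", method=0)  # 0: plurality combiner
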
def upload_reports(report_types, output_dir):
    """Uploads the reports to their respective remote location

    """
    if GAZIBIT in report_types:
        if os.environ.get(GAZIBIT_TOKEN) is not None:
            output_file = os.path.join(output_dir, REPORTS_DIR,
                                       os.path.basename(GAZIBIT_PRIVATE))
            path = check_dir(output_file)
            gazibit_upload(output_file, exit=True)
            output_file = os.path.join(output_dir, REPORTS_DIR,
                                       os.path.basename(GAZIBIT_SHARED))
            gazibit_upload(output_file)
        else:
            sys.exit("To upload your gazibit report you need to"
                     " set your gazibit token in the GAZIBIT_TOKEN"
                     " environment variable. Failed to find GAZIBIT_TOKEN.")
def create_candidates_evaluations(datasets_file, args, command_obj,
                                  resume=False,
                                  random_candidates=DEFAULT_MIN_CANDIDATES):
    """ Create random candidates ensembles evaluations

    """
    global subcommand_list
    output_dir = os.path.normpath(u.check_dir(
        os.path.join(u"%s%s" % (args.output_dir, random_candidates),
                     "evaluation.json")))
    command = COMMANDS["random_candidates"] % (
        datasets_file, random_candidates, output_dir)
    command_args = command.split()
    """
    common_options_list = u.get_options_list(args, command_obj.common_options,
                                             prioritary=command_args)
    command_args.extend(common_options_list)
    """
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)
    command_obj.propagate(command_args, exclude=["--dataset",
                                                 "--datasets",
                                                 "--dataset-file"])
    command = rebuild_command(command_args)
    if resume:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)

    evaluation_file = os.path.normpath(os.path.join(output_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def create_kfold_cv(args, api, common_options, resume=False):
    """Creates the kfold cross-validation

    """
    set_subcommand_file(args.output_dir)
    if resume:
        retrieve_subcommands()
    datasets_file, objective_name, resume = create_kfold_datasets_file(
        args, api, common_options, resume=resume)
    if datasets_file is not None:
        args.output_dir = os.path.join(u.check_dir(datasets_file),
                                       KFOLD_SUBDIR)
        message = ('Creating the kfold evaluations.........\n')
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        create_kfold_evaluations(datasets_file, args, common_options,
                                 resume=resume)
def create_kfold_evaluations(datasets_file, args, common_options,
                             resume=False, counter=0):
    """ Create k-fold cross-validation from a datasets file

    """
    global subcommand_list
    output_dir = u.check_dir(os.path.join("%s%s" % (args.output_dir, counter),
                                          "evaluation.json"))
    model_fields = args.model_fields
    name_suffix = "_subset_%s" % counter
    name_max_length = NAME_MAX_LENGTH - len(name_suffix)
    name = "%s%s" % (args.name[0: name_max_length], name_suffix)
    command = COMMANDS["create_cv"] % (datasets_file, output_dir, name)
    command_args = command.split()
    if model_fields:
        command_args.append("--model-fields")
        command_args.append(model_fields)
    common_options_list = u.get_options_list(args, common_options,
                                             prioritary=command_args)
    command_args.extend(common_options_list)
    command = " ".join(command_args)
    if resume:
        next_command = subcommand_list.pop().strip()
        if next_command != command:
            resume = False
            u.log_message("%s\n" % command, log_file=subcommand_file,
                          console=False)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.log_message("%s\n" % command, log_file=subcommand_file,
                      console=False)
        main_dispatcher(args=command_args)

    evaluation_file = os.path.join(output_dir, "evaluation.json")
    try:
        with open(evaluation_file) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def remote_predict_ensemble(ensemble_id, test_reader, prediction_file, api,
                            resume=False, verbosity=True, output_path=None,
                            method=PLURALITY_CODE, tags="",
                            session_file=None, log=None, debug=False,
                            prediction_info=None):
    """Retrieve predictions remotely and save predictions to file

    """
    prediction_args = {
        "tags": tags,
        "combiner": method
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)

    if (not resume or
            not c.checkpoint(c.are_predictions_created, prediction_file,
                             test_reader.number_of_tests(), debug=debug)):
        message = u.dated("Creating remote predictions.")
        u.log_message(message, log_file=session_file, console=verbosity)

        predictions_file = csv.writer(open(prediction_file, 'w', 0),
                                      lineterminator="\n")
        for input_data in test_reader:
            input_data_dict = test_reader.dict(input_data)
            prediction = api.create_prediction(ensemble_id, input_data_dict,
                                               by_name=test_set_header,
                                               wait_time=0,
                                               args=prediction_args)
            prediction = u.check_resource(prediction, api.get_prediction)
            u.check_resource_error(prediction,
                                   "Failed to create prediction: ")
            u.log_message("%s\n" % prediction['resource'], log_file=log)
            prediction_row = prediction_to_row(prediction, prediction_info)
            write_prediction(prediction_row, predictions_file,
                             prediction_info, input_data)
def create_node_th_evaluations(datasets_file, args, common_options,
                               resume=False,
                               node_threshold=DEFAULT_MIN_NODES):
    """ Create node_threshold evaluations

    """
    global subcommand_list
    output_dir = os.path.normpath(u.check_dir(
        os.path.join(u"%s%s" % (args.output_dir, node_threshold),
                     "evaluation.json")))
    command = COMMANDS["node_threshold"] % (
        datasets_file, node_threshold, output_dir)
    command_args = command.split()
    common_options_list = u.get_options_list(args, common_options,
                                             prioritary=command_args)
    command_args.extend(common_options_list)
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)
    command = rebuild_command(command_args)
    if resume:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)

    evaluation_file = os.path.normpath(os.path.join(output_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
def remote_predict_ensemble(ensemble_id, test_reader, prediction_file, api,
                            args, resume=False, output_path=None,
                            session_file=None, log=None, exclude=None):
    """Retrieve predictions remotely and save predictions to file

    """
    prediction_args = {
        "tags": args.tag,
        "combiner": args.method
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)

    if (not resume or
            not c.checkpoint(c.are_predictions_created, prediction_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)[0]):
        message = u.dated("Creating remote predictions.")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        with UnicodeWriter(prediction_file) as predictions_file:
            for input_data in test_reader:
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(ensemble_id,
                                                   input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                prediction = u.check_resource(prediction,
                                              api.get_prediction)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction,
                                                   args.prediction_info)
                write_prediction(prediction_row, predictions_file,
                                 args.prediction_info, input_data, exclude)
def compute_output(api, args):
    """ Creates a fusion using the `models` list or uses the ids
        of a previously created BigML fusion to make predictions for the
        `test_set`.

    """
    fusion = None

    # variables from command-line options
    resume = args.resume_
    fusion_ids = args.fusion_ids_
    output = args.predictions
    # there's only one fusion to be generated at present
    args.max_parallel_fusions = 1
    # fusion cannot be published yet.
    args.public_fusion = False

    # It is compulsory to have a description to publish either datasets or
    # fusions
    if (not args.description_ and args.public_fusion):
        sys.exit("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    if args.fusion_file:
        # fusion is retrieved from the contents of the given local JSON file
        fusion, csv_properties, fields = u.read_local_resource(
            args.fusion_file, csv_properties=csv_properties)
        fusion_ids = [fusion]
    else:
        # fusion is retrieved from the remote object or created
        fusion, resume = pf.fusion_processing(
            fusion, fusion_ids, api, args, resume,
            session_file=session_file, path=path, log=log)

    # We update the fusion public state if needed
    if fusion:
        if isinstance(fusion, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            fusion = u.check_resource(fusion, api.get_fusion,
                                      query_string=query_string)
        if (args.public_fusion or
                (args.shared_flag and
                 r.shared_changed(args.shared, fusion))):
            fusion_args = {}
            if args.shared_flag and r.shared_changed(args.shared, fusion):
                fusion_args.update(shared=args.shared)
            if args.public_fusion:
                fusion_args.update(
                    rfus.set_publish_fusion_args(args))
            if fusion_args:
                fusion = rfus.update_fusion(
                    fusion, fusion_args, args, api=api, path=path,
                    session_file=session_file)

    # We get the fields of the fusion if we haven't got
    # them yet and need them
    if fusion and (args.test_set or args.evaluate):
        fields = pf.get_fusion_fields(
            fusion, csv_properties, args)

    # If predicting
    if fusion and (a.has_test(args) or args.remote):
        test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            if not args.evaluate:
                batch_prediction_args = set_batch_prediction_args(
                    args, fields=fields, dataset_fields=test_fields)

                remote_prediction(fusion, test_dataset,
                                  batch_prediction_args, args,
                                  api, resume, prediction_file=output,
                                  session_file=session_file, path=path,
                                  log=log)
        else:
            prediction([fusion], fields, args, session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        args.max_parallel_evaluations = 1  # only one evaluation at present
        args.cross_validation_rate = 0  # no cross-validation
        args.number_of_evaluations = 1  # only one evaluation
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        dataset = test_dataset
        dataset = u.check_resource(dataset, api=api,
                                   query_string=r.ALL_FIELDS_QS)
        dataset_fields = pd.get_fields_structure(dataset, None)
        resume = evaluate([fusion], [dataset], api, args, resume,
                          fields=fields, dataset_fields=dataset_fields,
                          session_file=session_file, path=path,
                          log=log, objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates a dataset using the `training_set`.

    """
    source = None
    dataset = None
    fields = None
    other_label = OTHER
    multi_label_data = None
    multi_label_fields = []
    datasets = None

    # variables from command-line options
    resume = args.resume_
    output = args.output
    dataset_fields = args.dataset_fields_

    check_args_coherence(args)
    path = u.check_dir(output)

    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else
              [label.strip() for label in
               args.labels.split(args.args_separator)])
    if labels is not None:
        labels = sorted([label for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set, args.train_header, args, path,
            labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if source is not None:
        args.source = bigml.api.get_source_id(source)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)

    if datasets:
        dataset = datasets[-1]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

    # Now we have a dataset, let's check if there's an objective_field
    # given by the user and update it in the fields structure
    args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file,
                    path=path, log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")
    # If any of the transformations is applied,
    # generate a new dataset from the given list of datasets
    if args.new_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    if args.new_fields or args.sample_rate != 1 or \
            (args.lisp_filter or args.json_filter) and not has_source(args):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
        args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({'objective_field': args.objective_name_,
                               'objective_field_present': True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label',
                                   other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """
    time_series = None
    time_series_set = None

    # variables from command-line options
    resume = args.resume_
    time_series_ids = args.time_series_ids_
    output = args.predictions
    # there's only one time_series to be generated at present
    args.max_parallel_time_series = 1
    args.max_parallel_evaluations = 1
    # time_series cannot be published yet.
    args.public_time_series = False
    # no cross-validations
    args.dataset_off = False
    args.cross_validation_rate = 0
    args.number_of_evaluations = 1

    # It is compulsory to have a description to publish either datasets or
    # time_series
    if (not args.description_ and (args.public_time_series or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)
        # if the time series is going to be evaluated, and we don't have
        # test data, we need to divide the rows using ranges, so we'll need
        # max rows
        args.max_rows = datasets[0]["object"]["rows"]

    if args.time_series_file:
        # time-series is retrieved from the contents of the given local
        # JSON file
        time_series, csv_properties, fields = u.read_local_resource(
            args.time_series_file,
            csv_properties=csv_properties)
        time_series_set = [time_series]
        time_series_ids = [time_series['resource']]
    else:
        # time-series is retrieved from the remote object
        time_series_set, time_series_ids, resume = \
            pts.time_series_processing(
                datasets, time_series_set, time_series_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
    if time_series_set:
        time_series = time_series_set[0]

    # We update the time-series' public state if needed
    if time_series:
        if isinstance(time_series, basestring):
            query_string = r.ALL_FIELDS_QS
            time_series = u.check_resource(time_series,
                                           api.get_time_series,
                                           query_string=query_string)
        time_series_set[0] = time_series
        if (args.public_time_series or
                (args.shared_flag and
                 r.shared_changed(args.shared, time_series))):
            time_series_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     time_series):
                time_series_args.update(shared=args.shared)
            if args.public_time_series:
                time_series_args.update(
                    r.set_publish_time_series_args(args))
            if time_series_args:
                time_series = r.time_series(
                    time_series, time_series_args, args,
                    api=api, path=path,
                    session_file=session_file)
                time_series_set[0] = time_series

    """
    # We get the fields of the time-series if we haven't got
    # them yet and need them
    if time_series and (args.test_set or args.export_fields):
        fields = pts.get_time_series_fields(
            time_series, csv_properties, args)
    """

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If forecasting
    if time_series_set and a.has_ts_test(args):
        if args.remote:
            forecast_args = r.set_forecast_args(
                args, fields=fields)
            remote_forecast(time_series, forecast_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)
        else:
            forecast(time_series, args,
                     session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(time_series_set, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            else:
                args.range_ = [int(args.max_rows * r.EVALUATE_SAMPLE_RATE),
                               args.max_rows]
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(time_series_set, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def anomaly_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ # If --clear-logs the log files are cleared if "--clear-logs" in args: clear_log_files(LOG_FILES) command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) resume = command_args.resume if command_args.resume: # Keep the debug option if set debug = command_args.debug # Restore the args of the call to resume from the command log file stored_command = StoredCommand(args, COMMAND_LOG, DIRS_LOG) command = Command(None, stored_command=stored_command) # Logs the issued command and the resumed command session_file = os.path.join(stored_command.output_dir, SESSIONS_LOG) stored_command.log_command(session_file=session_file) # Parses resumed arguments. command_args = a.parse_and_check(command) if command_args.predictions is None: command_args.predictions = os.path.join(stored_command.output_dir, DEFAULT_OUTPUT) else: if command_args.output_dir is None: command_args.output_dir = a.NOW if command_args.predictions is None: command_args.predictions = os.path.join(command_args.output_dir, DEFAULT_OUTPUT) if len(os.path.dirname(command_args.predictions).strip()) == 0: command_args.predictions = os.path.join(command_args.output_dir, command_args.predictions) directory = u.check_dir(command_args.predictions) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: defaults_file = open(DEFAULTS_FILE, 'r') contents = defaults_file.read() defaults_file.close() defaults_copy = open(os.path.join(directory, DEFAULTS_FILE), 'w', 0) defaults_copy.write(contents) defaults_copy.close() except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) # Creates the corresponding api instance if resume and debug: command_args.debug = True api = a.get_api_instance(command_args, u.check_dir(session_file)) # Selects the action to perform if (has_train(command_args) or has_test(command_args)): output_args = a.get_output_args(api, command_args, resume) a.transform_args(command_args, command.flags, api, command.user_defaults) compute_output(**output_args) u.log_message("_" * 80 + "\n", log_file=session_file)
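
# A minimal usage sketch for this dispatcher (file names are hypothetical):
#
#   bigmler anomaly --train data/tiny_kdd.csv --test data/test_kdd.csv
#
# builds an anomaly detector and scores the test rows; an interrupted run
# can be picked up later with:
#
#   bigmler anomaly --resume
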
def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ association = None associations = None # no multi-label support at present # variables from command-line options resume = args.resume_ association_ids = args.association_ids_ output = args.predictions # there's only one association resource to be generated at present args.max_parallel_associations = 1 # associations cannot be published yet. args.public_association = False # It is compulsory to have a description to publish either datasets or # associations if (not args.description_ and (args.public_association or args.public_dataset)): sys.exit("You should provide a description to publish.") # When using --new-fields, it is compulsory to specify also a dataset # id if args.new_fields and not args.dataset: sys.exit("To use --new-fields you must also provide a dataset id" " to generate the new dataset from it.") path = u.check_dir(output) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # basic pre-model step: creating or retrieving the source related info source, resume, csv_properties, fields = pms.get_source_info( api, args, resume, csv_properties, session_file, path, log) # basic pre-model step: creating or retrieving the dataset related info dataset_properties = pms.get_dataset_info(api, args, resume, source, csv_properties, fields, session_file, path, log) (_, datasets, test_dataset, resume, csv_properties, fields) = dataset_properties if args.association_file: # association is retrieved from the contents of the given local JSON # file association, csv_properties, fields = u.read_local_resource( args.association_file, csv_properties=csv_properties) associations = [association] association_ids = [association['resource']] else: # association is retrieved from the remote object associations, association_ids, resume = pa.associations_processing( datasets, associations, association_ids, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) if associations: association = associations[0] # We update the association's public state if needed if association: if isinstance(association, basestring): if not a.has_test(args): query_string = MINIMUM_MODEL else: query_string = '' association = u.check_resource(association, api.get_association, query_string=query_string) associations[0] = association if (args.public_association or (args.shared_flag and r.shared_changed(args.shared, association))): association_args = {} if args.shared_flag and \ r.shared_changed(args.shared, association): association_args.update(shared=args.shared) if args.public_association: association_args.update(r.set_publish_association_args(args)) if association_args: association = r.update_association( \ association, association_args, args, api=api, path=path, session_file=session_file) associations[0] = association # We get the fields of the association if we haven't got # them yet and need them if association and args.test_set: fields = pa.get_association_fields(association, csv_properties, args) # If predicting if associations and (a.has_test(args) or (test_dataset and args.remote)): if test_dataset is None: test_dataset = get_test_dataset(args) # Remote association sets: association sets are computed as # 
batch association sets # in bigml.com except when --no-batch flag is set. They are currently # not supported yet if args.remote and not args.no_batch: sys.exit("Batch association sets are currently not supported.") """ # create test source from file test_name = "%s - test" % args.name if args.test_source is None: test_properties = ps.test_source_processing( api, args, resume, name=test_name, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) test_fields = pd.get_fields_structure(test_dataset, csv_properties) batch_association_args = r.set_batch_association_args( args, fields=fields, dataset_fields=test_fields) remote_association( \ association, test_dataset, batch_association_args, args, api, resume, prediction_file=output, session_file=session_file, path=path, log=log) """ else: sys.exit("Local prediction of association sets is currently" " not supported.") """ association_set(associations, fields, args, session_file=session_file) """ u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
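
# A minimal usage sketch for the association flow above (file name is
# hypothetical):
#
#   bigmler association --train data/groceries.csv
#
# builds source, dataset and association rules. As the code above shows,
# neither remote batch association sets nor local association-set
# predictions are available yet, so any --test input ends in sys.exit here.
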
def reify_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) if command_args.resume: command_args, session_file, _ = get_stored_command( args, command_args.debug, command_log=COMMAND_LOG, dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG) if command_args.output is None: command_args.output = os.path.join(command_args.output_dir, DEFAULT_OUTPUT) else: if command_args.output_dir is None: command_args.output_dir = a.NOW if command_args.output is None: command_args.output = os.path.join(command_args.output_dir, DEFAULT_OUTPUT) if len(os.path.dirname(command_args.output).strip()) == 0: command_args.output = os.path.join(command_args.output_dir, command_args.output) directory = u.check_dir(command_args.output) command_args.output_dir = directory session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) directory = u.check_dir(os.path.join(command_args.output_dir, "tmp")) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE)) except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) # If --clear-logs the log files are cleared if "--clear-logs" in args: clear_log_files(LOG_FILES) def logger(message): """Partial to log messages according to args.verbosity """ u.log_message(u.dated(message), \ log_file=session_file, console=command_args.verbosity) # Creates the corresponding api instance api = a.get_api_instance(command_args, u.check_dir(session_file)) message = "Starting reification for %s\n\n" % command_args.resource_id u.log_message(message, \ log_file=session_file, console=command_args.verbosity) reify_resources(command_args, api, logger) message = "\nReification complete. See the results in %s\n\n" % \ command_args.output u.log_message(message, \ log_file=session_file, console=command_args.verbosity) u.log_message("_" * 80 + "\n", log_file=session_file) u.print_generated_files(command_args.output_dir, log_file=session_file, verbosity=command_args.verbosity)
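
# A minimal usage sketch for reification (the resource id and the flag
# spellings are assumptions; check bigmler reify --help for the exact
# option names):
#
#   bigmler reify --id source/4f603fe203ce89bb2d000000 --output reify.py
#
# writes a script that rebuilds the given resource into the --output file
# reported by the completion message above.
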
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    deepnet = None
    deepnets = None

    # variables from command-line options
    resume = args.resume_
    deepnet_ids = args.deepnet_ids_
    output = args.predictions
    # there's only one deepnet to be generated at present
    args.max_parallel_deepnets = 1
    # deepnets cannot be published yet.
    args.public_deepnet = False

    # It is compulsory to have a description to publish either datasets or
    # deepnets
    if (not args.description_ and (args.public_deepnet or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    if args.deepnet_file:
        # deepnet is retrieved from the contents of the given local
        # JSON file
        deepnet, csv_properties, fields = u.read_local_resource(
            args.deepnet_file,
            csv_properties=csv_properties)
        deepnets = [deepnet]
        deepnet_ids = [deepnet['resource']]
    else:
        # deepnet is retrieved from the remote object
        deepnets, deepnet_ids, resume = \
            pdn.deepnets_processing( \
            datasets, deepnets, deepnet_ids, \
            api, args, resume, fields=fields, \
            session_file=session_file, path=path, log=log)

    if deepnets:
        deepnet = deepnets[0]

    # We update the deepnet's public state if needed
    if deepnet:
        if isinstance(deepnet, basestring) or \
                api.status(deepnet) != bigml.api.FINISHED:
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            deepnet = u.check_resource(deepnet, api.get_deepnet,
                                       query_string=query_string)
            deepnets[0] = deepnet
        if (args.public_deepnet or
                (args.shared_flag and
                 r.shared_changed(args.shared, deepnet))):
            deepnet_args = {}
            if args.shared_flag and r.shared_changed(args.shared, deepnet):
                deepnet_args.update(shared=args.shared)
            if args.public_deepnet:
                deepnet_args.update( \
                    r.set_publish_deepnet_args(args))
            if deepnet_args:
                deepnet = r.update_deepnet( \
                    deepnet, deepnet_args, args, api=api, path=path, \
                    session_file=session_file)
                deepnets[0] = deepnet

    # We get the fields of the deepnet if we haven't got
    # them yet and need them
    if deepnet and (args.test_set or args.export_fields):
        fields = pdn.get_deepnet_fields( \
            deepnet, csv_properties, args)

    if fields and args.export_fields:
fields.summary_csv(os.path.join(path, args.export_fields)) # If predicting if deepnets and (a.has_test(args) or \ (test_dataset and args.remote)): if test_dataset is None: test_dataset = get_test_dataset(args) # Remote predictions: predictions are computed as batch predictions # in bigml.com except when --no-batch flag is set on if args.remote and not args.no_batch: # create test source from file test_name = "%s - test" % args.name if args.test_source is None: test_properties = ps.test_source_processing( api, args, resume, name=test_name, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) csv_properties.update(objective_field=None, objective_field_present=False) test_fields = pd.get_fields_structure(test_dataset, csv_properties) batch_prediction_args = r.set_batch_prediction_args( args, fields=fields, dataset_fields=test_fields) remote_dn_prediction(deepnet, test_dataset, \ batch_prediction_args, args, \ api, resume, prediction_file=output, \ session_file=session_file, path=path, log=log) else: dn_prediction(deepnets, fields, args, session_file=session_file) # If evaluate flag is on, create remote evaluation and save results in # json and human-readable format. if args.evaluate: # When we resume evaluation and models were already completed, we # should use the datasets array as test datasets if args.has_test_datasets_: test_dataset = get_test_dataset(args) if args.dataset_off and not args.has_test_datasets_: args.test_dataset_ids = datasets if args.test_dataset_ids and args.dataset_off: # Evaluate the models with the corresponding test datasets. test_dataset_id = bigml.api.get_dataset_id( \ args.test_dataset_ids[0]) test_dataset = api.check_resource(test_dataset_id) csv_properties.update(objective_field=None, objective_field_present=False) test_fields = pd.get_fields_structure(test_dataset, csv_properties) resume = evaluate(deepnets, args.test_dataset_ids, api, args, resume, fields=fields, dataset_fields=test_fields, session_file=session_file, path=path, log=log, objective_field=args.objective_field) else: dataset = datasets[0] if args.test_split > 0 or args.has_test_datasets_: dataset = test_dataset dataset = u.check_resource(dataset, api=api, query_string=r.ALL_FIELDS_QS) dataset_fields = pd.get_fields_structure(dataset, None) resume = evaluate(deepnets, [dataset], api, args, resume, fields=fields, dataset_fields=dataset_fields, session_file=session_file, path=path, log=log, objective_field=args.objective_field) u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
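
# A minimal usage sketch for the deepnet flow above (file names are
# hypothetical):
#
#   bigmler deepnet --train data/iris.csv --test data/test_iris.csv
#
# trains a single deepnet (max_parallel_deepnets is forced to 1 above) and
# predicts locally; adding --remote routes predictions through a batch
# prediction unless --no-batch is also set.
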
def compute_output(api, args, training_set, test_set=None, output=None, objective_field=None, description=None, field_attributes=None, types=None, dataset_fields=None, model_fields=None, name=None, training_set_header=True, test_set_header=True, model_ids=None, votes_files=None, resume=False, fields_map=None): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ source = None dataset = None model = None models = None fields = None # It is compulsory to have a description to publish either datasets or # models if (not description and (args.black_box or args.white_box or args.public_dataset)): raise Exception("You should provide a description to publish.") path = u.check_dir(output) session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required, open the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared if args.clear_logs: try: open(log, 'w', 0).close() except IOError: pass source, resume, csv_properties, fields = source_processing( training_set, test_set, training_set_header, test_set_header, name, description, api, args, resume, csv_properties=csv_properties, field_attributes=field_attributes, types=types, session_file=session_file, path=path, log=log) dataset, resume, csv_properties, fields = dataset_processing( source, training_set, test_set, model_ids, name, description, fields, dataset_fields, api, args, resume, csv_properties=csv_properties, session_file=session_file, path=path, log=log) # If test_split is used, split the dataset in a training and a test dataset # according to the given split if args.test_split > 0: dataset, test_dataset, resume = split_processing( dataset, name, description, api, args, resume, session_file=session_file, path=path, log=log) models, model_ids, resume = models_processing(dataset, models, model_ids, name, description, test_set, objective_field, fields, model_fields, api, args, resume, session_file=session_file, path=path, log=log) if models: model = models[0] # We get the fields of the model if we haven't got # them yet and update its public state if needed if model and not args.evaluate and (test_set or args.black_box or args.white_box): if args.black_box or args.white_box: model = r.publish_model(model, args, api, session_file) models[0] = model fields, objective_field = get_model_fields(model, model_fields, csv_properties, args) # If predicting if models and test_set and not args.evaluate: predict(test_set, test_set_header, models, fields, output, objective_field, args.remote, api, log, args.max_batch_models, args.method, resume, args.tag, args.verbosity, session_file, args.debug, args.ensemble, args.prediction_info) # When combine_votes flag is used, retrieve the predictions files saved # in the comma separated list of directories and combine them if votes_files: model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$', r'\1', votes_files[0]).replace("_", "/") try: model = u.check_resource(model_id, api.get_model) except ValueError, exception: sys.exit("Failed to get model %s: %s" % (model_id, str(exception))) local_model = Model(model) message = u.dated("Combining votes.\n") u.log_message(message, log_file=session_file, console=args.verbosity) combine_votes(votes_files, local_model.to_prediction, output, args.method)
def predict(models, fields, args, api=None, log=None,
            resume=False, session_file=None, labels=None, models_per_label=1,
            other_label=OTHER, multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions are computed locally using MultiModels on subgroups of
       models. Choosing a max_batch_models value not bigger than the
       number_of_models flag bounds memory usage, as each model's
       predictions are saved for further use.
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field,
                             test_separator=args.test_separator)
    prediction_file = output
    output_path = u.check_dir(output)
    with UnicodeWriter(output) as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args,
            objective_field)

        # Remote predictions: predictions are computed in bigml.com and
        # stored in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2__predictions.csv
        # Predictions are computed individually only if no_batch flag is set
        if (args.remote and args.no_batch and not args.multi_label
                and args.method != THRESHOLD_CODE):
            if args.ensemble is not None:
                remote_predict_ensemble(args.ensemble, test_reader,
                                        prediction_file, api, args, resume,
                                        output_path, session_file, log,
                                        exclude)
            else:
                remote_predict_models(models, test_reader, prediction_file,
                                      api, args, resume, output_path,
                                      session_file, log, exclude)
            return
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        options = {}
        if args.method == THRESHOLD_CODE:
            options.update(threshold=args.threshold)
            if args.threshold_class is None:
                local_model = Model(models[0])
                args.threshold_class = local_model.tree.distribution[0][0]
            options.update(category=args.threshold_class)
        # For a model we build a Model and for a small number of models,
        # we build a MultiModel using all of
        # the given models and issue a combined prediction
        if (len(models) <= args.max_batch_models and args.fast and
                not args.multi_label and args.max_categories == 0
                and args.method != COMBINATION):
            local_predict(models, test_reader, output, args, options,
                          exclude)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            # Local predictions: predictions are computed locally using
            # models' rules with MultiModel's predict method and combined
            # using aggregation if the objective field is a multi-labelled
            # field or one of the available combination methods: plurality,
            # confidence weighted and probability weighted
            if args.multi_label:
                method = AGGREGATION
            elif args.max_categories > 0:
                method = COMBINATION
            else:
                method = args.method

            # For multi-labelled models, the --models flag keeps the order
            # of the labels and the models but the --model-tag flag
            # retrieves the models with no order, so the correspondence with
            # each label must be restored.
ordered = True if args.multi_label and (args.model_tag is not None or models_per_label > 1): ordered = False local_batch_predict(models, test_reader, prediction_file, api, args, resume=resume, output_path=output_path, output=output, method=method, options=options, session_file=session_file, labels=labels, ordered=ordered, exclude=exclude, models_per_label=models_per_label, other_label=other_label, multi_label_data=multi_label_data) test_reader.close()
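
# Condensed sketch of the dispatch rule implemented in predict() above:
# a single local MultiModel pass is used only for small, plain setups;
# multi-label, max-categories and explicit combination runs take the
# chunked local_batch_predict() path instead. The names mirror the flags
# used above; this helper is illustrative and not called anywhere.
def _example_single_pass(n_models, args):
    """True when one local MultiModel prediction pass is enough."""
    return (n_models <= args.max_batch_models and args.fast
            and not args.multi_label and args.max_categories == 0
            and args.method != COMBINATION)
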
                open_list.append((child, score, metric_value, counter))

    try:
        best_features = [fields.field_name(field_ids[i]) for (i, score)
                         in enumerate(best_state) if score]
    except ValueError, exc:
        sys.exit(exc)
    message = (u'The best feature subset is: %s \n'
               % u", ".join(best_features))
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = (u'%s = %0.2f%%\n' % (metric.capitalize(),
                                        (best_metric_value * 100)))
    else:
        message = (u'%s = %f\n' % (metric.capitalize(), best_metric_value))
    u.log_message(message, log_file=session_file, console=1)
    output_dir = os.path.normpath(u.check_dir(datasets_file))
    if args.predictions_csv:
        resume = create_prediction_dataset(output_dir,
                                           "kfold%s" % best_counter,
                                           args, resume)
    message = (u'Evaluated %d/%d feature subsets\n\n' %
               ((len(open_list) + len(closed_list) - 1),
                2 ** len(field_ids) - 1))
    u.log_message(message, log_file=session_file, console=1)
    features_writer.close_writer()
    return best_features


def extract_evaluation_info(evaluation, category):
    """Returns the evaluation metrics for the chosen category or
       the average.
    """
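
# Sketch of the metric formatting rule used above, assuming
# PERCENT_EVAL_METRICS lists ratio-like metrics (e.g. "accuracy") that
# read better as percentages. Illustrative only.
def _example_format_metric(metric, value, percent_metrics):
    """Formats an evaluation metric the way the analyze report does."""
    if metric in percent_metrics:
        return u'%s = %0.2f%%' % (metric.capitalize(), value * 100)
    return u'%s = %f' % (metric.capitalize(), value)
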
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from partial MultiModels, combine and save to
       file

    """

    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        # Integer arithmetic keeps pct whole under Python 2 division:
        # e.g. current=7, total=25 -> 100 - ((25 - 7) * 100) / 25 = 28
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct), reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be
        # used in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split, api, query_string=query_string, labels=labels,
            multi_label_data=multi_label_data, ordered=ordered,
            models_order=models_order)

        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                votes = local_model.batch_predict(
                    raw_input_data_list, output_path,
                    by_name=test_set_header, reuse=True,
                    missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers,
                    to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries"
                         " needed to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            # extending the votes for each input data with the new
            # model-slot predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)

            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes

    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)

    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]

        if single_model:
            # single model predictions need no combination
            prediction = [multivote.predictions[0]['prediction'],
                          multivote.predictions[0]['confidence']]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote, options, labels, models_per_label, ordered,
                models_order, label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote,
                                           other_label=other_label)
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)

        write_prediction(prediction, output, args.prediction_info,
                         input_data, exclude)
def remote_predict_models(models, test_reader, prediction_file, api, args, resume=False, output_path=None, session_file=None, log=None, exclude=None): """Retrieve predictions remotely, combine them and save predictions to file """ predictions_files = [] prediction_args = {"tags": args.tag} test_set_header = test_reader.has_headers() if output_path is None: output_path = u.check_dir(prediction_file) message_logged = False raw_input_data_list = [] for input_data in test_reader: raw_input_data_list.append(input_data) single_model = len(models) == 1 if single_model: prediction_file = UnicodeWriter(prediction_file).open_writer() for model in models: model = bigml.api.get_model_id(model) predictions_file = get_predictions_file_name(model, output_path) predictions_files.append(predictions_file) if (not resume or not c.checkpoint(c.are_predictions_created, predictions_file, test_reader.number_of_tests(), debug=args.debug)[0]): if not message_logged: message = u.dated("Creating remote predictions.\n") u.log_message(message, log_file=session_file, console=args.verbosity) message_logged = True with UnicodeWriter(predictions_file) as predictions_file: for input_data in raw_input_data_list: input_data_dict = test_reader.dict(input_data) prediction = api.create_prediction(model, input_data_dict, by_name=test_set_header, wait_time=0, args=prediction_args) u.check_resource_error(prediction, "Failed to create prediction: ") u.log_message("%s\n" % prediction['resource'], log_file=log) prediction_row = prediction_to_row(prediction) predictions_file.writerow(prediction_row) if single_model: write_prediction(prediction_row[0:2], prediction_file, args.prediction_info, input_data, exclude) if single_model: prediction_file.close_writer() else: combine_votes(predictions_files, Model(models[0]).to_prediction, prediction_file, args.method, args.prediction_info, raw_input_data_list, exclude)
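
# Sketch of the per-model predictions file convention used above; the real
# path comes from get_predictions_file_name, so this standalone version is
# for illustration only. A model id like model/50c0de043b563519830001c2
# maps to <output_path>/model_50c0de043b563519830001c2__predictions.csv.
def _example_predictions_file_name(model_id, output_path):
    """Builds the predictions file path for a given model id."""
    return os.path.join(output_path,
                        "%s__predictions.csv" % model_id.replace("/", "_"))
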
def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ source = None dataset = None model = None models = None fields = None other_label = OTHER ensemble_ids = [] multi_label_data = None multi_label_fields = [] #local_ensemble = None test_dataset = None datasets = None # variables from command-line options resume = args.resume_ model_ids = args.model_ids_ output = args.predictions dataset_fields = args.dataset_fields_ check_args_coherence(args) path = u.check_dir(output) session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # labels to be used in multi-label expansion labels = (None if args.labels is None else [label.strip() for label in args.labels.split(args.args_separator)]) if labels is not None: labels = sorted([label for label in labels]) # multi_label file must be preprocessed to obtain a new extended file if args.multi_label and args.training_set is not None: (args.training_set, multi_label_data) = ps.multi_label_expansion( args.training_set, args.train_header, args, path, labels=labels, session_file=session_file) args.train_header = True args.objective_field = multi_label_data["objective_name"] all_labels = l.get_all_labels(multi_label_data) if not labels: labels = all_labels else: all_labels = labels if args.objective_field: csv_properties.update({'objective_field': args.objective_field}) if args.source_file: # source is retrieved from the contents of the given local JSON file source, csv_properties, fields = u.read_local_resource( args.source_file, csv_properties=csv_properties) else: # source is retrieved from the remote object source, resume, csv_properties, fields = ps.source_processing( api, args, resume, csv_properties=csv_properties, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) if args.multi_label and source: multi_label_data = l.get_multi_label_data(source) (args.objective_field, labels, all_labels, multi_label_fields) = l.multi_label_sync(args.objective_field, labels, multi_label_data, fields, multi_label_fields) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) if args.dataset_file: # dataset is retrieved from the contents of the given local JSON file model_dataset, csv_properties, fields = u.read_local_resource( args.dataset_file, csv_properties=csv_properties) if not args.datasets: datasets = [model_dataset] dataset = model_dataset else: datasets = u.read_datasets(args.datasets) if not datasets: # dataset is retrieved from the remote object datasets, resume, csv_properties, fields = pd.dataset_processing( source, api, args, resume, fields=fields, csv_properties=csv_properties, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) if datasets: dataset = datasets[0] if args.to_csv is not None: resume = pd.export_dataset(dataset, api, args, resume, session_file=session_file, path=path) # Now we have a dataset, let's check if there's an objective_field # given by the user and update it in the fields structure args.objective_id_ = get_objective_id(args, fields) # If test_split is used, split the dataset in a training and a test dataset # according to the given split if args.test_split > 0: dataset, test_dataset, resume = 
pd.split_processing( dataset, api, args, resume, multi_label_data=multi_label_data, session_file=session_file, path=path, log=log) datasets[0] = dataset # Check if the dataset has a categorical objective field and it # has a max_categories limit for categories if args.max_categories > 0 and len(datasets) == 1: if pd.check_max_categories(fields.fields[args.objective_id_]): distribution = pd.get_categories_distribution(dataset, args.objective_id_) if distribution and len(distribution) > args.max_categories: categories = [element[0] for element in distribution] other_label = pd.create_other_label(categories, other_label) datasets, resume = pd.create_categories_datasets( dataset, distribution, fields, args, api, resume, session_file=session_file, path=path, log=log, other_label=other_label) else: sys.exit("The provided objective field is not categorical nor " "a full terms only text field. " "Only these fields can be used with" " --max-categories") # If multi-dataset flag is on, generate a new dataset from the given # list of datasets if args.multi_dataset: dataset, resume = pd.create_new_dataset( datasets, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) datasets = [dataset] # Check if the dataset has a generators file associated with it, and # generate a new dataset with the specified field structure. Also # if the --to-dataset flag is used to clone or sample the original dataset if args.new_fields or (args.sample_rate != 1 and args.no_model) or \ (args.lisp_filter or args.json_filter) and not has_source(args): if fields is None: if isinstance(dataset, basestring): dataset = u.check_resource(dataset, api=api) fields = Fields(dataset, csv_properties) args.objective_id_ = get_objective_id(args, fields) args.objective_name_ = fields.field_name(args.objective_id_) dataset, resume = pd.create_new_dataset( dataset, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) datasets[0] = dataset # rebuild fields structure for new ids and fields csv_properties.update({'objective_field': args.objective_name_, 'objective_field_present': True}) fields = pd.get_fields_structure(dataset, csv_properties) args.objective_id_ = get_objective_id(args, fields) if args.multi_label and dataset and multi_label_data is None: multi_label_data = l.get_multi_label_data(dataset) (args.objective_field, labels, all_labels, multi_label_fields) = l.multi_label_sync(args.objective_field, labels, multi_label_data, fields, multi_label_fields) if dataset: # retrieves max_categories data, if any args.max_categories = get_metadata(dataset, 'max_categories', args.max_categories) other_label = get_metadata(dataset, 'other_label', other_label) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) if args.model_file: # model is retrieved from the contents of the given local JSON file model, csv_properties, fields = u.read_local_resource( args.model_file, csv_properties=csv_properties) models = [model] model_ids = [model['resource']] ensemble_ids = [] elif args.ensemble_file: # model is retrieved from the contents of the given local JSON file ensemble, csv_properties, fields = u.read_local_resource( args.ensemble_file, csv_properties=csv_properties) model_ids = ensemble['object']['models'][:] ensemble_ids = [ensemble['resource']] models = model_ids[:] model = retrieve_resource(bigml.api.BigML(storage='./storage'), models[0], query_string=r.ALL_FIELDS_QS) models[0] = model else: # model is retrieved from the remote object models, model_ids, 
ensemble_ids, resume = pm.models_processing( datasets, models, model_ids, api, args, resume, fields=fields, session_file=session_file, path=path, log=log, labels=labels, multi_label_data=multi_label_data, other_label=other_label) if models: model = models[0] single_model = len(models) == 1 # If multi-label flag is set and no training_set was provided, label # info is extracted from the user_metadata. If models belong to an # ensemble, the ensemble must be retrieved to get the user_metadata. if model and args.multi_label and multi_label_data is None: if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict): resource = ensemble_ids[0] elif belongs_to_ensemble(model): ensemble_id = get_ensemble_id(model) resource = r.get_ensemble(ensemble_id, api=api, verbosity=args.verbosity, session_file=session_file) else: resource = model multi_label_data = l.get_multi_label_data(resource) # We update the model's public state if needed if model: if (isinstance(model, basestring) or bigml.api.get_status(model)['code'] != bigml.api.FINISHED): if not args.evaluate and not a.has_train(args) and \ not a.has_test(args) : query_string = MINIMUM_MODEL elif not args.test_header: query_string = r.ALL_FIELDS_QS else: query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS) model = u.check_resource(model, api.get_model, query_string=query_string) models[0] = model if (args.black_box or args.white_box or (args.shared_flag and r.shared_changed(args.shared, model))): model_args = {} if args.shared_flag and r.shared_changed(args.shared, model): model_args.update(shared=args.shared) if args.black_box or args.white_box: model_args.update(r.set_publish_model_args(args)) if model_args: model = r.update_model(model, model_args, args, api=api, path=path, session_file=session_file) models[0] = model # We get the fields of the model if we haven't got # them yet and need them if model and not args.evaluate and (a.has_test(args) or args.export_fields): # If more than one model, use the full field structure if (not single_model and not args.multi_label and belongs_to_ensemble(model)): if len(ensemble_ids) > 0: ensemble_id = ensemble_ids[0] args.ensemble_ids_ = ensemble_ids else: ensemble_id = get_ensemble_id(model) fields = pm.get_model_fields( model, csv_properties, args, single_model=single_model, multi_label_data=multi_label_data) # Free memory after getting fields # local_ensemble = None gc.collect() # Fills in all_labels from user_metadata if args.multi_label and not all_labels: (args.objective_field, labels, all_labels, multi_label_fields) = l.multi_label_sync(args.objective_field, labels, multi_label_data, fields, multi_label_fields) if model: # retrieves max_categories data, if any args.max_categories = get_metadata(model, 'max_categories', args.max_categories) other_label = get_metadata(model, 'other_label', other_label) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) # If predicting if (models and (a.has_test(args) or (test_dataset and args.remote)) and not args.evaluate): models_per_label = 1 if test_dataset is None: test_dataset = get_test_dataset(args) if args.multi_label: # When prediction starts from existing models, the # multi_label_fields can be retrieved from the user_metadata # in the models if args.multi_label_fields is None and multi_label_fields: multi_label_field_names = [field[1] for field in multi_label_fields] args.multi_label_fields = ",".join(multi_label_field_names) test_set = ps.multi_label_expansion( args.test_set, args.test_header, args, path, 
labels=labels, session_file=session_file, input_flag=True)[0] test_set_header = True # Remote predictions: predictions are computed as batch predictions # in bigml.com except when --no-batch flag is set on or multi-label # or max-categories are used if (args.remote and not args.no_batch and not args.multi_label and not args.method in [THRESHOLD_CODE, COMBINATION]): # create test source from file test_name = "%s - test" % args.name if args.test_source is None: test_properties = ps.test_source_processing( api, args, resume, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) csv_properties.update(objective_field=None, objective_field_present=False) test_fields = pd.get_fields_structure(test_dataset, csv_properties) if args.to_dataset and args.dataset_off: model = api.check_resource(model['resource'], query_string=r.ALL_FIELDS_QS) model_fields = Fields(model) objective_field_name = model_fields.field_name( \ model_fields.objective_field) if objective_field_name in test_fields.fields_by_name.keys(): args.prediction_name = "%s (predicted)" % \ objective_field_name batch_prediction_args = r.set_batch_prediction_args( args, fields=fields, dataset_fields=test_fields) remote_predict(model, test_dataset, batch_prediction_args, args, api, resume, prediction_file=output, session_file=session_file, path=path, log=log) else: models_per_label = args.number_of_models if (args.multi_label and len(ensemble_ids) > 0 and args.number_of_models == 1): # use case where ensembles are read from a file models_per_label = len(models) / len(ensemble_ids) predict(models, fields, args, api=api, log=log, resume=resume, session_file=session_file, labels=labels, models_per_label=models_per_label, other_label=other_label, multi_label_data=multi_label_data) # When combine_votes flag is used, retrieve the predictions files saved # in the comma separated list of directories and combine them if args.votes_files_: model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$', r'\1', args.votes_files_[0]).replace("_", "/") try: model = u.check_resource(model_id, api.get_model) except ValueError, exception: sys.exit("Failed to get model %s: %s" % (model_id, str(exception))) local_model = Model(model) message = u.dated("Combining votes.\n") u.log_message(message, log_file=session_file, console=args.verbosity) combine_votes(args.votes_files_, local_model.to_prediction, output, method=args.method)
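
# Illustrative round-trip for the votes-file handling above: a file name
# such as 'kfold1/model_50c0de043b563519830001c2__predictions.csv' (the
# directory is hypothetical) yields 'model/50c0de043b563519830001c2',
# which is then fetched to combine votes.
def _example_model_id_from_votes_file(votes_file):
    """Recovers the model id encoded in a *__predictions.csv file name."""
    return re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                  r'\1', votes_file).replace("_", "/")
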
def main(args=sys.argv[1:]): """Main process """ train_stdin = False for i in range(0, len(args)): if args[i].startswith("--"): args[i] = args[i].replace("_", "-") if (args[i] == '--train' and (i == len(args) - 1 or args[i + 1].startswith("--"))): train_stdin = True # If --clear-logs the log files are cleared if "--clear-logs" in args: for log_file in LOG_FILES: try: open(log_file, 'w', 0).close() except IOError: pass literal_args = args[:] for i in range(0, len(args)): # quoting literals with blanks: 'petal length' if ' ' in args[i]: prefix = "" literal = args[i] # literals with blanks after "+" or "-": +'petal length' if args[i][0] in r.ADD_REMOVE_PREFIX: prefix = args[i][0] literal = args[i][1:] literal_args[i] = '%s"%s"' % (prefix, literal) message = "bigmler %s\n" % " ".join(literal_args) # Resume calls are not logged if not "--resume" in args: with open(COMMAND_LOG, "a", 0) as command_log: command_log.write(message) resume = False parser = create_parser(defaults=get_user_defaults(), constants={ 'NOW': NOW, 'MAX_MODELS': MAX_MODELS, 'PLURALITY': PLURALITY }) # Parses command line arguments. command_args = parser.parse_args(args) if command_args.cross_validation_rate > 0 and (command_args.test_set or command_args.evaluate or command_args.model or command_args.models or command_args.model_tag): parser.error("Non compatible flags: --cross-validation-rate" " cannot be used with --evaluate, --model," " --models or --model-tag. Usage:\n\n" "bigmler --train data/iris.csv " "--cross-validation-rate 0.1") default_output = ('evaluation' if command_args.evaluate else 'predictions.csv') if command_args.resume: debug = command_args.debug command = u.get_log_reversed(COMMAND_LOG, command_args.stack_level) args = shlex.split(command)[1:] try: position = args.index("--train") if (position == (len(args) - 1) or args[position + 1].startswith("--")): train_stdin = True except ValueError: pass output_dir = u.get_log_reversed(DIRS_LOG, command_args.stack_level) defaults_file = "%s%s%s" % (output_dir, os.sep, DEFAULTS_FILE) parser = create_parser(defaults=get_user_defaults(defaults_file), constants={ 'NOW': NOW, 'MAX_MODELS': MAX_MODELS, 'PLURALITY': PLURALITY }) command_args = parser.parse_args(args) if command_args.predictions is None: command_args.predictions = ("%s%s%s" % (output_dir, os.sep, default_output)) # Logs the issued command and the resumed command session_file = "%s%s%s" % (output_dir, os.sep, SESSIONS_LOG) u.log_message(message, log_file=session_file) message = "\nResuming command:\n%s\n\n" % command u.log_message(message, log_file=session_file, console=True) try: defaults_handler = open(defaults_file, 'r') contents = defaults_handler.read() message = "\nUsing the following defaults:\n%s\n\n" % contents u.log_message(message, log_file=session_file, console=True) defaults_handler.close() except IOError: pass resume = True else: if command_args.predictions is None: command_args.predictions = ("%s%s%s" % (NOW, os.sep, default_output)) if len(os.path.dirname(command_args.predictions).strip()) == 0: command_args.predictions = ( "%s%s%s" % (NOW, os.sep, command_args.predictions)) directory = u.check_dir(command_args.predictions) session_file = "%s%s%s" % (directory, os.sep, SESSIONS_LOG) u.log_message(message + "\n", log_file=session_file) try: defaults_file = open(DEFAULTS_FILE, 'r') contents = defaults_file.read() defaults_file.close() defaults_copy = open("%s%s%s" % (directory, os.sep, DEFAULTS_FILE), 'w', 0) defaults_copy.write(contents) defaults_copy.close() except IOError: pass with 
open(DIRS_LOG, "a", 0) as directory_log: directory_log.write("%s\n" % os.path.abspath(directory)) if resume and debug: command_args.debug = True if train_stdin: command_args.training_set = StringIO.StringIO(sys.stdin.read()) api_command_args = { 'username': command_args.username, 'api_key': command_args.api_key, 'dev_mode': command_args.dev_mode, 'debug': command_args.debug } if command_args.store: api_command_args.update({'storage': u.check_dir(session_file)}) api = bigml.api.BigML(**api_command_args) if (command_args.evaluate and not (command_args.training_set or command_args.source or command_args.dataset) and not (command_args.test_set and (command_args.model or command_args.models or command_args.model_tag or command_args.ensemble))): parser.error("Evaluation wrong syntax.\n" "\nTry for instance:\n\nbigmler --train data/iris.csv" " --evaluate\nbigmler --model " "model/5081d067035d076151000011 --dataset " "dataset/5081d067035d076151003423 --evaluate\n" "bigmler --ensemble ensemble/5081d067035d076151003443" " --evaluate") if command_args.objective_field: objective = command_args.objective_field try: command_args.objective_field = int(objective) except ValueError: pass output_args = { "api": api, "training_set": command_args.training_set, "test_set": command_args.test_set, "output": command_args.predictions, "objective_field": command_args.objective_field, "name": command_args.name, "training_set_header": command_args.train_header, "test_set_header": command_args.test_header, "args": command_args, "resume": resume, } # Reads description if provided. if command_args.description: description_arg = u.read_description(command_args.description) output_args.update(description=description_arg) else: output_args.update(description="Created using BigMLer") # Parses fields if provided. if command_args.field_attributes: field_attributes_arg = (u.read_field_attributes( command_args.field_attributes)) output_args.update(field_attributes=field_attributes_arg) # Parses types if provided. if command_args.types: types_arg = u.read_types(command_args.types) output_args.update(types=types_arg) # Parses dataset fields if provided. if command_args.dataset_fields: dataset_fields_arg = map(lambda x: x.strip(), command_args.dataset_fields.split(',')) output_args.update(dataset_fields=dataset_fields_arg) # Parses model input fields if provided. if command_args.model_fields: model_fields_arg = map(lambda x: x.strip(), command_args.model_fields.split(',')) output_args.update(model_fields=model_fields_arg) model_ids = [] # Parses model/ids if provided. if command_args.models: model_ids = u.read_models(command_args.models) output_args.update(model_ids=model_ids) dataset_id = None # Parses dataset/id if provided. if command_args.datasets: dataset_id = u.read_dataset(command_args.datasets) command_args.dataset = dataset_id # Retrieve model/ids if provided. if command_args.model_tag: model_ids = (model_ids + u.list_ids( api.list_models, "tags__in=%s" % command_args.model_tag)) output_args.update(model_ids=model_ids) # Reads a json filter if provided. if command_args.json_filter: json_filter = u.read_json_filter(command_args.json_filter) command_args.json_filter = json_filter # Reads a lisp filter if provided. if command_args.lisp_filter: lisp_filter = u.read_lisp_filter(command_args.lisp_filter) command_args.lisp_filter = lisp_filter # Adds default tags unless that it is requested not to do so. 
if command_args.no_tag: command_args.tag.append('BigMLer') command_args.tag.append('BigMLer_%s' % NOW) # Checks combined votes method if (command_args.method and not command_args.method in COMBINATION_WEIGHTS.keys()): command_args.method = 0 else: combiner_methods = dict([[value, key] for key, value in COMBINER_MAP.items()]) command_args.method = combiner_methods.get(command_args.method, 0) # Reads votes files in the provided directories. if command_args.votes_dirs: dirs = map(lambda x: x.strip(), command_args.votes_dirs.split(',')) votes_path = os.path.dirname(command_args.predictions) votes_files = u.read_votes_files(dirs, votes_path) output_args.update(votes_files=votes_files) # Parses fields map if provided. if command_args.fields_map: fields_map_arg = u.read_fields_map(command_args.fields_map) output_args.update(fields_map=fields_map_arg) # Parses resources ids if provided. if command_args.delete: if command_args.predictions is None: path = NOW else: path = u.check_dir(command_args.predictions) session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG) message = u.dated("Retrieving objects to delete.\n") u.log_message(message, log_file=session_file, console=command_args.verbosity) delete_list = [] if command_args.delete_list: delete_list = map(lambda x: x.strip(), command_args.delete_list.split(',')) if command_args.delete_file: if not os.path.exists(command_args.delete_file): raise Exception("File %s not found" % command_args.delete_file) delete_list.extend( [line for line in open(command_args.delete_file, "r")]) if command_args.all_tag: query_string = "tags__in=%s" % command_args.all_tag delete_list.extend(u.list_ids(api.list_sources, query_string)) delete_list.extend(u.list_ids(api.list_datasets, query_string)) delete_list.extend(u.list_ids(api.list_models, query_string)) delete_list.extend(u.list_ids(api.list_predictions, query_string)) delete_list.extend(u.list_ids(api.list_evaluations, query_string)) # Retrieve sources/ids if provided if command_args.source_tag: query_string = "tags__in=%s" % command_args.source_tag delete_list.extend(u.list_ids(api.list_sources, query_string)) # Retrieve datasets/ids if provided if command_args.dataset_tag: query_string = "tags__in=%s" % command_args.dataset_tag delete_list.extend(u.list_ids(api.list_datasets, query_string)) # Retrieve model/ids if provided if command_args.model_tag: query_string = "tags__in=%s" % command_args.model_tag delete_list.extend(u.list_ids(api.list_models, query_string)) # Retrieve prediction/ids if provided if command_args.prediction_tag: query_string = "tags__in=%s" % command_args.prediction_tag delete_list.extend(u.list_ids(api.list_predictions, query_string)) # Retrieve evaluation/ids if provided if command_args.evaluation_tag: query_string = "tags__in=%s" % command_args.evaluation_tag delete_list.extend(u.list_ids(api.list_evaluations, query_string)) # Retrieve ensembles/ids if provided if command_args.ensemble_tag: query_string = "tags__in=%s" % command_args.ensemble_tag delete_list.extend(u.list_ids(api.list_ensembles, query_string)) message = u.dated("Deleting objects.\n") u.log_message(message, log_file=session_file, console=command_args.verbosity) message = "\n".join(delete_list) u.log_message(message, log_file=session_file) u.delete(api, delete_list) if sys.platform == "win32" and sys.stdout.isatty(): message = (u"\nGenerated files:\n\n" + unicode(u.print_tree(path, " "), "utf-8") + u"\n") else: message = "\nGenerated files:\n\n" + u.print_tree(path, " ") + "\n" u.log_message(message, log_file=session_file, 
console=command_args.verbosity) elif (command_args.training_set or command_args.test_set or command_args.source or command_args.dataset or command_args.datasets or command_args.votes_dirs): compute_output(**output_args) u.log_message("_" * 80 + "\n", log_file=session_file)
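
# A minimal usage sketch for the classic main flow above (file names are
# hypothetical):
#
#   bigmler --train data/iris.csv --test data/test_iris.csv \
#           --output predictions.csv
#
# trains a model and writes its predictions; --evaluate switches the
# default output name to 'evaluation', as set at the top of main().
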
def analyze_dispatcher(args=sys.argv[1:]):
    """Main processing of the parsed options for BigMLer analyze

    """

    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = command.parser.parse_args(command.args)
    resume = command_args.resume
    if resume:
        command_args, session_file, _ = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        session_file = os.path.join(command_args.output_dir, SESSIONS_LOG)

    # If logging is required, open the file for logging
    log = None
    if command_args.log_file:
        u.check_dir(command_args.log_file)
        log = command_args.log_file
        # If --clear_logs the log files are cleared
        if command_args.clear_logs:
            clear_log_files([log])

    if command_args.model_fields:
        model_fields = command_args.model_fields.split(',')
        command_args.model_fields_ = [model_field.strip() for model_field
                                      in model_fields]
    else:
        command_args.model_fields_ = {}

    u.sys_log_message(u"%s\n" % os.path.abspath(command_args.output_dir),
                      log_file=DIRS_LOG)
    session_file = os.path.join(command_args.output_dir, SESSIONS_LOG)

    # Creates the corresponding api instance from the command args
    api = a.get_api_instance(command_args, u.check_dir(session_file))

    a.transform_dataset_options(command_args, api)

    # --maximize flag will be deprecated. Use --optimize flag.
    if command_args.maximize is not None and command_args.optimize is None:
        command_args.optimize = command_args.maximize

    incompatible_flags = [command_args.cv, command_args.features,
                          command_args.nodes, command_args.random_fields]
    if sum([int(bool(flag)) for flag in incompatible_flags]) > 1:
        sys.exit("The following flags cannot be used together:\n --features"
                 "\n --cross-validation\n --nodes\n --random-fields")
    if (command_args.dataset is None and command_args.datasets is None and
            command_args.dataset_file is None):
        sys.exit("The analyze command needs an existing dataset ID. Please, "
                 "use the --dataset flag.")
    if not any(incompatible_flags):
        sys.exit("You need to specify the type of analysis: features, node "
                 "threshold, cross validation or random fields.")

    # k-fold cross-validation
    if command_args.cv and command_args.dataset is not None:
        create_kfold_cv(command_args, api, command.common_options,
                        resume=resume)
    # features analysis
    elif command_args.features:
        create_features_analysis(command_args, api, command.common_options,
                                 resume=resume)
    # node threshold analysis
    elif command_args.nodes:
        create_nodes_analysis(command_args, api, command.common_options,
                              resume=resume)
    # random fields analysis
    elif command_args.random_fields:
        create_candidates_analysis(command_args, api,
                                   command.common_options, resume=resume)
    else:
        sys.exit("You must choose one of the available analyses: --features,"
                 " --nodes, --random-fields or --cross-validation. Add"
                 " your preferred option to"
                 " the command line or type\n    bigmler analyze --help\n"
                 " to see all the available options.")
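
# A minimal usage sketch for this dispatcher (the dataset id is
# hypothetical; the flag names come from the error messages above):
#
#   bigmler analyze --dataset dataset/5081d067035d076151003423 --features
#   bigmler analyze --dataset dataset/5081d067035d076151003423 \
#                   --cross-validation
#
# Exactly one of --features, --nodes, --random-fields or
# --cross-validation must be chosen per run.
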
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS, resume=False, output_path=None,
                        output=None, verbosity=True, method=PLURALITY_CODE,
                        session_file=None, debug=False, prediction_info=None):
    """Get local predictions from partial MultiModel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]

    input_data_list = []
    for input_data in test_reader:
        input_data_list.append(input_data)

    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = bigml.api.check_resource(model, api.get_model,
                                                     FIELDS_QS)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" % (
                        model, str(exception)))
            complete_models.append(model)

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list, output_path,
                                  by_name=test_set_header, reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes

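# Illustrative sketch (not part of the original module): the models list is
# consumed in consecutive splits of at most max_models items, which is what
# the models_splits comprehension above computes:
def example_model_splits(models, max_models):
    """Chunks `models` into consecutive sublists of size <= max_models."""
    return [models[index:(index + max_models)]
            for index in range(0, len(models), max_models)]

# example_model_splits(list(range(7)), 3) -> [[0, 1, 2], [3, 4, 5], [6]]
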
def project_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions

    """
    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    if command_args.resume:
        command_args, session_file, _ = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        directory = u.check_dir("%s/x.txt" % command_args.output_dir)
        command_args.output_dir = directory
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE))
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)
    path = u.check_dir("%s/x.txt" % command_args.output_dir)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    # If logging is required set the file for logging
    log = None
    if command_args.log_file:
        u.check_dir(command_args.log_file)
        log = command_args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # Creates the corresponding api instance
    api = a.get_api_instance(command_args, u.check_dir(session_file))
    a.get_output_args(api, command_args, command_args.resume)
    a.attribute_args(command_args)

    if not command_args.project_id and command_args.name:
        command_args.project = command_args.name
    if command_args.project:
        # create project
        pp.project_processing(
            api, command_args, command_args.resume,
            session_file=session_file, path=path, log=log, create=True)
    if command_args.project_id and (
            command_args.project_attributes or
            command_args.name or command_args.tag or
            command_args.description or command_args.category):
        # update project's attributes
        pp.update_project(command_args, api, command_args.resume,
                          session_file=session_file)

    u.log_message("_" * 80 + "\n", log_file=session_file)
    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

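# Illustrative sketch (not part of the original module): u.check_dir is
# called above with a dummy file name ("%s/x.txt") so that the directory
# part of the path is created and returned. Assuming check_dir means
# "ensure the parent directory of this path exists", an equivalent helper
# would be:
import os


def example_check_dir(file_path):
    """Creates the directory that would contain `file_path`, if needed."""
    directory = os.path.dirname(file_path)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    return directory
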
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and
            (args.public_anomaly or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (dataset, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file, csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and r.shared_changed(args.shared,
                                                       anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and args.test_set:
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies"
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        # processed args use the trailing-underscore convention (cf. resume_)
        args.anomaly_filter_ = local_anomaly.anomalies_filter(include=include)
        new_dataset, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']

    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when the --no-batch flag is set
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            anomaly_score(anomalies, fields, args,
                          session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

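# Illustrative sketch (not part of the original module): anomaly references
# arrive either as plain id strings ("anomaly/...") or as full resource
# dicts, which is why the code above tries anomaly.get('resource', anomaly)
# and falls back on AttributeError. The same normalization in isolation:
def example_resource_id(resource):
    """Returns the id of a resource given as a dict or as a plain id."""
    try:
        return resource.get('resource', resource)
    except AttributeError:
        return resource
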
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from partial MultiModel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    max_models = args.max_batch_models
    label_separator = args.label_separator
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]

    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))

    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created, pred_file,
                             test_reader.number_of_tests(), debug=args.debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] !=
                    bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" % (
                        model, str(exception)))

            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data['generated_fields'][
                    objective_column]
                labels_columns = [label_info[1] for label_info in labels_info
                                  if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if model_column in labels_columns:
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        if complete_models:
            local_model = MultiModel(complete_models)
            try:
                local_model.batch_predict(
                    input_data_list, output_path, by_name=test_set_header,
                    reuse=True, missing_strategy=args.missing_strategy)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually.")
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index].predictions
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes

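# Illustrative sketch (not part of the original module): the progress
# percentage above relies on Python 2 integer division; the equivalent
# version-independent form uses // and never reports 100% before the last
# split is done:
def example_progress_pct(current, total):
    """Percentage of models already predicted on, rounded up."""
    return 100 - ((total - current) * 100) // total

# example_progress_pct(1, 3) -> 34, example_progress_pct(3, 3) -> 100
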
def compute_output(api, args):
    """ Creates one or more clusters using the `training_set`
        or uses the ids of previously created BigML clusters to make
        predictions for the `test_set`.

    """
    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_cluster or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file, csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if clusters:
        cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and r.shared_changed(args.shared,
                                                       cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and (args.test_set or args.export_fields):
        if isinstance(cluster, dict):
            cluster = cluster['resource']
        cluster = u.check_resource(cluster, api.get_cluster,
                                   query_string=r.ALL_FIELDS_QS)
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when the --no-batch flag is set
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)
        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name
                            in args.cluster_datasets_
                            if datasets.get(centroids[cluster_name],
                                            '') == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api, path=path,
                             session_file=session_file, log=log,
                             dataset_type='cluster')

    if cluster and args.cluster_models is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        models = cluster['object']['cluster_models']
        if args.cluster_models == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name
                            in args.cluster_models_
                            if models.get(centroids[cluster_name], '') == '']

        for centroid_id in centroid_ids:
            model_args = {'centroid': centroid_id}
            r.create_model(cluster, model_args, args, api=api, path=path,
                           session_file=session_file, log=log,
                           model_type='cluster')

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

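# Illustrative sketch (not part of the original module): selecting the
# centroids that still lack a per-centroid dataset or model, as done above
# for --cluster-datasets and --cluster-models. `centroids` maps names to
# ids and `created` maps centroid ids to resource ids, '' meaning "not
# created yet":
def example_pending_centroids(names, centroids, created):
    """Returns the ids of the named centroids with no resource yet."""
    return [centroids[name] for name in names
            if created.get(centroids[name], '') == '']
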
def compute_output(api, args):
    """ Creates one or more PCAs using the `training_set`
        or uses the ids of previously created BigML PCAs to make
        projections for the `test_set`.

    """

    pca = None

    # variables from command-line options
    resume = args.resume_
    pca_ids = args.pca_ids_
    output = args.projections
    # there's only one pca to be generated at present
    args.max_parallel_pcas = 1
    # pca cannot be published yet.
    args.public_pca = False

    # It is compulsory to have a description to publish either datasets or
    # pcas
    if (not args.description_ and
            (args.public_pca or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.pca_file:
        # pca is retrieved from the contents of the given local JSON file
        pca, csv_properties, fields = u.read_local_resource(
            args.pca_file, csv_properties=csv_properties)
        pca_ids = [pca]
    else:
        # pca is retrieved from the remote object or created
        pca, resume = pc.pca_processing(
            datasets, pca, pca_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # We update the pca public state if needed
    if pca:
        if isinstance(pca, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            pca = u.check_resource(pca, api.get_pca,
                                   query_string=query_string)
        if (args.public_pca or
                (args.shared_flag and r.shared_changed(args.shared, pca))):
            pca_args = {}
            if args.shared_flag and r.shared_changed(args.shared, pca):
                pca_args.update(shared=args.shared)
            if args.public_pca:
                pca_args.update(r.set_publish_pca_args(args))
            if pca_args:
                pca = r.update_pca(pca, pca_args, args,
                                   api=api, path=path,
                                   session_file=session_file)

    # We get the fields of the pca if we haven't got
    # them yet and need them
    if pca and (args.test_set or args.export_fields):
        fields = pc.get_pca_fields(pca, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if pca and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote projections: projections are computed as batch projections
        # in bigml.com except when the --no-batch flag is set
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_projection_args = r.set_batch_projection_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_projection(pca, test_dataset, batch_projection_args, args,
                              api, resume, projection_file=output,
                              session_file=session_file, path=path, log=log)
        else:
            projection(pca, fields, args, session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

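# Illustrative sketch (not part of the original module): the update step
# above only calls the API when there is something to change. Merging the
# shared and publish attributes into a single payload first makes that a
# simple truthiness test:
def example_update_payload(shared_changed, shared, publish_args=None):
    """Builds the minimal update payload for shared/public changes."""
    payload = {}
    if shared_changed:
        payload.update(shared=shared)
    if publish_args:
        payload.update(publish_args)
    return payload
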
def analyze_dispatcher(args=sys.argv[1:]):
    """Main processing of the parsed options for BigMLer analyze

    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = command.parser.parse_args(command.args)
    resume = command_args.resume
    if resume:
        # Keep the debug option if set
        debug = command_args.debug
        # Restore the args of the call to resume from the command log file
        stored_command = StoredCommand(args, COMMAND_LOG, DIRS_LOG)
        command = Command(None, stored_command=stored_command)
        # Logs the issued command and the resumed command
        session_file = os.path.join(stored_command.output_dir, SESSIONS_LOG)
        stored_command.log_command(session_file=session_file)
        # Parses resumed arguments.
        command_args = command.parser.parse_args(command.args)
        command_args.debug = debug
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        session_file = os.path.join(command_args.output_dir, SESSIONS_LOG)
        # If logging is required, open the file for logging
        log = None
        if command_args.log_file:
            u.check_dir(command_args.log_file)
            log = command_args.log_file
            # If --clear_logs the log files are cleared
            if command_args.clear_logs:
                clear_log_files([log])

        if command_args.model_fields:
            model_fields = command_args.model_fields.split(',')
            command_args.model_fields_ = [model_field.strip()
                                          for model_field in model_fields]
        else:
            command_args.model_fields_ = {}
        u.sys_log_message(u"%s\n" % os.path.abspath(command_args.output_dir),
                          log_file=DIRS_LOG)
        session_file = os.path.join(command_args.output_dir, SESSIONS_LOG)
    # Creates the corresponding api instance from the command args
    api = a.get_api_instance(command_args, u.check_dir(session_file))

    # --maximize flag will be deprecated. Use --optimize flag.
    if command_args.maximize is not None and command_args.optimize is None:
        command_args.optimize = command_args.maximize
    incompatible_flags = [command_args.cv, command_args.features,
                          command_args.nodes]
    if sum([int(bool(flag)) for flag in incompatible_flags]) > 1:
        sys.exit("The following flags cannot be used together:\n --features"
                 "\n --cross-validation\n --nodes")
    # k-fold cross-validation
    if command_args.cv and command_args.dataset is not None:
        create_kfold_cv(command_args, api, command.common_options,
                        resume=resume)
    # features analysis
    if command_args.features:
        create_features_analysis(command_args, api, command.common_options,
                                 resume=resume)
    # node threshold analysis
    if command_args.nodes:
        create_nodes_analysis(command_args, api, command.common_options,
                              resume=resume)

def compute_output(api, args):
    """ Creates one or more topic models using the `training_set`
        or uses the ids of previously created BigML topic models to make
        predictions for the `test_set`.

    """

    topic_model = None
    topic_models = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    topic_model_ids = args.topic_model_ids_
    output = args.predictions
    # there's only one topic model resource to be generated at present
    args.max_parallel_topic_models = 1
    # topic models cannot be published yet.
    args.public_topic_model = False

    # It is compulsory to have a description to publish either datasets or
    # topic models
    if (not args.description_ and
            (args.public_topic_model or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.topic_model_file:
        # topic model is retrieved from the contents of the given local JSON
        # file
        topic_model, csv_properties, fields = u.read_local_resource(
            args.topic_model_file, csv_properties=csv_properties)
        topic_models = [topic_model]
        topic_model_ids = [topic_model['resource']]
    else:
        # topic model is retrieved from the remote object
        topic_models, topic_model_ids, resume = pt.topic_model_processing(
            datasets, topic_models, topic_model_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if topic_models:
        topic_model = topic_models[0]

    # We update the topic model's public state if needed
    if topic_model:
        if isinstance(topic_model, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            topic_model = u.check_resource(topic_model, api.get_topic_model,
                                           query_string=query_string)
        topic_models[0] = topic_model
        if (args.public_topic_model or
                (args.shared_flag and r.shared_changed(args.shared,
                                                       topic_model))):
            topic_model_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     topic_model):
                topic_model_args.update(shared=args.shared)
            if args.public_topic_model:
                topic_model_args.update(
                    rtm.set_publish_topic_model_args(args))
            if topic_model_args:
                topic_model = rtm.update_topic_model(
                    topic_model, topic_model_args, args,
                    api=api, path=path, session_file=session_file)
                topic_models[0] = topic_model

    # We get the fields of the topic model if we haven't got
    # them yet and need them
    if topic_model and args.test_set:
        csv_properties.update({'objective_field_present': False,
                               'objective_field': None})
        fields = pt.get_topic_model_fields(topic_model, csv_properties,
                                           args)

    # If predicting
    if topic_models and (a.has_test(args) or
                         (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote topic distributions: topic distributions are computed as
        # batch topic distributions in bigml.com except when the --no-batch
        # flag is set.
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_topic_distribution_args = \
                rtd.set_batch_topic_distribution_args(
                    args, fields=fields, dataset_fields=test_fields)

            remote_topic_distribution(
                topic_model, test_dataset, batch_topic_distribution_args,
                args, api, resume, prediction_file=output,
                session_file=session_file, path=path, log=log)
        else:
            topic_distribution(topic_models, fields, args,
                               session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

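# Illustrative sketch (not part of the original module): topic distributions
# have no objective field, so csv_properties is updated above before building
# the test fields. A stand-alone version of that preparation step:
def example_unsupervised_csv_properties(csv_properties):
    """Returns a copy of csv_properties with the objective field disabled."""
    properties = dict(csv_properties)
    properties.update({'objective_field': None,
                       'objective_field_present': False})
    return properties
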
def compute_output(api, args):
    """ Creates a sample based on a `train_set`, source or dataset.

    """

    sample = None
    samples = None
    # variables from command-line options
    resume = args.resume_
    sample_ids = args.sample_ids_
    output = args.predictions
    # there's only one sample to be generated at present
    args.max_parallel_clusters = 1
    # sample cannot be published yet.
    args.public_sample = False

    # It is compulsory to have a description to publish either datasets or
    # samples
    if (not args.description_ and
            (args.public_sample or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-sample step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-sample step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, _, resume,
     csv_properties, fields) = dataset_properties
    if args.sample_file:
        # sample is retrieved from the contents of the given local JSON file
        sample, csv_properties, fields = u.read_local_resource(
            args.sample_file, csv_properties=csv_properties)
        samples = [sample]
        sample_ids = [sample['resource']]
    else:
        # sample is retrieved from the remote object
        samples, sample_ids, resume = psa.samples_processing(
            datasets, samples, sample_ids, api, args, resume,
            session_file=session_file, path=path, log=log)
    if samples:
        sample = samples[0]

    # We update the sample's public state if needed
    if sample:
        if isinstance(sample, basestring):
            # build the query string from the sample options
            sample = u.check_resource(sample, api.get_sample)
        samples[0] = sample
        if (args.public_sample or
                (args.shared_flag and r.shared_changed(args.shared,
                                                       sample))):
            sample_args = {}
            if args.shared_flag and r.shared_changed(args.shared, sample):
                sample_args.update(shared=args.shared)
            if args.public_sample:
                sample_args.update(r.set_publish_sample_args(args))
            if sample_args:
                sample = r.update_sample(sample, sample_args, args,
                                         api=api, path=path,
                                         session_file=session_file)
                samples[0] = sample

    # We get the fields of the sample if we haven't got
    # them yet and need them
    if sample and psa.needs_sample_fields(args):
        fields = psa.get_sample_fields(sample, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    sample_file(samples[0], fields, args, api, path=path,
                session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)