def connector_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions
    """
    command_args, _, api, session_file, _ = get_context(args, SETTINGS)

    path = u.check_dir(command_args.output)
    log = None
    if command_args.log_file:
        u.check_dir(command_args.log_file)
        log = command_args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    if not command_args.external_connector_id and \
            u.has_connection_info(command_args):
        # create connector
        pec.connector_processing(api, command_args, command_args.resume,
                                 session_file=session_file, path=path,
                                 log=log)
    if command_args.external_connector_id and (
            command_args.connector_attributes or
            command_args.name or command_args.tag or
            command_args.description or command_args.category):
        # update connector's attributes
        pec.update_external_connector(command_args, api,
                                      command_args.resume,
                                      session_file=session_file)

    u.log_message("_" * 80 + "\n", log_file=session_file)
    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

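# Illustration only: every dispatcher in this module unpacks the same
# five-value tuple from get_context. The helper below is a minimal sketch of
# that shared contract, assuming only the names already used in this file
# (get_context, SETTINGS, sys); it is not part of the package's API.
def _example_get_context(argv=None):
    """Sketch of the shared get_context contract used by the dispatchers."""
    args = sys.argv[1:] if argv is None else argv
    # Tuple order taken from the unpacking used throughout this module:
    # parsed options, command object, API client, session log path, resume.
    command_args, command, api, session_file, resume = get_context(args,
                                                                   SETTINGS)
    # command_args carries the parsed options (output, log_file, verbosity...)
    # api is the authenticated BigML client used by the processing helpers
    # session_file is where u.log_message() appends the session trace
    return command_args, api, session_file
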
def reify_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions
    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)
    command_args, command, api, session_file, resume = get_context(args,
                                                                   SETTINGS)

    def logger(message):
        """Partial to log messages according to args.verbosity
        """
        u.log_message(u.dated(message),
                      log_file=session_file, console=command_args.verbosity)

    message = "Starting reification for %s\n\n" % command_args.resource_id
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    reify_resources(command_args, api, logger)
    message = "\nReification complete. See the results in %s\n\n" % \
        command_args.output
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    u.log_message("_" * 80 + "\n", log_file=session_file)

    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

def project_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions
    """
    command_args, command, api, session_file, resume = get_context(args,
                                                                   SETTINGS)

    path = u.check_dir(command_args.output)
    log = None
    if command_args.log_file:
        u.check_dir(command_args.log_file)
        log = command_args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    if not command_args.project_id and command_args.name:
        command_args.project = command_args.name
    if command_args.project:
        # create project
        pp.project_processing(
            api, command_args, command_args.resume,
            session_file=session_file, path=path, log=log, create=True)
    if command_args.project_id and (
            command_args.project_attributes or
            command_args.name or command_args.tag or
            command_args.description or command_args.category):
        # update project's attributes
        pp.update_project(command_args, api, command_args.resume,
                          session_file=session_file)

    u.log_message("_" * 80 + "\n", log_file=session_file)
    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

def reify_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions
    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)
    command_args, _, api, session_file, _ = get_context(args, SETTINGS)

    def logger(message):
        """Partial to log messages according to args.verbosity
        """
        u.log_message(u.dated(message),
                      log_file=session_file, console=command_args.verbosity)

    message = "Starting reification for %s\n\n" % command_args.resource_id
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    reify_resources(command_args, api)
    message = "\nReification complete. See the results in %s\n\n" % \
        command_args.output
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    u.log_message("_" * 80 + "\n", log_file=session_file)

    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

def execute_whizzml(args, api, session_file):
    """executes the code in a script or a source code file
    """
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    path = args.output_dir
    if args.to_library:
        library = pw.library_processing(api, args,
                                        session_file=session_file,
                                        path=path, log=log)
    else:
        if args.script_file:
            # script is retrieved from the contents of the given local
            # JSON file
            script, _, _ = u.read_local_resource(args.script_file)
            args.script = script['resource']
            args.script_ids = [args.script]
        elif args.code_file or args.code:
            script, scripts = pw.script_processing(
                api, args, session_file=session_file, path=path, log=log)
            args.script = script['resource']
            args.script_ids = scripts

        if (args.script or args.scripts) and not args.no_execute:
            execution = pw.execution_processing(
                api, args, session_file=session_file, path=path, log=log)
            execution = r.get_execution(
                execution, api, args.verbosity, session_file)
            r.save_txt_and_json(execution['object']['execution'],
                                args.output, api=api)
            args.execution = execution['resource']

    u.log_message("_" * 80 + "\n", log_file=session_file)
    u.print_generated_files(args.output_dir, log_file=session_file,
                            verbosity=args.verbosity)

def execute_whizzml(args, api, session_file):
    """executes the code in a script or a source code file
    """
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    path = args.output_dir
    if args.to_library:
        pw.library_processing(
            api, args, session_file=session_file, path=path, log=log)
    else:
        if args.script_file:
            # script is retrieved from the contents of the given local file
            script, _, _ = u.read_local_resource(args.script_file)
            args.script = script['resource']
            args.script_ids = [args.script]
        elif args.code_file or args.code:
            script, scripts = pw.script_processing(
                api, args, session_file=session_file, path=path, log=log)
            args.script = script if isinstance(script, basestring) else \
                script.get('resource')
            args.script_ids = scripts

        if (args.script or args.scripts) and not args.no_execute:
            execution = pw.execution_processing(
                api, args, session_file=session_file, path=path, log=log)
            execution = r.get_execution(
                execution, api, args.verbosity, session_file)
            r.save_txt_and_json(execution['object']['execution'],
                                args.output, api=api)
            args.execution = execution['resource']

    u.log_message("_" * 80 + "\n", log_file=session_file)
    u.print_generated_files(args.output_dir, log_file=session_file,
                            verbosity=args.verbosity)

def export_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different export functions
    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)
    command_args, command, api, session_file, resume = get_context(args,
                                                                   SETTINGS)

    # Creates the corresponding api instance
    resource = command_args.ensemble or command_args.model
    message = "Generating %s code for %s\n\n" % (command_args.language,
                                                 resource)
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    export_code(command_args, api)
    u.log_message("_" * 80 + "\n", log_file=session_file)

    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

def export_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different export functions
    """
    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)
    command_args, _, api, session_file, _ = get_context(args, SETTINGS)

    # Creates the corresponding api instance
    resource = command_args.ensemble or command_args.model
    message = "Generating %s code for %s\n\n" % (command_args.language,
                                                 resource)
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    export_code(command_args, api)
    u.log_message("_" * 80 + "\n", log_file=session_file)

    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

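# Illustration only: export_dispatcher is designed to receive a
# sys.argv[1:]-style list of strings. The helper below is a minimal sketch of
# driving it programmatically; the flag names and the resource id are
# hypothetical placeholders, not options documented in this file.
def _example_export_call():
    """Sketch of invoking export_dispatcher with an argv-style list."""
    export_dispatcher([
        "--model", "model/000000000000000000000000",  # placeholder id
        "--language", "python",                       # assumed flag name
        "--output-dir", "./export_run",               # assumed flag name
    ])
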
def compute_output(api, args):
    """ Creates a dataset using the `training_set`.

    """
    source = None
    dataset = None
    fields = None
    other_label = OTHER
    multi_label_data = None
    multi_label_fields = []
    datasets = None

    # variables from command-line options
    resume = args.resume_
    output = args.output
    dataset_fields = args.dataset_fields_

    check_args_coherence(args)
    path = u.check_dir(output)

    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else
              [label.strip() for label in
               args.labels.split(args.args_separator)])
    if labels is not None:
        labels = sorted([label for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set,
         multi_label_data) = ps.multi_label_expansion(
             args.training_set, args.train_header, args, path,
             labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})

    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume,
            csv_properties=csv_properties, multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if source is not None:
        args.source = bigml.api.get_source_id(source)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[-1]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path,
                    log=log, other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If any of the transformations is applied,
    # generate a new dataset from the given list of datasets
    if args.new_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    if args.new_fields or args.sample_rate != 1 or \
            (args.lisp_filter or args.json_filter) and not has_source(args):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
        args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({'objective_field': args.objective_name_,
                               'objective_field_present': True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    logistic_regression = None
    logistic_regressions = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    logistic_regression_ids = args.logistic_regression_ids_
    output = args.predictions
    # there's only one logistic regression to be generated at present
    args.max_parallel_logistic_regressions = 1
    # logistic regressions cannot be published yet.
    args.public_logistic_regression = False

    # It is compulsory to have a description to publish either datasets or
    # logistic regressions
    if (not args.description_ and
            (args.public_logistic_regression or
             args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    if args.logistic_file:
        # logistic regression is retrieved from the contents of the given
        # local JSON file
        logistic_regression, csv_properties, fields = u.read_local_resource(
            args.logistic_file, csv_properties=csv_properties)
        logistic_regressions = [logistic_regression]
        logistic_regression_ids = [logistic_regression['resource']]
    else:
        # logistic regression is retrieved from the remote object
        logistic_regressions, logistic_regression_ids, resume = \
            plr.logistic_regressions_processing(
                datasets, logistic_regressions, logistic_regression_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
    if logistic_regressions:
        logistic_regression = logistic_regressions[0]

    # We update the logistic regression's public state if needed
    if logistic_regression:
        if isinstance(logistic_regression, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            logistic_regression = u.check_resource(
                logistic_regression, api.get_logistic_regression,
                query_string=query_string)
        logistic_regressions[0] = logistic_regression
        if (args.public_logistic_regression or
                (args.shared_flag and
                 r.shared_changed(args.shared, logistic_regression))):
            logistic_regression_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     logistic_regression):
                logistic_regression_args.update(shared=args.shared)
            if args.public_logistic_regression:
                logistic_regression_args.update(
                    r.set_publish_logistic_regression_args(args))
            if logistic_regression_args:
                logistic_regression = r.update_logistic_regression(
                    logistic_regression, logistic_regression_args, args,
                    api=api, path=path, session_file=session_file)
                logistic_regressions[0] = logistic_regression

    # We get the fields of the logistic_regression if we haven't got
    # them yet and need them
    if logistic_regression and (args.test_set or args.export_fields):
        fields = plr.get_logistic_fields(
            logistic_regression, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if logistic_regressions and (a.has_test(args) or
                                 (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_lr_prediction(logistic_regression, test_dataset,
                                 batch_prediction_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            lr_prediction(logistic_regressions, fields, args,
                          session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(logistic_regressions, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(logistic_regressions, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """
    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_cluster or
             args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file, csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if clusters:
        cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and
                 r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and (args.test_set or args.export_fields):
        if isinstance(cluster, dict):
            cluster = cluster['resource']
        cluster = u.check_resource(cluster, api.get_cluster,
                                   query_string=r.ALL_FIELDS_QS)
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)
        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        cluster_datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name in
                            args.cluster_datasets_
                            if cluster_datasets.get(
                                centroids[cluster_name], '') == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api, path=path,
                             session_file=session_file, log=log,
                             dataset_type='cluster')

    if cluster and args.cluster_models is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        models = cluster['object']['cluster_models']
        if args.cluster_models == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name in
                            args.cluster_models_
                            if models.get(centroids[cluster_name], '') == '']

        for centroid_id in centroid_ids:
            model_args = {'centroid': centroid_id}
            r.create_model(cluster, model_args, args, api=api, path=path,
                           session_file=session_file, log=log,
                           model_type='cluster')

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates a sample based on a `train_set`, source or dataset.

    """

    samples = None
    # variables from command-line options
    resume = args.resume_
    sample_ids = args.sample_ids_
    output = args.predictions
    # there's only one sample to be generated at present
    args.max_parallel_clusters = 1
    # sample cannot be published yet.
    args.public_sample = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_sample or
             args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-sample step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-sample step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, _,
     resume, csv_properties, fields) = dataset_properties

    if args.sample_file:
        # sample is retrieved from the contents of the given local JSON file
        sample, csv_properties, fields = u.read_local_resource(
            args.sample_file, csv_properties=csv_properties)
        samples = [sample]
        sample_ids = [sample['resource']]
    else:
        # sample is retrieved from the remote object
        samples, sample_ids, resume = psa.samples_processing(
            datasets, samples, sample_ids, api, args, resume,
            session_file=session_file, path=path, log=log)
        if samples:
            sample = samples[0]

    # We update the sample's public state if needed
    if sample:
        if isinstance(sample, basestring):
            # build the query string from the sample options
            sample = u.check_resource(sample, api.get_sample)
        samples[0] = sample
        if (args.public_sample or
                (args.shared_flag and r.shared_changed(args.shared, sample))):
            sample_args = {}
            if args.shared_flag and r.shared_changed(args.shared, sample):
                sample_args.update(shared=args.shared)
            if args.public_sample:
                sample_args.update(r.set_publish_sample_args(args))
            if sample_args:
                sample = r.update_sample(sample, sample_args, args,
                                         api=api, path=path,
                                         session_file=session_file)
                samples[0] = sample

    # We get the fields of the sample if we haven't got
    # them yet and need them
    if sample and psa.needs_sample_fields(args):
        fields = psa.get_sample_fields(sample, csv_properties, args)

    sample_file(samples[0], fields, args, api, path=path,
                session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

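# Illustration only: each compute_output variant in this file expects an
# authenticated api client plus the parsed args, and a dispatcher wrapping it
# would follow the same get_context pattern used by the dispatchers earlier in
# this file. This sketch assumes that pattern; it is not the package's actual
# entry point.
def _example_compute_output_dispatcher(args=sys.argv[1:]):
    """Sketch of the dispatcher-to-compute_output hand-off."""
    command_args, _, api, _, _ = get_context(args, SETTINGS)
    compute_output(api, command_args)
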
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """
    association = None
    associations = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    association_ids = args.association_ids_
    output = args.predictions
    # there's only one association resource to be generated at present
    args.max_parallel_associations = 1
    # associations cannot be published yet.
    args.public_association = False

    # It is compulsory to have a description to publish either datasets or
    # associations
    if (not args.description_ and
            (args.public_association or
             args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.association_file:
        # association is retrieved from the contents of the given local JSON
        # file
        association, csv_properties, fields = u.read_local_resource(
            args.association_file, csv_properties=csv_properties)
        associations = [association]
        association_ids = [association['resource']]
    else:
        # association is retrieved from the remote object
        associations, association_ids, resume = pa.associations_processing(
            datasets, associations, association_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if associations:
        association = associations[0]

    # We update the association's public state if needed
    if association:
        if isinstance(association, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            association = u.check_resource(association, api.get_association,
                                           query_string=query_string)
        associations[0] = association
        if (args.public_association or
                (args.shared_flag and
                 r.shared_changed(args.shared, association))):
            association_args = {}
            if args.shared_flag and \
                    r.shared_changed(args.shared, association):
                association_args.update(shared=args.shared)
            if args.public_association:
                association_args.update(
                    ras.set_publish_association_args(args))
            if association_args:
                association = ras.update_association(
                    association, association_args, args,
                    api=api, path=path, session_file=session_file)
                associations[0] = association

    # We get the fields of the association if we haven't got
    # them yet and need them
    if association and args.test_set:
        fields = pa.get_association_fields(association, csv_properties, args)

    # If predicting
    if associations and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote association sets: association sets are computed as
        # batch association sets in bigml.com except when --no-batch flag
        # is set. They are currently not supported yet
        if args.remote and not args.no_batch:
            sys.exit("Batch association sets are currently not supported.")
        else:
            sys.exit("Local prediction of association sets is currently"
                     " not supported.")

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    time_series = None
    time_series_set = None

    # variables from command-line options
    resume = args.resume_
    time_series_ids = args.time_series_ids_
    output = args.predictions
    # there's only one time_series to be generated at present
    args.max_parallel_time_series = 1
    args.max_parallel_evaluations = 1
    # time_series cannot be published yet.
    args.public_time_series = False
    # no cross-validations
    args.dataset_off = False
    args.cross_validation_rate = 0
    args.number_of_evaluations = 1

    # It is compulsory to have a description to publish either datasets or
    # time_series
    if (not args.description_ and
            (args.public_time_series or
             args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)
        # if the time series is going to be evaluated, and we don't have
        # test data, we need to divide the rows using ranges, so we'll need
        # max rows
        args.max_rows = datasets[0]["object"]["rows"]

    if args.time_series_file:
        # time-series is retrieved from the contents of the given local
        # JSON file
        time_series, csv_properties, fields = u.read_local_resource(
            args.time_series_file, csv_properties=csv_properties)
        time_series_set = [time_series]
        time_series_ids = [time_series['resource']]
    else:
        # time-series is retrieved from the remote object
        time_series_set, time_series_ids, resume = \
            pts.time_series_processing(
                datasets, time_series_set, time_series_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
    if time_series_set:
        time_series = time_series_set[0]

    # We update the time-series' public state if needed
    if time_series:
        if isinstance(time_series, basestring):
            query_string = r.ALL_FIELDS_QS
            time_series = u.check_resource(time_series,
                                           api.get_time_series,
                                           query_string=query_string)
        time_series_set[0] = time_series
        if (args.public_time_series or
                (args.shared_flag and
                 r.shared_changed(args.shared, time_series))):
            time_series_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     time_series):
                time_series_args.update(shared=args.shared)
            if args.public_time_series:
                time_series_args.update(
                    r.set_publish_time_series_args(args))
            if time_series_args:
                time_series = r.update_time_series(
                    time_series, time_series_args, args,
                    api=api, path=path, session_file=session_file)
                time_series_set[0] = time_series

    """
    # We get the fields of the time-series if we haven't got
    # them yet and need them
    if time_series and (args.test_set or args.export_fields):
        fields = pts.get_time_series_fields( \
            time_series, csv_properties, args)
    """

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If forecasting
    if time_series_set and a.has_ts_test(args):
        if args.remote:
            forecast_args = r.set_forecast_args(args, fields=fields)

            remote_forecast(time_series, forecast_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)
        else:
            forecast(time_series, args, session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(time_series_set, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            else:
                args.range_ = [int(args.max_rows * r.EVALUATE_SAMPLE_RATE),
                               args.max_rows]
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(time_series_set, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def reify_dispatcher(args=sys.argv[1:]):
    """Parses command line and calls the different processing functions
    """
    command = command_handling(args, COMMAND_LOG)

    # Parses command line arguments.
    command_args = a.parse_and_check(command)
    if command_args.resume:
        command_args, session_file, _ = get_stored_command(
            args, command_args.debug, command_log=COMMAND_LOG,
            dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG)
        if command_args.output is None:
            command_args.output = os.path.join(command_args.output_dir,
                                               DEFAULT_OUTPUT)
    else:
        if command_args.output_dir is None:
            command_args.output_dir = a.NOW
        if command_args.output is None:
            command_args.output = os.path.join(command_args.output_dir,
                                               DEFAULT_OUTPUT)
        if len(os.path.dirname(command_args.output).strip()) == 0:
            command_args.output = os.path.join(command_args.output_dir,
                                               command_args.output)
        directory = u.check_dir(command_args.output)
        command_args.output_dir = directory
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)

        directory = u.check_dir(os.path.join(command_args.output_dir, "tmp"))
        session_file = os.path.join(directory, SESSIONS_LOG)
        u.log_message(command.command + "\n", log_file=session_file)
        try:
            shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE))
        except IOError:
            pass
        u.sys_log_message(u"%s\n" % os.path.abspath(directory),
                          log_file=DIRS_LOG)

    # If --clear-logs the log files are cleared
    if "--clear-logs" in args:
        clear_log_files(LOG_FILES)

    def logger(message):
        """Partial to log messages according to args.verbosity
        """
        u.log_message(u.dated(message),
                      log_file=session_file, console=command_args.verbosity)

    # Creates the corresponding api instance
    api = a.get_api_instance(command_args, u.check_dir(session_file))
    message = "Starting reification for %s\n\n" % command_args.resource_id
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    reify_resources(command_args, api, logger)
    message = "\nReification complete. See the results in %s\n\n" % \
        command_args.output
    u.log_message(message,
                  log_file=session_file, console=command_args.verbosity)
    u.log_message("_" * 80 + "\n", log_file=session_file)

    u.print_generated_files(command_args.output_dir, log_file=session_file,
                            verbosity=command_args.verbosity)

def compute_output(api, args):
    """ Creates a fusion using the `models` list or uses the ids
    of a previously created BigML fusion to make predictions for the
    `test_set`.

    """

    fusion = None

    # variables from command-line options
    resume = args.resume_
    fusion_ids = args.fusion_ids_
    output = args.predictions
    # there's only one fusion to be generated at present
    args.max_parallel_fusions = 1
    # fusion cannot be published yet.
    args.public_fusion = False

    # It is compulsory to have a description to publish either datasets or
    # fusions
    if (not args.description_ and args.public_fusion):
        sys.exit("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    if args.fusion_file:
        # fusion is retrieved from the contents of the given local
        # JSON file
        fusion, csv_properties, fields = u.read_local_resource(
            args.fusion_file, csv_properties=csv_properties)
        fusion_ids = [fusion]
    else:
        # fusion is retrieved from the remote object or created
        fusion, resume = \
            pf.fusion_processing(
                fusion, fusion_ids,
                api, args, resume,
                session_file=session_file, path=path, log=log)

    # We update the fusion public state if needed
    if fusion:
        if isinstance(fusion, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            fusion = u.check_resource(fusion, api.get_fusion,
                                      query_string=query_string)
        if (args.public_fusion or
                (args.shared_flag and
                 r.shared_changed(args.shared, fusion))):
            fusion_args = {}
            if args.shared_flag and r.shared_changed(args.shared, fusion):
                fusion_args.update(shared=args.shared)
            if args.public_fusion:
                fusion_args.update(
                    r.set_publish_fusion_args(args))
            if fusion_args:
                fusion = r.update_fusion(
                    fusion, fusion_args, args, api=api, path=path,
                    session_file=session_file)

    # We get the fields of the fusion if we haven't got
    # them yet and need them
    if fusion and (args.test_set or args.evaluate):
        fields = pf.get_fusion_fields(fusion, csv_properties, args)

    # If predicting
    if fusion and (a.has_test(args) or args.remote):
        test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            if not args.evaluate:
                batch_prediction_args = r.set_batch_prediction_args(
                    args, fields=fields, dataset_fields=test_fields)

                remote_prediction(fusion, test_dataset,
                                  batch_prediction_args, args,
                                  api, resume, prediction_file=output,
                                  session_file=session_file, path=path,
                                  log=log)
        else:
            prediction([fusion], fields, args, session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        args.max_parallel_evaluations = 1  # only one evaluation at present
        args.cross_validation_rate = 0  # no cross-validation
        args.number_of_evaluations = 1  # only one evaluation
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        dataset = test_dataset
        dataset = u.check_resource(dataset, api=api,
                                   query_string=r.ALL_FIELDS_QS)
        dataset_fields = pd.get_fields_structure(dataset, None)
        resume = evaluate([fusion], [dataset], api,
                          args, resume,
                          fields=fields, dataset_fields=dataset_fields,
                          session_file=session_file, path=path,
                          log=log,
                          objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ logistic_regression = None logistic_regressions = None # no multi-label support at present # variables from command-line options resume = args.resume_ logistic_regression_ids = args.logistic_regression_ids_ output = args.predictions # there's only one logistic regression to be generated at present args.max_parallel_logistic_regressions = 1 # logistic regressions cannot be published yet. args.public_logistic_regression = False # It is compulsory to have a description to publish either datasets or # logistic regressions if (not args.description_ and (args.public_logistic_regression or args.public_dataset)): sys.exit("You should provide a description to publish.") # When using --new-fields, it is compulsory to specify also a dataset # id if args.new_fields and not args.dataset: sys.exit("To use --new-fields you must also provide a dataset id" " to generate the new dataset from it.") path = u.check_dir(output) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} if args.objective_field: csv_properties.update({'objective_field': args.objective_field}) # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # basic pre-model step: creating or retrieving the source related info source, resume, csv_properties, fields = pms.get_source_info( api, args, resume, csv_properties, session_file, path, log) # basic pre-model step: creating or retrieving the dataset related info dataset_properties = pms.get_dataset_info( api, args, resume, source, csv_properties, fields, session_file, path, log) (_, datasets, test_dataset, resume, csv_properties, fields) = dataset_properties if datasets: # Now we have a dataset, let's check if there's an objective_field # given by the user and update it in the fields structure args.objective_id_ = get_objective_id(args, fields) if args.logistic_file: # logistic regression is retrieved from the contents of the given local # JSON file logistic_regression, csv_properties, fields = u.read_local_resource( args.logistic_file, csv_properties=csv_properties) logistic_regressions = [logistic_regression] logistic_regression_ids = [logistic_regression['resource']] else: # logistic regression is retrieved from the remote object logistic_regressions, logistic_regression_ids, resume = \ plr.logistic_regressions_processing( \ datasets, logistic_regressions, logistic_regression_ids, \ api, args, resume, fields=fields, \ session_file=session_file, path=path, log=log) if logistic_regressions: logistic_regression = logistic_regressions[0] # We update the logistic regression's public state if needed if logistic_regression: if isinstance(logistic_regression, basestring): if not a.has_test(args): query_string = MINIMUM_MODEL elif args.export_fields: query_string = r.ALL_FIELDS_QS else: query_string = '' logistic_regression = u.check_resource(logistic_regression, api.get_logistic_regression, query_string=query_string) logistic_regressions[0] = logistic_regression if (args.public_logistic_regression or (args.shared_flag and r.shared_changed(args.shared, logistic_regression))): logistic_regression_args = {} if args.shared_flag and r.shared_changed(args.shared, logistic_regression): logistic_regression_args.update(shared=args.shared) if 
args.public_logistic_regression: logistic_regression_args.update( \ r.set_publish_logistic_regression_args(args)) if logistic_regression_args: logistic_regression = r.update_logistic_regression( \ logistic_regression, logistic_regression_args, args, api=api, path=path, \ session_file=session_file) logistic_regressions[0] = logistic_regression # We get the fields of the logistic_regression if we haven't got # them yet and need them if logistic_regression and (args.test_set or args.export_fields): fields = plr.get_logistic_fields( \ logistic_regression, csv_properties, args) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) # If predicting if logistic_regressions and (a.has_test(args) or \ (test_dataset and args.remote)): if test_dataset is None: test_dataset = get_test_dataset(args) # Remote predictions: predictions are computed as batch predictions # in bigml.com except when --no-batch flag is set on if args.remote and not args.no_batch: # create test source from file test_name = "%s - test" % args.name if args.test_source is None: test_properties = ps.test_source_processing( api, args, resume, name=test_name, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) csv_properties.update(objective_field=None, objective_field_present=False) test_fields = pd.get_fields_structure(test_dataset, csv_properties) batch_prediction_args = r.set_batch_prediction_args( args, fields=fields, dataset_fields=test_fields) remote_lr_prediction(logistic_regression, test_dataset, \ batch_prediction_args, args, \ api, resume, prediction_file=output, \ session_file=session_file, path=path, log=log) else: lr_prediction(logistic_regressions, fields, args, session_file=session_file) # If evaluate flag is on, create remote evaluation and save results in # json and human-readable format. if args.evaluate: # When we resume evaluation and models were already completed, we # should use the datasets array as test datasets if args.has_test_datasets_: test_dataset = get_test_dataset(args) if args.dataset_off and not args.has_test_datasets_: args.test_dataset_ids = datasets if args.test_dataset_ids and args.dataset_off: # Evaluate the models with the corresponding test datasets. resume = evaluate(logistic_regressions, args.test_dataset_ids, api, args, resume, fields=fields, dataset_fields=dataset_fields, session_file=session_file, path=path, log=log, labels=labels, all_labels=all_labels, objective_field=args.objective_field) u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
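
# A rough equivalent of the --no-batch prediction path above, sketched with
# the plain bindings: a logistic regression built from a training dataset and
# a single remote prediction for one input row. The dataset id and field
# names are placeholders; plr.logistic_regressions_processing also handles
# resuming and per-row CSV output, which this sketch omits.
from bigml.api import BigML

api = BigML()
training_dataset = "dataset/5af06df94e17277501000005"  # hypothetical id
logistic_regression = api.create_logistic_regression(training_dataset)
api.ok(logistic_regression)
prediction = api.create_prediction(logistic_regression,
                                   {"petal length": 4.2, "sepal width": 3.1})
api.ok(prediction)
# the predicted class is stored in prediction["object"]["output"]
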
def compute_output(api, args):
    """ Creates a PCA using the `training_set`
        or uses the id of a previously created BigML PCA to make
        projections for the `test_set`.

    """
    pca = None

    # variables from command-line options
    resume = args.resume_
    pca_ids = args.pca_ids_
    output = args.projections
    # there's only one pca to be generated at present
    args.max_parallel_pcas = 1
    # pca cannot be published yet.
    args.public_pca = False

    # It is compulsory to have a description to publish either datasets or
    # pcas
    if (not args.description_ and
            (args.public_pca or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.pca_file:
        # pca is retrieved from the contents of the given local
        # JSON file
        pca, csv_properties, fields = u.read_local_resource(
            args.pca_file,
            csv_properties=csv_properties)
        pca_ids = [pca]
    else:
        # pca is retrieved from the remote object or created
        pca, resume = \
            pc.pca_processing( \
            datasets, pca, pca_ids, \
            api, args, resume, fields=fields, \
            session_file=session_file, path=path, log=log)

    # We update the pca public state if needed
    if pca:
        if isinstance(pca, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            pca = u.check_resource(pca, api.get_pca,
                                   query_string=query_string)
        if (args.public_pca or
                (args.shared_flag and
                 r.shared_changed(args.shared, pca))):
            pca_args = {}
            if args.shared_flag and r.shared_changed(args.shared, pca):
                pca_args.update(shared=args.shared)
            if args.public_pca:
                pca_args.update( \
                    r.set_publish_pca_args(args))
            if pca_args:
                pca = r.update_pca( \
                    pca, pca_args, args,
                    api=api, path=path, \
                    session_file=session_file)

    # We get the fields of the pca if we haven't got
    # them yet and need them
    if pca and (args.test_set or args.export_fields):
        fields = pc.get_pca_fields( \
            pca, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if pca and (a.has_test(args) or \
            (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote projections: projections are computed as batch projections
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume,
                    name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_projection_args = r.set_batch_projection_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_projection(pca, test_dataset, \
                batch_projection_args, args, \
                api, resume, projection_file=output, \
                session_file=session_file, path=path, log=log)
        else:
            projection(pca, fields, args, session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
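
# The remote projection branch above reduces to these bindings calls once
# bigmler's resuming and argument handling are stripped away (dataset ids are
# placeholders; r.set_batch_projection_args merely builds the creation
# arguments from the command-line flags):
from bigml.api import BigML

api = BigML()
pca = api.create_pca("dataset/5af06df94e17277501000005")
api.ok(pca)
batch_projection = api.create_batch_projection(
    pca, "dataset/5af06df94e17277501000006",  # hypothetical test dataset
    {"all_fields": True})
api.ok(batch_projection)
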
def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ time_series = None time_series_set = None # variables from command-line options resume = args.resume_ time_series_ids = args.time_series_ids_ output = args.predictions # there's only one time_series to be generated at present args.max_parallel_time_series = 1 args.max_parallel_evaluations = 1 # time_series cannot be published yet. args.public_time_series = False # no cross-validations args.dataset_off = False args.cross_validation_rate = 0 args.number_of_evaluations = 1 # It is compulsory to have a description to publish either datasets or # time_series if (not args.description_ and (args.public_time_series or args.public_dataset)): sys.exit("You should provide a description to publish.") # When using --new-fields, it is compulsory to specify also a dataset # id if args.new_fields and not args.dataset: sys.exit("To use --new-fields you must also provide a dataset id" " to generate the new dataset from it.") path = u.check_dir(output) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} if args.objective_field: csv_properties.update({'objective_field': args.objective_field}) # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # basic pre-model step: creating or retrieving the source related info source, resume, csv_properties, fields = pms.get_source_info( api, args, resume, csv_properties, session_file, path, log) # basic pre-model step: creating or retrieving the dataset related info dataset_properties = pms.get_dataset_info(api, args, resume, source, csv_properties, fields, session_file, path, log) (_, datasets, test_dataset, resume, csv_properties, fields) = dataset_properties if datasets: # Now we have a dataset, let's check if there's an objective_field # given by the user and update it in the fields structure args.objective_id_ = get_objective_id(args, fields) # if the time series is going to be evaluated, and we don't have # test data, we need to divide the rows using ranges, so we'll need # max rows args.max_rows = datasets[0]["object"]["rows"] if args.time_series_file: # time-series is retrieved from the contents of the given local # JSON file time_series, csv_properties, fields = u.read_local_resource( args.time_series_file, csv_properties=csv_properties) time_series_set = [time_series] time_series_ids = [time_series['resource']] else: # time-series is retrieved from the remote object time_series_set, time_series_ids, resume = \ pts.time_series_processing( \ datasets, time_series_set, time_series_ids, \ api, args, resume, fields=fields, \ session_file=session_file, path=path, log=log) if time_series_set: time_series = time_series_set[0] # We update the time-series' public state if needed if time_series: if isinstance(time_series, basestring): query_string = r.ALL_FIELDS_QS time_series = u.check_resource(time_series, api.get_time_series, query_string=query_string) time_series_set[0] = time_series if (args.public_time_series or (args.shared_flag and r.shared_changed(args.shared, time_series))): time_series_args = {} if args.shared_flag and r.shared_changed(args.shared, time_series): time_series_args.update(shared=args.shared) if args.public_time_series: time_series_args.update( \ r.set_publish_time_series_args(args)) if time_series_args: 
                time_series = r.update_time_series( \
                    time_series, time_series_args, args,
                    api=api, path=path, \
                    session_file=session_file)
                time_series_set[0] = time_series

    """
    # We get the fields of the time-series if we haven't got
    # them yet and need them
    if time_series and (args.test_set or args.export_fields):
        fields = pts.get_time_series_fields( \
            time_series, csv_properties, args)
    """

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If forecasting
    if time_series_set and a.has_ts_test(args):
        if args.remote:
            forecast_args = r.set_forecast_args(args, fields=fields)
            remote_forecast(time_series, forecast_args, args, \
                            api, resume, prediction_file=output, \
                            session_file=session_file, path=path, log=log)
        else:
            forecast(time_series, args, session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id( \
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(time_series_set, args.test_dataset_ids, api,
                              args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path, log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            else:
                args.range_ = [int(args.max_rows * r.EVALUATE_SAMPLE_RATE),
                               args.max_rows]
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(time_series_set, [dataset], api, args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path, log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
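
# The forecasting branch above roughly corresponds to a create_forecast call
# on the finished time series. A minimal sketch with the bindings; the
# dataset id, the objective field id ("000005") and the horizon are
# placeholders, and the exact input_data layout is an assumption to be
# checked against the forecast API docs.
from bigml.api import BigML

api = BigML()
time_series = api.create_time_series("dataset/5af06df94e17277501000005")
api.ok(time_series)
forecast = api.create_forecast(time_series, {"000005": {"horizon": 10}})
api.ok(forecast)
# the point forecasts are stored in forecast["object"]
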
if args.test_split > 0 or args.has_test_datasets_: dataset = test_dataset dataset = u.check_resource(dataset, api=api, query_string=r.ALL_FIELDS_QS) dataset_fields = pd.get_fields_structure(dataset, None) models_or_ensembles = (ensemble_ids if ensemble_ids != [] else models) resume = evaluate(models_or_ensembles, [dataset], api, args, resume, fields=fields, dataset_fields=dataset_fields, session_file=session_file, path=path, log=log, labels=labels, all_labels=all_labels, objective_field=args.objective_field) # If cross_validation_rate is > 0, create remote evaluations and save # results in json and human-readable format. Then average the results to # issue a cross_validation measure set. if args.cross_validation_rate > 0: args.sample_rate = 1 - args.cross_validation_rate cross_validate(models, dataset, fields, api, args, resume, session_file=session_file, path=path, log=log) u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
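
# The --test-split and --cross-validation-rate machinery above is built on
# BigML's dataset sampling. A sketch of a deterministic 80/20 split with the
# bindings: the same sample_rate and seed with out_of_bag flipped produce two
# complementary datasets, and, roughly, cross_validate repeats this with
# different seeds and averages the resulting evaluations (the dataset id is
# a placeholder).
from bigml.api import BigML

api = BigML()
full_dataset = "dataset/5af06df94e17277501000005"
train_split = api.create_dataset(full_dataset, {"sample_rate": 0.8,
                                                "seed": "bigmler",
                                                "out_of_bag": False})
test_split = api.create_dataset(full_dataset, {"sample_rate": 0.8,
                                               "seed": "bigmler",
                                               "out_of_bag": True})
api.ok(train_split)
api.ok(test_split)
model = api.create_model(train_split)
api.ok(model)
evaluation = api.create_evaluation(model, test_split)
api.ok(evaluation)
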
def project_dispatcher(args=sys.argv[1:]): """Parses command line and calls the different processing functions """ command = command_handling(args, COMMAND_LOG) # Parses command line arguments. command_args = a.parse_and_check(command) if command_args.resume: command_args, session_file, _ = get_stored_command( args, command_args.debug, command_log=COMMAND_LOG, dirs_log=DIRS_LOG, sessions_log=SESSIONS_LOG) else: if command_args.output_dir is None: command_args.output_dir = a.NOW directory = u.check_dir("%s/x.txt" % command_args.output_dir) command_args.output_dir = directory session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) directory = u.check_dir(os.path.join(command_args.output_dir, "tmp")) session_file = os.path.join(directory, SESSIONS_LOG) u.log_message(command.command + "\n", log_file=session_file) try: shutil.copy(DEFAULTS_FILE, os.path.join(directory, DEFAULTS_FILE)) except IOError: pass u.sys_log_message(u"%s\n" % os.path.abspath(directory), log_file=DIRS_LOG) path = u.check_dir("%s/x.txt" % command_args.output_dir) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) # If logging is required set the file for logging log = None if command_args.log_file: u.check_dir(command_args.log_file) log = command_args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # Creates the corresponding api instance api = a.get_api_instance(command_args, u.check_dir(session_file)) a.get_output_args(api, command_args, command_args.resume) a.attribute_args(command_args) if not command_args.project_id and command_args.name: command_args.project = command_args.name if command_args.project: # create project pp.project_processing( api, command_args, command_args.resume, session_file=session_file, path=path, log=log, create=True) if command_args.project_id and ( command_args.project_attributes or command_args.name or command_args.tag or command_args.description or command_args.category): # update project's attributes pp.update_project(command_args, api, command_args.resume, \ session_file=session_file) u.log_message("_" * 80 + "\n", log_file=session_file) u.print_generated_files(command_args.output_dir, log_file=session_file, verbosity=command_args.verbosity)
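
# The project dispatcher is essentially a thin wrapper over two bindings
# calls; a minimal sketch (the name and description are placeholders):
from bigml.api import BigML

api = BigML()
project = api.create_project({"name": "My project"})
api.ok(project)
api.update_project(project, {"description": "Resources created by BigMLer"})
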
def compute_output(api, args): """ Creates one or more anomaly detectors using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ anomaly = None anomalies = None # no multi-label support at present # variables from command-line options resume = args.resume_ anomaly_ids = args.anomaly_ids_ output = args.predictions # there's only one anomaly detector to be generated at present args.max_parallel_anomalies = 1 # anomalies cannot be published yet. args.public_anomaly = False # It is compulsory to have a description to publish either datasets or # anomalies if (not args.description_ and (args.public_anomaly or args.public_dataset)): sys.exit("You should provide a description to publish.") # When using --new-fields, it is compulsory to specify also a dataset # id if args.new_fields and not args.dataset: sys.exit("To use --new-fields you must also provide a dataset id" " to generate the new dataset from it.") path = u.check_dir(output) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # basic pre-model step: creating or retrieving the source related info source, resume, csv_properties, fields = pms.get_source_info( api, args, resume, csv_properties, session_file, path, log) # basic pre-model step: creating or retrieving the dataset related info dataset_properties = pms.get_dataset_info( api, args, resume, source, csv_properties, fields, session_file, path, log) (_, datasets, test_dataset, resume, csv_properties, fields) = dataset_properties if args.anomaly_file: # anomaly is retrieved from the contents of the given local JSON file anomaly, csv_properties, fields = u.read_local_resource( args.anomaly_file, csv_properties=csv_properties) anomalies = [anomaly] anomaly_ids = [anomaly['resource']] else: # anomaly is retrieved from the remote object anomalies, anomaly_ids, resume = pa.anomalies_processing( datasets, anomalies, anomaly_ids, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) if anomalies: anomaly = anomalies[0] # We update the anomaly's public state if needed if anomaly: if not a.has_test(args) and not args.anomalies_dataset: query_string = MINIMUM_MODEL elif not a.has_test(args): query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS]) else: query_string = r.ALL_FIELDS_QS try: anomaly_id = anomaly.get('resource', anomaly) except AttributeError: anomaly_id = anomaly anomaly = u.check_resource(anomaly_id, query_string=query_string, api=api) anomalies[0] = anomaly if (args.public_anomaly or (args.shared_flag and r.shared_changed(args.shared, anomaly))): anomaly_args = {} if args.shared_flag and r.shared_changed(args.shared, anomaly): anomaly_args.update(shared=args.shared) if args.public_anomaly: anomaly_args.update(r.set_publish_anomaly_args(args)) if anomaly_args: anomaly = r.update_anomaly(anomaly, anomaly_args, args, api=api, path=path, session_file=session_file) anomalies[0] = anomaly # We get the fields of the anomaly detector if we haven't got # them yet and need them if anomaly and (args.test_set or args.export_fields): fields = pa.get_anomaly_fields(anomaly, csv_properties, args) # If creating a top anomalies excluded/included dataset if args.anomalies_dataset and anomaly: origin_dataset = anomaly['object'].get('dataset') if origin_dataset is None: sys.exit("The dataset 
used to generate the anomaly detector " "cannot be found. Failed to generate the anomalies " " dataset.") local_anomaly = Anomaly(anomaly) include = args.anomalies_dataset == ANOMALIES_IN args.anomaly_filter_ = local_anomaly.anomalies_filter(include=include) _, resume = pd.create_new_dataset( origin_dataset, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) # If predicting if anomaly and args.score: args.test_dataset = anomaly['object']['dataset'] if anomalies and (a.has_test(args) or (test_dataset and args.remote)): # test dataset can be defined by --test-split or --test-dataset or # --test-datasets if test_dataset is None: test_dataset = get_test_dataset(args) # Remote anomaly scores: scores are computed as batch anomaly scores # in bigml.com except when --no-batch flag is set on if args.remote and not args.no_batch: # create test source from file test_name = "%s - test" % args.name if args.test_source is None: test_properties = ps.test_source_processing( api, args, resume, name=test_name, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) test_fields = pd.get_fields_structure(test_dataset, csv_properties) batch_anomaly_score_args = r.set_batch_anomaly_score_args( args, fields=fields, dataset_fields=test_fields) remote_anomaly_score(anomaly, test_dataset, batch_anomaly_score_args, args, api, resume, prediction_file=output, session_file=session_file, path=path, log=log) else: anomaly_score(anomalies, fields, args, session_file=session_file) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
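
# The scoring branches above map onto the single and batch anomaly score
# resources of the bindings. A reduced sketch (ids and the input row are
# placeholders; the anomalies-dataset step additionally uses the local
# Anomaly object to build a row filter, which is omitted here):
from bigml.api import BigML

api = BigML()
anomaly = api.create_anomaly("dataset/5af06df94e17277501000005")
api.ok(anomaly)
# one score for a single input row
score = api.create_anomaly_score(anomaly, {"src_bytes": 350})
api.ok(score)
# batch scores for a whole test dataset (the --remote branch)
batch_score = api.create_batch_anomaly_score(
    anomaly, "dataset/5af06df94e17277501000006", {"all_fields": True})
api.ok(batch_score)
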
def compute_output(api, args): """ Creates a sample based on a `train_set`, source or dataset. """ samples = None # variables from command-line options resume = args.resume_ sample_ids = args.sample_ids_ output = args.predictions # there's only one sample to be generated at present args.max_parallel_clusters = 1 # sample cannot be published yet. args.public_sample = False # It is compulsory to have a description to publish either datasets or # clusters if (not args.description_ and (args.public_sample or args.public_dataset)): sys.exit("You should provide a description to publish.") # When using --new-fields, it is compulsory to specify also a dataset # id if args.new_fields and not args.dataset: sys.exit("To use --new-fields you must also provide a dataset id" " to generate the new dataset from it.") path = u.check_dir(output) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # basic pre-sample step: creating or retrieving the source related info source, resume, csv_properties, fields = pms.get_source_info( api, args, resume, csv_properties, session_file, path, log) # basic pre-sample step: creating or retrieving the dataset related info dataset_properties = pms.get_dataset_info( api, args, resume, source, csv_properties, fields, session_file, path, log) (_, datasets, _, resume, csv_properties, fields) = dataset_properties if args.sample_file: # sample is retrieved from the contents of the given local JSON file sample, csv_properties, fields = u.read_local_resource( args.sample_file, csv_properties=csv_properties) samples = [sample] sample_ids = [sample['resource']] else: # sample is retrieved from the remote object samples, sample_ids, resume = psa.samples_processing( datasets, samples, sample_ids, api, args, resume, session_file=session_file, path=path, log=log) if samples: sample = samples[0] # We update the sample's public state if needed if sample: if isinstance(sample, basestring): # build the query string from the sample options sample = u.check_resource(sample, api.get_sample) samples[0] = sample if (args.public_sample or (args.shared_flag and r.shared_changed(args.shared, sample))): sample_args = {} if args.shared_flag and r.shared_changed(args.shared, sample): sample_args.update(shared=args.shared) if args.public_sample: sample_args.update(r.set_publish_sample_args(args)) if sample_args: sample = r.update_sample(sample, sample_args, args, api=api, path=path, session_file=session_file) samples[0] = sample # We get the fields of the sample if we haven't got # them yet and need them if sample and psa.needs_sample_fields(args): fields = psa.get_sample_fields(sample, csv_properties, args) if fields and args.export_fields: fields.summary_csv(os.path.join(path, args.export_fields)) sample_file(samples[0], fields, args, api, path=path, session_file=session_file) u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
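
# samples_processing and sample_file wrap the sample resource of the
# bindings. A minimal sketch; the dataset id is a placeholder and the query
# string used to fetch rows is an assumption that should be checked against
# the sample API docs.
from bigml.api import BigML

api = BigML()
sample = api.create_sample("dataset/5af06df94e17277501000005")
api.ok(sample)
sample = api.get_sample(sample, query_string="rows=10")
# the sampled rows are returned under sample["object"]["sample"]["rows"]
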
def delete_resources(command_args, api, deleted_list=None): """Deletes the resources selected by the user given options """ if deleted_list is None: deleted_list = [] if command_args.output_dir is None: path = a.NOW else: path = command_args.output_dir session_file = os.path.join(path, SESSIONS_LOG) message = u.dated("Retrieving objects to delete.\n") u.log_message(message, log_file=session_file, console=command_args.verbosity) # Parses resource types to filter if command_args.resource_types is not None: resource_types = [resource_type.strip() for resource_type in command_args.resource_types.split(',')] command_args.resource_types_ = resource_types else: command_args.resource_types_ = None delete_list = [] # by ids if command_args.delete_list: delete_list = [resource_id.strip() for resource_id in command_args.delete_list.split(',')] # in file if command_args.delete_file: if not os.path.exists(command_args.delete_file): sys.exit("File %s not found" % command_args.delete_file) with open(command_args.delete_file, "r") as delete_file: resource_id = bigml.api.get_resource_id( delete_file.readline().strip()) if resource_id: delete_list.append(resource_id) # from directory if command_args.from_dir: delete_list.extend(retrieve_resources(command_args.from_dir)) # filter resource_types if any delete_list = filter_resource_types(delete_list, command_args.resource_types_) # by time interval and tag (plus filtered resource_types) time_qs_list = time_interval_qs(command_args, api) delete_list.extend(get_delete_list(command_args, api, time_qs_list)) # by filter expression (plus filtered resource_types) filter_qs_list = filter_qs(command_args, api) delete_list.extend(get_delete_list(command_args, api, filter_qs_list)) delete_list = [resource_id for resource_id in delete_list \ if resource_id not in deleted_list] # if there are projects or executions, delete them first bulk_deletion = not command_args.dry_run and \ any([resource_id.startswith("project/") or \ (not command_args.execution_only and \ resource_id.startswith("execution/")) for resource_id in delete_list]) aprox = "*" if bulk_deletion else "" # if bulk_deletion, keep only the project and executions resources in # the deletion list types_summary, delete_list = resources_by_type( \ delete_list, bulk_deletion) message = u.dated("Deleting %s objects%s.\n" % (len(delete_list), aprox)) u.log_message(message, log_file=session_file, console=command_args.verbosity) for resource_type, instances in types_summary.items(): message = "%s%ss: %s\n" % (" " * INDENT_IDS, resource_type, instances) u.log_message(message, log_file=session_file, console=command_args.verbosity) if aprox != "": message = ("* WARNING: Deleting a project or an execution will delete" " also its associated resources. Note that their IDs" " may not be listed in this report.\n") u.log_message(message, log_file=session_file, console=command_args.verbosity) if len(delete_list) > ROWS_LIMIT: pre_indent = INDENT_IDS - 4 message = ("\n%s%s\n" % ((" " * pre_indent), ("Showing only the first %s resources.\n%s" "See details in bigmler_sessions" " file.\n") % (ROWS_LIMIT, " " * pre_indent))) u.log_message(message, log_file=None, console=command_args.verbosity) # ensure uniqueness delete_list = list(set(delete_list)) # Partial console message. 
Limited number of rows segment = delete_list[0: ROWS_LIMIT] message = ("\n%s" % (" " * INDENT_IDS)).join(segment) message = ("%s" % (" " * INDENT_IDS)) + message + "\n" u.log_message(message, log_file=None, console=command_args.verbosity) # Complete message in session file message = ("\n%s" % (" " * INDENT_IDS)).join(delete_list) message = ("%s" % (" " * INDENT_IDS)) + message + "\n" u.log_message(message, log_file=session_file) if not command_args.dry_run: u.delete(api, delete_list, exe_outputs=not command_args.execution_only) if bulk_deletion: # if projects and executions have already been deleted, delete the rest delete_resources(command_args, api, deleted_list=delete_list) else: u.print_generated_files(path, log_file=session_file, verbosity=command_args.verbosity)
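
# get_delete_list and u.delete build on the list_* and delete_* calls of the
# bindings. A reduced sketch that deletes every dataset carrying a given tag
# (the tag and the "tags__in" filter syntax are assumptions; bigmler adds dry
# runs, resource-type filtering and the project/execution cascade on top of
# this):
from bigml.api import BigML

api = BigML()
listing = api.list_datasets("tags__in=bigmler_example;limit=100")
for dataset in listing.get("objects", []):
    api.delete_dataset(dataset["resource"])
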
def delete_resources(command_args, api): """Deletes the resources selected by the user given options """ if command_args.output_dir is None: path = a.NOW else: path = command_args.output_dir session_file = os.path.join(path, SESSIONS_LOG) message = u.dated("Retrieving objects to delete.\n") u.log_message(message, log_file=session_file, console=command_args.verbosity) # Parses resource types to filter if command_args.resource_types is not None: resource_types = [ resource_type.strip() for resource_type in command_args.resource_types.split(',') ] command_args.resource_types_ = resource_types else: command_args.resource_types_ = None delete_list = [] # by ids if command_args.delete_list: delete_list = [ resource_id.strip() for resource_id in command_args.delete_list.split(',') ] # in file if command_args.delete_file: if not os.path.exists(command_args.delete_file): sys.exit("File %s not found" % command_args.delete_file) with open(command_args.delete_file, "r") as delete_file: resource_id = bigml.api.get_resource_id( delete_file.readline().strip()) if resource_id: delete_list.append(resource_id) # from directory if command_args.from_dir: delete_list.extend(retrieve_resources(command_args.from_dir)) # filter resource_types if any delete_list = filter_resource_types(delete_list, command_args.resource_types_) # by time interval and tag (plus filtered resource_types) time_qs_list = time_interval_qs(command_args, api) delete_list.extend(get_delete_list(command_args, api, time_qs_list)) types_summary = resources_by_type(delete_list) message = u.dated("Deleting %s objects.\n" % len(delete_list)) u.log_message(message, log_file=session_file, console=command_args.verbosity) for resource_type, instances in types_summary.items(): message = "%s%ss: %s\n" % (" " * INDENT_IDS, resource_type, instances) u.log_message(message, log_file=session_file, console=command_args.verbosity) if len(delete_list) > ROWS_LIMIT: pre_indent = INDENT_IDS - 4 message = ("\n%s%s\n" % ((" " * pre_indent), ("Showing only the first %s resources.\n%s" "See details in bigmler_sessions" " file.\n") % (ROWS_LIMIT, " " * pre_indent))) u.log_message(message, log_file=None, console=command_args.verbosity) # Partial console message. Limited number of rows segment = delete_list[0:ROWS_LIMIT] message = ("\n%s" % (" " * INDENT_IDS)).join(segment) message = ("%s" % (" " * INDENT_IDS)) + message + "\n" u.log_message(message, log_file=None, console=command_args.verbosity) # Complete message in session file message = ("\n%s" % (" " * INDENT_IDS)).join(delete_list) message = ("%s" % (" " * INDENT_IDS)) + message + "\n" u.log_message(message, log_file=session_file) if not command_args.dry_run: u.delete(api, delete_list) u.print_generated_files(path, log_file=session_file, verbosity=command_args.verbosity)
def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ topic_model = None topic_models = None # no multi-label support at present # variables from command-line options resume = args.resume_ topic_model_ids = args.topic_model_ids_ output = args.predictions # there's only one topic model resource to be generated at present args.max_parallel_topic_models = 1 # topic models cannot be published yet. args.public_topic_model = False # It is compulsory to have a description to publish either datasets or # topic models if (not args.description_ and (args.public_topic_model or args.public_dataset)): sys.exit("You should provide a description to publish.") # When using --new-fields, it is compulsory to specify also a dataset # id if args.new_fields and not args.dataset: sys.exit("To use --new-fields you must also provide a dataset id" " to generate the new dataset from it.") path = u.check_dir(output) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # basic pre-model step: creating or retrieving the source related info source, resume, csv_properties, fields = pms.get_source_info( api, args, resume, csv_properties, session_file, path, log) # basic pre-model step: creating or retrieving the dataset related info dataset_properties = pms.get_dataset_info( api, args, resume, source, csv_properties, fields, session_file, path, log) (_, datasets, test_dataset, resume, csv_properties, fields) = dataset_properties if args.topic_model_file: # topic model is retrieved from the contents of the given local JSON # file topic_model, csv_properties, fields = u.read_local_resource( args.topic_model_file, csv_properties=csv_properties) topic_models = [topic_model] topic_model_ids = [topic_model['resource']] else: # topic model is retrieved from the remote object topic_models, topic_model_ids, resume = pt.topic_model_processing( datasets, topic_models, topic_model_ids, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) if topic_models: topic_model = topic_models[0] # We update the topic model's public state if needed if topic_model: if isinstance(topic_model, basestring): if not a.has_test(args): query_string = MINIMUM_MODEL else: query_string = '' topic_model = u.check_resource(topic_model, api.topic_model, query_string=query_string) topic_models[0] = topic_model if (args.public_topic_model or (args.shared_flag and r.shared_changed(args.shared, topic_model))): topic_model_args = {} if args.shared_flag and \ r.shared_changed(args.shared, topic_model): topic_model_args.update(shared=args.shared) if args.public_topic_model: topic_model_args.update(r.set_publish_topic_model_args(args)) if topic_model_args: topic_model = r.update_topic_model( \ topic_model, topic_model_args, args, api=api, path=path, session_file=session_file) topic_models[0] = topic_model # We get the fields of the topic model if we haven't got # them yet and need them if topic_model and args.test_set: csv_properties.update({'objective_field_present': False, 'objective_field': None}) fields = pt.get_topic_model_fields(topic_model, csv_properties, args) # If predicting if topic_models and (a.has_test(args) or (test_dataset and args.remote)): if test_dataset is None: test_dataset = 
get_test_dataset(args) # Remote topic distributions:topic distributions are computed as # batch topic distributions # in bigml.com except when --no-batch flag is set. if args.remote and not args.no_batch: # create test source from file test_name = "%s - test" % args.name if args.test_source is None: test_properties = ps.test_source_processing( api, args, resume, name=test_name, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) test_fields = pd.get_fields_structure(test_dataset, csv_properties) batch_topic_distribution_args = \ r.set_batch_topic_distribution_args( \ args, fields=fields, \ dataset_fields=test_fields) remote_topic_distribution( \ topic_model, test_dataset, batch_topic_distribution_args, args, api, resume, prediction_file=output, session_file=session_file, path=path, log=log) else: topic_distribution(topic_models, fields, args, session_file=session_file) u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
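
# The topic distribution branches above, reduced to the bindings calls (the
# dataset id and the text field name are placeholders):
from bigml.api import BigML

api = BigML()
topic_model = api.create_topic_model("dataset/5af06df94e17277501000005")
api.ok(topic_model)
# a single input row (the --no-batch branch)
topic_distribution = api.create_topic_distribution(
    topic_model, {"review": "the room was clean and the staff friendly"})
api.ok(topic_distribution)
# a whole test dataset (the --remote branch)
batch_topic_distribution = api.create_batch_topic_distribution(
    topic_model, "dataset/5af06df94e17277501000006", {"all_fields": True})
api.ok(batch_topic_distribution)
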
def compute_output(api, args): """ Creates one or more models using the `training_set` or uses the ids of previously created BigML models to make predictions for the `test_set`. """ cluster = None clusters = None # no multi-label support at present # variables from command-line options resume = args.resume_ cluster_ids = args.cluster_ids_ output = args.predictions # there's only one cluster to be generated at present args.max_parallel_clusters = 1 # clusters cannot be published yet. args.public_cluster = False # It is compulsory to have a description to publish either datasets or # clusters if (not args.description_ and (args.public_cluster or args.public_dataset)): sys.exit("You should provide a description to publish.") # When using --new-fields, it is compulsory to specify also a dataset # id if args.new_fields and not args.dataset: sys.exit("To use --new-fields you must also provide a dataset id" " to generate the new dataset from it.") path = u.check_dir(output) session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG) csv_properties = {} # If logging is required set the file for logging log = None if args.log_file: u.check_dir(args.log_file) log = args.log_file # If --clear_logs the log files are cleared clear_log_files([log]) # basic pre-model step: creating or retrieving the source related info source, resume, csv_properties, fields = pms.get_source_info( api, args, resume, csv_properties, session_file, path, log) # basic pre-model step: creating or retrieving the dataset related info dataset_properties = pms.get_dataset_info( api, args, resume, source, csv_properties, fields, session_file, path, log) (_, datasets, test_dataset, resume, csv_properties, fields) = dataset_properties if args.cluster_file: # cluster is retrieved from the contents of the given local JSON file cluster, csv_properties, fields = u.read_local_resource( args.cluster_file, csv_properties=csv_properties) clusters = [cluster] cluster_ids = [cluster['resource']] else: # cluster is retrieved from the remote object clusters, cluster_ids, resume = pc.clusters_processing( datasets, clusters, cluster_ids, api, args, resume, fields=fields, session_file=session_file, path=path, log=log) if clusters: cluster = clusters[0] # We update the cluster's public state if needed if cluster: if isinstance(cluster, basestring): if args.cluster_datasets is None and not a.has_test(args): query_string = MINIMUM_MODEL else: query_string = '' cluster = u.check_resource(cluster, api.get_cluster, query_string=query_string) clusters[0] = cluster if (args.public_cluster or (args.shared_flag and r.shared_changed(args.shared, cluster))): cluster_args = {} if args.shared_flag and r.shared_changed(args.shared, cluster): cluster_args.update(shared=args.shared) if args.public_cluster: cluster_args.update(r.set_publish_cluster_args(args)) if cluster_args: cluster = r.update_cluster(cluster, cluster_args, args, api=api, path=path, session_file=session_file) clusters[0] = cluster # We get the fields of the cluster if we haven't got # them yet and need them if cluster and args.test_set: fields = pc.get_cluster_fields(cluster, csv_properties, args) # If predicting if clusters and (a.has_test(args) or (test_dataset and args.remote)): if test_dataset is None: test_dataset = get_test_dataset(args) # Remote centroids: centroids are computed as batch centroids # in bigml.com except when --no-batch flag is set on if args.remote and not args.no_batch: # create test source from file test_name = "%s - test" % args.name if args.test_source is None: 
test_properties = ps.test_source_processing( api, args, resume, name=test_name, session_file=session_file, path=path, log=log) (test_source, resume, csv_properties, test_fields) = test_properties else: test_source_id = bigml.api.get_source_id(args.test_source) test_source = api.check_resource(test_source_id) if test_dataset is None: # create test dataset from test source dataset_args = r.set_basic_dataset_args(args, name=test_name) test_dataset, resume = pd.alternative_dataset_processing( test_source, "test", dataset_args, api, args, resume, session_file=session_file, path=path, log=log) else: test_dataset_id = bigml.api.get_dataset_id(test_dataset) test_dataset = api.check_resource(test_dataset_id) test_fields = pd.get_fields_structure(test_dataset, csv_properties) batch_centroid_args = r.set_batch_centroid_args( args, fields=fields, dataset_fields=test_fields) remote_centroid(cluster, test_dataset, batch_centroid_args, args, api, resume, prediction_file=output, session_file=session_file, path=path, log=log) else: centroid(clusters, fields, args, session_file=session_file) if cluster and args.cluster_datasets is not None: centroids_info = cluster['object']['clusters']['clusters'] centroids = {centroid['name']: centroid['id'] for centroid in centroids_info} datasets = cluster['object']['cluster_datasets'] if args.cluster_datasets == '': centroid_ids = centroids.values() else: centroid_ids = [centroids[cluster_name] for cluster_name in args.cluster_datasets_ if datasets[centroids[cluster_name]] == ''] for centroid_id in centroid_ids: dataset_args = {'centroid': centroid_id} r.create_dataset(cluster, dataset_args, args, api=api, path=path, session_file=session_file, log=log, dataset_type='cluster') u.print_generated_files(path, log_file=session_file, verbosity=args.verbosity) if args.reports: clear_reports(path) if args.upload: upload_reports(args.reports, path)
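
# The per-centroid dataset creation at the end of the cluster workflow is a
# plain create_dataset call on the cluster with a "centroid" argument. A
# sketch with the bindings (ids and k are placeholders; bigmler only builds
# datasets for the centroids selected through --cluster-datasets):
from bigml.api import BigML

api = BigML()
cluster = api.create_cluster("dataset/5af06df94e17277501000005", {"k": 3})
api.ok(cluster)
for centroid in cluster["object"]["clusters"]["clusters"]:
    centroid_dataset = api.create_dataset(cluster,
                                          {"centroid": centroid["id"]})
    api.ok(centroid_dataset)
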
def compute_output(api, args):
    """ Creates one or more associations using the `training_set`
        or uses the ids of previously created BigML associations to compute
        association sets for the `test_set`.

    """
    association = None
    associations = None  # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    association_ids = args.association_ids_
    output = args.predictions
    # there's only one association resource to be generated at present
    args.max_parallel_associations = 1
    # associations cannot be published yet.
    args.public_association = False

    # It is compulsory to have a description to publish either datasets or
    # associations
    if (not args.description_ and
            (args.public_association or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.association_file:
        # association is retrieved from the contents of the given local JSON
        # file
        association, csv_properties, fields = u.read_local_resource(
            args.association_file,
            csv_properties=csv_properties)
        associations = [association]
        association_ids = [association['resource']]
    else:
        # association is retrieved from the remote object
        associations, association_ids, resume = pa.associations_processing(
            datasets, associations, association_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if associations:
        association = associations[0]

    # We update the association's public state if needed
    if association:
        if isinstance(association, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            association = u.check_resource(association, api.get_association,
                                           query_string=query_string)
        associations[0] = association
        if (args.public_association or
                (args.shared_flag and
                 r.shared_changed(args.shared, association))):
            association_args = {}
            if args.shared_flag and \
                    r.shared_changed(args.shared, association):
                association_args.update(shared=args.shared)
            if args.public_association:
                association_args.update(r.set_publish_association_args(args))
            if association_args:
                association = r.update_association(
                    association, association_args, args,
                    api=api, path=path, session_file=session_file)
                associations[0] = association

    # We get the fields of the association if we haven't got
    # them yet and need them
    if association and args.test_set:
        fields = pa.get_association_fields(association, csv_properties, args)

    # If predicting
    if associations and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote association sets: association sets are computed as
        # batch association sets in bigml.com except when --no-batch flag
        # is set. They are currently not supported yet
        if args.remote and not args.no_batch:
            sys.exit("Batch association sets are currently not supported.")
            """
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_association_args = r.set_batch_association_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_association(
                association, test_dataset, batch_association_args,
                args, api, resume, prediction_file=output,
                session_file=session_file, path=path, log=log)
            """
        else:
            sys.exit("Local prediction of association sets is currently"
                     " not supported.")
            """
            association_set(associations, fields, args,
                            session_file=session_file)
            """

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
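
# Illustrative sketch only: `r.shared_changed` above is assumed to report
# whether the requested --shared value differs from the resource's current
# sharing status, so the update call is only issued when something actually
# changes. The stand-in below mirrors that assumption; it is not bigmler's
# actual implementation.
def shared_changed_sketch(shared, resource):
    """True when the requested shared flag differs from the resource's."""
    current = resource.get('object', {}).get('shared', False)
    return bool(shared) != bool(current)
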
    dataset_fields = pd.get_fields_structure(dataset, None)
    models_or_ensembles = ensemble_ids if ensemble_ids != [] else models
    resume = evaluate(models_or_ensembles, [dataset], api, args, resume,
                      fields=fields, dataset_fields=dataset_fields,
                      session_file=session_file, path=path, log=log,
                      labels=labels, all_labels=all_labels,
                      objective_field=args.objective_field)

    # If cross_validation_rate is > 0, create remote evaluations and save
    # results in json and human-readable format. Then average the results to
    # issue a cross_validation measure set.
    if args.cross_validation_rate > 0:
        args.sample_rate = 1 - args.cross_validation_rate
        cross_validate(models, dataset, fields, api, args, resume,
                       session_file=session_file, path=path, log=log)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
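
# Illustrative sketch only: the cross-validation step above averages the
# per-fold evaluation results into a single cross-validation measure set.
# The helper below shows that averaging for plain dicts of numeric measures;
# it is a hypothetical stand-in, not the `cross_validate` code used above.
def average_measures_sketch(fold_measures):
    """Averages a list of {measure_name: value} dicts, fold by fold."""
    averaged = {}
    for measures in fold_measures:
        for name, value in measures.items():
            averaged[name] = averaged.get(name, 0.0) + value
    return dict((name, total / len(fold_measures))
                for name, total in averaged.items())

# e.g. average_measures_sketch([{"accuracy": 0.9}, {"accuracy": 0.8}])
# yields {"accuracy": 0.85} (up to float rounding)
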
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML anomaly detectors to
        compute anomaly scores for the `test_set`.

    """
    anomaly = None
    anomalies = None  # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and
            (args.public_anomaly or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (dataset, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and
                 r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and args.test_set:
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies"
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args._anomaly_filter = local_anomaly.anomalies_filter(include=include)
        new_dataset, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']

    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            anomaly_score(anomalies, fields, args,
                          session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)