def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    """
    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_cluster or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file,
            csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and
                 r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and (args.test_set or args.export_fields):
        if isinstance(cluster, dict):
            cluster = cluster['resource']
        cluster = u.check_resource(cluster, api.get_cluster,
                                   query_string=r.ALL_FIELDS_QS)
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        cluster_datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name in
                            args.cluster_datasets_
                            if cluster_datasets.get(centroids[cluster_name],
                                                    '') == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api, path=path,
                             session_file=session_file, log=log,
                             dataset_type='cluster')

    if cluster and args.cluster_models is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        models = cluster['object']['cluster_models']
        if args.cluster_models == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name in
                            args.cluster_models_
                            if models.get(centroids[cluster_name], '') == '']

        for centroid_id in centroid_ids:
            model_args = {'centroid': centroid_id}
            r.create_model(cluster, model_args, args, api=api, path=path,
                           session_file=session_file, log=log,
                           model_type='cluster')

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

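The centroid bookkeeping in the two trailing blocks above is compact enough to deserve a standalone illustration. The following sketch uses toy data and hypothetical ids; it only mirrors the dict comprehension and the "still lacks a dataset" filter used by `compute_output`, and is not part of bigmler itself.

def _demo_pending_centroids():
    # toy stand-in for cluster['object']: two centroids, one of which
    # already has a dataset generated from it
    cluster_object = {
        'clusters': {'clusters': [{'name': 'Cluster 0', 'id': '000000'},
                                  {'name': 'Cluster 1', 'id': '000001'}]},
        'cluster_datasets': {'000000': 'dataset/5126965515526876630001b2',
                             '000001': ''}}
    centroids = {centroid['name']: centroid['id']
                 for centroid in cluster_object['clusters']['clusters']}
    cluster_datasets = cluster_object['cluster_datasets']
    # keep only the centroids whose dataset slot is still empty
    return [centroid_id for centroid_id in centroids.values()
            if cluster_datasets.get(centroid_id, '') == '']

# _demo_pending_centroids() == ['000001']: only 'Cluster 1' needs a dataset
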
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    """

    logistic_regression = None
    logistic_regressions = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    logistic_regression_ids = args.logistic_regression_ids_
    output = args.predictions
    # there's only one logistic regression to be generated at present
    args.max_parallel_logistic_regressions = 1
    # logistic regressions cannot be published yet.
    args.public_logistic_regression = False

    # It is compulsory to have a description to publish either datasets or
    # logistic regressions
    if (not args.description_ and
            (args.public_logistic_regression or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    if args.logistic_file:
        # logistic regression is retrieved from the contents of the given
        # local JSON file
        logistic_regression, csv_properties, fields = u.read_local_resource(
            args.logistic_file,
            csv_properties=csv_properties)
        logistic_regressions = [logistic_regression]
        logistic_regression_ids = [logistic_regression['resource']]
    else:
        # logistic regression is retrieved from the remote object
        logistic_regressions, logistic_regression_ids, resume = \
            plr.logistic_regressions_processing(
                datasets, logistic_regressions, logistic_regression_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
        if logistic_regressions:
            logistic_regression = logistic_regressions[0]

    # We update the logistic regression's public state if needed
    if logistic_regression:
        if isinstance(logistic_regression, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            logistic_regression = u.check_resource(
                logistic_regression, api.get_logistic_regression,
                query_string=query_string)
        logistic_regressions[0] = logistic_regression
        if (args.public_logistic_regression or
                (args.shared_flag and
                 r.shared_changed(args.shared, logistic_regression))):
            logistic_regression_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     logistic_regression):
                logistic_regression_args.update(shared=args.shared)
            if args.public_logistic_regression:
                logistic_regression_args.update(
                    r.set_publish_logistic_regression_args(args))
            if logistic_regression_args:
                logistic_regression = r.update_logistic_regression(
                    logistic_regression, logistic_regression_args, args,
                    api=api, path=path,
                    session_file=session_file)
                logistic_regressions[0] = logistic_regression

    # We get the fields of the logistic_regression if we haven't got
    # them yet and need them
    if logistic_regression and (args.test_set or args.export_fields):
        fields = plr.get_logistic_fields(
            logistic_regression, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if logistic_regressions and (a.has_test(args) or
                                 (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_lr_prediction(logistic_regression, test_dataset,
                                 batch_prediction_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)

        else:
            lr_prediction(logistic_regressions, fields, args,
                          session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(logistic_regressions, args.test_dataset_ids,
                              api, args, resume, fields=fields,
                              dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(logistic_regressions, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

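Every `compute_output` variant receives its command-line options as attributes of a single `args` object. A minimal sketch of such an object for the logistic regression flow follows; the attribute names are the ones actually read above, but the values are placeholders, not bigmler's real defaults.

from argparse import Namespace

# illustrative values only; bigmler's option parser builds the real object
args_sketch = Namespace(
    resume_=False,                    # resume an interrupted session
    logistic_regression_ids_=[],      # ids of previously created models
    predictions='predictions.csv',    # output file for predictions
    description_=None,                # required when publishing
    public_dataset=False, new_fields=None, dataset=None,
    objective_field=None, log_file=None,
    test_set=None, export_fields=None,
    remote=False, no_batch=False, evaluate=False)
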
def compute_output(api, args):
    """ Creates a sample based on a `train_set`, source or dataset.

    """

    sample = None
    samples = None
    # variables from command-line options
    resume = args.resume_
    sample_ids = args.sample_ids_
    output = args.predictions
    # there's only one sample to be generated at present
    args.max_parallel_samples = 1
    # samples cannot be published yet.
    args.public_sample = False

    # It is compulsory to have a description to publish either datasets or
    # samples
    if (not args.description_ and
            (args.public_sample or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-sample step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-sample step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, _,
     resume, csv_properties, fields) = dataset_properties

    if args.sample_file:
        # sample is retrieved from the contents of the given local JSON file
        sample, csv_properties, fields = u.read_local_resource(
            args.sample_file,
            csv_properties=csv_properties)
        samples = [sample]
        sample_ids = [sample['resource']]
    else:
        # sample is retrieved from the remote object
        samples, sample_ids, resume = psa.samples_processing(
            datasets, samples, sample_ids, api, args, resume,
            session_file=session_file, path=path, log=log)
        if samples:
            sample = samples[0]

    # We update the sample's public state if needed
    if sample:
        if isinstance(sample, basestring):
            # build the query string from the sample options
            sample = u.check_resource(sample, api.get_sample)
        samples[0] = sample
        if (args.public_sample or
                (args.shared_flag and r.shared_changed(args.shared, sample))):
            sample_args = {}
            if args.shared_flag and r.shared_changed(args.shared, sample):
                sample_args.update(shared=args.shared)
            if args.public_sample:
                sample_args.update(r.set_publish_sample_args(args))
            if sample_args:
                sample = r.update_sample(sample, sample_args, args,
                                         api=api, path=path,
                                         session_file=session_file)
                samples[0] = sample

    # We get the fields of the sample if we haven't got
    # them yet and need them
    if sample and psa.needs_sample_fields(args):
        fields = psa.get_sample_fields(sample, csv_properties, args)

    sample_file(samples[0], fields, args, api, path=path,
                session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    """
    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_cluster or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file,
            csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and
                 r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and args.test_set:
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name in
                            args.cluster_datasets_
                            if datasets[centroids[cluster_name]] == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api, path=path,
                             session_file=session_file, log=log,
                             dataset_type='cluster')

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

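The remote branch above delegates to `remote_centroid`, which ultimately creates a batch centroid through the BigML Python bindings. A minimal sketch of the underlying call, assuming credentials in the `BIGML_USERNAME`/`BIGML_API_KEY` environment variables and hypothetical resource ids:

from bigml.api import BigML

api = BigML()  # reads credentials from the environment
# hypothetical ids; a batch centroid assigns a centroid to every row of
# the test dataset, optionally materializing the result as a new dataset
batch_centroid = api.create_batch_centroid(
    'cluster/5126965515526876630001b2',
    'dataset/5126965515526876630001a3',
    {'output_dataset': True})
api.ok(batch_centroid)  # poll until the batch centroid finishes
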
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    """

    time_series = None
    time_series_set = None

    # variables from command-line options
    resume = args.resume_
    time_series_ids = args.time_series_ids_
    output = args.predictions
    # there's only one time_series to be generated at present
    args.max_parallel_time_series = 1
    args.max_parallel_evaluations = 1
    # time_series cannot be published yet.
    args.public_time_series = False
    # no cross-validations
    args.dataset_off = False
    args.cross_validation_rate = 0
    args.number_of_evaluations = 1

    # It is compulsory to have a description to publish either datasets or
    # time_series
    if (not args.description_ and
            (args.public_time_series or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)
        # if the time series is going to be evaluated, and we don't have
        # test data, we need to divide the rows using ranges, so we'll need
        # max rows
        args.max_rows = datasets[0]["object"]["rows"]

    if args.time_series_file:
        # time-series is retrieved from the contents of the given local
        # JSON file
        time_series, csv_properties, fields = u.read_local_resource(
            args.time_series_file,
            csv_properties=csv_properties)
        time_series_set = [time_series]
        time_series_ids = [time_series['resource']]
    else:
        # time-series is retrieved from the remote object
        time_series_set, time_series_ids, resume = \
            pts.time_series_processing(
                datasets, time_series_set, time_series_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
        if time_series_set:
            time_series = time_series_set[0]

    # We update the time-series' public state if needed
    if time_series:
        if isinstance(time_series, basestring):
            query_string = r.ALL_FIELDS_QS
            time_series = u.check_resource(time_series,
                                           api.get_time_series,
                                           query_string=query_string)
        time_series_set[0] = time_series
        if (args.public_time_series or
                (args.shared_flag and
                 r.shared_changed(args.shared, time_series))):
            time_series_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     time_series):
                time_series_args.update(shared=args.shared)
            if args.public_time_series:
                time_series_args.update(
                    r.set_publish_time_series_args(args))
            if time_series_args:
                time_series = r.update_time_series(
                    time_series, time_series_args, args,
                    api=api, path=path,
                    session_file=session_file)
                time_series_set[0] = time_series

    """
    # We get the fields of the time-series if we haven't got
    # them yet and need them
    if time_series and (args.test_set or args.export_fields):
        fields = pts.get_time_series_fields(
            time_series, csv_properties, args)
    """

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If forecasting
    if time_series_set and a.has_ts_test(args):
        if args.remote:
            forecast_args = r.set_forecast_args(
                args, fields=fields)

            remote_forecast(time_series, forecast_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

        else:
            forecast(time_series, args, session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(time_series_set, args.test_dataset_ids,
                              api, args, resume, fields=fields,
                              dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            else:
                args.range_ = [int(args.max_rows * r.EVALUATE_SAMPLE_RATE),
                               args.max_rows]
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(time_series_set, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

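When the time series is evaluated without explicit test data, the training rows are split by range rather than by a random sample. A worked example of the `args.range_` computation above; the 0.8 value for `r.EVALUATE_SAMPLE_RATE` is an assumption for illustration only:

EVALUATE_SAMPLE_RATE = 0.8  # assumed value, for illustration only
max_rows = 1000             # rows in the training dataset
# rows up to 800 train the time series; rows 800..1000 are held out
evaluation_range = [int(max_rows * EVALUATE_SAMPLE_RATE), max_rows]
# evaluation_range == [800, 1000]
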
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    """
    association = None
    associations = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    association_ids = args.association_ids_
    output = args.predictions
    # there's only one association resource to be generated at present
    args.max_parallel_associations = 1
    # associations cannot be published yet.
    args.public_association = False

    # It is compulsory to have a description to publish either datasets or
    # associations
    if (not args.description_ and
            (args.public_association or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.association_file:
        # association is retrieved from the contents of the given local JSON
        # file
        association, csv_properties, fields = u.read_local_resource(
            args.association_file,
            csv_properties=csv_properties)
        associations = [association]
        association_ids = [association['resource']]
    else:
        # association is retrieved from the remote object
        associations, association_ids, resume = pa.associations_processing(
            datasets, associations, association_ids, api, args, resume,
            fields=fields,
            session_file=session_file, path=path, log=log)
        if associations:
            association = associations[0]

    # We update the association's public state if needed
    if association:
        if isinstance(association, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            association = u.check_resource(association, api.get_association,
                                           query_string=query_string)
        associations[0] = association
        if (args.public_association or
                (args.shared_flag and
                 r.shared_changed(args.shared, association))):
            association_args = {}
            if args.shared_flag and \
                    r.shared_changed(args.shared, association):
                association_args.update(shared=args.shared)
            if args.public_association:
                association_args.update(
                    ras.set_publish_association_args(args))
            if association_args:
                association = ras.update_association(
                    association, association_args, args,
                    api=api, path=path,
                    session_file=session_file)
                associations[0] = association

    # We get the fields of the association if we haven't got
    # them yet and need them
    if association and args.test_set:
        fields = pa.get_association_fields(association, csv_properties, args)

    # If predicting
    if associations and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote association sets would be computed as batch association
        # sets in bigml.com except when the --no-batch flag is set, but
        # they are not supported yet
        if args.remote and not args.no_batch:
            sys.exit("Batch association sets are currently not supported.")
        else:
            sys.exit("Local prediction of association sets is currently"
                     " not supported.")

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    """
    association = None
    associations = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    association_ids = args.association_ids_
    output = args.predictions
    # there's only one association resource to be generated at present
    args.max_parallel_associations = 1
    # associations cannot be published yet.
    args.public_association = False

    # It is compulsory to have a description to publish either datasets or
    # associations
    if (not args.description_ and
            (args.public_association or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.association_file:
        # association is retrieved from the contents of the given local JSON
        # file
        association, csv_properties, fields = u.read_local_resource(
            args.association_file,
            csv_properties=csv_properties)
        associations = [association]
        association_ids = [association['resource']]
    else:
        # association is retrieved from the remote object
        associations, association_ids, resume = pa.associations_processing(
            datasets, associations, association_ids, api, args, resume,
            fields=fields,
            session_file=session_file, path=path, log=log)
        if associations:
            association = associations[0]

    # We update the association's public state if needed
    if association:
        if isinstance(association, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            association = u.check_resource(association, api.get_association,
                                           query_string=query_string)
        associations[0] = association
        if (args.public_association or
                (args.shared_flag and
                 r.shared_changed(args.shared, association))):
            association_args = {}
            if args.shared_flag and \
                    r.shared_changed(args.shared, association):
                association_args.update(shared=args.shared)
            if args.public_association:
                association_args.update(r.set_publish_association_args(args))
            if association_args:
                association = r.update_association(
                    association, association_args, args,
                    api=api, path=path,
                    session_file=session_file)
                associations[0] = association

    # We get the fields of the association if we haven't got
    # them yet and need them
    if association and args.test_set:
        fields = pa.get_association_fields(association, csv_properties, args)

    # If predicting
    if associations and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote association sets would be computed as batch association
        # sets in bigml.com except when the --no-batch flag is set, but
        # they are not supported yet
        if args.remote and not args.no_batch:
            sys.exit("Batch association sets are currently not supported.")
            """
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_association_args = r.set_batch_association_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_association(
                association, test_dataset, batch_association_args,
                args, api, resume, prediction_file=output,
                session_file=session_file, path=path, log=log)
            """
        else:
            sys.exit("Local prediction of association sets is currently"
                     " not supported.")
            """
            association_set(associations, fields, args,
                            session_file=session_file)
            """

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates a sample based on a `train_set`, source or dataset.

    """

    sample = None
    samples = None
    # variables from command-line options
    resume = args.resume_
    sample_ids = args.sample_ids_
    output = args.predictions
    # there's only one sample to be generated at present
    args.max_parallel_samples = 1
    # samples cannot be published yet.
    args.public_sample = False

    # It is compulsory to have a description to publish either datasets or
    # samples
    if (not args.description_ and
            (args.public_sample or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-sample step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-sample step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, _,
     resume, csv_properties, fields) = dataset_properties

    if args.sample_file:
        # sample is retrieved from the contents of the given local JSON file
        sample, csv_properties, fields = u.read_local_resource(
            args.sample_file,
            csv_properties=csv_properties)
        samples = [sample]
        sample_ids = [sample['resource']]
    else:
        # sample is retrieved from the remote object
        samples, sample_ids, resume = psa.samples_processing(
            datasets, samples, sample_ids, api, args, resume,
            session_file=session_file, path=path, log=log)
        if samples:
            sample = samples[0]

    # We update the sample's public state if needed
    if sample:
        if isinstance(sample, basestring):
            # build the query string from the sample options
            sample = u.check_resource(sample, api.get_sample)
        samples[0] = sample
        if (args.public_sample or
                (args.shared_flag and r.shared_changed(args.shared, sample))):
            sample_args = {}
            if args.shared_flag and r.shared_changed(args.shared, sample):
                sample_args.update(shared=args.shared)
            if args.public_sample:
                sample_args.update(r.set_publish_sample_args(args))
            if sample_args:
                sample = r.update_sample(sample, sample_args, args,
                                         api=api, path=path,
                                         session_file=session_file)
                samples[0] = sample

    # We get the fields of the sample if we haven't got
    # them yet and need them
    if sample and psa.needs_sample_fields(args):
        fields = psa.get_sample_fields(sample, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    sample_file(samples[0], fields, args, api, path=path,
                session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

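Both sample variants (and most of the other flows here) can end by exporting a fields summary. A minimal sketch of the bindings call involved, assuming `dataset` holds a previously retrieved dataset resource:

from bigml.fields import Fields

# hypothetical: `dataset` is a dataset resource dict fetched earlier
fields = Fields(dataset)
# writes one row per field (name, type, counts, etc.) to a CSV file
fields.summary_csv('fields_summary.csv')
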
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models
        to make predictions for the `test_set`.

    """

    logistic_regression = None
    logistic_regressions = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    logistic_regression_ids = args.logistic_regression_ids_
    output = args.predictions
    # there's only one logistic regression to be generated at present
    args.max_parallel_logistic_regressions = 1
    # logistic regressions cannot be published yet.
    args.public_logistic_regression = False

    # It is compulsory to have a description to publish either datasets or
    # logistic regressions
    if (not args.description_ and
            (args.public_logistic_regression or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    if args.logistic_file:
        # logistic regression is retrieved from the contents of the given
        # local JSON file
        logistic_regression, csv_properties, fields = u.read_local_resource(
            args.logistic_file,
            csv_properties=csv_properties)
        logistic_regressions = [logistic_regression]
        logistic_regression_ids = [logistic_regression['resource']]
    else:
        # logistic regression is retrieved from the remote object
        logistic_regressions, logistic_regression_ids, resume = \
            plr.logistic_regressions_processing(
                datasets, logistic_regressions, logistic_regression_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
        if logistic_regressions:
            logistic_regression = logistic_regressions[0]

    # We update the logistic regression's public state if needed
    if logistic_regression:
        if isinstance(logistic_regression, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            logistic_regression = u.check_resource(
                logistic_regression, api.get_logistic_regression,
                query_string=query_string)
        logistic_regressions[0] = logistic_regression
        if (args.public_logistic_regression or
                (args.shared_flag and
                 r.shared_changed(args.shared, logistic_regression))):
            logistic_regression_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     logistic_regression):
                logistic_regression_args.update(shared=args.shared)
            if args.public_logistic_regression:
                logistic_regression_args.update(
                    r.set_publish_logistic_regression_args(args))
            if logistic_regression_args:
                logistic_regression = r.update_logistic_regression(
                    logistic_regression, logistic_regression_args, args,
                    api=api, path=path,
                    session_file=session_file)
                logistic_regressions[0] = logistic_regression

    # We get the fields of the logistic_regression if we haven't got
    # them yet and need them
    if logistic_regression and (args.test_set or args.export_fields):
        fields = plr.get_logistic_fields(
            logistic_regression, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if logistic_regressions and (a.has_test(args) or
                                 (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_lr_prediction(logistic_regression, test_dataset,
                                 batch_prediction_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)

        else:
            lr_prediction(logistic_regressions, fields, args,
                          session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(logistic_regressions, args.test_dataset_ids,
                              api, args, resume, fields=fields,
                              dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log,
                              objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """
    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and
            (args.public_anomaly or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields,
            session_file=session_file, path=path, log=log)
        if anomalies:
            anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and
                 r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and (args.test_set or args.export_fields):
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies"
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args.anomaly_filter_ = local_anomaly.anomalies_filter(
            include=include)
        _, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']

    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)

        else:
            anomaly_score(anomalies, fields, args,
                          session_file=session_file)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)

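The `anomalies_dataset` branch above leans on the `Anomaly` class from the bigml Python bindings, whose `anomalies_filter` method builds a row filter selecting (or excluding) the detector's top anomalies. A sketch of that call, with `anomaly_resource` standing in for a fully retrieved anomaly detector:

from bigml.anomaly import Anomaly

# hypothetical: `anomaly_resource` is an anomaly detector retrieved with
# its full fields and top anomalies information
local_anomaly = Anomaly(anomaly_resource)
# include=True keeps only the top anomalous rows; include=False drops them
anomalies_filter = local_anomaly.anomalies_filter(include=True)
# bigmler stores this in args.anomaly_filter_ and hands it to
# pd.create_new_dataset to build the filtered dataset
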
def compute_output(api, args):
    """ Creates one or more pcas using the `training_set`
        or uses the ids of previously created BigML pcas to make
        projections for the `test_set`.

    """

    pca = None

    # variables from command-line options
    resume = args.resume_
    pca_ids = args.pca_ids_
    output = args.projections
    # there's only one pca to be generated at present
    args.max_parallel_pcas = 1
    # pca cannot be published yet.
    args.public_pca = False

    # It is compulsory to have a description to publish either datasets or
    # pcas
    if (not args.description_ and (args.public_pca or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.pca_file:
        # pca is retrieved from the contents of the given local JSON file
        pca, csv_properties, fields = u.read_local_resource(
            args.pca_file,
            csv_properties=csv_properties)
        pca_ids = [pca]
    else:
        # pca is retrieved from the remote object or created
        pca, resume = pc.pca_processing(
            datasets, pca, pca_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # We update the pca public state if needed
    if pca:
        if isinstance(pca, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif args.export_fields:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = ''
            pca = u.check_resource(pca, api.get_pca,
                                   query_string=query_string)
        if (args.public_pca or
                (args.shared_flag and
                 r.shared_changed(args.shared, pca))):
            pca_args = {}
            if args.shared_flag and r.shared_changed(args.shared, pca):
                pca_args.update(shared=args.shared)
            if args.public_pca:
                pca_args.update(r.set_publish_pca_args(args))
            if pca_args:
                pca = r.update_pca(
                    pca, pca_args, args,
                    api=api, path=path,
                    session_file=session_file)

    # We get the fields of the pca if we haven't got
    # them yet and need them
    if pca and (args.test_set or args.export_fields):
        fields = pc.get_pca_fields(pca, csv_properties, args)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If predicting
    if pca and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote projections: projections are computed as batch projections
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_projection_args = r.set_batch_projection_args(
                args, fields=fields, dataset_fields=test_fields)
            remote_projection(pca, test_dataset,
                              batch_projection_args, args,
                              api, resume, projection_file=output,
                              session_file=session_file, path=path, log=log)
        else:
            projection(pca, fields, args, session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
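
# Local projections (the non --remote branch above) can also be computed
# directly with the BigML Python bindings. A minimal sketch, assuming the
# bindings' PCA class, environment credentials, and a hypothetical pca id
# and input row:
def _example_local_projection():
    """Sketch: compute a projection locally for one input row."""
    from bigml.pca import PCA

    # downloads and caches the pca resource on first use
    local_pca = PCA("pca/5126965515526876630001b5")
    return local_pca.projection({"petal length": 4.2, "sepal width": 3.1})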
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML anomaly detectors to
        make anomaly scores for the `test_set`.

    """

    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and (args.public_anomaly or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (dataset, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and
                 r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and args.test_set:
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     "dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args.anomaly_filter_ = local_anomaly.anomalies_filter(include=include)
        new_dataset, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']

    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields, dataset_fields=test_fields)
            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args, api,
                                 resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            anomaly_score(anomalies, fields, args, session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
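
# The local scoring branch above (anomaly_score) ultimately relies on the
# bindings' Anomaly class. A minimal sketch, assuming environment
# credentials and a hypothetical detector id and input row:
def _example_local_anomaly_score():
    """Sketch: score one input row against a local anomaly detector."""
    from bigml.anomaly import Anomaly

    local_anomaly = Anomaly("anomaly/5126965515526876630001b2")
    # returns a score in [0, 1]; higher means more anomalous
    return local_anomaly.anomaly_score({"src_bytes": 350})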
def compute_output(api, args):
    """ Creates one or more time series using the `training_set`
        or uses the ids of previously created BigML time series to make
        forecasts for the `test_set`.

    """

    time_series = None
    time_series_set = None

    # variables from command-line options
    resume = args.resume_
    time_series_ids = args.time_series_ids_
    output = args.predictions
    # there's only one time_series to be generated at present
    args.max_parallel_time_series = 1
    args.max_parallel_evaluations = 1
    # time_series cannot be published yet.
    args.public_time_series = False
    # no cross-validations
    args.dataset_off = False
    args.cross_validation_rate = 0
    args.number_of_evaluations = 1

    # It is compulsory to have a description to publish either datasets or
    # time_series
    if (not args.description_ and (args.public_time_series or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if datasets:
        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)
        # if the time series is going to be evaluated, and we don't have
        # test data, we need to divide the rows using ranges, so we'll need
        # max rows
        args.max_rows = datasets[0]["object"]["rows"]

    if args.time_series_file:
        # time-series is retrieved from the contents of the given local
        # JSON file
        time_series, csv_properties, fields = u.read_local_resource(
            args.time_series_file,
            csv_properties=csv_properties)
        time_series_set = [time_series]
        time_series_ids = [time_series['resource']]
    else:
        # time-series is retrieved from the remote object
        time_series_set, time_series_ids, resume = \
            pts.time_series_processing(
                datasets, time_series_set, time_series_ids,
                api, args, resume, fields=fields,
                session_file=session_file, path=path, log=log)
    if time_series_set:
        time_series = time_series_set[0]

    # We update the time-series' public state if needed
    if time_series:
        if isinstance(time_series, basestring):
            query_string = r.ALL_FIELDS_QS
            time_series = u.check_resource(time_series,
                                           api.get_time_series,
                                           query_string=query_string)
            time_series_set[0] = time_series
        if (args.public_time_series or
                (args.shared_flag and
                 r.shared_changed(args.shared, time_series))):
            time_series_args = {}
            if args.shared_flag and r.shared_changed(args.shared,
                                                     time_series):
                time_series_args.update(shared=args.shared)
            if args.public_time_series:
                time_series_args.update(
                    r.set_publish_time_series_args(args))
            if time_series_args:
                time_series = r.update_time_series(
                    time_series, time_series_args, args,
                    api=api, path=path,
                    session_file=session_file)
                time_series_set[0] = time_series

    """
    # We get the fields of the time-series if we haven't got
    # them yet and need them
    if time_series and (args.test_set or args.export_fields):
        fields = pts.get_time_series_fields(
            time_series, csv_properties, args)
    """

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    # If forecasting
    if time_series_set and a.has_ts_test(args):
        if args.remote:
            forecast_args = r.set_forecast_args(args, fields=fields)
            remote_forecast(time_series, forecast_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)
        else:
            forecast(time_series, args, session_file=session_file)

    # If evaluate flag is on, create remote evaluation and save results in
    # json and human-readable format.
    if args.evaluate:
        # When we resume evaluation and models were already completed, we
        # should use the datasets array as test datasets
        if args.has_test_datasets_:
            test_dataset = get_test_dataset(args)
        if args.dataset_off and not args.has_test_datasets_:
            args.test_dataset_ids = datasets
        if args.test_dataset_ids and args.dataset_off:
            # Evaluate the models with the corresponding test datasets.
            test_dataset_id = bigml.api.get_dataset_id(
                args.test_dataset_ids[0])
            test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            resume = evaluate(time_series_set, args.test_dataset_ids,
                              api, args, resume,
                              fields=fields, dataset_fields=test_fields,
                              session_file=session_file, path=path,
                              log=log, objective_field=args.objective_field)
        else:
            dataset = datasets[0]
            if args.test_split > 0 or args.has_test_datasets_:
                dataset = test_dataset
            else:
                args.range_ = [int(args.max_rows * r.EVALUATE_SAMPLE_RATE),
                               args.max_rows]
            dataset = u.check_resource(dataset, api=api,
                                       query_string=r.ALL_FIELDS_QS)
            dataset_fields = pd.get_fields_structure(dataset, None)
            resume = evaluate(time_series_set, [dataset], api,
                              args, resume,
                              fields=fields, dataset_fields=dataset_fields,
                              session_file=session_file, path=path,
                              log=log, objective_field=args.objective_field)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
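
# Forecasts in the local branch above come from the bindings' TimeSeries
# class. A minimal sketch, assuming environment credentials, a hypothetical
# time series id, and an objective field named "Final":
def _example_local_forecast():
    """Sketch: produce a 10-point forecast for one objective field."""
    from bigml.timeseries import TimeSeries

    local_time_series = TimeSeries("timeseries/5126965515526876630001b7")
    # keys are objective field names or ids; "horizon" is the number of
    # future points to predict
    return local_time_series.forecast({"Final": {"horizon": 10}})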
def compute_output(api, args):
    """ Creates one or more topic models using the `training_set`
        or uses the ids of previously created BigML topic models to make
        topic distributions for the `test_set`.

    """

    topic_model = None
    topic_models = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    topic_model_ids = args.topic_model_ids_
    output = args.predictions
    # there's only one topic model resource to be generated at present
    args.max_parallel_topic_models = 1
    # topic models cannot be published yet.
    args.public_topic_model = False

    # It is compulsory to have a description to publish either datasets or
    # topic models
    if (not args.description_ and (args.public_topic_model or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.topic_model_file:
        # topic model is retrieved from the contents of the given local JSON
        # file
        topic_model, csv_properties, fields = u.read_local_resource(
            args.topic_model_file,
            csv_properties=csv_properties)
        topic_models = [topic_model]
        topic_model_ids = [topic_model['resource']]
    else:
        # topic model is retrieved from the remote object
        topic_models, topic_model_ids, resume = pt.topic_model_processing(
            datasets, topic_models, topic_model_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if topic_models:
        topic_model = topic_models[0]

    # We update the topic model's public state if needed
    if topic_model:
        if isinstance(topic_model, basestring):
            if not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            topic_model = u.check_resource(topic_model, api.get_topic_model,
                                           query_string=query_string)
            topic_models[0] = topic_model
        if (args.public_topic_model or
                (args.shared_flag and
                 r.shared_changed(args.shared, topic_model))):
            topic_model_args = {}
            if args.shared_flag and \
                    r.shared_changed(args.shared, topic_model):
                topic_model_args.update(shared=args.shared)
            if args.public_topic_model:
                topic_model_args.update(r.set_publish_topic_model_args(args))
            if topic_model_args:
                topic_model = r.update_topic_model(
                    topic_model, topic_model_args, args,
                    api=api, path=path, session_file=session_file)
                topic_models[0] = topic_model

    # We get the fields of the topic model if we haven't got
    # them yet and need them
    if topic_model and args.test_set:
        csv_properties.update({'objective_field_present': False,
                               'objective_field': None})
        fields = pt.get_topic_model_fields(topic_model, csv_properties,
                                           args)

    # If predicting
    if topic_models and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote topic distributions: topic distributions are computed as
        # batch topic distributions in bigml.com except when --no-batch
        # flag is set.
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_topic_distribution_args = \
                r.set_batch_topic_distribution_args(
                    args, fields=fields,
                    dataset_fields=test_fields)
            remote_topic_distribution(
                topic_model, test_dataset, batch_topic_distribution_args,
                args, api, resume, prediction_file=output,
                session_file=session_file, path=path, log=log)
        else:
            topic_distribution(topic_models, fields, args,
                               session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
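
# The local branch above (topic_distribution) maps to the bindings'
# TopicModel class. A minimal sketch, assuming environment credentials, a
# hypothetical topic model id, and a text field named "Message":
def _example_local_topic_distribution():
    """Sketch: compute the topic distribution for one text input."""
    from bigml.topicmodel import TopicModel

    local_topic_model = TopicModel("topicmodel/5126965515526876630001b9")
    # returns one {"name": ..., "probability": ...} entry per topic
    return local_topic_model.distribution({"Message": "Mobile offers"})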