def alternative_dataset_processing(
    dataset_or_source, suffix, dataset_args, api, args, resume,
    session_file=None, path=None, log=None
):
    """Creates a dataset. Used in splits to generate train and test datasets

    """
    alternative_dataset = None
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, alternative_dataset = c.checkpoint(
            c.is_dataset_created,
            path,
            "_%s" % suffix,
            debug=args.debug,
            message=message,
            log_file=session_file,
            console=args.verbosity,
        )
    if alternative_dataset is None:
        alternative_dataset = r.create_dataset(
            dataset_or_source, dataset_args, args, api, path, session_file,
            log, suffix
        )
        if alternative_dataset:
            alternative_dataset = r.get_dataset(
                alternative_dataset, api, args.verbosity, session_file)
    return alternative_dataset, resume
def alternative_dataset_processing(dataset_or_source, suffix, dataset_args,
                                   api, args, resume,
                                   session_file=None, path=None, log=None):
    """Creates a dataset. Used in splits to generate train and test datasets

    """
    alternative_dataset = None
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, alternative_dataset = c.checkpoint(
            c.is_dataset_created, path, "_%s" % suffix, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)
    if alternative_dataset is None:
        alternative_dataset = r.create_dataset(
            dataset_or_source, dataset_args, args, api, path, session_file,
            log, suffix)
        if alternative_dataset:
            alternative_dataset = r.get_dataset(
                alternative_dataset, api, args.verbosity, session_file)
    return alternative_dataset, resume
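# Illustrative sketch (not part of bigmler): the resume pattern used by the
# processing helpers above, reduced to plain callables. `recover_from_log` and
# `create_remotely` are hypothetical stand-ins for c.checkpoint and
# r.create_dataset; only the control flow is meant to match.
def _resume_or_create(resume, recover_from_log, create_remotely):
    resource = None
    if resume:
        # a checkpoint either yields the already-created resource id or
        # clears the resume flag so that creation happens below
        resume, resource = recover_from_log()
    if resource is None:
        resource = create_remotely()
    return resource, resume


# the log already holds a dataset id: no new dataset is created
print(_resume_or_create(
    True, lambda: (True, "dataset/0123456789abcdef01234567"),
    lambda: "dataset/new"))
# nothing recoverable: the creation branch runs
print(_resume_or_create(
    False, lambda: (False, None), lambda: "dataset/new"))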
def split_processing(dataset, name, description, api, args, resume,
                     session_file=None, path=None, log=None):
    """Splits a dataset into train and test datasets

    """
    train_dataset = None
    test_dataset = None
    sample_rate = 1 - args.test_split
    # if resuming, try to extract train dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, train_dataset = c.checkpoint(
            c.is_dataset_created, path, "_train", debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)

    if train_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - train (%s %%)" % (name, int(sample_rate * 100)),
            description, args, sample_rate, out_of_bag=False)
        train_dataset = r.create_dataset(
            dataset, dataset_split_args, args, api, path, session_file,
            log, "train")
        if train_dataset:
            train_dataset = r.get_dataset(train_dataset, api, args.verbosity,
                                          session_file)

    # if resuming, try to extract test dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, test_dataset = c.checkpoint(
            c.is_dataset_created, path, "_test", debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)

    if test_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - test (%s %%)" % (name, int(args.test_split * 100)),
            description, args, sample_rate, out_of_bag=True)
        test_dataset = r.create_dataset(
            dataset, dataset_split_args, args, api, path, session_file,
            log, "test")
        if test_dataset:
            test_dataset = r.get_dataset(test_dataset, api, args.verbosity,
                                         session_file)
    return train_dataset, test_dataset, resume
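# Illustrative sketch (not part of bigmler): how split_processing derives the
# complementary sample rates and dataset names from --test-split. The train
# split samples `1 - test_split` of the rows; the test split reuses the same
# rate with out_of_bag=True, so the two splits are complementary.
def _split_names(name, test_split):
    sample_rate = 1 - test_split
    train_name = "%s - train (%s %%)" % (name, int(sample_rate * 100))
    test_name = "%s - test (%s %%)" % (name, int(test_split * 100))
    return sample_rate, train_name, test_name


print(_split_names("iris", 0.2))
# (0.8, 'iris - train (80 %)', 'iris - test (20 %)')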
def dataset_processing(source, training_set, test_set, model_ids, name,
                       description, fields, dataset_fields, api, args,
                       resume, csv_properties=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids
            and not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)
    return dataset, resume, csv_properties, fields
def create_new_dataset(datasets, api, args, resume, fields=None,
                       session_file=None, path=None, log=None):
    """Generates a new dataset using the generators given in a generators
       file or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug, suffix=suffix)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        dataset_args = r.set_dataset_args(args, fields)
        if args.multi_dataset and args.multi_dataset_json:
            dataset_args.update(args.multi_dataset_json)
        elif hasattr(args, 'anomalies_dataset') and args.anomalies_dataset:
            dataset_args.update({'lisp_filter': args.anomaly_filter_})
        elif hasattr(args, 'lisp_filter') and args.lisp_filter:
            dataset_args.update({'lisp_filter': args.lisp_filter})
        elif hasattr(args, 'json_filter') and args.json_filter:
            dataset_args.update({'json_filter': args.json_filter})
        else:
            dataset_args.update(args.dataset_json_generators)
        new_dataset = r.create_dataset(origin_resource, dataset_args,
                                       args, api=api, path=path,
                                       session_file=session_file,
                                       log=log, dataset_type=suffix)
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
def create_categories_datasets(dataset, distribution,
                               fields, args, api, resume,
                               session_file=None, path=None, log=None,
                               other_label=OTHER):
    """Generates a new dataset using a subset of categories of the original one

    """
    if args.max_categories < 1:
        sys.exit("--max-categories can only be a positive number.")
    datasets = []
    categories_splits = [distribution[i: i + args.max_categories] for i
                         in range(0, len(distribution), args.max_categories)]
    number_of_datasets = len(categories_splits)

    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        for i in range(len(datasets), number_of_datasets):
            split = categories_splits[i]
            category_selector = "(if (or"
            for element in split:
                category = element[0]
                category_selector += " (= v \"%s\")" % category
            category_selector += ") v \"%s\")" % other_label
            category_generator = "(let (v (f %s)) %s)" % (
                fields.objective_field, category_selector)
            try:
                dataset_args = {
                    "all_but": [fields.objective_field],
                    "new_fields": [
                        {"name": fields.field_name(fields.objective_field),
                         "field": category_generator,
                         "label": "max_categories: %s" % args.max_categories}],
                    "user_metadata":
                    {"max_categories": args.max_categories,
                     "other_label": other_label}}
            except ValueError as exc:
                sys.exit(exc)
            new_dataset = r.create_dataset(
                dataset, dataset_args, args, api=api, path=path,
                session_file=session_file, log=log, dataset_type="parts")
            new_dataset = bigml.api.check_resource(new_dataset,
                                                   api.get_dataset)
            datasets.append(new_dataset)
    return datasets, resume
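# Illustrative sketch (not part of bigmler): the Flatline expression built by
# create_categories_datasets for one categories split. Categories in the split
# keep their value; any other category is relabelled as other_label. The field
# id "000004" and the "other" label below are made up for the example.
def _category_generator(objective_field_id, split, other_label="other"):
    category_selector = "(if (or"
    for element in split:
        category = element[0]
        category_selector += " (= v \"%s\")" % category
    category_selector += ") v \"%s\")" % other_label
    return "(let (v (f %s)) %s)" % (objective_field_id, category_selector)


# distribution entries are (category, instance count) pairs
print(_category_generator("000004",
                          [("Iris-setosa", 50), ("Iris-versicolor", 50)]))
# -> (let (v (f 000004)) (if (or (= v "Iris-setosa") (= v "Iris-versicolor")) v "other"))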
def create_new_dataset(
    datasets, api, args, resume,
    name=None, description=None, fields=None,
    dataset_fields=None, objective_field=None,
    session_file=None, path=None, log=None,
):
    """Generates a new dataset using the generators given in a generators
       file or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug, suffix=suffix
        )
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        dataset_args = r.set_dataset_args(
            name, description, args, fields, dataset_fields,
            objective_field=objective_field
        )
        if args.multi_dataset and args.multi_dataset_json:
            dataset_args.update(args.multi_dataset_json)
        else:
            dataset_args.update(args.dataset_json_generators)
        new_dataset = r.create_dataset(
            origin_resource, dataset_args, args, api=api, path=path,
            session_file=session_file, log=log, dataset_type=suffix,
        )
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
def create_new_dataset(datasets, api, args, resume,
                       name=None, description=None, fields=None,
                       dataset_fields=None, objective_field=None,
                       session_file=None, path=None, log=None):
    """Generates a new dataset using the generators given in a generators
       file or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug, suffix=suffix)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        if args.multi_dataset:
            dataset_args = r.set_dataset_args(
                name, description, args, fields, dataset_fields,
                objective_field=objective_field)
            if args.multi_dataset_json:
                dataset_args.update(args.multi_dataset_json)
        else:
            dataset_args = {}
            dataset_args.update(args.dataset_json_generators)
            dataset_args.update(r.set_dataset_args(
                name, description, args, fields, dataset_fields,
                objective_field=objective_field))
        new_dataset = r.create_dataset(origin_resource, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file,
                                       log=log, dataset_type=suffix)
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
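# Illustrative sketch (not part of bigmler): the argument-merging order used in
# the "gen" branch above. dict.update lets the last update win, so keys set by
# the standard dataset args override keys repeated in the generators file. The
# dicts below are made-up stand-ins for args.dataset_json_generators and the
# r.set_dataset_args output.
dataset_json_generators = {"name": "from generators file",
                           "new_fields": [{"name": "ratio",
                                           "field": "(/ (f 0) (f 1))"}]}
standard_dataset_args = {"name": "my dataset", "tags": ["bigmler"]}

dataset_args = {}
dataset_args.update(dataset_json_generators)
dataset_args.update(standard_dataset_args)
print(dataset_args["name"])  # my dataset: the standard args take precedence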
def dataset_processing(source, training_set, test_set, model_ids, name,
                       description, fields, dataset_fields, api, args,
                       resume, csv_properties=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids
            and not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {
                'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)
    return dataset, resume, csv_properties, fields
def dataset_processing(source, api, args, resume, fields=None,
                       csv_properties=None, multi_label_data=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or (
            hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
            and not args.no_dataset) or
            (hasattr(args, "evaluate") and args.evaluate and
             args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(args, fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            dataset_id = args.dataset_ids[i]
            if isinstance(dataset_id, dict) and "id" in dataset_id:
                dataset_id = dataset_id["id"]
            datasets.append(bigml.api.get_dataset_id(dataset_id))
        dataset = datasets[0]
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)
        fields = get_fields_structure(dataset, csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)
        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if
        #  the flag --dataset_attributes is used
        #  the --multi-label flag is used and there's an --objective-field
        #  the --max-categories flag is used and there's an --objective-field
        #  the --import-fields flag is used
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """
    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_cluster or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(api, args, resume, source,
                                              csv_properties, fields,
                                              session_file, path, log)
    (_, datasets, test_dataset, resume,
     csv_properties, fields) = dataset_properties
    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file,
            csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and
                 r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and (args.test_set or args.export_fields):
        if isinstance(cluster, dict):
            cluster = cluster['resource']
        cluster = u.check_resource(cluster, api.get_cluster,
                                   query_string=r.ALL_FIELDS_QS)
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        cluster_datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name
                            in args.cluster_datasets_
                            if cluster_datasets.get(centroids[cluster_name],
                                                    '') == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api, path=path,
                             session_file=session_file, log=log,
                             dataset_type='cluster')

    if cluster and args.cluster_models is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        models = cluster['object']['cluster_models']
        if args.cluster_models == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name
                            in args.cluster_models_
                            if models.get(centroids[cluster_name], '') == '']

        for centroid_id in centroid_ids:
            model_args = {'centroid': centroid_id}
            r.create_model(cluster, model_args, args, api=api, path=path,
                           session_file=session_file, log=log,
                           model_type='cluster')

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """
    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and
            (args.public_cluster or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset, resume,
     csv_properties, fields) = dataset_properties
    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file,
            csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and
                 r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and args.test_set:
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args, resume,
                    session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name
                            in args.cluster_datasets_
                            if datasets[centroids[cluster_name]] == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api, path=path,
                             session_file=session_file, log=log,
                             dataset_type='cluster')

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
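# Illustrative sketch (not part of bigmler): how the --cluster-datasets branch
# above picks the centroids whose per-cluster dataset is still missing. The
# cluster metadata below is made up.
centroids_info = [{"name": "Cluster 0", "id": "000000"},
                  {"name": "Cluster 1", "id": "000001"}]
cluster_datasets = {"000000": "dataset/0123456789abcdef01234567",
                    "000001": ""}
requested = ["Cluster 0", "Cluster 1"]

centroids = {centroid['name']: centroid['id']
             for centroid in centroids_info}
centroid_ids = [centroids[cluster_name] for cluster_name in requested
                if cluster_datasets[centroids[cluster_name]] == '']
print(centroid_ids)  # ['000001']: only the centroid without a dataset yet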
def split_processing(dataset, name, description, api, args, resume,
                     session_file=None, path=None, log=None):
    """Splits a dataset into train and test datasets

    """
    train_dataset = None
    test_dataset = None
    sample_rate = 1 - args.test_split
    # if resuming, try to extract train dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, train_dataset = c.checkpoint(
            c.is_dataset_created, path, "_train", debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)

    if train_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - train (%s %%)" % (name, int(sample_rate * 100)),
            description, args, sample_rate, out_of_bag=False)
        train_dataset = r.create_dataset(dataset, dataset_split_args, args,
                                         api, path, session_file, log,
                                         "train")
        if train_dataset:
            train_dataset = r.get_dataset(train_dataset, api, args.verbosity,
                                          session_file)

    # if resuming, try to extract test dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, test_dataset = c.checkpoint(
            c.is_dataset_created, path, "_test", debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)

    if test_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - test (%s %%)" % (name, int(args.test_split * 100)),
            description, args, sample_rate, out_of_bag=True)
        test_dataset = r.create_dataset(dataset, dataset_split_args, args,
                                        api, path, session_file, log, "test")
        if test_dataset:
            test_dataset = r.get_dataset(test_dataset, api, args.verbosity,
                                         session_file)
    return train_dataset, test_dataset, resume
def dataset_processing(source, training_set, test_set, fields,
                       objective_field, api, args, resume,
                       name=None, description=None, dataset_fields=None,
                       multi_label_data=None, csv_properties=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not has_datasets(args) and not has_models(args)
            and not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields,
                                          objective_field=objective_field,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)
    # If set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)
        fields = get_fields_structure(dataset, csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)
        new_objective = get_new_objective(fields, args.objective_field,
                                          dataset)
        if (new_objective is not None or args.dataset_attributes or
                r.shared_changed(args.shared, dataset)):
            dataset_args = r.set_dataset_args(name, description, args,
                                              fields, dataset_fields,
                                              objective_field)
            dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            csv_properties.update(objective_field=objective_field,
                                  objective_field_present=True)
            fields = Fields(dataset['object']['fields'], **csv_properties)
    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
def dataset_processing(
    source, training_set, test_set, fields, objective_field, api, args,
    resume, name=None, description=None, dataset_fields=None,
    multi_label_data=None, csv_properties=None,
    session_file=None, path=None, log=None,
):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if training_set or args.source or (args.evaluate and test_set):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created,
                path,
                debug=args.debug,
                message=message,
                log_file=session_file,
                console=args.verbosity,
            )

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if (source and not has_datasets(args) and not has_models(args)
            and not args.no_dataset) or (
            args.evaluate and args.test_set and not args.dataset):
        dataset_args = r.set_dataset_args(
            name,
            description,
            args,
            fields,
            dataset_fields,
            objective_field=objective_field,
            multi_label_data=multi_label_data,
        )
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)
    # If set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if (
            "object" in dataset
            and "objective_field" in dataset["object"]
            and "column_number" in dataset["object"]["objective_field"]
        ):
            dataset_objective = dataset[
                "object"]["objective_field"]["column_number"]
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)
        fields = get_fields_structure(dataset, csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)
        new_objective = get_new_objective(fields, args.objective_field,
                                          dataset)
        if (
            new_objective is not None
            or args.dataset_attributes
            or (args.shared_flag and r.shared_changed(args.shared, dataset))
        ):
            dataset_args = r.set_dataset_args(name, description, args,
                                              fields, dataset_fields,
                                              objective_field)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            csv_properties.update(objective_field=objective_field,
                                  objective_field_present=True)
            fields = Fields(dataset["object"]["fields"], **csv_properties)
    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
def dataset_processing(source, api, args, resume, fields=None,
                       csv_properties=None, multi_label_data=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or (
            hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
            and not args.no_dataset) or
            (hasattr(args, "evaluate") and args.evaluate and
             args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(args, fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)
    # If set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)
        fields = get_fields_structure(dataset, csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)
        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if
        #  the flag --dataset_attributes is used
        #  the --multi-label flag is used and there's an --objective-field
        #  the --max-categories flag is used and there's an --objective-field
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None,
                   description=None,
                   field_attributes=None,
                   types=None,
                   dataset_fields=None,
                   model_fields=None,
                   name=None, training_set_header=True,
                   test_set_header=True, model_ids=None,
                   votes_files=None, resume=False, fields_map=None):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and (args.black_box or args.white_box or
                             args.public_dataset)):
        raise Exception("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                open(log, 'w', 0).close()
            except IOError:
                pass

    # Starting source processing
    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from log files
        if resume:
            resume, args.source = u.checkpoint(u.is_source_created, path,
                                               debug=args.debug)
            if not resume:
                message = u.dated("Source not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)

    # If neither a previous source, dataset nor model is provided,
    # we create a new one. Also if --evaluate and test data are provided
    # we create a new dataset to test with.
    data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                 training_set_header,
                                                 test_set_header, args)
    if data_set is not None:
        source_args = r.set_source_args(data_set_header, name, description,
                                        args)
        source = r.create_source(data_set, source_args, args, api,
                                 path, session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'data_locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']

        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity,
                                            session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)
    # End of source processing

    # Starting dataset processing
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            resume, args.dataset = u.checkpoint(u.is_dataset_created, path,
                                                debug=args.debug)
            if not resume:
                message = u.dated("Dataset not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids
            and not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {
                'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)
    # end of dataset processing

    # start of model processing
    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set up.
    if (dataset and not args.model and not model_ids and not args.no_model):
        model_ids = []
        models = []
        if resume:
            resume, model_ids = u.checkpoint(u.are_models_created, path,
                                             args.number_of_models,
                                             debug=args.debug)
            if not resume:
                message = u.dated("Found %s models out of %s. Resuming.\n" %
                                  (len(model_ids), args.number_of_models))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            models = model_ids
            args.number_of_models -= len(model_ids)

        model_args = r.set_model_args(name, description, args,
                                      objective_field, fields, model_fields)
        models, model_ids = r.create_models(dataset, models, model_args,
                                            args, api, path, session_file,
                                            log)
        model = models[0]
    # If a model is provided, we use it.
    elif args.model:
        model = args.model
        model_ids = [model]
        models = [model]
    elif args.models or args.model_tag:
        models = model_ids[:]
        model = models[0]

    # If we are going to predict we must retrieve the models
    if model_ids and test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)
        model = models[0]

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        if not csv_properties:
            csv_properties = {}
        csv_properties.update(verbose=True)
        if args.user_locale is None:
            args.user_locale = model['object'].get('locale', None)
        csv_properties.update(data_locale=args.user_locale)
        if 'model_fields' in model['object']['model']:
            model_fields = model['object']['model']['model_fields'].keys()
            csv_properties.update(include=model_fields)
        if 'missing_tokens' in model['object']['model']:
            missing_tokens = model['object']['model']['missing_tokens']
        else:
            missing_tokens = MISSING_TOKENS
        csv_properties.update(missing_tokens=missing_tokens)
        objective_field = models[0]['object']['objective_fields']
        if isinstance(objective_field, list):
            objective_field = objective_field[0]
        csv_properties.update(objective_field=objective_field)
        fields = Fields(model['object']['model']['fields'], **csv_properties)
    # end of model processing

    # If predicting
    if models and test_set and not args.evaluate:
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args.remote, api, log,
                args.max_batch_models, args.method, resume, args.tag,
                args.verbosity, session_file, args.debug)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = api.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id,
                                                     str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.combine_votes(votes_files, local_model.to_prediction,
                        output, args.method)
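# Illustrative sketch (not part of bigmler): recovering the model id from a
# votes file name, as done in the block above. The regex keeps the
# "model_<hash>" chunk and the final replace turns it into the "model/<hash>"
# resource id used by the API. The file name below is made up.
import re

votes_file = "path/to/model_0123456789abcdef01234567__predictions.csv"
model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                  r'\1', votes_file).replace("_", "/")
print(model_id)  # model/0123456789abcdef01234567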