def dataset_processing(source, api, args, resume, fields=None,
                       csv_properties=None, multi_label_data=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or
            (hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
         and not args.no_dataset) or
            (hasattr(args, "evaluate") and args.evaluate and args.test_set
             and not args.dataset)):
        dataset_args = r.set_dataset_args(args, fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            dataset_id = args.dataset_ids[i]
            if isinstance(dataset_id, dict) and "id" in dataset_id:
                dataset_id = dataset_id["id"]
            datasets.append(bigml.api.get_dataset_id(dataset_id))
        dataset = datasets[0]
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None

        updated = False
        # We'll update the dataset if:
        #    the --dataset_attributes flag is used
        #    the --multi-label flag is used and there's an --objective-field
        #    the --max-categories flag is used and there's an --objective-field
        #    the --import-fields flag is used
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)

    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
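
# A minimal sketch of the check_dataset_update helper assumed by the two
# variants that call it, mirroring the conditions listed in the comment above
# (--dataset_attributes, --multi-label or --max-categories combined with an
# --objective-field, and --import-fields). The `args` attribute names are
# inferred from those flag names and are assumptions, not bigmler's actual
# implementation.
def check_dataset_update(args, dataset):
    """Checks whether the retrieved dataset should be updated."""
    return dataset is not None and (
        getattr(args, "dataset_attributes", None) or
        getattr(args, "import_fields", None) or
        ((getattr(args, "multi_label", False) or
          getattr(args, "max_categories", 0)) and
         getattr(args, "objective_field", None)))
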
def dataset_processing(
    source,
    training_set,
    test_set,
    fields,
    objective_field,
    api,
    args,
    resume,
    name=None,
    description=None,
    dataset_fields=None,
    multi_label_data=None,
    csv_properties=None,
    session_file=None,
    path=None,
    log=None,
):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if training_set or args.source or (args.evaluate and test_set):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created,
                path,
                debug=args.debug,
                message=message,
                log_file=session_file,
                console=args.verbosity,
            )

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if (source and not has_datasets(args) and not has_models(args) and not args.no_dataset) or (
        args.evaluate and args.test_set and not args.dataset
    ):
        dataset_args = r.set_dataset_args(
            name,
            description,
            args,
            fields,
            dataset_fields,
            objective_field=objective_field,
            multi_label_data=multi_label_data,
        )
        dataset = r.create_dataset(source, dataset_args, args, api, path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)
    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if (
            "object" in dataset
            and "objective_field" in dataset["object"]
            and "column_number" in dataset["object"]["objective_field"]
        ):
            dataset_objective = dataset["object"]["objective_field"]["column_number"]
            csv_properties.update(objective_field=dataset_objective, objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        new_objective = get_new_objective(fields, args.objective_field, dataset)

        if (
            new_objective is not None
            or args.dataset_attributes
            or (args.shared_flag and r.shared_changed(args.shared, dataset))
        ):
            dataset_args = r.set_dataset_args(name, description, args, fields, dataset_fields, objective_field)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args, api=api, path=path, session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            csv_properties.update(objective_field=objective_field, objective_field_present=True)
            fields = Fields(dataset["object"]["fields"], **csv_properties)

    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
def dataset_processing(source, api, args, resume, fields=None,
                       csv_properties=None, multi_label_data=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or (
            hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
         and not args.no_dataset) or
            (hasattr(args, "evaluate") and args.evaluate and args.test_set
             and not args.dataset)):
        dataset_args = r.set_dataset_args(args, fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)
    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None

        updated = False
        # We'll update the dataset if:
        #    the --dataset_attributes flag is used
        #    the --multi-label flag is used and there's an --objective-field
        #    the --max-categories flag is used and there's an --objective-field
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)

    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
def dataset_processing(source, training_set, test_set, fields,
                       objective_field, api, args, resume,
                       name=None, description=None, dataset_fields=None,
                       multi_label_data=None, csv_properties=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not has_datasets(args) and not has_models(args)
         and not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields,
                                          objective_field=objective_field,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)
    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        new_objective = get_new_objective(fields, args.objective_field,
                                          dataset)

        if (new_objective is not None or args.dataset_attributes or
                r.shared_changed(args.shared, dataset)):
            dataset_args = r.set_dataset_args(name, description, args, fields,
                                              dataset_fields, objective_field)
            dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            csv_properties.update(objective_field=objective_field,
                                  objective_field_present=True)
            fields = Fields(dataset['object']['fields'], **csv_properties)

    if not datasets:
        datasets = [dataset]
    else:
        datasets[0] = dataset
    return datasets, resume, csv_properties, fields
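
# Illustrative call of the variant above; a sketch only. The placeholder
# source id, file paths and `args` attributes are assumptions, and the
# bigmler helper modules referenced inside dataset_processing (r, u, c,
# Fields, get_fields_structure, ...) still need to be importable for the
# call to run end to end.
import argparse

from bigml.api import BigML

api = BigML()  # assumes BIGML_USERNAME and BIGML_API_KEY in the environment
source = "source/4f603fe203ce89bb2d000000"  # placeholder source id
args = argparse.Namespace(
    source=None, dataset=None, dataset_ids=[], no_dataset=False,
    evaluate=False, test_set=None, debug=False, verbosity=1,
    public_dataset=False, objective_field="species",
    dataset_attributes=None, shared=False)
datasets, resume, csv_properties, fields = dataset_processing(
    source, "data/iris.csv", None, None, "species", api, args, resume=False,
    name="iris", description="iris dataset", csv_properties={},
    session_file="session.log", path="./output", log="commands.log")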