def get_input_fields(resource, referrer=None):
    """New list of input fields"""
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        referrer_input_fields = [[]]
        # compare fields by name
        resource_fields = Fields(
            {'resource': resource['resource'], 'object': resource})
        referrer_fields = Fields(
            {'resource': referrer['resource'], 'object': referrer})
        input_fields = [resource_fields.field_name(field_id)
                        for field_id in input_fields_ids]
        input_fields = sorted(input_fields)
        referrer_type = get_resource_type(referrer)
        if referrer_type == 'dataset':
            referrer_fields = Fields(referrer_fields.preferred_fields())
            referrer_fields_names = sorted(
                [field['name'] for _, field in
                 referrer_fields.fields.items()])
        else:
            referrer_fields_names = sorted(
                referrer_fields.fields_by_name.keys())
        # check referrer input fields to see if they are equal
        referrer_input_fields.append(referrer_fields_names)
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            referrer_objective = resource_fields.field_name(objective_id)
            referrer_input_fields.append(
                [name for name in referrer_fields_names
                 if name != referrer_objective])
        if input_fields in referrer_input_fields:
            return []
        return referrer_fields.fields.keys()
def pca_processing(datasets, pca, pca_ids, api, args, resume, fields=None,
                   session_file=None, path=None, log=None):
    """Creates or retrieves a PCA from the input data"""
    # If we have a dataset but no PCA, we create one unless the --no-pca
    # flag has been set.
    if datasets and not (has_pca(args) or args.no_pca):
        pca_ids = []
        pcas = []
        # Only 1 pca per bigmler command at present
        number_of_pcas = 1
        if resume:
            resume, pca_ids = c.checkpoint(
                c.are_pcas_created, path, number_of_pcas, debug=args.debug)
            if not resume:
                message = u.dated("Found %s pcas out of %s."
                                  " Resuming.\n" % (len(pca_ids),
                                                    number_of_pcas))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                pcas = pca_ids
                number_of_pcas -= len(pca_ids)
        args.exclude_fields = []
        if args.exclude_objective:
            dataset = datasets[0]
            fields = Fields(dataset)
            objective_id = \
                fields.fields_by_column_number[fields.objective_field]
            args.exclude_fields = [objective_id]
        pca_args = r.set_pca_args(args, fields=fields,
                                  pca_fields=args.pca_fields_)
        pca = r.create_pca(datasets, pca, pca_args, args, api, path,
                           session_file, log)
    # If a PCA is provided, we use it.
    elif args.pca:
        pca_ids = [args.pca]
        pca = pca_ids[0]
    elif args.pca or args.pca_tag:
        pca = pca_ids[0]
    # If we are going to create projections, we must retrieve the PCA
    if pca_ids and (args.test_set or args.export_fields):
        pca = r.get_pca(pca, args, api, session_file)
    return pca, resume
def read_local_resource(path, csv_properties=None):
    """Read the JSON resource structure information from the given file."""
    resource = empty_resource()
    if csv_properties is None:
        csv_properties = {}
    fields = None
    open_mode = "rt" if PYTHON3 else "rb"
    with open(path, open_mode) as resource_file:
        try:
            resource = json.loads(resource_file.read())
        except IOError:
            pass
    resource_id = resource.get('resource')
    if resource_id is None:
        sys.exit("Failed to extract a BigML resource structure from the"
                 " contents of file %s." % path)
    if resource.get('object') is None:
        resource = {'resource': resource_id,
                    'object': resource,
                    'error': None,
                    'code': bigml.api.HTTP_OK}
    fields, resource_locale, missing_tokens = get_fields_structure(resource)
    if missing_tokens:
        csv_properties['missing_tokens'] = missing_tokens
    if resource_locale:
        csv_properties['data_locale'] = resource_locale
    if fields:
        fields = Fields(resource, **csv_properties)
    return resource, csv_properties, fields
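# A hedged, standalone sketch of the same idea as read_local_resource: load a
# BigML resource structure that was previously saved to disk and wrap it in a
# Fields object. The file name "dataset.json" is an illustrative placeholder,
# and the helper name fields_from_local_json is not part of the code above.
import json

from bigml.fields import Fields

def fields_from_local_json(path, csv_properties=None):
    """Builds a Fields object from a locally stored BigML resource JSON."""
    csv_properties = csv_properties or {}
    with open(path) as handler:
        resource = json.load(handler)
    # Fields accepts the full resource structure directly
    return Fields(resource, **csv_properties)

# fields = fields_from_local_json("dataset.json")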
def get_model_fields(model, model_fields, csv_properties, args):
    """Retrieves fields info from model resource"""
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = model['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    if 'model_fields' in model['object']['model']:
        model_fields = model['object']['model']['model_fields'].keys()
        csv_properties.update(include=model_fields)
    if 'missing_tokens' in model['object']['model']:
        missing_tokens = model['object']['model']['missing_tokens']
    else:
        missing_tokens = MISSING_TOKENS
    csv_properties.update(missing_tokens=missing_tokens)
    objective_field = model['object']['objective_fields']
    if isinstance(objective_field, list):
        objective_field = objective_field[0]
    csv_properties.update(objective_field=objective_field)
    fields = Fields(model['object']['model']['fields'], **csv_properties)
    return fields, objective_field
def get_input_fields(resource, referrer=None):
    """New list of input fields"""
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        referrer_fields = Fields(
            {'resource': referrer['resource'], 'object': referrer})
        referrer_fields_ids = referrer_fields.fields.keys()
        # case where objective field is not in input fields
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            if objective_id not in input_fields_ids:
                input_fields_ids.append(objective_id)
        # compare the sorted contents (list.sort() returns None, so comparing
        # its result would always succeed)
        if sorted(input_fields_ids) == sorted(referrer_fields_ids):
            return []
    return input_fields_ids
def get_fields_changes(resource, referrer=None,
                       updatable_attrs=DEFAULT_UPDATABLE):
    """Changed field attributes"""
    if referrer is None:
        referrer = {}
    fields_attributes = {}
    resource_fields = Fields(
        {'resource': resource['resource'], 'object': resource}).fields
    resource_type = get_resource_type(resource)
    # for sources, extract all the updatable attributes
    if resource_type == 'source':
        updatable_attrs = SOURCE_UPDATABLE
        for field_id in resource_fields.keys():
            field_opts = {}
            field = resource_fields[field_id]
            for attribute in updatable_attrs:
                if field.get(attribute):
                    field_opts.update({attribute: field[attribute]})
            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
        return fields_attributes
    # for the rest of resources, check which attributes changed
    if referrer:
        referrer_fields = Fields(
            {'resource': referrer['resource'], 'object': referrer}).fields
        for field_id in resource_fields.keys():
            field_opts = {}
            if field_id not in referrer_fields.keys():
                continue
            field = resource_fields[field_id]
            for attribute in updatable_attrs:
                ref_values = ["",
                              referrer_fields[field_id].get(attribute, "")]
                if field.get(attribute, "") not in ref_values:
                    field_opts.update({attribute: field[attribute]})
            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
    return fields_attributes
def update_with_summary_file(step, resource, summary_file):
    """Updates the resource fields with the attributes in the summary file"""
    fields = Fields(resource)
    changes = fields.filter_fields_update(
        fields.new_fields_structure(res_filename(summary_file)))
    resource_type = get_resource_type(resource)
    resource = world.api.updaters[resource_type](resource, changes)
    world.api.ok(resource)
    setattr(world, resource_type, resource)
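# Hedged sketch of the summary-file round trip that the step above relies on:
# export the field summaries to CSV, edit them by hand, and turn the edited
# file back into an update payload. It assumes BIGML_USERNAME/BIGML_API_KEY
# are set in the environment; the dataset id and file name are placeholders.
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
dataset = api.get_dataset("dataset/0123456789abcdef01234567")  # placeholder id
fields = Fields(dataset)
fields.summary_csv("fields_summary.csv")          # write an editable summary
# ... edit names, labels or preferred flags in fields_summary.csv ...
changes = fields.filter_fields_update(
    fields.new_fields_structure("fields_summary.csv"))
api.ok(api.update_dataset(dataset, changes))      # apply only the changes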
def get_fields_structure(resource, csv_properties):
    """Builds a Fields object from the fields information in the resource"""
    if not csv_properties and 'locale' in resource['object']:
        csv_properties = {'data_locale': resource['object']['locale']}
    fields = Fields(resource['object']['fields'], **csv_properties)
    return fields
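# Minimal usage sketch for a Fields object like the one returned above. The
# field ids and names are made up for illustration; a real call would pass
# dataset['object']['fields'] from an actual BigML dataset.
from bigml.fields import Fields

fields_info = {
    "000000": {"name": "sepal length", "optype": "numeric",
               "column_number": 0},
    "000001": {"name": "species", "optype": "categorical",
               "column_number": 1}}
fields = Fields(fields_info, objective_field=1, objective_field_present=True)
print(fields.field_id("species"))     # "000001"
print(fields.field_name("000000"))    # "sepal length"
print(fields.objective_field)         # typically the objective's column (1)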
def fusion_processing(fusion, fusion_ids, api, args, resume, fields=None,
                      session_file=None, path=None, log=None):
    """Creates or retrieves a fusion from the input data"""
    # If we have a models' list but no fusion has been provided,
    # we create one.
    if args.fusion_models_ is not None and not has_fusion(args):
        fusion_ids = []
        # Only 1 fusion per bigmler command at present
        number_of_fusions = 1
        if resume:
            resume, fusion_ids = c.checkpoint(
                c.are_fusions_created, path, number_of_fusions,
                debug=args.debug)
            if not resume:
                message = u.dated("Found %s fusions out of %s."
                                  " Resuming.\n" % (len(fusion_ids),
                                                    number_of_fusions))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                fusion = fusion_ids[0]
                first_model_id = api.get_fusion(fusion)[
                    "object"]["fusion"]["models"][0]["id"]
                first_model_kind = api.get_fusion(fusion)[
                    "object"]["fusion"]["models"][0]["kind"]
                first_model = api.getters[first_model_kind](first_model_id)
                fields = Fields(first_model)
                number_of_fusions -= len(fusion_ids)
        fusion_args = r.set_fusion_args(args, fields)
        fusion = r.create_fusion(args.fusion_models_, fusion, fusion_args,
                                 args, api, path, session_file, log)
    # If a fusion is provided, we use it.
    elif args.fusion:
        fusion_ids = [args.fusion]
        fusion = fusion_ids[0]
    elif args.fusion or args.fusion_tag:
        fusion = fusion_ids[0]
    # If we are going to create predictions, we must retrieve the fusion
    if fusion_ids and args.test_set:
        fusion = r.get_fusion(fusion, args, api, session_file)
        args.objective_field = fusion['object']['objective_field_name']
    return fusion, resume
def dataset_processing(source, training_set, test_set, model_ids, name,
                       description, fields, dataset_fields, api, args,
                       resume, csv_properties=None, session_file=None,
                       path=None, log=None):
    """Creating or retrieving a dataset from input arguments"""
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)
        # If we have a source but no dataset or model has been provided, we
        # create a new dataset if the no_dataset option isn't set up. Also
        # if evaluate is set and test_set has been provided.
        if ((source and not args.dataset and not args.model and
                not model_ids and not args.no_dataset) or
                (args.evaluate and args.test_set and not args.dataset)):
            dataset_args = r.set_dataset_args(name, description, args,
                                              fields, dataset_fields)
            dataset = r.create_dataset(source, dataset_args, args, api, path,
                                       session_file, log)
        # If a dataset is provided, let's retrieve it.
        elif args.dataset:
            dataset = bigml.api.get_dataset_id(args.dataset)
        # If we already have a dataset, we check the status and get the
        # fields if we don't have them yet.
        if dataset:
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            if not csv_properties and 'locale' in dataset['object']:
                csv_properties = {
                    'data_locale': dataset['object']['locale']}
            fields = Fields(dataset['object']['fields'], **csv_properties)
            if args.public_dataset:
                r.publish_dataset(dataset, api, args, session_file)
    return dataset, resume, csv_properties, fields
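# Standalone sketch of the dataset branch above: fetch a finished dataset and
# build its Fields object, propagating the dataset locale into the
# csv_properties. The dataset id is a placeholder and credentials are read
# from the environment (BIGML_USERNAME / BIGML_API_KEY).
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
dataset = api.check_resource("dataset/0123456789abcdef01234567")
csv_properties = {}
if 'locale' in dataset['object']:
    csv_properties['data_locale'] = dataset['object']['locale']
fields = Fields(dataset['object']['fields'], **csv_properties)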
def get_fusion_fields(fusion, csv_properties, args):
    """Retrieves fields info from fusion resource"""
    args.retrieve_api_.ok(fusion)
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(fusion['object']['fusion']['fields'], **csv_properties)
def get_topic_model_fields(topic_model, csv_properties, args):
    """Retrieves fields info from topic model resource"""
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = topic_model['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(topic_model, **csv_properties)
def get_cluster_fields(cluster, csv_properties, args):
    """Retrieves fields info from cluster resource"""
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = cluster['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(cluster['object']['clusters']['fields'], **csv_properties)
def get_model_fields(model, csv_properties, args, single_model=True,
                     multi_label_data=None, local_ensemble=None):
    """Retrieves fields info from model resource"""
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = model['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    if single_model and 'model_fields' in model['object']['model']:
        model_fields = model['object']['model']['model_fields'].keys()
        csv_properties.update(include=model_fields)
    else:
        csv_properties.update(include=None)
    if 'missing_tokens' in model['object']['model']:
        missing_tokens = model['object']['model']['missing_tokens']
    else:
        missing_tokens = MISSING_TOKENS
    csv_properties.update(missing_tokens=missing_tokens)
    # if the model belongs to a multi-label set of models, the real objective
    # field is never amongst the set of fields of each individual model, so
    # we must add it.
    fields_dict = copy.deepcopy(model['object']['model']['fields'])
    if args.multi_label:
        # Adds the real objective field to fields_dict
        objective_field = multi_label_data['objective_name']
        objective_id = multi_label_data['objective_id']
        objective_column = multi_label_data['objective_column']
        fields_dict[objective_id] = {"op_type": "categorical",
                                     "name": objective_field,
                                     "column_number": objective_column}
    else:
        if local_ensemble is not None:
            fields_dict = copy.deepcopy(local_ensemble.fields)
        objective_field = model['object']['objective_fields']
        if isinstance(objective_field, list):
            objective_field = objective_field[0]
    csv_properties.update(objective_field=objective_field)
    fields = Fields(fields_dict, **csv_properties)
    return fields, objective_field
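# Hedged usage sketch: retrieve a model and build its Fields the same way as
# above, restricted to the fields the model actually uses. The model id and
# the "limit=-1" query string (to fetch the full fields structure) are
# assumptions; credentials come from the environment.
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
model = api.get_model("model/0123456789abcdef01234567",
                      query_string="limit=-1")
objective_field = model['object']['objective_fields']
if isinstance(objective_field, list):
    objective_field = objective_field[0]
model_fields = model['object']['model'].get('model_fields')
fields = Fields(model['object']['model']['fields'],
                objective_field=objective_field,
                include=list(model_fields.keys()) if model_fields else None)
print(fields.field_name(objective_field))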
def get_logistic_fields(logistic_regression, csv_properties, args):
    """Retrieves fields info from logistic regression resource"""
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = logistic_regression['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(
        logistic_regression['object']['logistic_regression']['fields'],
        **csv_properties)
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction"""
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(os.path.join(args.output_dir,
                                                  FEATURES_LOG))
    with open(features_file, u.open_mode("w")) as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow(
            ["step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file, u.open_mode("r")) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError as exc:
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        try:
            stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
            with open(stored_dataset, u.open_mode("r")) as dataset_handler:
                dataset = json.loads(dataset_handler.read())
        except IOError:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        # initial feature set
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(
                                 args.args_separator))
        try:
            excluded_ids = [fields.field_id(feature)
                            for feature in excluded_features]
            objective_id = fields.field_id(objective_name)
        except ValueError as exc:
            sys.exit(exc)
def get_deepnet_fields(deepnet, csv_properties, args):
    """Retrieves fields info from deepnet resource"""
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = deepnet['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    csv_properties.update(
        objective_field=deepnet['object'].get('objective_field'))
    return Fields(deepnet['object']['deepnet']['fields'], **csv_properties)
def get_pca_fields(pca, csv_properties, args):
    """Retrieves fields info from PCA resource"""
    args.retrieve_api_.ok(pca)
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = pca['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    if args.exclude_objective:
        csv_properties.update({"objective_field_present": False})
        csv_properties.update({"objective_field": None})
    return Fields(pca['object']['pca']['fields'], **csv_properties)
def create_kfold_datasets_file(args, api, common_options, resume=False):
    """Create the kfold dataset resources and store their ids in a file,
       one per line
    """
    message = ('Creating the kfold datasets............\n')
    u.log_message(message, log_file=session_file, console=args.verbosity)
    if args.output_dir is None:
        args.output_dir = a.NOW
    # retrieve dataset
    dataset_id = bigml.api.get_dataset_id(args.dataset)
    if dataset_id:
        dataset = api.check_resource(dataset_id)
        try:
            args.objective_field = int(args.objective_field)
        except (TypeError, ValueError):
            pass
        # if the user provided no objective field, try to use the one in the
        # dataset
        if args.objective_field is None:
            try:
                args.objective_field = dataset['object'][
                    'objective_field']['column_number']
            except KeyError:
                pass
        # check that kfold_field is unique
        fields = Fields(dataset, objective_field=args.objective_field,
                        objective_field_present=True)
        try:
            objective_id = fields.field_id(fields.objective_field)
            objective_name = fields.field_name(objective_id)
        except ValueError as exc:
            sys.exit(exc)
        kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields)
        # create jsons to generate partial datasets
        selecting_file_list, resume = create_kfold_json(args,
                                                        kfold_field_name,
                                                        objective_id,
                                                        resume=resume)
        # generate test datasets
        datasets_file, resume = create_kfold_datasets(dataset_id, args,
                                                      selecting_file_list,
                                                      objective_name,
                                                      common_options,
                                                      resume=resume)
    return datasets_file, objective_name, resume
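# Hedged sketch of the objective handling above: take the objective column
# recorded in the dataset, make it explicit when building Fields, and recover
# its id and name. The dataset id is a placeholder; credentials come from the
# environment.
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
dataset = api.check_resource("dataset/0123456789abcdef01234567")
objective_column = dataset['object']['objective_field']['column_number']
fields = Fields(dataset, objective_field=objective_column,
                objective_field_present=True)
objective_id = fields.field_id(fields.objective_field)
print(objective_id, fields.field_name(objective_id))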
def test_source_processing(api, args, resume, name=None, csv_properties=None,
                           session_file=None, path=None, log=None):
    """Creating or retrieving a test data source from input arguments"""
    test_source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if args.test_set and args.remote:
        # If resuming, try to extract args.source from log files
        if resume:
            message = u.dated("Test source not found. Resuming.\n")
            resume, args.test_source = c.checkpoint(
                c.is_source_created, path, suffix="_test", debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)
        if not resume:
            source_args = r.set_source_args(
                args, name=name, data_set_header=args.test_header)
            test_source = r.create_source(args.test_set, source_args, args,
                                          api, path, session_file, log,
                                          source_type="test")
    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.test_source:
        test_source = bigml.api.get_source_id(args.test_source)
    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if test_source:
        test_source = r.get_source(test_source, api, args.verbosity,
                                   session_file)
        if 'source_parser' in test_source['object']:
            source_parser = test_source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
                if (args.user_locale is not None and
                        bigml_locale(args.user_locale) ==
                        source_parser['locale']):
                    args.user_locale = None
        fields = Fields(test_source['object']['fields'], **csv_properties)
        if (args.field_attributes_ or args.types_ or args.user_locale or
                args.json_args.get('source')):
            # avoid updating project_id in source
            project_id, args.project_id = args.project_id, None
            test_source_args = r.set_source_args(args, fields=fields)
            test_source = r.update_source(test_source, test_source_args,
                                          args, api, session_file)
            args.project_id = project_id
            fields = Fields(test_source['object']['fields'],
                            **csv_properties)
    return test_source, resume, csv_properties, fields
def source_processing(api, args, resume, csv_properties=None,
                      multi_label_data=None, session_file=None, path=None,
                      log=None):
    """Creating or retrieving a data source from input arguments"""
    source = None
    fields = None
    if (args.training_set or (hasattr(args, "evaluate") and args.evaluate and
                              args.test_set)):
        # If resuming, try to extract args.source from log files
        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(
                c.is_source_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)
        # If neither a previous source, dataset nor model are provided,
        # we create a new one. Also if --evaluate and test data are provided
        # we create a new dataset to test with.
        data_set, data_set_header = r.data_to_source(args)
        if data_set is not None:
            # Check if there's a created project for it
            args.project_id = pp.project_processing(
                api, args, resume, session_file=session_file, path=path,
                log=log)
            source_args = r.set_source_args(
                args, multi_label_data=multi_label_data,
                data_set_header=data_set_header)
            source = r.create_source(data_set, source_args, args, api, path,
                                     session_file, log)
    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)
    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
                # No changes if user locale is the one in the source.
                if (args.user_locale is not None and
                        bigml_locale(args.user_locale) ==
                        source_parser['locale']):
                    args.user_locale = None
        fields = Fields(source['object']['fields'], **csv_properties)
        if (args.field_attributes_ or args.types_ or args.user_locale or
                args.json_args.get('source')):
            # avoid updating project_id in source
            project_id, args.project_id = args.project_id, None
            source_args = r.set_source_args(args, fields=fields)
            source = r.update_source(source, source_args, args, api,
                                     session_file)
            args.project_id = project_id
            fields = Fields(source['object']['fields'], **csv_properties)
    return source, resume, csv_properties, fields
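# Standalone sketch of the source branch above: once a source is finished,
# its parser settings (missing tokens, locale) seed the csv_properties used
# to build Fields. The source id is a placeholder; credentials come from the
# environment.
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
source = api.get_source("source/0123456789abcdef01234567")
csv_properties = {}
parser = source['object'].get('source_parser', {})
if 'missing_tokens' in parser:
    csv_properties['missing_tokens'] = parser['missing_tokens']
if 'locale' in parser:
    csv_properties['data_locale'] = parser['locale']
fields = Fields(source['object']['fields'], **csv_properties)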
def dataset_processing(source, api, args, resume, fields=None,
                       csv_properties=None, multi_label_data=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving a dataset from input arguments"""
    datasets = []
    dataset = None
    if (args.training_set or args.source or
            (hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug,
                message=message, log_file=session_file,
                console=args.verbosity)
        # If we have a source but no dataset or model has been provided, we
        # create a new dataset if the no_dataset option isn't set up. Also
        # if evaluate is set and test_set has been provided.
        if ((source and not args.has_datasets_ and not args.has_models_ and
                not args.no_dataset) or
                (hasattr(args, "evaluate") and args.evaluate and
                 args.test_set and not args.dataset)):
            dataset_args = r.set_dataset_args(
                args, fields, multi_label_data=multi_label_data)
            dataset = r.create_dataset(source, dataset_args, args, api, path,
                                       session_file, log)
        # If a set of datasets is provided, let's check their ids.
        elif args.dataset_ids:
            for i in range(0, len(args.dataset_ids)):
                dataset_id = args.dataset_ids[i]
                if isinstance(dataset_id, dict) and "id" in dataset_id:
                    dataset_id = dataset_id["id"]
                datasets.append(bigml.api.get_dataset_id(dataset_id))
            dataset = datasets[0]
        # If a dataset is provided, let's retrieve it.
        elif args.dataset:
            dataset = bigml.api.get_dataset_id(args.dataset)
    # If we already have a dataset, we check the status and get the fields
    # if we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if ('object' in dataset and 'objective_field' in dataset['object']
                and 'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset['object'][
                'objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)
        fields = get_fields_structure(dataset, csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)
        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if:
        # - the --dataset-attributes flag is used
        # - the --multi-label flag is used and there's an --objective-field
        # - the --max-categories flag is used and there's an --objective-field
        # - the --import-fields flag is used
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args, api=api,
                                       path=path, session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity,
                                    session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
def i_get_the_missing_values(step):
    resource = world.dataset
    fields = Fields(resource['fields'])
    world.step_result = fields.missing_counts()
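# Hedged sketch of the step above outside the test framework: a Fields object
# built from a dataset resource can report per-field missing value counts.
# The dataset id is a placeholder; credentials come from the environment.
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
dataset = api.get_dataset("dataset/0123456789abcdef01234567")
fields = Fields(dataset)
print(fields.missing_counts())   # {field_id: number_of_missing_values, ...}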
def create_fields(step, objective_column):
    world.fields = Fields(world.source,
                          objective_field=int(objective_column),
                          objective_field_present=True)
def reify_dataset(self, resource_id):
    """Extracts the REST API arguments from the dataset JSON structure"""
    child = self.get_resource(resource_id)
    origin, parent_id = u.get_origin_info(child)
    parent = self.get_resource(parent_id)
    opts = {"create": {}, "update": {}, "get": {}}
    # as two-steps result from a cluster or batch prediction, centroid
    # or anomaly score
    grandparent = parent
    if origin in ['origin_batch_resource', 'cluster']:
        if origin == "cluster":
            opts['create'].update({"centroid": child['centroid']})
        grandparents = u.get_origin_info(parent)
        # batch resources have two parents, choose the dataset
        if origin == "origin_batch_resource" and \
                isinstance(grandparents, list):
            for gp_origin, grandparent in grandparents:
                if gp_origin == "dataset":
                    break
        else:
            _, grandparent = grandparents
        grandparent = self.get_resource(grandparent)
    # options common to all model types
    call = "update" if origin == "origin_batch_resource" else "create"
    u.common_dataset_opts(child, grandparent, opts, call=call)
    # update options
    dataset_defaults = DEFAULTS["dataset"].get("update", {})
    for attribute, default_value in dataset_defaults.items():
        opts["update"].update(
            u.default_setting(child, attribute, *default_value))
    # name, exclude automatic naming alternatives
    autonames = [u'']
    u.non_automatic_name(child, opts, autonames=autonames)
    # objective field
    resource_fields = Fields(
        {'resource': child['resource'], 'object': child})
    objective_id = child['objective_field']['id']
    preferred_fields = resource_fields.preferred_fields()
    # if there are no preferred fields, use the fields structure
    if len(preferred_fields.keys()) == 0:
        preferred_fields = resource_fields.fields
    max_column = sorted([field['column_number']
                         for _, field in preferred_fields.items()
                         if field['optype'] != "text"],
                        reverse=True)[0]
    objective_column = resource_fields.fields[objective_id]['column_number']
    if objective_column != max_column:
        opts['create'].update({"objective_field": {"id": objective_id}})
    if origin != "origin_batch_resource":
        # resize
        if (child['size'] != grandparent['size'] and
                get_resource_type(parent) == 'source'):
            opts['create'].update({"size": child['size']})
        # generated fields
        if child.get('new_fields', None):
            new_fields = child['new_fields']
            for new_field in new_fields:
                new_field['field'] = new_field['generator']
                del new_field['generator']
            opts['create'].update({"new_fields": new_fields})
        u.range_opts(child, grandparent, opts)
    # for batch_predictions, batch_clusters, batch_anomalies generated
    # datasets, attributes cannot be set at creation time, so we
    # must update the resource instead
    suffix = None
    if origin == "origin_batch_resource":
        opts["update"].update(opts["create"])
        opts["create"] = {}
        suffix = "['object']['output_dataset_resource']"
    calls = u.build_calls(resource_id, [parent_id], opts, suffix=suffix)
    self.add(resource_id, calls)
def create_fields_from_dataset(step, objective_column):
    world.fields = Fields(world.dataset,
                          objective_field=int(objective_column),
                          objective_field_present=True)
def create_kfold_datasets_file(args, api, command_obj, resume=False):
    """Create the kfold dataset resources and store their ids in a file,
       one per line
    """
    message = ('Creating the kfold datasets............\n')
    u.log_message(message, log_file=session_file, console=args.verbosity)
    if args.output_dir is None:
        args.output_dir = a.NOW
    csv_properties = {}
    fields = None
    dataset = None
    datasets = []
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
        dataset_id = dataset['resource']
    elif args.dataset:
        dataset_id = bigml.api.get_dataset_id(args.dataset)
        datasets = [dataset_id]
    elif args.dataset_ids:
        datasets = args.dataset_ids
        dataset_id = datasets[0]
    if dataset_id:
        if not dataset:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        try:
            args.objective_field = int(args.objective_field)
        except (TypeError, ValueError):
            pass
        # if the user provided no objective field, try to use the one in the
        # dataset
        if args.objective_field is None:
            try:
                args.objective_field = dataset['object'][
                    'objective_field']['column_number']
            except KeyError:
                pass
        # check that kfold_field is unique
        fields = Fields(dataset, objective_field=args.objective_field,
                        objective_field_present=True)
        if args.random_fields:
            default_candidates_limits(args, fields)
        try:
            objective_id = fields.field_id(fields.objective_field)
            objective_name = fields.field_name(objective_id)
        except ValueError as exc:
            sys.exit(exc)
        kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields)
        # create jsons to generate partial datasets
        selecting_file_list, resume = create_kfold_json(args,
                                                        kfold_field_name,
                                                        objective_id,
                                                        resume=resume)
        # generate test datasets
        datasets_file, resume = create_kfold_datasets(dataset_id, args,
                                                      selecting_file_list,
                                                      command_obj,
                                                      resume=resume)
    return datasets_file, objective_name, resume
if penalty is None:
    penalty = DEFAULT_PENALTY
# retrieving the first dataset in the file
try:
    with open(datasets_file, u.open_mode("r")) as datasets_handler:
        dataset_id = datasets_handler.readline().strip()
except IOError as exc:
    sys.exit("Could not read the generated datasets file: %s" % str(exc))
try:
    stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
    with open(stored_dataset, u.open_mode("r")) as dataset_handler:
        dataset = json.loads(dataset_handler.read())
except IOError:
    dataset = api.check_resource(dataset_id, query_string=ALL_FIELDS_QS)
# initial feature set
fields = Fields(dataset)
excluded_features = ([] if args.exclude_features is None else
                     args.exclude_features.split(args.args_separator))
try:
    excluded_ids = [fields.field_id(feature)
                    for feature in excluded_features]
    objective_id = fields.field_id(objective_name)
except ValueError as exc:
    sys.exit(exc)
field_ids = [field_id for field_id in fields.preferred_fields()
             if field_id != objective_id and field_id not in excluded_ids]
field_ids.sort()
# headers are extended with a column per field
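# Self-contained sketch of the feature-set initialisation above: start from
# the preferred fields of a dataset, drop the objective and any explicitly
# excluded fields, and keep the rest as candidate input fields. The fields
# dict and the excluded feature name are made up for illustration.
from bigml.fields import Fields

fields_info = {
    "000000": {"name": "age", "optype": "numeric", "column_number": 0,
               "preferred": True},
    "000001": {"name": "income", "optype": "numeric", "column_number": 1,
               "preferred": True},
    "000002": {"name": "churn", "optype": "categorical", "column_number": 2,
               "preferred": True}}
fields = Fields(fields_info, objective_field=2, objective_field_present=True)
objective_id = fields.field_id(fields.objective_field)
excluded_ids = [fields.field_id("age")]      # e.g. --exclude-features "age"
field_ids = sorted(field_id for field_id in fields.preferred_fields()
                   if field_id != objective_id
                   and field_id not in excluded_ids)
print(field_ids)                             # ["000001"]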
def source_processing(training_set, test_set, training_set_header,
                      test_set_header, name, description, api, args, resume,
                      csv_properties=None, field_attributes=None, types=None,
                      session_file=None, path=None, log=None):
    """Creating or retrieving a data source from input arguments"""
    source = None
    fields = None
    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from log files
        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(
                c.is_source_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)
        # If neither a previous source, dataset nor model are provided,
        # we create a new one. Also if --evaluate and test data are provided
        # we create a new dataset to test with.
        data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                     training_set_header,
                                                     test_set_header, args)
        if data_set is not None:
            source_args = r.set_source_args(data_set_header, name,
                                            description, args)
            source = r.create_source(data_set, source_args, args, api, path,
                                     session_file, log)
    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)
    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'data_locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity,
                                            session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)
    return source, resume, csv_properties, fields
def compute_output(api, args):
    """Creates a dataset using the `training_set`."""
    source = None
    dataset = None
    fields = None
    other_label = OTHER
    multi_label_data = None
    multi_label_fields = []
    datasets = None
    # variables from command-line options
    resume = args.resume_
    output = args.output
    check_args_coherence(args)
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])
    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else
              [label.strip() for label in
               args.labels.split(args.args_separator)])
    if labels is not None:
        labels = sorted([label for label in labels])
    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set,
         multi_label_data) = ps.multi_label_expansion(
             args.training_set, args.train_header, args, path,
             labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume, csv_properties=csv_properties,
            multi_label_data=multi_label_data, session_file=session_file,
            path=path, log=log)
    if source is not None:
        args.source = bigml.api.get_source_id(source)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels, multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume, fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data, session_file=session_file,
            path=path, log=log)
    if datasets:
        dataset = datasets[-1]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)
    # Now we have a dataset, let's check if there's an objective_field
    # given by the user and update it in the fields structure
    args.objective_id_ = get_objective_id(args, fields)
    # If test_split is used, split the dataset in a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, _, resume = pd.split_processing(
            dataset, api, args, resume, multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args, api, resume,
                    session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")
    # If any of the transformations is applied,
    # generate a new dataset from the given list of datasets
    if args.new_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]
    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original
    # dataset
    if args.new_fields or args.sample_rate != 1 or \
            (args.lisp_filter or args.json_filter) and not a.has_source(args):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, **csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
        args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({'objective_field': args.objective_name_,
                               'objective_field_present': True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels, multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)