def get_input_fields(resource, referrer=None):
    """New list of input fields

    Compares, by field name, the input fields of `resource` with the fields
    of `referrer` and returns:

    - the ids stored in the `input_fields` attribute of `resource` when no
      `referrer` is given (there is nothing to compare with)
    - an empty list when the sorted input field names are empty, equal to
      the sorted names of the referrer's fields or, for resources of type
      `model`, equal to those names without the objective field's name
    - the keys of the referrer's fields structure otherwise

    When the referrer is a dataset only its preferred fields are used.

    """
    input_fields_ids = resource.get('input_fields', [])
    if not referrer:
        # `referrer_fields` can only be built from a referrer. Without this
        # guard the final return would use a name that was never bound.
        return input_fields_ids
    # compare fields by name
    resource_fields = Fields(
        {'resource': resource['resource'], 'object': resource})
    referrer_fields = Fields(
        {'resource': referrer['resource'], 'object': referrer})
    input_fields = sorted(resource_fields.field_name(field_id)
                          for field_id in input_fields_ids)
    if get_resource_type(referrer) == 'dataset':
        referrer_fields = Fields(referrer_fields.preferred_fields())
        referrer_fields_names = sorted(
            field['name'] for field in referrer_fields.fields.values())
    else:
        referrer_fields_names = sorted(referrer_fields.fields_by_name.keys())
    # name lists that make an explicit list of input fields unnecessary:
    # no input fields at all, or every field of the referrer
    referrer_input_fields = [[], referrer_fields_names]
    # check whether the resource has an objective field not included in
    # the input fields list
    if get_resource_type(resource) == 'model':
        objective_id = resource.get('objective_field')
        try:
            # the attribute may be a dict holding the id or the id itself
            objective_id = objective_id.get('id')
        except AttributeError:
            pass
        referrer_objective = resource_fields.field_name(objective_id)
        referrer_input_fields.append(
            [name for name in referrer_fields_names
             if name != referrer_objective])
    if input_fields in referrer_input_fields:
        return []
    return referrer_fields.fields.keys()
def get_input_fields(resource, referrer=None):
    """New list of input fields

    The input fields of `resource` are compared by name with the fields of
    `referrer`. The result is:

    - the `input_fields` ids of `resource`, untouched, when there is no
      `referrer` to compare with
    - `[]` when the sorted names of the input fields are empty, match the
      sorted names of the referrer's fields, or (only for resources of type
      `model`) match them once the objective field's name is removed
    - the keys of the referrer's fields structure in any other case

    For dataset referrers, only the preferred fields are considered.

    """
    input_fields_ids = resource.get('input_fields', [])
    if not referrer:
        # Nothing to compare against: the referrer's fields structure is
        # never created, so the final return below cannot be used.
        return input_fields_ids

    # compare fields by name
    resource_fields = Fields(
        {'resource': resource['resource'], 'object': resource})
    referrer_fields = Fields(
        {'resource': referrer['resource'], 'object': referrer})
    input_fields = sorted(
        [resource_fields.field_name(field_id)
         for field_id in input_fields_ids])

    referrer_type = get_resource_type(referrer)
    if referrer_type == 'dataset':
        referrer_fields = Fields(referrer_fields.preferred_fields())
        referrer_fields_names = sorted(
            [field['name'] for field in referrer_fields.fields.values()])
    else:
        referrer_fields_names = sorted(
            referrer_fields.fields_by_name.keys())

    # candidate lists: the empty one and the referrer's complete list
    referrer_input_fields = [[], referrer_fields_names]

    # check whether the resource has an objective field not included in
    # the input fields list
    resource_type = get_resource_type(resource)
    if resource_type == 'model':
        objective_id = resource.get('objective_field')
        try:
            # `objective_field` may be a dict with an `id` key or a plain id
            objective_id = objective_id.get('id')
        except AttributeError:
            pass
        referrer_objective = resource_fields.field_name(objective_id)
        referrer_input_fields.append(
            [name for name in referrer_fields_names
             if name != referrer_objective])

    if input_fields in referrer_input_fields:
        return []
    return referrer_fields.fields.keys()
def create_kfold_datasets_file(args, api, common_options, resume=False):
    """Create the kfold dataset resources and store their ids in a file
       one per line

    The dataset referenced by `args.dataset` is retrieved, the objective
    field is resolved (the user-given one or, when none was given, the
    column number stored in the dataset), the selection JSON files are
    generated and the k-fold datasets are created from them.

    Returns the tuple `(datasets_file, objective_name, resume)`, or
    `(None, None, None)` when no dataset id can be extracted from
    `args.dataset`. The process exits if the objective field cannot be
    found in the dataset fields.

    Side effects: `args.output_dir` is set to `a.NOW` when it is None and
    `args.objective_field` may be rewritten.

    """
    message = ('Creating the kfold datasets............\n')
    u.log_message(message, log_file=session_file, console=args.verbosity)
    if args.output_dir is None:
        args.output_dir = a.NOW
    # retrieve dataset
    dataset_id = bigml.api.get_dataset_id(args.dataset)
    if not dataset_id:
        # Always hand back a 3-tuple so that callers can unpack the result
        return None, None, None
    dataset = api.check_resource(dataset_id)
    try:
        # the objective field can be given as a column number
        args.objective_field = int(args.objective_field)
    except (TypeError, ValueError):
        pass
    # if the user provided no objective field, try to use the one in the
    # dataset
    if args.objective_field is None:
        try:
            args.objective_field = dataset['object'][
                'objective_field']['column_number']
        except KeyError:
            pass
    # check that kfold_field is unique
    fields = Fields(dataset, objective_field=args.objective_field,
                    objective_field_present=True)
    try:
        objective_id = fields.field_id(fields.objective_field)
        objective_name = fields.field_name(objective_id)
    except ValueError as exc:
        sys.exit(exc)
    kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields)
    # create jsons to generate partial datasets
    selecting_file_list, resume = create_kfold_json(args, kfold_field_name,
                                                    objective_id,
                                                    resume=resume)
    # generate test datasets
    datasets_file, resume = create_kfold_datasets(dataset_id, args,
                                                  selecting_file_list,
                                                  objective_name,
                                                  common_options,
                                                  resume=resume)
    return datasets_file, objective_name, resume
def create_kfold_datasets_file(args, api, common_options, resume=False):
    """Create the kfold dataset resources and store their ids in a file
       one per line

    The reference dataset is taken from the first available option among
    `args.dataset_file` (local JSON file), `args.dataset` and
    `args.dataset_ids` (its first element). The objective field is then
    resolved (the user-given one or, when none was given, the column number
    stored in the dataset), the selection JSON files are generated and the
    k-fold datasets are created from them.

    Returns the tuple `(datasets_file, objective_name, resume)`, or
    `(None, None, None)` when no dataset id is available. The process exits
    if the objective field cannot be found in the dataset fields.

    Side effects: `args.output_dir` is set to `a.NOW` when it is None and
    `args.objective_field` may be rewritten.

    """
    message = ('Creating the kfold datasets............\n')
    u.log_message(message, log_file=session_file, console=args.verbosity)
    if args.output_dir is None:
        args.output_dir = a.NOW
    csv_properties = {}
    fields = None
    dataset = None
    datasets = []
    # Bound up front: when none of the dataset options is set, the check
    # below must see a falsy value instead of an undefined name.
    dataset_id = None
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file,
            csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
        # NOTE(review): `dataset` is still None here when `args.datasets`
        # is set, so this lookup would fail; confirm the intended id source.
        dataset_id = dataset['resource']
    elif args.dataset:
        dataset_id = bigml.api.get_dataset_id(args.dataset)
        datasets = [dataset_id]
    elif args.dataset_ids:
        datasets = args.dataset_ids
        dataset_id = datasets[0]

    if not dataset_id:
        # Always hand back a 3-tuple so that callers can unpack the result
        return None, None, None
    if not dataset:
        dataset = api.check_resource(dataset_id,
                                     query_string=ALL_FIELDS_QS)
    try:
        # the objective field can be given as a column number
        args.objective_field = int(args.objective_field)
    except (TypeError, ValueError):
        pass
    # if the user provided no objective field, try to use the one in the
    # dataset
    if args.objective_field is None:
        try:
            args.objective_field = dataset['object'][
                'objective_field']['column_number']
        except KeyError:
            pass
    # check that kfold_field is unique
    fields = Fields(dataset, objective_field=args.objective_field,
                    objective_field_present=True)
    if args.random_fields:
        default_candidates_limits(args, fields)
    try:
        objective_id = fields.field_id(fields.objective_field)
        objective_name = fields.field_name(objective_id)
    except ValueError as exc:
        sys.exit(exc)
    kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields)
    # create jsons to generate partial datasets
    selecting_file_list, resume = create_kfold_json(args, kfold_field_name,
                                                    objective_id,
                                                    resume=resume)
    # generate test datasets
    datasets_file, resume = create_kfold_datasets(dataset_id, args,
                                                  selecting_file_list,
                                                  common_options,
                                                  resume=resume)
    return datasets_file, objective_name, resume
# initial feature set fields = Fields(dataset) excluded_features = ([] if args.exclude_features is None else args.exclude_features.split( args.args_separator)) try: excluded_ids = [fields.field_id(feature) for feature in excluded_features] objective_id = fields.field_id(objective_name) except ValueError, exc: sys.exit(exc) field_ids = [field_id for field_id in fields.preferred_fields() if field_id != objective_id and not field_id in excluded_ids] # headers are extended with a column per field fields_names = [fields.field_name(field_id) for field_id in field_ids] features_header.extend(fields_names) features_writer.writerow(features_header) initial_state = [False for field_id in field_ids] open_list = [(initial_state, - float('inf'), -float('inf'), 0)] closed_list = [] best_state, best_score, best_metric_value, best_counter = open_list[0] best_unchanged_count = 0 metric = args.optimize while best_unchanged_count < staleness and open_list: loop_counter += 1 features_set = find_max_state(open_list) state, score, metric_value, folder_counter = features_set if loop_counter > 1: csv_results = [loop_counter - 1, [int(in_set) for in_set in state], score, metric_value, best_score]
def create_kfold_datasets_file(args, api, command_obj, resume=False):
    """Create the kfold dataset resources and store their ids in a file
       one per line

    The reference dataset is taken from the first available option among
    `args.dataset_file` (local JSON file), `args.dataset` and
    `args.dataset_ids` (its first element). The objective field is then
    resolved (the user-given one or, when none was given, the column number
    stored in the dataset), the selection JSON files are generated and the
    k-fold datasets are created from them. `command_obj` is handed over to
    `create_kfold_datasets` untouched.

    Returns the tuple `(datasets_file, objective_name, resume)`, or
    `(None, None, None)` when no dataset id is available. The process exits
    if the objective field cannot be found in the dataset fields.

    Side effects: `args.output_dir` is set to `a.NOW` when it is None and
    `args.objective_field` may be rewritten.

    """
    message = ('Creating the kfold datasets............\n')
    u.log_message(message, log_file=session_file, console=args.verbosity)
    if args.output_dir is None:
        args.output_dir = a.NOW
    csv_properties = {}
    fields = None
    dataset = None
    datasets = []
    # Bound up front: when none of the dataset options is set, the check
    # below must see a falsy value instead of an undefined name.
    dataset_id = None
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file,
            csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
        # NOTE(review): `dataset` is still None here when `args.datasets`
        # is set, so this lookup would fail; confirm the intended id source.
        dataset_id = dataset['resource']
    elif args.dataset:
        dataset_id = bigml.api.get_dataset_id(args.dataset)
        datasets = [dataset_id]
    elif args.dataset_ids:
        datasets = args.dataset_ids
        dataset_id = datasets[0]

    if not dataset_id:
        # Always hand back a 3-tuple so that callers can unpack the result
        return None, None, None
    if not dataset:
        dataset = api.check_resource(dataset_id,
                                     query_string=ALL_FIELDS_QS)
    try:
        # the objective field can be given as a column number
        args.objective_field = int(args.objective_field)
    except (TypeError, ValueError):
        pass
    # if the user provided no objective field, try to use the one in the
    # dataset
    if args.objective_field is None:
        try:
            args.objective_field = dataset['object']['objective_field'][
                'column_number']
        except KeyError:
            pass
    # check that kfold_field is unique
    fields = Fields(dataset, objective_field=args.objective_field,
                    objective_field_present=True)
    if args.random_fields:
        default_candidates_limits(args, fields)
    try:
        objective_id = fields.field_id(fields.objective_field)
        objective_name = fields.field_name(objective_id)
    except ValueError as exc:
        sys.exit(exc)
    kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields)
    # create jsons to generate partial datasets
    selecting_file_list, resume = create_kfold_json(args, kfold_field_name,
                                                    objective_id,
                                                    resume=resume)
    # generate test datasets
    datasets_file, resume = create_kfold_datasets(dataset_id, args,
                                                  selecting_file_list,
                                                  command_obj,
                                                  resume=resume)
    return datasets_file, objective_name, resume
excluded_features = ([] if args.exclude_features is None else args.exclude_features.split(args.args_separator)) try: excluded_ids = [ fields.field_id(feature) for feature in excluded_features ] objective_id = fields.field_id(objective_name) except ValueError, exc: sys.exit(exc) field_ids = [ field_id for field_id in fields.preferred_fields() if field_id != objective_id and not field_id in excluded_ids ] field_ids.sort() # headers are extended with a column per field fields_names = [fields.field_name(field_id) for field_id in field_ids] features_header.extend(fields_names) features_writer.writerow(features_header) initial_state = [False for field_id in field_ids] open_list = [(initial_state, -float('inf'), -float('inf'), 0)] closed_list = [] best_state, best_score, best_metric_value, best_counter = open_list[0] best_unchanged_count = 0 metric = args.optimize while best_unchanged_count < staleness and open_list: loop_counter += 1 features_set = find_max_state(open_list) state, score, metric_value, _ = features_set if loop_counter > 1: csv_results = [loop_counter - 1, \ [int(in_set) for in_set in state], \
def compute_output(api, args):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    The steps, in order, are:

    - set up the output directory, session log and optional log file
    - expand the training set when `--multi-label` is used
    - get the source (local JSON file or source processing)
    - get the datasets (local JSON file, list of datasets or dataset
      processing), optionally exporting, splitting, limiting categories,
      merging or transforming them
    - get the models (local model/ensemble file or models processing) and
      update the first model's sharing/publishing state if requested
    - compute remote (batch) or local predictions when test data is given
    - combine the votes files when `args.votes_files_` is set

    `args` is modified in place along the way (e.g. `objective_field`,
    `objective_id_`, `max_categories`, `source`). Nothing is returned. The
    process exits when `--max-categories` is used with an objective field
    that does not support it or when the model for the votes files cannot
    be retrieved.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    #local_ensemble = None
    test_dataset = None
    datasets = None

    # variables from command-line options
    resume = args.resume_
    model_ids = args.model_ids_
    output = args.output
    dataset_fields = args.dataset_fields_

    check_args_coherence(args)
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else
              [label.strip() for label in
               args.labels.split(args.args_separator)])
    if labels is not None:
        labels = sorted(labels)

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set, args.train_header, args, path,
            labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file,
            csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume,
            csv_properties=csv_properties, multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if source is not None:
        args.source = bigml.api.get_source_id(source)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file,
            csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(dataset,
                                                          args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    # (`and` binds tighter than `or`: the source check only qualifies the
    # filters condition)
    if args.new_fields or (args.sample_rate != 1 and args.no_model) or \
            (args.lisp_filter or args.json_filter) and not has_source(args):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
            args.objective_id_ = get_objective_id(args, fields)
            args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({'objective_field': args.objective_name_,
                               'objective_field_present': True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label',
                                   other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.model_file:
        # model is retrieved from the contents of the given local JSON file
        model, csv_properties, fields = u.read_local_resource(
            args.model_file,
            csv_properties=csv_properties)
        models = [model]
        model_ids = [model['resource']]
        ensemble_ids = []
    elif args.ensemble_file:
        # model is retrieved from the contents of the given local JSON file
        ensemble, csv_properties, fields = u.read_local_resource(
            args.ensemble_file,
            csv_properties=csv_properties)
        model_ids = ensemble['object']['models'][:]
        ensemble_ids = [ensemble['resource']]
        models = model_ids[:]
        # only the first model of the ensemble is retrieved here
        model = retrieve_resource(args.retrieve_api_,
                                  models[0],
                                  query_string=r.ALL_FIELDS_QS)
        models[0] = model
    else:
        # model is retrieved from the remote object
        models, model_ids, ensemble_ids, resume = pm.models_processing(
            datasets, models, model_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log, labels=labels,
            multi_label_data=multi_label_data, other_label=other_label)

    if models:
        model = models[0]
        single_model = len(models) == 1
    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
            # the query string selects how much of the model is downloaded
            if not args.evaluate and not a.has_train(args) and \
                    not a.has_test(args):
                query_string = MINIMUM_MODEL
            elif not args.test_header:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS)
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
            models[0] = model
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and (a.has_test(args) or
                                        args.export_fields):
        # if we are using boosted ensembles to predict, activate boosting
        if model['object'].get('boosted_ensemble'):
            args.boosting = True
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
                args.ensemble_ids_ = ensemble_ids
            else:
                ensemble_id = get_ensemble_id(model)
        fields = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data)
        # Free memory after getting fields
        # local_ensemble = None
        gc.collect()

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field, labels,
                                                  multi_label_data, fields,
                                                  multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label',
                                   other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    # If predicting
    if (models and (a.has_test(args) or (test_dataset and args.remote))
            and not args.evaluate):
        models_per_label = 1
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            test_set = ps.multi_label_expansion(
                args.test_set, args.test_header, args, path,
                labels=labels, session_file=session_file, input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and not args.method == COMBINATION):
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, session_file=session_file,
                    path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)

            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            if args.to_dataset and args.dataset_off:
                model = api.check_resource(model['resource'],
                                           query_string=r.ALL_FIELDS_QS)
                model_fields = Fields(model)
                objective_field_name = model_fields.field_name(
                    model_fields.objective_field)
                # the prediction column gets a different name when the test
                # dataset already has a field named like the objective field
                if objective_field_name in test_fields.fields_by_name.keys():
                    args.prediction_name = "%s (predicted)" % \
                        objective_field_name
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                # (explicit floor division: the result is a models count)
                models_per_label = len(models) // len(ensemble_ids)
            predict(models, fields, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label, other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if args.votes_files_:
        # the model id is rebuilt from the name of the first votes file
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', args.votes_files_[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id, str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(args.votes_files_, local_model.to_prediction,
                      output, method=args.method)
def compute_output(api, args):
    """ Creates a dataset using the `training_set`.

    The steps, in order, are:

    - set up the output directory, session log and optional log file
    - expand the training set when `--multi-label` is used
    - get the source (local JSON file or source processing)
    - get the datasets (local JSON file, list of datasets or dataset
      processing) and optionally export the last one to CSV
    - optionally split the dataset, create the per-category datasets for
      `--max-categories`, and build new datasets from the existing ones
    - export the fields summary, print the generated files and handle the
      reports

    `args` is modified in place along the way (e.g. `objective_field`,
    `objective_id_`, `max_categories`, `source`). Nothing is returned. The
    process exits when `--max-categories` is used with an objective field
    that does not support it.

    """
    source = None
    dataset = None
    fields = None
    other_label = OTHER
    multi_label_data = None
    multi_label_fields = []
    datasets = None

    # variables from command-line options
    resume = args.resume_
    output = args.output

    check_args_coherence(args)
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else [
        label.strip() for label in args.labels.split(args.args_separator)
    ])
    if labels is not None:
        labels = sorted([label for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set,
            args.train_header,
            args,
            path,
            labels=labels,
            session_file=session_file)
        # the expanded training file is always read using a header row
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api,
            args,
            resume,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file,
            path=path,
            log=log)
    if source is not None:
        args.source = bigml.api.get_source_id(source)
    if args.multi_label and source:
        # labels information is refreshed from the data kept in the source
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels, multi_label_data,
                                                  fields, multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source,
            api,
            args,
            resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file,
            path=path,
            log=log)
    if datasets:
        # the last dataset in the list is the one used from here on
        dataset = datasets[-1]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset,
                                       api,
                                       args,
                                       resume,
                                       session_file=session_file,
                                       path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        # the second element returned by the split is discarded here
        dataset, _, resume = pd.split_processing(
            dataset,
            api,
            args,
            resume,
            multi_label_data=multi_label_data,
            session_file=session_file,
            path=path,
            log=log)
        # NOTE(review): `dataset` was read from `datasets[-1]` but is stored
        # back in `datasets[0]`; confirm this is intended for several datasets
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset,
                    distribution,
                    fields,
                    args,
                    api,
                    resume,
                    session_file=session_file,
                    path=path,
                    log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If any of the transformations is applied,
    # generate a new dataset from the given list of datasets
    if args.new_dataset:
        dataset, resume = pd.create_new_dataset(datasets,
                                                api,
                                                args,
                                                resume,
                                                fields=fields,
                                                session_file=session_file,
                                                path=path,
                                                log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    # (`and` binds tighter than `or`: the source check only qualifies the
    # filters condition)
    if args.new_fields or args.sample_rate != 1 or \
            (args.lisp_filter or args.json_filter) and not a.has_source(args):
        if fields is None:
            # the dataset may still be an id string at this point; the full
            # resource is needed to build the fields structure
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
            args.objective_id_ = get_objective_id(args, fields)
            args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(dataset,
                                                api,
                                                args,
                                                resume,
                                                fields=fields,
                                                session_file=session_file,
                                                path=path,
                                                log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({
            'objective_field': args.objective_name_,
            'objective_field_present': True
        })
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        # no training set or source provided the labels information, so it
        # is read from the dataset
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels, multi_label_data,
                                                  fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    u.print_generated_files(path,
                            log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        # reports are only uploaded when they were requested
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
def compute_output(api, args):
    """ Creates one or more models using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

        The steps are run in a fixed order, each one reusing what the
        previous one produced: source, dataset(s), optional dataset
        transformations (test split, max-categories, multi-dataset, new
        fields / sampling / filters), model(s), predictions (remote batch
        prediction or local prediction) and, finally, votes combination.

        :param api: BigML connection object. It is handed over to the
                    processing helpers and used directly through
                    `api.check_resource` and `api.get_model`.
        :param args: parsed command-line options. The object is modified in
                     place (e.g. `objective_field`, `objective_id_`,
                     `max_categories`, `multi_label_fields`,
                     `prediction_name`).

        Nothing is returned. The function calls `sys.exit` when
        `--max-categories` is used with an objective field that
        `pd.check_max_categories` rejects, or when the model needed to
        combine votes cannot be retrieved.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    test_dataset = None
    datasets = None

    # variables from command-line options
    resume = args.resume_
    model_ids = args.model_ids_
    output = args.predictions
    # NOTE(review): dataset_fields is not read again in this function.
    dataset_fields = args.dataset_fields_

    check_args_coherence(args)
    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else
              [label.strip() for label in
               args.labels.split(args.args_separator)])
    if labels is not None:
        labels = sorted(labels)

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set, args.train_header, args, path,
            labels=labels, session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file,
            csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api, args, resume,
            csv_properties=csv_properties, multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file,
            csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source, api, args, resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset, api, args, resume,
                                       session_file=session_file, path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(dataset,
                                                          args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    # (the source check only qualifies the filters: `and` binds tighter
    # than `or`, which the parentheses now spell out)
    if (args.new_fields or (args.sample_rate != 1 and args.no_model) or
            ((args.lisp_filter or args.json_filter) and
             not has_source(args))):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
        args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({'objective_field': args.objective_name_,
                               'objective_field_present': True})
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label',
                                   other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.model_file:
        # model is retrieved from the contents of the given local JSON file
        model, csv_properties, fields = u.read_local_resource(
            args.model_file,
            csv_properties=csv_properties)
        models = [model]
        model_ids = [model['resource']]
        ensemble_ids = []
    elif args.ensemble_file:
        # model is retrieved from the contents of the given local JSON file
        ensemble, csv_properties, fields = u.read_local_resource(
            args.ensemble_file,
            csv_properties=csv_properties)
        model_ids = ensemble['object']['models'][:]
        ensemble_ids = [ensemble['resource']]
        models = model_ids[:]
        model = retrieve_resource(bigml.api.BigML(storage='./storage'),
                                  models[0],
                                  query_string=r.ALL_FIELDS_QS)
        models[0] = model
    else:
        # model is retrieved from the remote object
        models, model_ids, ensemble_ids, resume = pm.models_processing(
            datasets, models, model_ids,
            api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log, labels=labels,
            multi_label_data=multi_label_data, other_label=other_label)

    if models:
        model = models[0]
        single_model = len(models) == 1
    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        if (isinstance(model, basestring) or
                bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
            # the query string selects which part of the model is requested
            if (not args.evaluate and not a.has_train(args) and
                    not a.has_test(args)):
                query_string = MINIMUM_MODEL
            elif not args.test_header:
                query_string = r.ALL_FIELDS_QS
            else:
                query_string = "%s;%s" % (r.ALL_FIELDS_QS, r.FIELDS_QS)
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
            models[0] = model
        if (args.black_box or args.white_box or
                (args.shared_flag and r.shared_changed(args.shared, model))):
            model_args = {}
            if args.shared_flag and r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and (a.has_test(args) or
                                        args.export_fields):
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            # NOTE(review): ensemble_id is assigned here but not read
            # afterwards in this function; only args.ensemble_ids_ is kept.
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
                args.ensemble_ids_ = ensemble_ids
            else:
                ensemble_id = get_ensemble_id(model)
        fields = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data)
        # Free memory after getting fields
        gc.collect()

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (args.objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label',
                                   other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    # If predicting
    if (models and (a.has_test(args) or (test_dataset and args.remote))
            and not args.evaluate):
        models_per_label = 1
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            # NOTE(review): test_set and test_set_header are not read
            # afterwards in this function; confirm whether the expanded
            # test set is picked up elsewhere.
            test_set = ps.multi_label_expansion(
                args.test_set, args.test_header, args, path,
                labels=labels, session_file=session_file, input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and args.method not in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, session_file=session_file,
                    path=path, log=log)
                (test_source, resume, csv_properties,
                 test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            if args.to_dataset and args.dataset_off:
                model = api.check_resource(model['resource'],
                                           query_string=r.ALL_FIELDS_QS)
                model_fields = Fields(model)
                objective_field_name = model_fields.field_name(
                    model_fields.objective_field)
                # rename the prediction column when the test dataset already
                # has a field named like the model's objective field
                if objective_field_name in test_fields.fields_by_name.keys():
                    args.prediction_name = ("%s (predicted)" %
                                            objective_field_name)
            batch_prediction_args = r.set_batch_prediction_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                # (explicit floor division keeps the count an integer)
                models_per_label = len(models) // len(ensemble_ids)
            predict(models, fields, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label, other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if args.votes_files_:
        # the model id is rebuilt from the name of the first votes file
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', args.votes_files_[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id, str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(args.votes_files_, local_model.to_prediction,
                      output, method=args.method)
fields = Fields(dataset) excluded_features = ([] if args.exclude_features is None else args.exclude_features.split( args.args_separator)) try: excluded_ids = [fields.field_id(feature) for feature in excluded_features] objective_id = fields.field_id(objective_name) except ValueError, exc: sys.exit(exc) field_ids = [field_id for field_id in fields.preferred_fields() if field_id != objective_id and not field_id in excluded_ids] field_ids.sort() # headers are extended with a column per field fields_names = [fields.field_name(field_id) for field_id in field_ids] features_header.extend(fields_names) features_writer.writerow(features_header) initial_state = [False for field_id in field_ids] open_list = [(initial_state, - float('inf'), -float('inf'), 0)] closed_list = [] best_state, best_score, best_metric_value, best_counter = open_list[0] best_unchanged_count = 0 metric = args.optimize while best_unchanged_count < staleness and open_list: loop_counter += 1 features_set = find_max_state(open_list) state, score, metric_value, _ = features_set if loop_counter > 1: csv_results = [loop_counter - 1, \ [int(in_set) for in_set in state], \
args.args_separator)) excluded_ids = [fields.field_id(feature) for feature in excluded_features] objective_id = fields.field_id(objective_name) field_ids = [field_id for field_id in fields.preferred_fields() if field_id != objective_id and not field_id in excluded_ids] initial_state = [False for field_id in field_ids] open_list = [(initial_state, 0)] closed_list = [] best_score = -1 best_unchanged_count = 0 metric = args.maximize while best_unchanged_count < staleness and open_list: (state, score) = find_max_state(open_list) state_fields = [fields.field_name(field_ids[i]) for (i, val) in enumerate(state) if val] closed_list.append((state, score)) open_list.remove((state, score)) if (score - EPSILON) > best_score: best_state = state best_score = score best_unchanged_count = 0 if state_fields: message = 'New best state: %s\n' % (state_fields) u.log_message(message, log_file=session_file, console=args.verbosity) if metric in PERCENT_EVAL_METRICS: message = '%s = %0.2f%%\n' % (metric.capitalize(), score * 100) else:
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

       A state is a list of booleans, one per candidate field, telling
       whether the field is used as model input. Starting from the empty
       subset, the best scored state still open is expanded on every
       iteration and each unseen child is scored with `kfold_evaluate`.
       The search stops when the best score has not improved for
       `staleness` consecutive iterations or no open state is left.

       :param datasets_file: path to a file whose first line is the id of
                             the dataset that provides the fields
       :param api: connection object providing `check_resource` and
                   `get_dataset`
       :param args: command-line options; `output_dir`, `exclude_features`,
                    `args_separator`, `maximize` and `verbosity` are read
                    and `model_fields` is overwritten for every evaluation
       :param common_options: handed over untouched to `kfold_evaluate`
       :param staleness: allowed number of consecutive iterations without
                         improvement (DEFAULT_STALENESS when None)
       :param penalty: handed over to `kfold_evaluate` (DEFAULT_PENALTY
                       when None)
       :param objective_name: name of the objective field, which is never
                              used as a candidate feature
       :param resume: handed over to `kfold_evaluate` and replaced by the
                      value it returns

       Every visited state is appended as a row to the FEATURES_LOG file in
       `args.output_dir`. Nothing is returned: the best subset is reported
       through `u.log_message`. `session_file` is not defined in this
       function and is expected to exist at module level.

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(os.path.join(args.output_dir,
                                                  FEATURES_LOG))
    # Each row is flushed explicitly, so default buffering writes the same
    # contents at the same points as unbuffered mode (which Python 3 rejects
    # for text files).
    with open(features_file, 'w') as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow([
            "step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError as exc:
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        dataset = api.check_resource(dataset_id, api.get_dataset)
        # initial feature set
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(
                                 args.args_separator))
        excluded_ids = [fields.field_id(feature) for
                        feature in excluded_features]
        objective_id = fields.field_id(objective_name)
        field_ids = [field_id for field_id in fields.preferred_fields()
                     if field_id != objective_id and
                     field_id not in excluded_ids]
        initial_state = [False for _ in field_ids]
        # entries are (state, score, metric_value) tuples
        open_list = [(initial_state, -float('inf'), -float('inf'))]
        closed_list = []
        best_state, best_score, best_metric_value = open_list[0]
        best_unchanged_count = 0
        metric = args.maximize
        while best_unchanged_count < staleness and open_list:
            loop_counter += 1
            features_set = find_max_state(open_list)
            state, score, metric_value = features_set
            features_writer.writerow([
                loop_counter, [int(in_set) for in_set in state],
                score, metric_value, best_score])
            features_handler.flush()
            state_fields = [fields.field_name(field_ids[index])
                            for (index, in_set) in enumerate(state)
                            if in_set]
            closed_list.append(features_set)
            open_list.remove(features_set)
            # EPSILON keeps negligible gains from counting as improvements
            if (score - EPSILON) > best_score:
                best_state, best_score, best_metric_value = features_set
                best_unchanged_count = 0
                if state_fields:
                    message = 'New best state: %s\n' % (state_fields)
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)
                    if metric in PERCENT_EVAL_METRICS:
                        message = '%s = %0.2f%% (score = %s)\n' % (
                            metric.capitalize(), metric_value * 100, score)
                    else:
                        message = '%s = %f (score = %s)\n' % (
                            metric.capitalize(), metric_value, score)
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)
            else:
                best_unchanged_count += 1

            # States already scored (open or closed). Built once per
            # expansion and extended as children are added, instead of
            # rebuilding both lists for every child. Distinct names are
            # used so that `state` and `score` are never clobbered by the
            # comprehension variables (they leak in Python 2).
            known_states = [known for known, _, _ in open_list]
            known_states.extend(known for known, _, _ in closed_list)
            for child in expand_state(state):
                if child in known_states:
                    continue
                input_fields = [fields.field_name(field_id)
                                for (index, field_id)
                                in enumerate(field_ids)
                                if child[index]]
                # create models and evaluation with input_fields
                args.model_fields = args.args_separator.join(input_fields)
                counter += 1
                (child_score,
                 child_metric_value,
                 metric,
                 resume) = kfold_evaluate(datasets_file,
                                          args, counter, common_options,
                                          penalty=penalty, resume=resume,
                                          metric=metric)
                open_list.append((child, child_score, child_metric_value))
                known_states.append(child)

        best_features = [fields.field_name(field_ids[index])
                         for (index, in_set) in enumerate(best_state)
                         if in_set]
        message = (u'The best feature subset is: %s \n'
                   % u", ".join(best_features))
        u.log_message(message, log_file=session_file, console=1)
        if metric in PERCENT_EVAL_METRICS:
            message = (u'%s = %0.2f%%\n' % (metric.capitalize(),
                                            (best_metric_value * 100)))
        else:
            message = (u'%s = %f\n' % (metric.capitalize(),
                                       best_metric_value))
        u.log_message(message, log_file=session_file, console=1)
        message = (u'Evaluated %d/%d feature subsets\n' %
                   ((len(open_list) + len(closed_list)),
                    2 ** len(field_ids)))
        u.log_message(message, log_file=session_file, console=1)