def models_processing(datasets, models, model_ids, objective_field, fields,
                      api, args, resume,
                      name=None, description=None, model_fields=None,
                      session_file=None, path=None, log=None, labels=None,
                      multi_label_data=None, other_label=None):
    """Create the models for the given datasets or reuse the supplied ones.

    When datasets are available, no model source is configured and
    ``args.no_model`` is not set, new resources are built: one model or one
    ensemble per label for multi-label runs, a single ensemble when
    ``args.number_of_models`` is greater than one, or plain models
    otherwise. In any other case the model given in ``args.model`` or the
    ids received in ``model_ids`` are used. Ensembles referenced through
    ``args.ensemble``, ``args.ensembles`` or ``args.ensemble_tag`` replace
    the model ids with the ones listed in the ensemble resources.

    Note that ``args`` is modified in place (``number_of_models`` and
    ``ensemble`` may be rewritten).

    Returns the tuple ``(models, model_ids, ensemble_ids, resume)``.
    """
    ensemble_ids = []

    if datasets and not (has_models(args) or args.no_model):
        # Nothing to reuse: models have to be built from the datasets.
        first_dataset = datasets[0]
        models, model_ids = [], []

        if args.multi_label:
            # Positional tail shared by both per-label builders.
            shared_tail = (fields, objective_field, api, args, resume,
                           name, description, model_fields,
                           multi_label_data, session_file, path, log)
            if args.number_of_models >= 2:
                # Several models requested: one ensemble for each label.
                (ensembles, ensemble_ids,
                 models, model_ids, resume) = ensemble_per_label(
                     labels, first_dataset, *shared_tail)
            else:
                # --number-of-models unset or 1: one model for each label.
                models, model_ids, resume = model_per_label(
                    labels, datasets, *shared_tail)
        elif args.number_of_models > 1:
            # A single ensemble holding the requested number of models.
            (ensembles, ensemble_ids,
             models, model_ids, resume) = ensemble_processing(
                 datasets, objective_field, fields, api, args, resume,
                 name=name, description=description,
                 model_fields=model_fields, session_file=session_file,
                 path=path, log=log)
            args.ensemble = bigml.api.get_ensemble_id(ensembles[0])
        else:
            # Partial datasets generated by args.max_categories: one model
            # for each of them.
            if len(datasets) > 1 and args.max_categories:
                args.number_of_models = len(datasets)

            # Cross-validation: the amount of models is either the explicit
            # number of evaluations or MONTECARLO_FACTOR times the rate.
            if args.cross_validation_rate > 0:
                args.number_of_models = (
                    args.number_of_evaluations
                    if args.number_of_evaluations > 0
                    else int(MONTECARLO_FACTOR * args.cross_validation_rate))

            if resume:
                # Recover the models that a previous run already created.
                resume, model_ids = c.checkpoint(
                    c.are_models_created, path, args.number_of_models,
                    debug=args.debug)
                if not resume:
                    notice = u.dated(
                        "Found %s models out of %s. Resuming.\n"
                        % (len(model_ids), args.number_of_models))
                    u.log_message(notice, log_file=session_file,
                                  console=args.verbosity)
                models = model_ids
                # Only the models still missing need to be created.
                args.number_of_models -= len(model_ids)

            if args.max_categories > 0:
                objective_field = None

            model_args = r.set_model_args(
                name, description, args, objective_field, fields,
                model_fields, other_label)
            models, model_ids = r.create_models(
                datasets, models, model_args, args, api, path,
                session_file, log)
    elif args.model:
        # A single model was given by the user.
        model_ids = [args.model]
        models = [args.model]
    elif args.models or args.model_tag:
        # The ids were already collected by the caller.
        models = model_ids[:]

    if args.ensemble:
        ensemble_resource = r.get_ensemble(
            args.ensemble, api, args.verbosity, session_file)
        ensemble_ids = [ensemble_resource]
        model_ids = ensemble_resource['object']['models']
        models = model_ids[:]

    if args.ensembles or args.ensemble_tag:
        model_ids = []
        # Ensemble ids come either from a tag query or from a file.
        if args.ensemble_tag:
            ensemble_ids = [] + u.list_ids(
                api.list_ensembles, "tags__in=%s" % args.ensemble_tag)
        else:
            ensemble_ids = u.read_resources(args.ensembles)

        for ens_id in ensemble_ids:
            ensemble_resource = r.get_ensemble(ens_id, api)
            # The first ensemble found becomes args.ensemble if none is set.
            if args.ensemble is None:
                args.ensemble = ens_id
            model_ids.extend(ensemble_resource['object']['models'])
        models = model_ids[:]

    # Predicting (test set and no evaluation) needs the models retrieved.
    if model_ids and args.test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, ensemble_ids, resume
def models_processing(datasets, models, model_ids, api, args, resume,
                      fields=None, session_file=None, path=None, log=None,
                      labels=None, multi_label_data=None, other_label=None):
    """Create the models for the given datasets or reuse the supplied ones.

    When datasets are available and neither ``args.has_models_`` nor
    ``args.no_model`` is set, new resources are built: one model or one
    ensemble per label for multi-label runs, a single ensemble when
    ``args.number_of_models`` is greater than one or ``args.boosting`` is
    set, or plain models otherwise. In any other case the model given in
    ``args.model`` or the ids received in ``model_ids`` are used. Ensembles
    referenced through ``args.ensemble``, ``args.ensembles`` or
    ``args.ensemble_tag`` are added to the ensemble ids and may replace the
    model ids with the ones listed in the ensemble resources.

    Note that ``args`` is modified in place (``number_of_models`` and
    ``ensemble`` may be rewritten).

    Returns the tuple ``(models, model_ids, ensemble_ids, resume)``.
    """
    ensemble_ids = []

    if datasets and not (args.has_models_ or args.no_model):
        # Nothing to reuse: models have to be built from the datasets.
        first_dataset = datasets[0]
        models, model_ids = [], []

        if args.multi_label:
            # Keyword arguments shared by both per-label builders.
            shared_kwargs = dict(fields=fields,
                                 multi_label_data=multi_label_data,
                                 session_file=session_file,
                                 path=path, log=log)
            if args.number_of_models >= 2 or args.boosting:
                # Several models or boosting: one ensemble for each label.
                (ensembles, ensemble_ids,
                 models, model_ids, resume) = ensemble_per_label(
                     labels, first_dataset, api, args, resume,
                     **shared_kwargs)
            else:
                # --number-of-models unset or 1 and no boosting options:
                # one model for each label.
                models, model_ids, resume = model_per_label(
                    labels, datasets, api, args, resume, **shared_kwargs)
        elif args.number_of_models > 1 or args.boosting:
            # A single ensemble built from the datasets.
            (ensembles, ensemble_ids,
             models, model_ids, resume) = ensemble_processing(
                 datasets, api, args, resume, fields=fields,
                 session_file=session_file, path=path, log=log)
            args.ensemble = bigml.api.get_ensemble_id(ensembles[0])
        else:
            # Partial datasets generated by args.max_categories: one model
            # for each of them.
            if len(datasets) > 1 and args.max_categories:
                args.number_of_models = len(datasets)

            # Evaluations over several datasets: one model per dataset id.
            if ((args.test_datasets and args.evaluate) or
                    (args.datasets and args.evaluate and args.dataset_off)):
                args.number_of_models = len(args.dataset_ids)

            # Cross-validation: the amount of models is either the explicit
            # number of evaluations or MONTECARLO_FACTOR times the rate.
            if args.cross_validation_rate > 0:
                args.number_of_models = (
                    args.number_of_evaluations
                    if args.number_of_evaluations > 0
                    else int(MONTECARLO_FACTOR * args.cross_validation_rate))

            if resume:
                # Recover the models that a previous run already created.
                resume, model_ids = c.checkpoint(
                    c.are_models_created, path, args.number_of_models,
                    debug=args.debug)
                if not resume:
                    notice = u.dated(
                        "Found %s models out of %s. Resuming.\n"
                        % (len(model_ids), args.number_of_models))
                    u.log_message(notice, log_file=session_file,
                                  console=args.verbosity)
                models = model_ids
                # Only the models still missing need to be created.
                args.number_of_models -= len(model_ids)

            model_args = r.set_model_args(
                args, fields=fields, objective_id=args.objective_id_,
                model_fields=args.model_fields_, other_label=other_label)
            models, model_ids = r.create_models(
                datasets, models, model_args, args, api, path,
                session_file, log)
    elif args.model:
        # A single model was given by the user.
        model_ids = [args.model]
        models = [args.model]
    elif args.models or args.model_tag:
        # The ids were already collected by the caller.
        models = model_ids[:]

    if args.ensemble:
        if args.ensemble not in ensemble_ids:
            ensemble_ids.append(args.ensemble)
        # The ensemble resource is only fetched when not evaluating.
        if not args.evaluate:
            ensemble_resource = r.get_ensemble(
                args.ensemble, api, args.verbosity, session_file)
            model_ids = ensemble_resource['object']['models']
            models = model_ids[:]

    if args.ensembles or args.ensemble_tag:
        model_ids = []
        # Ensemble ids come either from a tag query or from a file.
        if args.ensemble_tag:
            ensemble_ids = [] + u.list_ids(
                api.list_ensembles, "tags__in=%s" % args.ensemble_tag)
        else:
            ensemble_ids = u.read_resources(args.ensembles)

        for ens_id in ensemble_ids:
            ensemble_resource = r.get_ensemble(ens_id, api)
            # The first ensemble found becomes args.ensemble if none is set.
            if args.ensemble is None:
                args.ensemble = ens_id
            model_ids.extend(ensemble_resource['object']['models'])
        models = model_ids[:]

    # Predicting (test set and no evaluation) needs the models retrieved.
    if model_ids and args.test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, ensemble_ids, resume