Example #1
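# Context (assumed; not shown in this listing): this function comes from
# BigMLer's model-processing module, where u, r, and c alias BigMLer's
# utils, resources, and checkpoint modules, bigml.api is imported, and
# MONTECARLO_FACTOR plus the model_per_label, ensemble_per_label, and
# ensemble_processing helpers are defined alongside it.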
def models_processing(datasets, models, model_ids, objective_field, fields,
                      api, args, resume,
                      name=None, description=None, model_fields=None,
                      session_file=None, path=None,
                      log=None, labels=None, multi_label_data=None,
                      other_label=None):
    """Creates or retrieves models from the input data

    """
    ensemble_ids = []

    # If we have a dataset but no model, we create the model if the no_model
    # flag hasn't been set.
    if datasets and not (has_models(args) or args.no_model):
        dataset = datasets[0]
        model_ids = []
        models = []
        if args.multi_label:
            # If --number-of-models is not set or is 1, create one model per
            # label. Otherwise, create one ensemble per label with the required
            # number of models.
            if args.number_of_models < 2:
                models, model_ids, resume = model_per_label(
                    labels, datasets, fields,
                    objective_field, api, args, resume, name, description,
                    model_fields, multi_label_data, session_file, path, log)
            else:
                (ensembles, ensemble_ids,
                 models, model_ids, resume) = ensemble_per_label(
                     labels, dataset, fields,
                     objective_field, api, args, resume, name, description,
                     model_fields, multi_label_data, session_file, path, log)

        elif args.number_of_models > 1:
            ensembles = []
            # Ensemble of models
            (ensembles, ensemble_ids,
             models, model_ids, resume) = ensemble_processing(
                 datasets, objective_field, fields, api, args, resume,
                 name=name, description=description, model_fields=model_fields,
                 session_file=session_file, path=path, log=log)
            ensemble = ensembles[0]
            args.ensemble = bigml.api.get_ensemble_id(ensemble)

        else:
            # Set of partial datasets created setting args.max_categories
            if len(datasets) > 1 and args.max_categories:
                args.number_of_models = len(datasets)
            # Cross-validation case: we create 2 * n models to be validated,
            # holding out n% of the data
            if args.cross_validation_rate > 0:
                if args.number_of_evaluations > 0:
                    args.number_of_models = args.number_of_evaluations
                else:
                    args.number_of_models = int(MONTECARLO_FACTOR *
                                                args.cross_validation_rate)
            if resume:
                resume, model_ids = c.checkpoint(
                    c.are_models_created, path, args.number_of_models,
                    debug=args.debug)
                if not resume:
                    message = u.dated("Found %s models out of %s. Resuming.\n"
                                      % (len(model_ids),
                                         args.number_of_models))
                    u.log_message(message, log_file=session_file,
                                  console=args.verbosity)

                models = model_ids
                args.number_of_models -= len(model_ids)
            if args.max_categories > 0:
                objective_field = None

            model_args = r.set_model_args(name, description, args,
                                          objective_field, fields,
                                          model_fields, other_label)
            models, model_ids = r.create_models(datasets, models,
                                                model_args, args, api,
                                                path, session_file, log)
    # If a model is provided, we use it.
    elif args.model:
        model_ids = [args.model]
        models = model_ids[:]

    elif args.models or args.model_tag:
        models = model_ids[:]

    if args.ensemble:
        ensemble = r.get_ensemble(args.ensemble, api, args.verbosity,
                                  session_file)
        ensemble_ids = [ensemble]
        model_ids = ensemble['object']['models']

        models = model_ids[:]

    if args.ensembles or args.ensemble_tag:
        model_ids = []
        ensemble_ids = []
        # Parse the ensemble ids if provided.
        if args.ensemble_tag:
            ensemble_ids = (ensemble_ids +
                            u.list_ids(api.list_ensembles,
                                       "tags__in=%s" % args.ensemble_tag))
        else:
            ensemble_ids = u.read_resources(args.ensembles)
        for ensemble_id in ensemble_ids:
            ensemble = r.get_ensemble(ensemble_id, api)
            if args.ensemble is None:
                args.ensemble = ensemble_id
            model_ids.extend(ensemble['object']['models'])
        models = model_ids[:]

    # If we are going to predict, we must retrieve the models
    if model_ids and args.test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, ensemble_ids, resume
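A note on the sizing rules in the final else branch above: with cross-validation on, the "2 * n models for an n% holdout" comment implies MONTECARLO_FACTOR == 200, which is treated here as an assumption. A minimal, self-contained sketch of just that batch-sizing logic, with plain values standing in for the args object:

# Standalone sketch; MONTECARLO_FACTOR = 200 is an assumption consistent
# with the "2 * n models ... n% of data" comment in the source.
MONTECARLO_FACTOR = 200

def planned_model_count(number_of_models, n_datasets, max_categories,
                        cross_validation_rate, number_of_evaluations):
    # One model per partial dataset when --max-categories split the data.
    if n_datasets > 1 and max_categories:
        number_of_models = n_datasets
    # Cross-validation: an explicit evaluation count wins; otherwise
    # 2 * n models for an n% holdout rate.
    if cross_validation_rate > 0:
        if number_of_evaluations > 0:
            number_of_models = number_of_evaluations
        else:
            number_of_models = int(MONTECARLO_FACTOR * cross_validation_rate)
    return number_of_models

print(planned_model_count(1, 1, 0, 0.1, 0))  # 10% holdout -> 20 models
print(planned_model_count(1, 3, 5, 0, 0))    # 3 partial datasets -> 3 models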
Example #2
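# Context (assumed, as in Example #1): same module-level aliases and
# helpers. This newer revision additionally reads args.has_models_,
# args.objective_id_, and args.model_fields_, which it expects to be
# precomputed on the args object, and adds boosting support.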
def models_processing(datasets,
                      models,
                      model_ids,
                      api,
                      args,
                      resume,
                      fields=None,
                      session_file=None,
                      path=None,
                      log=None,
                      labels=None,
                      multi_label_data=None,
                      other_label=None):
    """Creates or retrieves models from the input data

    """
    ensemble_ids = []

    # If we have a dataset but no model, we create the model if the no_model
    # flag hasn't been set.
    if datasets and not (args.has_models_ or args.no_model):
        dataset = datasets[0]
        model_ids = []
        models = []
        if args.multi_label:
            # If --number-of-models is not set or is 1, and no boosting
            # options are set, create one model per label. Otherwise, create
            # one ensemble per label with the required number of models.
            if args.number_of_models < 2 and not args.boosting:
                models, model_ids, resume = model_per_label(
                    labels,
                    datasets,
                    api,
                    args,
                    resume,
                    fields=fields,
                    multi_label_data=multi_label_data,
                    session_file=session_file,
                    path=path,
                    log=log)
            else:
                (ensembles, ensemble_ids, models, model_ids,
                 resume) = ensemble_per_label(
                     labels,
                     dataset,
                     api,
                     args,
                     resume,
                     fields=fields,
                     multi_label_data=multi_label_data,
                     session_file=session_file,
                     path=path,
                     log=log)

        elif args.number_of_models > 1 or args.boosting:
            ensembles = []
            # Ensembles of models
            (ensembles, ensemble_ids, models, model_ids,
             resume) = ensemble_processing(datasets,
                                           api,
                                           args,
                                           resume,
                                           fields=fields,
                                           session_file=session_file,
                                           path=path,
                                           log=log)
            ensemble = ensembles[0]
            args.ensemble = bigml.api.get_ensemble_id(ensemble)

        else:
            # Set of partial datasets created setting args.max_categories
            if len(datasets) > 1 and args.max_categories:
                args.number_of_models = len(datasets)
            if ((args.test_datasets and args.evaluate)
                    or (args.datasets and args.evaluate and args.dataset_off)):
                args.number_of_models = len(args.dataset_ids)
            # Cross-validation case: we create 2 * n models to be validated,
            # holding out n% of the data
            if args.cross_validation_rate > 0:
                if args.number_of_evaluations > 0:
                    args.number_of_models = args.number_of_evaluations
                else:
                    args.number_of_models = int(MONTECARLO_FACTOR *
                                                args.cross_validation_rate)
            if resume:
                resume, model_ids = c.checkpoint(c.are_models_created,
                                                 path,
                                                 args.number_of_models,
                                                 debug=args.debug)
                if not resume:
                    message = u.dated(
                        "Found %s models out of %s. Resuming.\n" %
                        (len(model_ids), args.number_of_models))
                    u.log_message(message,
                                  log_file=session_file,
                                  console=args.verbosity)

                models = model_ids
                args.number_of_models -= len(model_ids)
            model_args = r.set_model_args(args,
                                          fields=fields,
                                          objective_id=args.objective_id_,
                                          model_fields=args.model_fields_,
                                          other_label=other_label)
            models, model_ids = r.create_models(datasets, models, model_args,
                                                args, api, path, session_file,
                                                log)
    # If a model is provided, we use it.
    elif args.model:
        model_ids = [args.model]
        models = model_ids[:]

    elif args.models or args.model_tag:
        models = model_ids[:]

    if args.ensemble:
        if args.ensemble not in ensemble_ids:
            ensemble_ids.append(args.ensemble)
        if not args.evaluate:
            ensemble = r.get_ensemble(args.ensemble, api, args.verbosity,
                                      session_file)
            model_ids = ensemble['object']['models']
            models = model_ids[:]

    if args.ensembles or args.ensemble_tag:
        model_ids = []
        ensemble_ids = []
        # Parse the ensemble ids if provided.
        if args.ensemble_tag:
            ensemble_ids = (ensemble_ids + u.list_ids(
                api.list_ensembles, "tags__in=%s" % args.ensemble_tag))
        else:
            ensemble_ids = u.read_resources(args.ensembles)
        for ensemble_id in ensemble_ids:
            ensemble = r.get_ensemble(ensemble_id, api)
            if args.ensemble is None:
                args.ensemble = ensemble_id
            model_ids.extend(ensemble['object']['models'])
        models = model_ids[:]

    # If we are going to predict, we must retrieve the models
    if model_ids and args.test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)

    return models, model_ids, ensemble_ids, resume
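Both revisions finish the ensemble branches the same way: every referenced ensemble is flattened into a single list of component model ids before prediction. A minimal sketch of that collection step, where get_ensemble is a hypothetical stand-in for r.get_ensemble returning the resource dict:

# Standalone sketch; get_ensemble is a hypothetical stand-in for
# r.get_ensemble, which returns the ensemble resource as a dict.
def collect_model_ids(ensemble_ids, get_ensemble):
    model_ids = []
    for ensemble_id in ensemble_ids:
        ensemble = get_ensemble(ensemble_id)
        # Each ensemble resource lists its component models under
        # ensemble['object']['models'].
        model_ids.extend(ensemble['object']['models'])
    return model_ids

fake_store = {
    'ensemble/1': {'object': {'models': ['model/a', 'model/b']}},
    'ensemble/2': {'object': {'models': ['model/c']}},
}
print(collect_model_ids(['ensemble/1', 'ensemble/2'], fake_store.get))
# -> ['model/a', 'model/b', 'model/c']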