Example 1
def alternative_dataset_processing(
    dataset_or_source, suffix, dataset_args, api, args, resume, session_file=None, path=None, log=None
):
    """Creates a dataset. Used in splits to generate train and test datasets

    """
    alternative_dataset = None
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, alternative_dataset = c.checkpoint(
            c.is_dataset_created,
            path,
            "_%s" % suffix,
            debug=args.debug,
            message=message,
            log_file=session_file,
            console=args.verbosity,
        )

    if alternative_dataset is None:
        alternative_dataset = r.create_dataset(
            dataset_or_source, dataset_args, args, api, path, session_file, log, suffix
        )
        if alternative_dataset:
            alternative_dataset = r.get_dataset(alternative_dataset, api, args.verbosity, session_file)
    return alternative_dataset, resume
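
For orientation, the cluster workflow in Example 12 below uses this same helper (imported there as pd) to build the test dataset from a test source; a trimmed excerpt of that call:

dataset_args = r.set_basic_dataset_args(args, name=test_name)
test_dataset, resume = pd.alternative_dataset_processing(
    test_source, "test", dataset_args, api, args, resume,
    session_file=session_file, path=path, log=log)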
Example 2
def alternative_dataset_processing(dataset_or_source,
                                   suffix,
                                   dataset_args,
                                   api,
                                   args,
                                   resume,
                                   session_file=None,
                                   path=None,
                                   log=None):
    """Creates a dataset. Used in splits to generate train and test datasets

    """
    alternative_dataset = None
    # if resuming, try to extract dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, alternative_dataset = c.checkpoint(c.is_dataset_created,
                                                   path,
                                                   "_%s" % suffix,
                                                   debug=args.debug,
                                                   message=message,
                                                   log_file=session_file,
                                                   console=args.verbosity)

    if alternative_dataset is None:
        alternative_dataset = r.create_dataset(dataset_or_source, dataset_args,
                                               args, api, path, session_file,
                                               log, suffix)
        if alternative_dataset:
            alternative_dataset = r.get_dataset(alternative_dataset, api,
                                                args.verbosity, session_file)
    return alternative_dataset, resume
Example 3
def split_processing(dataset, name, description, api, args, resume,
                     session_file=None, path=None, log=None):
    """Splits a dataset into train and test datasets
    """
    train_dataset = None
    test_dataset = None
    sample_rate = 1 - args.test_split
    # if resuming, try to extract train dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, train_dataset = c.checkpoint(
            c.is_dataset_created, path, "_train", debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)

    if train_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - train (%s %%)" % (name, int(sample_rate * 100)),
            description, args, sample_rate, out_of_bag=False)
        train_dataset = r.create_dataset(
            dataset, dataset_split_args, args, api, path, session_file,
            log, "train")
        if train_dataset:
            train_dataset = r.get_dataset(train_dataset, api,
                                          args.verbosity, session_file)

    # if resuming, try to extract test dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, test_dataset = c.checkpoint(
            c.is_dataset_created, path, "_test", debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)

    if test_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - test (%s %%)" % (name, int(args.test_split * 100)),
            description, args, sample_rate, out_of_bag=True)
        test_dataset = r.create_dataset(
            dataset, dataset_split_args, args, api, path, session_file,
            log, "test")
        if test_dataset:
            test_dataset = r.get_dataset(test_dataset, api, args.verbosity,
                                         session_file)
    return train_dataset, test_dataset, resume
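
Both halves of the split above are requested with the same sample_rate (1 - test_split): the train dataset keeps the sampled, in-bag rows (out_of_bag=False) and the test dataset asks for the complementary rows (out_of_bag=True), so together they cover the original dataset. A minimal local sketch of that in-bag/out-of-bag idea, with no BigML calls (the real sampling happens server-side and is only complementary when both requests share the same seed):

import random

def local_split(rows, test_split=0.2, seed=42):
    # Sample at rate 1 - test_split with a fixed seed; the in-bag rows play
    # the role of the train split, the complementary out-of-bag rows the
    # role of the test split.
    sample_rate = 1 - test_split
    rng = random.Random(seed)
    in_bag = [row for row in rows if rng.random() < sample_rate]
    rng = random.Random(seed)  # same seed, same sequence of draws
    out_of_bag = [row for row in rows if rng.random() >= sample_rate]
    return in_bag, out_of_bag

rows = list(range(10))
train, test = local_split(rows)
assert sorted(train + test) == rows  # every row lands in exactly one part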
Example 4
def dataset_processing(source,
                       training_set,
                       test_set,
                       model_ids,
                       name,
                       description,
                       fields,
                       dataset_fields,
                       api,
                       args,
                       resume,
                       csv_properties=None,
                       session_file=None,
                       path=None,
                       log=None):
    """Creating or retrieving dataset from input arguments

    """
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(c.is_dataset_created,
                                                path,
                                                debug=args.debug,
                                                message=message,
                                                log_file=session_file,
                                                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids
         and not args.no_dataset)
            or (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)
    return dataset, resume, csv_properties, fields
Example 5
def create_new_dataset(datasets,
                       api,
                       args,
                       resume,
                       fields=None,
                       session_file=None,
                       path=None,
                       log=None):
    """Generates a new dataset using the generators given in a generators file
       or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(c.are_datasets_created,
                                        path,
                                        number_of_datasets,
                                        debug=args.debug,
                                        suffix=suffix)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n" %
                              (len(datasets), number_of_datasets))
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
    if not resume:
        dataset_args = r.set_dataset_args(args, fields)
        if args.multi_dataset and args.multi_dataset_json:
            dataset_args.update(args.multi_dataset_json)
        elif hasattr(args, 'anomalies_dataset') and args.anomalies_dataset:
            dataset_args.update({'lisp_filter': args.anomaly_filter_})
        elif hasattr(args, 'lisp_filter') and args.lisp_filter:
            dataset_args.update({'lisp_filter': args.lisp_filter})
        elif hasattr(args, 'json_filter') and args.json_filter:
            dataset_args.update({'json_filter': args.json_filter})
        else:
            dataset_args.update(args.dataset_json_generators)
        new_dataset = r.create_dataset(origin_resource,
                                       dataset_args,
                                       args,
                                       api=api,
                                       path=path,
                                       session_file=session_file,
                                       log=log,
                                       dataset_type=suffix)
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
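
The chained elif branches above give the filtering options a strict precedence: an explicit multi-dataset JSON wins, then an anomalies-based lisp filter, then a plain lisp filter, then a JSON filter, and only as a fallback the JSON generators. A small sketch of that precedence with a mock args namespace (a plain dict stands in for the result of r.set_dataset_args, since only the update order matters here):

from argparse import Namespace

def pick_filter(args, dataset_args):
    # Mirror the precedence used above: the first matching option wins.
    if args.multi_dataset and args.multi_dataset_json:
        dataset_args.update(args.multi_dataset_json)
    elif getattr(args, 'anomalies_dataset', None):
        dataset_args.update({'lisp_filter': args.anomaly_filter_})
    elif getattr(args, 'lisp_filter', None):
        dataset_args.update({'lisp_filter': args.lisp_filter})
    elif getattr(args, 'json_filter', None):
        dataset_args.update({'json_filter': args.json_filter})
    else:
        dataset_args.update(args.dataset_json_generators)
    return dataset_args

args = Namespace(multi_dataset=False, multi_dataset_json=None,
                 anomalies_dataset=None, anomaly_filter_=None,
                 lisp_filter='(> (f "000002") 2.5)', json_filter=None,
                 dataset_json_generators={})
print(pick_filter(args, {}))  # {'lisp_filter': '(> (f "000002") 2.5)'}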
Example 6
def create_categories_datasets(dataset, distribution,
                               fields, args, api, resume,
                               session_file=None, path=None, log=None,
                               other_label=OTHER):
    """Generates a new dataset using a subset of categories of the original one

    """

    if args.max_categories < 1:
        sys.exit("--max-categories can only be a positive number.")
    datasets = []
    categories_splits = [distribution[i: i + args.max_categories] for i
                         in range(0, len(distribution), args.max_categories)]
    number_of_datasets = len(categories_splits)

    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets),
                                 number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        for i in range(len(datasets), number_of_datasets):
            split = categories_splits[i]
            category_selector = "(if (or"
            for element in split:
                category = element[0]
                category_selector += " (= v \"%s\")" % category
            category_selector += ") v \"%s\")" % other_label
            category_generator = "(let (v (f %s)) %s)" % (
                fields.objective_field, category_selector)
            try:
                dataset_args = {
                    "all_but": [fields.objective_field],
                    "new_fields": [
                        {"name": fields.field_name(fields.objective_field),
                         "field": category_generator,
                         "label": "max_categories: %s" % args.max_categories}],
                    "user_metadata":
                    {"max_categories": args.max_categories,
                     "other_label": other_label}}
            except ValueError as exc:
                sys.exit(exc)
            new_dataset = r.create_dataset(
                dataset, dataset_args, args, api=api, path=path,
                session_file=session_file, log=log, dataset_type="parts")
            new_dataset = bigml.api.check_resource(new_dataset,
                                                   api.get_dataset)
            datasets.append(new_dataset)
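
The selector built in the inner loop is a Flatline expression that keeps a category when it belongs to the current chunk and maps every other value to other_label. A standalone sketch that reproduces that string construction for a toy chunk and prints the result (the field id, categories, and label below are placeholders):

# Rebuild the Flatline selector generated above for one chunk of categories.
objective_field = "000003"  # hypothetical objective field id
other_label = "other"       # placeholder for the OTHER constant
split = [("Iris-setosa", 50), ("Iris-versicolor", 50)]  # (category, count) pairs

category_selector = "(if (or"
for element in split:
    category = element[0]
    category_selector += " (= v \"%s\")" % category
category_selector += ") v \"%s\")" % other_label
category_generator = "(let (v (f %s)) %s)" % (objective_field, category_selector)
print(category_generator)
# (let (v (f 000003)) (if (or (= v "Iris-setosa") (= v "Iris-versicolor")) v "other"))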
Example 7
def create_new_dataset(
    datasets,
    api,
    args,
    resume,
    name=None,
    description=None,
    fields=None,
    dataset_fields=None,
    objective_field=None,
    session_file=None,
    path=None,
    log=None,
):
    """Generates a new dataset using the generators given in a generators file
       or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets, debug=args.debug, suffix=suffix
        )
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n" % (len(datasets), number_of_datasets))
            u.log_message(message, log_file=session_file, console=args.verbosity)
    if not resume:
        dataset_args = r.set_dataset_args(
            name, description, args, fields, dataset_fields, objective_field=objective_field
        )
        if args.multi_dataset and args.multi_dataset_json:
            dataset_args.update(args.multi_dataset_json)
        else:
            dataset_args.update(args.dataset_json_generators)
        new_dataset = r.create_dataset(
            origin_resource,
            dataset_args,
            args,
            api=api,
            path=path,
            session_file=session_file,
            log=log,
            dataset_type=suffix,
        )
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
Example 8
def create_categories_datasets(dataset, distribution,
                               fields, args, api, resume,
                               session_file=None, path=None, log=None,
                               other_label=OTHER):
    """Generates a new dataset using a subset of categories of the original one

    """

    if args.max_categories < 1:
        sys.exit("--max-categories can only be a positive number.")
    datasets = []
    categories_splits = [distribution[i: i + args.max_categories] for i
                         in range(0, len(distribution), args.max_categories)]
    number_of_datasets = len(categories_splits)

    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets),
                                 number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        for i in range(len(datasets), number_of_datasets):
            split = categories_splits[i]
            category_selector = "(if (or"
            for element in split:
                category = element[0]
                category_selector += " (= v \"%s\")" % category
            category_selector += ") v \"%s\")" % other_label
            category_generator = "(let (v (f %s)) %s)" % (
                fields.objective_field, category_selector)
            try:
                dataset_args = {
                    "all_but": [fields.objective_field],
                    "new_fields": [
                        {"name": fields.field_name(fields.objective_field),
                         "field": category_generator,
                         "label": "max_categories: %s" % args.max_categories}],
                    "user_metadata":
                    {"max_categories": args.max_categories,
                     "other_label": other_label}}
            except ValueError as exc:
                sys.exit(exc)
            new_dataset = r.create_dataset(
                dataset, dataset_args, args, api=api, path=path,
                session_file=session_file, log=log, dataset_type="parts")
            new_dataset = bigml.api.check_resource(new_dataset,
                                                   api.get_dataset)
            datasets.append(new_dataset)
Example 9
def create_new_dataset(datasets, api, args, resume, name=None,
                       description=None, fields=None,
                       dataset_fields=None, objective_field=None,
                       session_file=None, path=None, log=None):
    """Generates a new dataset using the generators given in a generators file
       or a multi-dataset from a list of datasets

    """
    origin_resource = datasets
    if not isinstance(datasets, basestring) and args.multi_dataset:
        suffix = "multi"
    else:
        datasets = []
        suffix = "gen"
    number_of_datasets = 1
    if resume:
        resume, datasets = c.checkpoint(
            c.are_datasets_created, path, number_of_datasets,
            debug=args.debug, suffix=suffix)
        if not resume:
            message = u.dated("Found %s datasets out of %s. Resuming.\n"
                              % (len(datasets),
                                 number_of_datasets))
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
    if not resume:
        if args.multi_dataset:
            dataset_args = r.set_dataset_args(name, description, args,
                                              fields, dataset_fields,
                                              objective_field=objective_field)
            if args.multi_dataset_json:
                dataset_args.update(args.multi_dataset_json)
        else:
            dataset_args = {}
            dataset_args.update(args.dataset_json_generators)
            dataset_args.update(r.set_dataset_args(
                name, description, args, fields, dataset_fields,
                objective_field=objective_field))
        new_dataset = r.create_dataset(origin_resource, dataset_args,
                                       args,
                                       api=api, path=path,
                                       session_file=session_file,
                                       log=log, dataset_type=suffix)
    else:
        new_dataset = datasets[0]
    return new_dataset, resume
Example 10
def dataset_processing(source, training_set, test_set, model_ids, name,
                       description, fields, dataset_fields, api, args,
                       resume, csv_properties=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids and
            not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api,
                                   path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {
                'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)
    return dataset, resume, csv_properties, fields
Example 11
def dataset_processing(source,
                       api,
                       args,
                       resume,
                       fields=None,
                       csv_properties=None,
                       multi_label_data=None,
                       session_file=None,
                       path=None,
                       log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or
        (hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(c.is_dataset_created,
                                                path,
                                                debug=args.debug,
                                                message=message,
                                                log_file=session_file,
                                                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
         and not args.no_dataset)
            or (hasattr(args, "evaluate") and args.evaluate and args.test_set
                and not args.dataset)):
        dataset_args = r.set_dataset_args(args,
                                          fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            dataset_id = args.dataset_ids[i]
            if isinstance(dataset_id, dict) and "id" in dataset_id:
                dataset_id = dataset_id["id"]
            datasets.append(bigml.api.get_dataset_id(dataset_id))
        dataset = datasets[0]
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if ('object' in dataset and 'objective_field' in dataset['object']
                and 'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset['object']['objective_field'][
                'column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if
        #  the flag --dataset_attributes is used
        #  the --multi-label flag is used and there's an --objective-field
        #  the --max-categories flag is used and there's an --objective-field
        #  the --import-fields flag is used
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset,
                                       dataset_args,
                                       args,
                                       api=api,
                                       path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Example 12
def compute_output(api, args):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    """

    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_
            and (args.public_cluster or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to also specify a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs is set, the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(api, args, resume, source,
                                              csv_properties, fields,
                                              session_file, path, log)
    (_, datasets, test_dataset, resume, csv_properties,
     fields) = dataset_properties
    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file, csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets,
            clusters,
            cluster_ids,
            api,
            args,
            resume,
            fields=fields,
            session_file=session_file,
            path=path,
            log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster,
                                       api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
            (args.shared_flag and r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster,
                                           cluster_args,
                                           args,
                                           api=api,
                                           path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and (args.test_set or args.export_fields):
        if isinstance(cluster, dict):
            cluster = cluster['resource']
            cluster = u.check_resource(cluster,
                                       api.get_cluster,
                                       query_string=r.ALL_FIELDS_QS)
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when the --no-batch flag is set
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api,
                    args,
                    resume,
                    name=test_name,
                    session_file=session_file,
                    path=path,
                    log=log)
                (test_source, resume, csv_properties,
                 test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source,
                    "test",
                    dataset_args,
                    api,
                    args,
                    resume,
                    session_file=session_file,
                    path=path,
                    log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset, csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_centroid(cluster,
                            test_dataset,
                            batch_centroid_args,
                            args,
                            api,
                            resume,
                            prediction_file=output,
                            session_file=session_file,
                            path=path,
                            log=log)

        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {
            centroid['name']: centroid['id']
            for centroid in centroids_info
        }
        cluster_datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [
                centroids[cluster_name]
                for cluster_name in args.cluster_datasets_
                if cluster_datasets.get(centroids[cluster_name], '') == ''
            ]

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster,
                             dataset_args,
                             args,
                             api=api,
                             path=path,
                             session_file=session_file,
                             log=log,
                             dataset_type='cluster')

    if cluster and args.cluster_models is not None:
        cluster = api.check_resource(cluster)
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {
            centroid['name']: centroid['id']
            for centroid in centroids_info
        }
        models = cluster['object']['cluster_models']
        if args.cluster_models == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [
                centroids[cluster_name]
                for cluster_name in args.cluster_models_
                if models.get(centroids[cluster_name], '') == ''
            ]

        for centroid_id in centroid_ids:
            model_args = {'centroid': centroid_id}
            r.create_model(cluster,
                           model_args,
                           args,
                           api=api,
                           path=path,
                           session_file=session_file,
                           log=log,
                           model_type='cluster')

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path,
                            log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
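
The per-centroid dataset creation near the end of this example first maps centroid names to ids with a dict comprehension and then keeps only the centroids whose slot in cluster_datasets is still empty. A toy sketch of that filtering on a mocked cluster structure (the keys follow the example; the ids and names are made up):

# Mocked fragment of a cluster resource, using the keys accessed above.
cluster = {'object': {
    'clusters': {'clusters': [{'name': 'Cluster 0', 'id': '000000'},
                              {'name': 'Cluster 1', 'id': '000001'}]},
    'cluster_datasets': {'000000': 'dataset/abc123', '000001': ''}}}

centroids_info = cluster['object']['clusters']['clusters']
centroids = {centroid['name']: centroid['id'] for centroid in centroids_info}
cluster_datasets = cluster['object']['cluster_datasets']

requested = ['Cluster 0', 'Cluster 1']  # stands in for args.cluster_datasets_
centroid_ids = [centroids[name] for name in requested
                if cluster_datasets.get(centroids[name], '') == '']
print(centroid_ids)  # ['000001'] -- only Cluster 1 still lacks a dataset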
Example 13
def compute_output(api, args):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    """

    cluster = None
    clusters = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    cluster_ids = args.cluster_ids_
    output = args.predictions
    # there's only one cluster to be generated at present
    args.max_parallel_clusters = 1
    # clusters cannot be published yet.
    args.public_cluster = False

    # It is compulsory to have a description to publish either datasets or
    # clusters
    if (not args.description_ and (args.public_cluster or
                                   args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to also specify a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs is set, the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties
    if args.cluster_file:
        # cluster is retrieved from the contents of the given local JSON file
        cluster, csv_properties, fields = u.read_local_resource(
            args.cluster_file,
            csv_properties=csv_properties)
        clusters = [cluster]
        cluster_ids = [cluster['resource']]
    else:
        # cluster is retrieved from the remote object
        clusters, cluster_ids, resume = pc.clusters_processing(
            datasets, clusters, cluster_ids, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)
        if clusters:
            cluster = clusters[0]

    # We update the cluster's public state if needed
    if cluster:
        if isinstance(cluster, basestring):
            if args.cluster_datasets is None and not a.has_test(args):
                query_string = MINIMUM_MODEL
            else:
                query_string = ''
            cluster = u.check_resource(cluster, api.get_cluster,
                                       query_string=query_string)
        clusters[0] = cluster
        if (args.public_cluster or
                (args.shared_flag and r.shared_changed(args.shared, cluster))):
            cluster_args = {}
            if args.shared_flag and r.shared_changed(args.shared, cluster):
                cluster_args.update(shared=args.shared)
            if args.public_cluster:
                cluster_args.update(r.set_publish_cluster_args(args))
            if cluster_args:
                cluster = r.update_cluster(cluster, cluster_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                clusters[0] = cluster

    # We get the fields of the cluster if we haven't got
    # them yet and need them
    if cluster and args.test_set:
        fields = pc.get_cluster_fields(cluster, csv_properties, args)

    # If predicting
    if clusters and (a.has_test(args) or (test_dataset and args.remote)):
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote centroids: centroids are computed as batch centroids
        # in bigml.com except when the --no-batch flag is set
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_centroid_args = r.set_batch_centroid_args(
                args, fields=fields,
                dataset_fields=test_fields)

            remote_centroid(cluster, test_dataset, batch_centroid_args, args,
                            api, resume, prediction_file=output,
                            session_file=session_file, path=path, log=log)

        else:
            centroid(clusters, fields, args, session_file=session_file)

    if cluster and args.cluster_datasets is not None:
        centroids_info = cluster['object']['clusters']['clusters']
        centroids = {centroid['name']: centroid['id']
                     for centroid in centroids_info}
        datasets = cluster['object']['cluster_datasets']
        if args.cluster_datasets == '':
            centroid_ids = centroids.values()
        else:
            centroid_ids = [centroids[cluster_name] for cluster_name in
                            args.cluster_datasets_
                            if datasets[centroids[cluster_name]] == '']

        for centroid_id in centroid_ids:
            dataset_args = {'centroid': centroid_id}
            r.create_dataset(cluster, dataset_args, args, api=api, path=path,
                             session_file=session_file, log=log,
                             dataset_type='cluster')

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
Example 14
def split_processing(dataset,
                     name,
                     description,
                     api,
                     args,
                     resume,
                     session_file=None,
                     path=None,
                     log=None):
    """Splits a dataset into train and test datasets
    """
    train_dataset = None
    test_dataset = None
    sample_rate = 1 - args.test_split
    # if resuming, try to extract train dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, train_dataset = c.checkpoint(c.is_dataset_created,
                                             path,
                                             "_train",
                                             debug=args.debug,
                                             message=message,
                                             log_file=session_file,
                                             console=args.verbosity)

    if train_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - train (%s %%)" % (name, int(sample_rate * 100)),
            description,
            args,
            sample_rate,
            out_of_bag=False)
        train_dataset = r.create_dataset(dataset, dataset_split_args, args,
                                         api, path, session_file, log, "train")
        if train_dataset:
            train_dataset = r.get_dataset(train_dataset, api, args.verbosity,
                                          session_file)

    # if resuming, try to extract test dataset from log files
    if resume:
        message = u.dated("Dataset not found. Resuming.\n")
        resume, test_dataset = c.checkpoint(c.is_dataset_created,
                                            path,
                                            "_test",
                                            debug=args.debug,
                                            message=message,
                                            log_file=session_file,
                                            console=args.verbosity)

    if test_dataset is None:
        dataset_split_args = r.set_dataset_split_args(
            "%s - test (%s %%)" % (name, int(args.test_split * 100)),
            description,
            args,
            sample_rate,
            out_of_bag=True)
        test_dataset = r.create_dataset(dataset, dataset_split_args, args, api,
                                        path, session_file, log, "test")
        if test_dataset:
            test_dataset = r.get_dataset(test_dataset, api, args.verbosity,
                                         session_file)
    return train_dataset, test_dataset, resume
Example 15
def dataset_processing(source, training_set, test_set, fields, objective_field,
                       api, args, resume,  name=None, description=None,
                       dataset_fields=None, multi_label_data=None,
                       csv_properties=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not has_datasets(args) and not has_models(args)
         and not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields,
                                          objective_field=objective_field,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api,
                                   path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if ('object' in dataset and 'objective_field' in dataset['object'] and
            'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        new_objective = get_new_objective(fields, args.objective_field,
                                          dataset)

        if (new_objective is not None or args.dataset_attributes or
            r.shared_changed(args.shared, dataset)):
            dataset_args = r.set_dataset_args(name, description, args, fields,
                                              dataset_fields, objective_field)
            dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            csv_properties.update(objective_field=objective_field,
                                  objective_field_present=True)
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Example 16
def dataset_processing(
    source,
    training_set,
    test_set,
    fields,
    objective_field,
    api,
    args,
    resume,
    name=None,
    description=None,
    dataset_fields=None,
    multi_label_data=None,
    csv_properties=None,
    session_file=None,
    path=None,
    log=None,
):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if training_set or args.source or (args.evaluate and test_set):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created,
                path,
                debug=args.debug,
                message=message,
                log_file=session_file,
                console=args.verbosity,
            )

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set. Also
    # if evaluate is set and test_set has been provided.
    if (source and not has_datasets(args) and not has_models(args) and not args.no_dataset) or (
        args.evaluate and args.test_set and not args.dataset
    ):
        dataset_args = r.set_dataset_args(
            name,
            description,
            args,
            fields,
            dataset_fields,
            objective_field=objective_field,
            multi_label_data=multi_label_data,
        )
        dataset = r.create_dataset(source, dataset_args, args, api, path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if (
            "object" in dataset
            and "objective_field" in dataset["object"]
            and "column_number" in dataset["object"]["objective_field"]
        ):
            dataset_objective = dataset["object"]["objective_field"]["column_number"]
            csv_properties.update(objective_field=dataset_objective, objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        new_objective = get_new_objective(fields, args.objective_field, dataset)

        if (
            new_objective is not None
            or args.dataset_attributes
            or (args.shared_flag and r.shared_changed(args.shared, dataset))
        ):
            dataset_args = r.set_dataset_args(name, description, args, fields, dataset_fields, objective_field)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args, api=api, path=path, session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            csv_properties.update(objective_field=objective_field, objective_field_present=True)
            fields = Fields(dataset["object"]["fields"], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Example 17
def dataset_processing(source, api, args, resume,
                       fields=None,
                       csv_properties=None,
                       multi_label_data=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or (
            hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
         and not args.no_dataset) or
            (hasattr(args, "evaluate") and args.evaluate and
             args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(args, fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api,
                                   path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if
        #  the flag --dataset_attributes is used
        #  the --multi-label flag is used and there's an --objective-field
        #  the --max-categories flag is used and there's an --objective-field
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Example 18
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None,
                   description=None,
                   field_attributes=None,
                   types=None,
                   dataset_fields=None,
                   model_fields=None,
                   name=None, training_set_header=True,
                   test_set_header=True, model_ids=None,
                   votes_files=None, resume=False, fields_map=None):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and
            (args.black_box or args.white_box or args.public_dataset)):
        raise Exception("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                open(log, 'w', 0).close()
            except IOError:
                pass

    # Starting source processing

    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from log files
        if resume:
            resume, args.source = u.checkpoint(u.is_source_created, path,
                                               debug=args.debug)
            if not resume:
                message = u.dated("Source not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)

    # If no previous source, dataset or model is provided, we create a new
    # one. Also, if --evaluate and test data are provided, we create a new
    # dataset to test with.
    data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                 training_set_header,
                                                 test_set_header, args)
    if data_set is not None:
        source_args = r.set_source_args(data_set_header, name, description,
                                        args)
        source = r.create_source(data_set, source_args, args, api,
                                 path, session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']

        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity,
                                            session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)

    # End of source processing

    # Starting dataset processing

    if (training_set or args.source or (args.evaluate and test_set)):
        # If resuming, try to extract args.dataset from log files
        if resume:
            resume, args.dataset = u.checkpoint(u.is_dataset_created, path,
                                                debug=args.debug)
            if not resume:
                message = u.dated("Dataset not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
    # If we have a source but no dataset or model has been provided, we
    # create a new dataset unless the no_dataset option is set. We do the
    # same if evaluate is set and a test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids and
            not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api,
                                   path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields
    # if we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {
                'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)

    # end of dataset processing

    # start of model processing

    # If we have a dataset but not a model, we create the model unless the
    # no_model flag has been set.
    if (dataset and not args.model and not model_ids and not args.no_model):
        model_ids = []
        models = []
        if resume:
            resume, model_ids = u.checkpoint(u.are_models_created, path,
                                             args.number_of_models,
                                             debug=args.debug)
            if not resume:
                message = u.dated("Found %s models out of %s. Resuming.\n" %
                                  (len(model_ids),
                                   args.number_of_models))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
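            # Reuse the model ids already found in the logs and only create
            # the remaining ones.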
            models = model_ids
            args.number_of_models -= len(model_ids)

        model_args = r.set_model_args(name, description, args,
                                      objective_field, fields, model_fields)
        models, model_ids = r.create_models(dataset, models,
                                            model_args, args, api,
                                            path, session_file, log)
        model = models[0]
    # If a model is provided, we use it.
    elif args.model:
        model = args.model
        model_ids = [model]
        models = [model]

    elif args.models or args.model_tag:
        models = model_ids[:]
        model = models[0]

    # If we are going to predict we must retrieve the models
    if model_ids and test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)
        model = models[0]

    # We get the model's fields if we don't have them yet and update its
    # public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        if not csv_properties:
            csv_properties = {}
        csv_properties.update(verbose=True)
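        # Fall back to the model's stored locale when no user locale is given.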
        if args.user_locale is None:
            args.user_locale = model['object'].get('locale', None)
        csv_properties.update(data_locale=args.user_locale)
        if 'model_fields' in model['object']['model']:
            model_fields = model['object']['model']['model_fields'].keys()
            csv_properties.update(include=model_fields)
        if 'missing_tokens' in model['object']['model']:
            missing_tokens = model['object']['model']['missing_tokens']
        else:
            missing_tokens = MISSING_TOKENS
        csv_properties.update(missing_tokens=missing_tokens)
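        # objective_fields may be a list; keep only its first element.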
        objective_field = models[0]['object']['objective_fields']
        if isinstance(objective_field, list):
            objective_field = objective_field[0]
        csv_properties.update(objective_field=objective_field)
        fields = Fields(model['object']['model']['fields'], **csv_properties)

    # end of model processing

    # If predicting
    if models and test_set and not args.evaluate:
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args.remote, api, log,
                args.max_batch_models, args.method, resume, args.tag,
                args.verbosity, session_file, args.debug)

    # When the combine_votes flag is used, retrieve the prediction files saved
    # in the comma-separated list of directories and combine them
    if votes_files:
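        # The votes file name embeds the model id with "_" instead of "/"
        # (e.g. "model_0123456789abcdef01234567__predictions.csv" maps to
        # "model/0123456789abcdef01234567"); rebuild the id before fetching
        # the model.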
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = api.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id, str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.combine_votes(votes_files, local_model.to_prediction,
                        output, args.method)