Example #1
def test_source_processing(api, args, resume,
                           name=None, csv_properties=None,
                           session_file=None, path=None, log=None):
    """Creating or retrieving a test data source from input arguments

    """
    test_source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if args.test_set and args.remote:
        # If resuming, try to extract args.source from log files
        if resume:
            message = u.dated("Test source not found. Resuming.\n")
            resume, args.test_source = c.checkpoint(
                c.is_source_created, path, suffix="_test", debug=args.debug,
                message=message, log_file=session_file, console=args.verbosity)

        if not resume:
            source_args = r.set_source_args(args, name=name,
                                            data_set_header=args.test_header)
            test_source = r.create_source(args.test_set, source_args, args,
                                          api, path, session_file, log,
                                          source_type="test")

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.test_source:
        test_source = bigml.api.get_source_id(args.test_source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if test_source:
        test_source = r.get_source(test_source, api, args.verbosity,
                                   session_file)
        if 'source_parser' in test_source['object']:
            source_parser = test_source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
                if (args.user_locale is not None and
                        bigml_locale(args.user_locale) ==
                        source_parser['locale']):
                    args.user_locale = None

        fields = Fields(test_source['object']['fields'], **csv_properties)

        if (args.field_attributes_ or args.types_ or args.user_locale
                or args.json_args.get('source')):
            # avoid updating project_id in source
            project_id, args.project_id = args.project_id, None
            test_source_args = r.set_source_args(args, fields=fields)
            test_source = r.update_source(test_source, test_source_args, args,
                                          api, session_file)
            args.project_id = project_id
            fields = Fields(test_source['object']['fields'], **csv_properties)

    return test_source, resume, csv_properties, fields
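
The `if test_source:` branch boils down to copying parser metadata from the
source into `csv_properties`. A minimal, runnable sketch of just that mapping,
using a plain dict in place of the real source resource (the helper name is
illustrative, not bigmler's):

def csv_properties_from_parser(source_parser):
    """Mirror the missing_tokens/locale handling in the example above."""
    csv_properties = {}
    if 'missing_tokens' in source_parser:
        csv_properties['missing_tokens'] = source_parser['missing_tokens']
    if 'locale' in source_parser:
        csv_properties['data_locale'] = source_parser['locale']
    return csv_properties

print(csv_properties_from_parser(
    {'missing_tokens': ['', 'N/A'], 'locale': 'en_US'}))
# {'missing_tokens': ['', 'N/A'], 'data_locale': 'en_US'}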
Example #2
def source_processing(training_set, test_set, training_set_header,
                      test_set_header, api, args, resume,
                      name=None, description=None,
                      csv_properties=None, field_attributes=None, types=None,
                      session_file=None, path=None, log=None):
    """Creating or retrieving a data source from input arguments

    """
    source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from log files

        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(
                c.is_source_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If neither a previous source, dataset, nor model is provided,
    # we create a new one. Also, if --evaluate and test data are provided,
    # we create a new source to test with.
    data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                 training_set_header,
                                                 test_set_header, args)
    if data_set is not None:
        source_args = r.set_source_args(data_set_header, name, description,
                                        args)
        source = r.create_source(data_set, source_args, args, api,
                                 path, session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']

        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity,
                                            session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)
        if field_attributes or types:
            fields = Fields(source['object']['fields'], **csv_properties)

    return source, resume, csv_properties, fields
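
The resume branch hinges on `c.checkpoint` with the `c.is_source_created`
predicate, which looks for an already-created source id among the files
bigmler writes under `path`. A rough, hypothetical sketch of that lookup (the
real checkpoint module differs in detail):

import os
import re

def is_source_created(path, suffix=""):
    """Hypothetical: return (found, source_id) read from a path/source file."""
    log_file = os.path.join(path, "source%s" % suffix)
    if os.path.isfile(log_file):
        with open(log_file) as handler:
            candidate = handler.readline().strip()
            if re.match(r"source/[a-f0-9]{24}$", candidate):
                return True, candidate
    return False, None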
Example #3
def source_processing(
    api, args, resume, csv_properties=None, multi_label_data=None, session_file=None, path=None, log=None
):
    """Creating or retrieving a data source from input arguments

    """
    source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if args.training_set or (hasattr(args, "evaluate") and args.evaluate and args.test_set):
        # If resuming, try to extract args.source from log files

        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(
                c.is_source_created,
                path,
                debug=args.debug,
                message=message,
                log_file=session_file,
                console=args.verbosity,
            )

    # If neither a previous source, dataset, nor model is provided,
    # we create a new one. Also, if --evaluate and test data are provided,
    # we create a new source to test with.
    data_set, data_set_header = r.data_to_source(args)
    if data_set is not None:
        source_args = r.set_source_args(args, multi_label_data=multi_label_data, data_set_header=data_set_header)
        source = r.create_source(data_set, source_args, args, api, path, session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if "source_parser" in source["object"]:
            source_parser = source["object"]["source_parser"]
            if "missing_tokens" in source_parser:
                csv_properties["missing_tokens"] = source_parser["missing_tokens"]
            if "locale" in source_parser:
                csv_properties["data_locale"] = source_parser["locale"]
                # No changes if user locale is the one in the source.
                if args.user_locale is not None and bigml_locale(args.user_locale) == source_parser["locale"]:
                    args.user_locale = None
        fields = Fields(source["object"]["fields"], **csv_properties)

        if args.field_attributes_ or args.types_ or args.user_locale or args.json_args.get("source"):
            source_args = r.set_source_args(args, fields=fields)
            source = r.update_source(source, source_args, args, api, session_file)
            fields = Fields(source["object"]["fields"], **csv_properties)

    return source, resume, csv_properties, fields
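
`bigml_locale` normalizes the user-supplied locale before it is compared with
the one stored in the source; when they match, `args.user_locale` is cleared
so no redundant update is sent. The same check in isolation, with a stand-in
normalizer (the real function ships with the bigml bindings):

def normalize_locale(locale_name):
    """Stand-in normalizer: 'en-us' -> 'en_US'."""
    language, _, territory = locale_name.replace("-", "_").partition("_")
    language = language.lower()
    return "%s_%s" % (language, territory.upper()) if territory else language

user_locale = "en-us"
source_locale = "en_US"
if user_locale is not None and normalize_locale(user_locale) == source_locale:
    user_locale = None  # mirrors args.user_locale = None above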
Example #4
def test_source_processing(api,
                           args,
                           resume,
                           name=None,
                           csv_properties=None,
                           session_file=None,
                           path=None,
                           log=None):
    """Creating or retrieving a test data source from input arguments

    """
    test_source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if args.test_set and args.remote:
        # If resuming, try to extract args.source from log files
        if resume:
            message = u.dated("Test source not found. Resuming.\n")
            resume, args.test_source = c.checkpoint(c.is_source_created,
                                                    path,
                                                    suffix="_test",
                                                    debug=args.debug,
                                                    message=message,
                                                    log_file=session_file,
                                                    console=args.verbosity)

        if not resume:
            source_args = r.set_source_args(args,
                                            name=name,
                                            data_set_header=args.test_header)
            test_source = r.create_source(args.test_set,
                                          source_args,
                                          args,
                                          api,
                                          path,
                                          session_file,
                                          log,
                                          source_type="test")

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.test_source:
        test_source = bigml.api.get_source_id(args.test_source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if test_source:
        test_source = r.get_source(test_source, api, args.verbosity,
                                   session_file)
        if 'source_parser' in test_source['object']:
            source_parser = test_source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
                if (args.user_locale is not None and bigml_locale(
                        args.user_locale) == source_parser['locale']):
                    args.user_locale = None

        fields = Fields(test_source['object']['fields'], **csv_properties)

        if (args.field_attributes_ or args.types_ or args.user_locale
                or args.json_args.get('source')):
            # avoid updating project_id in source
            project_id, args.project_id = args.project_id, None
            test_source_args = r.set_source_args(args, fields=fields)
            test_source = r.update_source(test_source, test_source_args,
                                          args, api, session_file)
            args.project_id = project_id
            fields = Fields(test_source['object']['fields'], **csv_properties)

    return test_source, resume, csv_properties, fields
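
The `project_id, args.project_id = args.project_id, None` idiom saves and
blanks the project in a single tuple assignment so that `set_source_args`
leaves it out of the update payload, then restores it after the call. The
pattern in isolation, with a hypothetical namespace standing in for `args`:

from types import SimpleNamespace

args = SimpleNamespace(project_id="project/0123456789abcdef01234567")

project_id, args.project_id = args.project_id, None  # save and blank
assert args.project_id is None  # payloads built now omit the project
args.project_id = project_id  # restore once the update is done
assert args.project_id == "project/0123456789abcdef01234567"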
Example #5
def source_processing(api,
                      args,
                      resume,
                      csv_properties=None,
                      multi_label_data=None,
                      session_file=None,
                      path=None,
                      log=None):
    """Creating or retrieving a data source from input arguments

    """
    source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if (args.training_set or
        (hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # If resuming, try to extract args.source from log files

        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(c.is_source_created,
                                               path,
                                               debug=args.debug,
                                               message=message,
                                               log_file=session_file,
                                               console=args.verbosity)

    # If neither a previous source, dataset, nor model is provided,
    # we create a new one. Also, if --evaluate and test data are provided,
    # we create a new source to test with.
    data_set, data_set_header = r.data_to_source(args)
    if data_set is not None:
        # Check if there's a created project for it
        args.project_id = pp.project_processing(api,
                                                args,
                                                resume,
                                                session_file=session_file,
                                                path=path,
                                                log=log)
        source_args = r.set_source_args(args,
                                        multi_label_data=multi_label_data,
                                        data_set_header=data_set_header)
        source = r.create_source(data_set, source_args, args, api, path,
                                 session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
                # No changes if user locale is the one in the source.
                if (args.user_locale is not None and bigml_locale(
                        args.user_locale) == source_parser['locale']):
                    args.user_locale = None
        fields = Fields(source['object']['fields'], **csv_properties)

        if (args.field_attributes_ or args.types_ or args.user_locale
                or args.json_args.get('source')):
            # avoid updating project_id in source
            project_id, args.project_id = args.project_id, None
            source_args = r.set_source_args(args, fields=fields)
            source = r.update_source(source, source_args, args, api,
                                     session_file)
            args.project_id = project_id
            fields = Fields(source['object']['fields'], **csv_properties)

    return source, resume, csv_properties, fields
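
Unlike the earlier variants, this one first resolves a project through
`pp.project_processing` so the new source is created inside it. A hypothetical
reduction of that step, assuming the project name travels in `args.project`
(the real helper also handles resume checkpoints, session logging, and more):

def project_processing(api, args):
    """Hypothetical: reuse args.project_id or create a project by name."""
    if getattr(args, "project_id", None):
        return args.project_id
    project = api.create_project({"name": args.project})
    return project["resource"]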
Example #6
def source_processing(training_set,
                      test_set,
                      training_set_header,
                      test_set_header,
                      name,
                      description,
                      api,
                      args,
                      resume,
                      csv_properties=None,
                      field_attributes=None,
                      types=None,
                      session_file=None,
                      path=None,
                      log=None):
    """Creating or retrieving a data source from input arguments

    """
    source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from log files

        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(c.is_source_created,
                                               path,
                                               debug=args.debug,
                                               message=message,
                                               log_file=session_file,
                                               console=args.verbosity)

    # If neither a previous source, dataset, nor model is provided,
    # we create a new one. Also, if --evaluate and test data are provided,
    # we create a new source to test with.
    data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                 training_set_header,
                                                 test_set_header, args)
    if data_set is not None:
        source_args = r.set_source_args(data_set_header, name, description,
                                        args)
        source = r.create_source(data_set, source_args, args, api, path,
                                 session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']

        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity, session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)
    return source, resume, csv_properties, fields
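
Both `update_source_fields` calls follow the same shape: per-column changes
(display attributes or optypes) are mapped to field ids through `fields` and
sent as a source update. A compressed, illustrative sketch of that payload
building, assuming a `field_id` lookup that resolves a column number to its
field id:

def fields_update_body(changes, fields):
    """Build the {'fields': {field_id: attributes}} update payload."""
    return {"fields": {fields.field_id(column): attributes
                       for column, attributes in changes.items()}}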
Example #7
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None,
                   description=None,
                   field_attributes=None,
                   types=None,
                   dataset_fields=None,
                   model_fields=None,
                   name=None, training_set_header=True,
                   test_set_header=True, model_ids=None,
                   votes_files=None, resume=False, fields_map=None):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and
            (args.black_box or args.white_box or args.public_dataset)):
        raise Exception("You should provide a description to publish.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                open(log, 'w').close()
            except IOError:
                pass

    # Starting source processing

    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from log files
        if resume:
            resume, args.source = u.checkpoint(u.is_source_created, path,
                                               debug=args.debug)
            if not resume:
                message = u.dated("Source not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)

    # If neither a previous source, dataset, nor model is provided,
    # we create a new one. Also, if --evaluate and test data are provided,
    # we create a new source to test with.
    data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                 training_set_header,
                                                 test_set_header, args)
    if data_set is not None:
        source_args = r.set_source_args(data_set_header, name, description,
                                        args)
        source = r.create_source(data_set, source_args, args, api,
                                 path, session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']

        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity,
                                            session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)

    # End of source processing

    # Starting dataset processing

    if (training_set or args.source or (args.evaluate and test_set)):
        # If resuming, try to extract args.dataset from log files
        if resume:
            resume, args.dataset = u.checkpoint(u.is_dataset_created, path,
                                                debug=args.debug)
            if not resume:
                message = u.dated("Dataset not found. Resuming.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
    # If we have a source but no dataset or model has been provided, we
    # create a new dataset unless the no_dataset option is set. Also if
    # evaluate is set and test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids and
            not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api,
                                   path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we didn't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {
                'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)

    # End of dataset processing

    # Starting model processing

    # If we have a dataset but not a model, we create the model if the
    # no_model flag hasn't been set.
    if (dataset and not args.model and not model_ids and not args.no_model):
        model_ids = []
        models = []
        if resume:
            resume, model_ids = u.checkpoint(u.are_models_created, path,
                                             args.number_of_models,
                                             debug=args.debug)
            if not resume:
                message = u.dated("Found %s models out of %s. Resuming.\n" %
                                  (len(model_ids),
                                   args.number_of_models))
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            models = model_ids
            args.number_of_models -= len(model_ids)

        model_args = r.set_model_args(name, description, args,
                                      objective_field, fields, model_fields)
        models, model_ids = r.create_models(dataset, models,
                                            model_args, args, api,
                                            path, session_file, log)
        model = models[0]
    # If a model is provided, we use it.
    elif args.model:
        model = args.model
        model_ids = [model]
        models = [model]

    elif args.models or args.model_tag:
        models = model_ids[:]
        model = models[0]

    # If we are going to predict we must retrieve the models
    if model_ids and test_set and not args.evaluate:
        models, model_ids = r.get_models(models, args, api, session_file)
        model = models[0]

    # We get the fields of the model if we haven't got
    # them yet and update its public state if needed
    if model and not args.evaluate and (test_set or args.black_box
                                        or args.white_box):
        if args.black_box or args.white_box:
            model = r.publish_model(model, args, api, session_file)
            models[0] = model
        if not csv_properties:
            csv_properties = {}
        csv_properties.update(verbose=True)
        if args.user_locale is None:
            args.user_locale = model['object'].get('locale', None)
        csv_properties.update(data_locale=args.user_locale)
        if 'model_fields' in model['object']['model']:
            model_fields = model['object']['model']['model_fields'].keys()
            csv_properties.update(include=model_fields)
        if 'missing_tokens' in model['object']['model']:
            missing_tokens = model['object']['model']['missing_tokens']
        else:
            missing_tokens = MISSING_TOKENS
        csv_properties.update(missing_tokens=missing_tokens)
        objective_field = models[0]['object']['objective_fields']
        if isinstance(objective_field, list):
            objective_field = objective_field[0]
        csv_properties.update(objective_field=objective_field)
        fields = Fields(model['object']['model']['fields'], **csv_properties)

    # end of model processing

    # If predicting
    if models and test_set and not args.evaluate:
        predict(test_set, test_set_header, models, fields, output,
                objective_field, args.remote, api, log,
                args.max_batch_models, args.method, resume, args.tag,
                args.verbosity, session_file, args.debug)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = api.check_resource(model_id, api.get_model)
        except ValueError as exception:
            sys.exit("Failed to get model %s: %s" % (model_id, str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        u.combine_votes(votes_files, local_model.to_prediction,
                        output, args.method)
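
The `votes_files` branch recovers the model id from a predictions file name:
the regex keeps the `model_<24-hex>` token and `replace` turns its underscore
back into the `model/<hash>` resource id. The same transformation in
isolation:

import re

votes_file = "dir1/model_0123456789abcdef01234567__predictions.csv"
model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                  r'\1', votes_file).replace("_", "/")
print(model_id)  # model/0123456789abcdef01234567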