Example 1
def get_input_fields(resource, referrer=None):
    """New list of input fields

    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        referrer_input_fields = [[]]
        # compare fields by name
        resource_fields = Fields({
            'resource': resource['resource'],
            'object': resource
        })
        referrer_fields = Fields({
            'resource': referrer['resource'],
            'object': referrer
        })
        input_fields = [
            resource_fields.field_name(field_id)
            for field_id in input_fields_ids
        ]
        input_fields = sorted(input_fields)
        referrer_type = get_resource_type(referrer)
        if referrer_type == 'dataset':
            referrer_fields = Fields(referrer_fields.preferred_fields())
            referrer_fields_names = sorted( \
                [field['name'] for _, field in referrer_fields.fields.items()])
        else:
            referrer_fields_names = sorted( \
                referrer_fields.fields_by_name.keys())
        # check referrer input fields to see if they are equal
        referrer_input_fields.append(referrer_fields_names)
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            referrer_objective = resource_fields.field_name(objective_id)
            referrer_input_fields.append([
                name for name in referrer_fields_names
                if name != referrer_objective
            ])
        if input_fields in referrer_input_fields:
            return []
    return input_fields_ids
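
A plain-Python illustration (hypothetical field names, no BigML calls) of the membership check above: the resource needs no explicit input_fields when its sorted field names match one of the referrer's candidate lists.

# Hypothetical field names, for illustration only.
input_fields = sorted(["petal length", "petal width", "sepal length"])
referrer_input_fields = [
    [],                                                       # empty default
    sorted(["petal length", "petal width",
            "sepal length", "species"]),                      # all fields
    sorted(["petal length", "petal width", "sepal length"])   # minus objective
]
assert input_fields in referrer_input_fields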
Example 2
def pca_processing(datasets, pca, \
    pca_ids, api, args, resume, fields=None, \
    session_file=None, path=None, log=None):
    """Creates or retrieves pca from the input data

    """

    # If we have a dataset but no PCA yet, we create one unless the no_pca
    # flag has been set.
    if datasets and not (has_pca(args) or \
            args.no_pca):
        pca_ids = []
        pcas = []

        # Only 1 pca per bigmler command at present
        number_of_pcas = 1
        if resume:
            resume, pca_ids = c.checkpoint( \
                c.are_pcas_created, path, \
                number_of_pcas, debug=args.debug)
            if not resume:
                message = u.dated("Found %s pcas out of %s."
                                  " Resuming.\n" %
                                  (len(pca_ids), number_of_pcas))
                u.log_message(message,
                              log_file=session_file,
                              console=args.verbosity)

            pcas = pca_ids
            number_of_pcas -= len(pca_ids)

        args.exclude_fields = []
        if args.exclude_objective:
            dataset = datasets[0]
            fields = Fields(dataset)
            objective_id = \
                fields.fields_by_column_number[fields.objective_field]
            args.exclude_fields = [objective_id]
        pca_args = r.set_pca_args( \
            args, fields=fields, \
            pca_fields=args.pca_fields_)
        pca = \
            r.create_pca( \
            datasets, pca, pca_args, \
            args, api, path, session_file, log)

    # If a pca is provided, we use it.
    elif args.pca:
        pca_ids = [args.pca]
        pca = pca_ids[0]

    elif args.pca or args.pca_tag:
        pca = pca_ids[0]

    # If we are going to create projections, we must retrieve the pca
    if pca_ids and (args.test_set or args.export_fields):
        pca = \
            r.get_pca(pca, args, api, session_file)

    return pca, resume
Example 3
def read_local_resource(path, csv_properties=None):
    """Read the JSON resource structure information from the given file.

    """
    resource = empty_resource()
    if csv_properties is None:
        csv_properties = {}
    fields = None
    open_mode = "rt" if PYTHON3 else "rb"
    with open(path, open_mode) as resource_file:
        try:
            resource = json.loads(resource_file.read())
        except (IOError, ValueError):
            # a missing or malformed JSON file leaves the empty resource
            pass
    resource_id = resource.get('resource')
    if resource_id is None:
        sys.exit("Failed to extract a BigML resource structure from the"
                 " contents of file %s." % path)
    if resource.get('object') is None:
        resource = {
            'resource': resource_id,
            'object': resource,
            'error': None,
            'code': bigml.api.HTTP_OK
        }
    fields, resource_locale, missing_tokens = get_fields_structure(resource)
    if missing_tokens:
        csv_properties['missing_tokens'] = missing_tokens
    if resource_locale:
        csv_properties['data_locale'] = resource_locale
    if fields:
        fields = Fields(resource, **csv_properties)
    return resource, csv_properties, fields
Example 4
def get_model_fields(model, model_fields, csv_properties, args):
    """Retrieves fields info from model resource

    """
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = model['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    if 'model_fields' in model['object']['model']:
        model_fields = model['object']['model']['model_fields'].keys()
        csv_properties.update(include=model_fields)
    if 'missing_tokens' in model['object']['model']:
        missing_tokens = model['object']['model']['missing_tokens']
    else:
        missing_tokens = MISSING_TOKENS
    csv_properties.update(missing_tokens=missing_tokens)
    objective_field = model['object']['objective_fields']
    if isinstance(objective_field, list):
        objective_field = objective_field[0]
    csv_properties.update(objective_field=objective_field)
    fields = Fields(model['object']['model']['fields'], **csv_properties)

    return fields, objective_field
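
For reference, a hypothetical csv_properties dict as assembled by the function above (the locale, included field ids and missing tokens are made-up values); these keyword arguments are simply forwarded to Fields:

csv_properties = {
    "verbose": True,
    "data_locale": "en_US",                      # assumed user/model locale
    "include": ["000000", "000001", "000002"],   # hypothetical model_fields ids
    "missing_tokens": ["", "NaN", "NULL"],       # illustrative token list
    "objective_field": "000004"                  # hypothetical objective id
}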
Example 5
def get_input_fields(resource, referrer=None):
    """New list of input fields

    """
    if referrer is None:
        referrer = {}
    input_fields_ids = resource.get('input_fields', [])
    if referrer:
        referrer_fields = Fields({
            'resource': referrer['resource'],
            'object': referrer
        })
        referrer_fields_ids = referrer_fields.fields.keys()
        # case where objective field is not in input fields
        # check whether the resource has an objective field not included in
        # the input fields list
        resource_type = get_resource_type(resource)
        if resource_type == 'model':
            objective_id = resource.get('objective_field')
            try:
                objective_id = objective_id.get('id')
            except AttributeError:
                pass
            if objective_id not in input_fields_ids:
                input_fields_ids.append(objective_id)
        if sorted(input_fields_ids) == sorted(referrer_fields_ids):
            return []
    return input_fields_ids
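
A quick sketch of why the comparison above uses sorted() rather than list.sort():

a = ["000001", "000000"]
b = ["000002", "000000"]
# list.sort() sorts in place and returns None, so `a.sort() == b.sort()`
# compares None == None and is True even for different lists.
assert a.sort() == b.sort()
# sorted() returns new lists, so the comparison is meaningful.
assert sorted(a) != sorted(b)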
Example 6
def get_fields_changes(resource,
                       referrer=None,
                       updatable_attrs=DEFAULT_UPDATABLE):
    """Changed field attributes

    """
    if referrer is None:
        referrer = {}
    fields_attributes = {}

    resource_fields = Fields({
        'resource': resource['resource'],
        'object': resource
    }).fields
    resource_type = get_resource_type(resource)
    # for sources, extract all the updatable attributes
    if resource_type == 'source':
        updatable_attrs = SOURCE_UPDATABLE
        for field_id in resource_fields.keys():
            field_opts = {}
            field = resource_fields[field_id]
            for attribute in updatable_attrs:
                if field.get(attribute):
                    field_opts.update({attribute: field[attribute]})
            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
        return fields_attributes
    # for the rest of resources, check which attributes changed
    if referrer:
        referrer_fields = Fields({
            'resource': referrer['resource'],
            'object': referrer
        }).fields
        for field_id in resource_fields.keys():
            field_opts = {}
            if field_id not in referrer_fields:
                continue
            field = resource_fields[field_id]

            for attribute in updatable_attrs:
                ref_values = ["", referrer_fields[field_id].get(attribute, "")]
                if not field.get(attribute, "") in ref_values:
                    field_opts.update({attribute: field[attribute]})

            if field_opts != {}:
                fields_attributes.update({field_id: field_opts})
    return fields_attributes
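
A minimal sketch of the per-attribute diff above, using hypothetical field settings: an attribute is reported only when the resource value is non-empty and differs from the referrer's value.

referrer_field = {"name": "age", "label": "", "description": ""}
resource_field = {"name": "age", "label": "Age (years)", "description": ""}

field_opts = {}
for attribute in ["name", "label", "description"]:
    ref_values = ["", referrer_field.get(attribute, "")]
    if resource_field.get(attribute, "") not in ref_values:
        field_opts[attribute] = resource_field[attribute]

assert field_opts == {"label": "Age (years)"}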
Example 7
def update_with_summary_file(step, resource, summary_file):
    """Updates the resource fields with the changes in the summary file

    """
    fields = Fields(resource)
    changes = fields.filter_fields_update( \
        fields.new_fields_structure(res_filename(summary_file)))
    resource_type = get_resource_type(resource)
    resource = world.api.updaters[resource_type](resource, changes)
    world.api.ok(resource)
    setattr(world, resource_type, resource)
Example 8
def get_fields_structure(resource, csv_properties):
    """Builds a Fields object from the fields information in the resource

    """
    if not csv_properties and 'locale' in resource['object']:
        csv_properties = {'data_locale': resource['object']['locale']}
    fields = Fields(resource['object']['fields'], **csv_properties)
    return fields
Example 9
def fusion_processing(fusion, \
    fusion_ids, api, args, resume, fields=None, \
    session_file=None, path=None, log=None):
    """Creates or retrieves fusion from the input data

    """

    # If we have a models' list but no fusion, we create the fusion.
    if args.fusion_models_ is not None and not has_fusion(args):
        fusion_ids = []

        # Only 1 fusion per bigmler command at present
        number_of_fusions = 1
        if resume:
            resume, fusion_ids = c.checkpoint( \
                c.are_fusions_created, path, \
                number_of_fusions, debug=args.debug)
            if not resume:
                message = u.dated("Found %s fusions out of %s."
                                  " Resuming.\n" %
                                  (len(fusion_ids), number_of_fusions))
                u.log_message(message,
                              log_file=session_file,
                              console=args.verbosity)

            fusion = fusion_ids[0]
            first_model_id = api.get_fusion(fusion)[ \
                "object"]["fusion"]["models"][0]["id"]
            first_model_kind = api.get_fusion(fusion)[ \
                "object"]["fusion"]["models"][0]["kind"]
            first_model = api.getters[first_model_kind](first_model_id)
            fields = Fields(first_model)
            number_of_fusions -= len(fusion_ids)

        fusion_args = r.set_fusion_args( \
            args, fields)
        fusion = \
            r.create_fusion( \
            args.fusion_models_, fusion, fusion_args, \
            args, api, path, session_file, log)

    # If a fusion is provided, we use it.
    elif args.fusion:
        fusion_ids = [args.fusion]
        fusion = fusion_ids[0]

    elif args.fusion or args.fusion_tag:
        fusion = fusion_ids[0]

    # If we are going to create predictions, we must retrieve the fusion
    if fusion_ids and args.test_set:
        fusion = \
            r.get_fusion(fusion, args, api, session_file)
        args.objective_field = fusion['object']['objective_field_name']

    return fusion, resume
Example 10
def dataset_processing(source,
                       training_set,
                       test_set,
                       model_ids,
                       name,
                       description,
                       fields,
                       dataset_fields,
                       api,
                       args,
                       resume,
                       csv_properties=None,
                       session_file=None,
                       path=None,
                       log=None):
    """Creating or retrieving dataset from input arguments

    """
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(c.is_dataset_created,
                                                path,
                                                debug=args.debug,
                                                message=message,
                                                log_file=session_file,
                                                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.dataset and not args.model and not model_ids
         and not args.no_dataset)
            or (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
        if not csv_properties and 'locale' in dataset['object']:
            csv_properties = {'data_locale': dataset['object']['locale']}
        fields = Fields(dataset['object']['fields'], **csv_properties)
        if args.public_dataset:
            r.publish_dataset(dataset, api, args, session_file)
    return dataset, resume, csv_properties, fields
Example 11
def get_fusion_fields(fusion, csv_properties, args):
    """Retrieves fields info from Fusion resource

    """
    args.retrieve_api_.ok(fusion)
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(fusion['object']['fusion']['fields'], \
        **csv_properties)
Example 12
def get_topic_model_fields(topic_model, csv_properties, args):
    """Retrieves fields info from topic model resource

    """
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = topic_model['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(topic_model, **csv_properties)
Example 13
def get_cluster_fields(cluster, csv_properties, args):
    """Retrieves fields info from cluster resource

    """
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = cluster['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(cluster['object']['clusters']['fields'], **csv_properties)
Example 14
def get_model_fields(model,
                     csv_properties,
                     args,
                     single_model=True,
                     multi_label_data=None,
                     local_ensemble=None):
    """Retrieves fields info from model resource

    """
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = model['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    if single_model and 'model_fields' in model['object']['model']:
        model_fields = model['object']['model']['model_fields'].keys()
        csv_properties.update(include=model_fields)
    else:
        csv_properties.update(include=None)
    if 'missing_tokens' in model['object']['model']:
        missing_tokens = model['object']['model']['missing_tokens']
    else:
        missing_tokens = MISSING_TOKENS
    csv_properties.update(missing_tokens=missing_tokens)
    # if the model belongs to a multi-label set of models, the real objective
    # field is never amongst the set of fields of each individual model, so
    # we must add it.
    fields_dict = copy.deepcopy(model['object']['model']['fields'])

    if args.multi_label:
        # Adds the real objective field to fields_dict
        objective_field = multi_label_data['objective_name']
        objective_id = multi_label_data['objective_id']
        objective_column = multi_label_data['objective_column']
        fields_dict[objective_id] = {
            "op_type": "categorical",
            "name": objective_field,
            "column_number": objective_column
        }
    else:
        if local_ensemble is not None:
            fields_dict = copy.deepcopy(local_ensemble.fields)
        objective_field = model['object']['objective_fields']
        if isinstance(objective_field, list):
            objective_field = objective_field[0]
    csv_properties.update(objective_field=objective_field)

    fields = Fields(fields_dict, **csv_properties)
    return fields, objective_field
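
For the multi-label branch in the function above, a hypothetical before/after of fields_dict: the real objective field, which is absent from each single-label model, is injected so Fields can resolve it.

fields_dict = {
    "000000": {"optype": "numeric", "name": "age", "column_number": 0}
}
multi_label_data = {               # assumed shape, illustration only
    "objective_name": "class",
    "objective_id": "000005",
    "objective_column": 5
}
fields_dict[multi_label_data["objective_id"]] = {
    "op_type": "categorical",      # same key name as used above
    "name": multi_label_data["objective_name"],
    "column_number": multi_label_data["objective_column"]
}
assert "000005" in fields_dict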
Example 15
def get_logistic_fields(logistic_regression, csv_properties, args):
    """Retrieves fields info from logistic regression resource

    """
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = logistic_regression['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    return Fields(logistic_regression['object'][ \
        'logistic_regression']['fields'], \
        **csv_properties)
Example 16
def best_first_search(datasets_file,
                      api,
                      args,
                      common_options,
                      staleness=None,
                      penalty=None,
                      objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    with open(features_file, u.open_mode("w")) as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow(
            ["step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file, u.open_mode("r")) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError as exc:
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        try:
            stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
            with open(stored_dataset, u.open_mode("r")) as dataset_handler:
                dataset = json.loads(dataset_handler.read())
        except IOError:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        # initial feature set
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(args.args_separator))
        try:
            excluded_ids = [
                fields.field_id(feature) for feature in excluded_features
            ]
            objective_id = fields.field_id(objective_name)
        except ValueError as exc:
            sys.exit(exc)
Example 17
def get_deepnet_fields(deepnet, csv_properties, args):
    """Retrieves fields info from deepnet resource

    """
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = deepnet['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    csv_properties.update(objective_field=deepnet['object'].get( \
        'objective_field'))
    return Fields(deepnet['object'][ \
        'deepnet']['fields'], **csv_properties)
Example 18
def get_pca_fields(pca, csv_properties, args):
    """Retrieves fields info from PCA resource

    """
    args.retrieve_api_.ok(pca)
    if not csv_properties:
        csv_properties = {}
    csv_properties.update(verbose=True)
    if args.user_locale is None:
        args.user_locale = pca['object'].get('locale', None)
    csv_properties.update(data_locale=args.user_locale)
    csv_properties.update(missing_tokens=DEFAULT_MISSING_TOKENS)
    if args.exclude_objective:
        csv_properties.update({"objective_field_present": False})
        csv_properties.update({"objective_field": None})
    return Fields(pca['object']['pca']['fields'], \
        **csv_properties)
Example 19
def create_kfold_datasets_file(args, api, common_options, resume=False):
    """Create the kfold dataset resources and store their ids in a file
       one per line

    """
    message = ('Creating the kfold datasets............\n')
    u.log_message(message, log_file=session_file, console=args.verbosity)
    if args.output_dir is None:
        args.output_dir = a.NOW
    # retrieve dataset
    dataset_id = bigml.api.get_dataset_id(args.dataset)
    if dataset_id:
        dataset = api.check_resource(dataset_id)
        try:
            args.objective_field = int(args.objective_field)
        except (TypeError, ValueError):
            pass
        # if the user provided no objective field, try to use the one in the
        # dataset
        if args.objective_field is None:
            try:
                args.objective_field = dataset['object'][
                    'objective_field']['column_number']
            except KeyError:
                pass
        # check that kfold_field is unique
        fields = Fields(dataset, objective_field=args.objective_field,
                        objective_field_present=True)
        try:
            objective_id = fields.field_id(fields.objective_field)
            objective_name = fields.field_name(objective_id)
        except ValueError as exc:
            sys.exit(exc)
        kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields)
        # create jsons to generate partial datasets
        selecting_file_list, resume = create_kfold_json(args, kfold_field_name,
                                                        objective_id,
                                                        resume=resume)
        # generate test datasets
        datasets_file, resume = create_kfold_datasets(dataset_id, args,
                                                      selecting_file_list,
                                                      objective_name,
                                                      common_options,
                                                      resume=resume)
        return datasets_file, objective_name, resume
Example 20
def test_source_processing(api,
                           args,
                           resume,
                           name=None,
                           csv_properties=None,
                           session_file=None,
                           path=None,
                           log=None):
    """Creating or retrieving a test data source from input arguments

    """
    test_source = None
    fields = None
    if csv_properties is None:
        csv_properties = {}
    if args.test_set and args.remote:
        # If resuming, try to extract args.test_source from log files
        if resume:
            message = u.dated("Test source not found. Resuming.\n")
            resume, args.test_source = c.checkpoint(c.is_source_created,
                                                    path,
                                                    suffix="_test",
                                                    debug=args.debug,
                                                    message=message,
                                                    log_file=session_file,
                                                    console=args.verbosity)

        if not resume:
            source_args = r.set_source_args(args,
                                            name=name,
                                            data_set_header=args.test_header)
            test_source = r.create_source(args.test_set,
                                          source_args,
                                          args,
                                          api,
                                          path,
                                          session_file,
                                          log,
                                          source_type="test")

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.test_source:
        test_source = bigml.api.get_source_id(args.test_source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if test_source:
        test_source = r.get_source(test_source, api, args.verbosity,
                                   session_file)
        if 'source_parser' in test_source['object']:
            source_parser = test_source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
                if (args.user_locale is not None and bigml_locale(
                        args.user_locale) == source_parser['locale']):
                    args.user_locale = None

        fields = Fields(test_source['object']['fields'], **csv_properties)

        if (args.field_attributes_ or args.types_ or args.user_locale
                or args.json_args.get('source')):
            # avoid updating project_id in source
            project_id, args.project_id = args.project_id, None
            test_source_args = r.set_source_args(args, fields=fields)
            test_source = r.update_source(test_source, test_source_args,
                                          args, api, session_file)
            args.project_id = project_id
            fields = Fields(test_source['object']['fields'],
                            **csv_properties)

    return test_source, resume, csv_properties, fields
Example 21
def source_processing(api,
                      args,
                      resume,
                      csv_properties=None,
                      multi_label_data=None,
                      session_file=None,
                      path=None,
                      log=None):
    """Creating or retrieving a data source from input arguments

    """
    source = None
    fields = None
    if (args.training_set or
        (hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # If resuming, try to extract args.source from log files

        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(c.is_source_created,
                                               path,
                                               debug=args.debug,
                                               message=message,
                                               log_file=session_file,
                                               console=args.verbosity)

    # If no previous source, dataset or model is provided,
    # we create a new one. Also if --evaluate and test data are provided
    # we create a new dataset to test with.
    data_set, data_set_header = r.data_to_source(args)
    if data_set is not None:
        # Check if there's a created project for it
        args.project_id = pp.project_processing(api,
                                                args,
                                                resume,
                                                session_file=session_file,
                                                path=path,
                                                log=log)
        source_args = r.set_source_args(args,
                                        multi_label_data=multi_label_data,
                                        data_set_header=data_set_header)
        source = r.create_source(data_set, source_args, args, api, path,
                                 session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']
                # No changes if user locale is the one in the source.
                if (args.user_locale is not None and bigml_locale(
                        args.user_locale) == source_parser['locale']):
                    args.user_locale = None
        fields = Fields(source['object']['fields'], **csv_properties)

        if (args.field_attributes_ or args.types_ or args.user_locale
                or args.json_args.get('source')):
            # avoid updating project_id in source
            project_id, args.project_id = args.project_id, None
            source_args = r.set_source_args(args, fields=fields)
            source = r.update_source(source, source_args, args, api,
                                     session_file)
            args.project_id = project_id
            fields = Fields(source['object']['fields'], **csv_properties)

    return source, resume, csv_properties, fields
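
The source_parser branch above boils down to this mapping (the values below are illustrative only):

source_parser = {"missing_tokens": ["", "N/A"], "locale": "en-US"}
csv_properties = {}
if "missing_tokens" in source_parser:
    csv_properties["missing_tokens"] = source_parser["missing_tokens"]
if "locale" in source_parser:
    csv_properties["data_locale"] = source_parser["locale"]
assert csv_properties == {"missing_tokens": ["", "N/A"],
                          "data_locale": "en-US"}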
Example 22
def dataset_processing(source,
                       api,
                       args,
                       resume,
                       fields=None,
                       csv_properties=None,
                       multi_label_data=None,
                       session_file=None,
                       path=None,
                       log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or
        (hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(c.is_dataset_created,
                                                path,
                                                debug=args.debug,
                                                message=message,
                                                log_file=session_file,
                                                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset if the no_dataset option isn't set up. Also
    # if evaluate is set and test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
         and not args.no_dataset)
            or (hasattr(args, "evaluate") and args.evaluate and args.test_set
                and not args.dataset)):
        dataset_args = r.set_dataset_args(args,
                                          fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)

    # If set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for dataset_id in args.dataset_ids:
            if isinstance(dataset_id, dict) and "id" in dataset_id:
                dataset_id = dataset_id["id"]
            datasets.append(bigml.api.get_dataset_id(dataset_id))
        dataset = datasets[0]
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check the status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if ('object' in dataset and 'objective_field' in dataset['object']
                and 'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset['object']['objective_field'][
                'column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if
        #  the flag --dataset_attributes is used
        #  the --multi-label flag is used and there's an --objective-field
        #  the --max-categories flag is used and there's an --objective-field
        #  the --import-fields flag is used
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset,
                                       dataset_args,
                                       args,
                                       api=api,
                                       path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Example 23
def i_get_the_missing_values(step):
    resource = world.dataset
    fields = Fields(resource['fields'])
    world.step_result = fields.missing_counts()
Example 24
def create_fields(step, objective_column):
    world.fields = Fields(world.source,
                          objective_field=int(objective_column),
                          objective_field_present=True)
Example 25
    def reify_dataset(self, resource_id):
        """Extracts the REST API arguments from the dataset JSON structure

        """
        child = self.get_resource(resource_id)
        origin, parent_id = u.get_origin_info(child)
        parent = self.get_resource(parent_id)

        opts = {"create": {}, "update": {}, "get": {}}

        # as a two-step result from a cluster or batch prediction, centroid
        # or anomaly score
        grandparent = parent
        if origin in ['origin_batch_resource', 'cluster']:
            if origin == "cluster":
                opts['create'].update({"centroid": child['centroid']})
            grandparents = u.get_origin_info(parent)
            # batch resources have two parents, choose the dataset
            if origin == "origin_batch_resource" and \
                    isinstance(grandparents, list):
                for gp_origin, grandparent in grandparents:
                    if gp_origin == "dataset":
                        break
            else:
                _, grandparent = grandparents
            grandparent = self.get_resource(grandparent)

        # options common to all model types
        call = "update" if origin == "origin_batch_resource" else "create"
        u.common_dataset_opts(child, grandparent, opts, call=call)

        # update options
        dataset_defaults = DEFAULTS["dataset"].get("update", {})

        for attribute, default_value in dataset_defaults.items():
            opts["update"].update(
                u.default_setting(child, attribute, *default_value))
        # name, exclude automatic naming alternatives
        autonames = [u'']
        u.non_automatic_name(child, opts, autonames=autonames)

        # objective field
        resource_fields = Fields({
            'resource': child['resource'],
            'object': child
        })
        objective_id = child['objective_field']['id']
        preferred_fields = resource_fields.preferred_fields()
        # if there are no preferred fields, use the fields structure
        if not preferred_fields:
            preferred_fields = resource_fields.fields
        max_column = sorted(
            [field['column_number'] for _, field in preferred_fields.items()
             if field['optype'] != "text"],
            reverse=True)[0]
        objective_column = resource_fields.fields[objective_id][ \
            'column_number']
        if objective_column != max_column:
            opts['create'].update({"objective_field": {"id": objective_id}})

        if origin != "origin_batch_resource":
            # resize
            if (child['size'] != grandparent['size']
                    and get_resource_type(parent) == 'source'):
                opts['create'].update({"size": child['size']})

            # generated fields
            if child.get('new_fields', None):
                new_fields = child['new_fields']
                for new_field in new_fields:
                    new_field['field'] = new_field['generator']
                    del new_field['generator']

                opts['create'].update({"new_fields": new_fields})

            u.range_opts(child, grandparent, opts)

        # for batch_predictions, batch_clusters, batch_anomalies generated
        # datasets, attributes cannot be set at creation time, so we
        # must update the resource instead
        suffix = None
        if origin == "origin_batch_resource":
            opts["update"].update(opts["create"])
            opts["create"] = {}
            suffix = "['object']['output_dataset_resource']"
        calls = u.build_calls(resource_id, [parent_id], opts, suffix=suffix)
        self.add(resource_id, calls)
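
A plain-Python sketch (hypothetical columns) of the objective heuristic above, using max() in place of the sorted(...)[0] idiom: the objective_field creation option is only emitted when the objective is not the right-most preferred, non-text column, which the reify step treats as the default choice.

preferred_fields = {
    "000000": {"column_number": 0, "optype": "numeric"},
    "000001": {"column_number": 1, "optype": "text"},
    "000002": {"column_number": 2, "optype": "categorical"}
}
objective_column = 0   # assume the objective sits in the first column
max_column = max(field["column_number"]
                 for field in preferred_fields.values()
                 if field["optype"] != "text")
# only a non-default objective needs to be set explicitly
assert objective_column != max_column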
Example 26
def create_fields_from_dataset(step, objective_column):
    world.fields = Fields(world.dataset,
                          objective_field=int(objective_column),
                          objective_field_present=True)
Example 27
def create_kfold_datasets_file(args, api, command_obj, resume=False):
    """Create the kfold dataset resources and store their ids in a file
       one per line

    """
    message = ('Creating the kfold datasets............\n')
    u.log_message(message, log_file=session_file, console=args.verbosity)
    if args.output_dir is None:
        args.output_dir = a.NOW

    csv_properties = {}
    fields = None
    dataset = None
    datasets = []
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
        dataset_id = dataset['resource']
    elif args.dataset:
        dataset_id = bigml.api.get_dataset_id(args.dataset)
        datasets = [dataset_id]
    elif args.dataset_ids:
        datasets = args.dataset_ids
        dataset_id = datasets[0]

    if dataset_id:
        if not dataset:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        try:
            args.objective_field = int(args.objective_field)
        except (TypeError, ValueError):
            pass
        # if the user provided no objective field, try to use the one in the
        # dataset
        if args.objective_field is None:
            try:
                args.objective_field = dataset['object']['objective_field'][
                    'column_number']
            except KeyError:
                pass
        # check that kfold_field is unique
        fields = Fields(dataset,
                        objective_field=args.objective_field,
                        objective_field_present=True)
        if args.random_fields:
            default_candidates_limits(args, fields)
        try:
            objective_id = fields.field_id(fields.objective_field)
            objective_name = fields.field_name(objective_id)
        except ValueError as exc:
            sys.exit(exc)
        kfold_field_name = avoid_duplicates(DEFAULT_KFOLD_FIELD, fields)
        # create jsons to generate partial datasets
        selecting_file_list, resume = create_kfold_json(args,
                                                        kfold_field_name,
                                                        objective_id,
                                                        resume=resume)
        # generate test datasets
        datasets_file, resume = create_kfold_datasets(dataset_id,
                                                      args,
                                                      selecting_file_list,
                                                      command_obj,
                                                      resume=resume)
        return datasets_file, objective_name, resume
Example 28
if penalty is None:
    penalty = DEFAULT_PENALTY
# retrieving the first dataset in the file
try:
    with open(datasets_file, u.open_mode("r")) as datasets_handler:
        dataset_id = datasets_handler.readline().strip()
except IOError as exc:
    sys.exit("Could not read the generated datasets file: %s" % str(exc))
try:
    stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
    with open(stored_dataset, u.open_mode("r")) as dataset_handler:
        dataset = json.loads(dataset_handler.read())
except IOError:
    dataset = api.check_resource(dataset_id, query_string=ALL_FIELDS_QS)
# initial feature set
fields = Fields(dataset)
excluded_features = ([] if args.exclude_features is None else
                     args.exclude_features.split(args.args_separator))
try:
    excluded_ids = [
        fields.field_id(feature) for feature in excluded_features
    ]
    objective_id = fields.field_id(objective_name)
except ValueError as exc:
    sys.exit(exc)
field_ids = [
    field_id for field_id in fields.preferred_fields()
    if field_id != objective_id and field_id not in excluded_ids
]
field_ids.sort()
# headers are extended with a column per field
Example 29
def source_processing(training_set,
                      test_set,
                      training_set_header,
                      test_set_header,
                      name,
                      description,
                      api,
                      args,
                      resume,
                      csv_properties=None,
                      field_attributes=None,
                      types=None,
                      session_file=None,
                      path=None,
                      log=None):
    """Creating or retrieving a data source from input arguments

    """
    source = None
    fields = None
    if (training_set or (args.evaluate and test_set)):
        # If resuming, try to extract args.source from log files

        if resume:
            message = u.dated("Source not found. Resuming.\n")
            resume, args.source = c.checkpoint(c.is_source_created,
                                               path,
                                               debug=args.debug,
                                               message=message,
                                               log_file=session_file,
                                               console=args.verbosity)

    # If no previous source, dataset or model is provided,
    # we create a new one. Also if --evaluate and test data are provided
    # we create a new dataset to test with.
    data_set, data_set_header = r.data_to_source(training_set, test_set,
                                                 training_set_header,
                                                 test_set_header, args)
    if data_set is not None:
        source_args = r.set_source_args(data_set_header, name, description,
                                        args)
        source = r.create_source(data_set, source_args, args, api, path,
                                 session_file, log)

    # If a source is provided either through the command line or in resume
    # steps, we use it.
    elif args.source:
        source = bigml.api.get_source_id(args.source)

    # If we already have a source, we check that it is finished, extract the
    # fields, and update them if needed.
    if source:
        source = r.get_source(source, api, args.verbosity, session_file)
        if 'source_parser' in source['object']:
            source_parser = source['object']['source_parser']
            if 'missing_tokens' in source_parser:
                csv_properties['missing_tokens'] = (
                    source_parser['missing_tokens'])
            if 'locale' in source_parser:
                csv_properties['data_locale'] = source_parser['locale']

        fields = Fields(source['object']['fields'], **csv_properties)
        if field_attributes:
            source = r.update_source_fields(source, field_attributes, fields,
                                            api, args.verbosity, session_file)
        if types:
            source = r.update_source_fields(source, types, fields, api,
                                            args.verbosity, session_file)
    return source, resume, csv_properties, fields
Example 30
def compute_output(api, args):
    """ Creates a dataset using the `training_set`.

    """

    source = None
    dataset = None
    fields = None
    other_label = OTHER
    multi_label_data = None
    multi_label_fields = []
    datasets = None

    # variables from command-line options
    resume = args.resume_
    output = args.output

    check_args_coherence(args)
    path = u.check_dir(output)

    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # labels to be used in multi-label expansion
    labels = (None if args.labels is None else [
        label.strip() for label in args.labels.split(args.args_separator)
    ])
    if labels is not None:
        labels = sorted(labels)

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and args.training_set is not None:
        (args.training_set, multi_label_data) = ps.multi_label_expansion(
            args.training_set,
            args.train_header,
            args,
            path,
            labels=labels,
            session_file=session_file)
        args.train_header = True
        args.objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels
    if args.objective_field:
        csv_properties.update({'objective_field': args.objective_field})
    if args.source_file:
        # source is retrieved from the contents of the given local JSON file
        source, csv_properties, fields = u.read_local_resource(
            args.source_file, csv_properties=csv_properties)
    else:
        # source is retrieved from the remote object
        source, resume, csv_properties, fields = ps.source_processing(
            api,
            args,
            resume,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file,
            path=path,
            log=log)
    if source is not None:
        args.source = bigml.api.get_source_id(source)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (args.objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field, labels,
                                                  multi_label_data, fields,
                                                  multi_label_fields)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))
    if args.dataset_file:
        # dataset is retrieved from the contents of the given local JSON file
        model_dataset, csv_properties, fields = u.read_local_resource(
            args.dataset_file, csv_properties=csv_properties)
        if not args.datasets:
            datasets = [model_dataset]
            dataset = model_dataset
        else:
            datasets = u.read_datasets(args.datasets)
    if not datasets:
        # dataset is retrieved from the remote object
        datasets, resume, csv_properties, fields = pd.dataset_processing(
            source,
            api,
            args,
            resume,
            fields=fields,
            csv_properties=csv_properties,
            multi_label_data=multi_label_data,
            session_file=session_file,
            path=path,
            log=log)

    if datasets:
        dataset = datasets[-1]
        if args.to_csv is not None:
            resume = pd.export_dataset(dataset,
                                       api,
                                       args,
                                       resume,
                                       session_file=session_file,
                                       path=path)

        # Now we have a dataset, let's check if there's an objective_field
        # given by the user and update it in the fields structure
        args.objective_id_ = get_objective_id(args, fields)

    # If test_split is used, split the dataset into a training and a test
    # dataset according to the given split
    if args.test_split > 0:
        dataset, _, resume = pd.split_processing(
            dataset,
            api,
            args,
            resume,
            multi_label_data=multi_label_data,
            session_file=session_file,
            path=path,
            log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        if pd.check_max_categories(fields.fields[args.objective_id_]):
            distribution = pd.get_categories_distribution(
                dataset, args.objective_id_)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset,
                    distribution,
                    fields,
                    args,
                    api,
                    resume,
                    session_file=session_file,
                    path=path,
                    log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is neither categorical "
                     "nor a full terms only text field. "
                     "Only these fields can be used with "
                     "--max-categories")

    # If any of the transformations is applied,
    # generate a new dataset from the given list of datasets
    if args.new_dataset:
        dataset, resume = pd.create_new_dataset(datasets,
                                                api,
                                                args,
                                                resume,
                                                fields=fields,
                                                session_file=session_file,
                                                path=path,
                                                log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure. Also
    # if the --to-dataset flag is used to clone or sample the original dataset
    if args.new_fields or args.sample_rate != 1 or \
            (args.lisp_filter or args.json_filter) and not a.has_source(args):
        if fields is None:
            if isinstance(dataset, basestring):
                dataset = u.check_resource(dataset, api=api)
            fields = Fields(dataset, **csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
        args.objective_name_ = fields.field_name(args.objective_id_)
        dataset, resume = pd.create_new_dataset(dataset,
                                                api,
                                                args,
                                                resume,
                                                fields=fields,
                                                session_file=session_file,
                                                path=path,
                                                log=log)
        datasets[0] = dataset
        # rebuild fields structure for new ids and fields
        csv_properties.update({
            'objective_field': args.objective_name_,
            'objective_field_present': True
        })
        fields = pd.get_fields_structure(dataset, csv_properties)
        args.objective_id_ = get_objective_id(args, fields)
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (args.objective_field, labels, all_labels,
         multi_label_fields) = l.multi_label_sync(args.objective_field, labels,
                                                  multi_label_data, fields,
                                                  multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label', other_label)
    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path,
                            log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)