Code example #1
File: datasets.py Project: javs0188/bigmler
def dataset_processing(source,
                       api,
                       args,
                       resume,
                       fields=None,
                       csv_properties=None,
                       multi_label_data=None,
                       session_file=None,
                       path=None,
                       log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or
        (hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from the log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(c.is_dataset_created,
                                                path,
                                                debug=args.debug,
                                                message=message,
                                                log_file=session_file,
                                                console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset unless the no_dataset option is set. We also
    # create one when evaluate is set and a test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
         and not args.no_dataset)
            or (hasattr(args, "evaluate") and args.evaluate and args.test_set
                and not args.dataset)):
        dataset_args = r.set_dataset_args(args,
                                          fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api, path,
                                   session_file, log)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            dataset_id = args.dataset_ids[i]
            if isinstance(dataset_id, dict) and "id" in dataset_id:
                dataset_id = dataset_id["id"]
            datasets.append(bigml.api.get_dataset_id(dataset_id))
        dataset = datasets[0]
    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If we already have a dataset, we check its status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if ('object' in dataset and 'objective_field' in dataset['object']
                and 'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset['object']['objective_field'][
                'column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if
        #  the flag --dataset_attributes is used
        #  the --multi-label flag is used and there's an --objective-field
        #  the --max-categories flag is used and there's an --objective-field
        #  the --import-fields flag is used
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset,
                                       dataset_args,
                                       args,
                                       api=api,
                                       path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Code example #2
File: datasets.py Project: narayana1208/bigmler
def dataset_processing(
    source,
    training_set,
    test_set,
    fields,
    objective_field,
    api,
    args,
    resume,
    name=None,
    description=None,
    dataset_fields=None,
    multi_label_data=None,
    csv_properties=None,
    session_file=None,
    path=None,
    log=None,
):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if training_set or args.source or (args.evaluate and test_set):
        # if resuming, try to extract args.dataset from the log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created,
                path,
                debug=args.debug,
                message=message,
                log_file=session_file,
                console=args.verbosity,
            )

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset unless the no_dataset option is set. We also
    # create one when evaluate is set and a test_set has been provided.
    if (source and not has_datasets(args) and not has_models(args) and not args.no_dataset) or (
        args.evaluate and args.test_set and not args.dataset
    ):
        dataset_args = r.set_dataset_args(
            name,
            description,
            args,
            fields,
            dataset_fields,
            objective_field=objective_field,
            multi_label_data=multi_label_data,
        )
        dataset = r.create_dataset(source, dataset_args, args, api, path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check its status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if (
            "object" in dataset
            and "objective_field" in dataset["object"]
            and "column_number" in dataset["object"]["objective_field"]
        ):
            dataset_objective = dataset["object"]["objective_field"]["column_number"]
            csv_properties.update(objective_field=dataset_objective, objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        new_objective = get_new_objective(fields, args.objective_field, dataset)

        if (
            new_objective is not None
            or args.dataset_attributes
            or (args.shared_flag and r.shared_changed(args.shared, dataset))
        ):
            dataset_args = r.set_dataset_args(name, description, args, fields, dataset_fields, objective_field)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args, api=api, path=path, session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            csv_properties.update(objective_field=objective_field, objective_field_present=True)
            fields = Fields(dataset["object"]["fields"], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Code example #3
File: datasets.py Project: mani4malar/bigmler
def dataset_processing(source, api, args, resume,
                       fields=None,
                       csv_properties=None,
                       multi_label_data=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (args.training_set or args.source or (
            hasattr(args, "evaluate") and args.evaluate and args.test_set)):
        # if resuming, try to extract args.dataset from the log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset unless the no_dataset option is set. We also
    # create one when evaluate is set and a test_set has been provided.
    if ((source and not args.has_datasets_ and not args.has_models_
         and not args.no_dataset) or
            (hasattr(args, "evaluate") and args.evaluate and
             args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(args, fields,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api,
                                   path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check its status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if ('object' in dataset and 'objective_field' in dataset['object'] and
                'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        if hasattr(args, 'objective_field'):
            new_objective = get_new_objective(fields, args.objective_field)
        else:
            new_objective = None
        updated = False
        # We'll update the dataset if
        #  the flag --dataset_attributes is used
        #  the --multi-label flag is used and there's an --objective-field
        #  the --max-categories flag is used and there's an --objective-field
        if check_dataset_update(args, dataset):
            dataset_args = r.set_dataset_args(args, fields)
            if args.shared_flag and r.shared_changed(args.shared, dataset):
                dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            updated = True
        if new_objective is not None:
            csv_properties.update(objective_field=args.objective_field,
                                  objective_field_present=True)
            updated = True
        if updated:
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
Code example #4
def dataset_processing(source, training_set, test_set, fields, objective_field,
                       api, args, resume, name=None, description=None,
                       dataset_fields=None, multi_label_data=None,
                       csv_properties=None,
                       session_file=None, path=None, log=None):
    """Creating or retrieving dataset from input arguments

    """
    datasets = []
    dataset = None
    if (training_set or args.source or (args.evaluate and test_set)):
        # if resuming, try to extract args.dataset from the log files
        if resume:
            message = u.dated("Dataset not found. Resuming.\n")
            resume, args.dataset = c.checkpoint(
                c.is_dataset_created, path, debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)

    # If we have a source but no dataset or model has been provided, we
    # create a new dataset unless the no_dataset option is set. We also
    # create one when evaluate is set and a test_set has been provided.
    if ((source and not has_datasets(args) and not has_models(args)
         and not args.no_dataset) or
            (args.evaluate and args.test_set and not args.dataset)):
        dataset_args = r.set_dataset_args(name, description, args, fields,
                                          dataset_fields,
                                          objective_field=objective_field,
                                          multi_label_data=multi_label_data)
        dataset = r.create_dataset(source, dataset_args, args, api,
                                   path, session_file, log)

    # If a dataset is provided, let's retrieve it.
    elif args.dataset:
        dataset = bigml.api.get_dataset_id(args.dataset)

    # If a set of datasets is provided, let's check their ids.
    elif args.dataset_ids:
        for i in range(0, len(args.dataset_ids)):
            datasets.append(bigml.api.get_dataset_id(args.dataset_ids[i]))
        dataset = datasets[0]

    # If we already have a dataset, we check its status and get the fields if
    # we don't have them yet.
    if dataset:
        dataset = r.get_dataset(dataset, api, args.verbosity, session_file)

        if ('object' in dataset and 'objective_field' in dataset['object'] and
            'column_number' in dataset['object']['objective_field']):
            dataset_objective = dataset[
                'object']['objective_field']['column_number']
            csv_properties.update(objective_field=dataset_objective,
                                  objective_field_present=True)

        fields = get_fields_structure(dataset, csv_properties)

        if args.public_dataset:
            r.publish_dataset(dataset, args, api, session_file)

        new_objective = get_new_objective(fields, args.objective_field,
                                          dataset)

        if (new_objective is not None or args.dataset_attributes or
            r.shared_changed(args.shared, dataset)):
            dataset_args = r.set_dataset_args(name, description, args, fields,
                                              dataset_fields, objective_field)
            dataset_args.update(shared=args.shared)
            dataset = r.update_dataset(dataset, dataset_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
            dataset = r.get_dataset(dataset, api, args.verbosity, session_file)
            csv_properties.update(objective_field=objective_field,
                                  objective_field_present=True)
            fields = Fields(dataset['object']['fields'], **csv_properties)
        if not datasets:
            datasets = [dataset]
        else:
            datasets[0] = dataset
    return datasets, resume, csv_properties, fields
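
Usage note (editorial): all four variants wrap the same underlying BigML workflow: create a source, create a dataset from it, wait for it to finish, then read its fields. The sketch below shows that bare flow with the official bigml Python bindings, not bigmler's own helpers (u, c, r); it is a minimal illustration, assuming BIGML_USERNAME / BIGML_API_KEY are set in the environment and that "data/iris.csv" (a placeholder path) exists locally.

# Editorial sketch, not part of the bigmler source: the raw BigML API calls
# that the dataset_processing variants above orchestrate.
from bigml.api import BigML
from bigml.fields import Fields

api = BigML()  # credentials read from BIGML_USERNAME / BIGML_API_KEY

# 1. Upload the training data as a source and wait until it is ready.
source = api.create_source("data/iris.csv")  # placeholder path
api.ok(source)

# 2. Create a dataset from the source (what r.create_dataset wraps above).
dataset = api.create_dataset(source, {"name": "iris dataset"})
api.ok(dataset)

# 3. Retrieve the finished dataset and build a Fields object from it, as the
#    snippets above do with Fields(dataset['object']['fields'], **csv_properties).
dataset = api.get_dataset(dataset)
fields = Fields(dataset["object"]["fields"])
print(dataset["resource"], dataset["object"]["name"])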