Ejemplo n.º 1
0
def kfold_cross_validate_cli(
        k_fold,
        config=None,
        config_file=None,
        dataset=None,
        data_format=None,
        output_directory='results',
        random_seed=default_random_seed,
        skip_save_k_fold_split_indices=False,
        **kwargs
):
    """Wrapper function to perform k-fold cross validation.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param config: (dict, default: None) a dictionary containing
            information needed to build a model. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-config) for details.
    :param config_file: (string, optional, default: `None`) path to
           a YAML file containing the config. If available it will be
           used instead of the config dict.
    :param dataset: (string, default: None) source of the dataset used
           for the cross-validation run.
    :param data_format: (string, default: None) format of the dataset
           passed through to `kfold_cross_validate`.
    :param output_directory: (string, default: 'results') directory where
           the statistics and split-index JSON files are written.
    :param random_seed: (int) Random seed used for the k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """

    # exactly one of config / config_file must be supplied
    if config is None and config_file is None:
        raise ValueError(
            "No config is provided: either 'config' or "
            "'config_file' must be provided."
        )
    elif config is not None and config_file is not None:
        raise ValueError(
            "Cannot specify both 'config' and 'config_file'"
            ", provide only one of the parameters."
        )

    # prefer the in-memory config dict; fall back to the config file path
    (kfold_cv_stats,
     kfold_split_indices) = kfold_cross_validate(
        k_fold,
        config=config if config is not None else config_file,
        dataset=dataset,
        data_format=data_format,
        output_directory=output_directory,
        random_seed=random_seed
    )

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, 'kfold_training_statistics.json'),
              kfold_cv_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, 'kfold_split_indices.json'),
                  kfold_split_indices)
Ejemplo n.º 2
0
def test_kfold_cv_api_from_file():
    """Exercise the kfold_cross_validate API with a config stored in a YAML file.

    Generates a small synthetic dataset and config in a temp directory, runs
    the cross-validation, and checks the structure of the returned stats and
    split indices.
    """
    # k-fold_cross_validate api with config_file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [
            category_feature(vocab_size=2, reduce_input='sum')
        ]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        with open(config_fp, 'w') as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        # use num_folds (not a hard-coded literal) so the run stays in sync
        # with the structural assertions below
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            num_folds,
            config=config_fp,
            dataset=training_data_fp
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Ejemplo n.º 3
0
def test_kfold_cv_api_from_file():
    """Exercise the kfold_cross_validate API with a config stored in a YAML file.

    Generates a small synthetic dataset and config in a temp directory, runs
    the cross-validation, and checks the structure of the returned stats and
    split indices.
    """
    # k-fold_cross_validate api with config file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")

        # generate synthetic data for the test
        input_features = [
            number_feature(normalization="zscore"),
            number_feature(normalization="zscore")
        ]

        output_features = [category_feature(vocab_size=3, reduce_input="sum")]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

        with open(config_fp, "w") as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        # use num_folds (not a hard-coded literal) so the run stays in sync
        # with the structural assertions below
        (kfold_cv_stats,
         kfold_split_indices) = kfold_cross_validate(num_folds,
                                                     config=config_fp,
                                                     dataset=training_data_fp)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1)
                    for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Ejemplo n.º 4
0
def test_kfold_cv_dataset_formats(data_format):
    """Exercise kfold_cross_validate with an in-memory model definition.

    Generates a small synthetic dataset in a temp directory, converts it to
    the requested `data_format`, runs the cross-validation, and checks the
    structure of the returned stats and split indices.
    """
    # k-fold_cross_validate api with in-memory model definition
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [
            numerical_feature()
        ]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate model definition file
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        # test kfold_cross_validate api with model definition in-memory

        # execute k-fold cross validation run
        # use num_folds (not a hard-coded literal) so the run stays in sync
        # with the structural assertions below
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            num_folds,
            model_definition=model_definition,
            dataset=dataset_to_use
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Ejemplo n.º 5
0
def test_kfold_cv_dataset_formats(data_format):
    """Exercise kfold_cross_validate with an in-memory config.

    Generates a small synthetic dataset in a temp directory, converts it to
    the requested `data_format`, runs the cross-validation, and checks the
    structure of the returned stats and split indices.
    """
    # k-fold_cross_validate api with in-memory config
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")

        # generate synthetic data for the test
        input_features = [
            number_feature(normalization="zscore"),
            number_feature(normalization="zscore")
        ]

        output_features = [number_feature()]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

        # test kfold_cross_validate api with config in-memory

        # execute k-fold cross validation run
        # use num_folds (not a hard-coded literal) so the run stays in sync
        # with the structural assertions below
        (kfold_cv_stats,
         kfold_split_indices) = kfold_cross_validate(num_folds,
                                                     config=config,
                                                     dataset=dataset_to_use)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1)
                    for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
Ejemplo n.º 6
0
def kfold_cross_validate_cli(k_fold,
                             model_definition=None,
                             model_definition_file=None,
                             data_csv=None,
                             output_directory='results',
                             random_seed=default_random_seed,
                             skip_save_k_fold_split_indices=False,
                             **kwargs):
    """CLI wrapper around `kfold_cross_validate`.

    Resolves the model definition (in-memory dict vs. YAML file), runs the
    k-fold cross-validation, and writes the resulting statistics — and,
    unless disabled, the split indices — as JSON files into
    `output_directory`.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param model_definition: (dict, default: None) a dictionary containing
            information needed to build a model. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-definition) for details.
    :param model_definition_file: (string, optional, default: `None`) path to
           a YAML file containing the model definition. If available it will be
           used instead of the model_definition dict.
    :param data_csv: (string, default: None)
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """
    # pick between the in-memory dict and the YAML file
    resolved_definition = check_which_model_definition(model_definition,
                                                       model_definition_file)

    cv_stats, split_indices = kfold_cross_validate(
        k_fold,
        model_definition=resolved_definition,
        data_csv=data_csv,
        output_directory=output_directory,
        random_seed=random_seed)

    # persist the per-fold / overall training statistics
    stats_path = os.path.join(output_directory,
                              'kfold_training_statistics.json')
    save_json(stats_path, cv_stats)

    if skip_save_k_fold_split_indices:
        return

    # persist the row indices that defined each fold
    indices_path = os.path.join(output_directory, 'kfold_split_indices.json')
    save_json(indices_path, split_indices)
Ejemplo n.º 7
0
def kfold_cross_validate_cli(
    k_fold,
    config=None,
    dataset=None,
    data_format=None,
    output_directory="results",
    random_seed=default_random_seed,
    skip_save_k_fold_split_indices=False,
    **kwargs,
):
    """CLI wrapper around `kfold_cross_validate`.

    Runs the k-fold cross-validation and writes the resulting statistics —
    and, unless disabled, the split indices — as JSON files into
    `output_directory`.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param config: (Union[str, dict], default: None) a dictionary or file path
            containing model configuration. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-config) for details.
    :param dataset: (string, default: None)
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """
    cv_stats, split_indices = kfold_cross_validate(
        k_fold,
        config=config,
        dataset=dataset,
        data_format=data_format,
        output_directory=output_directory,
        random_seed=random_seed,
    )

    # persist the per-fold / overall training statistics
    stats_path = os.path.join(output_directory,
                              "kfold_training_statistics.json")
    save_json(stats_path, cv_stats)

    if skip_save_k_fold_split_indices:
        return

    # persist the row indices that defined each fold
    indices_path = os.path.join(output_directory, "kfold_split_indices.json")
    save_json(indices_path, split_indices)