def kfold_cross_validate_cli(
        k_fold,
        config=None,
        config_file=None,
        dataset=None,
        data_format=None,
        output_directory='results',
        random_seed=default_random_seed,
        skip_save_k_fold_split_indices=False,
        **kwargs
):
    """Wrapper function to perform k-fold cross validation.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param config: (dict, default: None) a dictionary containing
           information needed to build a model. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-config) for details.
    :param config_file: (string, optional, default: `None`) path to
           a YAML file containing the config.  If available it will be
           used instead of the config dict.
    :param dataset: (string, default: None) source of the dataset to be
           used for the k-fold runs.
    :param data_format: (string, default: None) format of the dataset;
           forwarded to kfold_cross_validate — presumably auto-detected
           when None (TODO confirm against kfold_cross_validate).
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """
    # exactly one of `config` / `config_file` must be provided
    if config is None and config_file is None:
        raise ValueError(
            "No config is provided 'config' or "
            "'config_file' must be provided."
        )
    elif config is not None and config_file is not None:
        raise ValueError(
            "Cannot specify both 'config' and 'config_file'"
            ", provide only one of the parameters."
        )

    # kfold_cross_validate accepts either the dict or the file path through
    # its single `config` parameter, so collapse the two inputs here
    (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(
        k_fold,
        config=config if config is not None else config_file,
        dataset=dataset,
        data_format=data_format,
        output_directory=output_directory,
        random_seed=random_seed
    )

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, 'kfold_training_statistics.json'),
              kfold_cv_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, 'kfold_split_indices.json'),
                  kfold_split_indices)
def test_kfold_cv_api_from_file():
    """Exercise kfold_cross_validate() with the config supplied as a YAML file path."""
    # k-fold_cross_validate api with config_file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [
            category_feature(vocab_size=2, reduce_input='sum')
        ]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        with open(config_fp, 'w') as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        # pass num_folds (not a duplicated literal) so the assertions below
        # always agree with the number of folds actually run
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            num_folds,
            config=config_fp,
            dataset=training_data_fp
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_kfold_cv_api_from_file():
    """Exercise kfold_cross_validate() with the config supplied as a YAML file path."""
    # k-fold_cross_validate api with config file
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")
        config_fp = os.path.join(tmpdir, "config.yaml")

        # generate synthetic data for the test
        input_features = [
            number_feature(normalization="zscore"),
            number_feature(normalization="zscore")
        ]
        output_features = [category_feature(vocab_size=3, reduce_input="sum")]

        generate_data(input_features, output_features, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

        with open(config_fp, "w") as f:
            yaml.dump(config, f)

        # test kfold_cross_validate api with config file

        # execute k-fold cross validation run
        # pass num_folds (not a duplicated literal) so the assertions below
        # always agree with the number of folds actually run
        (kfold_cv_stats,
         kfold_split_indices) = kfold_cross_validate(num_folds,
                                                     config=config_fp,
                                                     dataset=training_data_fp)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_kfold_cv_dataset_formats(data_format):
    """Exercise kfold_cross_validate() with an in-memory model definition across dataset formats."""
    # k-fold_cross_validate api with in-memory model definition
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, 'train.csv')

        # generate synthetic data for the test
        input_features = [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ]

        output_features = [
            numerical_feature()
        ]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate model definition file
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        # test kfold_cross_validate api with model definition in-memory

        # execute k-fold cross validation run
        # pass num_folds (not a duplicated literal) so the assertions below
        # always agree with the number of folds actually run
        (
            kfold_cv_stats,
            kfold_split_indices
        ) = kfold_cross_validate(
            num_folds,
            model_definition=model_definition,
            dataset=dataset_to_use
        )

        # correct structure for results from kfold cv
        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)] + ['overall']:
            assert key in kfold_cv_stats

        for key in ['fold_' + str(i + 1)
                    for i in range(num_folds)]:
            assert key in kfold_split_indices
def test_kfold_cv_dataset_formats(data_format):
    """Exercise kfold_cross_validate() with an in-memory config across dataset formats."""
    # k-fold_cross_validate api with in-memory config
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:
        # setup required data structures for test
        training_data_fp = os.path.join(tmpdir, "train.csv")

        # generate synthetic data for the test
        input_features = [
            number_feature(normalization="zscore"),
            number_feature(normalization="zscore")
        ]
        output_features = [number_feature()]

        generate_data(input_features, output_features, training_data_fp)
        dataset_to_use = create_data_set_to_use(data_format, training_data_fp)

        # generate config file
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "output_size": 14
            },
            TRAINER: {
                "epochs": 2
            },
        }

        # test kfold_cross_validate api with config in-memory

        # execute k-fold cross validation run
        # pass num_folds (not a duplicated literal) so the assertions below
        # always agree with the number of folds actually run
        (kfold_cv_stats,
         kfold_split_indices) = kfold_cross_validate(num_folds,
                                                     config=config,
                                                     dataset=dataset_to_use)

        # correct structure for results from kfold cv
        for key in ["fold_" + str(i + 1) for i in range(num_folds)] + ["overall"]:
            assert key in kfold_cv_stats

        for key in ["fold_" + str(i + 1) for i in range(num_folds)]:
            assert key in kfold_split_indices
def kfold_cross_validate_cli(k_fold,
                             model_definition=None,
                             model_definition_file=None,
                             data_csv=None,
                             output_directory='results',
                             random_seed=default_random_seed,
                             skip_save_k_fold_split_indices=False,
                             **kwargs):
    """Run k-fold cross validation and persist its results as JSON files.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param model_definition: (dict, default: None) a dictionary containing
           information needed to build a model. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-definition) for details.
    :param model_definition_file: (string, optional, default: `None`) path to
           a YAML file containing the model definition.  If available it will
           be used instead of the model_definition dict.
    :param data_csv: (string, default: None)
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """
    # resolve the dict vs. file variants into a single model definition
    resolved_definition = check_which_model_definition(
        model_definition,
        model_definition_file
    )

    cv_stats, split_indices = kfold_cross_validate(
        k_fold,
        model_definition=resolved_definition,
        data_csv=data_csv,
        output_directory=output_directory,
        random_seed=random_seed)

    # persist the k-fold cv statistics
    stats_fp = os.path.join(output_directory,
                            'kfold_training_statistics.json')
    save_json(stats_fp, cv_stats)

    # persist the k-fold split indices unless explicitly disabled
    if not skip_save_k_fold_split_indices:
        indices_fp = os.path.join(output_directory,
                                  'kfold_split_indices.json')
        save_json(indices_fp, split_indices)
def kfold_cross_validate_cli(
    k_fold,
    config=None,
    dataset=None,
    data_format=None,
    output_directory="results",
    random_seed=default_random_seed,
    skip_save_k_fold_split_indices=False,
    **kwargs,
):
    """Wrapper function to perform k-fold cross validation.

    Runs kfold_cross_validate() and writes its statistics (and, optionally,
    the split indices) as JSON files under `output_directory`.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param config: (Union[str, dict], default: None) a dictionary or file path
           containing model configuration. Refer to the [User Guide]
           (http://ludwig.ai/user_guide/#model-config) for details.
    :param dataset: (string, default: None)
    :param data_format: (string, default: None) format of the dataset;
           forwarded to kfold_cross_validate — presumably auto-detected
           when None (TODO confirm against kfold_cross_validate).
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """
    (kfold_cv_stats, kfold_split_indices) = kfold_cross_validate(
        k_fold,
        config=config,
        dataset=dataset,
        data_format=data_format,
        output_directory=output_directory,
        random_seed=random_seed,
    )

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, "kfold_training_statistics.json"), kfold_cv_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, "kfold_split_indices.json"), kfold_split_indices)