def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Create data files
    make_single_file_featureset_data()

    # train and save a model on the training file
    train_fs = NDJReader.for_path(join(_my_dir, 'train',
                                       'train_single_file.jsonlines')).read()
    learner = Learner('RandomForestClassifier')
    learner.train(train_fs, grid_search=True, grid_objective="accuracy")
    model_filename = join(_my_dir, 'output',
                          ('train_test_single_file_train_train_'
                           'single_file.jsonlines_test_test_single'
                           '_file_subset.jsonlines_RandomForestClassifier'
                           '.model'))
    learner.save(model_filename)

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_single_file_saved_subset"
                                                            ".template.cfg"),
                                                       join(_my_dir, 'train',
                                                            'train_single_file.jsonlines'),
                                                       join(_my_dir, 'test',
                                                            'test_single_file_subset.'
                                                            'jsonlines'))
    run_configuration(config_path, quiet=True, overwrite=False)

    # Check results
    with open(join(_my_dir, 'output', ('train_test_single_file_train_train_'
                                       'single_file.jsonlines_test_test_single'
                                       '_file_subset.jsonlines_RandomForestClassifier'
                                       '.results.json'))) as f:
        result_dict = json.load(f)[0]

    assert_almost_equal(result_dict['score'], 0.7333333)
def test_folds_file_logging_num_folds():
    """
    Test that, when `folds_file` is used, the log prints the number of folds
    instead of the entire cv_folds data, and that the folds-file warning
    shows up in the log file.
    """
    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_folds_file"
                                                            ".template.cfg"),
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check experiment log output
    with open(join(_my_dir, 'output', 'test_folds_file_logging.log')) as f:
        cv_file_pattern = re.compile('Specifying "folds_file" overrides both '
                                     'explicit and default "num_cv_folds".')
        matches = re.findall(cv_file_pattern, f.read())
        assert_equal(len(matches), 1)

    # Check job log output
    with open(join(_my_dir, 'output',
                   'test_folds_file_logging_train_f0.'
                   'jsonlines_LogisticRegression.log')) as f:
        cv_folds_pattern = re.compile(r"(Task: cross_validate\n)(.+)(Cross-validating \([0-9]+ folds\))")
        matches = re.findall(cv_folds_pattern, f.read())
        assert_equal(len(matches), 1)
def test_train_file_test_file_ablation():
    """
    Test that specifying ablation with train and test file is ignored
    """
    # Create data files
    make_single_file_featureset_data()

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_single_file"
                                                            ".template.cfg"),
                                                       join(_my_dir, 'train',
                                                            'train_single_file'
                                                            '.jsonlines'),
                                                       join(_my_dir, 'test',
                                                            'test_single_file.'
                                                            'jsonlines'))
    run_configuration(config_path, quiet=True, ablation=None)

    # Check the experiment log output for the message that ablation was ignored
    with open(join(_my_dir, 'output', 'train_test_single_file.log')) as f:
        cv_file_pattern = re.compile('Not enough featuresets for ablation. Ignoring.')
        matches = re.findall(cv_file_pattern, f.read())
        eq_(len(matches), 1)
def test_train_file_test_file():
    """
    Test that train_file and test_file experiments work
    """
    # Create data files
    make_single_file_featureset_data()

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_single_file"
                                                            ".template.cfg"),
                                                       join(_my_dir, 'train',
                                                            'train_single_file'
                                                            '.jsonlines'),
                                                       join(_my_dir, 'test',
                                                            'test_single_file.'
                                                            'jsonlines'))
    run_configuration(config_path, quiet=True)

    # Check results
    with open(join(_my_dir, 'output', ('train_test_single_file_train_train_'
                                       'single_file.jsonlines_test_test_single'
                                       '_file.jsonlines_RandomForestClassifier'
                                       '.results.json'))) as f:
        result_dict = json.load(f)[0]

    assert_almost_equal(result_dict['score'], 0.925)
@raises(ValueError)
def test_test_file_and_test_directory():
    """
    Test that test_file + test_directory = ValueError
    """
    # Parse the config; this should raise a ValueError
    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_single_file"
                                                            ".template.cfg"),
                                                       join(_my_dir, 'train',
                                                            'train_single_file'
                                                            '.jsonlines'),
                                                       join(_my_dir, 'test',
                                                            'test_single_file.'
                                                            'jsonlines'),
                                                       test_directory='foo')
    _parse_config_file(config_path)
def test_cross_validate_task():
    """
    Test that 10-fold cross_validate experiments work.
    Test that fold ids get correctly saved.
    """
    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_save_cv_folds"
                                                            ".template.cfg"),
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check final average results
    with open(join(_my_dir, 'output', 'test_save_cv_folds_train_f0.'
                                      'jsonlines_LogisticRegression.results.json')) as f:
        result_dict = json.load(f)[10]

    assert_almost_equal(result_dict['score'], 0.517)

    # Check that the fold ids were saved correctly
    expected_skll_ids = {}
    examples = _load_featureset(train_path, '', suffix, quiet=True)
    kfold = StratifiedKFold(n_splits=10)
    for fold_num, (_, test_indices) in enumerate(kfold.split(examples.features,
                                                             examples.labels)):
        for index in test_indices:
            expected_skll_ids[examples.ids[index]] = fold_num

    skll_fold_ids = {}
    with open(join(_my_dir, 'output', 'test_save_cv_folds_skll_fold_ids.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            skll_fold_ids[row['id']] = row['cv_test_fold']

    # convert the dictionaries to strings (sorted by key) for quick comparison
    skll_fold_ids_str = ''.join('{}{}'.format(key, val)
                                for key, val in sorted(skll_fold_ids.items()))
    expected_skll_ids_str = ''.join('{}{}'.format(key, val)
                                    for key, val in sorted(expected_skll_ids.items()))
    assert_equal(skll_fold_ids_str, expected_skll_ids_str)
def test_folds_file_logging_grid_search():
    """
    Test that, when `folds_file` is used but is not to be used for the inner
    grid search, we get an appropriate message in the log.
    """
    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_folds_file_grid"
                                                            ".template.cfg"),
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check experiment log output
    with open(join(_my_dir, 'output', 'test_folds_file_logging.log')) as f:
        cv_file_pattern = re.compile('Specifying "folds_file" overrides both '
                                     'explicit and default "num_cv_folds".\n'
                                     '(.+)The specified "folds_file" will not '
                                     'be used for inner grid search.')
        matches = re.findall(cv_file_pattern, f.read())
        assert_equal(len(matches), 1)
def test_cv_folds_file_logging():
    """
    Test that, when `cv_folds_file` is used, the log prints the number of
    folds instead of the entire cv_folds data.
    """
    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_cv_folds_file"
                                                            ".template.cfg"),
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check log output
    with open(join(_my_dir, 'output', 'test_cv_folds_file_logging_train_f0.'
                                      'jsonlines_LogisticRegression.log')) as f:
        cv_folds_pattern = re.compile(r"Task: cross_validate\nCross-validating \([0-9]+ folds\)")
        matches = re.findall(cv_folds_pattern, f.read())
        assert_equal(len(matches), 1)