コード例 #1
0
def test_main_workflow(tmp_config_dir, tmp_output_dir):
    """Run extract, then each paired select config, and check the outputs.

    Each entry of the module-level ``test_config_tuples`` is indexed as
    ``(extract_config_filename, select_config_filenames)``. Every config is
    passed through ``rewrite_config`` together with replacement entries for
    the output directory (and, for select configs, the feature file) before
    it is handed to ``hvc.extract`` / ``hvc.select``.

    Parameters
    ----------
    tmp_config_dir
        Directory handed to ``rewrite_config`` as its first argument.
    tmp_output_dir
        Directory searched for the ``*extract*`` and ``*select*`` output
        directories after each run.
    """
    output_root = str(tmp_output_dir)

    for test_config_tuple in test_config_tuples:

        extract_config_filename = test_config_tuple[0]
        # have to put tmp_output_dir into yaml file
        tmp_config_path = rewrite_config(
            tmp_config_dir,
            replace_dict={
                'output_dir': ('replace with tmp_output_dir', output_root)
            },
            config_filename=extract_config_filename)
        hvc.extract(tmp_config_path)
        extract_outputs = list(
            filter(os.path.isdir,
                   glob.glob(os.path.join(output_root, '*extract*'))))
        extract_outputs.sort(key=os.path.getmtime)
        extract_output_dir = extract_outputs[-1]  # newest dir, after sort
        assert check_extract_output(extract_output_dir)

        feature_file = glob.glob(os.path.join(extract_output_dir, 'summary*'))
        feature_file = feature_file[0]  # because glob returns list

        select_config_filenames = test_config_tuple[1]

        # Iterate last-to-first, the order the previous pop()-based loop used,
        # but without emptying the shared list and without a try/except that
        # would also hide an IndexError raised when no select output exists.
        for select_config_filename in reversed(select_config_filenames):
            tmp_config_path = rewrite_config(
                tmp_config_dir,
                replace_dict={
                    'feature_file':
                    ('replace with feature_file', feature_file),
                    'output_dir':
                    ('replace with tmp_output_dir', output_root)
                },
                config_filename=select_config_filename)
            hvc.select(tmp_config_path)
            select_outputs = list(
                filter(os.path.isdir,
                       glob.glob(os.path.join(output_root, '*select*'))))
            select_outputs.sort(key=os.path.getmtime)
            select_output_dir = select_outputs[-1]  # newest dir, after sort
            assert check_select_output(tmp_config_path, select_output_dir)
コード例 #2
0
    def test_select_knn_ftr_grp(self, tmp_output_dir, test_data_dir):
        """Call hvc.select on 'knn.features' with feature_group='knn'."""
        knn_features = os.path.join(test_data_dir, 'feature_files',
                                    'knn.features')
        select_kwargs = dict(
            feature_file_path=knn_features,
            feature_group='knn',
            train_samples_range=range(100, 401, 100),
            num_replicates=5,
            num_test_samples=400,
            model_name='knn',
            hyperparameters={'k': 4},
            output_dir=str(tmp_output_dir),
        )
        hvc.select(**select_kwargs)
コード例 #3
0
    def test_select_knn_ftr_list_indices(self, tmp_output_dir, test_data_dir):
        """Call hvc.select for a knn model with explicit feature_list_indices."""
        pattern = os.path.join(test_data_dir, 'feature_files',
                               'test_extract_knn*')
        # glob returns a list; the first match is used as the feature file
        matches = glob(pattern)
        knn_features = matches[0]

        select_kwargs = dict(
            feature_file_path=knn_features,
            feature_list_indices=[0, 1, 2, 3, 4, 5, 6, 7, 8],
            train_samples_range=range(100, 401, 100),
            num_replicates=5,
            num_test_samples=400,
            model_name='knn',
            hyperparameters={'k': 4},
            output_dir=str(tmp_output_dir),
        )
        hvc.select(**select_kwargs)
コード例 #4
0
    def test_select_flatwindow_ftr_grp(self, tmp_output_dir, test_data_dir):
        """Call hvc.select on 'flatwindow.features' for the flatwindow model."""
        flatwindow_features = os.path.join(test_data_dir, 'feature_files',
                                           'flatwindow.features')
        net_hyperparameters = {'batch_size': 32, 'epochs': 3}
        select_kwargs = dict(
            feature_file_path=flatwindow_features,
            model_name='flatwindow',
            hyperparameters=net_hyperparameters,
            neuralnet_input='flatwindow',
            train_samples_range=range(200, 401, 200),
            num_replicates=3,
            num_test_samples=400,
            output_dir=str(tmp_output_dir),
        )
        hvc.select(**select_kwargs)
コード例 #5
0
    def test_select_svm_ftr_grp(self, tmp_output_dir, test_data_dir):
        """Call hvc.select for an svm model with feature_group='svm'."""
        pattern = os.path.join(test_data_dir, 'feature_files',
                               'test_extract_svm*')
        # glob returns a list; the first match is used as the feature file
        matches = glob(pattern)
        svm_features = matches[0]

        svm_hyperparameters = {'C': 1, 'gamma': 0.01}
        select_kwargs = dict(
            feature_file_path=svm_features,
            feature_group='svm',
            train_samples_range=range(100, 401, 100),
            num_replicates=5,
            num_test_samples=400,
            model_name='svm',
            hyperparameters=svm_hyperparameters,
            output_dir=str(tmp_output_dir),
        )
        hvc.select(**select_kwargs)
コード例 #6
0
    def _yaml_config_asserts(self, select_yaml_config_path, tmp_output_dir,
                             feature_file):
        """Helper with the assertions shared by all tests that run
        hvc.select from a config.yml file.

        Runs hvc.select on a rewritten copy of the config, asserts that
        exactly one new summary file appeared under ``tmp_output_dir``, and
        asserts that the directory holding that summary contains one
        sub-directory per model listed in the parsed config.

        Returns True when every assertion passed.
        """
        output_root = str(tmp_output_dir)
        summary_pattern = os.path.join(output_root, 'select_output*',
                                       'summary_model_select_file*')

        rewritten_config = rewrite_config(
            select_yaml_config_path,
            tmp_output_dir,
            replace_dict={
                'feature_file': ('replace with feature_file', feature_file),
                'output_dir': ('replace with tmp_output_dir', output_root)
            })

        # snapshot the summary files before the run so that only the
        # files created by this call are considered afterwards
        summaries_before = glob(summary_pattern)
        hvc.select(rewritten_config)
        summaries_after = glob(summary_pattern)

        new_summaries = []
        for summary_path in summaries_after:
            if summary_path not in summaries_before:
                new_summaries.append(summary_path)
        # should only be one summary output file
        assert len(new_summaries) == 1

        # now check, for every model in the config, that there is a
        # correspondingly named folder next to the summary file
        parsed_config = hvc.parse_config(rewritten_config, 'select')
        output_dir = os.path.dirname(new_summaries[0])
        # first item yielded by os.walk is (dirpath, dirnames, filenames)
        _, model_dirs, _ = next(os.walk(output_dir))
        expected_folder_names = [
            determine_model_output_folder_name(model_dict)
            for model_dict in parsed_config['models']
        ]
        for expected_name in expected_folder_names:
            assert expected_name in model_dirs

        return True
コード例 #7
0
def _newest_dir(parent_dir, pattern):
    """Return the directory in ``parent_dir`` matching ``pattern`` that has
    the latest modification time.

    Fails with a descriptive assertion when no directory matches.
    """
    matching_dirs = [
        path for path in glob(os.path.join(str(parent_dir), pattern))
        if os.path.isdir(path)
    ]
    assert matching_dirs, (
        'no directory matching {!r} found in {}'.format(pattern, parent_dir))
    # sort ascending by mtime so that [-1] is the newest directory
    matching_dirs.sort(key=os.path.getmtime)
    return matching_dirs[-1]


def run_main_workflow(tmp_output_dir, script_tuple_dict, configs_path):
    """Run the extract -> select -> predict workflow of
    hybrid-vocal-classifier for one set of config files and check outputs.

    Parameters
    ----------
    tmp_output_dir
        Directory passed to ``rewrite_config`` and searched for the
        ``*extract*``, ``*select*`` and ``*predict*`` output directories.
    script_tuple_dict : dict
        ``script_tuple_dict['extract']`` is the extract config filename;
        ``script_tuple_dict['select and predict']`` is an iterable of
        ``(select_config_filename, predict_config_filename)`` pairs.
    configs_path
        Directory that the config filenames are joined onto.

    The rewritten config files are deleted again after each step.
    """
    output_root = str(tmp_output_dir)

    extract_config_filename = os.path.join(configs_path,
                                           script_tuple_dict['extract'])
    replace_dict = {
        'output_dir': ('replace with tmp_output_dir', output_root)
    }
    # have to put tmp_output_dir into yaml file
    extract_config_rewritten = rewrite_config(extract_config_filename,
                                              tmp_output_dir, replace_dict)
    hvc.extract(extract_config_rewritten)
    extract_output_dir = _newest_dir(tmp_output_dir, '*extract*')
    assert check_extract_output(extract_output_dir)

    feature_files_created = glob(
        os.path.join(extract_output_dir, 'features_created*'))
    assert feature_files_created, (
        'no features_created* file found in {}'.format(extract_output_dir))
    feature_file = feature_files_created[0]  # because glob returns list

    os.remove(extract_config_rewritten)

    select_and_predict_tuples = script_tuple_dict['select and predict']
    for (select_config_filename,
         predict_config_filename) in select_and_predict_tuples:
        select_config_filename = os.path.join(configs_path,
                                              select_config_filename)

        select_config_rewritten = rewrite_config(
            select_config_filename,
            tmp_output_dir,
            replace_dict={
                'feature_file': ('replace with feature_file', feature_file),
                'output_dir': ('replace with tmp_output_dir', output_root)
            })
        hvc.select(select_config_rewritten)
        select_output_dir = _newest_dir(tmp_output_dir, '*select*')
        assert check_select_output(select_config_rewritten, select_output_dir)
        os.remove(select_config_rewritten)

        model_meta_files = glob(os.path.join(select_output_dir, '*', '*meta*'))
        assert model_meta_files, (
            'no model meta file found under {}'.format(select_output_dir))
        replace_dict = {
            'model_meta_file':
            ('replace with model_file', model_meta_files[-1]),
            'output_dir': ('replace with tmp_output_dir', output_root)
        }
        predict_config_filename_with_path = os.path.join(
            configs_path, predict_config_filename)

        predict_config_rewritten = rewrite_config(
            predict_config_filename_with_path, tmp_output_dir, replace_dict)
        hvc.predict(predict_config_rewritten)
        os.remove(predict_config_rewritten)

        predict_output_dir = _newest_dir(tmp_output_dir, '*predict*')
        feature_files = glob(os.path.join(predict_output_dir, 'feature*'))
        for ftr_filename in feature_files:
            ftr_file = joblib.load(ftr_filename)
            assert 'pred_labels' in ftr_file
            # NOTE(review): this keys off the *extract* config filename even
            # though it guards prediction probabilities -- confirm that the
            # extract filename is the intended one to inspect.
            if 'predict_proba_True' in extract_config_filename:
                assert 'pred_probs' in ftr_file
                assert ftr_file['pred_labels'].shape[0] == ftr_file[
                    'pred_probs'].shape[0]
コード例 #8
0
 def test_select_multiple_ftr_grp(self):
     """Run hvc.select with the config whose model features are given as a
     list of multiple feature groups."""
     config_filename = 'test_select_multiple_ftr_grp.config.yml'
     hvc.select(os.path.join(configs, config_filename))
コード例 #9
0
 def test_select_knn_ftr_grp(self):
     """Run hvc.select with a knn config; meant to cover features specified
     by a feature group."""
     # NOTE(review): the test name says "ftr_grp" but the config file loaded
     # is the "ftr_list_inds" one -- confirm which config is intended.
     config_filename = 'test_select_knn_ftr_list_inds.config.yml'
     hvc.select(os.path.join(configs, config_filename))
コード例 #10
0
from glob import glob
import hvc

# 0. Create training data.
# In this case, we download already labeled data from an open repository.
# The string in quotes matches the name of one of the folders in the repository.
hvc.utils.fetch('gy6or6.032612')

# 1. Pick a model and 2. extract features for that model.
# Model and features are defined in the extract.config.yml file.
hvc.extract('gy6or6_autolabel_example.knn.extract.config.yml')

# 3. Pick hyperparameters for the model.
# Load the summary feature file to use with helper functions for
# finding the best hyperparameters.
# glob returns a list of matching paths, so select a single file from it
# instead of handing the whole list to the loader.
summary_files = glob('./extract_output*/summary*')
if not summary_files:
    raise FileNotFoundError(
        "no summary file found matching './extract_output*/summary*'")
summary_file = summary_files[0]
summary_data = hvc.load_feature_file(summary_file)
# In this case, we picked a k-nearest neighbors model
# and we want to find what value of k will give us the highest accuracy.
cv_scores, best_k = hvc.utils.find_best_k(summary_data['features'],
                                          summary_data['labels'],
                                          k_range=range(1, 11))

# 4. Fit the **model** to the data and 5. select the **best** model.
hvc.select('gy6or6_autolabel.example.select.knn.config.yml')

# 6. **Predict** labels for unlabeled data using the fit model.
hvc.predict('gy6or6_autolabel.example.predict.knn.config.yml')