Beispiel #1
0
def main():
    """Generate feature files for tests from every ``test_extract*`` config.

    For each extract config found relative to this script, rewrite the
    config so its ``output_dir`` points at ``here``, run ``hvc.extract``,
    then move the resulting feature file into ``here`` (named after the
    config and the feature file) and clean up the temporary output
    directory and the rewritten config.

    Raises
    ------
    ValueError
        If extraction produces anything other than exactly one output
        directory or exactly one ``features_created*`` file.
    """
    extract_configs = glob(os.path.join(here, '../config.yml/test_extract*'))
    for extract_config in extract_configs:
        print('running {} to create feature file'.format(extract_config))
        replace_dict = {'output_dir': ('replace with tmp_output_dir', here)}
        # have to put tmp_output_dir into yaml file
        extract_config_rewritten = rewrite_config(extract_config, here,
                                                  replace_dict)
        hvc.extract(extract_config_rewritten)
        extract_output_dir = glob(os.path.join(here, '*extract*output*'))
        if len(extract_output_dir) != 1:
            # fixed typo in message: 'ouput' -> 'output'
            raise ValueError(
                'incorrect number of outputs when looking for extract '
                'output dirs:\n{}'.format(extract_output_dir))
        else:
            extract_output_dir = extract_output_dir[0]

        features_created = glob(
            os.path.join(extract_output_dir, 'features_created*'))
        if len(features_created) != 1:
            raise ValueError(
                'incorrect number of outputs when looking for extract '
                'feature files:\n{}'.format(features_created))
        else:
            features_created = features_created[0]
        # name the moved file <config basename>.<feature file basename>
        prefix = os.path.basename(extract_config_rewritten)
        suffix = os.path.basename(features_created)
        movename = prefix + '.' + suffix
        shutil.move(src=features_created, dst=os.path.join(here, movename))
        # rmdir succeeds only if the dir is now empty (feature file moved out)
        os.rmdir(extract_output_dir)
        os.remove(extract_config_rewritten)
    def tests_for_all_extract(self):
        """Run ``hvc.extract`` on every ``test_extract_*.config.yml`` file
        and assert on the feature files each run produces.

        For each config: run extract, re-parse the config, then for every
        entry in its ``todo_list`` chdir into the newest ``*extract_output*``
        directory and load all ``features_from*`` files, checking that
        feature matrices are consistent with their labels and with each
        other.
        """
        # NOTE(review): `configs` and `homedir` are module-level names not
        # visible in this chunk -- presumably the test-config root dir and
        # the original working dir; confirm against the enclosing module.
        search_path = os.path.join(
            configs,
            os.path.normpath('test_data/config.yml/'
                             'test_extract_*.config.yml'))
        extract_config_files = glob.glob(search_path)
        for extract_config_file in extract_config_files:
            # later iterations chdir away; always start back at homedir
            if os.getcwd() != homedir:
                os.chdir(homedir)
            hvc.extract(extract_config_file)
            extract_config = hvc.parse_config(extract_config_file, 'extract')

            for todo in extract_config['todo_list']:
                # switch to test dir
                os.chdir(todo['output_dir'])
                extract_outputs = list(
                    filter(os.path.isdir, glob.glob('*extract_output*')))
                extract_outputs.sort(key=os.path.getmtime)

                os.chdir(extract_outputs[-1])  # most recent
                ftr_files = glob.glob('features_from*')
                ftr_dicts = []
                for ftr_file in ftr_files:
                    ftr_dicts.append(joblib.load(ftr_file))

                # either every feature file has a 'features' key or none do
                if any(['features' in ftr_dict for ftr_dict in ftr_dicts]):
                    assert all(
                        ['features' in ftr_dict for ftr_dict in ftr_dicts])
                    for ftr_dict in ftr_dicts:
                        labels = ftr_dict['labels']
                        if 'features' in ftr_dict:
                            features = ftr_dict['features']
                            # one feature-matrix row per label
                            assert features.shape[0] == len(labels)

                    # make sure number of features i.e. columns is constant across feature matrices
                    ftr_cols = [
                        ftr_dict['features'].shape[1] for ftr_dict in ftr_dicts
                    ]
                    assert np.unique(ftr_cols).shape[-1] == 1

                # same all-or-none invariant for neural-net input dicts
                if any([
                        'neuralnets_input_dict' in ftr_dict
                        for ftr_dict in ftr_dicts
                ]):
                    assert all([
                        'neuralnets_input_dict' in ftr_dict
                        for ftr_dict in ftr_dicts
                    ])

                # make sure rows in summary dict features == sum of rows of each ftr file features
                summary_file = glob.glob('summary_feature_file_*')
                # (should only be one summary file)
                assert len(summary_file) == 1
                summary_dict = joblib.load(summary_file[0])
def test_main_workflow(tmp_config_dir, tmp_output_dir):
    """Run the extract -> select workflow for each tuple in
    ``test_config_tuples`` and assert each stage's output checks out.

    Parameters
    ----------
    tmp_config_dir
        directory containing the config files to rewrite
    tmp_output_dir
        temporary directory that rewritten configs write output into
    """

    for test_config_tuple in test_config_tuples:

        extract_config_filename = test_config_tuple[0]
        # have to put tmp_output_dir into yaml file
        tmp_config_path = rewrite_config(
            tmp_config_dir,
            replace_dict={
                'output_dir':
                ('replace with tmp_output_dir', str(tmp_output_dir))
            },
            config_filename=extract_config_filename)
        hvc.extract(tmp_config_path)
        extract_outputs = list(
            filter(os.path.isdir,
                   glob.glob(os.path.join(str(tmp_output_dir), '*extract*'))))
        extract_outputs.sort(key=os.path.getmtime)
        extract_output_dir = extract_outputs[-1]  # newest dir, after sort
        assert check_extract_output(extract_output_dir)

        feature_file = glob.glob(os.path.join(extract_output_dir, 'summary*'))
        feature_file = feature_file[0]  # because glob returns list

        select_config_filenames = test_config_tuple[1]

        # Iterate a reversed view instead of pop()-ing in a `while True` loop:
        # same order as before (pop took items from the end), but the shared
        # list inside test_config_tuples is no longer mutated, so the test
        # data survives being run more than once.
        for select_config_filename in reversed(select_config_filenames):
            tmp_config_path = rewrite_config(
                tmp_config_dir,
                replace_dict={
                    'feature_file':
                    ('replace with feature_file', feature_file),
                    'output_dir':
                    ('replace with tmp_output_dir', str(tmp_output_dir))
                },
                config_filename=select_config_filename)
            hvc.select(tmp_config_path)
            select_outputs = list(
                filter(
                    os.path.isdir,
                    glob.glob(os.path.join(str(tmp_output_dir),
                                           '*select*'))))
            select_outputs.sort(key=os.path.getmtime)
            select_output_dir = select_outputs[-1]  # newest dir, after sort
            assert check_select_output(tmp_config_path, select_output_dir)
Beispiel #4
0
    def test_annotation_file_cbins(self, test_data_dir, tmp_output_dir):
        """Test that calling extract doesn't fail when we pass an
        annotation file (csv built from .not.mat files) for cbin audio.

        Builds a csv annotation file from the .not.mat files in two cbin
        directories, runs ``hvc.extract`` with it, and checks the returned
        features dict.
        """
        cbin_dirs = [
            'cbins/gy6or6/032312',
            'cbins/gy6or6/032412']
        cbin_dirs = [
            os.path.join(test_data_dir,
                         os.path.normpath(cbin_dir))
            for cbin_dir in cbin_dirs
        ]

        notmat_list = []
        for cbin_dir in cbin_dirs:
            notmat_list.extend(
                glob(os.path.join(cbin_dir, '*.not.mat'))
            )
        # below, sorted() so it's the same order on different platforms
        notmat_list = sorted(notmat_list)
        csv_filename = os.path.join(str(tmp_output_dir),
                                    'test.csv')
        annotation.notmat_list_to_csv(notmat_list, csv_filename)

        file_format = 'cbin'
        labels_to_use = 'iabcdefghjk'
        feature_group = 'knn'
        return_features = True
        ftrs = hvc.extract(file_format=file_format,
                           annotation_file=csv_filename,
                           labels_to_use=labels_to_use,
                           feature_group=feature_group,
                           output_dir=str(tmp_output_dir),
                           return_features=return_features)
        # isinstance is the idiomatic type check (was: type(ftrs) == dict)
        assert isinstance(ftrs, dict)
        assert sorted(ftrs.keys()) == ['features', 'labels']
Beispiel #5
0
    def _yaml_config_asserts(self,
                             extract_yaml_config_file,
                             tmp_output_dir):
        """Helper called by tests below: run extract from a rewritten
        yaml config and assert on the feature files it produces.

        Rewrites ``extract_yaml_config_file`` so its output_dir is
        ``tmp_output_dir``, runs ``hvc.extract``, then inspects the newest
        ``*extract_output*`` directory for each todo-list entry.
        """
        # have to put tmp_output_dir into yaml file
        rewritten_config = rewrite_config(
            extract_yaml_config_file,
            tmp_output_dir,
            {'output_dir': ('replace with tmp_output_dir',
                            str(tmp_output_dir))})

        hvc.extract(rewritten_config)
        parsed_config = hvc.parse_config(rewritten_config, 'extract')

        for job in parsed_config['todo_list']:
            os.chdir(job['output_dir'])
            output_dirs = [name for name in glob('*extract_output*')
                           if os.path.isdir(name)]
            output_dirs.sort(key=os.path.getmtime)

            os.chdir(output_dirs[-1])  # last after mtime sort == most recent
            feature_dicts = [joblib.load(fname)
                             for fname in glob('features_from*')]

            if any('features' in fd for fd in feature_dicts):
                # all-or-none: every file must have 'features' if any does
                assert all('features' in fd for fd in feature_dicts)
                for fd in feature_dicts:
                    labels = fd['labels']
                    if 'features' in fd:
                        # one feature-matrix row per label
                        assert fd['features'].shape[0] == len(labels)

                # number of feature columns must be constant across matrices
                column_counts = [fd['features'].shape[1]
                                 for fd in feature_dicts]
                assert np.unique(column_counts).shape[-1] == 1

            if any('neuralnets_input_dict' in fd for fd in feature_dicts):
                assert all('neuralnets_input_dict' in fd
                           for fd in feature_dicts)
Beispiel #6
0
def main():
    """Create the feature files used by the test suite.

    For each named feature set, rewrite the matching
    ``test_extract_*.config.yml`` so output goes to ``here``, run
    ``hvc.extract``, move the resulting feature file to
    ``<feature name>.features`` in ``here``, then clean up the temporary
    output dir and rewritten config.

    Raises
    ------
    ValueError
        If extraction produces anything other than exactly one output
        directory or exactly one ``features_created*`` file.
    """
    feature_files_to_create = [
        'knn',
        'svm',
        'multiple_feature_groups',
        'flatwindow',
    ]
    for feature_to_create in feature_files_to_create:
        extract_config = os.path.join(here,
                                      '..',
                                      'config.yml',
                                      'test_extract_{}.config.yml'
                                      .format(feature_to_create)
                                      )
        print('running {} to create feature file'.format(extract_config))
        replace_dict = {'output_dir':
                            ('replace with tmp_output_dir',
                             here)}
        # have to put tmp_output_dir into yaml file
        extract_config_rewritten = rewrite_config(extract_config,
                                                  here,
                                                  replace_dict)
        hvc.extract(extract_config_rewritten)
        extract_output_dir = glob(os.path.join(here,
                                               '*extract*output*'))
        if len(extract_output_dir) != 1:
            # fixed typo in message: 'ouput' -> 'output'
            raise ValueError('incorrect number of outputs when looking for extract '
                             'output dirs:\n{}'.format(extract_output_dir))
        else:
            extract_output_dir = extract_output_dir[0]

        features_created = glob(os.path.join(extract_output_dir,
                                             'features_created*'))
        if len(features_created) != 1:
            raise ValueError('incorrect number of outputs when looking for extract '
                             'feature files:\n{}'.format(features_created))
        else:
            features_created = features_created[0]
        movename = feature_to_create + '.' + 'features'
        shutil.move(src=features_created,
                    dst=os.path.join(here, movename))
        # rmdir succeeds only if the dir is now empty (feature file moved out)
        os.rmdir(extract_output_dir)
        os.remove(extract_config_rewritten)
Beispiel #7
0
    def test_data_dirs_cbins(self, test_data_dir, tmp_output_dir):
        """Test that calling extract doesn't fail when we pass a
        data_dirs list that contains cbin audio files, and that the
        returned features dict has the expected keys.
        """
        data_dirs = [
            'cbins/gy6or6/032312',
            'cbins/gy6or6/032412']
        data_dirs = [
            os.path.join(test_data_dir,
                         os.path.normpath(data_dir))
            for data_dir in data_dirs
        ]

        file_format = 'cbin'
        labels_to_use = 'iabcdefghjk'
        feature_group = 'knn'
        return_features = True
        ftrs = hvc.extract(data_dirs=data_dirs,
                           file_format=file_format,
                           labels_to_use=labels_to_use,
                           feature_group=feature_group,
                           output_dir=str(tmp_output_dir),
                           return_features=return_features)
        # isinstance is the idiomatic type check (was: type(ftrs) == dict)
        assert isinstance(ftrs, dict)
        assert sorted(ftrs.keys()) == ['features', 'labels']
Beispiel #8
0
def run_main_workflow(tmp_output_dir, script_tuple_dict, configs_path):
    """tests main workflow for hybrid-vocal-classifier
    by iterating through test_main_workflow_dict,
    running the scripts named in each tuple in the dict

    Parameters
    ----------
    tmp_output_dir
        temporary directory that rewritten configs write output into
    script_tuple_dict : dict
        with keys 'extract' (an extract config filename) and
        'select and predict' (list of (select, predict) config filename
        tuples)
    configs_path
        directory containing the config files named in script_tuple_dict
    """

    extract_config_filename = os.path.join(configs_path,
                                           script_tuple_dict['extract'])
    replace_dict = {
        'output_dir': ('replace with tmp_output_dir', str(tmp_output_dir))
    }
    # have to put tmp_output_dir into yaml file
    extract_config_rewritten = rewrite_config(extract_config_filename,
                                              tmp_output_dir, replace_dict)
    hvc.extract(extract_config_rewritten)
    extract_outputs = list(
        filter(os.path.isdir,
               glob(os.path.join(str(tmp_output_dir), '*extract*'))))
    extract_outputs.sort(key=os.path.getmtime)
    extract_output_dir = extract_outputs[-1]  # [-1] is newest dir, after sort
    assert check_extract_output(extract_output_dir)

    feature_file = glob(os.path.join(extract_output_dir, 'features_created*'))
    feature_file = feature_file[0]  # because glob returns list

    os.remove(extract_config_rewritten)

    select_and_predict_tuples = script_tuple_dict['select and predict']
    for select_and_predict_tuple in select_and_predict_tuples:
        (select_config_filename,
         predict_config_filename) = select_and_predict_tuple
        select_config_filename = os.path.join(configs_path,
                                              select_config_filename)

        select_config_rewritten = rewrite_config(
            select_config_filename,
            tmp_output_dir,
            replace_dict={
                'feature_file': ('replace with feature_file', feature_file),
                'output_dir':
                ('replace with tmp_output_dir', str(tmp_output_dir))
            })
        hvc.select(select_config_rewritten)
        select_outputs = list(
            filter(os.path.isdir,
                   glob(os.path.join(str(tmp_output_dir), '*select*'))))
        select_outputs.sort(key=os.path.getmtime)
        select_output_dir = select_outputs[-1]  # [-1] is newest dir, after sort
        assert check_select_output(select_config_rewritten, select_output_dir)
        os.remove(select_config_rewritten)

        # (removed a duplicated re-sort of select_outputs here --
        # select_output_dir above is already the newest select dir)
        model_meta_files = glob(os.path.join(select_output_dir, '*', '*meta*'))
        replace_dict = {
            'model_meta_file':
            ('replace with model_file', model_meta_files[-1]),
            'output_dir': ('replace with tmp_output_dir', str(tmp_output_dir))
        }
        predict_config_filename_with_path = os.path.join(
            configs_path, predict_config_filename)

        predict_config_rewritten = rewrite_config(
            predict_config_filename_with_path, tmp_output_dir, replace_dict)
        hvc.predict(predict_config_rewritten)
        os.remove(predict_config_rewritten)
        predict_outputs = list(
            filter(os.path.isdir,
                   glob(os.path.join(str(tmp_output_dir), '*predict*'))))
        predict_outputs.sort(key=os.path.getmtime)
        predict_output_dir = predict_outputs[-1]  # newest dir, after sort
        feature_files = glob(os.path.join(predict_output_dir, 'feature*'))
        for ftr_filename in feature_files:
            ftr_file = joblib.load(ftr_filename)
            assert 'pred_labels' in ftr_file
            # configs with predict_proba=True must also save probabilities,
            # one row of probabilities per predicted label
            if 'predict_proba_True' in extract_config_filename:
                assert 'pred_probs' in ftr_file
                assert ftr_file['pred_labels'].shape[0] == ftr_file[
                    'pred_probs'].shape[0]
Beispiel #9
0
from glob import glob
import hvc

# 0. create training data
# In this case, we download already labeled data from an open repository.
# String in quotes matches with the name of one of the folders in the repository.
hvc.utils.fetch('gy6or6.032612')

# 1. pick a model and 2. extract features for that model
# Model and features are defined in extract.config.yml file.
hvc.extract('gy6or6_autolabel_example.knn.extract.config.yml')

# 3. pick hyperparameters for model
# Load summary feature file to use with helper functions for
# finding best hyperparameters.
summary_file = glob('./extract_output*/summary*')
# glob returns a list; take the first (only) match before loading,
# instead of passing the list itself to load_feature_file
summary_file = summary_file[0]
summary_data = hvc.load_feature_file(summary_file)
# In this case, we picked a k-nearest neighbors model
# and we want to find what value of k will give us the highest accuracy
cv_scores, best_k = hvc.utils.find_best_k(summary_data['features'],
                                          summary_data['labels'],
                                          k_range=range(1, 11))

# 4. Fit the **model** to the data and 5. Select the **best** model
hvc.select('gy6or6_autolabel.example.select.knn.config.yml')

# 6. **Predict** labels for unlabeled data using the fit model.
hvc.predict('gy6or6_autolabel.example.predict.knn.config.yml')