def main():
    """Create feature files used by tests.

    Runs every ``test_extract*`` config in ``../config.yml`` through
    ``hvc.extract``, then moves the resulting feature file into this
    directory (renamed after its config) and removes the temporary
    extract output directory and the rewritten config file.
    """
    extract_configs = glob(os.path.join(here, '../config.yml/test_extract*'))
    for extract_config in extract_configs:
        print('running {} to create feature file'.format(extract_config))
        # have to put tmp_output_dir into yaml file
        replace_dict = {'output_dir': ('replace with tmp_output_dir', here)}
        extract_config_rewritten = rewrite_config(extract_config, here,
                                                  replace_dict)
        hvc.extract(extract_config_rewritten)

        # extract should have created exactly one output directory
        extract_output_dir = glob(os.path.join(here, '*extract*output*'))
        if len(extract_output_dir) != 1:
            # fixed typo in message: 'ouput' -> 'output'
            raise ValueError(
                'incorrect number of outputs when looking for extract '
                'output dirs:\n{}'.format(extract_output_dir))
        extract_output_dir = extract_output_dir[0]

        # ... and exactly one feature file inside it
        features_created = glob(
            os.path.join(extract_output_dir, 'features_created*'))
        if len(features_created) != 1:
            raise ValueError(
                'incorrect number of outputs when looking for extract '
                'feature files:\n{}'.format(features_created))
        features_created = features_created[0]

        # name the moved file '<config basename>.<feature file basename>'
        prefix = os.path.basename(extract_config_rewritten)
        suffix = os.path.basename(features_created)
        movename = prefix + '.' + suffix
        shutil.move(src=features_created, dst=os.path.join(here, movename))
        os.rmdir(extract_output_dir)
        os.remove(extract_config_rewritten)
def tests_for_all_extract(self):
    """Run hvc.extract on every test_extract_*.config.yml and validate
    the structure of the feature files each run writes out.

    NOTE(review): the final comment promises a check that summary-dict
    feature rows equal the sum of per-file feature rows, but only the
    load of the summary file is visible here -- confirm the comparison
    exists in the full file.
    """
    search_path = os.path.join(
        configs, os.path.normpath('test_data/config.yml/'
                                  'test_extract_*.config.yml'))
    extract_config_files = glob.glob(search_path)
    for extract_config_file in extract_config_files:
        # extract writes output relative to cwd, so always start from homedir
        if os.getcwd() != homedir:
            os.chdir(homedir)
        hvc.extract(extract_config_file)
        extract_config = hvc.parse_config(extract_config_file, 'extract')
        for todo in extract_config['todo_list']:
            # switch to test dir
            os.chdir(todo['output_dir'])
            extract_outputs = list(
                filter(os.path.isdir, glob.glob('*extract_output*')))
            extract_outputs.sort(key=os.path.getmtime)
            os.chdir(extract_outputs[-1])  # most recent
            ftr_files = glob.glob('features_from*')
            ftr_dicts = []
            for ftr_file in ftr_files:
                ftr_dicts.append(joblib.load(ftr_file))
            # if any feature file has a 'features' key, all of them must
            if any(['features' in ftr_dict for ftr_dict in ftr_dicts]):
                assert all(
                    ['features' in ftr_dict for ftr_dict in ftr_dicts])
                for ftr_dict in ftr_dicts:
                    labels = ftr_dict['labels']
                    if 'features' in ftr_dict:
                        features = ftr_dict['features']
                        # one feature-matrix row per label
                        assert features.shape[0] == len(labels)
                # make sure number of features i.e. columns is constant across feature matrices
                ftr_cols = [
                    ftr_dict['features'].shape[1] for ftr_dict in ftr_dicts
                ]
                assert np.unique(ftr_cols).shape[-1] == 1
            # same all-or-none rule for neural-net input dicts
            if any([
                    'neuralnets_input_dict' in ftr_dict
                    for ftr_dict in ftr_dicts
            ]):
                assert all([
                    'neuralnets_input_dict' in ftr_dict
                    for ftr_dict in ftr_dicts
                ])
            # make sure rows in summary dict features == sum of rows of each ftr file features
            summary_file = glob.glob('summary_feature_file_*')
            # (should only be one summary file)
            assert len(summary_file) == 1
            summary_dict = joblib.load(summary_file[0])
def test_main_workflow(tmp_config_dir, tmp_output_dir):
    """Run the extract -> select workflow end to end for every
    (extract config, select configs) tuple in test_config_tuples.
    """
    for config_tuple in test_config_tuples:
        extract_config_filename = config_tuple[0]
        # have to put tmp_output_dir into yaml file
        rewritten_extract_config = rewrite_config(
            tmp_config_dir,
            replace_dict={
                'output_dir': ('replace with tmp_output_dir',
                               str(tmp_output_dir))
            },
            config_filename=extract_config_filename)
        hvc.extract(rewritten_extract_config)

        candidates = glob.glob(os.path.join(str(tmp_output_dir), '*extract*'))
        extract_dirs = [path for path in candidates if os.path.isdir(path)]
        extract_dirs.sort(key=os.path.getmtime)
        newest_extract_dir = extract_dirs[-1]  # newest dir, after mtime sort
        assert check_extract_output(newest_extract_dir)

        # glob returns a list; take the single summary feature file
        feature_file = glob.glob(
            os.path.join(newest_extract_dir, 'summary*'))[0]

        remaining_select_configs = config_tuple[1]
        while True:
            try:
                select_config_filename = remaining_select_configs.pop()
                rewritten_select_config = rewrite_config(
                    tmp_config_dir,
                    replace_dict={
                        'feature_file': ('replace with feature_file',
                                         feature_file),
                        'output_dir': ('replace with tmp_output_dir',
                                       str(tmp_output_dir))
                    },
                    config_filename=select_config_filename)
                hvc.select(rewritten_select_config)

                candidates = glob.glob(
                    os.path.join(str(tmp_output_dir), '*select*'))
                select_dirs = [
                    path for path in candidates if os.path.isdir(path)
                ]
                select_dirs.sort(key=os.path.getmtime)
                newest_select_dir = select_dirs[-1]  # newest, after sort
                assert check_select_output(rewritten_select_config,
                                           newest_select_dir)
            except IndexError:  # because pop from empty list
                break
def test_annotation_file_cbins(self, test_data_dir, tmp_output_dir):
    """extract should not fail when given an annotation .csv built
    from .not.mat files that annotate .cbin audio files."""
    cbin_dirs = [
        os.path.join(test_data_dir, os.path.normpath(rel_dir))
        for rel_dir in ('cbins/gy6or6/032312', 'cbins/gy6or6/032412')
    ]
    notmat_list = []
    for cbin_dir in cbin_dirs:
        notmat_list.extend(glob(os.path.join(cbin_dir, '*.not.mat')))
    # below, sorted() so it's the same order on different platforms
    notmat_list = sorted(notmat_list)

    csv_filename = os.path.join(str(tmp_output_dir), 'test.csv')
    annotation.notmat_list_to_csv(notmat_list, csv_filename)

    ftrs = hvc.extract(file_format='cbin',
                       annotation_file=csv_filename,
                       labels_to_use='iabcdefghjk',
                       feature_group='knn',
                       output_dir=str(tmp_output_dir),
                       return_features=True)
    assert type(ftrs) == dict
    assert sorted(ftrs.keys()) == ['features', 'labels']
def _yaml_config_asserts(self, extract_yaml_config_file, tmp_output_dir): replace_dict = {'output_dir': ('replace with tmp_output_dir', str(tmp_output_dir))} # have to put tmp_output_dir into yaml file extract_config_rewritten = rewrite_config(extract_yaml_config_file, tmp_output_dir, replace_dict) # helper function that is called by tests below hvc.extract(extract_config_rewritten) extract_config = hvc.parse_config(extract_config_rewritten, 'extract') for todo in extract_config['todo_list']: os.chdir(todo['output_dir']) extract_outputs = list( filter(os.path.isdir, glob('*extract_output*') ) ) extract_outputs.sort(key=os.path.getmtime) os.chdir(extract_outputs[-1]) # most recent ftr_files = glob('features_from*') ftr_dicts = [] for ftr_file in ftr_files: ftr_dicts.append(joblib.load(ftr_file)) if any(['features' in ftr_dict for ftr_dict in ftr_dicts]): assert all(['features' in ftr_dict for ftr_dict in ftr_dicts]) for ftr_dict in ftr_dicts: labels = ftr_dict['labels'] if 'features' in ftr_dict: features = ftr_dict['features'] assert features.shape[0] == len(labels) # make sure number of features i.e. columns is constant across feature matrices ftr_cols = [ftr_dict['features'].shape[1] for ftr_dict in ftr_dicts] assert np.unique(ftr_cols).shape[-1] == 1 if any(['neuralnets_input_dict' in ftr_dict for ftr_dict in ftr_dicts]): assert all(['neuralnets_input_dict' in ftr_dict for ftr_dict in ftr_dicts])
def main():
    """Create the feature files used by tests.

    For each name in ``feature_files_to_create``, runs the matching
    ``test_extract_<name>.config.yml`` through ``hvc.extract``, moves
    the resulting feature file into this directory as
    ``<name>.features``, then removes the temporary output directory
    and the rewritten config file.
    """
    feature_files_to_create = [
        'knn',
        'svm',
        'multiple_feature_groups',
        'flatwindow',
    ]
    for feature_to_create in feature_files_to_create:
        extract_config = os.path.join(
            here, '..', 'config.yml',
            'test_extract_{}.config.yml'.format(feature_to_create))
        print('running {} to create feature file'.format(extract_config))
        # have to put tmp_output_dir into yaml file
        replace_dict = {'output_dir': ('replace with tmp_output_dir', here)}
        extract_config_rewritten = rewrite_config(extract_config, here,
                                                  replace_dict)
        hvc.extract(extract_config_rewritten)

        # extract should have created exactly one output directory
        extract_output_dir = glob(os.path.join(here, '*extract*output*'))
        if len(extract_output_dir) != 1:
            # fixed typo in message: 'ouput' -> 'output'
            raise ValueError(
                'incorrect number of outputs when looking for extract '
                'output dirs:\n{}'.format(extract_output_dir))
        extract_output_dir = extract_output_dir[0]

        # ... and exactly one feature file inside it
        features_created = glob(
            os.path.join(extract_output_dir, 'features_created*'))
        if len(features_created) != 1:
            raise ValueError(
                'incorrect number of outputs when looking for extract '
                'feature files:\n{}'.format(features_created))
        features_created = features_created[0]

        movename = feature_to_create + '.' + 'features'
        shutil.move(src=features_created, dst=os.path.join(here, movename))
        os.rmdir(extract_output_dir)
        os.remove(extract_config_rewritten)
def test_data_dirs_cbins(self, test_data_dir, tmp_output_dir):
    """extract should not fail when given a data_dirs list whose
    directories contain .cbin audio files."""
    data_dirs = [
        os.path.join(test_data_dir, os.path.normpath(rel_dir))
        for rel_dir in ('cbins/gy6or6/032312', 'cbins/gy6or6/032412')
    ]
    ftrs = hvc.extract(data_dirs=data_dirs,
                       file_format='cbin',
                       labels_to_use='iabcdefghjk',
                       feature_group='knn',
                       output_dir=str(tmp_output_dir),
                       return_features=True)
    assert type(ftrs) == dict
    assert sorted(ftrs.keys()) == ['features', 'labels']
def run_main_workflow(tmp_output_dir, script_tuple_dict, configs_path):
    """tests main workflow for hybrid-vocal-classifier
    by iterating through test_main_workflow_dict,
    running the scripts named in each tuple in the dict

    Fix: removed a duplicated re-sort/re-assignment of
    select_output_dir that recomputed the same value immediately
    after the select-output assert (dead code).
    """
    extract_config_filename = os.path.join(configs_path,
                                           script_tuple_dict['extract'])
    # have to put tmp_output_dir into yaml file
    replace_dict = {
        'output_dir': ('replace with tmp_output_dir', str(tmp_output_dir))
    }
    extract_config_rewritten = rewrite_config(extract_config_filename,
                                              tmp_output_dir, replace_dict)
    hvc.extract(extract_config_rewritten)
    extract_outputs = list(
        filter(os.path.isdir,
               glob(os.path.join(str(tmp_output_dir), '*extract*'))))
    extract_outputs.sort(key=os.path.getmtime)
    extract_output_dir = extract_outputs[-1]  # [-1] is newest dir, after sort
    assert check_extract_output(extract_output_dir)

    feature_file = glob(os.path.join(extract_output_dir, 'features_created*'))
    feature_file = feature_file[0]  # because glob returns list
    os.remove(extract_config_rewritten)

    select_and_predict_tuples = script_tuple_dict['select and predict']
    for select_and_predict_tuple in select_and_predict_tuples:
        (select_config_filename,
         predict_config_filename) = select_and_predict_tuple

        # --- select step ---
        select_config_filename = os.path.join(configs_path,
                                              select_config_filename)
        select_config_rewritten = rewrite_config(
            select_config_filename,
            tmp_output_dir,
            replace_dict={
                'feature_file': ('replace with feature_file', feature_file),
                'output_dir': ('replace with tmp_output_dir',
                               str(tmp_output_dir))
            })
        hvc.select(select_config_rewritten)
        select_outputs = list(
            filter(os.path.isdir,
                   glob(os.path.join(str(tmp_output_dir), '*select*'))))
        select_outputs.sort(key=os.path.getmtime)
        select_output_dir = select_outputs[-1]  # newest dir, after sort
        assert check_select_output(select_config_rewritten, select_output_dir)
        os.remove(select_config_rewritten)

        # --- predict step, using the model the select step saved ---
        model_meta_files = glob(
            os.path.join(select_output_dir, '*', '*meta*'))
        replace_dict = {
            'model_meta_file': ('replace with model_file',
                                model_meta_files[-1]),
            'output_dir': ('replace with tmp_output_dir', str(tmp_output_dir))
        }
        predict_config_filename_with_path = os.path.join(
            configs_path, predict_config_filename)
        predict_config_rewritten = rewrite_config(
            predict_config_filename_with_path, tmp_output_dir, replace_dict)
        hvc.predict(predict_config_rewritten)
        os.remove(predict_config_rewritten)
        predict_outputs = list(
            filter(os.path.isdir,
                   glob(os.path.join(str(tmp_output_dir), '*predict*'))))
        predict_outputs.sort(key=os.path.getmtime)
        predict_output_dir = predict_outputs[-1]  # newest dir, after sort

        # every predict feature file must carry predicted labels
        feature_files = glob(os.path.join(predict_output_dir, 'feature*'))
        for ftr_filename in feature_files:
            ftr_file = joblib.load(ftr_filename)
            assert 'pred_labels' in ftr_file
            if 'predict_proba_True' in extract_config_filename:
                # probabilities saved too: one row per predicted label
                assert 'pred_probs' in ftr_file
                assert (ftr_file['pred_labels'].shape[0] ==
                        ftr_file['pred_probs'].shape[0])
from glob import glob

import hvc

# 0. create training data
# In this case, we download already labeled data from an open repository.
# String in quotes matches with the name of one of the folders in the repository.
hvc.utils.fetch('gy6or6.032612')

# 1. pick a model and 2. extract features for that model
# Model and features are defined in extract.config.yml file.
hvc.extract('gy6or6_autolabel_example.knn.extract.config.yml')

# 3. pick hyperparameters for model
# Load summary feature file to use with helper functions for
# finding best hyperparameters.
# glob returns a list; take the single summary file it matches
# (fix: previously the whole list was passed to load_feature_file)
summary_file = glob('./extract_output*/summary*')[0]
summary_data = hvc.load_feature_file(summary_file)

# In this case, we picked a k-nearest neighbors model
# and we want to find what value of k will give us the highest accuracy
cv_scores, best_k = hvc.utils.find_best_k(summary_data['features'],
                                          summary_data['labels'],
                                          k_range=range(1, 11))

# 4. Fit the **model** to the data and 5. Select the **best** model
hvc.select('gy6or6_autolabel.example.select.knn.config.yml')

# 6. **Predict** labels for unlabeled data using the fit model.
hvc.predict('gy6or6_autolabel.example.predict.knn.config.yml')