def test_main_workflow(tmp_config_dir, tmp_output_dir):
    """Run extract followed by every paired select config and check the outputs.

    For each entry of the module-level ``test_config_tuples``, element ``[0]``
    is the extract config filename and element ``[1]`` is a sequence of select
    config filenames. Each config is rewritten so that it points at the
    temporary output directory (and, for select, at the feature file produced
    by the extract step) before it is run.

    Parameters
    ----------
    tmp_config_dir
        Passed through to ``rewrite_config`` as its first argument.
    tmp_output_dir
        Directory into which extract and select write their output.
    """
    output_dir_replacement = ('replace with tmp_output_dir',
                              str(tmp_output_dir))

    for test_config_tuple in test_config_tuples:
        extract_config_filename = test_config_tuple[0]
        # have to put tmp_output_dir into yaml file
        tmp_config_path = rewrite_config(
            tmp_config_dir,
            replace_dict={'output_dir': output_dir_replacement},
            config_filename=extract_config_filename)
        hvc.extract(tmp_config_path)

        extract_outputs = list(
            filter(os.path.isdir,
                   glob.glob(os.path.join(str(tmp_output_dir), '*extract*'))))
        extract_outputs.sort(key=os.path.getmtime)
        extract_output_dir = extract_outputs[-1]  # newest dir, after sort
        assert check_extract_output(extract_output_dir)

        # glob returns a list; take the first summary file
        feature_file = glob.glob(
            os.path.join(extract_output_dir, 'summary*'))[0]

        select_config_filenames = test_config_tuple[1]
        # reversed() visits the configs in the same last-to-first order that
        # repeated pop() would, but without emptying the shared list and
        # without needing a try/except that could hide a genuine IndexError
        # raised while running or checking a select step.
        for select_config_filename in reversed(select_config_filenames):
            tmp_config_path = rewrite_config(
                tmp_config_dir,
                replace_dict={
                    'feature_file': ('replace with feature_file',
                                     feature_file),
                    'output_dir': output_dir_replacement,
                },
                config_filename=select_config_filename)
            hvc.select(tmp_config_path)

            select_outputs = list(
                filter(os.path.isdir,
                       glob.glob(
                           os.path.join(str(tmp_output_dir), '*select*'))))
            select_outputs.sort(key=os.path.getmtime)
            select_output_dir = select_outputs[-1]  # newest dir, after sort
            assert check_select_output(tmp_config_path, select_output_dir)
def test_select_knn_ftr_grp(self, tmp_output_dir, test_data_dir):
    """Call hvc.select on 'knn.features', choosing features with feature_group='knn'."""
    knn_features = os.path.join(test_data_dir, 'feature_files',
                                'knn.features')
    select_kwargs = dict(
        feature_file_path=knn_features,
        feature_group='knn',
        train_samples_range=range(100, 401, 100),
        num_replicates=5,
        num_test_samples=400,
        model_name='knn',
        hyperparameters={'k': 4},
        output_dir=str(tmp_output_dir),
    )
    hvc.select(**select_kwargs)
def test_select_knn_ftr_list_indices(self, tmp_output_dir, test_data_dir):
    """Call hvc.select for a knn model, choosing features by feature list indices."""
    pattern = os.path.join(test_data_dir, 'feature_files',
                           'test_extract_knn*')
    matches = glob(pattern)
    knn_features = matches[0]  # first file matching the pattern
    select_kwargs = dict(
        feature_file_path=knn_features,
        feature_list_indices=[0, 1, 2, 3, 4, 5, 6, 7, 8],
        train_samples_range=range(100, 401, 100),
        num_replicates=5,
        num_test_samples=400,
        model_name='knn',
        hyperparameters={'k': 4},
        output_dir=str(tmp_output_dir),
    )
    hvc.select(**select_kwargs)
def test_select_flatwindow_ftr_grp(self, tmp_output_dir, test_data_dir):
    """Call hvc.select for the 'flatwindow' model with neuralnet_input='flatwindow'."""
    flatwindow_features = os.path.join(test_data_dir, 'feature_files',
                                       'flatwindow.features')
    training_hyperparameters = {'batch_size': 32, 'epochs': 3}
    select_kwargs = dict(
        feature_file_path=flatwindow_features,
        model_name='flatwindow',
        hyperparameters=training_hyperparameters,
        neuralnet_input='flatwindow',
        train_samples_range=range(200, 401, 200),
        num_replicates=3,
        num_test_samples=400,
        output_dir=str(tmp_output_dir),
    )
    hvc.select(**select_kwargs)
def test_select_svm_ftr_grp(self, tmp_output_dir, test_data_dir):
    """Call hvc.select for an svm model, choosing features with feature_group='svm'."""
    pattern = os.path.join(test_data_dir, 'feature_files',
                           'test_extract_svm*')
    matches = glob(pattern)
    svm_features = matches[0]  # first file matching the pattern
    select_kwargs = dict(
        feature_file_path=svm_features,
        feature_group='svm',
        train_samples_range=range(100, 401, 100),
        num_replicates=5,
        num_test_samples=400,
        model_name='svm',
        hyperparameters={'C': 1, 'gamma': 0.01},
        output_dir=str(tmp_output_dir),
    )
    hvc.select(**select_kwargs)
def _yaml_config_asserts(self, select_yaml_config_path, tmp_output_dir,
                         feature_file):
    """Helper with assertions shared by all tests that run hvc.select from a config.yml file.

    Rewrites the config so it points at ``feature_file`` and
    ``tmp_output_dir``, runs ``hvc.select`` on it, then asserts that exactly
    one new summary file appeared and that the directory holding it contains
    one sub-directory per model listed in the config.

    Returns True when every assertion passes.
    """
    rewritten_config = rewrite_config(
        select_yaml_config_path,
        tmp_output_dir,
        replace_dict={
            'feature_file': ('replace with feature_file', feature_file),
            'output_dir': ('replace with tmp_output_dir',
                           str(tmp_output_dir)),
        })

    summary_pattern = os.path.join(str(tmp_output_dir), 'select_output*',
                                   'summary_model_select_file*')
    summaries_before = glob(summary_pattern)
    hvc.select(rewritten_config)
    summaries_after = glob(summary_pattern)

    new_summaries = [
        path for path in summaries_after if path not in summaries_before
    ]
    # should only be one summary output file
    assert len(new_summaries) == 1

    # now check, for every model in the config, that there is a
    # corresponding folder with model files etc.
    parsed_config = hvc.parse_config(rewritten_config, 'select')
    run_dir = os.path.dirname(new_summaries[0])
    _, model_dirs, _ = next(os.walk(run_dir))  # keep just the dir names
    expected_folders = [
        determine_model_output_folder_name(model_dict)
        for model_dict in parsed_config['models']
    ]
    for expected_folder in expected_folders:
        assert expected_folder in model_dirs

    return True
def _newest_dir(parent_dir, pattern):
    """Return the most recently modified directory in parent_dir matching pattern."""
    candidates = [
        path for path in glob(os.path.join(str(parent_dir), pattern))
        if os.path.isdir(path)
    ]
    candidates.sort(key=os.path.getmtime)
    return candidates[-1]  # [-1] is newest dir, after sort


def run_main_workflow(tmp_output_dir, script_tuple_dict, configs_path):
    """Run the extract -> select -> predict workflow for one set of configs.

    Parameters
    ----------
    tmp_output_dir
        Directory that all three steps write to; its path is substituted
        into every config before the config is run.
    script_tuple_dict : dict
        ``script_tuple_dict['extract']`` is the extract config filename;
        ``script_tuple_dict['select and predict']`` is an iterable of
        ``(select_config_filename, predict_config_filename)`` pairs.
    configs_path
        Directory that the config filenames are joined onto.

    Raises
    ------
    AssertionError
        If the extract or select output checks fail, or if a predict
        feature file lacks the expected keys.
    """
    output_dir_replacement = ('replace with tmp_output_dir',
                              str(tmp_output_dir))

    extract_config_filename = os.path.join(configs_path,
                                           script_tuple_dict['extract'])
    # have to put tmp_output_dir into yaml file
    extract_config_rewritten = rewrite_config(
        extract_config_filename, tmp_output_dir,
        {'output_dir': output_dir_replacement})
    hvc.extract(extract_config_rewritten)
    extract_output_dir = _newest_dir(tmp_output_dir, '*extract*')
    assert check_extract_output(extract_output_dir)
    # glob returns a list; take the first match
    feature_file = glob(
        os.path.join(extract_output_dir, 'features_created*'))[0]
    os.remove(extract_config_rewritten)

    for select_and_predict_tuple in script_tuple_dict['select and predict']:
        (select_config_filename,
         predict_config_filename) = select_and_predict_tuple

        # --- select ---
        select_config_rewritten = rewrite_config(
            os.path.join(configs_path, select_config_filename),
            tmp_output_dir,
            replace_dict={
                'feature_file': ('replace with feature_file', feature_file),
                'output_dir': output_dir_replacement,
            })
        hvc.select(select_config_rewritten)
        select_output_dir = _newest_dir(tmp_output_dir, '*select*')
        assert check_select_output(select_config_rewritten,
                                   select_output_dir)
        os.remove(select_config_rewritten)

        # --- predict, using a model meta file from the select step ---
        model_meta_files = glob(
            os.path.join(select_output_dir, '*', '*meta*'))
        predict_config_rewritten = rewrite_config(
            os.path.join(configs_path, predict_config_filename),
            tmp_output_dir,
            {
                'model_meta_file': ('replace with model_file',
                                    model_meta_files[-1]),
                'output_dir': output_dir_replacement,
            })
        hvc.predict(predict_config_rewritten)
        os.remove(predict_config_rewritten)
        predict_output_dir = _newest_dir(tmp_output_dir, '*predict*')

        for ftr_filename in glob(os.path.join(predict_output_dir,
                                              'feature*')):
            ftr_file = joblib.load(ftr_filename)
            assert 'pred_labels' in ftr_file
            # probabilities are only checked for extract configs whose
            # path contains 'predict_proba_True'
            if 'predict_proba_True' in extract_config_filename:
                assert 'pred_probs' in ftr_file
                assert (ftr_file['pred_labels'].shape[0] ==
                        ftr_file['pred_probs'].shape[0])
def test_select_multiple_ftr_grp(self):
    """Run hvc.select with the config whose model features are given as a list of multiple feature groups."""
    config_name = 'test_select_multiple_ftr_grp.config.yml'
    config_path = os.path.join(configs, config_name)
    hvc.select(config_path)
def test_select_knn_ftr_grp(self):
    """Run hvc.select with a knn select config (test named for the feature-group case)."""
    # NOTE(review): the test name says "ftr_grp" but the config file name
    # says "ftr_list_inds" -- confirm this is the intended config.
    config_name = 'test_select_knn_ftr_list_inds.config.yml'
    config_path = os.path.join(configs, config_name)
    hvc.select(config_path)
from glob import glob

import hvc

# Example script: the full autolabel workflow with a k-nearest neighbors model.

# 0. create training data
# In this case, we download already labeled data from an open repository.
# String in quotes matches with the name of one of the folders in the repository.
hvc.utils.fetch('gy6or6.032612')

# 1. pick a model and 2. extract features for that model
# Model and features are defined in extract.config.yml file.
hvc.extract('gy6or6_autolabel_example.knn.extract.config.yml')

# 3. pick hyperparameters for model
# Load summary feature file to use with helper functions for
# finding best hyperparameters.
# glob returns a list of matching paths, so pick one path out of it
# rather than handing the whole list to load_feature_file.
summary_files = glob('./extract_output*/summary*')
if not summary_files:
    raise FileNotFoundError(
        "no summary file found matching './extract_output*/summary*'")
summary_file = summary_files[0]
summary_data = hvc.load_feature_file(summary_file)

# In this case, we picked a k-nearest neighbors model
# and we want to find what value of k will give us the highest accuracy
cv_scores, best_k = hvc.utils.find_best_k(summary_data['features'],
                                          summary_data['labels'],
                                          k_range=range(1, 11))

# 4. Fit the **model** to the data and 5. Select the **best** model
hvc.select('gy6or6_autolabel.example.select.knn.config.yml')

# 6. **Predict** labels for unlabeled data using the fit model.
hvc.predict('gy6or6_autolabel.example.predict.knn.config.yml')