def _yaml_config_run(self, predict_yaml_config_path, tmp_output_dir,
                     model_meta_file):
    """helper function with assertions shared by all
    tests for hvc.predict run with config.yml files"""
    predict_config_rewritten = rewrite_config(
        predict_yaml_config_path,
        tmp_output_dir,
        replace_dict={'model_meta_file': ('replace with model_meta_file',
                                          model_meta_file),
                      'output_dir': ('replace with tmp_output_dir',
                                     str(tmp_output_dir))})
    predict_outputs_before = glob(os.path.join(str(tmp_output_dir),
                                               'predict_output*',
                                               'features_created*'))
    hvc.predict(predict_config_rewritten)
    predict_outputs_after = glob(os.path.join(str(tmp_output_dir),
                                              'predict_output*',
                                              'features_created*'))
    predict_output = [after for after in predict_outputs_after
                      if after not in predict_outputs_before]
    # should only be one new predict output file
    if len(predict_output) != 1:
        raise ValueError('found wrong number of predict outputs after '
                         'running .yaml config {}.\n'
                         'This was the output found: {}'
                         .format(predict_config_rewritten, predict_output))
    else:
        predict_output = predict_output[0]
    predict = joblib.load(predict_output)
    self._generic_predict_asserts(predict)
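# NOTE: the sketch below is an assumption, not the actual helper from the
# test suite. It shows the kind of shared assertions `_generic_predict_asserts`
# presumably makes, based on the keys that the data-dirs tests below check
# directly on the dict returned by hvc.predict.
def _generic_predict_asserts(self, predict):
    # predict should be a dict with the standard keys returned by hvc.predict
    assert type(predict) == dict
    for key in ['labels', 'pred_labels', 'songfile_IDs',
                'onsets_Hz', 'offsets_Hz', 'features']:
        assert key in predict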
def test_predict_flatwindow_data_dirs(self, tmp_output_dir, test_data_dir):
    # tests predict with flatwindow model, using data dirs
    data_dirs = ['cbins/gy6or6/032312',
                 'cbins/gy6or6/032412']
    data_dirs = [os.path.join(test_data_dir, os.path.normpath(data_dir))
                 for data_dir in data_dirs]
    file_format = 'cbin'
    model_meta_file = os.path.join(test_data_dir, 'model_files',
                                   'flatwindow.meta')
    output_dir = tmp_output_dir
    # explicitly set segment to None because we want to test
    # the default behavior that happens when we supply an argument
    # for the data_dirs parameter **and** segment is set to None
    # (as it should be by default)
    segment = None
    predict_proba = False
    return_predictions = True
    predict = hvc.predict(data_dirs=data_dirs,
                          file_format=file_format,
                          model_meta_file=model_meta_file,
                          segment=segment,
                          output_dir=str(tmp_output_dir),
                          predict_proba=predict_proba,
                          return_predictions=return_predictions)
    self._generic_predict_asserts(predict)
def test_data_dirs_cbins(self, tmp_output_dir, test_data_dir):
    """test that calling predict doesn't fail when we pass a
    data_dirs list that contains cbin audio files"""
    data_dirs = ['cbins/gy6or6/032312',
                 'cbins/gy6or6/032412']
    data_dirs = [os.path.join(test_data_dir, os.path.normpath(data_dir))
                 for data_dir in data_dirs]
    file_format = 'cbin'
    model_meta_file = ''
    output_dir = tmp_output_dir
    # explicitly set segment to None because we want to test
    # the default behavior that happens when we supply an argument
    # for the data_dirs parameter **and** segment is set to None
    # (as it should be by default)
    segment = None
    predict_proba = False
    convert_to = 'cbin'  # to check that this works
    return_predictions = True
    predict = hvc.predict(data_dirs=data_dirs,
                          file_format=file_format,
                          model_meta_file=model_meta_file,
                          segment=segment,
                          predict_proba=predict_proba,
                          convert_to=convert_to,
                          return_predictions=return_predictions)
    assert type(predict) == dict
    for key in ['labels', 'pred_labels', 'songfile_IDs',
                'onsets_Hz', 'offsets_Hz', 'features']:
        assert key in predict
def test_predict_knn_data_dirs_notmat(self, tmp_output_dir, test_data_dir):
    # tests predict with knn model, using data dirs, and
    # converting output to notmat files
    data_dirs = ['cbins/gy6or6/032312',
                 'cbins/gy6or6/032412']
    data_dirs = [os.path.join(test_data_dir, os.path.normpath(data_dir))
                 for data_dir in data_dirs]
    file_format = 'cbin'
    model_meta_file = os.path.join(test_data_dir, 'model_files', 'knn.meta')
    output_dir = tmp_output_dir
    # explicitly set segment to None because we want to test
    # the default behavior that happens when we supply an argument
    # for the data_dirs parameter **and** segment is set to None
    # (as it should be by default)
    segment = None
    predict_proba = False
    convert_to = 'notmat'  # to check that this works
    return_predictions = True
    predict = hvc.predict(data_dirs=data_dirs,
                          file_format=file_format,
                          model_meta_file=model_meta_file,
                          segment=segment,
                          output_dir=str(tmp_output_dir),
                          predict_proba=predict_proba,
                          convert_to=convert_to,
                          return_predictions=return_predictions)
    assert type(predict) == dict
    for key in ['labels', 'pred_labels', 'songfile_IDs',
                'onsets_Hz', 'offsets_Hz', 'features']:
        assert key in predict
def run_main_workflow(tmp_output_dir, script_tuple_dict, configs_path):
    """tests main workflow for hybrid-vocal-classifier
    by iterating through test_main_workflow_dict,
    running the scripts named in each tuple in the dict
    """
    extract_config_filename = os.path.join(configs_path,
                                           script_tuple_dict['extract'])
    # have to put tmp_output_dir into yaml file
    replace_dict = {'output_dir':
                    ('replace with tmp_output_dir', str(tmp_output_dir))}
    extract_config_rewritten = rewrite_config(extract_config_filename,
                                              tmp_output_dir,
                                              replace_dict)
    hvc.extract(extract_config_rewritten)
    extract_outputs = list(
        filter(os.path.isdir,
               glob(os.path.join(str(tmp_output_dir), '*extract*'))))
    extract_outputs.sort(key=os.path.getmtime)
    extract_output_dir = extract_outputs[-1]  # [-1] is newest dir, after sort
    assert check_extract_output(extract_output_dir)

    feature_file = glob(os.path.join(extract_output_dir, 'features_created*'))
    feature_file = feature_file[0]  # because glob returns list
    os.remove(extract_config_rewritten)

    select_and_predict_tuples = script_tuple_dict['select and predict']
    for select_and_predict_tuple in select_and_predict_tuples:
        (select_config_filename,
         predict_config_filename) = select_and_predict_tuple

        select_config_filename = os.path.join(configs_path,
                                              select_config_filename)
        select_config_rewritten = rewrite_config(
            select_config_filename,
            tmp_output_dir,
            replace_dict={'feature_file':
                          ('replace with feature_file', feature_file),
                          'output_dir':
                          ('replace with tmp_output_dir',
                           str(tmp_output_dir))})
        hvc.select(select_config_rewritten)
        select_outputs = list(
            filter(os.path.isdir,
                   glob(os.path.join(str(tmp_output_dir), '*select*'))))
        select_outputs.sort(key=os.path.getmtime)
        select_output_dir = select_outputs[-1]  # [-1] is newest dir, after sort
        assert check_select_output(select_config_rewritten, select_output_dir)
        os.remove(select_config_rewritten)

        model_meta_files = glob(os.path.join(select_output_dir, '*', '*meta*'))
        replace_dict = {'model_meta_file':
                        ('replace with model_file', model_meta_files[-1]),
                        'output_dir':
                        ('replace with tmp_output_dir', str(tmp_output_dir))}
        predict_config_filename_with_path = os.path.join(
            configs_path, predict_config_filename)
        predict_config_rewritten = rewrite_config(
            predict_config_filename_with_path, tmp_output_dir, replace_dict)
        hvc.predict(predict_config_rewritten)
        os.remove(predict_config_rewritten)
        predict_outputs = list(
            filter(os.path.isdir,
                   glob(os.path.join(str(tmp_output_dir), '*predict*'))))
        predict_outputs.sort(key=os.path.getmtime)
        predict_output_dir = predict_outputs[-1]  # [-1] is newest dir, after sort
        feature_files = glob(os.path.join(predict_output_dir, 'feature*'))
        for ftr_filename in feature_files:
            ftr_file = joblib.load(ftr_filename)
            assert 'pred_labels' in ftr_file
            if 'predict_proba_True' in extract_config_filename:
                assert 'pred_probs' in ftr_file
                assert (ftr_file['pred_labels'].shape[0]
                        == ftr_file['pred_probs'].shape[0])
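# NOTE: `rewrite_config` is defined elsewhere in the test suite; the sketch
# below is only an assumption about its behavior, inferred from how it is
# called here: copy a .yml config into tmp_output_dir, swap in the values
# named in replace_dict, and return the path to the rewritten copy.
import os
import yaml


def rewrite_config_sketch(config_path, tmp_output_dir, replace_dict):
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)
    for key, (_description, new_value) in replace_dict.items():
        # hypothetical: assumes the keys to replace live at the top level
        # of the config; the real configs may nest them deeper
        config[key] = new_value
    rewritten_path = os.path.join(str(tmp_output_dir),
                                  os.path.basename(config_path))
    with open(rewritten_path, 'w') as config_file:
        yaml.safe_dump(config, config_file)
    return rewritten_path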
from glob import glob

import hvc

# 0. create training data
# In this case, we download already-labeled data from an open repository.
# The string in quotes matches the name of one of the folders in the repository.
hvc.utils.fetch('gy6or6.032612')

# 1. pick a model and 2. extract features for that model
# Model and features are defined in the extract.config.yml file.
hvc.extract('gy6or6_autolabel_example.knn.extract.config.yml')

# 3. pick hyperparameters for model
# Load the summary feature file to use with helper functions for
# finding the best hyperparameters.
summary_file = glob('./extract_output*/summary*')[0]  # [0] because glob returns a list
summary_data = hvc.load_feature_file(summary_file)
# In this case, we picked a k-nearest neighbors model
# and we want to find what value of k will give us the highest accuracy.
cv_scores, best_k = hvc.utils.find_best_k(summary_data['features'],
                                          summary_data['labels'],
                                          k_range=range(1, 11))

# 4. fit the model to the data and 5. select the best model
hvc.select('gy6or6_autolabel.example.select.knn.config.yml')

# 6. predict labels for unlabeled data using the fit model
hvc.predict('gy6or6_autolabel.example.predict.knn.config.yml')
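# After the last step, hvc.predict writes its output under the output
# directory named in the predict config. A quick way to inspect the predicted
# labels (a sketch, assuming the same 'predict_output*' directory and
# 'feature*' file layout with a 'pred_labels' key that run_main_workflow
# checks above):
import joblib

predict_output_files = glob('./predict_output*/feature*')
for predict_output_file in predict_output_files:
    output = joblib.load(predict_output_file)
    print(predict_output_file, output['pred_labels'])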