def augment(self, x):
    """Run a waveform through a randomly parameterised chain of SoX effects.

    The chain is: pitch, contrast, two equalizer bands, reverb. Every effect
    parameter is sampled from ``self.random_state`` (or ``self.loguniform``
    for the equalizer frequencies). The output of the chain is passed through
    ``pad_truncate_sequence`` so that it is brought back to ``len(x)``.

    Args:
      x: input array, forwarded to ``sox.Transformer.build_array`` together
        with ``self.sample_rate``.

    Returns:
      The processed array after ``pad_truncate_sequence(..., len(x))``.
    """
    original_length = len(x)

    # Stop records from the 'sox' logger from bubbling up to parent loggers.
    sox_logger = logging.getLogger('sox')
    sox_logger.propagate = False

    chain = sox.Transformer()
    chain.set_globals(verbosity=0)

    # NOTE: the sampling order below is significant; reordering the draws
    # would change the parameters produced for a given random state.
    pitch_amount = self.random_state.uniform(-0.1, 0.1, 1)[0]
    chain.pitch(pitch_amount)

    contrast_amount = self.random_state.uniform(0, 100, 1)[0]
    chain.contrast(contrast_amount)

    # Two independently sampled equalizer bands.
    for _ in range(2):
        band_frequency = self.loguniform(32, 4096, 1)[0]
        band_width_q = self.random_state.uniform(1, 2, 1)[0]
        band_gain_db = self.random_state.uniform(-30, 10, 1)[0]
        chain.equalizer(
            frequency=band_frequency,
            width_q=band_width_q,
            gain_db=band_gain_db)

    reverberance = self.random_state.uniform(0, 70, 1)[0]
    chain.reverb(reverberance=reverberance)

    processed = chain.build_array(
        input_array=x, sample_rate_in=self.sample_rate)

    return pad_truncate_sequence(processed, original_length)
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5 file.

    For every audio file listed in the metadata, the audio returned by
    read_side_audio is padded / truncated to config.total_samples, converted
    by LogMelExtractor and appended to the 'feature_side' dataset. Whichever
    of the metadata columns 'scene_label', 'identifier' and 'source_label'
    are present are stored next to 'audio_name'.

    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples

    # Paths
    prefix = 'minidata_' if mini_data else ''

    sub_dir = get_subdir(subtask, data_type)
    metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    feature_path = os.path.join(
        workspace, 'features_side',
        '{}logmel_{}frames_{}melbins'.format(
            prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Columns that are optional in the metadata, with their hdf5 string dtype
    optional_columns = (
        ('scene_label', 'S24'),
        ('identifier', 'S24'),
        ('source_label', 'S8'))

    # Use a small, reproducible subset of the data for debugging
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        # Never ask for more items than exist; choice(replace=False) would
        # raise otherwise.
        indexes = random_state.choice(
            total_num, size=min(mini_num, total_num), replace=False)

        # Only subset the columns that are really there: the label columns
        # are treated as optional when they are written out below, so they
        # must be optional here as well.
        for key in ('audio_name',) + tuple(k for (k, _) in optional_columns):
            if key in meta_dict:
                meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets. The context manager
    # guarantees the file is closed even if reading an audio file fails.
    with h5py.File(feature_path, 'w') as hf:
        hf.create_dataset(
            name='audio_name',
            data=[audio_name.encode()
                  for audio_name in meta_dict['audio_name']],
            dtype='S80')

        for (key, string_dtype) in optional_columns:
            if key in meta_dict:
                hf.create_dataset(
                    name=key,
                    data=[value.encode() for value in meta_dict[key]],
                    dtype=string_dtype)

        # Starts empty and grows by one row per audio file
        feature_dataset = hf.create_dataset(
            name='feature_side',
            shape=(0, frames_num, mel_bins),
            maxshape=(None, frames_num, mel_bins),
            dtype=np.float32)

        for (n, audio_name) in enumerate(meta_dict['audio_name']):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)

            # Read audio
            (audio, _) = read_side_audio(
                audio_path=audio_path, target_fs=sample_rate)

            # Pad or truncate audio recording to the same length
            audio = pad_truncate_sequence(audio, total_samples)

            # Extract feature
            feature = feature_extractor.transform(audio)

            # Remove the extra log mel spectrogram frames caused by
            # padding zero
            feature = feature[0:frames_num]

            feature_dataset.resize((n + 1, frames_num, mel_bins))
            feature_dataset[n] = feature

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5 file.

    Four growing datasets are written, one row per audio file:
      'feature':       audio from read_audio, padded / truncated to
                       config.total_samples
      'feature_gamm':  transposed output of gtg_in_dB
      'feature_mfcc':  transposed librosa MFCCs
      'feature_panns': mono waveform loaded at 32000 Hz, padded / truncated
                       to 320000 samples

    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'leaderboard'
      mini_data: bool, set True for debugging on a small part of data

    Any of the attributes above that is missing on `args` falls back to the
    value that used to be hard-coded in this function.

    Raises:
      Exception: if data_type is neither 'development' nor 'leaderboard'.
    '''

    # Arguments & parameters. The values used to be hard-coded for a local
    # debugging setup; they are kept as fallbacks so existing call sites keep
    # working, but values supplied through `args` are honoured again.
    dataset_dir = getattr(args, 'dataset_dir', 'D:/Project/DCASE_test/Data')
    workspace = getattr(args, 'workspace', 'D:/Project/DCASE_test')
    subtask = getattr(args, 'subtask', 'a')
    data_type = getattr(args, 'data_type', 'development')
    mini_data = getattr(args, 'mini_data', False)

    sample_rate = config.sample_rate
    mel_bins = config.mel_bins
    frames_per_second = config.frames_per_second
    total_samples = config.total_samples
    mfcc_frames = config.mfcc_frames
    n_mfcc = config.n_mfcc
    mfcc_hop_size = config.mfcc_hop_size
    gamm_frames = config.gamm_frames
    n_gamm = config.n_gamm

    # Sample rate and row width of the 'feature_panns' dataset
    panns_sample_rate = 32000
    panns_samples = 320000

    # Paths
    prefix = 'minidata_' if mini_data else ''

    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(
            dataset_dir, sub_dir, 'evaluation_setup', 'test.csv')
    else:
        raise Exception('Incorrect data_type!')

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(
            prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Use a small, reproducible subset of the data for debugging
    if mini_data:
        mini_num = 300
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        # Never ask for more items than exist; choice(replace=False) would
        # raise otherwise.
        indexes = random_state.choice(
            total_num, size=min(mini_num, total_num), replace=False)
        for key in list(meta_dict):
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets. The context manager
    # guarantees the file is closed even if processing a file fails.
    with h5py.File(feature_path, 'w') as hf:
        hf.create_dataset(
            name='audio_name',
            data=[audio_name.encode()
                  for audio_name in meta_dict['audio_name']],
            dtype='S80')

        # Label columns are optional in the metadata
        for (key, string_dtype) in (
                ('scene_label', 'S24'),
                ('identifier', 'S24'),
                ('source_label', 'S8')):
            if key in meta_dict:
                hf.create_dataset(
                    name=key,
                    data=[value.encode() for value in meta_dict[key]],
                    dtype=string_dtype)

        # All feature datasets start empty and grow by one row per file
        audio_dataset = hf.create_dataset(
            name='feature',
            shape=(0, total_samples),
            maxshape=(None, total_samples),
            dtype=np.float32)

        gamm_dataset = hf.create_dataset(
            name='feature_gamm',
            shape=(0, gamm_frames, n_gamm),
            maxshape=(None, gamm_frames, n_gamm),
            dtype=np.float32)

        mfcc_dataset = hf.create_dataset(
            name='feature_mfcc',
            shape=(0, mfcc_frames, n_mfcc),
            maxshape=(None, mfcc_frames, n_mfcc),
            dtype=np.float32)

        panns_dataset = hf.create_dataset(
            name='feature_panns',
            shape=(0, panns_samples),
            maxshape=(None, panns_samples),
            dtype=np.float32)

        for (n, audio_name) in enumerate(meta_dict['audio_name']):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)

            # Raw audio, padded or truncated to a fixed length
            (audio, _) = read_audio(
                audio_path=audio_path, target_fs=sample_rate)
            audio = pad_truncate_sequence(audio, total_samples)

            # gtg_in_dB output is transposed before it is stored
            (audio_gamm, _) = read_audio_gamm(
                audio_path=audio_path, target_fs=sample_rate)
            fea_gamm, _ = gtg_in_dB(audio_gamm, sample_rate)
            fea_gamm = fea_gamm.transpose(1, 0)

            # MFCCs, computed at librosa's default loading sample rate and
            # transposed before they are stored
            sound, fs = librosa.load(audio_path)
            fea_mfcc = librosa.feature.mfcc(
                y=sound, sr=fs, hop_length=mfcc_hop_size, n_mfcc=n_mfcc)
            fea_mfcc = fea_mfcc.transpose(1, 0)

            # 32 kHz mono waveform. Pad / truncate so that a recording that
            # is not exactly panns_samples long still fits the dataset row.
            (waveform, _) = librosa.core.load(
                audio_path, sr=panns_sample_rate, mono=True)
            waveform = pad_truncate_sequence(waveform, panns_samples)

            audio_dataset.resize((n + 1, total_samples))
            audio_dataset[n] = audio

            gamm_dataset.resize((n + 1, gamm_frames, n_gamm))
            gamm_dataset[n] = fea_gamm

            mfcc_dataset.resize((n + 1, mfcc_frames, n_mfcc))
            mfcc_dataset[n] = fea_mfcc

            panns_dataset.resize((n + 1, panns_samples))
            panns_dataset[n] = waveform

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a single
    hdf5 file.

    For every audio name in the metadata a log mel feature is appended to
    the 'feature' dataset. When read_metadata reports weak and / or strong
    labels, the matching 'weak_target' / 'strong_target' rows are written as
    well.

    Args:
      dataset_dir: string
      workspace: string
      data_type: 'development' | 'evaluation'
        NOTE(review): the code also special-cases 'validation', whose
        metadata is read from the 'metadata/validation' sub folder.
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx

    # Paths
    prefix = 'minidata_' if mini_data else ''

    relative_name = get_relative_path_no_extension(data_type)
    audios_dir = os.path.join(dataset_dir, 'audio', relative_name)

    if data_type == 'validation':
        metadata_path = os.path.join(
            dataset_dir, 'metadata', 'validation',
            '{}.csv'.format(relative_name))
    else:
        metadata_path = os.path.join(
            dataset_dir, 'metadata', '{}.csv'.format(relative_name))

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(
            prefix, frames_per_second, mel_bins),
        '{}.h5'.format(relative_name))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    (data_dict, has_weak_labels, has_strong_labels) = read_metadata(
        metadata_path)

    # Sorted so that the row order of the hdf5 file is reproducible
    audio_names = sorted(data_dict)

    # Use a small, reproducible subset of the data for debugging
    if mini_data:
        random_state = np.random.RandomState(1234)
        random_state.shuffle(audio_names)
        audio_names = audio_names[0:10]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets. The context manager
    # guarantees the file is closed even if processing a file fails.
    with h5py.File(feature_path, 'w') as hf:
        hf.create_dataset(
            name='audio_name',
            data=[audio_name.encode() for audio_name in audio_names],
            dtype='S64')

        # All growing datasets start empty and gain one row per audio file
        hf.create_dataset(
            name='feature',
            shape=(0, frames_num, mel_bins),
            maxshape=(None, frames_num, mel_bins),
            dtype=np.float32)

        # The builtin `bool` is used as dtype: the `np.bool` alias was
        # removed from NumPy in 1.24.
        if has_weak_labels:
            hf.create_dataset(
                name='weak_target',
                shape=(0, classes_num),
                maxshape=(None, classes_num),
                dtype=bool)

        if has_strong_labels:
            hf.create_dataset(
                name='strong_target',
                shape=(0, frames_num, classes_num),
                maxshape=(None, frames_num, classes_num),
                dtype=bool)

        for (n, audio_name) in enumerate(audio_names):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)

            # Read audio
            (audio, _) = read_audio(
                audio_path=audio_path, target_fs=sample_rate)

            # Pad or truncate audio recording
            audio = pad_truncate_sequence(audio, total_samples)

            # Extract feature
            feature = feature_extractor.transform(audio)

            # Remove the extra frames caused by padding zero
            feature = feature[0:frames_num]

            hf['feature'].resize((n + 1, frames_num, mel_bins))
            hf['feature'][n] = feature

            if has_weak_labels:
                weak_labels = data_dict[audio_name]['weak_labels']
                hf['weak_target'].resize((n + 1, classes_num))
                hf['weak_target'][n] = labels_to_target(
                    weak_labels, classes_num, lb_to_idx)

            if has_strong_labels:
                events = data_dict[audio_name]['strong_labels']
                hf['strong_target'].resize(
                    (n + 1, frames_num, classes_num))
                hf['strong_target'][n] = events_to_target(
                    events=events,
                    frames_num=frames_num,
                    classes_num=classes_num,
                    frames_per_second=frames_per_second,
                    lb_to_idx=lb_to_idx)

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a single
    hdf5 file.

    The audio names (and, when present in the metadata, 'fine_target' and
    'coarse_target') are written up front; a log mel feature is then appended
    to the 'feature' dataset for every audio file.

    Args:
      dataset_dir: string
      workspace: string
      data_type: 'train' | 'validate' | 'evaluate'
      mini_data: bool, set True for debugging on a small part of data

    Raises:
      ValueError: if data_type is not one of the values listed above.
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    data_type = args.data_type
    workspace = args.workspace
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples

    # Paths
    prefix = 'minidata_' if mini_data else ''

    metadata_path = os.path.join(dataset_dir, 'annotations.csv')

    if data_type in ['train', 'validate']:
        audios_dir = os.path.join(dataset_dir, data_type)
    elif data_type == 'evaluate':
        audios_dir = os.path.join(dataset_dir, 'audio-eval')
    else:
        # Fail early and explicitly instead of hitting an unbound local
        # variable after the output folder has already been created.
        raise ValueError('Incorrect data_type!')

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(
            prefix, frames_per_second, mel_bins),
        '{}.h5'.format(data_type))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # data_type has been validated above, so anything that is not
    # 'evaluate' is 'train' or 'validate'.
    if data_type == 'evaluate':
        meta_dict = read_evaluate_metadata(audios_dir, mini_data)
    else:
        meta_dict = read_metadata(metadata_path, data_type, mini_data)

    # Hdf5 containing features and targets. The context manager guarantees
    # the file is closed even if processing a file fails.
    with h5py.File(feature_path, 'w') as hf:
        hf.create_dataset(
            name='audio_name',
            data=[audio_name.encode()
                  for audio_name in meta_dict['audio_name']],
            dtype='S32')

        # Targets are only stored when the metadata provides them
        for target_key in ('fine_target', 'coarse_target'):
            if target_key in meta_dict:
                hf.create_dataset(
                    name=target_key,
                    data=meta_dict[target_key],
                    dtype=np.float32)

        # Starts empty and grows by one row per audio file
        feature_dataset = hf.create_dataset(
            name='feature',
            shape=(0, frames_num, mel_bins),
            maxshape=(None, frames_num, mel_bins),
            dtype=np.float32)

        for (n, audio_name) in enumerate(meta_dict['audio_name']):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)

            # Read audio
            (audio, _) = read_audio(
                audio_path=audio_path, target_fs=sample_rate)

            # Pad or truncate audio recording
            audio = pad_truncate_sequence(audio, total_samples)

            # Extract feature
            feature = feature_extractor.transform(audio)

            # Remove the extra frames caused by padding zero
            feature = feature[0:frames_num]

            feature_dataset.resize((n + 1, frames_num, mel_bins))
            feature_dataset[n] = feature

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
def pack_audio_files_to_hdf5(args):
    """Pack waveform to hdf5 file.

    For every entry of the weak label csv the audio is loaded, padded or
    truncated to config.audio_length and stored together with its name and
    weak target. For the 'testing' and 'evaluation' sets a frame-wise
    'strong_target' is stored as well.

    Args:
      dataset_dir: str, directory of dataset
      workspace: str, Directory of your workspace
      data_type: 'training' | 'testing' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    """

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    audio_length = config.audio_length
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx
    frames_per_second = config.frames_per_second
    frames_num = frames_per_second * config.audio_duration

    # Strong (frame-wise) labels only exist for these two sets
    has_strong_target = data_type in ['testing', 'evaluation']

    # Paths
    audios_dir = os.path.join(dataset_dir, data_type)
    weak_label_csv_path = os.path.join(
        dataset_dir, 'metadata', get_weak_csv_filename(data_type))

    if data_type == 'testing':
        strong_label_csv_path = os.path.join(
            dataset_dir, 'metadata',
            'groundtruth_strong_label_testing_set.csv')
    elif data_type == 'evaluation':
        strong_label_csv_path = os.path.join(
            dataset_dir, 'metadata',
            'groundtruth_strong_label_evaluation_set.csv')

    if mini_data:
        packed_hdf5_path = os.path.join(
            workspace, 'features',
            'minidata_{}.waveform.h5'.format(data_type))
    else:
        packed_hdf5_path = os.path.join(
            workspace, 'features', '{}.waveform.h5'.format(data_type))
    create_folder(os.path.dirname(packed_hdf5_path))

    # Read metadata
    weak_meta_list = read_weak_csv(weak_label_csv_path, data_type)

    # Use a small amount of data for debugging
    if mini_data:
        random.seed(1234)
        random.shuffle(weak_meta_list)
        weak_meta_list = weak_meta_list[0:100]

    audios_num = len(weak_meta_list)

    feature_time = time.time()
    with h5py.File(packed_hdf5_path, 'w') as hf:
        hf.create_dataset(
            name='audio_name',
            shape=(audios_num,),
            dtype='S80')

        # NOTE(review): rows are filled with the output of float32_to_int16
        # although the dataset dtype is int32 - presumably int16 would be
        # enough; left unchanged to keep the file layout stable.
        hf.create_dataset(
            name='waveform',
            shape=(audios_num, audio_length),
            dtype=np.int32)

        hf.create_dataset(
            name='weak_target',
            shape=(audios_num, classes_num),
            dtype=np.float32)

        if has_strong_target:
            strong_meta_dict = read_strong_csv(strong_label_csv_path)

            # The builtin `bool` is used as dtype: the `np.bool` alias was
            # removed from NumPy in 1.24. The dataset starts empty and
            # grows by one row per audio file.
            hf.create_dataset(
                name='strong_target',
                shape=(0, frames_num, classes_num),
                maxshape=(None, frames_num, classes_num),
                dtype=bool)

        for (n, weak_meta_dict) in enumerate(weak_meta_list):
            print(n)
            audio_name = weak_meta_dict['audio_name']
            audio_path = os.path.join(audios_dir, audio_name)

            (audio, _) = librosa.core.load(
                audio_path, sr=sample_rate, mono=True)
            audio = pad_truncate_sequence(audio, audio_length)

            hf['audio_name'][n] = audio_name.encode()
            hf['waveform'][n] = float32_to_int16(audio)
            hf['weak_target'][n] = get_weak_target(
                weak_meta_dict['labels'], lb_to_idx)

            if has_strong_target:
                # The first character of the audio name is dropped for the
                # lookup in the strong label metadata.
                strong_target = get_strong_target(
                    audio_name[1:], strong_meta_dict,
                    frames_num, frames_per_second, lb_to_idx)

                hf['strong_target'].resize(
                    (n + 1, frames_num, classes_num))
                hf['strong_target'][n] = strong_target

    print('Write hdf5 to {}'.format(packed_hdf5_path))
    print('Time: {:.3f} s'.format(time.time() - feature_time))