def f(n):
    """Pack the n-th audio clip (waveform + target) into an open hdf5 file.

    NOTE(review): this function reads `audios_dir`, `meta_dict`, `sample_rate`,
    `clip_samples` and `hf` from an enclosing scope that is not visible here —
    it only works as a closure/nested helper. Verify where it is defined and
    whether it is still used; it duplicates the loop body of
    pack_waveforms_to_hdf5 below.
    """
    audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])

    if os.path.isfile(audio_path):
        logging.info('{} {}'.format(n, audio_path))

        # Resample to the configured rate and force a fixed clip length.
        (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
        audio = pad_or_truncate(audio, clip_samples)

        hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
        hf['waveform'][n] = float32_to_int16(audio)
        hf['target'][n] = meta_dict['target'][n]
    else:
        # Missing files are logged and skipped, leaving zero-filled rows.
        logging.info('{} File does not exist! {}'.format(
            n, audio_path))
def pack_waveforms_to_hdf5(args):
    """Pack waveforms to a single hdf5 file.

    Loads every clip listed in the csv metadata, resamples it, pads/truncates
    it to a fixed length, and stores the int16 waveform plus its multi-hot
    target in one waveform hdf5. A second, lightweight "target" hdf5 (labels
    and indexing info only) is written alongside it.

    Args:
        args: namespace with attributes audios_dir, csv_path,
            waveform_hdf5_path, target_hdf5_path, mini_data.
    """
    # Arguments & parameters
    audios_dir = args.audios_dir
    csv_path = args.csv_path
    waveform_hdf5_path = args.waveform_hdf5_path
    target_hdf5_path = args.target_hdf5_path
    mini_data = args.mini_data

    audio_length = config.audio_length
    classes_num = config.classes_num
    sample_rate = config.sample_rate

    # Paths
    if mini_data:
        prefix = 'mini_'
        waveform_hdf5_path += '.mini'
        target_hdf5_path += '.mini'
    else:
        prefix = ''

    create_folder(os.path.dirname(waveform_hdf5_path))
    create_folder(os.path.dirname(target_hdf5_path))

    logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(
        prefix, get_filename(csv_path))
    create_folder(logs_dir)
    create_logging(logs_dir, filemode='w')
    logging.info('Write logs to {}'.format(logs_dir))

    # Read csv file
    meta_dict = read_metadata(csv_path)

    if mini_data:
        mini_num = 10
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][0:mini_num]

    audios_num = len(meta_dict['audio_name'])

    # Pack waveform to hdf5
    total_time = time.time()

    with h5py.File(waveform_hdf5_path, 'w') as hf:
        hf.create_dataset('audio_name', shape=((audios_num,)), dtype='S20')
        hf.create_dataset('waveform', shape=((audios_num, audio_length)),
            dtype=np.int16)
        # np.bool was removed in NumPy 1.24; the builtin bool yields the same
        # 1-byte boolean hdf5 dtype.
        hf.create_dataset('target', shape=((audios_num, classes_num)),
            dtype=bool)
        hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)

        # Read audio
        for n in range(audios_num):
            audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])

            if os.path.isfile(audio_path):
                logging.info('{} {}'.format(n, audio_path))
                (audio, _) = librosa.core.load(
                    audio_path, sr=sample_rate, mono=True)
                audio = pad_or_truncate(audio, audio_length)

                hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
                hf['waveform'][n] = float32_to_int16(audio)
                hf['target'][n] = meta_dict['target'][n]
            else:
                # Missing files are logged and skipped (row stays zero-filled).
                logging.info('{} File does not exist! {}'.format(
                    n, audio_path))

        # Pack target to hdf5. Kept inside the outer "with" because it copies
        # datasets out of the still-open waveform file.
        hdf5_name = target_hdf5_path.split('/')[-1]

        with h5py.File(target_hdf5_path, 'w') as target_hf:
            target_hf.create_dataset('audio_name',
                data=hf['audio_name'][:], dtype='S20')
            target_hf.create_dataset('hdf5_name',
                data=[hdf5_name.encode()] * audios_num, dtype='S40')
            target_hf.create_dataset('index_in_hdf5',
                data=np.arange(audios_num), dtype=np.int32)
            target_hf.create_dataset('target',
                data=hf['target'][:], dtype=bool)

    logging.info('Write to {}'.format(waveform_hdf5_path))
    logging.info('Write to {}'.format(target_hdf5_path))
    logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
def pack_audio_files_to_hdf5_ramas(args):
    """Pack RAMAS test-set waveforms and one-hot targets into a single hdf5.

    Only clips labelled 'hap' / 'ang' / 'neu' / 'sad' are packed; every clip
    is assigned to fold 1 (the split is used for evaluation only).

    Args:
        args: namespace with attributes dataset_dir, workspace, mini_data.
    """
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    clip_samples = config.clip_samples
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx

    # Paths
    audios_dir = os.path.join(dataset_dir)

    if mini_data:
        packed_hdf5_path = os.path.join(workspace, 'features_ramas',
            'minidata_waveform.h5')
    else:
        packed_hdf5_path = os.path.join(workspace, 'features_ramas',
            'waveform_meta_test.h5')
    create_folder(os.path.dirname(packed_hdf5_path))

    # NOTE(review): the metadata csv and audio directory are hard-coded
    # absolute paths and ignore dataset_dir entirely — this only runs on the
    # original author's machine. Consider deriving both from dataset_dir.
    meta_df = pd.read_csv(
        '/home/den/DATASETS/AUDIO/preprocessed/ramas/meta_test.csv', sep=',')
    meta_df = meta_df[meta_df.cur_label.isin(['hap', 'ang', 'neu', 'sad'])]
    audio_names = list(meta_df.cur_name)
    audio_paths = [
        os.path.join('/home/den/DATASETS/AUDIO/preprocessed/ramas/data',
            audio_name) for audio_name in audio_names
    ]

    meta_dict = {
        'audio_name': np.array(audio_names),
        'audio_path': np.array(audio_paths),
        'target': np.array([
            lb_to_idx[list(
                meta_df[meta_df.cur_name == audio_name].cur_label)[0]]
            for audio_name in audio_names
        ]),
        # Single evaluation fold for every clip.
        'fold': np.array([1 for audio_name in audio_names])
    }

    # Use a small random subset for debugging
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    audios_num = len(meta_dict['audio_name'])

    feature_time = time.time()
    with h5py.File(packed_hdf5_path, 'w') as hf:
        hf.create_dataset(name='audio_name', shape=(audios_num,), dtype='S80')
        hf.create_dataset(name='waveform', shape=(audios_num, clip_samples),
            dtype=np.int16)
        hf.create_dataset(name='target', shape=(audios_num, classes_num),
            dtype=np.float32)
        hf.create_dataset(name='fold', shape=(audios_num,), dtype=np.int32)

        for n in range(audios_num):
            print(n)
            audio_name = meta_dict['audio_name'][n]
            audio_path = meta_dict['audio_path'][n]

            # Resample to the configured rate and force a fixed clip length.
            (audio, _) = librosa.core.load(audio_path, sr=sample_rate,
                mono=True)
            audio = pad_truncate_sequence(audio, clip_samples)

            hf['audio_name'][n] = audio_name.encode()
            hf['waveform'][n] = float32_to_int16(audio)
            hf['target'][n] = to_one_hot(meta_dict['target'][n], classes_num)
            hf['fold'][n] = meta_dict['fold'][n]

    print('Write hdf5 to {}'.format(packed_hdf5_path))
    print('Time: {:.3f} s'.format(time.time() - feature_time))
def pack_audio_files_to_hdf5(args):
    """Pack waveforms, one-hot targets and 10-fold assignments into one hdf5.

    The class label is taken from the filename prefix (text before the first
    '.'), and folds 1..10 are assigned round-robin in sorted filename order.

    Args:
        args: namespace with attributes dataset_dir, workspace, mini_data.
    """
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    clip_samples = config.clip_samples
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx

    # Paths
    audios_dir = os.path.join(dataset_dir)

    if mini_data:
        packed_hdf5_path = os.path.join(workspace, 'features',
            'minidata_waveform.h5')
    else:
        packed_hdf5_path = os.path.join(workspace, 'features', 'waveform.h5')
    create_folder(os.path.dirname(packed_hdf5_path))

    (audio_names, audio_paths) = traverse_folder(audios_dir)
    # Sorting both lists keeps name/path alignment and makes the round-robin
    # fold assignment deterministic across runs.
    audio_names = sorted(audio_names)
    audio_paths = sorted(audio_paths)

    meta_dict = {
        'audio_name': np.array(audio_names),
        'audio_path': np.array(audio_paths),
        'target': np.array([
            lb_to_idx[audio_name.split('.')[0]] for audio_name in audio_names
        ]),
        'fold': np.arange(len(audio_names)) % 10 + 1
    }

    # Use a small random subset for debugging
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    audios_num = len(meta_dict['audio_name'])

    feature_time = time.time()
    with h5py.File(packed_hdf5_path, 'w') as hf:
        hf.create_dataset(name='audio_name', shape=(audios_num,), dtype='S80')
        hf.create_dataset(name='waveform', shape=(audios_num, clip_samples),
            dtype=np.int16)
        hf.create_dataset(name='target', shape=(audios_num, classes_num),
            dtype=np.float32)
        hf.create_dataset(name='fold', shape=(audios_num,), dtype=np.int32)

        for n in range(audios_num):
            print(n)
            audio_name = meta_dict['audio_name'][n]
            audio_path = meta_dict['audio_path'][n]

            # Resample to the configured rate and force a fixed clip length.
            (audio, _) = librosa.core.load(audio_path, sr=sample_rate,
                mono=True)
            audio = pad_truncate_sequence(audio, clip_samples)

            hf['audio_name'][n] = audio_name.encode()
            hf['waveform'][n] = float32_to_int16(audio)
            hf['target'][n] = to_one_hot(meta_dict['target'][n], classes_num)
            hf['fold'][n] = meta_dict['fold'][n]

    print('Write hdf5 to {}'.format(packed_hdf5_path))
    print('Time: {:.3f} s'.format(time.time() - feature_time))
def pack_waveforms_to_hdf5(args):
    """Pack waveform and target of several audio clips to a single hdf5 file.

    This can speed up loading and training.

    NOTE(review): this module defines pack_waveforms_to_hdf5 twice; at import
    time this later definition shadows the earlier one. Confirm which one the
    entry point is meant to call.

    Args:
        args: namespace with attributes audios_dir, csv_path,
            waveforms_hdf5_path, mini_data.
    """
    # Arguments & parameters
    audios_dir = args.audios_dir
    csv_path = args.csv_path
    waveforms_hdf5_path = args.waveforms_hdf5_path
    mini_data = args.mini_data

    clip_samples = config.clip_samples
    classes_num = config.classes_num
    sample_rate = config.sample_rate
    id_to_ix = config.id_to_ix

    # Paths
    if mini_data:
        prefix = 'mini_'
        waveforms_hdf5_path += '.mini'
    else:
        prefix = ''

    create_folder(os.path.dirname(waveforms_hdf5_path))

    logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(
        prefix, get_filename(csv_path))
    create_folder(logs_dir)
    create_logging(logs_dir, filemode='w')
    logging.info('Write logs to {}'.format(logs_dir))

    # Read csv file
    meta_dict = read_metadata(csv_path, classes_num, id_to_ix)

    if mini_data:
        mini_num = 10
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][0:mini_num]

    audios_num = len(meta_dict['audio_name'])

    # Pack waveform to hdf5
    total_time = time.time()

    with h5py.File(waveforms_hdf5_path, 'w') as hf:
        hf.create_dataset('audio_name', shape=((audios_num,)), dtype='S20')
        hf.create_dataset('waveform', shape=((audios_num, clip_samples)),
            dtype=np.int16)
        # np.bool was removed in NumPy 1.24; the builtin bool yields the same
        # 1-byte boolean hdf5 dtype.
        hf.create_dataset('target', shape=((audios_num, classes_num)),
            dtype=bool)
        hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)

        # Pack waveform & target of several audio clips to a single hdf5 file
        for n in range(audios_num):
            audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])

            if os.path.isfile(audio_path):
                logging.info('{} {}'.format(n, audio_path))
                (audio, _) = librosa.core.load(
                    audio_path, sr=sample_rate, mono=True)
                audio = pad_or_truncate(audio, clip_samples)

                hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
                hf['waveform'][n] = float32_to_int16(audio)
                hf['target'][n] = meta_dict['target'][n]
            else:
                # Missing files are logged and skipped (row stays zero-filled).
                logging.info('{} File does not exist! {}'.format(
                    n, audio_path))

    logging.info('Write to {}'.format(waveforms_hdf5_path))
    logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
def pack_audio_files_to_hdf5(args):
    """Pack waveform to hdf5 file.

    Args:
      dataset_dir: str, directory of dataset
      workspace: str, Directory of your workspace
      data_type: 'training' | 'testing' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    """
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    audio_length = config.audio_length
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx
    frames_per_second = config.frames_per_second
    frames_num = frames_per_second * config.audio_duration

    # Only the testing/evaluation splits ship frame-level (strong) labels.
    has_strong_target = data_type in ['testing', 'evaluation']

    # Paths
    audios_dir = os.path.join(dataset_dir, data_type)
    weak_label_csv_path = os.path.join(dataset_dir, 'metadata',
        get_weak_csv_filename(data_type))

    if data_type == 'testing':
        strong_label_csv_path = os.path.join(dataset_dir, 'metadata',
            'groundtruth_strong_label_testing_set.csv')
    elif data_type == 'evaluation':
        strong_label_csv_path = os.path.join(dataset_dir, 'metadata',
            'groundtruth_strong_label_evaluation_set.csv')

    if mini_data:
        packed_hdf5_path = os.path.join(workspace, 'features',
            'minidata_{}.waveform.h5'.format(data_type))
    else:
        packed_hdf5_path = os.path.join(workspace, 'features',
            '{}.waveform.h5'.format(data_type))
    create_folder(os.path.dirname(packed_hdf5_path))

    # Read metadata
    weak_meta_list = read_weak_csv(weak_label_csv_path, data_type)

    # Use a small amount of data for debugging
    if mini_data:
        random.seed(1234)
        random.shuffle(weak_meta_list)
        weak_meta_list = weak_meta_list[0 : 100]

    audios_num = len(weak_meta_list)

    feature_time = time.time()

    with h5py.File(packed_hdf5_path, 'w') as hf:
        hf.create_dataset(
            name='audio_name',
            shape=(audios_num,),
            dtype='S80')

        # NOTE(review): waveform is stored as int32 although
        # float32_to_int16() yields int16 values; int16 would halve the file
        # size. Kept int32 here to preserve the on-disk format readers expect.
        hf.create_dataset(
            name='waveform',
            shape=(audios_num, audio_length),
            dtype=np.int32)

        hf.create_dataset(
            name='weak_target',
            shape=(audios_num, classes_num),
            dtype=np.float32)

        if has_strong_target:
            strong_meta_dict = read_strong_csv(strong_label_csv_path)

            # Resizable along axis 0; grown one row per clip in the loop.
            # np.bool was removed in NumPy 1.24; builtin bool is equivalent.
            hf.create_dataset(
                name='strong_target',
                shape=(0, frames_num, classes_num),
                maxshape=(None, frames_num, classes_num),
                dtype=bool)

        for n in range(audios_num):
            print(n)
            weak_meta_dict = weak_meta_list[n]
            audio_name = weak_meta_dict['audio_name']
            audio_path = os.path.join(audios_dir, audio_name)
            (audio, _) = librosa.core.load(audio_path, sr=sample_rate,
                mono=True)
            audio = pad_truncate_sequence(audio, audio_length)

            hf['audio_name'][n] = audio_name.encode()
            hf['waveform'][n] = float32_to_int16(audio)
            hf['weak_target'][n] = get_weak_target(
                weak_meta_dict['labels'], lb_to_idx)

            if has_strong_target:
                # [1:] strips the leading character of the clip name to match
                # the strong-label csv — presumably a 'Y' filename prefix;
                # TODO(review) confirm against the csv contents.
                strong_target = get_strong_target(
                    weak_meta_dict['audio_name'][1:], strong_meta_dict,
                    frames_num, frames_per_second, lb_to_idx)

                hf['strong_target'].resize((n + 1, frames_num, classes_num))
                hf['strong_target'][n] = strong_target

    print('Write hdf5 to {}'.format(packed_hdf5_path))
    print('Time: {:.3f} s'.format(time.time() - feature_time))
def pack_maps_dataset_to_hdf5(args):
    """Load & resample MAPS audio files, then write one hdf5 file per piece.

    MAPS is a piano dataset used only for evaluating our piano transcription
    system (optional).

    Ref: [1] Emiya, Valentin. "MAPS Database: A piano database for multipitch
    estimation and automatic transcription of music." 2016.

    Args:
        dataset_dir: str, directory of dataset
        workspace: str, directory of your workspace
    """
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace

    sample_rate = config.sample_rate
    # Only the two acoustic (ENSTDk*) pianos are used for evaluation.
    pianos = ['ENSTDkCl', 'ENSTDkAm']

    # Paths
    waveform_hdf5s_dir = os.path.join(workspace, 'hdf5s', 'maps')

    logs_dir = os.path.join(workspace, 'logs', get_filename(__file__))
    create_logging(logs_dir, filemode='w')
    logging.info(args)

    feature_time = time.time()
    count = 0

    # Load & resample each audio file to a hdf5 file
    for piano in pianos:
        sub_dir = os.path.join(dataset_dir, piano, 'MUS')

        # Enumerate pieces by their .mid files; each has a matching .wav.
        audio_names = [os.path.splitext(name)[0] for name in
            os.listdir(sub_dir) if os.path.splitext(name)[-1] == '.mid']

        for audio_name in audio_names:
            print('{} {}'.format(count, audio_name))
            audio_path = '{}.wav'.format(os.path.join(sub_dir, audio_name))
            midi_path = '{}.mid'.format(os.path.join(sub_dir, audio_name))

            (audio, _) = librosa.core.load(audio_path, sr=sample_rate,
                mono=True)
            midi_dict = read_maps_midi(midi_path)

            packed_hdf5_path = os.path.join(waveform_hdf5s_dir,
                '{}.h5'.format(audio_name))
            create_folder(os.path.dirname(packed_hdf5_path))

            with h5py.File(packed_hdf5_path, 'w') as hf:
                # Fixed-length byte strings keep the hdf5 attr layout stable.
                hf.attrs.create('split', data='test'.encode(), dtype='S20')
                hf.attrs.create('midi_filename',
                    data='{}.mid'.format(audio_name).encode(), dtype='S100')
                hf.attrs.create('audio_filename',
                    data='{}.wav'.format(audio_name).encode(), dtype='S100')
                hf.create_dataset(name='midi_event',
                    data=[e.encode() for e in midi_dict['midi_event']],
                    dtype='S100')
                hf.create_dataset(name='midi_event_time',
                    data=midi_dict['midi_event_time'], dtype=np.float32)
                hf.create_dataset(name='waveform',
                    data=float32_to_int16(audio), dtype=np.int16)

            count += 1
            logging.info('Write hdf5 to {}'.format(packed_hdf5_path))

    logging.info('Time: {:.3f} s'.format(time.time() - feature_time))
def pack_maestro_dataset_to_hdf5(args):
    """Load & resample MAESTRO audio files, then write to hdf5 files.

    One hdf5 file is written per clip; it stores the int16 waveform, the
    parsed midi events/times, and the csv metadata as fixed-width attrs.

    Args:
        dataset_dir: str, directory of dataset
        workspace: str, directory of your workspace
    """
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace

    sample_rate = config.sample_rate

    # Paths
    csv_path = os.path.join(dataset_dir, 'maestro-v2.0.0.csv')
    waveform_hdf5s_dir = os.path.join(workspace, 'hdf5s', 'maestro')

    logs_dir = os.path.join(workspace, 'logs', get_filename(__file__))
    create_logging(logs_dir, filemode='w')
    logging.info(args)

    # Read meta dict
    meta_dict = read_metadata(csv_path)

    audios_num = len(meta_dict['canonical_composer'])
    logging.info('Total audios number: {}'.format(audios_num))

    feature_time = time.time()

    # Load & resample each audio file to a hdf5 file
    for n in range(audios_num):
        logging.info('{} {}'.format(n, meta_dict['midi_filename'][n]))

        # Read midi
        midi_path = os.path.join(dataset_dir, meta_dict['midi_filename'][n])
        midi_dict = read_midi(midi_path)

        # Load audio
        audio_path = os.path.join(dataset_dir, meta_dict['audio_filename'][n])
        (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)

        # Output file mirrors the dataset-relative audio path, .wav -> .h5.
        packed_hdf5_path = os.path.join(waveform_hdf5s_dir, '{}.h5'.format(
            os.path.splitext(meta_dict['audio_filename'][n])[0]))

        create_folder(os.path.dirname(packed_hdf5_path))

        with h5py.File(packed_hdf5_path, 'w') as hf:
            # Fixed-length byte strings keep the hdf5 attr layout stable.
            hf.attrs.create('canonical_composer',
                data=meta_dict['canonical_composer'][n].encode(), dtype='S100')
            hf.attrs.create('canonical_title',
                data=meta_dict['canonical_title'][n].encode(), dtype='S100')
            hf.attrs.create('split',
                data=meta_dict['split'][n].encode(), dtype='S20')
            hf.attrs.create('year',
                data=meta_dict['year'][n].encode(), dtype='S10')
            hf.attrs.create('midi_filename',
                data=meta_dict['midi_filename'][n].encode(), dtype='S100')
            hf.attrs.create('audio_filename',
                data=meta_dict['audio_filename'][n].encode(), dtype='S100')
            hf.attrs.create('duration',
                data=meta_dict['duration'][n], dtype=np.float32)

            hf.create_dataset(name='midi_event',
                data=[e.encode() for e in midi_dict['midi_event']],
                dtype='S100')
            hf.create_dataset(name='midi_event_time',
                data=midi_dict['midi_event_time'], dtype=np.float32)
            hf.create_dataset(name='waveform',
                data=float32_to_int16(audio), dtype=np.int16)

        logging.info('Write hdf5 to {}'.format(packed_hdf5_path))

    logging.info('Time: {:.3f} s'.format(time.time() - feature_time))