Example 1
    def augment(self, x):
        clip_samples = len(x)

        # Silence sox's logger so per-clip transforms don't spam the console
        logger = logging.getLogger('sox')
        logger.propagate = False

        tfm = sox.Transformer()
        tfm.set_globals(verbosity=0)

        # Random pitch shift (in semitones) and contrast enhancement
        tfm.pitch(self.random_state.uniform(-0.1, 0.1, 1)[0])
        tfm.contrast(self.random_state.uniform(0, 100, 1)[0])

        # Two random peaking equalizers with log-uniform centre frequencies
        tfm.equalizer(frequency=self.loguniform(32, 4096, 1)[0],
                      width_q=self.random_state.uniform(1, 2, 1)[0],
                      gain_db=self.random_state.uniform(-30, 10, 1)[0])

        tfm.equalizer(frequency=self.loguniform(32, 4096, 1)[0],
                      width_q=self.random_state.uniform(1, 2, 1)[0],
                      gain_db=self.random_state.uniform(-30, 10, 1)[0])

        # Random reverberation
        tfm.reverb(reverberance=self.random_state.uniform(0, 70, 1)[0])

        aug_x = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate)

        # sox may change the length; restore the original number of samples
        aug_x = pad_truncate_sequence(aug_x, clip_samples)

        return aug_x
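
The method above comes from a larger augmentation class: it assumes the instance carries self.random_state (a numpy RandomState), self.sample_rate and a self.loguniform sampling helper, and it relies on a pad_truncate_sequence utility. A minimal sketch of that surrounding context, with the class name and helper bodies assumed rather than taken from the source:

import logging

import numpy as np
import sox


def pad_truncate_sequence(x, max_len):
    # Zero-pad or truncate a 1-D array to exactly max_len samples.
    if len(x) < max_len:
        return np.concatenate((x, np.zeros(max_len - len(x), dtype=x.dtype)))
    return x[0:max_len]


class SoxAugmenter:
    # Hypothetical host class for the augment() method above.

    def __init__(self, sample_rate, seed=1234):
        self.sample_rate = sample_rate
        self.random_state = np.random.RandomState(seed)

    def loguniform(self, low, high, size):
        # Assumed helper: sample log-uniformly between low and high.
        return np.exp(self.random_state.uniform(np.log(low), np.log(high), size))

With augment() pasted into this class, SoxAugmenter(sample_rate=32000).augment(x) returns an augmented clip with the same number of samples as x.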
Example 2
def calculate_feature_for_all_audio_files(args):
    '''Calculate features for all audio files and write them to an hdf5 file.
    
    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)
    metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    feature_path = os.path.join(
        workspace, 'features_side',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        meta_dict['audio_name'] = meta_dict['audio_name'][indexes]
        meta_dict['scene_label'] = meta_dict['scene_label'][indexes]
        meta_dict['identifier'] = meta_dict['identifier'][indexes]
        meta_dict['source_label'] = meta_dict['source_label'][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(name='scene_label',
                          data=[
                              scene_label.encode()
                              for scene_label in meta_dict['scene_label']
                          ],
                          dtype='S24')

    if 'identifier' in meta_dict.keys():
        hf.create_dataset(name='identifier',
                          data=[
                              identifier.encode()
                              for identifier in meta_dict['identifier']
                          ],
                          dtype='S24')

    if 'source_label' in meta_dict.keys():
        hf.create_dataset(name='source_label',
                          data=[
                              source_label.encode()
                              for source_label in meta_dict['source_label']
                          ],
                          dtype='S8')

    hf.create_dataset(name='feature_side',
                      shape=(0, frames_num, mel_bins),
                      maxshape=(None, frames_num, mel_bins),
                      dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_side_audio(audio_path=audio_path,
                                     target_fs=sample_rate)

        # Pad or truncate audio recording to the same length
        audio = pad_truncate_sequence(audio, total_samples)

        # Extract feature
        feature = feature_extractor.transform(audio)

        # Remove the extra log mel spectrogram frames caused by zero-padding
        feature = feature[0:frames_num]

        hf['feature_side'].resize((n + 1, frames_num, mel_bins))
        hf['feature_side'][n] = feature

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
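
The function reads everything from an argparse-style namespace, so a thin CLI wrapper is enough to drive it; a minimal sketch, where the flag names simply mirror the attributes read above:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_dir', type=str, required=True)
    parser.add_argument('--workspace', type=str, required=True)
    parser.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True)
    parser.add_argument('--data_type', type=str, choices=['development', 'evaluation'], required=True)
    parser.add_argument('--mini_data', action='store_true')
    args = parser.parse_args()

    calculate_feature_for_all_audio_files(args)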
Example 3
def calculate_feature_for_all_audio_files(args):
    '''Calculate features for all audio files and write them to an hdf5 file.
    
    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'leaderboard'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    # NOTE: this variant ignores the args namespace and hard-codes local debug
    # paths; swap these for args.dataset_dir etc. to drive it from the CLI.
    dataset_dir = 'D:/Project/DCASE_test/Data'
    workspace = 'D:/Project/DCASE_test'
    subtask = 'a'
    data_type = 'development'
    mini_data = False

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    mfcc_frames = config.mfcc_frames
    n_mfcc = config.n_mfcc
    mfcc_hop_size = config.mfcc_hop_size
    gamm_frames = config.gamm_frames
    n_gamm = config.n_gamm

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''
        
    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 'test.csv')
    else:
        raise Exception('Incorrect data_type!')
    
    feature_path = os.path.join(workspace, 'features', 
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))
        
    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        hop_size=hop_size, 
        mel_bins=mel_bins, 
        fmin=fmin, 
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets 
    if mini_data:
        mini_num = 300
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]
        
    print('Extracting features of all audio files ...')
    extract_time = time.time()
    
    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name', 
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']], 
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(
            name='scene_label', 
            data=[scene_label.encode() for scene_label in meta_dict['scene_label']], 
            dtype='S24')
            
    if 'identifier' in meta_dict.keys():
        hf.create_dataset(
            name='identifier', 
            data=[identifier.encode() for identifier in meta_dict['identifier']], 
            dtype='S24')
            
    if 'source_label' in meta_dict.keys():
        hf.create_dataset(
            name='source_label', 
            data=[source_label.encode() for source_label in meta_dict['source_label']], 
            dtype='S8')

    hf.create_dataset(
        name='feature', 
        shape=(0, total_samples), 
        maxshape=(None, total_samples), 
        dtype=np.float32)
    hf.create_dataset(
        name='feature_gamm', 
        shape=(0, gamm_frames, n_gamm), 
        maxshape=(None, gamm_frames, n_gamm), 
        dtype=np.float32)
    hf.create_dataset(
        name='feature_mfcc', 
        shape=(0, mfcc_frames, n_mfcc), 
        maxshape=(None, mfcc_frames, n_mfcc), 
        dtype=np.float32)
    hf.create_dataset(
        name='feature_panns', 
        shape=(0, 320000), 
        maxshape=(None, 320000), 
        dtype=np.float32)
    
    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)
        
        # Read audio
        (audio, _) = read_audio(
            audio_path=audio_path, 
            target_fs=sample_rate)
        
        audio = pad_truncate_sequence(audio, total_samples)
        
        # Gammatone spectrogram in dB
        (audio_gamm, _) = read_audio_gamm(
            audio_path=audio_path,
            target_fs=sample_rate)
        fea_gamm, _ = gtg_in_dB(audio_gamm, sample_rate)
        fea_gamm = fea_gamm.transpose(1, 0)

        # MFCCs at librosa's default sampling rate
        sound, fs = librosa.load(audio_path)
        fea_mfcc = librosa.feature.mfcc(y=sound, sr=fs, hop_length=mfcc_hop_size, n_mfcc=n_mfcc)
        fea_mfcc = fea_mfcc.transpose(1, 0)

        # Raw 32 kHz waveform for PANNs (librosa.core.load is deprecated)
        (waveform, _) = librosa.load(audio_path, sr=32000, mono=True)
        
        # NOTE: this log-mel feature is computed but never written to the hdf5
        # file in this variant
        feature = feature_extractor.transform(audio)
        feature = feature[0:frames_num]
        
        hf['feature'].resize((n + 1, total_samples))
        hf['feature'][n] = audio        
        hf['feature_gamm'].resize((n + 1, gamm_frames, n_gamm))
        hf['feature_gamm'][n] = fea_gamm
        hf['feature_mfcc'].resize((n + 1, mfcc_frames, n_mfcc))
        hf['feature_mfcc'][n] = fea_mfcc
        hf['feature_panns'].resize((n + 1, 320000))
        hf['feature_panns'][n] = waveform
            
    hf.close()
        
    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
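
Once packed, the four feature sets can be read back with h5py; a minimal loading sketch (the dataset names match those created above):

import h5py

with h5py.File(feature_path, 'r') as hf:
    audio_names = [name.decode() for name in hf['audio_name'][:]]
    waveforms = hf['feature'][:]        # (clips_num, total_samples)
    fea_gamm = hf['feature_gamm'][:]    # (clips_num, gamm_frames, n_gamm)
    fea_mfcc = hf['feature_mfcc'][:]    # (clips_num, mfcc_frames, n_mfcc)
    fea_panns = hf['feature_panns'][:]  # (clips_num, 320000)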
Example 4
def calculate_feature_for_all_audio_files(args):
    '''Calculate features for all audio files and write them to a single hdf5
    file.
    
    Args:
      dataset_dir: string
      workspace: string
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    relative_name = get_relative_path_no_extension(data_type)
    audios_dir = os.path.join(dataset_dir, 'audio', relative_name)

    if data_type == 'validation':
        metadata_path = os.path.join(dataset_dir, 'metadata', 'validation',
                                     '{}.csv'.format(relative_name))
    else:
        metadata_path = os.path.join(dataset_dir, 'metadata',
                                     '{}.csv'.format(relative_name))

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(relative_name))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    (data_dict, has_weak_labels,
     has_strong_labels) = read_metadata(metadata_path)

    # Extract features and targets
    audio_names = sorted([*data_dict.keys()])

    if mini_data:
        random_state = np.random.RandomState(1234)
        random_state.shuffle(audio_names)
        audio_names = audio_names[0:10]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(name='audio_name',
                      data=[audio_name.encode() for audio_name in audio_names],
                      dtype='S64')

    hf.create_dataset(name='feature',
                      shape=(0, frames_num, mel_bins),
                      maxshape=(None, frames_num, mel_bins),
                      dtype=np.float32)

    if has_weak_labels:
        hf.create_dataset(name='weak_target',
                          shape=(0, classes_num),
                          maxshape=(None, classes_num),
                          dtype=bool)  # np.bool was removed in NumPy 1.24

    if has_strong_labels:
        hf.create_dataset(name='strong_target',
                          shape=(0, frames_num, classes_num),
                          maxshape=(None, frames_num, classes_num),
                          dtype=bool)  # np.bool was removed in NumPy 1.24

    for (n, audio_name) in enumerate(audio_names):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path, target_fs=sample_rate)

        # Pad or truncate audio recording
        audio = pad_truncate_sequence(audio, total_samples)

        # Extract feature
        feature = feature_extractor.transform(audio)

        # Remove the extra frames caused by zero-padding
        feature = feature[0:frames_num]

        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature

        if has_weak_labels:
            weak_labels = data_dict[audio_name]['weak_labels']
            hf['weak_target'].resize((n + 1, classes_num))
            hf['weak_target'][n] = labels_to_target(weak_labels, classes_num,
                                                    lb_to_idx)

        if has_strong_labels:
            events = data_dict[audio_name]['strong_labels']
            hf['strong_target'].resize((n + 1, frames_num, classes_num))
            hf['strong_target'][n] = events_to_target(
                events=events,
                frames_num=frames_num,
                classes_num=classes_num,
                frames_per_second=frames_per_second,
                lb_to_idx=lb_to_idx)

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
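
labels_to_target and events_to_target are project helpers that are not shown here; a plausible sketch of the weak (multi-hot) encoding, assuming lb_to_idx maps label strings to class indices:

import numpy as np

def labels_to_target(labels, classes_num, lb_to_idx):
    # Multi-hot encode a list of label strings into a (classes_num,) bool vector.
    target = np.zeros(classes_num, dtype=bool)
    for label in labels:
        target[lb_to_idx[label]] = True
    return target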
Example 5
def calculate_feature_for_all_audio_files(args):
    '''Calculate features for all audio files and write them to a single hdf5
    file.
    
    Args:
      dataset_dir: string
      workspace: string
      data_type: 'train' | 'validate' | 'evaluate'
      mini_data: bool, set True for debugging on a small part of data
    '''
    
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    data_type = args.data_type
    workspace = args.workspace
    mini_data = args.mini_data
    
    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    
    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''
        
    metadata_path = os.path.join(dataset_dir, 'annotations.csv')

    if data_type in ['train', 'validate']:
        audios_dir = os.path.join(dataset_dir, data_type)
    elif data_type == 'evaluate':
        audios_dir = os.path.join(dataset_dir, 'audio-eval')
    else:
        raise Exception('Incorrect data_type!')
    
    feature_path = os.path.join(workspace, 'features', 
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 
        '{}.h5'.format(data_type))
    create_folder(os.path.dirname(feature_path))
        
    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        hop_size=hop_size, 
        mel_bins=mel_bins, 
        fmin=fmin, 
        fmax=fmax)

    # Read metadata
    print('Extracting features of all audio files ...')
    extract_time = time.time()
    
    if data_type in ['train', 'validate']:
        meta_dict = read_metadata(metadata_path, data_type, mini_data)
    elif data_type == 'evaluate':
        meta_dict = read_evaluate_metadata(audios_dir, mini_data)

    # Hdf5 containing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name', 
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']], 
        dtype='S32')

    if 'fine_target' in meta_dict.keys():
        hf.create_dataset(
            name='fine_target', 
            data=meta_dict['fine_target'], 
            dtype=np.float32)
            
    if 'coarse_target' in meta_dict.keys():
        hf.create_dataset(
            name='coarse_target', 
            data=meta_dict['coarse_target'], 
            dtype=np.float32)

    hf.create_dataset(
        name='feature', 
        shape=(0, frames_num, mel_bins), 
        maxshape=(None, frames_num, mel_bins), 
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)
        
        # Read audio
        (audio, _) = read_audio(
            audio_path=audio_path, 
            target_fs=sample_rate)

        # Pad or truncate audio recording
        audio = pad_truncate_sequence(audio, total_samples)
        
        # Extract feature
        feature = feature_extractor.transform(audio)
        
        # Remove the extra frames caused by zero-padding
        feature = feature[0 : frames_num]
        
        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature
        
    hf.close()
        
    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
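
read_evaluate_metadata is not shown; in the 'evaluate' branch it only has to supply the audio_name list, since no targets exist for that split. A plausible minimal sketch:

import os

def read_evaluate_metadata(audios_dir, mini_data):
    # Collect audio file names from the evaluation directory; no targets here.
    audio_names = sorted(name for name in os.listdir(audios_dir)
                         if name.endswith('.wav'))
    if mini_data:
        audio_names = audio_names[0:10]
    return {'audio_name': audio_names}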
Example 6
def pack_audio_files_to_hdf5(args):
    """Pack waveform to hdf5 file. 

    Args:
      dataset_dir: str, directory of dataset
      workspace: str, Directory of your workspace
      data_type: 'training' | 'testing' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    """

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    audio_length = config.audio_length
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx
    frames_per_second = config.frames_per_second
    frames_num = frames_per_second * config.audio_duration

    has_strong_target = data_type in ['testing', 'evaluation']

    # Paths
    audios_dir = os.path.join(dataset_dir, data_type)
    weak_label_csv_path = os.path.join(dataset_dir, 'metadata', 
        get_weak_csv_filename(data_type))

    if data_type == 'testing':
        strong_label_csv_path = os.path.join(dataset_dir, 'metadata', 
            'groundtruth_strong_label_testing_set.csv')
    elif data_type == 'evaluation':
        strong_label_csv_path = os.path.join(dataset_dir, 'metadata', 
            'groundtruth_strong_label_evaluation_set.csv')

    if mini_data:
        packed_hdf5_path = os.path.join(workspace, 'features', 
            'minidata_{}.waveform.h5'.format(data_type))
    else:
        packed_hdf5_path = os.path.join(workspace, 'features', 
            '{}.waveform.h5'.format(data_type))
    create_folder(os.path.dirname(packed_hdf5_path))

    # Read metadata
    weak_meta_list = read_weak_csv(weak_label_csv_path, data_type)

    # Use a small amount of data for debugging
    if mini_data:
        random.seed(1234)
        random.shuffle(weak_meta_list)
        weak_meta_list = weak_meta_list[0 : 100]

    audios_num = len(weak_meta_list)

    feature_time = time.time()
    with h5py.File(packed_hdf5_path, 'w') as hf:
        hf.create_dataset(
            name='audio_name', 
            shape=(audios_num,), 
            dtype='S80')

        hf.create_dataset(
            name='waveform', 
            shape=(audios_num, audio_length), 
            dtype=np.int16)  # float32_to_int16 yields int16 samples

        hf.create_dataset(
            name='weak_target', 
            shape=(audios_num, classes_num), 
            dtype=np.float32)

        if has_strong_target:
            strong_meta_dict = read_strong_csv(strong_label_csv_path)        
            
            hf.create_dataset(
                name='strong_target', 
                shape=(0, frames_num, classes_num), 
                maxshape=(None, frames_num, classes_num), 
                dtype=bool)  # np.bool was removed in NumPy 1.24

        for n in range(audios_num):
            print(n)
            weak_meta_dict = weak_meta_list[n]
            audio_name = weak_meta_dict['audio_name']
            audio_path = os.path.join(audios_dir, audio_name)
            (audio, fs) = librosa.load(audio_path, sr=sample_rate, mono=True)
            audio = pad_truncate_sequence(audio, audio_length)

            hf['audio_name'][n] = audio_name.encode()
            hf['waveform'][n] = float32_to_int16(audio)
            hf['weak_target'][n] = get_weak_target(
                weak_meta_dict['labels'], lb_to_idx)

            if has_strong_target:
                # the [1:] slice drops the leading character of the packed
                # name, presumably to match the keys in the strong-label csv
                strong_target = get_strong_target(
                    weak_meta_dict['audio_name'][1:], strong_meta_dict,
                    frames_num, frames_per_second, lb_to_idx)
                
                hf['strong_target'].resize((n + 1, frames_num, classes_num))
                hf['strong_target'][n] = strong_target

    print('Write hdf5 to {}'.format(packed_hdf5_path))
    print('Time: {:.3f} s'.format(time.time() - feature_time))
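
float32_to_int16 is a project helper; storing waveforms as int16 halves the file size relative to float32. A standard sketch of the conversion pair (the inverse is what a loader would apply when reading the packed waveforms back):

import numpy as np

def float32_to_int16(x):
    # Scale [-1, 1] float audio into the int16 range; clip to avoid overflow.
    return np.clip(x * 32767., -32768, 32767).astype(np.int16)

def int16_to_float32(x):
    # Inverse mapping, applied when reading the packed waveforms back.
    return (x / 32767.).astype(np.float32)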