def calculate_logmel(audio_path, sample_rate, feature_extractor):

    # Read audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate, mono=False)

    events_audio = audio[:, 0]
    scene_audio = audio[:, 1]
    mixed_audio = np.mean(audio, axis=-1)
    '''We do not normalize by the maximum value of the audio here because we 
    assume that low-energy audio may also carry information about the scene.'''

    # Extract feature
    mixture_logmel = feature_extractor.transform(mixed_audio)
    mixture_stft = feature_extractor.transform_stft(mixed_audio)
    events_stft = feature_extractor.transform_stft(events_audio)
    scene_stft = feature_extractor.transform_stft(scene_audio)

    feature_dict = {
        'mixture_logmel': mixture_logmel,
        'mixture_stft': mixture_stft,
        'events_stft': events_stft,
        'scene_stft': scene_stft
    }

    return feature_dict
def calculate_logmel(audio_path, sample_rate, extractor):

    (audio, _) = read_audio(audio_path, target_fs=sample_rate)

    audio = audio / np.max(np.abs(audio))

    feature = extractor.transform(audio)

    return feature
Example #3
def calculate_logmel(audio_path, sample_rate, feature_extractor):
    # Read audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
    '''We do not normalize by the maximum value of the audio here because we 
    assume that low-energy audio may also carry information about the scene.'''

    # Extract feature
    feature = feature_extractor.transform(audio)

    return feature
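
Every example in this listing relies on a LogMelExtractor whose transform method returns a (frames, mel_bins) log mel spectrogram, and whose melW mel filter bank and get_inverse_melW pseudo-inverse are occasionally used directly. The project's own implementation is not shown here; the following is a minimal librosa-based sketch with an assumed constructor signature (the overlap-based variant seen in some examples differs only in how the hop size is derived).

import numpy as np
import librosa


class LogMelExtractor(object):
    """Minimal sketch of a log mel extractor; the real implementation may differ."""

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin=0, fmax=None):
        self.window_size = window_size
        self.hop_size = hop_size

        # Mel filter bank stored as (fft_bins, mel_bins) so that
        # np.dot(spectrogram, self.melW) maps STFT frames to mel frames
        self.melW = librosa.filters.mel(sr=sample_rate, n_fft=window_size,
                                        n_mels=mel_bins, fmin=fmin, fmax=fmax).T

    def transform_stft(self, audio):
        # Magnitude spectrogram, shape (frames, window_size // 2 + 1)
        return np.abs(librosa.stft(y=audio, n_fft=self.window_size,
                                   hop_length=self.hop_size, window='hann',
                                   center=True)).T.astype(np.float32)

    def transform(self, audio):
        # Log mel spectrogram, shape (frames, mel_bins)
        spectrogram = self.transform_stft(audio)
        return np.log(np.dot(spectrogram, self.melW) + 1e-8).astype(np.float32)

    def get_inverse_melW(self):
        # Pseudo-inverse (mel_bins, fft_bins), used later to map mel-domain
        # masks back to the STFT domain
        return np.linalg.pinv(self.melW)
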
Example #4
def calculate_multi_logmel(audio_path, sample_rate, feature_extractor):
    # Read stereo audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
    '''We do not normalize by the maximum value of the audio here because we 
    assume that low-energy audio may also carry information about the scene.'''

    # Extract a feature per channel (audio is assumed to have shape (channels, samples))
    l_feature = feature_extractor.transform(audio[0])
    r_feature = feature_extractor.transform(audio[1])

    return np.stack([l_feature, r_feature], axis=0)
def calculate_logmel(audio_path, sample_rate, feature_extractor):

    # Read audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)

    # Normalize energy
    audio /= np.max(np.abs(audio))

    # Extract feature
    feature = feature_extractor.transform(audio)

    return feature
Example #6
def calculate_hpss_logmel(audio_path, sample_rate, feature_extractor):
    # Read stereo audio
    (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
    '''We do not normalize by the maximum value of the audio here because we 
    assume that low-energy audio may also carry information about the scene.'''

    # Extract feature
    h, p = librosa.effects.hpss(audio)
    h_feature = feature_extractor.transform(h)
    p_feature = feature_extractor.transform(p)

    return np.stack([h_feature, p_feature], axis=0)
Example #7
    def __getitem__(self, index):

        # Read audio
        (audio, fs) = read_audio(self.audio_names[index],
                                 target_fs=self.sample_rate)
        audio /= max(1., np.max(np.abs(audio)))

        # Cut silence
        frame_length = 2048
        hop_length = 512
        threshold = 0.01
        energy = librosa.feature.rms(
            y=audio,
            frame_length=frame_length,
            hop_length=hop_length,
            center=True)[0]
        frames = np.nonzero(energy > threshold)[0]
        indices = librosa.frames_to_samples(frames, hop_length=hop_length)

        # Discard clips that are too short
        if len(indices) < 2:
            audio = np.zeros(10000)
        else:
            audio = audio[indices[0]:indices[-1]]

        if len(audio) < 10000:
            audio = np.zeros(10000)
        else:
            audio = audio[0:70000]  # Avoid using more than 12 GB of GPU RAM

        # Mu-law
        _mulaw = mu_law.MuLaw(mu=self.quantize_bins)
        _quantize = mu_law.Quantize(quantize=self.quantize_bins)

        audio = _mulaw.transform(audio)
        audio = _quantize.transform(audio)
        audio = torch.LongTensor(audio)

        # Get global condition from the audio file name
        global_condition = int(self.audio_names[index].split('/')[-1][1:4])
        global_condition = torch.tensor(global_condition)

        return audio, global_condition
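
The mu_law.MuLaw and mu_law.Quantize helpers used above are not shown in this listing. Assuming they implement standard mu-law companding followed by uniform quantization into quantize_bins integer levels, a minimal sketch could look like this:

import numpy as np


class MuLaw(object):
    """Sketch of mu-law companding: maps waveform values in [-1, 1] to [-1, 1]
    on a logarithmic curve (assumed behaviour of mu_law.MuLaw)."""

    def __init__(self, mu=256):
        self.mu = mu

    def transform(self, x):
        return np.sign(x) * np.log1p(self.mu * np.abs(x)) / np.log1p(self.mu)


class Quantize(object):
    """Sketch of uniform quantization from [-1, 1] to integers in
    [0, quantize - 1] (assumed behaviour of mu_law.Quantize)."""

    def __init__(self, quantize=256):
        self.quantize = quantize

    def transform(self, x):
        x = (x + 1.) / 2.  # [-1, 1] -> [0, 1]
        return np.clip((x * self.quantize).astype(np.int64), 0, self.quantize - 1)
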
def create_mixed_audios(args):
    """Create mixed audios using the meta from the mixture yaml file. 
    """

    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir
    workspace = args.workspace
    scene_type = args.scene_type
    snr = args.snr

    sample_rate = config.sample_rate
    clip_duration = config.clip_duration
    audio_samples = int(sample_rate * clip_duration)
    random_state = np.random.RandomState(1234)

    # Paths
    mixture_yaml_path = os.path.join(workspace, 'mixture.yaml')

    out_audios_dir = os.path.join(
        workspace, 'mixed_audios',
        'scene_type={},snr={}'.format(scene_type, snr))

    create_folder(out_audios_dir)

    create_mixed_audio_time = time.time()

    # Read mixture yaml file
    with open(mixture_yaml_path, 'r') as f:
        data_list = yaml.safe_load(f)

    for n, data in enumerate(data_list):

        if n % 10 == 0:
            logging.info('{} / {} mixed audios created'
                         ''.format(n, len(data_list)))

        if scene_type == 'white_noise':
            scene_audio = random_state.uniform(0., 1., audio_samples)

        elif scene_type == 'dcase2018_task1':
            scene_audio_name = data['scene_audio_name']
            scene_audio_path = os.path.join(dcase2018_task1_dataset_dir,
                                            'audio', scene_audio_name)

            (scene_audio, fs) = read_audio(scene_audio_path,
                                           target_fs=sample_rate)

        # Normalize scene audio
        scene_audio = normalize_to_energy(scene_audio, db=0)

        # Reserve space
        events_audio = np.zeros(audio_samples)

        # Read sound events audio
        for event in data['events']:

            audio_name = event['event_audio_name']
            onset = int(event['onset'] * sample_rate)
            offset = int(event['offset'] * sample_rate)

            audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                      'audio_train', audio_name)

            (event_audio, fs) = read_audio(audio_path, target_fs=sample_rate)

            event_audio = normalize_to_energy(event_audio, db=snr)

            events_audio[onset:offset] = event_audio[0:offset - onset]

        stereo_audio = np.array((events_audio, scene_audio)).T
        '''shape: (samples, 2)'''

        # Normalize
        stereo_audio /= np.max(np.abs(stereo_audio))

        # Write out audio
        out_audio_path = os.path.join(out_audios_dir, data['mixture_name'])
        write_audio(out_audio_path, stereo_audio, sample_rate)

    logging.info('Write out audio finished! {} s'
                 ''.format(time.time() - create_mixed_audio_time))
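
The mixing above depends on normalize_to_energy, which is not defined in this listing. A plausible sketch, assuming it scales a signal so that its mean power corresponds to a given level in dB (so that events at db=snr against a scene at db=0 produce the requested SNR):

import numpy as np


def normalize_to_energy(audio, db):
    """Sketch (assumed behaviour): scale audio so its mean power equals 10^(db / 10)."""
    target_power = 10. ** (db / 10.)
    current_power = np.mean(audio ** 2) + 1e-10  # guard against silent clips
    return audio * np.sqrt(target_power / current_power)
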
def plot_waveform(args):
    
    # Arguments & parameters
    workspace = args.workspace
    holdout_fold = args.holdout_fold
    scene_type = args.scene_type
    snr = args.snr
    cuda = args.cuda

    labels = config.labels
    classes_num = len(labels)
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    hop_size = window_size-overlap
    mel_bins = config.mel_bins
    seq_len = config.seq_len
    ix_to_lb = config.ix_to_lb
    
    thres = 0.1
    batch_size = 24
    
    # Paths
    hdf5_path = os.path.join(workspace, 'features', 'logmel', 
        'scene_type={},snr={}'.format(scene_type, snr), 'development.h5')
        
    yaml_path = os.path.join(workspace, 'mixture.yaml')
    
    audios_dir = os.path.join(workspace, 'mixed_audios', 
        'scene_type={},snr={}'.format(scene_type, snr))
    
    # Load yaml file
    load_yaml_time = time.time()
    with open(yaml_path, 'r') as f:
        meta = yaml.safe_load(f)
    print('Load yaml file time: {:.3f} s'.format(time.time() - load_yaml_time))

    # Data generator
    generator = InferenceDataGenerator(
        hdf5_path=hdf5_path,
        batch_size=batch_size, 
        holdout_fold=holdout_fold)

    generate_func = generator.generate_validate(
        data_type='validate', 
        shuffle=False, 
        max_iteration=None)
    
    # Evaluate on mini-batch
    for (iteration, data) in enumerate(generate_func):
        
        print(iteration)
        
        (batch_x, batch_y, batch_audio_names) = data
            
        batch_x = move_data_to_gpu(batch_x, cuda)

        batch_gt_masks = []
        batch_single_gt_masks = []
        batch_mixture_stfts = []

        for n in range(len(batch_audio_names)):
            curr_meta = search_meta_by_mixture_name(meta, batch_audio_names[n])
            curr_events = curr_meta['events']
              
            gt_indexes = get_ground_truth_indexes(curr_events)
            gt_sed = get_sed_from_meta(curr_events) # (seq_len, classes_num)
              
            (events_stft, scene_stft, mixture_stft) = \
                generator.get_events_scene_mixture_stft(batch_audio_names[n])
                
            gt_mask = ideal_ratio_mask(events_stft, scene_stft)    # (seq_len, fft_size)
            
            gt_masks = gt_mask[:, :, None] * gt_sed[:, None, :] # (seq_len, fft_size, classes_num)
            gt_masks = gt_masks.astype(np.float32)
            batch_gt_masks.append(gt_masks)
            batch_single_gt_masks.append(gt_mask)
            
            batch_mixture_stfts.append(mixture_stft)
            
        # Plot waveform & spectrogram & ideal ratio mask
        if True:
            for n in range(len(batch_x)):

                print(batch_audio_names[n])
                print(batch_y[n])
                target_labels = target_to_labels(batch_y[n], labels)
                print(target_labels)
                
                mixed_audio_path = os.path.join(audios_dir, batch_audio_names[n])
                (mixed_audio, _) = read_audio(mixed_audio_path, target_fs=config.sample_rate, mono=True)
                mixed_audio /= np.max(np.abs(mixed_audio))
                
                fig, axs = plt.subplots(3, 1, figsize=(6, 6))
                
                axs[0].plot(mixed_audio)
                axs[0].set_title('Waveform')
                axs[0].xaxis.set_ticks([0, len(mixed_audio)])
                axs[0].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[0].set_xlim(0, len(mixed_audio))
                axs[0].set_ylim(-1, 1)
                axs[0].set_xlabel('time')
                axs[0].set_ylabel('Amplitude')
                
                axs[1].matshow(np.log(batch_mixture_stfts[n] + 1e-8).T, origin='lower', aspect='auto', cmap='jet')
                axs[1].set_title('Spectrogram')
                axs[1].xaxis.set_ticks([0, 310])
                axs[1].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[1].xaxis.tick_bottom()
                axs[1].yaxis.set_ticks([0, 1024])
                axs[1].yaxis.set_ticklabels(['0', '1025'])
                axs[1].set_xlabel('time')
                axs[1].set_ylabel('FFT bins')
                
                axs[2].matshow(batch_single_gt_masks[n].T, origin='lower', aspect='auto', cmap='jet')
                axs[2].set_title('Ideal ratio mask')
                axs[2].xaxis.set_ticks([0, 310])
                axs[2].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[2].xaxis.tick_bottom()
                axs[2].yaxis.set_ticks([0, 1024])
                axs[2].yaxis.set_ticklabels(['0', '1025'])
                axs[2].set_xlabel('time')
                axs[2].set_ylabel('FFT bins')
                
                plt.tight_layout()
                plt.show()
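
The ground-truth mask plotted above comes from ideal_ratio_mask(events_stft, scene_stft), which is not defined in this listing. A common definition, assumed here, is the per time-frequency-bin ratio of event magnitude to total magnitude:

import numpy as np


def ideal_ratio_mask(events_stft, scene_stft, eps=1e-10):
    """Sketch (assumed definition): per-bin event / (event + scene) magnitude
    ratio, giving values in [0, 1] with shape (seq_len, fft_size)."""
    return events_stft / (events_stft + scene_stft + eps)
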
def plot_mel_masks(args):
    
    # Arguments & parameters
    workspace = args.workspace
    holdout_fold = args.holdout_fold
    scene_type = args.scene_type
    snr = args.snr
    iteration = args.iteration
    model_type = args.model_type
    cuda = args.cuda

    labels = config.labels
    classes_num = len(labels)
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    hop_size = window_size-overlap
    mel_bins = config.mel_bins
    seq_len = config.seq_len
    ix_to_lb = config.ix_to_lb
    
    thres = 0.1
    batch_size = 24

    # Paths
    hdf5_path = os.path.join(workspace, 'features', 'logmel', 
        'scene_type={},snr={}'.format(scene_type, snr), 'development.h5')

    model_path = os.path.join(workspace, 'models', 'main_pytorch', 
        'model_type={}'.format(model_type), 'scene_type={},snr={}'
        ''.format(scene_type, snr), 'holdout_fold{}'.format(holdout_fold), 
        'md_{}_iters.tar'.format(iteration))
    
    yaml_path = os.path.join(workspace, 'mixture.yaml')
    
    audios_dir = os.path.join(workspace, 'mixed_audios', 
                              'scene_type={},snr={}'.format(scene_type, snr))
    
    sep_wavs_dir = os.path.join(workspace, 'separated_wavs', 'main_pytorch', 
        'model_type={}'.format(model_type), 
        'scene_type={},snr={}'.format(scene_type, snr), 
        'holdout_fold{}'.format(holdout_fold))
        
    create_folder(sep_wavs_dir)
    
    # Load yaml file
    load_yaml_time = time.time()
    with open(yaml_path, 'r') as f:
        meta = yaml.safe_load(f)
    print('Load yaml file time: {:.3f} s'.format(time.time() - load_yaml_time))
    
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        overlap=overlap, 
        mel_bins=mel_bins)

    inverse_melW = feature_extractor.get_inverse_melW()
    
    # Load model
    Model = get_model(model_type)
    model = Model(classes_num, seq_len, mel_bins, cuda)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Data generator
    generator = InferenceDataGenerator(
        hdf5_path=hdf5_path,
        batch_size=batch_size, 
        holdout_fold=holdout_fold)

    generate_func = generator.generate_validate(
        data_type='validate', 
        shuffle=False, 
        max_iteration=None)
    
    # Evaluate on mini-batch
    for data in generate_func:
        
        (batch_x, batch_y, batch_audio_names) = data            
        batch_x = move_data_to_gpu(batch_x, cuda)

        # Predict
        with torch.no_grad():
            model.eval()
            (batch_output, batch_bottleneck) = model(
                batch_x, return_bottleneck=True)
    
        batch_output = batch_output.data.cpu().numpy()
        '''(batch_size, classes_num)'''
        
        batch_bottleneck = batch_bottleneck.data.cpu().numpy()  
        '''(batch_size, classes_num, seq_len, mel_bins)'''

        batch_pred_sed = np.mean(batch_bottleneck, axis=-1)
        batch_pred_sed = np.transpose(batch_pred_sed, (0, 2, 1))    
        '''(batch_size, seq_len, classes_num)'''
        
        batch_gt_masks = []
        
        for n in range(len(batch_audio_names)):
            curr_meta = search_meta_by_mixture_name(meta, batch_audio_names[n])
            curr_events = curr_meta['events']
              
            pred_indexes = np.where(batch_output[n] > thres)[0]
            gt_indexes = get_ground_truth_indexes(curr_events)
 
            gt_sed = get_sed_from_meta(curr_events) # (seq_len, classes_num)
            
            pred_sed = np.zeros((seq_len, classes_num))
            pred_sed[:, pred_indexes] = batch_pred_sed[n][:, pred_indexes]  # (seq_len, classes_num)
 
            (events_stft, scene_stft, _) = generator.get_events_scene_mixture_stft(batch_audio_names[n])
            events_stft = np.dot(events_stft, feature_extractor.melW)
            scene_stft = np.dot(scene_stft, feature_extractor.melW)
            
            gt_mask = ideal_binary_mask(events_stft, scene_stft)    # (seq_len, mel_bins)

            gt_masks = gt_mask[:, :, None] * gt_sed[:, None, :] # (seq_len, mel_bins, classes_num)
            gt_masks = gt_masks.astype(np.float32)
            batch_gt_masks.append(gt_masks)

            pred_masks = batch_bottleneck[n].transpose(1, 2, 0) # (seq_len, mel_bins, classes_num)

            # Save out separated audio
            if True:
                curr_audio_name = curr_meta['mixture_name']
                audio_path = os.path.join(audios_dir, curr_audio_name)
                (mixed_audio, fs) = read_audio(audio_path, target_fs=sample_rate, mono=True)
                
                out_wav_path = os.path.join(sep_wavs_dir, curr_audio_name)
                write_audio(out_wav_path, mixed_audio, sample_rate)
                
                window = np.hamming(window_size)
                mixed_stft_cmplx = stft(x=mixed_audio, window_size=window_size, hop_size=hop_size, window=window, mode='complex')
                mixed_stft_cmplx = mixed_stft_cmplx[0 : seq_len, :]
                mixed_stft = np.abs(mixed_stft_cmplx)
                
                for k in gt_indexes:
                    masked_stft = np.dot(pred_masks[:, :, k], inverse_melW) * mixed_stft
                    masked_stft_cmplx = real_to_complex(masked_stft, mixed_stft_cmplx)
                    
                    frames = istft(masked_stft_cmplx)
                    cola_constant = get_cola_constant(hop_size, window)
                    sep_audio = overlap_add(frames, hop_size, cola_constant)
                    
                    sep_wav_path = os.path.join(sep_wavs_dir, '{}_{}.wav'.format(os.path.splitext(curr_audio_name)[0], ix_to_lb[k]))
                    write_audio(sep_wav_path, sep_audio, sample_rate)
                    print('Audio written to {}'.format(sep_wav_path))
      
        # Visualize learned representations
        if True:
            for n in range(len(batch_output)):
            
                # Plot segmentation masks. (00013.wav is used for plot in the paper)
                print('audio_name: {}'.format(batch_audio_names[n]))
                print('target: {}'.format(batch_y[n]))
                target_labels = target_to_labels(batch_y[n], labels)
                print('target labels: {}'.format(target_labels))
            
                (events_stft, scene_stft, _) = generator.get_events_scene_mixture_stft(batch_audio_names[n])
    
                fig, axs = plt.subplots(7, 7, figsize=(15, 10))
                for k in range(classes_num):
                    axs[k // 6, k % 6].matshow(batch_bottleneck[n, k].T, origin='lower', aspect='auto', cmap='jet')
                    if labels[k] in target_labels:
                        color = 'r'
                    else:
                        color = 'k'
                    axs[k // 6, k % 6].set_title(labels[k], color=color)
                    axs[k // 6, k % 6].xaxis.set_ticks([])
                    axs[k // 6, k % 6].yaxis.set_ticks([])
                    axs[k // 6, k % 6].set_xlabel('time')
                    axs[k // 6, k % 6].set_ylabel('mel bins')
                    
                axs[6, 5].matshow(np.log(events_stft + 1e-8).T, origin='lower', aspect='auto', cmap='jet')
                axs[6, 5].set_title('Spectrogram (in log scale)')
                axs[6, 5].xaxis.set_ticks([0, 310])
                axs[6, 5].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 5].xaxis.tick_bottom()
                axs[6, 5].yaxis.set_ticks([0, 1024])
                axs[6, 5].yaxis.set_ticklabels(['0', '1025'])
                axs[6, 5].set_xlabel('time')
                axs[6, 5].set_ylabel('FFT bins')
                
                axs[6, 6].matshow(np.log(np.dot(events_stft, feature_extractor.melW) + 1e-8).T, origin='lower', aspect='auto', cmap='jet')
                axs[6, 6].set_title('Log mel spectrogram')
                axs[6, 6].xaxis.set_ticks([0, 310])
                axs[6, 6].xaxis.set_ticklabels(['0.0', '10.0 s'])
                axs[6, 6].xaxis.tick_bottom()
                axs[6, 6].yaxis.set_ticks([0, 63])
                axs[6, 6].yaxis.set_ticklabels(['0', '64'])
                axs[6, 6].set_xlabel('time')
                axs[6, 6].set_ylabel('mel bins')
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
                
                # Plot frame-wise SED
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                score_mat = []
                for k in range(classes_num):
                    score = np.mean(batch_bottleneck[n, k], axis=-1)
                    score_mat.append(score)
                    
                score_mat = np.array(score_mat)
                
                ax.matshow(score_mat, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Frame-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
                
                # Plot event-wise SED
                est_event_list = get_est_event_list(batch_pred_sed[n:n+1], batch_audio_names[n:n+1], labels)
                event_mat = event_list_to_matrix(est_event_list)
                
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise predictions')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
                
                # Plot event-wise ground truth
                ref_event_list = get_ref_event_list(meta, batch_audio_names[n:n+1])
                event_mat = event_list_to_matrix(ref_event_list)
                
                fig, ax = plt.subplots(1, 1, figsize=(4, 4))
                ax.matshow(event_mat.T, origin='lower', aspect='auto', cmap='jet')
                ax.set_title('Event-wise ground truth')
                ax.xaxis.set_ticks([0, 310])
                ax.xaxis.set_ticklabels(['0.0', '10.0 s'])
                ax.xaxis.tick_bottom()
                ax.set_xlabel('time')
                ax.yaxis.set_ticks(np.arange(classes_num))
                ax.yaxis.set_ticklabels(config.labels, fontsize='xx-small')
                ax.yaxis.grid(color='k', linestyle='solid', linewidth=0.3)
                
                plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=0.5)
                plt.show()
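
Two helpers used above are not defined in this listing: ideal_binary_mask, applied to the mel-projected event and scene spectrograms, and real_to_complex, which re-attaches the mixture phase to a masked magnitude spectrogram before the inverse STFT. Plausible sketches under those assumptions:

import numpy as np


def ideal_binary_mask(events_stft, scene_stft):
    """Sketch (assumed definition): 1 where the event magnitude dominates the scene."""
    return (events_stft > scene_stft).astype(np.float32)


def real_to_complex(magnitude, reference_cmplx):
    """Sketch (assumed behaviour): attach the phase of the complex mixture STFT
    to a real-valued magnitude spectrogram."""
    return magnitude * np.exp(1j * np.angle(reference_cmplx))
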
Example #11
def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write them out to an hdf5 file.
    
    Args:
      dataset_dir: string
      workspace: string
      data_type: 'train_curated', 'train_noisy', 'test'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = DATASET_DIR
    workspace = WORKSPACE
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    lb_to_idx = config.lb_to_idx

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    if data_type in ['train_curated', 'train_noisy']:
        metadata_path = os.path.join(dataset_dir, '{}.csv'.format(data_type))
    else:
        pass

    audios_dir = os.path.join(dataset_dir, data_type)

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(data_type))
    create_folder(os.path.dirname(feature_path))

    # Read meta data
    if data_type in ['train_curated', 'train_noisy']:
        meta_dict = read_metadata(metadata_path, lb_to_idx)
    elif data_type == 'test':
        meta_dict = {'audio_name': np.array(sorted(os.listdir(audios_dir)))}

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    if mini_data:
        mini_num = 100
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        meta_dict['audio_name'] = meta_dict['audio_name'][indexes]
        if 'target' in meta_dict:
            meta_dict['target'] = meta_dict['target'][indexes]

    # Hdf5 file for storing features and targets
    print('Extracting features of all audio files ...')
    extract_time = time.time()

    audios_num = len(meta_dict['audio_name'])

    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S20')

    if 'target' in meta_dict:
        hf.create_dataset(name='target',
                          data=meta_dict['target'],
                          dtype=bool)

    hf.create_dataset(name='feature',
                      shape=(0, mel_bins),
                      maxshape=(None, mel_bins),
                      dtype=np.float32)

    hf.create_dataset(name='begin_index', shape=(audios_num, ), dtype=np.int32)

    hf.create_dataset(name='end_index', shape=(audios_num, ), dtype=np.int32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path, target_fs=sample_rate)

        # Extract feature
        feature = feature_extractor.transform(audio)
        print(feature.shape)

        begin_index = hf['feature'].shape[0]
        end_index = begin_index + feature.shape[0]
        hf['feature'].resize((end_index, mel_bins))
        hf['feature'][begin_index:end_index, :] = feature

        hf['begin_index'][n] = begin_index
        hf['end_index'][n] = end_index

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
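
The hdf5 layout above stores all clips' frames concatenated along the first axis of 'feature', with per-clip boundaries in 'begin_index' and 'end_index'. A minimal sketch of reading one clip back (the path below is hypothetical):

import h5py

feature_path = 'workspace/features/logmel/train_curated.h5'  # hypothetical path

with h5py.File(feature_path, 'r') as hf:
    n = 0  # clip index
    begin_index = hf['begin_index'][n]
    end_index = hf['end_index'][n]
    feature = hf['feature'][begin_index:end_index]  # (clip_frames, mel_bins)
    audio_name = hf['audio_name'][n].decode()
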
def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write them out to an hdf5 file.
    
    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''


    # Arguments & parameters
    # dataset_dir = args.dataset_dir
    # workspace = args.workspace
    # subtask = args.subtask
    # data_type = args.data_type
    # mini_data = args.mini_data
    
    # test 1
    dataset_dir = 'D:/Project/DCASE_test/Data'
    workspace = 'D:/Project/DCASE_test'
    subtask = 'a'
    data_type = 'development'
    mini_data = True

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    
    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''
        
    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 'test.csv')
    else:
        raise Exception('Incorrect data_type!')
    
    feature_path = os.path.join(workspace, 'features', 
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))
        
    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        hop_size=hop_size, 
        mel_bins=mel_bins, 
        fmin=fmin, 
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets 
    if mini_data:
        mini_num = 20
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]
        
    print('Extracting features of all audio files ...')
    extract_time = time.time()
    
    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name', 
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']], 
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(
            name='scene_label', 
            data=[scene_label.encode() for scene_label in meta_dict['scene_label']], 
            dtype='S24')
            
    if 'identifier' in meta_dict.keys():
        hf.create_dataset(
            name='identifier', 
            data=[identifier.encode() for identifier in meta_dict['identifier']], 
            dtype='S24')
            
    if 'source_label' in meta_dict.keys():
        hf.create_dataset(
            name='source_label', 
            data=[source_label.encode() for source_label in meta_dict['source_label']], 
            dtype='S8')

    hf.create_dataset(
        name='feature', 
        shape=(0, frames_num, mel_bins), 
        maxshape=(None, frames_num, mel_bins), 
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)
        
        # Read audio
        (audio, _) = read_audio(
            audio_path=audio_path, 
            target_fs=sample_rate)
        
        # Pad or truncate audio recording to the same length
        audio = pad_truncate_sequence(audio, total_samples)
        
        # Extract feature
        feature = feature_extractor.transform(audio)
        
        # Remove the extra log mel spectrogram frames caused by padding zero
        feature = feature[0 : frames_num]
        
        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature
            
    hf.close()
        
    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
Example #13
def create_mixture_yaml(args):
    """Create mixture yaml file containing a list of information. Each 
    information looks like:
    
    - events:
    - event_audio_name: 19f45b13.wav
        event_label: Tambourine
        offset: 1.22
        onset: 0.5
    - event_audio_name: 63874688.wav
        event_label: Scissors
        offset: 3.38
        onset: 3.0
    - event_audio_name: cd3e20ec.wav
        event_label: Computer_keyboard
        offset: 7.5
        onset: 5.5
    fold: 1
    mixture_name: 00000.wav
    scene_audio_name: metro_station-barcelona-62-1861-a.wav
    """

    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir

    workspace = args.workspace

    random_state = np.random.RandomState(1234)
    folds = [1, 2, 3, 4]
    mixed_audios_per_fold = 2000
    events_per_clip = 3
    total_events_per_fold = mixed_audios_per_fold * events_per_clip

    # Paths
    dcase2018_task1_meta = os.path.join(dcase2018_task1_dataset_dir,
                                        'meta.csv')

    dcase2018_task2_meta = os.path.join(workspace,
                                        'dcase2018_task2_validate_meta.csv')

    out_yaml_path = os.path.join(workspace, 'mixture.yaml')
    create_folder(os.path.dirname(out_yaml_path))

    # DCASE 2018 Task 1 acoustic scenes meta
    df_scenes = pd.read_csv(dcase2018_task1_meta, sep='\t')
    scene_names = np.array(df_scenes['filename'])
    random_state.shuffle(scene_names)

    # DCASE 2018 Task 2 sound events meta
    df_events = pd.read_csv(dcase2018_task2_meta, sep=',')
    events_audio_num = len(df_events)

    acoustic_scene_index = 0
    data_list = []

    # Calculate mixture meta
    for fold in folds:

        # Selected audios indexes
        bool_selected = (df_events['fold'] == fold) & \
                        (df_events['manually_verified'] == 1)

        selected_event_indexes = np.arange(events_audio_num)[bool_selected]

        repeated_event_indexes = repeat_array(array=selected_event_indexes,
                                              max_len=total_events_per_fold,
                                              random_state=random_state)

        for n in range(mixed_audios_per_fold):

            if acoustic_scene_index % 100 == 0:
                print('Fold {}, {} / {} mixture infos created'
                      ''.format(fold, acoustic_scene_index,
                                mixed_audios_per_fold * len(folds)))

            event_indexes_for_one_clip = repeated_event_indexes[
                n * events_per_clip:(n + 1) * events_per_clip]

            events = []

            for j, index in enumerate(event_indexes_for_one_clip):

                event_audio_name = df_events.fname[index]
                event_label = df_events.label[index]
                onset = j * 2.5 + 0.5  # Onsets of events are 0.5 s, 3.0 s,
                # 5.5 s in an audio clip.

                event_audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                                'audio_train',
                                                event_audio_name)

                (audio, fs) = read_audio(event_audio_path)
                audio_duration = len(audio) / float(fs)
                audio_duration = min(audio_duration, 2.0)  # Clip maximum
                # duration to 2.0 s.

                offset = onset + audio_duration

                events.append({
                    'event_audio_name': event_audio_name,
                    'event_label': event_label,
                    'onset': onset,
                    'offset': offset
                })

            scene_audio_name = scene_names[acoustic_scene_index].split('/')[1]

            data = {
                'mixture_name': '{:05d}.wav'.format(acoustic_scene_index),
                'fold': fold,
                'events': events,
                'scene_audio_name': scene_audio_name
            }
            data_list.append(data)

            acoustic_scene_index += 1

    # Write out yaml file
    with open(out_yaml_path, 'w') as f:
        yaml.dump(data_list, f, default_flow_style=False)

    print('Write out mixture yaml to {}'.format(out_yaml_path))
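
repeat_array is assumed to tile the selected event indexes (reshuffling on each pass) until max_len entries are available, so every mixture gets events_per_clip events even when a fold contains few manually verified clips. A sketch under that assumption:

import numpy as np


def repeat_array(array, max_len, random_state):
    """Sketch (assumed behaviour): repeatedly shuffle and append the array
    until max_len entries are collected, then truncate."""
    repeated = []
    while len(repeated) < max_len:
        shuffled = np.array(array, copy=True)
        random_state.shuffle(shuffled)
        repeated.extend(shuffled.tolist())
    return np.array(repeated[:max_len])
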
Example #14
def logmel(args):

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    
    sample_rate = config.sample_rate
    window_size = config.window_size
    overlap = config.overlap
    seq_len = config.seq_len
    mel_bins = config.mel_bins
    
    # Paths
    audio_dir = os.path.join(dataset_dir, 'wav')
    
    validation_csv_path = os.path.join(workspace, 'validation.csv')
    
    hdf5_path = os.path.join(workspace, 'features', 'logmel', 'dev.h5')
    create_folder(os.path.dirname(hdf5_path))
    
    # Load data
    df = pd.read_csv(validation_csv_path)
    df = pd.DataFrame(df)
    
    audio_num = len(df)
    
    feature_extractor = LogMelExtractor(sample_rate=sample_rate, 
                                        window_size=window_size, 
                                        overlap=overlap, 
                                        mel_bins=mel_bins)
    
    begin_time = time.time()
    
    # Write out features to hdf5
    with h5py.File(hdf5_path, 'w') as hf:
        
        dt = h5py.special_dtype(vlen=str)
        
        # Reserve space
        hf.create_dataset(name='feature', shape=(audio_num, seq_len, mel_bins), dtype=np.float32)
        hf.create_dataset(name='itemid', shape=(audio_num,), dtype='S50')
        hf.create_dataset(name='datasetid', shape=(audio_num,), dtype='S20')
        hf.create_dataset(name='hasbird', shape=(audio_num,), dtype=np.int32)
        hf.create_dataset(name='fold', shape=(audio_num,), dtype=np.int32)
   
        n = 0
        
        for row in df.iterrows():
    
            itemid = row[1]['itemid']
            datasetid = row[1]['datasetid']
            hasbird = row[1]['hasbird']
            fold = row[1]['fold']
        
            print(n, itemid)
        
            # Calculate feature
            audio_path = os.path.join(audio_dir, '{}.wav'.format(itemid))
            (audio, fs) = read_audio(audio_path, target_fs=sample_rate)
            
            feature = feature_extractor.transform(audio)
            
            feature = pad_or_trunc(feature, seq_len)
    
            hf['feature'][n] = feature
            hf['itemid'][n] = itemid.encode()
            hf['datasetid'][n] = datasetid.encode()
            hf['hasbird'][n] = hasbird
            hf['fold'][n] = fold
            
            if False:
                print(n, itemid, datasetid, hasbird)
                plt.matshow(feature.T, origin='lower', aspect='auto', cmap='jet')
                plt.show()
            
            n += 1
                
    print("Write out to {}".format(hdf5_path))
    print("Time: {} s".format(time.time() - begin_time))
def calculate_logmel_features(config):

    # Arguments & parameters
    workspace = config.workspace

    sample_rate = config.sr
    window_size = config.window_size
    overlap = config.overlap
    seq_len = config.seq_len
    mel_bins = config.mel_bins
    stft_bins = window_size // 2 + 1
    classes_num = len(config.labels)
    lb_to_ix = config.lb_to_ix

    # Paths
    audio_dir = config.audio_dir

    yaml_path = config.out_yaml_path

    hdf5_path = config.h5_path

    create_folder(hdf5_path.parents[0])

    # Load yaml file
    load_time = time.time()

    with open(yaml_path, 'r') as f:
        data_list = yaml.safe_load(f)

    logging.info('Loading yaml time: {} s'.format(time.time() - load_time))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        overlap=overlap,
                                        mel_bins=mel_bins)

    # Create hdf5 file
    write_hdf5_time = time.time()

    hf = h5py.File(hdf5_path, 'w')

    hf.create_dataset(name='logmel',
                      shape=(0, seq_len, mel_bins),
                      maxshape=(None, seq_len, mel_bins),
                      dtype=np.float32)

    hf.create_dataset(name='target',
                      shape=(0, classes_num),
                      maxshape=(None, classes_num),
                      dtype=np.int32)

    audio_names = []
    folds = []
    item_counts = 0

    for n, data in enumerate(data_list):

        if n % 10 == 0:
            logging.info('{} / {} audio features calculated'
                         ''.format(n, len(data_list)))

        audio_path = audio_dir / f'{data["fname"]}'

        # Read audio
        (audio, fs) = read_audio(audio_path, target_fs=config.sr, mono=True)

        for i in range(0, len(audio), config.sr * config.period):

            start = i
            end = i + config.sr * config.period

            audio_segment = audio[start:end]

            audio_names.append(data['fname'] + "-" + str(i / config.sr))
            folds.append(data['fold'])

            # Extract feature
            features_dict = calculate_logmel(
                audio_segment, feature_extractor=feature_extractor)

            # Write out features
            hf['logmel'].resize((item_counts + 1, seq_len, mel_bins))
            hf['logmel'][item_counts] = features_dict['logmel']

            # Write out target
            target = get_target_from_events(data['events'], lb_to_ix,
                                            start / config.sr, end / config.sr)
            hf['target'].resize((item_counts + 1, classes_num))
            hf['target'][item_counts] = target

            item_counts += 1

    hf.create_dataset(name='audio_name',
                      data=[s.encode() for s in audio_names],
                      dtype='S40')

    hf.create_dataset(name='fold', data=folds, dtype=np.int32)

    hf.close()

    logging.info('Write out hdf5 file to {}'.format(hdf5_path))
    logging.info('Time spent: {} s'.format(time.time() - write_hdf5_time))
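
get_target_from_events is assumed to produce a multi-hot vector over classes for the events that overlap the current audio segment (times in seconds; the field names below are assumptions):

import numpy as np


def get_target_from_events(events, lb_to_ix, seg_start, seg_end):
    """Sketch (assumed behaviour and field names): mark every class whose event
    overlaps the [seg_start, seg_end) segment."""
    target = np.zeros(len(lb_to_ix), dtype=np.int32)
    for event in events:
        if event['onset'] < seg_end and event['offset'] > seg_start:
            target[lb_to_ix[event['event_label']]] = 1
    return target
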
Example #16
def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write them out to an hdf5 file.
    
    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    mfcc_frames = config.mfcc_frames
    n_mfcc = config.n_mfcc
    mfcc_hop_size = config.mfcc_hop_size
    gamm_frames = config.gamm_frames
    n_gamm = config.n_gamm
    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup',
                                     'test.csv')
    elif data_type == 'evaluation':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup',
                                     'fold1_test.csv')
    else:
        raise Exception('Incorrect data_type!')

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(name='scene_label',
                          data=[
                              scene_label.encode()
                              for scene_label in meta_dict['scene_label']
                          ],
                          dtype='S24')

    if 'identifier' in meta_dict.keys():
        hf.create_dataset(name='identifier',
                          data=[
                              identifier.encode()
                              for identifier in meta_dict['identifier']
                          ],
                          dtype='S24')

    if 'source_label' in meta_dict.keys():
        hf.create_dataset(name='source_label',
                          data=[
                              source_label.encode()
                              for source_label in meta_dict['source_label']
                          ],
                          dtype='S8')

    hf.create_dataset(name='feature',
                      shape=(0, total_samples),
                      maxshape=(None, total_samples),
                      dtype=np.float32)
    hf.create_dataset(name='feature_gamm',
                      shape=(0, gamm_frames, n_gamm),
                      maxshape=(None, gamm_frames, n_gamm),
                      dtype=np.float32)
    hf.create_dataset(name='feature_mfcc',
                      shape=(0, mfcc_frames, n_mfcc),
                      maxshape=(None, mfcc_frames, n_mfcc),
                      dtype=np.float32)
    hf.create_dataset(name='feature_panns',
                      shape=(0, 320000),
                      maxshape=(None, 320000),
                      dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path, target_fs=sample_rate)
        audio = audio[:sample_rate * 10]
        (audio_gamm, _) = read_audio_gamm(audio_path=audio_path,
                                          target_fs=sample_rate)
        fea_gamm, _ = gtg_in_dB(audio_gamm, sample_rate)
        fea_gamm = fea_gamm.transpose(1, 0)
        sound, fs = librosa.load(audio_path)
        fea_mfcc = librosa.feature.mfcc(y=sound,
                                        sr=fs,
                                        hop_length=mfcc_hop_size,
                                        n_mfcc=n_mfcc)
        fea_mfcc = fea_mfcc.transpose(1, 0)
        (waveform, _) = librosa.core.load(audio_path, sr=32000, mono=True)
        waveform = waveform[:320000]

        hf['feature'].resize((n + 1, total_samples))
        hf['feature'][n] = audio
        hf['feature_gamm'].resize((n + 1, gamm_frames, n_gamm))
        hf['feature_gamm'][n] = fea_gamm
        hf['feature_mfcc'].resize((n + 1, mfcc_frames, n_mfcc))
        hf['feature_mfcc'][n] = fea_mfcc
        hf['feature_panns'].resize((n + 1, 320000))
        hf['feature_panns'][n] = waveform

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write them out to a single hdf5
    file.
    
    Args:
      dataset_dir: string
      workspace: string
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    relative_name = get_relative_path_no_extension(data_type)
    audios_dir = os.path.join(dataset_dir, 'audio', relative_name)

    if data_type == 'validation':
        metadata_path = os.path.join(dataset_dir, 'metadata', 'validation',
                                     '{}.csv'.format(relative_name))
    else:
        metadata_path = os.path.join(dataset_dir, 'metadata',
                                     '{}.csv'.format(relative_name))

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(relative_name))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    (data_dict, has_weak_labels,
     has_strong_labels) = read_metadata(metadata_path)

    # Extract features and targets
    audio_names = sorted([*data_dict.keys()])

    if mini_data:
        random_state = np.random.RandomState(1234)
        random_state.shuffle(audio_names)
        audio_names = audio_names[0:10]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(name='audio_name',
                      data=[audio_name.encode() for audio_name in audio_names],
                      dtype='S64')

    hf.create_dataset(name='feature',
                      shape=(0, frames_num, mel_bins),
                      maxshape=(None, frames_num, mel_bins),
                      dtype=np.float32)

    if has_weak_labels:
        hf.create_dataset(name='weak_target',
                          shape=(0, classes_num),
                          maxshape=(None, classes_num),
                          dtype=bool)

    if has_strong_labels:
        hf.create_dataset(name='strong_target',
                          shape=(0, frames_num, classes_num),
                          maxshape=(None, frames_num, classes_num),
                          dtype=bool)

    for (n, audio_name) in enumerate(audio_names):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path, target_fs=sample_rate)

        # Pad or truncate audio recording
        audio = pad_truncate_sequence(audio, total_samples)

        # Extract feature
        feature = feature_extractor.transform(audio)

        # Remove the extra frames caused by padding zero
        feature = feature[0:frames_num]

        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature

        if has_weak_labels:
            weak_labels = data_dict[audio_name]['weak_labels']
            hf['weak_target'].resize((n + 1, classes_num))
            hf['weak_target'][n] = labels_to_target(weak_labels, classes_num,
                                                    lb_to_idx)

        if has_strong_labels:
            events = data_dict[audio_name]['strong_labels']
            hf['strong_target'].resize((n + 1, frames_num, classes_num))
            hf['strong_target'][n] = events_to_target(
                events=events,
                frames_num=frames_num,
                classes_num=classes_num,
                frames_per_second=frames_per_second,
                lb_to_idx=lb_to_idx)

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
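
labels_to_target and events_to_target convert the weak and strong annotations into clip-level and frame-level targets. Their implementations are not shown here; the following are sketches under assumed field names:

import numpy as np


def labels_to_target(weak_labels, classes_num, lb_to_idx):
    """Sketch (assumed behaviour): multi-hot clip-level target from a list of labels."""
    target = np.zeros(classes_num, dtype=bool)
    for label in weak_labels:
        target[lb_to_idx[label]] = True
    return target


def events_to_target(events, frames_num, classes_num, frames_per_second, lb_to_idx):
    """Sketch (assumed behaviour and field names): frame-level target matrix
    from strongly labelled events with onset/offset in seconds."""
    target = np.zeros((frames_num, classes_num), dtype=bool)
    for event in events:
        begin_frame = int(round(event['onset'] * frames_per_second))
        end_frame = int(round(event['offset'] * frames_per_second))
        target[begin_frame:end_frame, lb_to_idx[event['event_label']]] = True
    return target
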
Example #18
def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write them out to a single hdf5
    file.
    
    Args:
      dataset_dir: string
      workspace: string
      data_type: 'train' | 'validate' | 'evaluate'
      mini_data: bool, set True for debugging on a small part of data
    '''
    
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    data_type = args.data_type
    workspace = args.workspace
    mini_data = args.mini_data
    
    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    
    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''
        
    metadata_path = os.path.join(dataset_dir, 'annotations.csv')

    if data_type in ['train', 'validate']:
        audios_dir = os.path.join(dataset_dir, data_type)
    elif data_type == 'evaluate':
        audios_dir = os.path.join(dataset_dir, 'audio-eval')
    
    feature_path = os.path.join(workspace, 'features', 
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 
        '{}.h5'.format(data_type))
    create_folder(os.path.dirname(feature_path))
        
    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        hop_size=hop_size, 
        mel_bins=mel_bins, 
        fmin=fmin, 
        fmax=fmax)

    # Read metadata
    print('Extracting features of all audio files ...')
    extract_time = time.time()
    
    if data_type in ['train', 'validate']:
        meta_dict = read_metadata(metadata_path, data_type, mini_data)
    elif data_type == 'evaluate':
        meta_dict = read_evaluate_metadata(audios_dir, mini_data)

    # Hdf5 containing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name', 
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']], 
        dtype='S32')

    if 'fine_target' in meta_dict.keys():
        hf.create_dataset(
            name='fine_target', 
            data=meta_dict['fine_target'], 
            dtype=np.float32)
            
    if 'coarse_target' in meta_dict.keys():
        hf.create_dataset(
            name='coarse_target', 
            data=meta_dict['coarse_target'], 
            dtype=np.float32)

    hf.create_dataset(
        name='feature', 
        shape=(0, frames_num, mel_bins), 
        maxshape=(None, frames_num, mel_bins), 
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)
        
        # Read audio
        (audio, _) = read_audio(
            audio_path=audio_path, 
            target_fs=sample_rate)

        # Pad or truncate audio recording
        audio = pad_truncate_sequence(audio, total_samples)
        
        # Extract feature
        feature = feature_extractor.transform(audio)
        
        # Remove the extra frames caused by padding zero
        feature = feature[0 : frames_num]
        
        hf['feature'].resize((n + 1, frames_num, mel_bins))
        hf['feature'][n] = feature
        
    hf.close()
        
    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
def create_mixed_audio(args):

    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir
    workspace = args.workspace
    scene_type = args.scene_type
    snr = args.snr

    sample_rate = config.sample_rate
    clip_duration = 10.
    audio_len = int(sample_rate * clip_duration)
    random_state = np.random.RandomState(1234)

    # Paths
    yaml_path = os.path.join(workspace, 'yaml_files', 'mixture.yaml')

    out_audio_dir = os.path.join(
        workspace, 'mixed_audios',
        'scene_type={},snr={}'.format(scene_type, snr))
    create_folder(out_audio_dir)

    with open(yaml_path, 'r') as f:
        data = yaml.safe_load(f)

    create_audio_time = time.time()

    for n in range(len(data)):

        if n % 10 == 0:
            logging.info(n)

        if scene_type == 'white_noise':
            scene_audio = random_state.uniform(0., 1., audio_len)

        elif scene_type == 'dcase2018_task1':
            scene_audio_name = data[n]['scene_audio_name']
            scene_audio_path = os.path.join(dcase2018_task1_dataset_dir,
                                            'audio', scene_audio_name)

            (scene_audio, fs) = read_audio(scene_audio_path,
                                           target_fs=sample_rate)

        # Normalize scene audio
        scene_audio = normalize_to_energy(scene_audio, db=-snr)

        # Reserve space
        events_audio = np.zeros(audio_len)

        for (j, event) in enumerate(data[n]['events']):

            audio_name = event['event_audio_name']
            onset = int(event['onset'] * sample_rate)
            offset = int(event['offset'] * sample_rate)

            audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                      'audio_train', audio_name)
            (audio, fs) = read_audio(audio_path, target_fs=sample_rate)

            audio = normalize_to_energy(audio, db=0.)

            events_audio[onset:offset] = audio[0:offset - onset]

        stereo_audio = np.array((events_audio, scene_audio)).T
        stereo_audio /= np.max(np.abs(stereo_audio))

        out_audio_path = os.path.join(out_audio_dir, data[n]['mixture_name'])
        write_audio(out_audio_path, stereo_audio, sample_rate)

    logging.info('Write out audio finished! {} s'.format(time.time() -
                                                         create_audio_time))
def create_mixture_yaml(args):

    # Arguments & parameters
    dcase2018_task1_dataset_dir = args.dcase2018_task1_dataset_dir
    dcase2018_task2_dataset_dir = args.dcase2018_task2_dataset_dir

    workspace = args.workspace

    random_state = np.random.RandomState(1234)
    folds = [1, 2, 3, 4]
    mixed_audios_num = 2000
    events_per_clip = 3

    # Paths
    dcase2018_task1_meta = os.path.join(dcase2018_task1_dataset_dir,
                                        'meta.csv')
    dcase2018_task2_meta = os.path.join(workspace,
                                        'dcase2018_task2_validate_meta.csv')

    yaml_path = os.path.join(workspace, 'mixture.yaml')
    create_folder(os.path.dirname(yaml_path))

    # Scenes meta
    df_scenes = pd.read_csv(dcase2018_task1_meta, sep='\t')
    scene_names = np.array(df_scenes['filename'])
    random_state.shuffle(scene_names)

    # Events meta
    df_events = pd.read_csv(dcase2018_task2_meta, sep=',')

    #
    count = 0
    data_list = []

    for fold in folds:

        bool_selected = (df_events['fold']
                         == fold) & (df_events['manually_verified'] == 1)
        event_audio_names = np.array(df_events.fname[bool_selected])
        event_labels = np.array(df_events.label[bool_selected])

        indexes = np.arange(len(event_audio_names))

        repeated_indexes = repeat_indexes(indexes,
                                          mixed_audios_num * events_per_clip,
                                          random_state)

        for n in range(mixed_audios_num):

            if count % 100 == 0:
                print(count)

            current_idxes = repeated_indexes[n * events_per_clip:(n + 1) *
                                             events_per_clip]

            events = []

            for (j, idx) in enumerate(current_idxes):

                event_audio_name = event_audio_names[idx]
                event_label = event_labels[idx]
                onset = j * 2.5 + 0.5

                audio_path = os.path.join(dcase2018_task2_dataset_dir,
                                          'audio_train', event_audio_name)
                (audio, fs) = read_audio(audio_path)
                audio_duration = len(audio) / float(fs)
                audio_duration = min(audio_duration, 2.0)

                offset = onset + audio_duration

                events.append({
                    'event_audio_name': event_audio_name,
                    'event_label': event_label,
                    'onset': onset,
                    'offset': offset
                })

            scene_audio_name = scene_names[count].split('/')[1]

            data = {
                'mixture_name': '{:05d}.wav'.format(count),
                'fold': fold,
                'events': events,
                'scene_audio_name': scene_audio_name
            }
            data_list.append(data)

            count += 1

            # if count == 30:
            #     break

        with open(yaml_path, 'w') as f:
            f.write(yaml.dump(data_list, default_flow_style=False))