def visualize(self, data_type, max_iteration=None):
        '''Visualize the log mel spectrogram of one clip per coarse class.

        Calls forward() on the validation generator and, for every coarse
        class index k, plots the first returned clip whose
        output_dict['coarse_target'][n, k] exceeds 0.5. A class without such a
        clip keeps an empty panel. Grid cells beyond coarse_classes_num are
        hidden. The figure is displayed with plt.show(); nothing is returned.

        Args:
          data_type: 'train' | 'validate'
          max_iteration: None | int, use maximum iteration of partial data for
              fast evaluation
        '''

        audio_duration = config.audio_duration
        frames_num = config.frames_num
        coarse_classes_num = config.coarse_classes_num
        coarse_idx_to_lb = config.coarse_idx_to_lb

        generate_func = self.data_generator.generate_validate(
            data_type=data_type, max_iteration=max_iteration)

        # Forward
        output_dict = forward(model=self.model,
                              generate_func=generate_func,
                              cuda=self.cuda,
                              return_input=True,
                              return_target=True)

        # Fixed 3 x 3 grid: class k is drawn in cell (k // 3, k % 3).
        rows_num = 3
        cols_num = 3

        fig, axs = plt.subplots(rows_num, cols_num, figsize=(10, 5))

        scalar = self.data_generator.scalar
        coarse_target = output_dict['coarse_target']

        for k in range(coarse_classes_num):
            for n, audio_name in enumerate(output_dict['audio_name']):
                if coarse_target[n, k] > 0.5:
                    ax = axs[k // cols_num, k % cols_num]
                    ax.set_title(
                        '{}\n{}'.format(coarse_idx_to_lb[k], audio_name),
                        color='r')
                    # Feature is passed through inverse_scale with the
                    # generator's stored mean / std before plotting.
                    logmel = inverse_scale(output_dict['feature'][n],
                                           scalar['mean'],
                                           scalar['std'])
                    ax.matshow(logmel.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                    ax.set_xticks([0, frames_num])
                    ax.set_xticklabels(
                        ['0', '{:.1f} s'.format(audio_duration)])
                    ax.xaxis.set_ticks_position('bottom')
                    ax.set_ylabel('Mel bins')
                    ax.set_yticks([])
                    # Only the first matching clip is shown for each class.
                    break

        # Hide the grid cells that have no class assigned to them.
        for k in range(coarse_classes_num, rows_num * cols_num):
            axs[k // cols_num, k % cols_num].set_visible(False)

        fig.tight_layout(pad=0, w_pad=0, h_pad=0)
        plt.show()
Example #2
0
    def visualize(self, data_type, source, max_iteration=None):
        '''Visualize log mel spectrogram of different sound classes.

        Calls forward() on the validation generator for the given source and,
        for every class index k below len(config.labels) - 1, plots the first
        returned clip whose output_dict['target'][n, k] equals 1. A class
        without such a clip keeps an empty panel. Grid cells beyond that class
        count are hidden. The figure is displayed with plt.show().

        Args: 
          data_type: 'train' | 'validate'
          source: 'a' | 'b' | 'c'
          max_iteration: None | int, maximum iteration to run to speed up evaluation
        '''
        audio_duration = config.audio_duration
        frames_num = config.frames_num
        # The last entry of config.labels is skipped.
        # NOTE(review): presumably it is the out-of-domain / unknown label --
        # confirm against config.
        in_domain_classes_num = len(config.labels) - 1
        idx_to_lb = config.idx_to_lb

        generate_func = self.data_generator.generate_validate(
            data_type=data_type, source=source, max_iteration=max_iteration)

        # Forward
        output_dict = forward(model=self.model,
                              generate_func=generate_func,
                              cuda=self.cuda,
                              return_input=True,
                              return_target=True)

        # Plot log mel spectrogram of different sound classes on a fixed
        # 3 x 4 grid: class k is drawn in cell (k // 4, k % 4).
        rows_num = 3
        cols_num = 4

        fig, axs = plt.subplots(rows_num, cols_num, figsize=(10, 5))

        scalar = self.data_generator.scalar
        target = output_dict['target']

        for k in range(in_domain_classes_num):
            for n, _ in enumerate(output_dict['audio_name']):
                if target[n, k] == 1:
                    ax = axs[k // cols_num, k % cols_num]
                    ax.set_title(idx_to_lb[k], color='r')
                    # Feature is passed through inverse_scale with the
                    # generator's stored mean / std before plotting.
                    logmel = inverse_scale(output_dict['feature'][n],
                                           scalar['mean'],
                                           scalar['std'])
                    ax.matshow(logmel.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                    ax.set_xticks([0, frames_num])
                    ax.set_xticklabels(
                        ['0', '{:.1f} s'.format(audio_duration)])
                    ax.xaxis.set_ticks_position('bottom')
                    ax.set_ylabel('Mel bins')
                    ax.set_yticks([])
                    # Only the first matching clip is shown for each class.
                    break

        # Hide the grid cells that have no class assigned to them.
        for k in range(in_domain_classes_num, rows_num * cols_num):
            axs[k // cols_num, k % cols_num].set_visible(False)

        fig.tight_layout(pad=0, w_pad=0, h_pad=0)
        plt.show()
Example #3
0
    def visualize(self, data_type, max_validate_num=None):
        '''Visualize the log mel spectrogram, reference and prediction of 
        sound events for every evaluated file.

        One figure is shown (plt.show()) per item returned by forward(). Only
        the left column of the 4 x 2 grid is drawn: log mel spectrogram,
        reference events and predicted events. The remaining panels are
        hidden.
        NOTE(review): elevation and azimuth are not plotted here; the hidden
        panels look reserved for them -- confirm.

        Args:
          data_type: 'train' | 'validate'
          max_validate_num: None | int, maximum iteration to run to speed up 
              evaluation
        '''

        mel_bins = config.mel_bins
        frames_per_second = config.frames_per_second
        classes_num = config.classes_num
        labels = config.labels

        # Forward
        generate_func = self.data_generator.generate_validate(
            data_type=data_type, max_validate_num=max_validate_num)

        list_dict = forward(model=self.model,
                            generate_func=generate_func,
                            cuda=self.cuda,
                            return_input=True,
                            return_target=True)

        scalar = self.data_generator.scalar
        matshow_kwargs = {'origin': 'lower', 'aspect': 'auto', 'cmap': 'jet'}

        # `result` rather than `dict`, so the builtin is not shadowed.
        for result in list_dict:
            print('File: {}'.format(result['name']))

            # Axis 1 of the event target is used as the time (frame) axis.
            frames_num = result['target_event'].shape[1]
            length_in_second = frames_num / float(frames_per_second)

            fig, axs = plt.subplots(4, 2, figsize=(15, 10))
            # NOTE(review): [0][0] presumably selects the first batch item and
            # first channel of the feature -- confirm against forward().
            logmel = inverse_scale(result['feature'][0][0],
                                   scalar['mean'],
                                   scalar['std'])
            axs[0, 0].matshow(logmel.T, **matshow_kwargs)
            axs[1, 0].matshow(result['target_event'][0].T, **matshow_kwargs)
            axs[2, 0].matshow(result['output_event'][0].T, **matshow_kwargs)

            axs[0, 0].set_title('Log mel spectrogram', color='r')
            axs[1, 0].set_title('Reference sound events', color='r')
            axs[2, 0].set_title('Predicted sound events', color='b')

            # Shared axis decoration for every panel of the left column.
            for ax in axs[:, 0]:
                ax.set_xticks([0, frames_num])
                ax.set_xticklabels(
                    ['0', '{:.1f} s'.format(length_in_second)])
                ax.xaxis.set_ticks_position('bottom')
                ax.set_yticks(np.arange(classes_num))
                ax.set_yticklabels(labels)
                ax.yaxis.grid(color='w', linestyle='solid', linewidth=0.2)

            # The spectrogram panel has mel bins, not classes, on its y axis.
            axs[0, 0].set_ylabel('Mel bins')
            axs[0, 0].set_yticks([0, mel_bins])
            axs[0, 0].set_yticklabels([0, mel_bins])

            # Hide the unused bottom-left panel and the whole right column.
            axs[3, 0].set_visible(False)
            for ax in axs[:, 1]:
                ax.set_visible(False)

            fig.tight_layout()
            plt.show()
Example #4
0
    def visualize(self, data_type, max_iteration=None):
        '''Visualize logmel spectrogram, reference and prediction. 

        For every clip returned by forward() this prints the weak target and
        clipwise output of each class, derives an eventwise prediction with
        activity_detection() for classes whose clipwise output exceeds
        sed_params_dict['sed_high_threshold'], and shows a 4-panel figure:
        log mel spectrogram, reference events (only if 'strong_target' is
        available), framewise prediction and eventwise prediction.

        Args: 
          data_type: 'train' | 'validate'
          max_iteration: None | int, maximum iteration to run to speed up 
              evaluation
        '''
        mel_bins = config.mel_bins
        audio_duration = config.audio_duration
        labels = config.labels

        # Build the generator exactly once and pass max_iteration through, so
        # the argument actually limits how much data is evaluated.
        generate_func = self.data_generator.generate_validate(
            data_type=data_type, max_iteration=max_iteration)

        # Forward
        output_dict = forward(model=self.model,
                              generate_func=generate_func,
                              cuda=self.cuda,
                              return_input=True,
                              return_target=True)

        framewise_output = output_dict['framewise_output']
        clipwise_output = output_dict['clipwise_output']
        (audios_num, frames_num, classes_num) = framewise_output.shape

        sed_params = self.sed_params_dict
        high_threshold = sed_params['sed_high_threshold']
        scalar = self.data_generator.scalar
        matshow_kwargs = {'origin': 'lower', 'aspect': 'auto', 'cmap': 'jet'}

        for n in range(audios_num):
            print('File: {}'.format(output_dict['audio_name'][n]))

            for k in range(classes_num):
                print('{:<20}{:<8}{:.3f}'.format(
                    labels[k], output_dict['weak_target'][n, k],
                    clipwise_output[n, k]))

            # Binary (frames, classes) matrix of detected events.
            event_prediction = np.zeros((frames_num, classes_num))

            for k in range(classes_num):
                # Only classes detected at clip level get framewise events.
                if clipwise_output[n, k] > high_threshold:
                    bgn_fin_pairs = activity_detection(
                        x=framewise_output[n, :, k],
                        thres=high_threshold,
                        low_thres=sed_params['sed_low_threshold'],
                        n_smooth=sed_params['n_smooth'],
                        n_salt=sed_params['n_salt'])

                    for pair in bgn_fin_pairs:
                        event_prediction[pair[0]:pair[1], k] = 1

            # Plot
            fig, axs = plt.subplots(4, 1, figsize=(10, 8))
            logmel = inverse_scale(output_dict['feature'][n],
                                   scalar['mean'],
                                   scalar['std'])
            axs[0].matshow(logmel.T, **matshow_kwargs)
            # Strong (framewise) labels are optional; the panel stays empty
            # when they are missing.
            if 'strong_target' in output_dict:
                axs[1].matshow(output_dict['strong_target'][n].T,
                               **matshow_kwargs)
            # Weight each class's framewise output by its clipwise output.
            masked_framewise_output = framewise_output[n] * clipwise_output[n]
            axs[2].matshow(masked_framewise_output.T, **matshow_kwargs)
            axs[3].matshow(event_prediction.T, **matshow_kwargs)

            axs[0].set_title('Log mel spectrogram', color='r')
            axs[1].set_title('Reference sound events', color='r')
            axs[2].set_title('Framewise prediction', color='b')
            axs[3].set_title('Eventwise prediction', color='b')

            for ax in axs:
                ax.set_xticks([0, frames_num])
                ax.set_xticklabels(
                    ['0', '{:.1f} s'.format(audio_duration)])
                ax.xaxis.set_ticks_position('bottom')
                ax.set_yticks(np.arange(classes_num))
                ax.set_yticklabels(labels)
                ax.yaxis.grid(color='w', linestyle='solid', linewidth=0.2)

            # The spectrogram panel has mel bins, not classes, on its y axis.
            axs[0].set_ylabel('Mel bins')
            axs[0].set_yticks([0, mel_bins])
            axs[0].set_yticklabels([0, mel_bins])

            fig.tight_layout()
            plt.show()
Example #5
0
    def visualize(self,
                  data_type,
                  target_source,
                  save_fig_path,
                  max_iteration=None):
        '''Visualize logmel of different sound classes. 

        Calls forward_infer() on the validation generator and, for every class
        index k below self.classes_num, plots the first returned segment whose
        target[n, k] equals 1. A class without such a segment keeps an empty
        panel. Grid cells beyond self.classes_num are hidden. The figure is
        written to save_fig_path and then closed; it is not shown on screen.

        Args: 
          data_type: 'train' | 'validate'
          target_source: 'curated' | 'noisy'
          save_fig_path: string, path to save figure
          max_iteration: None | int, maximum iteration to run to speed up evaluation
        '''

        generate_func = self.data_generator.generate_validate(
            data_type=data_type,
            target_source=target_source,
            max_iteration=max_iteration)

        # Results of segments
        output_dict = forward_infer(model=self.model,
                                    generate_func=generate_func,
                                    cuda=self.cuda,
                                    return_target=True,
                                    return_input=True)

        target = output_dict['target']
        feature = output_dict['feature']

        # feature must be 3-D; only the middle (frames) axis is needed here.
        (_, segment_frames, _) = feature.shape
        # float() keeps the duration fractional under Python 2 integer
        # division as well.
        segment_duration = segment_frames / float(self.frames_per_second)

        # Plot log mel spectrogram of different sound classes on a fixed
        # 10 x 8 grid: class k is drawn in cell (k // 8, k % 8).
        rows_num = 10
        cols_num = 8

        fig, axs = plt.subplots(rows_num, cols_num, figsize=(15, 15))

        scalar = self.data_generator.scalar

        for k in range(self.classes_num):
            for n, _ in enumerate(output_dict['audio_name']):
                if target[n, k] == 1:
                    ax = axs[k // cols_num, k % cols_num]
                    # Titles are cut to 20 characters to fit the small panels.
                    ax.set_title(self.idx_to_lb[k][0:20],
                                 color='r',
                                 fontsize=9)
                    logmel = inverse_scale(feature[n],
                                           scalar['mean'],
                                           scalar['std'])
                    ax.matshow(logmel.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                    ax.set_xticks([0, segment_frames])
                    ax.set_xticklabels(
                        ['0', '{:.1f} s'.format(segment_duration)], fontsize=6)
                    ax.xaxis.set_ticks_position('bottom')
                    ax.set_ylabel('Mel bins', fontsize=7)
                    ax.set_yticks([])
                    # Only the first matching segment is shown for each class.
                    break

        # Hide the grid cells that have no class assigned to them.
        for k in range(self.classes_num, rows_num * cols_num):
            axs[k // cols_num, k % cols_num].set_visible(False)

        fig.tight_layout(pad=0, w_pad=0, h_pad=0)
        fig.savefig(save_fig_path)
        # Release the figure: pyplot keeps every figure alive until it is
        # closed, so repeated calls would otherwise accumulate memory.
        plt.close(fig)
        logging.info('Save figure to {}'.format(save_fig_path))
Example #6
0
 def inverse_transform(self, x):
     '''Return inverse_scale applied to x with this object's mean_x and
     std_x.
     '''
     mean, std = self.mean_x, self.std_x
     return inverse_scale(x, mean, std)