def visualize(self, data_type, max_iteration=None):
    '''Visualize the log mel spectrogram of one audio clip per coarse class.

    For each coarse class, the first clip whose target probability for that
    class exceeds 0.5 is plotted; unused subplot cells are hidden.

    Args:
      data_type: 'train' | 'validate'
      max_iteration: None | int, use maximum iteration of partial data for
          fast evaluation
    '''
    audio_duration = config.audio_duration
    frames_num = config.frames_num
    coarse_classes_num = config.coarse_classes_num
    coarse_idx_to_lb = config.coarse_idx_to_lb

    generate_func = self.data_generator.generate_validate(
        data_type=data_type, max_iteration=max_iteration)

    # Forward
    output_dict = forward(
        model=self.model,
        generate_func=generate_func,
        cuda=self.cuda,
        return_input=True,
        return_target=True)

    rows_num = 3
    cols_num = 3

    fig, axs = plt.subplots(rows_num, cols_num, figsize=(10, 5))

    # For each coarse class, plot the first clip tagged with that class
    for k in range(coarse_classes_num):
        for n, audio_name in enumerate(output_dict['audio_name']):
            if output_dict['coarse_target'][n, k] > 0.5:
                row = k // cols_num
                col = k % cols_num
                # Bug fix: the original assigned `title = coarse_idx_to_lb[k]`
                # and immediately overwrote it; the dead assignment is removed.
                title = '{}\n{}'.format(coarse_idx_to_lb[k], audio_name)
                axs[row, col].set_title(title, color='r')
                logmel = inverse_scale(
                    output_dict['feature'][n],
                    self.data_generator.scalar['mean'],
                    self.data_generator.scalar['std'])
                axs[row, col].matshow(logmel.T, origin='lower',
                    aspect='auto', cmap='jet')
                axs[row, col].set_xticks([0, frames_num])
                axs[row, col].set_xticklabels(
                    ['0', '{:.1f} s'.format(audio_duration)])
                axs[row, col].xaxis.set_ticks_position('bottom')
                axs[row, col].set_ylabel('Mel bins')
                axs[row, col].set_yticks([])
                break

    # Hide grid cells beyond the number of coarse classes
    for k in range(coarse_classes_num, rows_num * cols_num):
        row = k // cols_num
        col = k % cols_num
        axs[row, col].set_visible(False)

    fig.tight_layout(pad=0, w_pad=0, h_pad=0)
    plt.show()
def visualize(self, data_type, source, max_iteration=None):
    '''Visualize log mel spectrogram of different sound classes.

    For each in-domain class, the first clip labeled with that class is
    plotted; unused subplot cells are hidden.

    Args:
      data_type: 'train' | 'validate'
      source: 'a' | 'b' | 'c'
      max_iteration: None | int, maximum iteration to run to speed up
          evaluation
    '''
    audio_duration = config.audio_duration
    frames_num = config.frames_num
    # NOTE(review): the -1 presumably excludes a final out-of-domain label
    # from config.labels — confirm against config.
    in_domain_classes_num = len(config.labels) - 1
    idx_to_lb = config.idx_to_lb

    generate_func = self.data_generator.generate_validate(
        data_type=data_type, source=source, max_iteration=max_iteration)

    # Forward
    output_dict = forward(
        model=self.model,
        generate_func=generate_func,
        cuda=self.cuda,
        return_input=True,
        return_target=True)

    # Plot log mel spectrogram of different sound classes
    rows_num = 3
    cols_num = 4
    fig, axs = plt.subplots(rows_num, cols_num, figsize=(10, 5))

    for k in range(in_domain_classes_num):
        for n, audio_name in enumerate(output_dict['audio_name']):
            if output_dict['target'][n, k] == 1:
                title = idx_to_lb[k]
                row = k // cols_num
                col = k % cols_num
                axs[row, col].set_title(title, color='r')
                logmel = inverse_scale(
                    output_dict['feature'][n],
                    self.data_generator.scalar['mean'],
                    self.data_generator.scalar['std'])
                axs[row, col].matshow(logmel.T, origin='lower',
                    aspect='auto', cmap='jet')
                axs[row, col].set_xticks([0, frames_num])
                axs[row, col].set_xticklabels(
                    ['0', '{:.1f} s'.format(audio_duration)])
                axs[row, col].xaxis.set_ticks_position('bottom')
                axs[row, col].set_ylabel('Mel bins')
                axs[row, col].set_yticks([])
                break

    # Hide grid cells beyond the number of in-domain classes
    for k in range(in_domain_classes_num, rows_num * cols_num):
        row = k // cols_num
        col = k % cols_num
        axs[row, col].set_visible(False)

    fig.tight_layout(pad=0, w_pad=0, h_pad=0)
    plt.show()
def visualize(self, data_type, max_validate_num=None):
    '''Visualize the log mel spectrogram, reference and prediction of
    sound events.

    One figure is produced per audio clip: the spectrogram, the reference
    event roll and the predicted event roll stacked in the left column.

    Args:
      data_type: 'train' | 'validate'
      max_validate_num: None | int, maximum iteration to run to speed up
          evaluation
    '''
    mel_bins = config.mel_bins
    frames_per_second = config.frames_per_second
    classes_num = config.classes_num
    labels = config.labels

    # Forward
    generate_func = self.data_generator.generate_validate(
        data_type=data_type, max_validate_num=max_validate_num)

    list_dict = forward(
        model=self.model,
        generate_func=generate_func,
        cuda=self.cuda,
        return_input=True,
        return_target=True)

    # One figure per audio clip.  The loop variable was renamed from `dict`
    # to avoid shadowing the builtin; the unused enumerate index is dropped.
    for data_dict in list_dict:
        print('File: {}'.format(data_dict['name']))

        frames_num = data_dict['target_event'].shape[1]
        length_in_second = frames_num / float(frames_per_second)

        fig, axs = plt.subplots(4, 2, figsize=(15, 10))

        logmel = inverse_scale(
            data_dict['feature'][0][0],
            self.data_generator.scalar['mean'],
            self.data_generator.scalar['std'])
        axs[0, 0].matshow(logmel.T, origin='lower', aspect='auto',
            cmap='jet')
        axs[1, 0].matshow(data_dict['target_event'][0].T, origin='lower',
            aspect='auto', cmap='jet')
        axs[2, 0].matshow(data_dict['output_event'][0].T, origin='lower',
            aspect='auto', cmap='jet')

        axs[0, 0].set_title('Log mel spectrogram', color='r')
        axs[1, 0].set_title('Reference sound events', color='r')
        axs[2, 0].set_title('Predicted sound events', color='b')

        # Shared axis formatting; only column 0 is used (the original
        # `for j in range(1)` inner loop visited j == 0 only).
        for i in range(4):
            axs[i, 0].set_xticks([0, frames_num])
            axs[i, 0].set_xticklabels(
                ['0', '{:.1f} s'.format(length_in_second)])
            axs[i, 0].xaxis.set_ticks_position('bottom')
            axs[i, 0].set_yticks(np.arange(classes_num))
            axs[i, 0].set_yticklabels(labels)
            axs[i, 0].yaxis.grid(color='w', linestyle='solid',
                linewidth=0.2)

        # The spectrogram panel shows mel bins, not class labels
        axs[0, 0].set_ylabel('Mel bins')
        axs[0, 0].set_yticks([0, mel_bins])
        axs[0, 0].set_yticklabels([0, mel_bins])

        # Hide unused panels: bottom-left and the whole right column
        axs[3, 0].set_visible(False)
        axs[0, 1].set_visible(False)
        axs[1, 1].set_visible(False)
        axs[2, 1].set_visible(False)
        axs[3, 1].set_visible(False)

        fig.tight_layout()
        plt.show()
def visualize(self, data_type, max_iteration=None):
    '''Visualize logmel spectrogram, reference and prediction.

    For each audio clip, prints the clipwise target and output per class,
    then plots the spectrogram, reference event roll, framewise prediction
    and thresholded eventwise prediction.

    Args:
      data_type: 'train' | 'validate'
      max_iteration: None | int, maximum iteration to run to speed up
          evaluation
    '''
    mel_bins = config.mel_bins
    audio_duration = config.audio_duration
    labels = config.labels

    # Forward.  Bug fix: the original created the generator with
    # max_iteration and then immediately re-created it WITHOUT the
    # argument, so max_iteration was silently ignored.
    generate_func = self.data_generator.generate_validate(
        data_type=data_type, max_iteration=max_iteration)

    output_dict = forward(
        model=self.model,
        generate_func=generate_func,
        cuda=self.cuda,
        return_input=True,
        return_target=True)

    (audios_num, frames_num, classes_num) = \
        output_dict['framewise_output'].shape

    for n in range(audios_num):
        print('File: {}'.format(output_dict['audio_name'][n]))

        # Clipwise target and predicted probability per class
        for k in range(classes_num):
            print('{:<20}{:<8}{:.3f}'.format(
                labels[k],
                output_dict['weak_target'][n, k],
                output_dict['clipwise_output'][n, k]))

        # Eventwise prediction: run activity detection only on classes
        # whose clipwise probability exceeds the high threshold
        event_prediction = np.zeros((frames_num, classes_num))
        for k in range(classes_num):
            if output_dict['clipwise_output'][n, k] \
                > self.sed_params_dict['sed_high_threshold']:

                bgn_fin_pairs = activity_detection(
                    x=output_dict['framewise_output'][n, :, k],
                    thres=self.sed_params_dict['sed_high_threshold'],
                    low_thres=self.sed_params_dict['sed_low_threshold'],
                    n_smooth=self.sed_params_dict['n_smooth'],
                    n_salt=self.sed_params_dict['n_salt'])

                for pair in bgn_fin_pairs:
                    event_prediction[pair[0]:pair[1], k] = 1

        # Plot
        fig, axs = plt.subplots(4, 1, figsize=(10, 8))

        logmel = inverse_scale(
            output_dict['feature'][n],
            self.data_generator.scalar['mean'],
            self.data_generator.scalar['std'])
        axs[0].matshow(logmel.T, origin='lower', aspect='auto', cmap='jet')

        # Strong (framewise) targets are only present for some data sets
        if 'strong_target' in output_dict.keys():
            axs[1].matshow(output_dict['strong_target'][n].T,
                origin='lower', aspect='auto', cmap='jet')

        # Framewise output gated by the clipwise probability
        masked_framewise_output = output_dict['framewise_output'][n] \
            * output_dict['clipwise_output'][n]
        axs[2].matshow(masked_framewise_output.T, origin='lower',
            aspect='auto', cmap='jet')
        axs[3].matshow(event_prediction.T, origin='lower', aspect='auto',
            cmap='jet')

        axs[0].set_title('Log mel spectrogram', color='r')
        axs[1].set_title('Reference sound events', color='r')
        axs[2].set_title('Framewise prediction', color='b')
        axs[3].set_title('Eventwise prediction', color='b')

        for i in range(4):
            axs[i].set_xticks([0, frames_num])
            axs[i].set_xticklabels(
                ['0', '{:.1f} s'.format(audio_duration)])
            axs[i].xaxis.set_ticks_position('bottom')
            axs[i].set_yticks(np.arange(classes_num))
            axs[i].set_yticklabels(labels)
            axs[i].yaxis.grid(color='w', linestyle='solid', linewidth=0.2)

        # The spectrogram panel shows mel bins, not class labels
        axs[0].set_ylabel('Mel bins')
        axs[0].set_yticks([0, mel_bins])
        axs[0].set_yticklabels([0, mel_bins])

        fig.tight_layout()
        plt.show()
def visualize(self, data_type, target_source, save_fig_path,
    max_iteration=None):
    '''Visualize log mel spectrograms of different sound classes and save
    the figure to disk.

    For each class, the first segment labeled with that class is plotted;
    unused subplot cells are hidden.

    Args:
      data_type: 'train' | 'validate'
      target_source: 'curated' | 'noisy'
      save_fig_path: string, path to save figure
      max_iteration: None | int, maximum iteration to run to speed up
          evaluation
    '''
    generate_func = self.data_generator.generate_validate(
        data_type=data_type,
        target_source=target_source,
        max_iteration=max_iteration)

    # Results of segments
    output_dict = forward_infer(
        model=self.model,
        generate_func=generate_func,
        cuda=self.cuda,
        return_target=True,
        return_input=True)

    target = output_dict['target']
    feature = output_dict['feature']
    # Only the frame count is needed; the original unpacked the full shape
    # into unused locals, and also bound `output` without using it.
    segment_frames = feature.shape[1]
    segment_duration = segment_frames / self.frames_per_second

    # Plot log mel spectrogram of different sound classes
    rows_num = 10
    cols_num = 8
    fig, axs = plt.subplots(rows_num, cols_num, figsize=(15, 15))

    for k in range(self.classes_num):
        for n, audio_name in enumerate(output_dict['audio_name']):
            if target[n, k] == 1:
                # Truncate long class names so titles fit the small cells
                title = self.idx_to_lb[k][0:20]
                row = k // cols_num
                col = k % cols_num
                axs[row, col].set_title(title, color='r', fontsize=9)
                logmel = inverse_scale(
                    feature[n],
                    self.data_generator.scalar['mean'],
                    self.data_generator.scalar['std'])
                axs[row, col].matshow(logmel.T, origin='lower',
                    aspect='auto', cmap='jet')
                axs[row, col].set_xticks([0, segment_frames])
                axs[row, col].set_xticklabels(
                    ['0', '{:.1f} s'.format(segment_duration)], fontsize=6)
                axs[row, col].xaxis.set_ticks_position('bottom')
                axs[row, col].set_ylabel('Mel bins', fontsize=7)
                axs[row, col].set_yticks([])
                break

    # Hide grid cells beyond the number of classes
    for k in range(self.classes_num, rows_num * cols_num):
        row = k // cols_num
        col = k % cols_num
        axs[row, col].set_visible(False)

    plt.tight_layout(pad=0, w_pad=0, h_pad=0)
    plt.savefig(save_fig_path)
    logging.info('Save figure to {}'.format(save_fig_path))
def inverse_transform(self, x):
    '''Map scaled features back toward their original value range.

    Args:
      x: scaled feature array

    Returns:
      The result of inverse_scale applied with the stored mean and std.
    '''
    mean = self.mean_x
    std = self.std_x
    return inverse_scale(x, mean, std)