import itertools
import os

import mir_eval
import numpy as np
import tqdm

import util


def save_filtered_result(self):
    # NOTE: util.write_wav is called elsewhere in this module as
    # util.write_wav(signal, filepath, sample_rate); this single-argument
    # call looks incomplete as written.
    util.write_wav(self.filtered_wav)
def explain(model, input, output_filename_prefix, sample_rate, output_path):
    batch_size = 1
    if len(input['noisy']) < model.receptive_field_length:
        raise ValueError('Input is not long enough to be used with this model.')

    num_output_samples = input['noisy'].shape[0] - (model.receptive_field_length - 1)
    num_fragments = int(np.ceil(num_output_samples / model.target_field_length))
    num_batches = int(np.ceil(num_fragments / batch_size))

    denoised_output = []
    noise_output = []
    num_pad_values = 0
    fragment_i = 0
    for batch_i in tqdm.tqdm(range(0, num_batches)):

        if batch_i == num_batches - 1:  # If it's the last batch
            batch_size = num_fragments - batch_i * batch_size

        input_batch = np.zeros((batch_size, model.input_length))

        # Assemble batch
        for batch_fragment_i in range(0, batch_size):
            if fragment_i + model.target_field_length > num_output_samples:
                # Last fragment: zero-pad the remainder up to a full input window.
                remainder = input['noisy'][fragment_i:]
                current_fragment = np.zeros((model.input_length,))
                current_fragment[:remainder.shape[0]] = remainder
                num_pad_values = model.input_length - remainder.shape[0]
            else:
                current_fragment = input['noisy'][fragment_i:fragment_i + model.input_length]
            input_batch[batch_fragment_i, :] = current_fragment
            fragment_i += model.target_field_length

        denoised_output_fragments = model.denoise_batch({'data_input': input_batch})
        layer_outputs = model.get_layer_outputs(input_batch)
        plot_layer_outputs(layer_outputs, 2, output_path)  # assumed defined elsewhere in this module

        if type(denoised_output_fragments) is list:
            noise_output_fragment = denoised_output_fragments[1]
            denoised_output_fragment = denoised_output_fragments[0]

        # Keep only the valid target field of each fragment.
        denoised_output_fragment = denoised_output_fragment[:, model.target_padding:model.target_padding + model.target_field_length]
        denoised_output_fragment = denoised_output_fragment.flatten().tolist()

        if noise_output_fragment is not None:
            noise_output_fragment = noise_output_fragment[:, model.target_padding:model.target_padding + model.target_field_length]
            noise_output_fragment = noise_output_fragment.flatten().tolist()

        if type(denoised_output_fragments) is float:
            denoised_output_fragment = [denoised_output_fragment]
        if type(noise_output_fragment) is float:
            noise_output_fragment = [noise_output_fragment]

        denoised_output = denoised_output + denoised_output_fragment
        noise_output = noise_output + noise_output_fragment

    denoised_output = np.array(denoised_output)
    noise_output = np.array(noise_output)

    if num_pad_values != 0:
        denoised_output = denoised_output[:-num_pad_values]
        noise_output = noise_output[:-num_pad_values]

    valid_noisy_signal = input['noisy'][model.half_receptive_field_length:model.half_receptive_field_length + len(denoised_output)]

    if input['clean'] is not None:
        input['noise'] = input['noisy'] - input['clean']
        valid_clean_signal = input['clean'][model.half_receptive_field_length:model.half_receptive_field_length + len(denoised_output)]
        noise_in_denoised_output = denoised_output - valid_clean_signal

        rms_clean = util.rms(valid_clean_signal)
        rms_noise_out = util.rms(noise_in_denoised_output)
        rms_noise_in = util.rms(input['noise'])

        new_snr_db = int(np.round(util.snr_db(rms_clean, rms_noise_out)))
        initial_snr_db = int(np.round(util.snr_db(rms_clean, rms_noise_in)))

        output_clean_filename = output_filename_prefix + 'clean.wav'
        output_clean_filepath = os.path.join(output_path, output_clean_filename)
        util.write_wav(valid_clean_signal, output_clean_filepath, sample_rate)

        output_denoised_filename = output_filename_prefix + 'denoised_%ddB.wav' % new_snr_db
        output_noisy_filename = output_filename_prefix + 'noisy_%ddB.wav' % initial_snr_db
    else:
        output_denoised_filename = output_filename_prefix + 'denoised.wav'
        output_noisy_filename = output_filename_prefix + 'noisy.wav'

    output_noise_filename = output_filename_prefix + 'noise.wav'

    output_denoised_filepath = os.path.join(output_path, output_denoised_filename)
    output_noisy_filepath = os.path.join(output_path, output_noisy_filename)
    output_noise_filepath = os.path.join(output_path, output_noise_filename)

    util.write_wav(denoised_output, output_denoised_filepath, sample_rate)
    util.write_wav(valid_noisy_signal, output_noisy_filepath, sample_rate)
    util.write_wav(noise_output, output_noise_filepath, sample_rate)
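# Illustrative sketch (not part of the original pipeline): the fragmentation
# loops above assume each window of `input_length` samples yields
# `target_field_length` valid output samples, the rest being receptive-field
# context. The helper below mirrors that window-count arithmetic with plain
# numbers; the example values are invented for the sketch.
def _fragment_count_sketch(signal_length, receptive_field_length, target_field_length):
    # Samples for which the model can emit valid output after discarding context.
    num_output_samples = signal_length - (receptive_field_length - 1)
    # Windows needed to cover them, matching num_fragments in the functions above.
    return int(np.ceil(num_output_samples / target_field_length))

# e.g. _fragment_count_sketch(16000, 3001, 1601) == 9 windows for a 1 s, 16 kHz clip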
def separate_sample(model, input, batch_size, output_filename_prefix, sample_rate, output_path, target):

    if target == 'singing-voice':
        if len(input['mixture']) < model.receptive_field_length:
            raise ValueError('Input is not long enough to be used with this model.')

        num_output_samples = input['mixture'].shape[0] - (model.receptive_field_length - 1)
        num_fragments = int(np.ceil(num_output_samples / model.target_field_length))
        num_batches = int(np.ceil(num_fragments / batch_size))

        vocals_output = []
        num_pad_values = 0
        fragment_i = 0
        for batch_i in tqdm.tqdm(range(0, num_batches)):

            if batch_i == num_batches - 1:  # If it's the last batch
                batch_size = num_fragments - batch_i * batch_size

            input_batch = np.zeros((batch_size, model.input_length))

            # Assemble batch
            for batch_fragment_i in range(0, batch_size):
                if fragment_i + model.target_field_length > num_output_samples:
                    remainder = input['mixture'][fragment_i:]
                    current_fragment = np.zeros((model.input_length,))
                    current_fragment[:remainder.shape[0]] = remainder
                    num_pad_values = model.input_length - remainder.shape[0]
                else:
                    current_fragment = input['mixture'][fragment_i:fragment_i + model.input_length]
                input_batch[batch_fragment_i, :] = current_fragment
                fragment_i += model.target_field_length

            separated_output_fragments = model.separate_batch({'data_input': input_batch})

            if type(separated_output_fragments) is list:
                vocals_output_fragment = separated_output_fragments[0]

            vocals_output_fragment = vocals_output_fragment[:, model.target_padding:model.target_padding + model.target_field_length]
            vocals_output_fragment = vocals_output_fragment.flatten().tolist()

            if type(separated_output_fragments) is float:
                vocals_output_fragment = [vocals_output_fragment]

            vocals_output = vocals_output + vocals_output_fragment

        vocals_output = np.array(vocals_output)

        if num_pad_values != 0:
            vocals_output = vocals_output[:-num_pad_values]

        mixture_valid_signal = input['mixture'][model.half_receptive_field_length:model.half_receptive_field_length + len(vocals_output)]
        # The accompaniment is the residual of the mixture after removing vocals.
        accompaniment_output = mixture_valid_signal - vocals_output

        output_vocals_filename = output_filename_prefix + '_vocals.wav'
        output_accompaniment_filename = output_filename_prefix + '_accompaniment.wav'

        output_vocals_filepath = os.path.join(output_path, output_vocals_filename)
        output_accompaniment_filepath = os.path.join(output_path, output_accompaniment_filename)

        util.write_wav(vocals_output, output_vocals_filepath, sample_rate)
        util.write_wav(accompaniment_output, output_accompaniment_filepath, sample_rate)

    if target == 'multi-instrument':
        if len(input['mixture']) < model.receptive_field_length:
            raise ValueError('Input is not long enough to be used with this model.')

        num_output_samples = input['mixture'].shape[0] - (model.receptive_field_length - 1)
        num_fragments = int(np.ceil(num_output_samples / model.target_field_length))
        num_batches = int(np.ceil(num_fragments / batch_size))

        vocals_output = []
        drums_output = []
        bass_output = []
        num_pad_values = 0
        fragment_i = 0
        for batch_i in tqdm.tqdm(range(0, num_batches)):

            if batch_i == num_batches - 1:  # If it's the last batch
                batch_size = num_fragments - batch_i * batch_size

            input_batch = np.zeros((batch_size, model.input_length))

            # Assemble batch
            for batch_fragment_i in range(0, batch_size):
                if fragment_i + model.target_field_length > num_output_samples:
                    remainder = input['mixture'][fragment_i:]
                    current_fragment = np.zeros((model.input_length,))
                    current_fragment[:remainder.shape[0]] = remainder
                    num_pad_values = model.input_length - remainder.shape[0]
                else:
                    current_fragment = input['mixture'][fragment_i:fragment_i + model.input_length]
                input_batch[batch_fragment_i, :] = current_fragment
                fragment_i += model.target_field_length

            separated_output_fragments = model.separate_batch({'data_input': input_batch})

            if type(separated_output_fragments) is list:
                vocals_output_fragment = separated_output_fragments[0]
                drums_output_fragment = separated_output_fragments[1]
                bass_output_fragment = separated_output_fragments[2]

            vocals_output_fragment = vocals_output_fragment[:, model.target_padding:model.target_padding + model.target_field_length]
            vocals_output_fragment = vocals_output_fragment.flatten().tolist()

            drums_output_fragment = drums_output_fragment[:, model.target_padding:model.target_padding + model.target_field_length]
            drums_output_fragment = drums_output_fragment.flatten().tolist()

            bass_output_fragment = bass_output_fragment[:, model.target_padding:model.target_padding + model.target_field_length]
            bass_output_fragment = bass_output_fragment.flatten().tolist()

            if type(separated_output_fragments) is float:
                vocals_output_fragment = [vocals_output_fragment]
            if type(drums_output_fragment) is float:
                drums_output_fragment = [drums_output_fragment]
            if type(bass_output_fragment) is float:
                bass_output_fragment = [bass_output_fragment]

            vocals_output = vocals_output + vocals_output_fragment
            drums_output = drums_output + drums_output_fragment
            bass_output = bass_output + bass_output_fragment

        vocals_output = np.array(vocals_output)
        drums_output = np.array(drums_output)
        bass_output = np.array(bass_output)

        if num_pad_values != 0:
            vocals_output = vocals_output[:-num_pad_values]
            drums_output = drums_output[:-num_pad_values]
            bass_output = bass_output[:-num_pad_values]

        mixture_valid_signal = input['mixture'][model.half_receptive_field_length:model.half_receptive_field_length + len(vocals_output)]
        # 'Other' is whatever remains of the mixture after the three predicted stems.
        other_output = mixture_valid_signal - vocals_output - drums_output - bass_output

        output_vocals_filename = output_filename_prefix + '_vocals.wav'
        output_drums_filename = output_filename_prefix + '_drums.wav'
        output_bass_filename = output_filename_prefix + '_bass.wav'
        output_other_filename = output_filename_prefix + '_other.wav'

        output_vocals_filepath = os.path.join(output_path, output_vocals_filename)
        output_drums_filepath = os.path.join(output_path, output_drums_filename)
        output_bass_filepath = os.path.join(output_path, output_bass_filename)
        output_other_filepath = os.path.join(output_path, output_other_filename)

        util.write_wav(vocals_output, output_vocals_filepath, sample_rate)
        util.write_wav(drums_output, output_drums_filepath, sample_rate)
        util.write_wav(bass_output, output_bass_filepath, sample_rate)
        util.write_wav(other_output, output_other_filepath, sample_rate)
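# Usage sketch for separate_sample(); the loading call is an assumption for
# illustration (librosa is not imported in this module) and `model` stands in
# for however the project constructs its trained separation model:
#
#   import librosa
#   mixture, sr = librosa.load('mix.wav', sr=None, mono=True)
#   separate_sample(model, {'mixture': mixture}, batch_size=8,
#                   output_filename_prefix='song01', sample_rate=sr,
#                   output_path='./out', target='singing-voice')
#
# With target='singing-voice' this writes song01_vocals.wav plus the residual
# song01_accompaniment.wav; with target='multi-instrument' it writes vocals,
# drums, bass, and an 'other' stem (mixture minus the three predicted stems).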
def denoise_sample(model, input, condition_input, batch_size, output_filename_prefix,
                   sample_rate, n_spk, n_channel, output_path, save_wav=False,
                   spk_gender=None, use_pit=False, pad=False):

    if pad:
        # Zero-pad half a receptive field on each side so the valid output
        # covers the whole original signal.
        noisy_pad = np.zeros((model.half_receptive_field_length * 2 + len(input['noisy'])))
        noisy_pad[model.half_receptive_field_length:model.half_receptive_field_length + len(input['noisy'])] = input['noisy']
        input['noisy'] = noisy_pad

    if len(input['noisy']) < model.receptive_field_length:
        raise ValueError('Input is not long enough to be used with this model.')

    num_output_samples = input['noisy'].shape[0] - (model.receptive_field_length - 1)
    num_fragments = int(np.ceil(num_output_samples / model.target_field_length))
    num_batches = int(np.ceil(num_fragments / batch_size))

    # Per-channel gender bookkeeping from an earlier revision; it is returned
    # but no longer updated anywhere in this function.
    ch_gender = {'ch1': {'M': 0, 'F': 0}, 'ch2': {'M': 0, 'F': 0}}

    output = [[] for _ in range(n_channel)]
    num_pad_values = 0
    fragment_i = 0
    for batch_i in tqdm.tqdm(range(0, num_batches)):

        if batch_i == num_batches - 1:  # If it's the last batch
            batch_size = num_fragments - batch_i * batch_size

        # condition_batch = np.array([condition_input, ] * batch_size, dtype='uint8')
        input_batch = np.zeros((batch_size, model.input_length))

        # Assemble batch
        for batch_fragment_i in range(0, batch_size):
            if fragment_i + model.target_field_length > num_output_samples:
                remainder = input['noisy'][fragment_i:]
                current_fragment = np.zeros((model.input_length,))
                current_fragment[:remainder.shape[0]] = remainder
                num_pad_values = model.input_length - remainder.shape[0]
            else:
                current_fragment = input['noisy'][fragment_i:fragment_i + model.input_length]
            input_batch[batch_fragment_i, :] = current_fragment
            fragment_i += model.target_field_length

        # Stack an all-zero second channel onto the noisy batch, giving
        # shape (batch_size, 2, input_length).
        input_batch = np.concatenate([np.expand_dims(input_batch, 0),
                                      np.zeros_like(np.expand_dims(input_batch, 0))])
        input_batch = np.transpose(input_batch, (1, 0, 2))

        output_fragments = model.denoise_batch({'data_input': input_batch})
        output_fragments = output_fragments[:, :, model.target_padding:model.target_padding + model.target_field_length]
        for i in range(n_channel):
            output[i] += output_fragments[:, i].flatten().tolist()

    output = np.array(output)

    if num_pad_values != 0:
        output = output[:, :-num_pad_values]

    voice_len = len(output[0])
    valid_noisy_signal = input['noisy'][model.half_receptive_field_length:model.half_receptive_field_length + voice_len]
    valid_clean_signal_1 = input['clean_1'][model.half_receptive_field_length:model.half_receptive_field_length + voice_len] if not pad else input['clean_1']
    valid_clean_signal_2 = input['clean_2'][model.half_receptive_field_length:model.half_receptive_field_length + voice_len] if not pad else input['clean_2']

    if use_pit:
        # Resolve the output-to-speaker permutation fragment by fragment.
        pit_output_1 = []
        pit_output_2 = []
        pit_idx1 = []
        pit_idx2 = []
        for f in range(num_fragments):
            c1 = valid_clean_signal_1[f * model.target_field_length:(f + 1) * model.target_field_length]
            c2 = valid_clean_signal_2[f * model.target_field_length:(f + 1) * model.target_field_length]
            o = output[:, f * model.target_field_length:(f + 1) * model.target_field_length]

            perms = np.array(list(itertools.permutations(range(n_channel), n_spk)))
            perms_onehot = (np.arange(perms.max() + 1) == perms[..., None]).astype(int)
            cross_loss = np.expand_dims(np.array([c1, c2]), 1) - np.expand_dims(o, 0)
            cross_loss_abs = np.sum(np.abs(cross_loss), 2)
            loss_sets = np.einsum('ij,pij->p', cross_loss_abs, perms_onehot)
            best_perm = perms[np.argmin(loss_sets)]

            pit_output_1 += o[best_perm[0]].tolist()
            pit_output_2 += o[best_perm[1]].tolist()
            pit_idx1.append(best_perm[0])
            pit_idx2.append(best_perm[1])

        clean_wav = np.array([valid_clean_signal_1, valid_clean_signal_2])
        noisy_wav = np.array([pit_output_1, pit_output_2])

        _sdr1, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(clean_wav[0], noisy_wav[0])
        _sdr2, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(clean_wav[1], noisy_wav[1])

        return np.array([_sdr1, _sdr2]), ch_gender, [pit_idx1, pit_idx2]
    else:
        # Resolve a single permutation over the whole utterance.
        clean_wav = np.array([valid_clean_signal_1, valid_clean_signal_2])

        perms = np.array(list(itertools.permutations(range(n_channel), n_spk)))
        perms_onehot = (np.arange(perms.max() + 1) == perms[..., None]).astype(int)
        cross_loss = np.expand_dims(clean_wav, 1) - np.expand_dims(output, 0)
        cross_loss_abs = np.sum(np.abs(cross_loss), 2)
        loss_sets = np.einsum('ij,pij->p', cross_loss_abs, perms_onehot)
        best_perm = perms[np.argmin(loss_sets)]

        _sdr1, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(clean_wav[0], output[best_perm[0]])
        _sdr2, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(clean_wav[1], output[best_perm[1]])

        return np.array([_sdr1, _sdr2]), ch_gender, best_perm

    # NOTE: unreachable as written, since both branches above return first.
    if save_wav:
        output_original_filename = output_filename_prefix + 'orig.wav'
        output_s1_filename = output_filename_prefix + 's1.wav'
        output_s2_filename = output_filename_prefix + 's2.wav'

        output_original_filepath = os.path.join(output_path, output_original_filename)
        output_s1_filepath = os.path.join(output_path, output_s1_filename)
        output_s2_filepath = os.path.join(output_path, output_s2_filename)

        print(output_original_filepath)  # was output_denoised_filepath, which is undefined here
        util.write_wav(valid_noisy_signal, output_original_filepath, sample_rate)
        util.write_wav(output[0], output_s1_filepath, sample_rate)  # was the undefined output_1
        util.write_wav(output[1], output_s2_filepath, sample_rate)  # was the undefined output_2
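# Sketch of the PIT-style permutation matching used in denoise_sample(), with
# toy arrays; shapes and numbers here are invented for illustration. For every
# assignment of output channels to reference sources it sums the L1 error and
# keeps the cheapest permutation, mirroring the perms/einsum pattern above.
def _best_permutation_sketch(references, outputs):
    # references: (n_spk, T), outputs: (n_channel, T)
    n_spk, n_channel = references.shape[0], outputs.shape[0]
    perms = np.array(list(itertools.permutations(range(n_channel), n_spk)))
    # One-hot encode each permutation: (n_perms, n_spk, n_channel).
    perms_onehot = (np.arange(n_channel) == perms[..., None]).astype(int)
    # L1 loss between every (reference, output) pair: (n_spk, n_channel).
    cross_loss_abs = np.sum(np.abs(references[:, None, :] - outputs[None, :, :]), axis=2)
    # Total loss per permutation; pick the assignment that minimises it.
    loss_sets = np.einsum('ij,pij->p', cross_loss_abs, perms_onehot)
    return perms[np.argmin(loss_sets)]

# e.g. two references against swapped outputs recover the swap:
#   refs = np.array([[1., 1., 1.], [0., 0., 0.]])
#   outs = np.array([[0.1, 0., 0.], [0.9, 1., 1.]])
#   _best_permutation_sketch(refs, outs)  # -> array([1, 0])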