def run_one_epoch_eval(self, epoch):
    """Run one validation epoch with recursive one-vs-rest separation.

    Saves up to 5 listening samples per observed source count under
    ``self.sample_dir``.

    Args:
        epoch (int): 0-indexed epoch number (file names use ``epoch + 1``).

    Returns:
        int: always -1; this variant does not compute a validation loss.
    """
    self.model.eval()
    # Number of sample sets already saved, keyed by source count.
    n_sources_count = {}
    with torch.no_grad():
        for idx, (mixture, sources, segment_IDs) in enumerate(self.valid_loader):
            if self.use_cuda:
                mixture = mixture.cuda()
                sources = sources.cuda()
            # `sources` arrives packed because the source count varies per item.
            sources, n_sources = nn.utils.rnn.pad_packed_sequence(
                sources, batch_first=True)
            n_sources = n_sources.tolist()
            # Each model call splits the input into (one source, residual).
            output_one_and_rest = self.model(mixture)
            output_one = output_one_and_rest[:, 0:1]
            output_rest = output_one_and_rest[:, 1:]
            output = output_one
            # NOTE(review): n_sources[0] is applied to the whole batch —
            # assumes a homogeneous source count per batch; confirm.
            for source_idx in range(1, n_sources[0] - 1):
                output_one_and_rest = self.model(output_rest)
                output_one = output_one_and_rest[:, 0:1]
                output_rest = output_one_and_rest[:, 1:]
                output = torch.cat([output, output_one], dim=1)
            output = torch.cat([output, output_rest], dim=1)
            # Idiomatic membership test (was `if not ... in ... .keys()`).
            if n_sources[0] not in n_sources_count:
                n_sources_count[n_sources[0]] = 0
            if n_sources_count[n_sources[0]] < 5:
                mixture = mixture[0].squeeze(dim=0).detach().cpu().numpy()
                estimated_sources = output[0].detach().cpu().numpy()
                save_dir = os.path.join(self.sample_dir, segment_IDs[0])
                os.makedirs(save_dir, exist_ok=True)
                save_path = os.path.join(save_dir, "mixture.wav")
                # Peak-normalize before writing.
                norm = np.abs(mixture).max()
                mixture = mixture / norm
                write_wav(save_path, signal=mixture, sr=self.sr)
                for source_idx, estimated_source in enumerate(
                        estimated_sources):
                    save_path = os.path.join(
                        save_dir,
                        "epoch{}-{}.wav".format(epoch + 1, source_idx + 1))
                    norm = np.abs(estimated_source).max()
                    estimated_source = estimated_source / norm
                    write_wav(save_path, signal=estimated_source, sr=self.sr)
                n_sources_count[n_sources[0]] += 1
    return -1
def _test(method='AuxLaplaceIVA'):
    """Separate a reverberant 2-speaker mixture with one IVA variant.

    Writes the estimated sources and the convergence curve under
    ``data/IVA/<method>/``.

    Args:
        method (str): one of 'GradLaplaceIVA', 'NaturalGradLaplaceIVA',
            'AuxLaplaceIVA'.

    Raises:
        ValueError: for an unknown method name.
    """
    np.random.seed(111)

    # Simulation settings (MIRD room impulse responses).
    sr = 16000
    reverb = 0.16
    duration = 0.5
    samples = int(duration * sr)
    mic_intervals = [8, 8, 8, 8, 8, 8, 8]
    mic_indices = [2, 5]
    degrees = [60, 300]
    titles = ['man-16000', 'woman-16000']

    mixed_signal = _convolve_mird(
        titles, reverb=reverb, degrees=degrees,
        mic_intervals=mic_intervals, mic_indices=mic_indices, samples=samples)
    n_channels, T = mixed_signal.shape

    # Transform to the time-frequency domain.
    fft_size, hop_size = 2048, 1024
    mixture = stft(mixed_signal, fft_size=fft_size, hop_size=hop_size)

    # Pick the separator together with its iteration budget.
    lr = 0.1
    n_sources = len(titles)
    if method == 'GradLaplaceIVA':
        iva, iteration = GradLaplaceIVA(lr=lr), 5000
    elif method == 'NaturalGradLaplaceIVA':
        iva, iteration = NaturalGradLaplaceIVA(lr=lr), 200
    elif method == 'AuxLaplaceIVA':
        iva, iteration = AuxLaplaceIVA(), 50
    else:
        raise ValueError("Not support method {}".format(method))

    estimation = iva(mixture, iteration=iteration)
    estimated_signal = istft(
        estimation, fft_size=fft_size, hop_size=hop_size, length=T)
    print("Mixture: {}, Estimation: {}".format(
        mixed_signal.shape, estimated_signal.shape))

    for idx in range(n_sources):
        _estimated_signal = estimated_signal[idx]
        write_wav(
            "data/IVA/{}/mixture-{}_estimated-iter{}-{}.wav".format(
                method, sr, iteration, idx),
            signal=_estimated_signal, sr=sr)

    # Convergence curve.
    plt.figure()
    plt.plot(iva.loss, color='black')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.savefig('data/IVA/{}/loss.png'.format(method), bbox_inches='tight')
    plt.close()
def _test_conv():
    """Render a reverberant two-speaker mixture and save it to disk."""
    # Simulation settings (MIRD room impulse responses).
    sr = 16000
    reverb = 0.16
    duration = 0.5
    samples = int(sr * duration)
    mic_indices = [2, 5]
    degrees = [60, 300]
    titles = ['man-16000', 'woman-16000']

    mixed_signal = _convolve_mird(
        titles, reverb=reverb, degrees=degrees,
        mic_indices=mic_indices, samples=samples)

    # Transpose so channels become columns before writing.
    write_wav("data/multi-channel/mixture-{}.wav".format(sr),
              mixed_signal.T, sr=sr)
def process_offline(sr, num_chunk, duration=5, model_path=None, save_dir="results"):
    """Record from the microphone, save the mixture, and separate it with a DNN.

    Args:
        sr (int): sampling rate [Hz].
        num_chunk (int): frames read per buffer.
        duration (int): recording length [s].
        model_path (str): path to the trained separation model checkpoint.
        save_dir (str): directory where the mixture and estimates are written.
    """
    num_loop = int(duration * sr / num_chunk)
    sequence = []
    P = pyaudio.PyAudio()
    # Record
    stream = P.open(format=FORMAT, channels=NUM_CHANNEL, rate=sr, input_device_index=DEVICE_INDEX, frames_per_buffer=num_chunk, input=True, output=False)
    for i in range(num_loop):
        # `data` renamed from `input` to avoid shadowing the builtin.
        data = stream.read(num_chunk)
        sequence.append(data)
        # `elapsed` renamed from `time` to avoid clashing with the module name.
        elapsed = int(i * num_chunk / sr)
        show_progress_bar(elapsed, duration)
    show_progress_bar(duration, duration)
    print()
    stream.stop_stream()
    stream.close()
    P.terminate()
    print("Stop recording")
    os.makedirs(save_dir, exist_ok=True)
    # Save the raw capture as a float signal.
    signal = b"".join(sequence)
    signal = np.frombuffer(signal, dtype=np.int16)
    signal = signal / 32768  # int16 full scale -> [-1, 1)
    save_path = os.path.join(save_dir, "mixture.wav")
    write_wav(save_path, signal=signal, sr=sr)
    # Separate by DNN
    model = load_model(model_path)
    model.eval()
    print("Start separation...")
    with torch.no_grad():
        mixture = torch.Tensor(signal).float()
        mixture = mixture.unsqueeze(dim=0).unsqueeze(dim=0)  # -> (1, 1, T)
        estimated_sources = model(mixture)
        estimated_sources = estimated_sources.squeeze(dim=0).detach().cpu().numpy()
    print("Finished separation...")
    for idx, estimated_source in enumerate(estimated_sources):
        save_path = os.path.join(save_dir, "estimated-{}.wav".format(idx))
        write_wav(save_path, signal=estimated_source, sr=sr)
def _test(method, n_bases=10, partitioning=False): np.random.seed(111) # Room impulse response sr = 16000 reverb = 0.16 duration = 0.5 samples = int(duration * sr) mic_intervals = [8, 8, 8, 8, 8, 8, 8] mic_indices = [2, 5] degrees = [60, 300] titles = ['man-16000', 'woman-16000'] mixed_signal = _convolve_mird(titles, reverb=reverb, degrees=degrees, mic_intervals=mic_intervals, mic_indices=mic_indices, samples=samples) n_sources, T = mixed_signal.shape # STFT fft_size, hop_size = 2048, 1024 mixture = stft(mixed_signal, fft_size=fft_size, hop_size=hop_size) # ILRMA n_channels = len(titles) iteration = 200 if method == 'Gauss': ilrma = GaussILRMA(n_bases=n_bases, partitioning=partitioning) elif method == 't': ilrma = tILRMA(n_bases=n_bases, partitioning=partitioning) else: raise ValueError("Not support {}-ILRMA.".format(method)) estimation = ilrma(mixture, iteration=iteration) estimated_signal = istft(estimation, fft_size=fft_size, hop_size=hop_size, length=T) print("Mixture: {}, Estimation: {}".format(mixed_signal.shape, estimated_signal.shape)) for idx in range(n_channels): _estimated_signal = estimated_signal[idx] write_wav("data/ILRMA/{}ILMRA/partitioning{}/mixture-{}_estimated-iter{}-{}.wav".format(method, int(partitioning), sr, iteration, idx), signal=_estimated_signal, sr=sr) plt.figure() plt.plot(ilrma.loss, color='black') plt.xlabel('Iteration') plt.ylabel('Loss') plt.savefig('data/ILRMA/{}ILMRA/partitioning{}/loss.png'.format(method, int(partitioning)), bbox_inches='tight') plt.close()
def run_one_epoch_eval(self, epoch):
    """Evaluate one epoch on the validation set.

    Accumulates the PIT loss over all validation examples and writes
    peak-normalized listening samples for the first five batches.

    Args:
        epoch (int): 0-indexed epoch number (file names use ``epoch + 1``).

    Returns:
        float: mean validation loss per example.
    """
    self.model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_idx, (mixture, sources, segment_IDs) in enumerate(self.valid_loader):
            if self.use_cuda:
                mixture, sources = mixture.cuda(), sources.cuda()
            output = self.model(mixture)
            batch_loss, _ = self.pit_criterion(output, sources, batch_mean=False)
            total_loss += batch_loss.sum(dim=0).item()
            if batch_idx >= 5:
                continue
            # Dump the first item of this batch as listening samples.
            mixture_audio = mixture[0].squeeze(dim=0).detach().cpu().numpy()
            estimates = output[0].detach().cpu().numpy()
            sample_dir = os.path.join(self.sample_dir, segment_IDs[0])
            os.makedirs(sample_dir, exist_ok=True)
            peak = np.abs(mixture_audio).max()
            write_wav(os.path.join(sample_dir, "mixture.wav"),
                      signal=mixture_audio / peak, sr=self.sr)
            for source_idx, estimate in enumerate(estimates):
                wav_path = os.path.join(
                    sample_dir,
                    "epoch{}-{}.wav".format(epoch + 1, source_idx + 1))
                peak = np.abs(estimate).max()
                write_wav(wav_path, signal=estimate / peak, sr=self.sr)
    return total_loss / len(self.valid_loader.dataset)
def _test(method='IBM'):
    """Separate the module-level mixture with a time-frequency mask.

    Relies on module-level `amplitude`, `phase_mixture`, `istft`, and `T`.
    Writes one wav per speaker under ``data/frequency_mask/``.

    Args:
        method (str): 'IBM' (ideal binary), 'IRM' (ideal ratio), or
            'WFM' (Wiener filter) mask.

    Raises:
        NotImplementedError: for any other `method`.
    """
    if method == 'IBM':
        mask = ideal_binary_mask(amplitude)
    elif method == 'IRM':
        mask = ideal_ratio_mask(amplitude)
    elif method == 'WFM':
        mask = wiener_filter_mask(amplitude)
    else:
        raise NotImplementedError("Not support {}".format(method))
    estimated_amplitude = amplitude * mask
    # Re-attach the mixture phase to the masked amplitudes.
    real, imag = estimated_amplitude * torch.cos(phase_mixture), estimated_amplitude * torch.sin(phase_mixture)
    # Fixed local-variable typo: `estimated_spectrgram` -> `estimated_spectrogram`.
    estimated_spectrogram = torch.cat([real.unsqueeze(dim=3), imag.unsqueeze(dim=3)], dim=3)
    estimated_signal = istft(estimated_spectrogram, T=T)
    estimated_signal = estimated_signal.detach().cpu().numpy()
    for signal, tag in zip(estimated_signal, ['man', 'woman']):
        write_wav("data/frequency_mask/{}-estimated_{}.wav".format(tag, method), signal=signal, sr=16000)
def run_one_epoch_eval(self, epoch):
    """Validate by recursively peeling one source at a time off the mixture.

    Saves peak-normalized listening samples for the first five batches.

    Args:
        epoch (int): 0-indexed epoch number (file names use ``epoch + 1``).

    Returns:
        int: always -1; no validation loss is computed in this variant.
    """
    self.model.eval()
    with torch.no_grad():
        for batch_idx, (mixture, sources) in enumerate(self.valid_loader):
            if self.use_cuda:
                mixture, sources = mixture.cuda(), sources.cuda()
            # First pass splits the mixture into (one source, residual).
            one_and_rest = self.model(mixture)
            separated = [one_and_rest[:, 0:1]]
            residual = one_and_rest[:, 1:]
            # Keep peeling until the residual holds only the last source.
            for _ in range(1, self.n_sources - 1):
                one_and_rest = self.model(residual)
                separated.append(one_and_rest[:, 0:1])
                residual = one_and_rest[:, 1:]
            separated.append(residual)
            output = torch.cat(separated, dim=1)
            if batch_idx >= 5:
                continue
            # Dump the first item of this batch as listening samples.
            mixture_audio = mixture[0].squeeze(dim=0).detach().cpu().numpy()
            estimates = output[0].detach().cpu().numpy()
            sample_dir = os.path.join(self.sample_dir, "{}".format(batch_idx + 1))
            os.makedirs(sample_dir, exist_ok=True)
            peak = np.abs(mixture_audio).max()
            write_wav(os.path.join(sample_dir, "mixture.wav"),
                      signal=mixture_audio / peak, sr=self.sr)
            for source_idx, estimate in enumerate(estimates):
                wav_path = os.path.join(
                    sample_dir,
                    "epoch{}-{}.wav".format(epoch + 1, source_idx + 1))
                peak = np.abs(estimate).max()
                write_wav(wav_path, signal=estimate / peak, sr=self.sr)
    return -1
def run(self):
    """Evaluate the model on the test set: PIT loss, loss improvement, PESQ.

    For each mixture, writes the (peak-normalized) mixture, targets, and
    permutation-aligned estimates to temporary wav files, scores them with
    the external ``./PESQ`` binary, and prints per-mixture and aggregate
    results. The first 10 mixtures are also kept under ``self.out_dir``
    when it is set.
    """
    self.model.eval()
    test_loss = 0
    test_loss_improvement = 0
    test_pesq = 0
    n_test = len(self.loader.dataset)
    with torch.no_grad():
        for idx, (mixture, sources, segment_IDs) in enumerate(self.loader):
            if self.use_cuda:
                mixture = mixture.cuda()
                sources = sources.cuda()
            # Loss of the raw mixture vs targets — baseline for improvement.
            loss_mixture, _ = self.pit_criterion(mixture, sources, batch_mean=False)
            loss_mixture = loss_mixture.sum(dim=0)
            output = self.model(mixture)
            loss, perm_idx = self.pit_criterion(output, sources, batch_mean=False)
            loss = loss.sum(dim=0)
            loss_improvement = loss_mixture.item() - loss.item()
            # Batch size is 1 at test time: take item 0 of everything.
            mixture = mixture[0].squeeze(dim=0).cpu().numpy()  # -> (T,)
            sources = sources[0].cpu().numpy()  # -> (n_sources, T)
            estimated_sources = output[0].cpu().numpy(
            )  # -> (n_sources, T)
            perm_idx = perm_idx[0]  # -> (n_sources,)
            segment_IDs = segment_IDs[0]  # -> (n_sources,)
            norm = np.abs(mixture).max()
            mixture /= norm
            mixture_ID = "+".join(segment_IDs)
            if idx < 10 and self.out_dir is not None:
                mixture_path = os.path.join(self.out_dir, "{}.wav".format(mixture_ID))
                write_wav(mixture_path, signal=mixture, sr=self.sr)
            mixture_path = "tmp-mixture.wav"
            write_wav(mixture_path, signal=mixture, sr=self.sr)
            for order_idx in range(self.n_sources):
                # Align each target with its best-permutation estimate.
                source, estimated_source = sources[
                    order_idx], estimated_sources[perm_idx[order_idx]]
                segment_ID = segment_IDs[order_idx]
                # Target
                norm = np.abs(source).max()
                source /= norm
                if idx < 10 and self.out_dir is not None:
                    source_path = os.path.join(
                        self.out_dir,
                        "{}_{}-target.wav".format(mixture_ID, order_idx))
                    write_wav(source_path, signal=source, sr=self.sr)
                source_path = "tmp-{}-target.wav".format(order_idx)
                write_wav(source_path, signal=source, sr=self.sr)
                # Estimated source
                norm = np.abs(estimated_source).max()
                estimated_source /= norm
                if idx < 10 and self.out_dir is not None:
                    estimated_path = os.path.join(
                        self.out_dir, "{}_{}-estimated.wav".format(
                            mixture_ID, order_idx))
                    write_wav(estimated_path, signal=estimated_source, sr=self.sr)
                estimated_path = "tmp-{}-estimated.wav".format(order_idx)
                write_wav(estimated_path, signal=estimated_source, sr=self.sr)
            # Score each target/estimate pair with the external PESQ binary,
            # then delete the temporary files.
            pesq = 0
            for source_idx in range(self.n_sources):
                source_path = "tmp-{}-target.wav".format(source_idx)
                estimated_path = "tmp-{}-estimated.wav".format(source_idx)
                command = "./PESQ +{} {} {}".format(
                    self.sr, source_path, estimated_path)
                command += " | grep Prediction | awk '{print $5}'"
                pesq_output = subprocess.check_output(command, shell=True)
                pesq_output = pesq_output.decode().strip()
                pesq += float(pesq_output)
                subprocess.call("rm {}".format(source_path), shell=True)
                subprocess.call("rm {}".format(estimated_path), shell=True)
            pesq /= self.n_sources
            print("{}, {:.3f}, {:.3f}, {:.3f}".format(
                mixture_ID, loss.item(), loss_improvement, pesq), flush=True)
            test_loss += loss.item()
            test_loss_improvement += loss_improvement
            test_pesq += pesq
    test_loss /= n_test
    test_loss_improvement /= n_test
    test_pesq /= n_test
    # NOTE(review): "{:3f}" (no dot) prints loss improvement with 6 default
    # decimals — probably meant "{:.3f}"; left unchanged here.
    print("Loss: {:.3f}, loss improvement: {:3f}, PESQ: {:.3f}".format(
        test_loss, test_loss_improvement, test_pesq))
def _test(metric='EUC'):
    """Fit torch-backend NMF to a music power spectrogram and resynthesize.

    Writes the full reconstruction, one wav and spectrogram image per basis,
    and the loss curve under ``data/NMF/<metric>/``. Depends on module-level
    `read_wav`, `write_wav`, `BatchSTFT`, `BatchInvSTFT`, `NMF`, and `EPS`.

    Args:
        metric (str): divergence name forwarded to `NMF`.
    """
    torch.manual_seed(111)
    fft_size, hop_size = 1024, 256
    n_bases = 6
    iteration = 100
    signal, sr = read_wav("data/music-8000.wav")
    T = len(signal)
    signal = torch.Tensor(signal).unsqueeze(dim=0)
    stft = BatchSTFT(fft_size=fft_size, hop_size=hop_size)
    istft = BatchInvSTFT(fft_size=fft_size, hop_size=hop_size)
    spectrogram = stft(signal).squeeze(dim=0)
    real = spectrogram[...,0]
    imag = spectrogram[...,1]
    amplitude = torch.sqrt(real**2 + imag**2)
    power = amplitude**2
    # Log-power spectrogram of the input, for visual reference.
    log_spectrogram = 10 * torch.log10(power + EPS)
    plt.figure()
    plt.pcolormesh(log_spectrogram, cmap='jet')
    plt.colorbar()
    plt.savefig('data/NMF/spectrogram.png', bbox_inches='tight')
    plt.close()
    # Factorize: power ~= base @ activation.
    nmf = NMF(n_bases, metric=metric)
    nmf.update(power, iteration=iteration)
    estimated_power = torch.matmul(nmf.base, nmf.activation)
    estimated_amplitude = torch.sqrt(estimated_power)
    # Amplitude-ratio mask re-using the mixture phase.
    ratio = estimated_amplitude / (amplitude + EPS)
    estimated_real, estimated_imag = ratio * real, ratio * imag
    estimated_spectrogram = torch.cat([estimated_real.unsqueeze(dim=2), estimated_imag.unsqueeze(dim=2)], dim=2).unsqueeze(dim=0)
    estimated_signal = istft(estimated_spectrogram, T=T)
    estimated_signal = estimated_signal.squeeze(dim=0).numpy()
    # Peak-normalize before writing.
    estimated_signal = estimated_signal / np.abs(estimated_signal).max()
    write_wav("data/NMF/{}/music-8000-estimated-iter{}.wav".format(metric, iteration), signal=estimated_signal, sr=8000)
    # Resynthesize each basis individually.
    for idx in range(n_bases):
        estimated_power = torch.matmul(nmf.base[:, idx: idx+1], nmf.activation[idx: idx+1, :])
        estimated_amplitude = torch.sqrt(estimated_power)
        ratio = estimated_amplitude / (amplitude + EPS)
        estimated_real, estimated_imag = ratio * real, ratio * imag
        estimated_spectrogram = torch.cat([estimated_real.unsqueeze(dim=2), estimated_imag.unsqueeze(dim=2)], dim=2).unsqueeze(dim=0)
        estimated_signal = istft(estimated_spectrogram, T=T)
        estimated_signal = estimated_signal.squeeze(dim=0).numpy()
        estimated_signal = estimated_signal / np.abs(estimated_signal).max()
        write_wav("data/NMF/{}/music-8000-estimated-iter{}-base{}.wav".format(metric, iteration, idx), signal=estimated_signal, sr=8000)
        log_spectrogram = 10 * torch.log10(estimated_power + EPS).numpy()
        plt.figure()
        plt.pcolormesh(log_spectrogram, cmap='jet')
        plt.colorbar()
        plt.savefig('data/NMF/{}/estimated-spectrogram-iter{}-base{}.png'.format(metric, iteration, idx), bbox_inches='tight')
        plt.close()
    # Training loss curve.
    plt.figure()
    plt.plot(nmf.loss)
    plt.savefig('data/NMF/{}/loss.png'.format(metric), bbox_inches='tight')
    plt.close()
def run_one_epoch_eval(self, epoch):
    # Override
    """Validate a mask-estimation model for one epoch.

    Computes the PIT loss on amplitude spectrograms and, for the first five
    batches, reconstructs time-domain audio via an amplitude-ratio mask and
    saves peak-normalized wavs under ``self.sample_dir``.

    Args:
        epoch (int): 0-indexed epoch number (file names use ``epoch + 1``).

    Returns:
        float: mean validation loss per example.
    """
    n_sources = self.n_sources
    F_bin = self.F_bin
    self.model.eval()
    valid_loss = 0
    n_valid = len(self.valid_loader.dataset)
    with torch.no_grad():
        for idx, (mixture, sources, threshold_weight) in enumerate(self.valid_loader):
            """
            mixture (batch_size, 1, F_bin, T_bin, 2)
            sources (batch_size, n_sources, F_bin, T_bin, 2)
            threshold_weight (batch_size, F_bin, T_bin)
            """
            if self.use_cuda:
                mixture = mixture.cuda()
                sources = sources.cuda()
                threshold_weight = threshold_weight.cuda()
            # Last axis holds (real, imag); convert to amplitude.
            real, imag = mixture[...,0], mixture[...,1]
            mixture_amplitude = torch.sqrt(real**2+imag**2)
            real, imag = sources[...,0], sources[...,1]
            sources_amplitude = torch.sqrt(real**2+imag**2)
            output = self.model(mixture_amplitude, threshold_weight=threshold_weight, n_sources=n_sources)
            # At the test phase, assignment may be unknown.
            loss, _ = pit(self.criterion, output, sources_amplitude, batch_mean=False)
            loss = loss.sum(dim=0)
            valid_loss += loss.item()
            if idx < 5:
                mixture = mixture[0].cpu()  # -> (1, F_bin, T_bin, 2); original comment said (1, 2*F_bin, T_bin), which contradicts the loop docstring above
                mixture_amplitude = mixture_amplitude[0].cpu()  # -> (1, F_bin, T_bin)
                estimated_sources_amplitude = output[0].cpu()  # -> (n_sources, F_bin, T_bin)
                # Amplitude-ratio mask, re-using the mixture phase.
                ratio = estimated_sources_amplitude / mixture_amplitude
                real, imag = mixture[...,0], mixture[...,1]
                real, imag = ratio * real, ratio * imag
                estimated_sources = torch.cat([real.unsqueeze(dim=3), imag.unsqueeze(dim=3)], dim=3)  # -> (n_sources, F_bin, T_bin, 2)
                estimated_sources = self.istft(estimated_sources)  # -> (n_sources, T)
                estimated_sources = estimated_sources.cpu().numpy()
                mixture = self.istft(mixture)  # -> (1, T)
                mixture = mixture.squeeze(dim=0).numpy()  # -> (T,)
                save_dir = os.path.join(self.sample_dir, "{}".format(idx+1))
                os.makedirs(save_dir, exist_ok=True)
                save_path = os.path.join(save_dir, "mixture.wav")
                # Peak-normalize before writing.
                norm = np.abs(mixture).max()
                mixture = mixture / norm
                write_wav(save_path, signal=mixture, sr=self.sr)
                for source_idx, estimated_source in enumerate(estimated_sources):
                    save_path = os.path.join(save_dir, "epoch{}-{}.wav".format(epoch+1,source_idx+1))
                    norm = np.abs(estimated_source).max()
                    estimated_source = estimated_source / norm
                    write_wav(save_path, signal=estimated_source, sr=self.sr)
    valid_loss /= n_valid
    return valid_loss
def process_offline(sr, num_chunk, duration=5, model_path=None, save_dir="results", args=None):
    """Record from the microphone, then separate the capture in the STFT domain.

    Args:
        sr (int): sampling rate [Hz].
        num_chunk (int): frames read per buffer.
        duration (int): recording length [s].
        model_path (str): path to the trained separation model checkpoint.
        save_dir (str): directory where the mixture and estimates are written.
        args: namespace providing fft_size, hop_size, window_fn, n_sources,
            iter_clustering.
    """
    num_loop = int(duration * sr / num_chunk)
    sequence = []
    P = pyaudio.PyAudio()
    # Record
    stream = P.open(format=FORMAT, channels=NUM_CHANNEL, rate=sr, input_device_index=DEVICE_INDEX, frames_per_buffer=num_chunk, input=True, output=False)
    for i in range(num_loop):
        # `data` renamed from `input` to avoid shadowing the builtin.
        data = stream.read(num_chunk)
        sequence.append(data)
        # `elapsed` renamed from `time` to avoid clashing with the module name.
        elapsed = int(i * num_chunk / sr)
        show_progress_bar(elapsed, duration)
    show_progress_bar(duration, duration)
    print()
    stream.stop_stream()
    stream.close()
    P.terminate()
    print("Stop recording")
    os.makedirs(save_dir, exist_ok=True)
    # Save the raw capture as a float signal.
    signal = b"".join(sequence)
    signal = np.frombuffer(signal, dtype=np.int16)
    signal = signal / 32768  # int16 full scale -> [-1, 1)
    save_path = os.path.join(save_dir, "mixture.wav")
    write_wav(save_path, signal=signal, sr=sr)
    # Separate by DNN
    model = load_model(model_path)
    model.eval()
    fft_size, hop_size = args.fft_size, args.hop_size
    window_fn = args.window_fn
    if hop_size is None:
        hop_size = fft_size // 2
    n_sources = args.n_sources
    iter_clustering = args.iter_clustering
    F_bin = fft_size // 2 + 1
    stft = BatchSTFT(fft_size, hop_size=hop_size, window_fn=window_fn)
    istft = BatchInvSTFT(fft_size, hop_size=hop_size, window_fn=window_fn)
    print("Start separation...")
    with torch.no_grad():
        mixture = torch.Tensor(signal).float()
        T = mixture.size(0)
        mixture = mixture.unsqueeze(dim=0)
        mixture = stft(mixture).unsqueeze(dim=0)
        # Real/imag parts are stacked along the frequency axis of the STFT.
        real, imag = mixture[:, :, :F_bin], mixture[:, :, F_bin:]
        mixture_amplitude = torch.sqrt(real**2 + imag**2)
        estimated_sources_amplitude = model(
            mixture_amplitude, n_sources=n_sources,
            iter_clustering=iter_clustering)
        # TODO: Args, threshold
        # Amplitude-ratio mask re-using the mixture phase.
        ratio = estimated_sources_amplitude / mixture_amplitude
        real, imag = ratio * real, ratio * imag
        estimated_sources = torch.cat([real, imag], dim=2)
        estimated_sources = estimated_sources.squeeze(dim=0)
        estimated_sources = istft(estimated_sources, T=T).numpy()
    print("Finished separation...")
    for idx, estimated_source in enumerate(estimated_sources):
        save_path = os.path.join(save_dir, "estimated-{}.wav".format(idx))
        write_wav(save_path, signal=estimated_source, sr=sr)
if __name__ == '__main__':
    import numpy as np
    from scipy.signal import resample_poly
    from utils.utils_audio import read_wav, write_wav
    from stft import BatchSTFT, BatchInvSTFT

    torch.manual_seed(111)
    fft_size, hop_size = 1024, 256
    # NOTE(review): `n_basis` is unused in the visible portion — presumably
    # consumed by code that follows this chunk; confirm.
    n_basis = 4
    # Resample both speakers from 44.1 kHz to 16 kHz and save the versions.
    source1, sr = read_wav("data/man-44100.wav")
    source1 = resample_poly(source1, up=16000, down=sr)
    write_wav("data/man-16000.wav", signal=source1, sr=16000)
    T = len(source1)
    source2, sr = read_wav("data/woman-44100.wav")
    source2 = resample_poly(source2, up=16000, down=sr)
    write_wav("data/woman-16000.wav", signal=source2, sr=16000)
    # Instantaneous (additive) two-speaker mixture.
    mixture = source1 + source2
    write_wav("data/mixture-16000.wav", signal=mixture, sr=16000)
    stft = BatchSTFT(fft_size=fft_size, hop_size=hop_size)
    istft = BatchInvSTFT(fft_size=fft_size, hop_size=hop_size)
    # Add a batch dimension for the batch STFT modules.
    mixture = torch.Tensor(mixture).unsqueeze(dim=0)
    source1 = torch.Tensor(source1).unsqueeze(dim=0)
    source2 = torch.Tensor(source2).unsqueeze(dim=0)
def _test(metric='EUC'):
    """Fit numpy-backend NMF to a music power spectrogram and resynthesize.

    Writes the full reconstruction, one wav and spectrogram image per basis,
    and the loss curve under ``data/NMF/<metric>/``. Depends on module-level
    `read_wav`, `write_wav`, `stft`, `istft`, the NMF classes, and `EPS`.

    Args:
        metric (str): 'EUC', 'IS', or 'KL'.

    Raises:
        NotImplementedError: for any other `metric`.
    """
    np.random.seed(111)
    fft_size, hop_size = 1024, 256
    n_bases = 6
    iteration = 100
    signal, sr = read_wav("data/single-channel/music-8000.wav")
    T = len(signal)
    spectrogram = stft(signal, fft_size=fft_size, hop_size=hop_size)
    amplitude = np.abs(spectrogram)
    power = amplitude**2
    if metric == 'EUC':
        nmf = EUCNMF(n_bases)
    elif metric == 'IS':
        nmf = ISNMF(n_bases)
    elif metric == 'KL':
        nmf = KLNMF(n_bases)
    else:
        raise NotImplementedError("Not support {}-NMF".format(metric))
    # Factorize: power ~= base @ activation.
    nmf.update(power, iteration=iteration)
    # Floor the amplitude to avoid division by ~0 in the mask below.
    amplitude[amplitude < EPS] = EPS
    estimated_power = nmf.base @ nmf.activation
    estimated_amplitude = np.sqrt(estimated_power)
    # Amplitude-ratio mask re-using the mixture phase.
    ratio = estimated_amplitude / amplitude
    estimated_spectrogram = ratio * spectrogram
    estimated_signal = istft(estimated_spectrogram, fft_size=fft_size, hop_size=hop_size, length=T)
    # Peak-normalize before writing.
    estimated_signal = estimated_signal / np.abs(estimated_signal).max()
    write_wav("data/NMF/{}/music-8000-estimated-iter{}.wav".format(
        metric, iteration), signal=estimated_signal, sr=8000)
    power[power < EPS] = EPS
    log_spectrogram = 10 * np.log10(power)
    plt.figure()
    plt.pcolormesh(log_spectrogram, cmap='jet')
    plt.colorbar()
    plt.savefig('data/NMF/spectrogram.png', bbox_inches='tight')
    plt.close()
    # Resynthesize each basis individually.
    for idx in range(n_bases):
        estimated_power = nmf.base[:, idx:idx + 1] @ nmf.activation[idx:idx + 1, :]
        estimated_amplitude = np.sqrt(estimated_power)
        ratio = estimated_amplitude / amplitude
        estimated_spectrogram = ratio * spectrogram
        estimated_signal = istft(estimated_spectrogram, fft_size=fft_size, hop_size=hop_size, length=T)
        estimated_signal = estimated_signal / np.abs(estimated_signal).max()
        write_wav("data/NMF/{}/music-8000-estimated-iter{}-base{}.wav".format(
            metric, iteration, idx), signal=estimated_signal, sr=8000)
        estimated_power[estimated_power < EPS] = EPS
        log_spectrogram = 10 * np.log10(estimated_power)
        plt.figure()
        plt.pcolormesh(log_spectrogram, cmap='jet')
        plt.colorbar()
        plt.savefig(
            'data/NMF/{}/estimated-spectrogram-iter{}-base{}.png'.format(
                metric, iteration, idx), bbox_inches='tight')
        plt.close()
    # Training loss curve.
    plt.figure()
    plt.plot(nmf.loss, color='black')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.savefig('data/NMF/{}/loss.png'.format(metric), bbox_inches='tight')
    plt.close()
def run(self):
    """Evaluate on the test set: PIT loss, SDR/SIR improvement, SAR, PESQ.

    BSS metrics come from `bss_eval_sources`; PESQ from the external
    ``./PESQ`` binary via temporary wav files. Per-mixture rows and final
    averages are printed; the first 10 mixtures are also saved under
    ``self.out_dir`` when it is set.
    """
    self.model.eval()
    test_loss = 0
    test_loss_improvement = 0
    test_sdr_improvement = 0
    test_sir_improvement = 0
    test_sar = 0
    test_pesq = 0
    n_pesq_error = 0
    n_test = len(self.loader.dataset)
    print("ID, Loss, Loss improvement, SDR improvement, SIR improvement, SAR, PESQ", flush=True)
    with torch.no_grad():
        for idx, (mixture, sources, segment_IDs) in enumerate(self.loader):
            if self.use_cuda:
                mixture = mixture.cuda()
                sources = sources.cuda()
            # Loss of the raw mixture vs targets — baseline for improvement.
            loss_mixture, _ = self.pit_criterion(mixture, sources, batch_mean=False)
            loss_mixture = loss_mixture.sum(dim=0)
            output = self.model(mixture)
            loss, perm_idx = self.pit_criterion(output, sources, batch_mean=False)
            loss = loss.sum(dim=0)
            loss_improvement = loss_mixture.item() - loss.item()
            # Batch size is 1 at test time: take item 0 of everything.
            mixture = mixture[0].squeeze(dim=0).cpu().numpy()  # -> (T,)
            sources = sources[0].cpu().numpy()  # -> (n_sources, T)
            estimated_sources = output[0].cpu().numpy()  # -> (n_sources, T)
            perm_idx = perm_idx[0]  # -> (n_sources,)
            segment_IDs = segment_IDs[0]  # -> (n_sources,)
            # Compare estimates against the mixture repeated per source to
            # get SDR/SIR improvement over doing nothing.
            repeated_mixture = np.tile(mixture, reps=(self.n_sources, 1))
            result_estimated = bss_eval_sources(
                reference_sources=sources,
                estimated_sources=estimated_sources
            )
            result_mixed = bss_eval_sources(
                reference_sources=sources,
                estimated_sources=repeated_mixture
            )
            sdr_improvement = np.mean(result_estimated[0] - result_mixed[0])
            sir_improvement = np.mean(result_estimated[1] - result_mixed[1])
            sar = np.mean(result_estimated[2])
            norm = np.abs(mixture).max()
            mixture /= norm
            mixture_ID = "+".join(segment_IDs)
            if idx < 10 and self.out_dir is not None:
                mixture_path = os.path.join(self.out_dir, "{}.wav".format(mixture_ID))
                write_wav(mixture_path, signal=mixture, sr=self.sr)
            mixture_path = "tmp-mixture.wav"
            write_wav(mixture_path, signal=mixture, sr=self.sr)
            for order_idx in range(self.n_sources):
                # Align each target with its best-permutation estimate.
                source, estimated_source = sources[order_idx], estimated_sources[perm_idx[order_idx]]
                segment_ID = segment_IDs[order_idx]
                # Target
                norm = np.abs(source).max()
                source /= norm
                if idx < 10 and self.out_dir is not None:
                    source_path = os.path.join(self.out_dir, "{}_{}-target.wav".format(mixture_ID, order_idx))
                    write_wav(source_path, signal=source, sr=self.sr)
                source_path = "tmp-{}-target.wav".format(order_idx)
                write_wav(source_path, signal=source, sr=self.sr)
                # Estimated source
                norm = np.abs(estimated_source).max()
                estimated_source /= norm
                if idx < 10 and self.out_dir is not None:
                    estimated_path = os.path.join(self.out_dir, "{}_{}-estimated.wav".format(mixture_ID, order_idx))
                    write_wav(estimated_path, signal=estimated_source, sr=self.sr)
                estimated_path = "tmp-{}-estimated.wav".format(order_idx)
                write_wav(estimated_path, signal=estimated_source, sr=self.sr)
            # Score each target/estimate pair with the external PESQ binary,
            # then delete the temporary files.
            pesq = 0
            for source_idx in range(self.n_sources):
                source_path = "tmp-{}-target.wav".format(source_idx)
                estimated_path = "tmp-{}-estimated.wav".format(source_idx)
                command = "./PESQ +{} {} {}".format(self.sr, source_path, estimated_path)
                command += " | grep Prediction | awk '{print $5}'"
                pesq_output = subprocess.check_output(command, shell=True)
                pesq_output = pesq_output.decode().strip()
                if pesq_output == '':
                    # If processing error occurs in PESQ software, it is regarded as PESQ score is 0.
                    n_pesq_error += 1
                else:
                    pesq += float(pesq_output)
                subprocess.call("rm {}".format(source_path), shell=True)
                subprocess.call("rm {}".format(estimated_path), shell=True)
            pesq /= self.n_sources
            print("{}, {:.3f}, {:.3f}, {:.3f}, {:.3f}, {:.3f}, {:.3f}".format(mixture_ID, loss.item(), loss_improvement, sdr_improvement, sir_improvement, sar, pesq), flush=True)
            test_loss += loss.item()
            test_loss_improvement += loss_improvement
            test_sdr_improvement += sdr_improvement
            test_sir_improvement += sir_improvement
            test_sar += sar
            test_pesq += pesq
    test_loss /= n_test
    test_loss_improvement /= n_test
    test_sdr_improvement /= n_test
    test_sir_improvement /= n_test
    test_sar /= n_test
    test_pesq /= n_test
    # NOTE(review): several "{:3f}" specifiers lack the dot ("{:.3f}");
    # they print 6 default decimals — left unchanged here.
    print("Loss: {:.3f}, loss improvement: {:3f}, SDR improvement: {:3f}, SIR improvement: {:3f}, SAR: {:3f}, PESQ: {:.3f}".format(test_loss, test_loss_improvement, test_sdr_improvement, test_sir_improvement, test_sar, test_pesq))
    print("Evaluation of PESQ returns error {} times".format(n_pesq_error))
def run(self):
    """Evaluate a mask-estimation model on the test set: PIT loss and PESQ.

    Reconstructs time-domain audio from amplitude-ratio-masked spectrograms,
    scores target/estimate pairs with the external ``./PESQ`` binary via
    temporary wav files, and prints per-mixture and aggregate results. The
    first 10 mixtures are also saved under ``self.out_dir`` when it is set.
    """
    n_sources = self.n_sources
    F_bin = self.F_bin
    self.model.eval()
    test_loss = 0
    test_pesq = 0
    n_pesq_error = 0
    n_test = len(self.loader.dataset)
    with torch.no_grad():
        for idx, (mixture, sources, ideal_mask, threshold_weight, T, segment_IDs) in enumerate(self.loader):
            """
            mixture (1, 1, F_bin, T_bin, 2)
            sources (1, n_sources, F_bin, T_bin, 2)
            assignment (1, n_sources, F_bin, T_bin)
            threshold_weight (1, F_bin, T_bin)
            T (1,)
            """
            if self.use_cuda:
                mixture = mixture.cuda()
                sources = sources.cuda()
                ideal_mask = ideal_mask.cuda()
                threshold_weight = threshold_weight.cuda()
            # Last axis holds (real, imag); convert to amplitude.
            real, imag = mixture[...,0], mixture[...,1]
            mixture_amplitude = torch.sqrt(real**2+imag**2)  # -> (1, 1, F_bin, T_bin)
            real, imag = sources[...,0], sources[...,1]
            sources_amplitude = torch.sqrt(real**2+imag**2)
            # assignment=None: the oracle assignment is unknown at test time.
            output = self.model(mixture_amplitude, assignment=None, threshold_weight=threshold_weight, n_sources=n_sources)
            loss, perm_idx = self.pit_criterion(output, sources_amplitude, batch_mean=False)
            loss = loss.sum(dim=0)
            # Batch size is 1 at test time: take item 0 of everything.
            mixture = mixture[0].cpu()
            sources = sources[0].cpu()
            mixture_amplitude = mixture_amplitude[0].cpu()  # -> (1, F_bin, T_bin)
            estimated_sources_amplitude = output[0].cpu()  # -> (n_sources, F_bin, T_bin)
            # Amplitude-ratio mask re-using the mixture phase.
            ratio = estimated_sources_amplitude / mixture_amplitude
            real, imag = mixture[...,0], mixture[...,1]  # -> (1, F_bin, T_bin), (1, F_bin, T_bin)
            real, imag = ratio * real, ratio * imag  # -> (n_sources, F_bin, T_bin), (n_sources, F_bin, T_bin)
            estimated_sources = torch.cat([real.unsqueeze(dim=3), imag.unsqueeze(dim=3)], dim=3)  # -> (n_sources, F_bin, T_bin, 2)
            perm_idx = perm_idx[0]  # -> (n_sources,)
            T = T[0]  # -> ()
            segment_IDs = segment_IDs[0]  # -> (n_sources,)
            # Back to the time domain, trimming to the original length T.
            mixture = self.istft(mixture, T=T).squeeze(dim=0).numpy()  # -> (T,)
            sources = self.istft(sources, T=T).numpy()  # -> (n_sources, T)
            estimated_sources = self.istft(estimated_sources, T=T).numpy()  # -> (n_sources, T)
            norm = np.abs(mixture).max()
            mixture /= norm
            mixture_ID = "+".join(segment_IDs)
            if idx < 10 and self.out_dir is not None:
                mixture_path = os.path.join(self.out_dir, "{}.wav".format(mixture_ID))
                write_wav(mixture_path, signal=mixture, sr=self.sr)
            mixture_path = "tmp-mixture.wav"
            write_wav(mixture_path, signal=mixture, sr=self.sr)
            for order_idx in range(self.n_sources):
                # Align each target with its best-permutation estimate.
                source, estimated_source = sources[order_idx], estimated_sources[perm_idx[order_idx]]
                segment_ID = segment_IDs[order_idx]
                # Target
                norm = np.abs(source).max()
                source /= norm
                if idx < 10 and self.out_dir is not None:
                    source_path = os.path.join(self.out_dir, "{}_{}-target.wav".format(mixture_ID, order_idx))
                    write_wav(source_path, signal=source, sr=self.sr)
                source_path = "tmp-{}-target.wav".format(order_idx)
                write_wav(source_path, signal=source, sr=self.sr)
                # Estimated source
                norm = np.abs(estimated_source).max()
                estimated_source /= norm
                if idx < 10 and self.out_dir is not None:
                    estimated_path = os.path.join(self.out_dir, "{}_{}-estimated.wav".format(mixture_ID, order_idx))
                    write_wav(estimated_path, signal=estimated_source, sr=self.sr)
                estimated_path = "tmp-{}-estimated.wav".format(order_idx)
                write_wav(estimated_path, signal=estimated_source, sr=self.sr)
            # Score each target/estimate pair with the external PESQ binary,
            # then delete the temporary files.
            pesq = 0
            for source_idx in range(self.n_sources):
                source_path = "tmp-{}-target.wav".format(source_idx)
                estimated_path = "tmp-{}-estimated.wav".format(source_idx)
                command = "./PESQ +{} {} {}".format(self.sr, source_path, estimated_path)
                command += " | grep Prediction | awk '{print $5}'"
                pesq_output = subprocess.check_output(command, shell=True)
                pesq_output = pesq_output.decode().strip()
                if pesq_output == '':
                    # If processing error occurs in PESQ software, it is regarded as PESQ score is 0.
                    n_pesq_error += 1
                else:
                    pesq += float(pesq_output)
                subprocess.call("rm {}".format(source_path), shell=True)
                subprocess.call("rm {}".format(estimated_path), shell=True)
            pesq /= self.n_sources
            print("{}, {:.3f}, {:.3f}".format(mixture_ID, loss.item(), pesq), flush=True)
            test_loss += loss.item()
            test_pesq += pesq
    test_loss /= n_test
    test_pesq /= n_test
    print("Loss: {:.3f}, PESQ: {:.3f}".format(test_loss, test_pesq))
    print("Evaluation of PESQ returns error {} times".format(n_pesq_error))
if __name__ == '__main__':
    import os
    import numpy as np
    from scipy.signal import resample_poly
    from utils.utils_audio import read_wav, write_wav

    os.makedirs("data/GriffinLim", exist_ok=True)
    torch.manual_seed(111)
    fft_size, hop_size = 1024, 256
    # NOTE(review): `n_basis` is unused in the visible portion — presumably
    # consumed by code that follows this chunk; confirm.
    n_basis = 4
    # Resample the speech from 44.1 kHz to 16 kHz and save it.
    signal, sr = read_wav("data/man-44100.wav")
    signal = resample_poly(signal, up=16000, down=sr)
    write_wav("data/man-16000.wav", signal=signal, sr=16000)
    T = len(signal)
    signal = torch.Tensor(signal).unsqueeze(dim=0)  # add batch dim
    stft = BatchSTFT(fft_size=fft_size, hop_size=hop_size)
    istft = BatchInvSTFT(fft_size=fft_size, hop_size=hop_size)
    spectrogram = stft(signal)
    # Oracle reconstruction (STFT followed by inverse STFT) for reference.
    oracle_signal = istft(spectrogram, T=T)
    oracle_signal = oracle_signal.squeeze(dim=0).numpy()
    write_wav("data/man-oracle.wav", signal=oracle_signal, sr=16000)
    griffin_lim = GriffinLim(fft_size, hop_size=hop_size)
    spectrogram = spectrogram.squeeze(dim=0)
def _test(method='DSBF'):
    """Beamform a simulated 8-mic, 2-speaker mixture and save the results.

    Builds far-field steering vectors from the source angles and microphone
    positions, applies the selected beamformer, and writes spectrogram
    images plus estimated wavs under ``data/Beamform/<method>/``.

    Args:
        method (str): 'DSBF' (delay-and-sum) or 'MVDR'.

    Raises:
        NotImplementedError: for any other `method`.
    """
    # Room impulse response
    sr = 16000
    reverb = 0.16
    duration = 0.5
    samples = int(duration * sr)
    mic_intervals = [3, 3, 3, 8, 3, 3, 3]
    mic_indices = [0, 1, 2, 3, 4, 5, 6, 7]
    # Linear array positions [m], symmetric about the origin.
    mic_position = np.array([[0.13, 0], [0.10, 0], [0.07, 0], [0.04, 0], [-0.04, 0], [-0.07, 0], [-0.10, 0], [-0.13, 0]])
    degrees = [0, 90]
    titles = ['man-16000', 'woman-16000']
    n_sources, n_channels = len(degrees), len(mic_indices)
    mixed_signal = _convolve_mird(titles, reverb=reverb, degrees=degrees, mic_intervals=mic_intervals, mic_indices=mic_indices, samples=samples)
    _, T = mixed_signal.shape
    # STFT
    fft_size, hop_size = 2048, 1024
    n_bins = fft_size // 2 + 1
    frequency = np.arange(0, n_bins) * sr / fft_size
    mixture = stft(mixed_signal, fft_size=fft_size, hop_size=hop_size)  # (n_channels, n_bins, n_frames)
    # Steering vectors (fixed typo: was "Steeing vectors")
    degrees = np.array(degrees) / 180 * np.pi
    x_source, y_source = np.sin(degrees), np.cos(degrees)  # (n_sources,)
    source_position = np.vstack([x_source, y_source]).transpose(1, 0)  # (n_sources, 2)
    # Far-field plane-wave phase delays per bin/mic/source.
    steering_vector = np.exp(
        2j * np.pi * frequency[:, np.newaxis, np.newaxis] * np.sum(source_position * mic_position[:, np.newaxis, :], axis=2) / sound_speed)  # (n_bins, n_channels, n_sources)
    # Normalize so each steering vector has unit norm across channels.
    steering_vector = steering_vector / np.sqrt(len(mic_indices))
    if method == 'DSBF':
        beamformer = DelaySumBeamformer(steering_vector=steering_vector)
    elif method == 'MVDR':
        beamformer = MVDRBeamformer(steering_vector=steering_vector)
    else:
        raise NotImplementedError("Not support {} beamformer".format(method))
    estimation = beamformer(mixture)
    # Log-power spectrograms of each beamformed output.
    spectrogram = np.abs(estimation)
    log_spectrogram = 10 * np.log10(spectrogram**2)
    N, F_bin, T_bin = log_spectrogram.shape
    t = np.arange(T_bin + 1)
    f = np.arange(F_bin + 1)
    for n in range(N):
        plt.figure()
        plt.pcolormesh(t, f, log_spectrogram[n], cmap='jet')
        # NOTE(review): "specrtogram" in the file name looks like a typo for
        # "spectrogram" — left unchanged since it is a runtime path.
        plt.savefig("data/Beamform/{}/specrtogram-{}.png".format(method, n), bbox_inches='tight')
        plt.close()
    estimated_signal = istft(estimation, fft_size=fft_size, hop_size=hop_size, length=T)
    print("Mixture: {}, Estimation: {}".format(mixed_signal.shape, estimated_signal.shape))
    for idx in range(n_sources):
        _estimated_signal = estimated_signal[idx]
        write_wav("data/Beamform/{}/mixture-{}_estimated-{}.wav".format(
            method, sr, idx), signal=_estimated_signal, sr=sr)
if __name__ == '__main__':
    import os
    import numpy as np
    from scipy.signal import resample_poly
    from utils.utils_audio import read_wav, write_wav
    from stft import BatchSTFT, BatchInvSTFT

    os.makedirs("data/frequency_mask", exist_ok=True)
    fft_size, hop_size = 1024, 256
    # NOTE(review): `n_basis` is unused in the visible portion — presumably
    # consumed by code that follows this chunk; confirm.
    n_basis = 4
    # Resample both speakers from 44.1 kHz to 16 kHz and save the versions.
    source1, sr = read_wav("data/man-44100.wav")
    source1 = resample_poly(source1, up=16000, down=sr)
    write_wav("data/man-16000.wav", signal=source1, sr=16000)
    T = len(source1)
    source2, sr = read_wav("data/woman-44100.wav")
    source2 = resample_poly(source2, up=16000, down=sr)
    write_wav("data/woman-16000.wav", signal=source2, sr=16000)
    # Instantaneous (additive) two-speaker mixture.
    mixture = source1 + source2
    write_wav("data/mixture-16000.wav", signal=mixture, sr=16000)
    stft = BatchSTFT(fft_size=fft_size, hop_size=hop_size)
    istft = BatchInvSTFT(fft_size=fft_size, hop_size=hop_size)
    # Add a batch dimension for the batch STFT modules.
    mixture = torch.Tensor(mixture).unsqueeze(dim=0)
    source1 = torch.Tensor(source1).unsqueeze(dim=0)
    source2 = torch.Tensor(source2).unsqueeze(dim=0)