def __call__(self, noise, clean1, clean2, face1, face2):
    """Compute the training loss for a two-speaker batch.

    The noisy mixture and both clean references are compressed, masks
    for the two speakers are estimated from the mixture spectrogram and
    the two face streams, and the loss compares the masked mixture
    against the concatenated clean targets.

    Args:
        noise: mixture spectrogram batch (array-like).
        clean1, clean2: per-speaker clean spectrogram batches.
        face1, face2: per-speaker face feature batches; a trailing
            singleton axis is appended before mask estimation.

    Returns:
        The scalar loss produced by ``evaluate_loss``.
    """
    def to_f32(a):
        return xp.asarray(a).astype(xp.float32)

    mixture = to_f32(noise)
    # The two clean references are stacked along axis 3 to match the
    # layout of the concatenated separated estimates below.
    target = xp.concatenate((to_f32(clean1), to_f32(clean2)), axis=3)
    # Append a trailing singleton axis to each face tensor.
    face_a = to_f32(face1)[:, :, :, xp.newaxis]
    face_b = to_f32(face2)[:, :, :, xp.newaxis]

    spec_mix, _ = op.compress_audio(mixture)
    spec_target, _ = op.compress_audio(target)

    mask_a, mask_b = self.estimate_mask(spec=spec_mix, face1=face_a, face2=face_b)

    # Apply each speaker's mask to the mixture, then join on axis 3.
    estimate = F.concat(
        (op.mul(mask_a, spec_mix), op.mul(mask_b, spec_mix)),
        axis=3)  # (6, 2, 301, 514)
    return evaluate_loss(self, estimate, spec_target)
def __call__(self, noise, clean):
    """Compute the training loss for the audio-only (no-face) model.

    Compresses the noisy mixture and the clean reference, estimates a
    single mask from the mixture spectrogram alone, applies it, and
    scores the result against the compressed clean target.

    Args:
        noise: mixture spectrogram batch (array-like).
        clean: clean reference spectrogram batch (array-like).

    Returns:
        The scalar loss produced by ``evaluate_loss``.
    """
    spec_mix, _ = op.compress_audio(xp.asarray(noise).astype(xp.float32))
    spec_clean, _ = op.compress_audio(xp.asarray(clean).astype(xp.float32))
    # Only the first mask output is used; the second is discarded.
    mask, _ = self.estimate_mask(spec=spec_mix)
    separated = op.mul(mask, spec_mix)
    return evaluate_loss(self, separated, spec_clean)
def predict(model):
    """Run inference on a held-out batch, print the loss, and write WAVs.

    Loads a dataset variant chosen by ``env.INPUT_FACE`` (0: audio-only,
    1: single face, otherwise: two faces), estimates separation masks,
    applies them to the compressed mixture, reports the loss against the
    first clean source, reconstructs complex spectrograms on the CPU,
    and saves one set of WAV files per batch item into
    ``$RESULT_DIR``.
    """
    print("estimate mask...")
    if env.INPUT_FACE == 0:
        # Audio-only model: no face input is passed to the mask estimator.
        # NOTE(review): this branch uses indices TRAIN+7 and TRAIN+8 where
        # the other branches use TRAIN and TRAIN+2 — confirm whether this
        # offset pattern is intentional.
        noise, clean1 = dataset.load_dataset_audio(
            list([
                env.TRAIN + 7, env.TRAIN + 1, env.TRAIN + 8, env.TRAIN + 3,
                env.TRAIN + 4, env.TRAIN + 5, env.TRAIN + 6
            ]))
        compressed_noise, _ = op.compress_audio(noise)
        mask1, mask2 = model.estimate_mask(spec=compressed_noise)
    elif env.INPUT_FACE == 1:
        # Single-face model: one face stream accompanies the mixture.
        noise, clean1, face1 = dataset.load_dataset_single(
            list([
                env.TRAIN, env.TRAIN + 1, env.TRAIN + 2, env.TRAIN + 3,
                env.TRAIN + 4, env.TRAIN + 5, env.TRAIN + 6
            ]))
        compressed_noise, _ = op.compress_audio(noise)
        mask1, mask2 = model.estimate_mask(spec=compressed_noise, face=face1)
    else:
        # Two-face model: both speakers' face streams are provided.
        noise, clean1, clean2, face1, face2 = dataset.load_dataset_double(
            list([
                env.TRAIN, env.TRAIN + 1, env.TRAIN + 2, env.TRAIN + 3,
                env.TRAIN + 4, env.TRAIN + 5, env.TRAIN + 6
            ]))
        compressed_noise, _ = op.compress_audio(noise)
        mask1, mask2 = model.estimate_mask(
            spec=compressed_noise, face1=face1, face2=face2)
    print("mul mask...")
    # Apply each estimated mask to the compressed mixture.
    compressed_separated1 = op.mul(mask1, compressed_noise)
    compressed_separated2 = op.mul(mask2, compressed_noise)
    compressed_clean1, _ = op.compress_audio(clean1)
    # Loss is reported against the first clean source only.
    loss = evaluate_loss(model, compressed_separated1, compressed_clean1)
    print(loss)
    print("reconstruct audio...")
    # Move everything to the CPU before reconstructing complex spectra.
    n = op.reconstruct_audio_complex(chainer.cuda.to_cpu(compressed_noise))
    c1 = op.reconstruct_audio_complex(chainer.cuda.to_cpu(compressed_clean1))
    # c2 = op.reconstruct_audio_complex(chainer.cuda.to_cpu(compressed_clean2))
    # The separated outputs are Chainer variables, hence ``.data``.
    y1 = op.reconstruct_audio_complex(
        chainer.cuda.to_cpu(compressed_separated1.data))
    y2 = op.reconstruct_audio_complex(
        chainer.cuda.to_cpu(compressed_separated2.data))
    print("save files...")
    # One WAV set per batch item; axis 2 indexes the batch here.
    for i in range(n.shape[2]):
        print("{0}/{1}".format(i + 1, n.shape[2]))
        util.istft_and_save(
            "{}/{}-synthesis.wav".format(os.environ['RESULT_DIR'], i),
            n[:, :, i])
        util.istft_and_save(
            "{}/{}-clean1.wav".format(os.environ['RESULT_DIR'], i),
            c1[:, :, i])
        # util.istft_and_save("{}/{}-clean2.wav".format(os.environ['RESULT_DIR'], i), c2[:, :, i])
        util.istft_and_save(
            "{}/{}-separated1.wav".format(os.environ['RESULT_DIR'], i),
            y1[:, :, i])
        util.istft_and_save(
            "{}/{}-separated2.wav".format(os.environ['RESULT_DIR'], i),
            y2[:, :, i])