def run(args):
    """Separate every utterance with AuxIVA and write one wave file per source.

    Reads ``frame_length``, ``frame_shift``, ``window``, ``center``,
    ``wav_scp``, ``epochs``, ``dst_dir`` and ``fs`` from ``args``. Each
    separated source is written to ``<dst_dir>/<key>.SRC<n>.wav`` with
    ``n`` counted from 1.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        transpose=True,  # F x T instead of T x F
    )
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    for utt_key, stft_mat in reader:
        logger.info("Processing utterance {}...".format(utt_key))
        sources = auxiva(stft_mat, args.epochs)
        num_sources = sources.shape[0]
        # Sources are numbered from 1 in the output file names.
        for src_no in range(1, num_sources + 1):
            samps = istft(
                sources[src_no - 1],
                **stft_kwargs,
                norm=reader.samp_norm(utt_key))
            wav_name = "{}.SRC{:d}.wav".format(utt_key, src_no)
            write_wav(os.path.join(args.dst_dir, wav_name), samps, fs=args.fs)

    logger.info("Processed {:d} utterances".format(len(reader)))
def run(args):
    """Apply stored masks to spectrograms and write the resynthesised audio.

    Utterances without an entry in the mask reader are skipped. The mask is
    loaded with ``NumpyReader`` when ``args.numpy`` is set and with
    ``ScriptReader`` otherwise, and is transposed first when
    ``args.transpose`` is set.

    Raises:
        ValueError: if the mask and spectrogram shapes differ.
    """
    # shape: T x F, complex
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        masks = NumpyReader(args.mask_scp)
    else:
        masks = ScriptReader(args.mask_scp)

    processed = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for utt_key, specs in reader:
            if utt_key not in masks:
                continue
            processed += 1
            mask = masks[utt_key]
            if args.transpose:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(utt_key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            # Only look up the original length when the caller wants it kept.
            nsamps = None
            if args.keep_length:
                nsamps = reader.nsamps(utt_key)
            norm = reader.samp_norm(utt_key)
            masked_specs = specs * mask
            samps = istft(
                masked_specs, **stft_kwargs, norm=norm, nsamps=nsamps)
            writer.write(utt_key, samps)

    logger.info("Processed {:d} utterances over {:d}".format(
        processed, len(reader)))
def run(args):
    """Mask spectrograms, optionally borrow a reference phase, and write audio.

    The mask reader class is chosen by ``args.fmt`` (``"numpy"`` or
    ``"kaldi"``). When ``args.phase_ref`` is set, the masked magnitude is
    combined with the phase of the matching entry from that second
    spectrogram reader; otherwise the mask is applied to the complex
    spectrogram directly. Utterances without a mask are skipped.

    Raises:
        ValueError: if the mask and spectrogram shapes still differ after
            the orientation check.
    """
    # shape: T x F, complex
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )
    reader = SpectrogramReader(
        args.wav_scp,
        **stft_kwargs,
        round_power_of_two=args.round_power_of_two)

    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info("Using phase reference from {}".format(args.phase_ref))
    else:
        phase_reader = None

    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    masks = reader_by_fmt[args.fmt](args.mask_scp)

    processed = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        # specs: T x F
        for utt_key, specs in reader:
            if utt_key not in masks:
                continue
            processed += 1
            mask = masks[utt_key]
            # Make sure the mask is T x F: flip it when its first axis
            # matches the number of frequency bins.
            # NOTE(review): a square mask (T == F) is always transposed here,
            # whatever its original layout - confirm this is acceptable.
            _, num_bins = specs.shape
            if mask.shape[0] == num_bins:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(utt_key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            nsamps = None
            if args.keep_length:
                nsamps = reader.nsamps(utt_key)
            norm = reader.samp_norm(utt_key)
            if phase_reader is None:
                masked_specs = specs * mask
            else:
                # Use the phase from the reference, keeping only the
                # masked magnitude of the input.
                ref_angle = np.angle(phase_reader[utt_key])
                unit_phase = np.exp(ref_angle * 1j)
                masked_specs = np.abs(specs) * mask * unit_phase
            samps = istft(
                masked_specs, **stft_kwargs, norm=norm, nsamps=nsamps)
            writer.write(utt_key, samps)

    logger.info("Processed {:d} utterances over {:d}".format(
        processed, len(reader)))
def run(args):
    """Run a mask-driven beamformer over every utterance that has a mask.

    ``args.beamformer`` selects one of ``"mvdr"``, ``"gevd"`` or ``"pmwf"``.
    The speech mask is transposed first when ``args.trans`` is set, and the
    beamformer output is additionally multiplied by the transposed mask when
    ``args.mask`` is set. The result is passed to ``istft`` together with the
    destination path ``<dst_dir>/<key>.wav``.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        masks = NumpyReader(args.mask_scp)
    else:
        masks = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # Every supported beamformer is instantiated up front; one is selected.
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins),
    }
    beamformer = supported_beamformer[args.beamformer]

    processed = 0
    for utt_key, stft_mat in reader:
        if utt_key not in masks:
            continue
        processed += 1
        norm = reader.samp_norm(utt_key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            utt_key, norm))
        # prefer T x F
        speech_mask = masks[utt_key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(
            speech_mask, stft_mat, normalize=args.postf)
        # Optionally mask the beamformer output as well.
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        dst_path = os.path.join(args.dst_dir, '{}.wav'.format(utt_key))
        # This istft variant receives the destination path as its first
        # argument; presumably it writes the wave file itself - confirm.
        istft(
            dst_path,
            stft_enh,
            norm=norm,
            fs=args.samp_freq,
            **stft_kwargs)

    logger.info("Processed {:d} utterances out of {:d}".format(
        processed, len(reader)))