def run(args):
    """Mask each spectrogram and resynthesize it through ``istft``.

    Every utterance of ``args.wav_scp`` that has a matching entry in
    ``args.mask_scp`` is multiplied element-wise by its mask; the product
    is passed to ``istft`` together with the destination path
    ``<args.dst_dir>/<key>.wav``. Utterances without a mask are skipped.

    Args:
        args: parsed options; reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``mask_scp``, ``numpy``,
            ``transpose``, ``keep_length``, ``samp_freq`` and ``dst_dir``.

    Raises:
        ValueError: if a mask and its spectrogram have different shapes.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    # Keep the rate a plain scalar: a trailing comma here used to turn it
    # into a 1-tuple before it reached istft().
    fs = args.samp_freq
    num_utts = 0
    for key, specs in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        mask = mask_reader[key]
        if args.transpose:
            mask = np.transpose(mask)
        logger.info("Processing utterance {}...".format(key))
        if mask.shape != specs.shape:
            raise ValueError(
                "Dimention mismatch between mask and spectrogram"
                "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                .format(mask.shape, specs.shape))
        # None lets istft() decide the output length on its own
        nsamps = spectrogram_reader.nsamps(key) if args.keep_length else None
        istft(os.path.join(args.dst_dir, "{}.wav".format(key)),
              specs * mask,
              **stft_kwargs,
              fs=fs,
              nsamps=nsamps)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """Run WPE on every utterance and hand each channel to ``istft``.

    Each spectrogram read from ``args.wav_scp`` is permuted to the layout
    expected by ``wpe``, processed, permuted back, and then every channel
    is passed to ``istft`` with the destination path
    ``<args.dst_dir>/<key>.CH<n>.wav`` (channels are numbered from 1).

    Args:
        args: parsed options; reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``taps``, ``delay``, ``iters``,
            ``context``, ``wav_scp``, ``dst_dir`` and ``samp_freq``.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # False to stay comparable with Kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "taps": args.taps,
        "delay": args.delay,
        "iters": args.iters,
        "psd_context": args.context
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(args.dst_dir, exist_ok=True)

    for key, reverbed in spectrogram_reader:
        # N x T x F => F x N x T
        reverbed = np.transpose(reverbed, [2, 0, 1])
        # F x N x T
        dereverb = wpe(reverbed, **wpe_kwargs)
        # F x N x T => N x T x F
        dereverb = np.transpose(dereverb, [1, 2, 0])
        # one output per channel, numbered from 1
        for chid, channel in enumerate(dereverb, start=1):
            chpath = os.path.join(args.dst_dir,
                                  "{}.CH{:d}.wav".format(key, chid))
            istft(chpath, channel, **stft_kwargs, fs=args.samp_freq)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Separate utterances with TF-masks, optionally borrowing a phase.

    Utterances from ``args.wav_scp`` that have a mask in ``args.mask_scp``
    are masked, inverted with ``istft`` and written through a
    ``WaveWriter`` rooted at ``args.dst_dir``. When ``args.phase_ref`` is
    set, the phase of the matching reference spectrogram replaces the
    phase of the input.

    Args:
        args: parsed options; reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``round_power_of_two``, ``wav_scp``,
            ``phase_ref``, ``fmt``, ``mask_scp``, ``dst_dir``,
            ``samp_freq`` and ``keep_length``.

    Raises:
        ValueError: if a mask and its spectrogram differ in shape after
            the optional transpose.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        **stft_kwargs,
        round_power_of_two=args.round_power_of_two)

    phase_reader = None
    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info("Using phase reference from {}".format(args.phase_ref))

    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = reader_by_fmt[args.fmt](args.mask_scp)

    done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in spectrogram_reader:
            # specs: T x F
            if key not in mask_reader:
                continue
            done += 1
            tf_mask = mask_reader[key]
            # make sure the mask is T x F
            _, num_bins = specs.shape
            if tf_mask.shape[0] == num_bins:
                tf_mask = np.transpose(tf_mask)
            logger.info("Processing utterance {}...".format(key))
            if tf_mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(tf_mask.shape, specs.shape))
            if args.keep_length:
                nsamps = spectrogram_reader.nsamps(key)
            else:
                nsamps = None
            norm = spectrogram_reader.samp_norm(key)
            if phase_reader is None:
                masked = specs * tf_mask
            else:
                # keep the masked magnitude, take the phase from the reference
                ref_angle = np.angle(phase_reader[key])
                masked = np.abs(specs) * tf_mask * np.exp(ref_angle * 1j)
            samps = istft(masked, **stft_kwargs, norm=norm, nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        done, len(spectrogram_reader)))
def run(args):
    """Separate every utterance with ``auxiva`` and write one file per source.

    Each spectrogram read from ``args.wav_scp`` is passed to ``auxiva``;
    every item along the first axis of the result is inverted with
    ``istft`` and written with ``write_wav`` to
    ``<args.dst_dir>/<key>.SRC<n>.wav`` (sources are numbered from 1).

    Args:
        args: parsed options; reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``epochs``, ``dst_dir``
            and ``fs``.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": True  # F x T instead of T x F
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    for key, spectrogram in spectrogram_reader:
        logger.info("Processing utterance {}...".format(key))
        separated = auxiva(spectrogram, args.epochs)
        # The norm depends only on the utterance, so look it up once
        # instead of once per separated source.
        norm = spectrogram_reader.samp_norm(key)
        for idx, source in enumerate(separated, start=1):
            samps = istft(source, **stft_kwargs, norm=norm)
            write_wav(os.path.join(args.dst_dir,
                                   "{}.SRC{:d}.wav".format(key, idx)),
                      samps,
                      fs=args.fs)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Apply a ``DSBeamformer`` steered to ``args.doa`` to every utterance.

    The array topology is parsed from the comma-separated
    ``args.linear_topo``. Each spectrogram from ``args.wav_scp`` is run
    through the beamformer, inverted with ``istft`` and written through a
    ``WaveWriter`` rooted at ``args.dst_dir``.

    Args:
        args: parsed options; reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``linear_topo``, ``doa``, ``wav_scp``,
            ``round_power_of_two``, ``dst_dir``, ``fs`` and ``speed``.

    Raises:
        RuntimeError: if the remapped DoA is below 0 or above 180.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    topo = [float(position) for position in args.linear_topo.split(",")]

    # Non-positive angles are shifted by 180 before the range check.
    # NOTE(review): this also maps a DoA of exactly 0 to 180 - confirm intended.
    if args.doa > 0:
        doa = args.doa
    else:
        doa = 180 + args.doa
    if doa < 0 or doa > 180:
        raise RuntimeError("Illegal value for DoA: {:.2f}".format(args.doa))

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    beamformer = DSBeamformer(topo)
    logger.info("Initialize {:d} channel DSBeamformer".format(len(topo)))

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in spectrogram_reader:
            stft_enh = beamformer.run(doa,
                                      stft_src,
                                      c=args.speed,
                                      sample_rate=args.fs)
            samps = istft(stft_enh,
                          **stft_kwargs,
                          power=spectrogram_reader.power(key))
            writer.write(key, samps)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Apply a ``FixedBeamformer`` with weights loaded from a ``.mat`` file.

    The weights are read with ``loadmat`` from ``args.weights`` under
    ``args.weight_key``. Each spectrogram from ``args.wav_scp`` is run
    through the beamformer, inverted with ``istft`` and written through a
    ``WaveWriter`` rooted at ``args.dump_dir``.

    Args:
        args: parsed options; reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``wav_scp``, ``round_power_of_two``,
            ``weights``, ``weight_key`` and ``dump_dir``.

    Raises:
        KeyError: if ``args.weight_key`` is not present in the loaded file.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    mat_content = loadmat(args.weights)
    if args.weight_key not in mat_content:
        raise KeyError("Weight key error: no \'{}\' in {}".format(
            args.weight_key, args.weights))
    beamformer = FixedBeamformer(mat_content[args.weight_key])

    with WaveWriter(args.dump_dir) as writer:
        for key, stft_mat in spectrogram_reader:
            logger.info("Processing utterance {}...".format(key))
            enhanced = beamformer.run(stft_mat)
            # no norm/power is passed, so the output is not normalized
            writer.write(key, istft(enhanced, **stft_kwargs))
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Mask each spectrogram, invert it and write it via ``WaveWriter``.

    Utterances from ``args.wav_scp`` that have a mask in ``args.mask_scp``
    are multiplied element-wise by the mask, inverted with ``istft`` and
    written to a ``WaveWriter`` rooted at ``args.dst_dir``. Utterances
    without a mask are skipped.

    Args:
        args: parsed options; reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``mask_scp``, ``numpy``,
            ``transpose``, ``keep_length``, ``samp_freq`` and ``dst_dir``.

    Raises:
        ValueError: if a mask and its spectrogram have different shapes.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in spectrogram_reader:
            if key not in mask_reader:
                continue
            done += 1
            tf_mask = mask_reader[key]
            if args.transpose:
                tf_mask = np.transpose(tf_mask)
            logger.info("Processing utterance {}...".format(key))
            if tf_mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(tf_mask.shape, specs.shape))
            if args.keep_length:
                nsamps = spectrogram_reader.nsamps(key)
            else:
                nsamps = None
            norm = spectrogram_reader.samp_norm(key)
            samps = istft(specs * tf_mask,
                          **stft_kwargs,
                          norm=norm,
                          nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        done, len(spectrogram_reader)))
def run(args):
    """Run a mask-driven beamformer (mvdr/gevd/pmwf) over every utterance.

    Utterances from ``args.wav_scp`` that have a speech mask in
    ``args.mask_scp`` are enhanced with the beamformer selected by
    ``args.beamformer``; the result is passed to ``istft`` with the
    destination path ``<args.dst_dir>/<key>.wav``. Utterances without a
    mask are skipped.

    Args:
        args: parsed options; reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``mask_scp``, ``numpy``,
            ``beamformer``, ``trans``, ``postf``, ``mask``, ``dst_dir``
            and ``samp_freq``.

    Raises:
        KeyError: if ``args.beamformer`` is not "mvdr", "gevd" or "pmwf".
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # False to stay comparable with Kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    candidates = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins)
    }
    beamformer = candidates[args.beamformer]

    done = 0
    for key, stft_mat in spectrogram_reader:
        if key not in mask_reader:
            continue
        done += 1
        norm = spectrogram_reader.samp_norm(key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            key, norm))
        # prefer T x F
        speech_mask = mask_reader[key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask,
                                  stft_mat,
                                  normalize=args.postf)
        # optionally mask the beamformer output as well
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        out_path = os.path.join(args.dst_dir, '{}.wav'.format(key))
        istft(out_path,
              stft_enh,
              norm=norm,
              fs=args.samp_freq,
              **stft_kwargs)
    logger.info("Processed {:d} utterances out of {:d}".format(
        done, len(spectrogram_reader)))
def run(args):
    """Compute per-speaker masks from references and apply them to mixtures.

    For each mixture in ``args.mix_scp`` whose key is present in every
    reference list of ``args.ref_scp`` (comma separated), ``compute_mask``
    produces one mask per speaker. Each masked mixture is passed to
    ``istft`` with the destination path
    ``<args.dump_dir>/<key>.s<n>.wav`` (speakers are numbered from 1).

    Args:
        args: parsed options; reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``mask``, ``mix_scp``, ``ref_scp``,
            ``keep_length``, ``dump_dir`` and ``fs``.
    """
    # return complex result
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center
    }
    logger.info("Using mask: {}".format(args.mask.upper()))
    mixture_reader = SpectrogramReader(args.mix_scp, **stft_kwargs)

    ref_scps = args.ref_scp.split(",")
    logger.info("Number of speakers: {:d}".format(len(ref_scps)))
    targets_reader = []
    for scp in ref_scps:
        targets_reader.append(SpectrogramReader(scp, **stft_kwargs))

    done = 0
    for key, mixture in tqdm(mixture_reader):
        if args.keep_length:
            nsamps = mixture_reader.nsamps(key)
        else:
            nsamps = None
        # every speaker needs a reference for this utterance
        if any(key not in reader for reader in targets_reader):
            logger.info("Skip utterance {}, missing targets".format(key))
            continue
        done += 1
        references = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, references, args.mask)
        for spk, mask in enumerate(spk_masks, start=1):
            out_path = os.path.join(args.dump_dir,
                                    '{}.s{}.wav'.format(key, spk))
            istft(out_path,
                  mixture * mask,
                  **stft_kwargs,
                  fs=args.fs,
                  nsamps=nsamps)
    logger.info("Processed {} utterance!".format(done))
def run(args):
    """Apply a ``FixedBeamformer`` and hand each result to ``istft``.

    The weights are read with ``loadmat`` from ``args.weights`` under
    ``args.weight_key``. Each spectrogram from ``args.wav_scp`` is run
    through the beamformer and passed to ``istft`` with the destination
    path ``<args.dst_dir>/<key>.wav``.

    Args:
        args: parsed options; reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``weights``,
            ``weight_key`` and ``dst_dir``.

    Raises:
        KeyError: if ``args.weight_key`` is not present in the loaded file.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    mat_content = loadmat(args.weights)
    if args.weight_key not in mat_content:
        raise KeyError("Weight key error: no \'{}\' in {}".format(
            args.weight_key, args.weights))
    beamformer = FixedBeamformer(mat_content[args.weight_key])

    # stays 0 when the reader yields nothing
    count = 0
    for count, (key, stft_mat) in enumerate(spectrogram_reader, start=1):
        logger.info("Processing utterance {}".format(key))
        enhanced = beamformer.run(stft_mat)
        # no norm is passed, so the output is not normalized
        out_path = os.path.join(args.dst_dir, '{}.wav'.format(key))
        istft(out_path, enhanced, **stft_kwargs)
    logger.info("Processed {} utterances".format(count))
def run(args):
    """Turn spectral features back into waveforms.

    Features are read from ``args.feat_scp``. They are optionally mapped
    through ``exp`` (``args.apply_log``) and ``sqrt`` (``args.apply_pow``),
    then inverted either with ``griffin_lim`` or, when ``args.phase_ref``
    is set, with ``istft`` using the phase of the reference spectrogram.
    Results are written to a ``WaveWriter`` rooted at ``args.dump_dir``.

    Args:
        args: parsed options; reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``fmt``, ``feat_scp``, ``phase_ref``,
            ``round_power_of_two``, ``dump_dir``, ``sr``, ``normalize``,
            ``apply_log``, ``apply_pow`` and ``epoches``.

    Raises:
        KeyError: if a phase reference is in use and lacks an utterance key.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
    }
    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    feature_reader = reader_by_fmt[args.fmt](args.feat_scp)

    phase_reader = None
    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info(f"Using phase reference from {args.phase_ref}")

    with WaveWriter(args.dump_dir, fs=args.sr,
                    normalize=args.normalize) as writer:
        for key, spec in feature_reader:
            logger.info(f"Processing utterance {key}...")
            # log domain -> linear domain
            if args.apply_log:
                spec = np.exp(spec)
            # power spectrum -> magnitude spectrum
            if args.apply_pow:
                spec = np.sqrt(spec)
            if phase_reader is not None:
                if key not in phase_reader:
                    raise KeyError(f"Missing key {key} in phase reader")
                ref = phase_reader[key]
                # a 3-D reference contributes only its first item
                if ref.ndim == 3:
                    ref = ref[0]
                phase = np.exp(np.angle(ref) * 1j)
                samps = istft(spec * phase, **stft_kwargs, norm=0.8)
            else:
                # no reference phase available: use griffin lim
                samps = griffin_lim(spec,
                                    epoches=args.epoches,
                                    transpose=True,
                                    norm=0.8,
                                    **stft_kwargs)
            writer.write(key, samps)
    logger.info(f"Processed {len(feature_reader)} utterance done")
def run(args):
    """Run WPE on every utterance and write multi-channel audio.

    Each spectrogram read from ``args.wav_scp`` is permuted to the layout
    expected by ``wpe``, processed, permuted back, and inverted channel by
    channel with ``istft``. The stacked channels are written through a
    ``WaveWriter`` rooted at ``args.dst_dir``. Utterances for which ``wpe``
    raises ``numpy.linalg.LinAlgError`` are logged and skipped.

    Args:
        args: parsed options; reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``num_iters``, ``context``, ``taps``,
            ``delay``, ``wav_scp``, ``round_power_of_two``, ``dst_dir``
            and ``samp_fs``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False to stay comparable with Kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "num_iters": args.num_iters,
        "context": args.context,
        "taps": args.taps,
        "delay": args.delay
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_fs) as writer:
        for key, reverbed in spectrogram_reader:
            logger.info("Processing utt {}...".format(key))
            # N x T x F => F x N x T
            reverbed = np.transpose(reverbed, (2, 0, 1))
            try:
                # F x N x T
                dereverb = wpe(reverbed, **wpe_kwargs)
            except np.linalg.LinAlgError:
                # Logger.warn is a deprecated alias; use warning() instead
                logger.warning(
                    "{}: Failed cause LinAlgError in wpe".format(key))
                continue
            # F x N x T => N x T x F
            dereverb = np.transpose(dereverb, (1, 2, 0))
            # dump multi-channel
            samps = np.stack(
                [istft(spectra, **stft_kwargs) for spectra in dereverb])
            writer.write(key, samps)
            # report progress regularly because processing is slow
            num_done += 1
            if not num_done % 100:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Run an offline or online mask-driven beamformer over every utterance.

    ``args.chunk_size <= 0`` selects an offline beamformer (mvdr, gevd or
    pmwf); otherwise an online one (mvdr or gevd) is used. Utterances from
    ``args.wav_scp`` that have a speech mask in ``args.mask_scp`` are
    enhanced, inverted with ``istft`` and written through a ``WaveWriter``
    rooted at ``args.dst_dir``. Utterances without a mask are skipped.

    Args:
        args: parsed options; reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``wav_scp``, ``round_power_of_two``,
            ``fmt``, ``mask_scp``, ``channels``, ``alpha``, ``chunk_size``,
            ``beamformer``, ``dst_dir``, ``samp_freq``, ``ban`` and
            ``mask``.

    Raises:
        RuntimeError: if ``args.chunk_size`` is positive but below 32.
        KeyError: if ``args.beamformer`` is unknown for the selected mode.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False to stay comparable with Kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = reader_by_fmt[args.fmt](args.mask_scp)

    num_bins = nfft(args.frame_len) // 2 + 1
    offline_choices = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins)
    }
    online_choices = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }

    if args.chunk_size <= 0:
        online = False
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = offline_choices[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = online_choices[args.beamformer]
        online = True
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))

    done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in mask_reader:
                continue
            done += 1
            power = spectrogram_reader.power(key)
            logger.info(
                "Processing utterance {}, signal power {:.2f}...".format(
                    key, 10 * np.log10(power + 1e-5)))
            # prefer T x F; values are capped at 1
            speech_mask = np.minimum(mask_reader[key], 1)
            # make sure speech_mask has shape T x F
            _, freq_bins, _ = stft_mat.shape
            # transpose when it comes in as F x T
            if speech_mask.shape[0] == freq_bins:
                speech_mask = np.transpose(speech_mask)
            # stft_enh, stft_mat: (N) x F x T
            if online:
                stft_enh = do_online_beamform(beamformer, speech_mask,
                                              stft_mat, args)
            else:
                stft_enh = beamformer.run(speech_mask,
                                          stft_mat,
                                          normalize=args.ban)
            # optionally mask the beamformer output as well
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            writer.write(key, istft(stft_enh, power=power, **stft_kwargs))
    logger.info("Processed {:d} utterances out of {:d}".format(
        done, len(spectrogram_reader)))
def run(args):
    """Run a beamformer driven by target (and optional interfering) masks.

    ``args.chunk_size <= 0`` selects an offline beamformer (mvdr, gevd,
    pmwf-0 or pmwf-1); otherwise an online one (mvdr or gevd) is used.
    Utterances from ``args.wav_scp`` that have a target mask in
    ``args.tgt_mask`` are enhanced, inverted with ``istft`` and written
    through a ``WaveWriter`` rooted at ``args.dst_dir``. Utterances for
    which beamforming raises ``numpy.linalg.LinAlgError`` are logged and
    skipped.

    Args:
        args: parsed options; reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``wav_scp``, ``round_power_of_two``,
            ``fmt``, ``tgt_mask``, ``itf_mask``, ``pmwf_ref``,
            ``channels``, ``alpha``, ``chunk_size``, ``beamformer``,
            ``dst_dir``, ``samp_freq``, ``vad_proportion``, ``ban`` and
            ``mask``.

    Raises:
        RuntimeError: if ``args.chunk_size`` is positive but below 32.
        KeyError: if ``args.beamformer`` is unknown for the selected mode.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False to stay comparable with Kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # Read the interfering masks from their own script; this reader used to
    # be built from args.tgt_mask, which reused the target masks as noise.
    itf_mask_reader = MaskReader[args.fmt](
        args.itf_mask) if args.itf_mask else None
    if itf_mask_reader is not None:
        logger.info(f"Using interfering masks from {args.itf_mask}")

    online = False
    num_bins = nfft(args.frame_len) // 2 + 1
    # a negative pmwf_ref means "no explicit reference channel"
    ref_channel = args.pmwf_ref if args.pmwf_ref >= 0 else None
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf-0": PmwfBeamformer(num_bins, beta=0, ref_channel=ref_channel),
        "pmwf-1": PmwfBeamformer(num_bins, beta=1, ref_channel=ref_channel)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in tgt_mask_reader:
                continue
            power = spectrogram_reader.power(key)
            logger.info(
                f"Processing utterance {key}, " +
                f"signal power {10 * np.log10(power + 1e-5):.2f}...")
            # prefer T x F
            speech_mask = tgt_mask_reader[key]
            if itf_mask_reader is None:
                # cap values at 1 when there is no interfering mask
                speech_mask = np.minimum(speech_mask, 1)
                interf_mask = None
            else:
                interf_mask = itf_mask_reader[key]
            # make sure speech_mask has shape T x F
            _, F, _ = stft_mat.shape
            # transpose both masks when the target comes in as F x T
            if speech_mask.shape[0] == F:
                speech_mask = np.transpose(speech_mask)
                if interf_mask is not None:
                    interf_mask = np.transpose(interf_mask)
            if 0.5 < args.vad_proportion < 1:
                vad_mask, N = compute_vad_masks(stft_mat[0],
                                                args.vad_proportion)
                logger.info(f"Filtering {N} TF-masks...")
                # positions selected by vad_mask are set to 1.0e-4
                speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                if interf_mask is not None:
                    interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
            # stft_enh, stft_mat: (N) x F x T
            try:
                if not online:
                    stft_enh = beamformer.run(speech_mask,
                                              stft_mat,
                                              noise_mask=interf_mask,
                                              normalize=args.ban)
                else:
                    stft_enh = do_online_beamform(beamformer, speech_mask,
                                                  interf_mask, stft_mat,
                                                  args)
            except np.linalg.LinAlgError:
                logger.error(f"Raise linalg error: {key}")
                continue
            # optionally mask the beamformer output as well
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = istft(stft_enh, power=power, **stft_kwargs)
            writer.write(key, samps)
            # count only utterances that were actually written
            num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")