def compute_spatial_feats(args, S):
    """Compute the spatial feature selected by ``args.type`` from ``S``.

    Dispatch on ``args.type``:

    * ``"srp"``: call ``srp_phat_linear`` with the topology parsed from the
      comma-separated ``args.linear_topo``.
    * ``"ipd"``: call ``ipd`` once per channel pair listed in
      ``args.ipd_index`` (pairs separated by ``;``, the two indexes of a
      pair by ``,``) and stack the results with ``np.hstack``.
    * anything else: call ``msc`` with ``context=args.msc_ctx``.

    Args:
        args: parsed command line options (attributes read depend on the
            selected feature type).
        S: STFT array; in ``"ipd"`` mode it is indexed by channel on axis 0
            and must have at least 3 dimensions.

    Returns:
        Whatever the selected feature function returns; for ``"ipd"`` the
        horizontally stacked per-pair matrices.

    Raises:
        ValueError: in ``"ipd"`` mode, if ``S`` has fewer than 3 dimensions
            or a pair does not consist of exactly two indexes.
        RuntimeError: in ``"ipd"`` mode, if a pair references a channel
            index that is not available in ``S``.
    """
    if args.type == "srp":
        if args.round_power_of_two:
            num_ffts = nfft(args.frame_len)
        else:
            num_ffts = args.frame_len
        srp_kwargs = {
            "sample_frequency": args.samp_frequency,
            "num_doa": args.num_doa,
            "num_bins": num_ffts // 2 + 1,
            "samp_doa": not args.samp_tdoa,
        }
        linear_topo = list(map(float, args.linear_topo.split(",")))
        return srp_phat_linear(S, linear_topo, **srp_kwargs)

    if args.type == "ipd":
        if S.ndim < 3:
            raise ValueError("Only one-channel STFT available")
        num_channels = S.shape[0]
        ipd_list = []
        for pair in args.ipd_index.split(";"):
            indexes = list(map(int, pair.split(",")))
            if len(indexes) != 2:
                raise ValueError(
                    "Invalid --ipd.index configuration detected: {}".format(
                        args.ipd_index))
            left, right = indexes
            # Valid channel indexes are 0 .. num_channels - 1, so an index
            # equal to num_channels is already out of range. Check both
            # members of the pair, not only the second one.
            for channel in (left, right):
                if channel >= num_channels:
                    raise RuntimeError(
                        "Could not access channel {:d}".format(channel))
            ipd_list.append(
                ipd(S[left], S[right], cos=args.ipd_cos, sin=args.ipd_sin))
        # concat along frequency axis
        return np.hstack(ipd_list)

    return msc(S, context=args.msc_ctx)
def run(args):
    """Rebuild waveforms from stored features with Griffin-Lim.

    Features are read from ``args.feat_scp`` through ``ScriptReader``. Each
    matrix is optionally mapped back from the log domain
    (``args.apply_log``), from mel fbank to linear spectrum (``args.fbank``,
    using the pseudo-inverse of the mel filter bank) and from power to
    magnitude (``args.apply_pow``). It is then passed to ``griffin_lim`` and
    written through ``WaveWriter`` into ``args.dump_dir``.

    Args:
        args: parsed command line options.

    Raises:
        RuntimeError: if the number of columns of a spectrum does not match
            the FFT size derived from ``args.frame_len``.
    """
    griffin_lim_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": True,
        "epochs": args.epochs,
    }
    feature_reader = ScriptReader(args.feat_scp)

    # The original read the frame length as both ``args.frame_len`` and
    # ``args.frame_length``; use the name the Griffin-Lim options already
    # rely on, and compute the (loop-invariant) FFT size once.
    n_fft = nfft(args.frame_len)

    mel_to_linear = None
    if args.fbank:
        mel_kwargs = {
            "n_mels": args.num_bins,
            "fmin": args.min_freq,
            "fmax": args.max_freq,
            "htk": True,
        }
        # N x F
        mel_weights = audio_lib.filters.mel(args.samp_freq, n_fft,
                                            **mel_kwargs)
        # pinv is F x N; transposed once here so that T x N features can be
        # right-multiplied inside the loop
        mel_to_linear = np.transpose(np.linalg.pinv(mel_weights))

    with WaveWriter(args.dump_dir,
                    fs=args.samp_freq,
                    normalize=args.normalize) as writer:
        for key, spec in feature_reader:
            # if log, transform to linear
            if args.apply_log:
                spec = np.exp(spec)
            # convert fbank to spectrum (feat: T x N)
            if args.fbank:
                spec = np.maximum(spec @ mel_to_linear, EPSILON)
            # if power spectrum, transform to magnitude spectrum
            if args.apply_pow:
                spec = np.sqrt(spec)
            if spec.shape[1] - 1 != n_fft // 2:
                raise RuntimeError("Seems missing --fbank options?")
            # griffin lim
            samps = griffin_lim(spec, **griffin_lim_kwargs)
            writer.write(key, samps)
    logger.info("Processed {:d} utterance done".format(len(feature_reader)))
def run(args):
    """Train per-utterance cacgmm masks and dump them as ``.npy`` files.

    For every utterance yielded by ``SpectrogramReader`` that does not yet
    have ``<key>.npy`` in ``args.dst_dir``, a ``CacgmmTrainer`` is run for
    ``args.num_epoches`` epochs, optionally initialised from an external
    mask (``args.init_mask``). Unless ``args.cgmm_init`` is set with exactly
    two classes, the masks are passed through the pb_bss permutation
    aligner before being written as ``float32``.

    Utterances whose training raises ``np.linalg.LinAlgError`` are logged
    and skipped.

    Args:
        args: parsed command line options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = None
    if args.init_mask:
        init_mask_reader = MaskReader[args.fmt](args.init_mask)

    if args.round_power_of_two:
        n_fft = nfft(args.frame_len)
    else:
        n_fft = args.frame_len
    # now use pb_bss
    pb_perm_solver = load_module(pb_bss_align_url)
    aligner = pb_perm_solver.DHTVPermutationAlignment.from_stft_size(n_fft)

    # alignment is skipped only for the cgmm-initialised two-class case
    need_align = not args.cgmm_init or args.num_classes != 2

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            # K x F x T
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external mask to initialize cacgmm")
            # stft: N x F x T
            trainer = CacgmmTrainer(stft,
                                    args.num_classes,
                                    gamma=init_mask,
                                    cgmm_init=args.cgmm_init)
            try:
                # EM progress
                masks = trainer.train(args.num_epoches)
                # align if needed
                if need_align:
                    masks = aligner(masks)
                    logger.info("Permutation align done for each frequency")
            except np.linalg.LinAlgError:
                # Logger.warn is deprecated (removed in Python 3.13)
                logger.warning(f"Training utterance {key} ... Failed")
            else:
                num_done += 1
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Apply a mask-based beamformer to every utterance and write wav files.

    The STFT of each utterance in ``args.wav_scp`` is combined with the
    mask stored under the same key in ``args.mask_scp``. The beamformer
    named by ``args.beamformer`` produces the enhanced STFT, which is
    optionally multiplied by the transposed mask (``args.mask``) and handed
    to ``istft`` to be written as ``<key>.wav`` under ``args.dst_dir``.
    Utterances without a mask are silently skipped.

    Args:
        args: parsed command line options.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # all candidates are instantiated up front, then one is picked by name
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins),
    }
    beamformer = supported_beamformer[args.beamformer]

    num_utts = 0
    for key, stft_mat in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        norm = spectrogram_reader.samp_norm(key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            key, norm))
        # prefer T x F
        speech_mask = mask_reader[key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask,
                                  stft_mat,
                                  normalize=args.postf)
        # masking beamformer output if necessary
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        wav_path = os.path.join(args.dst_dir, '{}.wav'.format(key))
        istft(wav_path, stft_enh, norm=norm, fs=args.samp_freq, **stft_kwargs)

    logger.info("Processed {:d} utterances out of {:d}".format(
        num_utts, len(spectrogram_reader)))
def run(args):
    """Compute averaged ``gcc_phat_diag`` features per utterance.

    ``args.diag_pair`` lists channel pairs as ``i,j;k,l;...``. For every
    utterance read from ``args.wav_scp`` the ``gcc_phat_diag`` result of
    each pair is averaged over all pairs and written through
    ``ArchiveWriter`` to ``args.srp_ark`` / ``args.scp``.

    Args:
        args: parsed command line options.

    Raises:
        RuntimeError: if no channel pair is configured, or if the averaged
            matrix of an utterance contains NaN values.
        ValueError: if a pair contains a non-integer token.
    """
    # Empty segments (e.g. a trailing ";") are ignored, so an empty
    # configuration reaches the explicit check below instead of failing
    # inside int("").
    srp_pair = [
        tuple(map(int, p.split(",")))
        for p in args.diag_pair.split(";") if p.strip()
    ]
    if not srp_pair:
        # report the option that was actually parsed (was: args.pair)
        raise RuntimeError("Bad configurations with --pair {}".format(
            args.diag_pair))
    logger.info("Compute gcc with {}".format(srp_pair))

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True,  # T x F
    }
    if args.round_power_of_two:
        num_ffts = nfft(args.frame_len)
    else:
        num_ffts = args.frame_len
    num_bins = num_ffts // 2 + 1

    num_done = 0
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            # stft_mat: N x T x F
            # NOTE(review): the third and fourth positional arguments look
            # like the pair's angle on an array of args.n microphones and
            # the array size args.d - confirm against gcc_phat_diag.
            srp = [
                gcc_phat_diag(stft_mat[i],
                              stft_mat[j],
                              min(i, j) * np.pi * 2 / args.n,
                              args.d,
                              num_bins=num_bins,
                              sr=args.sr,
                              num_doa=args.num_doa) for (i, j) in srp_pair
            ]
            srp = sum(srp) / len(srp_pair)
            nan = int(np.sum(np.isnan(srp)))
            if nan:
                # the original "({:d}} items)" had a stray "}" that made
                # str.format itself raise ValueError
                raise RuntimeError("Matrix {} has nan ({:d} items)".format(
                    key, nan))
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processd {:d} utterances done".format(len(reader)))
def run(args):
    """Extract (log) mel filter bank features and write them to an archive.

    Magnitude spectrograms are read from ``args.wav_scp``, projected onto
    the mel filter bank built by ``audio_lib.filters.mel`` and, when
    ``args.log`` is set, floored at ``EPSILON`` and log-compressed. For
    3-dimensional spectrograms only index 0 of the first axis is used.
    The writer class is picked by ``args.format`` (``"kaldi"`` or
    ``"exraw"``).

    Args:
        args: parsed command line options.

    Raises:
        RuntimeError: if ``args.max_freq`` exceeds half of
            ``args.samp_freq``.
    """
    mel_kwargs = dict(n_mels=args.num_bins,
                      fmin=args.min_freq,
                      fmax=args.max_freq,
                      htk=True)
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        apply_log=False,
        apply_pow=False,
        normalize=args.norm,
        apply_abs=True,
        transpose=False,  # F x T
    )

    nyquist = args.samp_freq // 2
    if args.max_freq > nyquist:
        raise RuntimeError("Max frequency for mel exceeds sample frequency")

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    if args.round_power_of_two:
        n_fft = nfft(args.frame_len)
    else:
        n_fft = args.frame_len
    # N x F
    mel_weights = audio_lib.filters.mel(args.samp_freq, n_fft, **mel_kwargs)

    writer_classes = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}
    WriterImpl = writer_classes[args.format]

    with WriterImpl(args.dup_ark, args.scp) as writer:
        for key, spectrum in spectrogram_reader:
            if spectrum.ndim == 3:
                # multi-channel input: keep index 0 of the leading axis
                spectrum = spectrum[0]
            # N x F * F x T = N * T => T x N
            fbank = np.transpose(np.dot(mel_weights, spectrum))
            if args.log:
                fbank = np.log(np.maximum(fbank, EPSILON))
            writer.write(key, fbank)
    logger.info("Process {:d} utterances".format(len(spectrogram_reader)))
def compute_spatial_feats(args, S):
    """Return the spatial feature chosen by ``args.type`` for STFT ``S``.

    * ``"srp"``: ``srp_phat`` with the topology parsed from the
      comma-separated ``args.linear_topo``.
    * ``"ipd"``: ``ipd`` between the two channels named by the
      comma-separated ``args.ipd_index``.
    * anything else: ``msc`` with ``context=args.context``.

    Raises:
        ValueError: in ``"ipd"`` mode, if ``args.ipd_index`` does not hold
            exactly two integers or ``S`` has fewer than 3 dimensions.
    """
    feat_type = args.type

    if feat_type == "srp":
        topo = [float(pos) for pos in args.linear_topo.split(",")]
        return srp_phat(S,
                        topo,
                        sample_frequency=args.samp_frequency,
                        num_doa=args.num_doa,
                        num_bins=nfft(args.frame_length) // 2 + 1,
                        samp_doa=not args.samp_tdoa)

    if feat_type == "ipd":
        pair = [int(idx) for idx in args.ipd_index.split(",")]
        if len(pair) != 2:
            raise ValueError(
                "Invalid --ipd.index configuration detected: {}".format(
                    args.ipd_index))
        if S.ndim < 3:
            raise ValueError("Only one-channel STFT available")
        left, right = pair
        return ipd(S[left], S[right], sin=args.ipd_sin)

    return msc(S, context=args.context)
def run(args):
    """Compute directional features from TF-masks and write them out.

    For each utterance in ``args.wav_scp`` that has a mask in
    ``args.mask_scp``, an ``MvdrBeamformer`` estimates the speech covariance
    matrix and steering vector. These are passed to ``directional_feats``
    and the result is written through ``ArchiveWriter``. Utterances without
    a mask are reported with a warning and skipped.

    Args:
        args: parsed command line options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False,  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # The reader is configured with round_power_of_two, so the FFT size is
    # only rounded when that option is set; derive the bin count the same
    # way instead of always rounding.
    if args.round_power_of_two:
        n_fft = nfft(args.frame_len)
    else:
        n_fft = args.frame_len
    num_bins = n_fft // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is deprecated (removed in Python 3.13)
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """Compute directional features from TF-masks and write them out.

    For each utterance in ``args.wav_scp`` that has a mask in
    ``args.mask_scp`` (read with ``NumpyReader`` when ``args.numpy`` is set,
    ``ScriptReader`` otherwise), an ``MvdrBeamformer`` estimates the speech
    covariance matrix and steering vector. These are passed to
    ``directional_feats`` and the result is written through
    ``ArchiveWriter``. Utterances without a mask are reported with a
    warning and skipped.

    Args:
        args: parsed command line options.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False,  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is deprecated (removed in Python 3.13)
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            # keep mask values at or below 1
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """Run an offline or online mask-based beamformer over a wav list.

    The beamformer named by ``args.beamformer`` is used offline when
    ``args.chunk_size <= 0`` and online otherwise (only ``"mvdr"`` and
    ``"gevd"`` have online variants). For every utterance that has a mask
    in ``args.mask_scp`` the enhanced STFT is optionally multiplied by the
    mask (``args.mask``), inverted with ``istft`` and written through
    ``WaveWriter`` into ``args.dst_dir``.

    Args:
        args: parsed command line options.

    Raises:
        RuntimeError: if ``0 < args.chunk_size < 32``.
        KeyError: if ``args.beamformer`` is not available for the selected
            (offline/online) mode.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False,  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # The reader is configured with round_power_of_two, so the FFT size is
    # only rounded when that option is set; derive the bin count the same
    # way instead of always rounding.
    if args.round_power_of_two:
        n_fft = nfft(args.frame_len)
    else:
        n_fft = args.frame_len
    num_bins = n_fft // 2 + 1

    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins),
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }

    online = False
    if args.chunk_size <= 0:
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in mask_reader:
                continue
            num_done += 1
            power = spectrogram_reader.power(key)
            logger.info(
                "Processing utterance {}, signal power {:.2f}...".format(
                    key, 10 * np.log10(power + 1e-5)))
            # prefer T x F
            speech_mask = mask_reader[key]
            # constraint [0, 1]
            speech_mask = np.minimum(speech_mask, 1)
            # make sure speech_mask at shape T x F
            _, F, _ = stft_mat.shape
            # if in F x T
            if speech_mask.shape[0] == F:
                speech_mask = np.transpose(speech_mask)
            # stft_enh, stft_mat: (N) x F x T
            if online:
                stft_enh = do_online_beamform(beamformer, speech_mask,
                                              stft_mat, args)
            else:
                stft_enh = beamformer.run(speech_mask,
                                          stft_mat,
                                          normalize=args.ban)
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = istft(stft_enh, power=power, **stft_kwargs)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Run an offline or online mask-based beamformer with optional
    interfering masks and VAD-based mask filtering.

    Target masks come from ``args.tgt_mask``; when ``args.itf_mask`` is set,
    interfering masks are read from it and passed to the beamformer as the
    noise mask. With ``0.5 < args.vad_proportion < 1`` the entries selected
    by ``compute_vad_masks`` are overwritten with ``1.0e-4`` in both masks.
    Utterances that trigger ``np.linalg.LinAlgError`` in the beamformer are
    logged and skipped. The enhanced STFT is optionally multiplied by the
    target mask (``args.mask``), inverted with ``istft`` and written through
    ``WaveWriter`` into ``args.dst_dir``.

    Args:
        args: parsed command line options.

    Raises:
        RuntimeError: if ``0 < args.chunk_size < 32``.
        KeyError: if ``args.beamformer`` is not available for the selected
            (offline/online) mode.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False,  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # The interfering masks must be read from args.itf_mask; the original
    # re-opened args.tgt_mask here, so the "interfering" mask was just a
    # copy of the target mask.
    itf_mask_reader = None
    if args.itf_mask:
        itf_mask_reader = MaskReader[args.fmt](args.itf_mask)
        logger.info(f"Using interfering masks from {args.itf_mask}")

    # The reader is configured with round_power_of_two, so the FFT size is
    # only rounded when that option is set; derive the bin count the same
    # way instead of always rounding.
    if args.round_power_of_two:
        n_fft = nfft(args.frame_len)
    else:
        n_fft = args.frame_len
    num_bins = n_fft // 2 + 1

    # a negative --pmwf-ref means "no fixed reference channel"
    ref_channel = args.pmwf_ref if args.pmwf_ref >= 0 else None
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf-0": PmwfBeamformer(num_bins, beta=0, ref_channel=ref_channel),
        "pmwf-1": PmwfBeamformer(num_bins, beta=1, ref_channel=ref_channel),
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }

    online = False
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    use_vad = 0.5 < args.vad_proportion < 1

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in tgt_mask_reader:
                continue
            power = spectrogram_reader.power(key)
            logger.info(f"Processing utterance {key}, " +
                        f"signal power {10 * np.log10(power + 1e-5):.2f}...")
            # prefer T x F
            speech_mask = tgt_mask_reader[key]
            if itf_mask_reader is None:
                # constraint [0, 1]; as in the original, the clipping is
                # only applied when no interfering masks are used
                speech_mask = np.minimum(speech_mask, 1)
                interf_mask = None
            else:
                interf_mask = itf_mask_reader[key]
            # make sure speech_mask at shape T x F
            _, F, _ = stft_mat.shape
            # if in F x T
            if speech_mask.shape[0] == F:
                speech_mask = np.transpose(speech_mask)
                if interf_mask is not None:
                    interf_mask = np.transpose(interf_mask)
            if use_vad:
                vad_mask, num_filtered = compute_vad_masks(
                    stft_mat[0], args.vad_proportion)
                logger.info(f"Filtering {num_filtered} TF-masks...")
                speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                if interf_mask is not None:
                    interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
            # stft_enh, stft_mat: (N) x F x T
            try:
                if online:
                    stft_enh = do_online_beamform(beamformer, speech_mask,
                                                  interf_mask, stft_mat, args)
                else:
                    stft_enh = beamformer.run(speech_mask,
                                              stft_mat,
                                              noise_mask=interf_mask,
                                              normalize=args.ban)
            except np.linalg.LinAlgError:
                logger.error(f"Raise linalg error: {key}")
                continue
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = istft(stft_enh, power=power, **stft_kwargs)
            writer.write(key, samps)
            num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")