def compute_spatial_feats(args, S):
    """
    Compute spatial features from an STFT matrix, dispatching on args.type:
        "srp": angular spectrum from srp_phat_linear
        "ipd": phase differences from ipd() for every channel pair listed
               in args.ipd_pair (format "L,R;L,R;..."), joined by np.hstack
        other: result of msc() with context args.msc_ctx
    Arguments:
        args: parsed command line namespace
        S: STFT matrix, the "ipd" branch needs S.ndim >= 3 and indexes
           channels along axis 0
    Return:
        feature matrix produced by the selected branch
    Raises:
        ValueError: input has fewer than 3 dims, or a pair does not
                    contain exactly two indexes ("ipd" only)
        RuntimeError: a channel index of a pair is out of range ("ipd" only)
    """
    if args.type == "srp":
        num_ffts = nextpow2(
            args.frame_len) if args.round_power_of_two else args.frame_len
        srp_kwargs = {
            "sample_frequency": args.samp_frequency,
            "num_doa": args.num_doa,
            "num_bins": num_ffts // 2 + 1,
            "samp_doa": not args.samp_tdoa
        }
        return srp_phat_linear(S, args.linear_topo, **srp_kwargs)
    elif args.type == "ipd":
        if S.ndim < 3:
            raise ValueError("Only one-channel STFT available")
        num_channels = S.shape[0]
        ipd_list = []
        for p in args.ipd_pair.split(";"):
            indexes = list(map(int, p.split(",")))
            if len(indexes) != 2:
                raise ValueError("Invalid --ipd.pair configuration " +
                                 f"detected: {args.ipd_pair}")
            L, R = indexes
            # validate both indexes (valid range follows numpy indexing,
            # so negative indexes counting from the end remain accepted)
            for channel in (L, R):
                if not -num_channels <= channel < num_channels:
                    raise RuntimeError(
                        f"Could not access channel {channel}")
            ipd_mat = ipd(S[L], S[R], cos=args.ipd_cos, sin=args.ipd_sin)
            ipd_list.append(ipd_mat)
        # concat along frequency axis
        return np.hstack(ipd_list)
    else:
        return msc(S, context=args.msc_ctx)
def run(args):
    """
    Compute GCC-PHAT based features for each utterance in args.wav_scp,
    averaged over the channel pairs given in args.diag_pair (format
    "i,j;i,j;..."), and write them to args.srp_ark / args.scp.
    Arguments:
        args: parsed command line namespace
    Raises:
        RuntimeError: no channel pair configured, or the averaged feature
                      matrix contains NaN values
    """
    srp_pair = [
        tuple(map(int, p.split(","))) for p in args.diag_pair.split(";")
    ]
    if not srp_pair:
        # report the option that was actually parsed above
        raise RuntimeError("Bad configurations with --pair {}".format(
            args.diag_pair))
    logger.info("Compute gcc with {}".format(srp_pair))

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    num_done = 0
    num_ffts = nextpow2(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            # N x T x F
            # NOTE(review): args.n / args.d look like array geometry
            # parameters (3rd arg is an angle, min(i, j) * 2pi / n) -
            # confirm against gcc_phat_diag
            srp = [
                gcc_phat_diag(stft_mat[i],
                              stft_mat[j],
                              min(i, j) * np.pi * 2 / args.n,
                              args.d,
                              num_bins=num_bins,
                              sr=args.sr,
                              num_doa=args.num_doa) for (i, j) in srp_pair
            ]
            # average over all pairs
            srp = sum(srp) / len(srp_pair)
            nan = int(np.sum(np.isnan(srp)))
            if nan:
                raise RuntimeError("Matrix {} has nan ({:d} items)".format(
                    key, nan))
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processd {:d} utterances done".format(len(reader)))
def run(args):
    """
    Compute directional features for the utterances in args.wav_scp
    using TF-masks from args.mask_scp and the channel pairs in
    args.df_pair (format "i,j;i,j;..."). For each utterance, a covariance
    matrix and steer vector are estimated with MvdrBeamformer and passed
    to directional_feats. Results go to args.dup_ark / args.scp.
    Utterances without a mask are skipped with a warning.
    Arguments:
        args: parsed command line namespace
    Raises:
        RuntimeError: no channel pair configured
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # FFT size must follow the same rounding option as the STFT reader
    num_ffts = nextpow2(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        raise RuntimeError(f"Bad configurations with --pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            # clip mask values to at most 1
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T, df_pair=df_pair)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
def run(args):
    """
    Compute (log) mel filter-bank features for the utterances in
    args.wav_scp and write them with the writer selected by args.format
    ("kaldi" or "exraw") to args.dup_ark / args.scp. For multi-channel
    input (3-dim spectrogram) only the first channel is used.
    Arguments:
        args: parsed command line namespace
    Raises:
        RuntimeError: args.max_freq is larger than args.sr // 2
    """
    mel_kwargs = {
        "n_mels": args.num_bins,
        "fmin": args.min_freq,
        "fmax": args.max_freq,
        "htk": True
    }
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "apply_log": False,
        "apply_pow": False,
        "normalize": args.norm,
        "apply_abs": True,
        "transpose": False  # F x T
    }
    if args.max_freq > args.sr // 2:
        raise RuntimeError("Max frequency for mel exceeds sample frequency")
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    num_ffts = nextpow2(
        args.frame_len) if args.round_power_of_two else args.frame_len
    # N x F
    # NOTE(review): assumes filters is librosa.filters - sr/n_fft are
    # passed by keyword since they are keyword-only in librosa >= 0.10
    # (keywords are accepted by older versions as well)
    mel_weights = filters.mel(sr=args.sr, n_fft=num_ffts, **mel_kwargs)

    WriterImpl = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}[args.format]
    with WriterImpl(args.dup_ark, args.scp) as writer:
        for key, spectrum in spectrogram_reader:
            # pick first channel if multi-channel
            magnitude = spectrum[0] if spectrum.ndim == 3 else spectrum
            # N x F * F x T = N * T => T x N
            fbank = np.transpose(np.dot(mel_weights, magnitude))
            if args.log:
                # floor before log to avoid log(0)
                fbank = np.log(np.maximum(fbank, EPSILON))
            writer.write(key, fbank)
    logger.info("Process {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """
    Compute directional features for the utterances in args.wav_scp
    using TF-masks from args.mask_scp. For each utterance, a covariance
    matrix and steer vector are estimated with MvdrBeamformer and passed
    to directional_feats. Results go to args.dup_ark / args.scp.
    Utterances without a mask are skipped with a warning.
    Arguments:
        args: parsed command line namespace
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # FFT size must follow the same rounding option as the STFT reader
    num_ffts = nextpow2(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                logger.warning("Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            # clip mask values to at most 1
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """
    Run mask-based beamforming on the utterances in args.wav_scp and
    write the enhanced waveforms to args.dst_dir.

    Target masks are read from args.tgt_mask and, if given, interfering
    masks from args.itf_mask. The beamformer is chosen by args.beamformer;
    args.chunk_size <= 0 selects the offline variant, otherwise the online
    one. Utterances without a target mask are skipped, and utterances that
    raise np.linalg.LinAlgError are logged and skipped.
    Arguments:
        args: parsed command line namespace
    Raises:
        RuntimeError: 0 < args.chunk_size < 32
        KeyError: args.beamformer is not available for the selected mode
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    # round_power_of_two is kept out of stft_kwargs as the dict is
    # forwarded to inverse_stft below as well
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # interfering masks come from args.itf_mask (not from the target masks)
    itf_mask_reader = MaskReader[args.fmt](
        args.itf_mask) if args.itf_mask else None
    if itf_mask_reader is not None:
        logger.info(f"Using interfering masks from {args.itf_mask}")

    online = False
    # FFT size must follow the same rounding option as the STFT reader
    num_ffts = nextpow2(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "mpdr": MpdrBeamformer(num_bins),
        "mpdr-whiten": MpdrBeamformer(num_bins, whiten=True),
        "gevd": GevdBeamformer(num_bins),
        "pmwf-0": PmwfBeamformer(num_bins,
                                 beta=0,
                                 ref_channel=args.pmwf_ref,
                                 rank1_appro=args.rank1_appro),
        "pmwf-1": PmwfBeamformer(num_bins,
                                 beta=1,
                                 ref_channel=args.pmwf_ref,
                                 rank1_appro=args.rank1_appro)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    num_done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in tgt_mask_reader:
                continue
            power = spectrogram_reader.power(key)
            norm = spectrogram_reader.maxabs(key)
            logger.info(
                f"Processing utterance {key}, " +
                f"signal power {10 * np.log10(power + 1e-5):.2f}...")
            # prefer T x F
            speech_mask = tgt_mask_reader[key]
            # constraint [0, 1]
            if itf_mask_reader is None:
                speech_mask = np.minimum(speech_mask, 1)
                interf_mask = None
            else:
                interf_mask = itf_mask_reader[key]
            # make sure speech_mask at shape T x F
            _, F, _ = stft_mat.shape
            # if in F x T
            if speech_mask.shape[0] == F and speech_mask.shape[1] != F:
                speech_mask = np.transpose(speech_mask)
                if interf_mask is not None:
                    interf_mask = np.transpose(interf_mask)
            if 0.5 < args.vad_proportion < 1:
                # overwrite the mask entries selected by vad_mask with a
                # small constant
                vad_mask, N = compute_vad_masks(stft_mat[0],
                                                args.vad_proportion)
                logger.info(f"Filtering {N} TF-masks...")
                speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                if interf_mask is not None:
                    interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
            # stft_enh, stft_mat: (N) x F x T
            try:
                if not online:
                    stft_enh = beamformer.run(speech_mask,
                                              stft_mat,
                                              mask_n=interf_mask,
                                              ban=args.ban)
                else:
                    stft_enh = do_online_beamform(beamformer, speech_mask,
                                                  interf_mask, stft_mat, args)
            except np.linalg.LinAlgError:
                logger.error(f"Raise linalg error: {key}")
                continue
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = inverse_stft(stft_enh, norm=norm, **stft_kwargs)
            writer.write(key, samps)
            num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")