def run(args):
    """Compute directional features from pre-computed steer vectors.

    Iterates over the spectrograms produced by ``SpectrogramReader`` for
    ``args.wav_scp``, selects the steer vector(s) for each utterance from the
    array stored at ``args.steer_vector`` and writes the output of
    ``directional_feats`` through ``ArchiveWriter(args.dup_ark, args.scp)``.

    The steer vector index is taken from one of two sources:

    * ``args.utt2idx`` (if set): a per-utterance index table. Utterances that
      are missing from the table are skipped with a warning.
    * ``args.doa_idx`` (otherwise): comma separated indices that are applied
      to every utterance. With more than one index the per-index features are
      stacked and flattened to one 2-D matrix per utterance.

    Args:
        args: parsed command line options. Attributes read here: frame_len,
            frame_hop, round_power_of_two, window, center, wav_scp, utt2idx,
            doa_idx, df_pair, steer_vector, dup_ark, scp.

    Raises:
        RuntimeError: if no microphone pair could be parsed from
            ``args.df_pair``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": False,  # F x T
    }
    stft_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    if args.utt2idx:
        utt2idx = ScpReader(args.utt2idx, value_processor=int)
        doa_idx = None
        logger.info(f"Using --utt2idx={args.utt2idx}")
    else:
        utt2idx = None
        logger.info(f"Using --doa-idx={args.doa_idx}")
        # The indices are identical for every utterance, so parse them once
        # here instead of on each loop iteration.
        doa_idx = [int(v) for v in args.doa_idx.split(",")]

    # "a,b;c,d" => [(a, b), (c, d)]
    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    # str.split with an explicit separator never returns an empty list, so
    # this guard is purely defensive; malformed input normally fails above
    # with a ValueError from int().
    if not df_pair:
        raise RuntimeError(
            f"Bad configurations with --df-pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    # A x M x F
    steer_vector = np.load(args.steer_vector)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, stft in stft_reader:
            # stft: M x F x T, each selected steer vector: M x F
            if utt2idx is None:
                dfs = [
                    directional_feats(stft, steer_vector[i], df_pair=df_pair)
                    for i in doa_idx
                ]
                if len(dfs) == 1:
                    df = dfs[0]
                else:
                    # N x T x F => T x N x F => T x NF
                    dfs = np.stack(dfs)
                    df = dfs.transpose(1, 0, 2).reshape(dfs.shape[1], -1)
            elif key in utt2idx:
                df = directional_feats(stft,
                                       steer_vector[utt2idx[key]],
                                       df_pair=df_pair)
            else:
                logger.warning(f"Missing utt2idx for utterance {key}")
                continue
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(stft_reader):d}")
def run(args):
    """Compute directional features with mask-estimated steer vectors.

    For every utterance yielded by ``SpectrogramReader`` that also has a
    TF-mask in ``args.mask_scp``, the mask and the spectrogram are passed to
    ``MvdrBeamformer.compute_covar_mat``; the resulting covariance goes to
    ``compute_steer_vector`` and the steer vector (transposed) is handed to
    ``directional_feats``. Results are written through
    ``ArchiveWriter(args.dup_ark, args.scp)``. Utterances without a mask are
    skipped with a warning.

    Args:
        args: parsed command line options. Attributes read here: frame_len,
            frame_hop, round_power_of_two, window, center, wav_scp, fmt
            ("numpy" or "kaldi"), mask_scp, df_pair, dup_ark, scp.

    Raises:
        KeyError: if ``args.fmt`` is neither "numpy" nor "kaldi".
        RuntimeError: if no microphone pair could be parsed from
            ``args.df_pair``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": False,  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    # Pick the mask reader class that matches the on-disk format.
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # NOTE(review): num_bins is derived from nextpow2(frame_len) regardless of
    # args.round_power_of_two -- confirm this matches the STFT size when
    # rounding is disabled.
    num_bins = nextpow2(args.frame_len) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    # "a,b;c,d" => [(a, b), (c, d)]
    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    # str.split with an explicit separator never returns an empty list, so
    # this guard is purely defensive; malformed input normally fails above
    # with a ValueError from int().
    if not df_pair:
        raise RuntimeError(
            f"Bad configurations with --df-pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            # NOTE(review): the orientation test is ambiguous when T == F; a
            # square mask is always transposed -- confirm that is acceptable.
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            # Cap mask values at 1.
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T, df_pair=df_pair)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
def run(args):
    """Compute directional features with mask-estimated steer vectors.

    For every utterance yielded by ``SpectrogramReader`` that also has a
    TF-mask in ``args.mask_scp``, the mask and the spectrogram are passed to
    ``MvdrBeamformer.compute_covar_mat``; the resulting covariance goes to
    ``compute_steer_vector`` and the steer vector (transposed) is handed to
    ``directional_feats``. Results are written through
    ``ArchiveWriter(args.dup_ark, args.scp)``. Utterances without a mask are
    skipped with a warning.

    Args:
        args: parsed command line options. Attributes read here: frame_len,
            frame_hop, round_power_of_two, window, center, wav_scp, fmt
            ("numpy" or "kaldi"), mask_scp, dup_ark, scp.

    Raises:
        KeyError: if ``args.fmt`` is neither "numpy" nor "kaldi".
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": False,  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    # Pick the mask reader class that matches the on-disk format.
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    num_bins = nfft(args.frame_len) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            # NOTE(review): the orientation test is ambiguous when T == F; a
            # square mask is always transposed -- confirm that is acceptable.
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            # Cap mask values at 1.
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
def run(args):
    """Compute directional features with mask-estimated steer vectors.

    For every utterance yielded by ``SpectrogramReader`` that also has a
    TF-mask in ``args.mask_scp``, the mask and the spectrogram are passed to
    ``MvdrBeamformer.compute_covar_mat``; the resulting covariance goes to
    ``compute_steer_vector`` and the steer vector (transposed) is handed to
    ``directional_feats``. Results are written through
    ``ArchiveWriter(args.dup_ark, args.scp)``. Utterances without a mask are
    skipped with a warning.

    Args:
        args: parsed command line options. Attributes read here:
            frame_length, frame_shift, window, center, wav_scp, numpy
            (truthy selects ``NumpyReader``, otherwise ``ScriptReader``),
            mask_scp, trans (truthy transposes each mask before use),
            dup_ark, scp.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": False,  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    # Pick the mask reader class that matches the on-disk format.
    reader_cls = NumpyReader if args.numpy else ScriptReader
    mask_reader = reader_cls(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            # Cap mask values at 1.
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")