Esempio n. 1
0
def run(args):
    """Compute directional features from precomputed steering vectors.

    For every utterance yielded by the spectrogram reader, pick one or more
    steering vectors from the array loaded from ``args.steer_vector`` and
    write the output of ``directional_feats`` to the archive.

    The steering vector index comes from one of two places:

    * ``args.utt2idx``: a per-utterance table; utterances missing from it
      are skipped with a warning.
    * ``args.doa_idx``: a comma-separated list of indices shared by all
      utterances; with several indices the per-index features are
      interleaved into a single 2-D matrix.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``frame_len``, ``frame_hop``, ``round_power_of_two``, ``window``,
            ``center``, ``wav_scp``, ``utt2idx``, ``doa_idx``, ``df_pair``,
            ``steer_vector``, ``dup_ark`` and ``scp``.

    Raises:
        RuntimeError: If ``args.df_pair`` yields no channel pairs.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    stft_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.utt2idx:
        utt2idx = ScpReader(args.utt2idx, value_processor=int)
        doa_idx = None
        logger.info(f"Using --utt2idx={args.utt2idx}")
    else:
        utt2idx = None
        logger.info(f"Using --doa-idx={args.doa_idx}")
        # The index list is the same for every utterance, so parse it once
        # instead of on each loop iteration.
        doa_idx = [int(v) for v in args.doa_idx.split(",")]

    # "a,b;c,d" -> [(a, b), (c, d)]
    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # Report the option that was actually parsed (args.df_pair); the
        # previous message read a non-existent ``args.pair`` attribute.
        raise RuntimeError(
            f"Bad configurations with --df-pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    # A x M x F
    steer_vector = np.load(args.steer_vector)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, stft in stft_reader:
            # sv: M x F
            if utt2idx is None:
                dfs = [
                    directional_feats(stft, steer_vector[i], df_pair=df_pair)
                    for i in doa_idx
                ]
                if len(dfs) == 1:
                    df = dfs[0]
                else:
                    # N x T x F
                    dfs = np.stack(dfs)
                    # -> T x N x F, then flatten the last two axes
                    df = dfs.transpose(1, 0, 2).reshape(dfs.shape[1], -1)
            elif key in utt2idx:
                # stft: M x F x T
                df = directional_feats(stft,
                                       steer_vector[utt2idx[key]],
                                       df_pair=df_pair)
            else:
                logger.warning(f"Missing utt2idx for utterance {key}")
                continue
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(stft_reader):d}")
Esempio n. 2
0
def run(args):
    """Compute directional features using mask-estimated steering vectors.

    For every utterance that has a TF-mask, the mask is used to build a
    speech covariance matrix with ``MvdrBeamformer``, a steering vector is
    derived from it, and the output of ``directional_feats`` is written to
    the archive. Utterances without a mask are skipped with a warning.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``frame_len``, ``frame_hop``, ``round_power_of_two``, ``window``,
            ``center``, ``wav_scp``, ``fmt`` (``"numpy"`` or ``"kaldi"``),
            ``mask_scp``, ``df_pair``, ``dup_ark`` and ``scp``.

    Raises:
        KeyError: If ``args.fmt`` is neither ``"numpy"`` nor ``"kaldi"``.
        RuntimeError: If ``args.df_pair`` yields no channel pairs.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # NOTE(review): this always uses the next power of two of frame_len,
    # regardless of args.round_power_of_two -- confirm it matches the STFT
    # size produced by the reader when rounding is disabled.
    num_bins = nextpow2(args.frame_len) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    # "a,b;c,d" -> [(a, b), (c, d)]
    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # Report the option that was actually parsed (args.df_pair); the
        # previous message read a non-existent ``args.pair`` attribute.
        raise RuntimeError(
            f"Bad configurations with --df-pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            # cap mask values at 1
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T, df_pair=df_pair)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
Esempio n. 3
0
def run(args):
    """Compute directional features using mask-estimated steering vectors.

    For every utterance that has a TF-mask, the mask is used to build a
    speech covariance matrix with ``MvdrBeamformer``, a steering vector is
    derived from it, and the output of ``directional_feats`` is written to
    the archive. Utterances without a mask are skipped with a warning.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``frame_len``, ``frame_hop``, ``round_power_of_two``, ``window``,
            ``center``, ``wav_scp``, ``fmt`` (``"numpy"`` or ``"kaldi"``),
            ``mask_scp``, ``dup_ark`` and ``scp``.

    Raises:
        KeyError: If ``args.fmt`` is neither ``"numpy"`` nor ``"kaldi"``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    num_bins = nfft(args.frame_len) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias; use Logger.warning.
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            # cap mask values at 1
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
def run(args):
    """Compute directional features using mask-estimated steering vectors.

    For every utterance that has a TF-mask, the mask is used to build a
    speech covariance matrix with ``MvdrBeamformer``, a steering vector is
    derived from it, and the output of ``directional_feats`` is written to
    the archive. Utterances without a mask are skipped with a warning.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``frame_length``, ``frame_shift``, ``window``, ``center``,
            ``wav_scp``, ``mask_scp``, ``numpy`` (read masks with
            ``NumpyReader`` instead of ``ScriptReader``), ``trans``
            (transpose each mask before use), ``dup_ark`` and ``scp``.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias; use Logger.warning.
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            # cap mask values at 1
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")