Example #1
0
def run(args):
    """Apply a delay-and-sum beamformer to every utterance in ``args.wav_scp``.

    Attributes read from ``args``:
        frame_len, frame_hop, window, center: forwarded both to the
            spectrogram reader and to ``istft``.
        round_power_of_two: forwarded to the spectrogram reader.
        linear_topo: comma-separated floats handed to ``DSBeamformer``
            (presumably microphone positions of a linear array -- confirm
            against ``DSBeamformer``).
        doa: direction of arrival. Values >= 0 are used as given, negative
            values are shifted by +180; the result must lie in [0, 180].
        speed, fs: passed to ``beamformer.run`` as ``c`` and ``sample_rate``;
            ``fs`` is also given to the wave writer.
        wav_scp, dst_dir: input script and output directory.

    Raises:
        RuntimeError: if the resolved DoA is outside [0, 180].
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    topo = [float(pos) for pos in args.linear_topo.split(",")]
    # Only strictly negative angles are wrapped. A DoA of exactly 0 is already
    # inside the legal range and must not be flipped to 180.
    doa = args.doa if args.doa >= 0 else 180 + args.doa
    if doa < 0 or doa > 180:
        raise RuntimeError("Illegal value for DoA: {:.2f}".format(args.doa))

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    beamformer = DSBeamformer(topo)
    logger.info("Initialize {:d} channel DSBeamformer".format(len(topo)))

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in spectrogram_reader:
            stft_enh = beamformer.run(
                doa, stft_src, c=args.speed, sample_rate=args.fs)
            # The power of the source utterance is handed to istft together
            # with the STFT configuration used for analysis.
            power = spectrogram_reader.power(key)
            samps = istft(stft_enh, **stft_kwargs, power=power)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Run mask-based adaptive beamforming on the utterances in ``args.wav_scp``.

    For every utterance that has a target mask, the selected beamformer is
    applied to the multi-channel STFT and the result is written as a waveform
    to ``args.dst_dir``. Utterances without a target mask, or whose
    beamforming raises ``np.linalg.LinAlgError``, are skipped.

    Attributes read from ``args``:
        frame_len, frame_hop, window, center, round_power_of_two: STFT setup.
        fmt: mask storage format, ``"numpy"`` or ``"kaldi"``.
        tgt_mask: location of the target (speech) masks.
        itf_mask: optional location of the interfering masks.
        beamformer: key into the supported (online) beamformer tables.
        chunk_size: <= 0 selects offline processing; otherwise online
            processing, which requires a value of at least 32.
        channels, alpha: constructor arguments of the online beamformers.
        pmwf_ref, rank1_appro: constructor arguments of the PMWF beamformers.
        vad_proportion: VAD-based mask filtering is active when the value
            lies strictly between 0.5 and 1.
        ban: forwarded to the offline ``beamformer.run``.
        mask: if truthy, the beamformer output is multiplied by the speech
            mask.
        sr, dst_dir: forwarded to the wave writer.

    Raises:
        RuntimeError: if online mode is requested with ``chunk_size < 32``.
        KeyError: if ``args.fmt`` or ``args.beamformer`` is not supported.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # The interfering masks are read from their own location (args.itf_mask),
    # not from the target-mask location.
    itf_mask_reader = MaskReader[args.fmt](
        args.itf_mask) if args.itf_mask else None
    if itf_mask_reader is not None:
        logger.info(f"Using interfering masks from {args.itf_mask}")
    online = False
    num_bins = nextpow2(args.frame_len) // 2 + 1
    supported_beamformer = {
        "mvdr":
        MvdrBeamformer(num_bins),
        "mpdr":
        MpdrBeamformer(num_bins),
        "mpdr-whiten":
        MpdrBeamformer(num_bins, whiten=True),
        "gevd":
        GevdBeamformer(num_bins),
        "pmwf-0":
        PmwfBeamformer(num_bins,
                       beta=0,
                       ref_channel=args.pmwf_ref,
                       rank1_appro=args.rank1_appro),
        "pmwf-1":
        PmwfBeamformer(num_bins,
                       beta=1,
                       ref_channel=args.pmwf_ref,
                       rank1_appro=args.rank1_appro)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    num_done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_mat in spectrogram_reader:
            if key in tgt_mask_reader:
                power = spectrogram_reader.power(key)
                norm = spectrogram_reader.maxabs(key)
                logger.info(
                    f"Processing utterance {key}, " +
                    f"signal power {10 * np.log10(power + 1e-5):.2f}...")
                # prefer T x F
                speech_mask = tgt_mask_reader[key]
                # Without interfering masks the speech mask is capped at 1;
                # with them, both masks are used as loaded.
                if itf_mask_reader is None:
                    speech_mask = np.minimum(speech_mask, 1)
                    interf_mask = None
                else:
                    interf_mask = itf_mask_reader[key]
                # make sure speech_mask at shape T x F
                _, F, _ = stft_mat.shape
                # Transpose only when the mask is unambiguously F x T; a
                # square mask is assumed to be in the preferred layout.
                if speech_mask.shape[0] == F and speech_mask.shape[1] != F:
                    speech_mask = np.transpose(speech_mask)
                    if interf_mask is not None:
                        interf_mask = np.transpose(interf_mask)
                if 0.5 < args.vad_proportion < 1:
                    vad_mask, N = compute_vad_masks(stft_mat[0],
                                                    args.vad_proportion)
                    logger.info(f"Filtering {N} TF-masks...")
                    # Entries selected by vad_mask are replaced with 1e-4.
                    speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                    if interf_mask is not None:
                        interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
                # stft_enh, stft_mat: (N) x F x T
                try:
                    if not online:
                        stft_enh = beamformer.run(speech_mask,
                                                  stft_mat,
                                                  mask_n=interf_mask,
                                                  ban=args.ban)
                    else:
                        stft_enh = do_online_beamform(beamformer, speech_mask,
                                                      interf_mask, stft_mat,
                                                      args)
                except np.linalg.LinAlgError:
                    # Skip this utterance but keep processing the others.
                    logger.error(f"Raise linalg error: {key}")
                    continue
                # masking beamformer output if necessary
                if args.mask:
                    stft_enh = stft_enh * np.transpose(speech_mask)
                samps = inverse_stft(stft_enh, norm=norm, **stft_kwargs)
                writer.write(key, samps)
                num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")
def run(args):
    """Run mask-based adaptive beamforming on the utterances in ``args.wav_scp``.

    For every utterance that has a mask in ``args.mask_scp``, the selected
    beamformer is applied to the multi-channel STFT and the result is written
    as a waveform to ``args.dst_dir``. Utterances without a mask are skipped.

    Attributes read from ``args``:
        frame_len, frame_hop, window, center, round_power_of_two: STFT setup.
        fmt: mask storage format, ``"numpy"`` or ``"kaldi"``.
        mask_scp: location of the speech masks.
        beamformer: key into the supported (online) beamformer tables.
        chunk_size: <= 0 selects offline processing; otherwise online
            processing, which requires a value of at least 32.
        channels, alpha: constructor arguments of the online beamformers.
        ban: forwarded to the offline ``beamformer.run`` as ``normalize``.
        mask: if truthy, the beamformer output is multiplied by the speech
            mask.
        samp_freq, dst_dir: forwarded to the wave writer.

    Raises:
        RuntimeError: if online mode is requested with ``chunk_size < 32``.
        KeyError: if ``args.fmt`` or ``args.beamformer`` is not supported.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    online = False
    num_bins = nfft(args.frame_len) // 2 + 1

    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key in mask_reader:
                power = spectrogram_reader.power(key)
                logger.info(
                    "Processing utterance {}, signal power {:.2f}...".format(
                        key, 10 * np.log10(power + 1e-5)))
                # prefer T x F
                speech_mask = mask_reader[key]
                # Cap mask values at 1.
                speech_mask = np.minimum(speech_mask, 1)
                # make sure speech_mask at shape T x F
                _, F, _ = stft_mat.shape
                # Transpose only when the mask is unambiguously F x T. A
                # square mask (T == F) is assumed to already be in the
                # preferred T x F layout and is left untouched.
                if speech_mask.shape[0] == F and speech_mask.shape[1] != F:
                    speech_mask = np.transpose(speech_mask)
                # stft_enh, stft_mat: (N) x F x T
                if not online:
                    stft_enh = beamformer.run(speech_mask,
                                              stft_mat,
                                              normalize=args.ban)
                else:
                    stft_enh = do_online_beamform(beamformer, speech_mask,
                                                  stft_mat, args)
                # masking beamformer output if necessary
                if args.mask:
                    stft_enh = stft_enh * np.transpose(speech_mask)
                samps = istft(stft_enh, power=power, **stft_kwargs)
                writer.write(key, samps)
                # Count an utterance only once its waveform has been written.
                num_done += 1
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_done, len(spectrogram_reader)))