Exemple #1
0
def compute_spatial_feats(args, S):
    """Compute the spatial feature selected by ``args.type`` from the STFT ``S``.

    Args:
        args: parsed command-line options. ``args.type`` picks the branch:
            ``"srp"``, ``"ipd"``; any other value falls through to MSC.
        S: STFT array. The ``"ipd"`` branch requires ``S.ndim >= 3`` and
            indexes channels along axis 0.

    Returns:
        The output of ``srp_phat_linear``, the ``np.hstack`` of the per-pair
        ``ipd`` outputs, or the output of ``msc``.

    Raises:
        ValueError: in the ``"ipd"`` branch, when ``S`` has fewer than three
            dimensions or a pair in ``args.ipd_index`` does not hold exactly
            two indexes.
        RuntimeError: in the ``"ipd"`` branch, when a channel index does not
            address an existing channel of ``S``.
    """
    if args.type == "srp":
        num_ffts = (nfft(args.frame_len)
                    if args.round_power_of_two else args.frame_len)
        srp_kwargs = {
            "sample_frequency": args.samp_frequency,
            "num_doa": args.num_doa,
            "num_bins": num_ffts // 2 + 1,
            "samp_doa": not args.samp_tdoa
        }
        linear_topo = [float(pos) for pos in args.linear_topo.split(",")]
        return srp_phat_linear(S, linear_topo, **srp_kwargs)

    if args.type == "ipd":
        if S.ndim < 3:
            raise ValueError("Only one-channel STFT available")
        num_channels = S.shape[0]
        ipd_list = []
        # pairs are separated by ";", the two channels of a pair by ","
        for pair in args.ipd_index.split(";"):
            indexes = [int(idx) for idx in pair.split(",")]
            if len(indexes) != 2:
                raise ValueError(
                    "Invalid --ipd.index configuration detected: {}".format(
                        args.ipd_index))
            L, R = indexes
            # Valid channels are 0 .. num_channels - 1 (negative indexes that
            # Python can resolve are still accepted). Both sides of the pair
            # are checked so a bad index never reaches S[...].
            for channel in (L, R):
                if not -num_channels <= channel < num_channels:
                    raise RuntimeError(
                        "Could not access channel {:d}".format(channel))
            ipd_list.append(
                ipd(S[L], S[R], cos=args.ipd_cos, sin=args.ipd_sin))
        # concat along frequency axis
        return np.hstack(ipd_list)

    return msc(S, context=args.msc_ctx)
Exemple #2
0
def run(args):
    """Rebuild waveforms from spectral features with Griffin-Lim.

    Features are read from ``ScriptReader(args.feat_scp)`` and the
    reconstructed samples are written through ``WaveWriter(args.dump_dir)``.
    Per utterance the feature is optionally exponentiated
    (``args.apply_log``), mapped from mel bins back to a linear spectrum with
    the pseudo-inverse of the mel matrix (``args.fbank``) and square-rooted
    (``args.apply_pow``) before ``griffin_lim`` is called.

    Args:
        args: parsed command-line options.

    Raises:
        RuntimeError: when the second dimension of a feature does not match
            the number of bins derived from ``args.frame_len``.
    """
    griffin_lim_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": True,
        "epochs": args.epochs
    }

    feature_reader = ScriptReader(args.feat_scp)

    # The frame size is read from args.frame_len everywhere in this function
    # (it used to be read from args.frame_length for the FFT size only).
    num_ffts = nfft(args.frame_len)

    mel_inv_weights = None
    if args.fbank:
        mel_kwargs = {
            "n_mels": args.num_bins,
            "fmin": args.min_freq,
            "fmax": args.max_freq,
            "htk": True
        }
        # N x F
        mel_weights = audio_lib.filters.mel(args.samp_freq, num_ffts,
                                            **mel_kwargs)
        # F x N
        mel_inv_weights = np.linalg.pinv(mel_weights)

    with WaveWriter(
            args.dump_dir, fs=args.samp_freq,
            normalize=args.normalize) as writer:
        for key, spec in feature_reader:
            # if log, transform to linear
            if args.apply_log:
                spec = np.exp(spec)
            # convert fbank to spectrum
            # feat: T x N
            if args.fbank:
                # floor at EPSILON after the pseudo-inverse projection
                spec = np.maximum(spec @ np.transpose(mel_inv_weights),
                                  EPSILON)
            # if power spectrum, transform to magnitude spectrum
            if args.apply_pow:
                spec = np.sqrt(spec)
            if spec.shape[1] - 1 != num_ffts // 2:
                raise RuntimeError("Seems missing --fbank options?")
            # griffin lim
            samps = griffin_lim(spec, **griffin_lim_kwargs)
            writer.write(key, samps)
    logger.info("Processed {:d} utterance done".format(len(feature_reader)))
Exemple #3
0
def run(args):
    """Estimate masks per utterance with ``CacgmmTrainer`` and save them.

    Utterances whose ``<key>.npy`` already exists under ``args.dst_dir`` are
    skipped. When ``args.init_mask`` is given, a mask found for the utterance
    key is passed to the trainer as ``gamma``. Masks are written as
    ``float32`` through ``NumpyWriter``. An utterance whose training raises
    ``np.linalg.LinAlgError`` is logged as failed and the loop continues.

    Args:
        args: parsed command-line options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    n_fft = nfft(args.frame_len) if args.round_power_of_two else args.frame_len
    # now use pb_bss
    pb_perm_solver = load_module(pb_bss_align_url)
    aligner = pb_perm_solver.DHTVPermutationAlignment.from_stft_size(n_fft)

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # results from a previous run are kept as they are
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            # K x F x T
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external mask to initialize cacgmm")
            # stft: N x F x T
            trainer = CacgmmTrainer(stft,
                                    args.num_classes,
                                    gamma=init_mask,
                                    cgmm_init=args.cgmm_init)
            try:
                # EM progress
                masks = trainer.train(args.num_epoches)
                # align if needed
                if not args.cgmm_init or args.num_classes != 2:
                    masks = aligner(masks)
                    logger.info("Permutation align done for each frequency")
                num_done += 1
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except np.linalg.LinAlgError:
                # Logger.warn is deprecated; Logger.warning is the
                # supported spelling
                logger.warning(f"Training utterance {key} ... Failed")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Beamform every utterance that has a mask and save it as a wav file.

    STFTs come from ``SpectrogramReader(args.wav_scp)`` and masks from a
    ``NumpyReader`` or ``ScriptReader`` on ``args.mask_scp``. Utterances
    without a mask are silently skipped. The enhanced STFT is handed to
    ``istft`` together with the output path ``<args.dst_dir>/<key>.wav``.

    Args:
        args: parsed command-line options.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    reader_cls = NumpyReader if args.numpy else ScriptReader
    mask_reader = reader_cls(args.mask_scp)

    bins = nfft(args.frame_length) // 2 + 1
    # all three beamformers are built up front, then one is picked by name
    candidates = {
        "mvdr": MvdrBeamformer(bins),
        "gevd": GevdBeamformer(bins),
        "pmwf": PmwfBeamformer(bins)
    }
    beamformer = candidates[args.beamformer]

    processed = 0
    for key, stft_mat in spectrogram_reader:
        if key not in mask_reader:
            continue
        processed += 1
        norm = spectrogram_reader.samp_norm(key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            key, norm))
        # prefer T x F
        speech_mask = mask_reader[key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask, stft_mat, normalize=args.postf)
        # optionally apply the (transposed) mask on the beamformer output
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        wav_path = os.path.join(args.dst_dir, '{}.wav'.format(key))
        istft(wav_path,
              stft_enh,
              norm=norm,
              fs=args.samp_freq,
              **stft_kwargs)
    logger.info("Processed {:d} utterances out of {:d}".format(
        processed, len(spectrogram_reader)))
def run(args):
    """Average ``gcc_phat_diag`` over channel pairs and write it per utterance.

    ``args.diag_pair`` lists the pairs as ``"i,j;k,l;..."``. For every
    utterance read from ``SpectrogramReader(args.wav_scp)`` the per-pair
    results are summed, divided by the number of pairs and written through
    ``ArchiveWriter(args.srp_ark, args.scp)``.

    Args:
        args: parsed command-line options.

    Raises:
        RuntimeError: when a pair in ``args.diag_pair`` does not hold exactly
            two indexes, or when the averaged matrix contains NaN.
    """
    srp_pair = [
        tuple(map(int, p.split(","))) for p in args.diag_pair.split(";")
    ]
    # str.split never yields an empty list, so validate the shape of every
    # pair rather than only the list length; the option is read from
    # args.diag_pair (args.pair does not exist).
    if not srp_pair or any(len(pair) != 2 for pair in srp_pair):
        raise RuntimeError("Bad configurations with --pair {}".format(
            args.diag_pair))
    logger.info("Compute gcc with {}".format(srp_pair))

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    num_done = 0
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            # N x T x F
            srp = [
                gcc_phat_diag(stft_mat[i],
                              stft_mat[j],
                              min(i, j) * np.pi * 2 / args.n,
                              args.d,
                              num_bins=num_ffts // 2 + 1,
                              sr=args.sr,
                              num_doa=args.num_doa) for (i, j) in srp_pair
            ]
            srp = sum(srp) / len(srp_pair)
            nan = int(np.sum(np.isnan(srp)))
            if nan:
                # the previous format string had a stray "}" and raised
                # ValueError instead of reporting the NaN count
                raise RuntimeError("Matrix {} has nan ({:d} items)".format(
                    key, nan))
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processd {:d} utterances done".format(len(reader)))
Exemple #6
0
def run(args):
    """Compute mel filter-bank features for each utterance and write them.

    Spectrograms come from ``SpectrogramReader(args.wav_scp)``; each one is
    multiplied by the mel matrix from ``audio_lib.filters.mel``, transposed,
    optionally log-compressed (``args.log``) and written with the writer
    selected by ``args.format``. For a 3-D spectrogram only index 0 along
    the first axis is used.

    Args:
        args: parsed command-line options.

    Raises:
        RuntimeError: when ``args.max_freq`` exceeds ``args.samp_freq // 2``.
    """
    mel_kwargs = dict(
        n_mels=args.num_bins,
        fmin=args.min_freq,
        fmax=args.max_freq,
        htk=True,
    )
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        apply_log=False,
        apply_pow=False,
        normalize=args.norm,
        apply_abs=True,
        transpose=False,  # F x T
    )

    nyquist = args.samp_freq // 2
    if args.max_freq > nyquist:
        raise RuntimeError("Max frequency for mel exceeds sample frequency")
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    if args.round_power_of_two:
        fft_size = nfft(args.frame_len)
    else:
        fft_size = args.frame_len
    # N x F
    mel_weights = audio_lib.filters.mel(args.samp_freq, fft_size, **mel_kwargs)
    writers = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}
    WriterImpl = writers[args.format]

    with WriterImpl(args.dup_ark, args.scp) as writer:
        for key, spectrum in spectrogram_reader:
            # keep only the first entry of a 3-D spectrogram
            if spectrum.ndim == 3:
                single = spectrum[0]
            else:
                single = spectrum
            # N x F * F x T = N * T => T x N
            projected = np.dot(mel_weights, single)
            fbank = np.transpose(projected)
            if args.log:
                fbank = np.log(np.maximum(fbank, EPSILON))
            writer.write(key, fbank)
    logger.info("Process {:d} utterances".format(len(spectrogram_reader)))
Exemple #7
0
def compute_spatial_feats(args, S):
    """Dispatch to the spatial feature extractor named by ``args.type``.

    Args:
        args: parsed command-line options. ``"srp"`` calls ``srp_phat``,
            ``"ipd"`` calls ``ipd`` on one channel pair, anything else
            calls ``msc``.
        S: STFT array; the ``"ipd"`` branch needs ``S.ndim >= 3`` and picks
            the two channels along axis 0.

    Returns:
        Whatever the selected extractor returns.

    Raises:
        ValueError: in the ``"ipd"`` branch, when ``args.ipd_index`` does not
            hold exactly two indexes or ``S`` has fewer than three dimensions.
    """
    kind = args.type

    if kind == "srp":
        srp_kwargs = dict(
            sample_frequency=args.samp_frequency,
            num_doa=args.num_doa,
            num_bins=nfft(args.frame_length) // 2 + 1,
            samp_doa=not args.samp_tdoa,
        )
        topo = [float(pos) for pos in args.linear_topo.split(",")]
        return srp_phat(S, topo, **srp_kwargs)

    if kind != "ipd":
        return msc(S, context=args.context)

    # the index string is validated before the STFT rank
    pair = [int(tok) for tok in args.ipd_index.split(",")]
    if len(pair) != 2:
        raise ValueError(
            "Invalid --ipd.index configuration detected: {}".format(
                args.ipd_index))
    if S.ndim < 3:
        raise ValueError("Only one-channel STFT available")
    left, right = pair
    return ipd(S[left], S[right], sin=args.ipd_sin)
def run(args):
    """Compute directional features from mask-derived steering vectors.

    For every utterance that has a mask, the mask is clipped to at most 1, a
    covariance matrix and a steering vector are derived with
    ``MvdrBeamformer`` and ``directional_feats(spect, sv.T)`` is written
    through ``ArchiveWriter(args.dup_ark, args.scp)``. Utterances without a
    mask are reported with a warning.

    Args:
        args: parsed command-line options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # The reader is configured with args.round_power_of_two, so the bin count
    # is derived with the same rule instead of always rounding up.
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is deprecated in favour of Logger.warning
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """Compute directional features from mask-derived steering vectors.

    For every utterance that has a mask, the mask is optionally transposed
    (``args.trans``), clipped to at most 1, turned into a covariance matrix
    and a steering vector with ``MvdrBeamformer``, and
    ``directional_feats(spect, sv.T)`` is written through
    ``ArchiveWriter(args.dup_ark, args.scp)``. Utterances without a mask are
    reported with a warning.

    Args:
        args: parsed command-line options.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is deprecated in favour of Logger.warning
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """Apply an offline or online mask-based beamformer to each utterance.

    ``args.chunk_size <= 0`` selects the offline beamformer named by
    ``args.beamformer``; otherwise the online variant is used and the work is
    delegated to ``do_online_beamform``. Masks are clipped to at most 1 and
    transposed when their first dimension equals the number of frequency
    bins of the STFT. Enhanced signals are written through
    ``WaveWriter(args.dst_dir)``. Utterances without a mask are skipped.

    Args:
        args: parsed command-line options.

    Raises:
        RuntimeError: when ``0 < args.chunk_size < 32``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    online = False
    # The reader is configured with args.round_power_of_two, so the bin count
    # is derived with the same rule instead of always rounding up.
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1

    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in mask_reader:
                continue
            num_done += 1
            power = spectrogram_reader.power(key)
            logger.info(
                "Processing utterance {}, signal power {:.2f}...".format(
                    key, 10 * np.log10(power + 1e-5)))
            # prefer T x F
            speech_mask = mask_reader[key]
            # constraint [0, 1]
            speech_mask = np.minimum(speech_mask, 1)
            # make sure speech_mask at shape T x F
            _, F, _ = stft_mat.shape
            # if in F x T
            if speech_mask.shape[0] == F:
                speech_mask = np.transpose(speech_mask)
            # stft_enh, stft_mat: (N) x F x T
            if not online:
                stft_enh = beamformer.run(speech_mask,
                                          stft_mat,
                                          normalize=args.ban)
            else:
                stft_enh = do_online_beamform(beamformer, speech_mask,
                                              stft_mat, args)
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = istft(stft_enh, power=power, **stft_kwargs)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_done, len(spectrogram_reader)))
Exemple #11
0
def run(args):
    """Apply a mask-based beamformer, optionally with interference masks.

    Target masks are read from ``args.tgt_mask``; when ``args.itf_mask`` is
    set, interference masks are read from it and passed to the beamformer as
    the noise mask. ``args.chunk_size <= 0`` selects the offline beamformer
    named by ``args.beamformer``; otherwise the online variant is used via
    ``do_online_beamform``. When ``0.5 < args.vad_proportion < 1`` the
    entries flagged by ``compute_vad_masks`` are replaced with ``1.0e-4`` in
    both masks. Utterances that raise ``np.linalg.LinAlgError`` are logged
    and skipped; utterances without a target mask are skipped silently.

    Args:
        args: parsed command-line options.

    Raises:
        RuntimeError: when ``0 < args.chunk_size < 32``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # the interference reader must open args.itf_mask; it used to re-open the
    # target masks, which made the "interference" mask a copy of the target
    itf_mask_reader = MaskReader[args.fmt](
        args.itf_mask) if args.itf_mask else None
    if itf_mask_reader is not None:
        logger.info(f"Using interfering masks from {args.itf_mask}")
    online = False
    # The reader is configured with args.round_power_of_two, so the bin count
    # is derived with the same rule instead of always rounding up.
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    ref_channel = args.pmwf_ref if args.pmwf_ref >= 0 else None
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf-0": PmwfBeamformer(num_bins, beta=0, ref_channel=ref_channel),
        "pmwf-1": PmwfBeamformer(num_bins, beta=1, ref_channel=ref_channel)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in tgt_mask_reader:
                continue
            power = spectrogram_reader.power(key)
            logger.info(f"Processing utterance {key}, " +
                        f"signal power {10 * np.log10(power + 1e-5):.2f}...")
            # prefer T x F
            speech_mask = tgt_mask_reader[key]
            # constraint [0, 1] (only applied when no interference mask)
            if itf_mask_reader is None:
                speech_mask = np.minimum(speech_mask, 1)
                interf_mask = None
            else:
                interf_mask = itf_mask_reader[key]
            # make sure speech_mask at shape T x F
            _, F, _ = stft_mat.shape
            # if in F x T
            if speech_mask.shape[0] == F:
                speech_mask = np.transpose(speech_mask)
                if interf_mask is not None:
                    interf_mask = np.transpose(interf_mask)
            if 0.5 < args.vad_proportion < 1:
                vad_mask, N = compute_vad_masks(stft_mat[0],
                                                args.vad_proportion)
                logger.info(f"Filtering {N} TF-masks...")
                speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                if interf_mask is not None:
                    interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
            # stft_enh, stft_mat: (N) x F x T
            try:
                if not online:
                    stft_enh = beamformer.run(speech_mask,
                                              stft_mat,
                                              noise_mask=interf_mask,
                                              normalize=args.ban)
                else:
                    stft_enh = do_online_beamform(beamformer, speech_mask,
                                                  interf_mask, stft_mat, args)
            except np.linalg.LinAlgError:
                logger.error(f"Raise linalg error: {key}")
                continue
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = istft(stft_enh, power=power, **stft_kwargs)
            writer.write(key, samps)
            num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")