Exemple #1
0
def run(args):
    """Mask each spectrogram and hand the product to ``istft``.

    Utterances are read from ``args.wav_scp``. Those without an entry in the
    mask reader are skipped; for the others ``specs * mask`` is passed to
    ``istft`` together with the target path ``<args.dst_dir>/<key>.wav``.

    Args:
        args: option namespace. Attributes read here: ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``mask_scp``, ``numpy``, ``samp_freq``, ``transpose``,
            ``keep_length`` and ``dst_dir``.

    Raises:
        ValueError: if a mask and its spectrogram differ in shape.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # args.numpy selects NumpyReader over ScriptReader for the mask script
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_utts = 0
    # Must be the plain value: a trailing comma here would wrap the sample
    # rate in a 1-tuple before it reaches istft.
    fs = args.samp_freq
    for key, specs in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        mask = mask_reader[key]
        if args.transpose:
            mask = np.transpose(mask)
        logger.info("Processing utterance {}...".format(key))
        if mask.shape != specs.shape:
            raise ValueError(
                "Dimention mismatch between mask and spectrogram"
                "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                .format(mask.shape, specs.shape))
        # only look up the original length when it has to be kept
        nsamps = spectrogram_reader.nsamps(key) if args.keep_length else None
        istft(os.path.join(args.dst_dir, "{}.wav".format(key)),
              specs * mask,
              **stft_kwargs,
              fs=fs,
              nsamps=nsamps)
    logger.info("Processed {} utterances".format(num_utts))
Exemple #2
0
def run(args):
    """Run ``wpe`` on every utterance and emit one output per channel.

    Each spectrogram from ``args.wav_scp`` is reordered to F x N x T, passed
    through ``wpe``, reordered back to N x T x F and then every channel is
    given to ``istft`` with the path ``<args.dst_dir>/<key>.CH<n>.wav``
    (channels are numbered from 1). ``args.dst_dir`` is created if missing.

    Args:
        args: option namespace. Attributes read here: ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``taps``, ``delay``,
            ``iters``, ``context``, ``wav_scp``, ``dst_dir``, ``samp_freq``.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=True,  # T x F
    )
    wpe_kwargs = dict(
        taps=args.taps,
        delay=args.delay,
        iters=args.iters,
        psd_context=args.context,
    )
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    out_dir = args.dst_dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for key, reverbed in reader:
        # N x T x F => F x N x T, which is the layout handed to wpe
        freq_major = np.transpose(reverbed, [2, 0, 1])
        processed = wpe(freq_major, **wpe_kwargs)
        # F x N x T => N x T x F, so iterating yields one channel at a time
        per_channel = np.transpose(processed, [1, 2, 0])
        for chan_no, chan_spec in enumerate(per_channel, start=1):
            chan_name = "{}.CH{:d}.wav".format(key, chan_no)
            istft(os.path.join(out_dir, chan_name),
                  chan_spec,
                  **stft_kwargs,
                  fs=args.samp_freq)
    logger.info("Processed {:d} utterances".format(len(reader)))
Exemple #3
0
def run(args):
    """Apply TF-masks to spectrograms, optionally with a reference phase.

    For every utterance of ``args.wav_scp`` that has a mask, the mask is
    brought to the spectrogram's orientation, multiplied in, and the result
    is synthesised with ``istft`` and written through ``WaveWriter``. When
    ``args.phase_ref`` is set, the magnitude of the masked spectrogram is
    combined with the phase of the reference spectrogram instead of the
    utterance's own phase.

    Args:
        args: option namespace. Attributes read here: ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``round_power_of_two``,
            ``wav_scp``, ``phase_ref``, ``fmt``, ``mask_scp``, ``dst_dir``,
            ``samp_freq`` and ``keep_length``.

    Raises:
        ValueError: if mask and spectrogram shapes still differ after the
            orientation fix.
    """
    # shape: T x F, complex
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )
    spec_reader = SpectrogramReader(
        args.wav_scp,
        **stft_kwargs,
        round_power_of_two=args.round_power_of_two)
    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info("Using phase reference from {}".format(args.phase_ref))
    else:
        phase_reader = None
    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    masks = reader_by_fmt[args.fmt](args.mask_scp)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in spec_reader:
            # specs: T x F
            if key not in masks:
                continue
            num_done += 1
            mask = masks[key]
            # make sure the mask is T x F: flip it when its first axis
            # matches the number of frequency bins
            _, num_bins = specs.shape
            if mask.shape[0] == num_bins:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            nsamps = spec_reader.nsamps(key) if args.keep_length else None
            norm = spec_reader.samp_norm(key)
            if phase_reader is None:
                enhanced = specs * mask
            else:
                # use phase from ref
                ref_angle = np.angle(phase_reader[key])
                enhanced = np.abs(specs) * mask * np.exp(ref_angle * 1j)
            samps = istft(enhanced, **stft_kwargs, norm=norm, nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spec_reader)))
Exemple #4
0
def run(args):
    """Separate each utterance with ``auxiva`` and write every source.

    The spectrogram of each utterance in ``args.wav_scp`` is given to
    ``auxiva`` with ``args.epochs``. Each entry along the first axis of the
    result is synthesised with ``istft`` and stored via ``write_wav`` as
    ``<args.dst_dir>/<key>.SRC<n>.wav`` (sources are numbered from 1).

    Args:
        args: option namespace. Attributes read here: ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``epochs``, ``dst_dir`` and ``fs``.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        transpose=True,  # F x T instead of T x F
    )

    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    for key, spectrogram in reader:
        logger.info("Processing utterance {}...".format(key))
        sources = auxiva(spectrogram, args.epochs)
        for src_no, src_spec in enumerate(sources, start=1):
            # the normalisation value is looked up again for every source
            norm = reader.samp_norm(key)
            samps = istft(src_spec, **stft_kwargs, norm=norm)
            wav_name = "{}.SRC{:d}.wav".format(key, src_no)
            write_wav(os.path.join(args.dst_dir, wav_name), samps, fs=args.fs)
    logger.info("Processed {:d} utterances".format(len(reader)))
def run(args):
    """Enhance every utterance with a ``DSBeamformer`` steered to one DoA.

    The array geometry comes from the comma separated ``args.linear_topo``
    and the steering direction from ``args.doa``. Each spectrogram of
    ``args.wav_scp`` is run through the beamformer, synthesised with
    ``istft`` and written through ``WaveWriter``.

    Args:
        args: option namespace. Attributes read here: ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``linear_topo``, ``doa``,
            ``wav_scp``, ``round_power_of_two``, ``dst_dir``, ``fs`` and
            ``speed``.

    Raises:
        RuntimeError: if the adjusted DoA falls outside [0, 180].
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )
    topo = [float(pos) for pos in args.linear_topo.split(",")]
    # Non-positive angles are shifted by 180 before the range check.
    # NOTE(review): a DoA of exactly 0 also takes this branch and becomes
    # 180 - confirm that is intended.
    if args.doa > 0:
        doa = args.doa
    else:
        doa = 180 + args.doa
    if doa < 0 or doa > 180:
        raise RuntimeError("Illegal value for DoA: {:.2f}".format(args.doa))

    reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    beamformer = DSBeamformer(topo)
    logger.info("Initialize {:d} channel DSBeamformer".format(len(topo)))

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in reader:
            stft_enh = beamformer.run(doa,
                                      stft_src,
                                      c=args.speed,
                                      sample_rate=args.fs)
            # forward the reader's power value for this utterance to istft
            utt_power = reader.power(key)
            writer.write(key, istft(stft_enh, **stft_kwargs, power=utt_power))
    logger.info("Processed {:d} utterances".format(len(reader)))
Exemple #6
0
def run(args):
    """Apply a ``FixedBeamformer`` with weights loaded from a .mat file.

    The weights are taken from entry ``args.weight_key`` of the dictionary
    returned by ``loadmat(args.weights)``. Every spectrogram of
    ``args.wav_scp`` is beamformed, synthesised with ``istft`` and written
    through ``WaveWriter`` into ``args.dump_dir``.

    Args:
        args: option namespace. Attributes read here: ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``wav_scp``,
            ``round_power_of_two``, ``weights``, ``weight_key`` and
            ``dump_dir``.

    Raises:
        KeyError: if ``args.weight_key`` is not in the loaded dictionary.
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )
    reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    weights_dict = loadmat(args.weights)
    # fail before any audio is processed when the requested entry is absent
    if args.weight_key not in weights_dict:
        raise KeyError("Weight key error: no \'{}\' in {}".format(
            args.weight_key, args.weights))
    weights = weights_dict[args.weight_key]

    beamformer = FixedBeamformer(weights)
    with WaveWriter(args.dump_dir) as writer:
        for key, stft_mat in reader:
            logger.info("Processing utterance {}...".format(key))
            # do not normalize
            enhanced = istft(beamformer.run(stft_mat), **stft_kwargs)
            writer.write(key, enhanced)
    logger.info("Processed {:d} utterances".format(len(reader)))
def run(args):
    """Apply TF-masks to spectrograms and write the synthesised signals.

    Utterances of ``args.wav_scp`` without a mask entry are skipped. For the
    others the (optionally transposed) mask is multiplied with the
    spectrogram, synthesised with ``istft`` and written through
    ``WaveWriter``.

    Args:
        args: option namespace. Attributes read here: ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``mask_scp``, ``numpy``, ``dst_dir``, ``samp_freq``,
            ``transpose`` and ``keep_length``.

    Raises:
        ValueError: if a mask and its spectrogram differ in shape.
    """
    # shape: T x F, complex
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    spec_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # args.numpy selects NumpyReader over ScriptReader for the mask script
    mask_cls = NumpyReader if args.numpy else ScriptReader
    masks = mask_cls(args.mask_scp)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in spec_reader:
            if key not in masks:
                continue
            num_done += 1
            mask = masks[key]
            if args.transpose:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            # only look up the original length when it has to be kept
            nsamps = spec_reader.nsamps(key) if args.keep_length else None
            norm = spec_reader.samp_norm(key)
            writer.write(
                key,
                istft(specs * mask, **stft_kwargs, norm=norm, nsamps=nsamps))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spec_reader)))
def run(args):
    """Run a mask-driven beamformer over every utterance that has a mask.

    One of the "mvdr", "gevd" or "pmwf" beamformers is selected by
    ``args.beamformer``. For each utterance with a mask entry the beamformer
    is run on the mask and the spectrogram, the output is optionally masked
    again, and the result is passed to ``istft`` together with the path
    ``<args.dst_dir>/<key>.wav``.

    Args:
        args: option namespace. Attributes read here: ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``mask_scp``, ``numpy``, ``beamformer``, ``trans``, ``postf``,
            ``mask``, ``dst_dir`` and ``samp_freq``.

    Raises:
        KeyError: if ``args.beamformer`` is not one of the supported names.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spec_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_cls = NumpyReader if args.numpy else ScriptReader
    masks = mask_cls(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # all candidates are constructed up front; the option picks one of them
    candidates = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins)
    }
    beamformer = candidates[args.beamformer]

    num_utts = 0
    for key, stft_mat in spec_reader:
        if key not in masks:
            continue
        num_utts += 1
        norm = spec_reader.samp_norm(key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            key, norm))
        # prefer T x F
        speech_mask = masks[key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask, stft_mat, normalize=args.postf)
        # masking beamformer output if necessary
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        out_path = os.path.join(args.dst_dir, '{}.wav'.format(key))
        istft(out_path,
              stft_enh,
              norm=norm,
              fs=args.samp_freq,
              **stft_kwargs)
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_utts, len(spec_reader)))
def run(args):
    """Compute per-speaker masks from references and apply them to mixtures.

    ``args.ref_scp`` is a comma separated list of reference scripts, one per
    speaker. For every mixture utterance that is present in all reference
    readers, ``compute_mask`` is called with the mixture, the list of
    reference spectrograms and ``args.mask``. Each returned mask is
    multiplied with the mixture and passed to ``istft`` with the path
    ``<args.dump_dir>/<key>.s<n>.wav`` (speakers are numbered from 1).

    Args:
        args: option namespace. Attributes read here: ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``mask``, ``mix_scp``,
            ``ref_scp``, ``keep_length``, ``dump_dir`` and ``fs``.
    """
    # return complex result
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    logger.info("Using mask: {}".format(args.mask.upper()))
    mixture_reader = SpectrogramReader(args.mix_scp, **stft_kwargs)
    ref_scps = args.ref_scp.split(",")
    logger.info("Number of speakers: {:d}".format(len(ref_scps)))
    ref_readers = [SpectrogramReader(scp, **stft_kwargs) for scp in ref_scps]

    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        nsamps = mixture_reader.nsamps(key) if args.keep_length else None
        # every speaker needs a reference; stop checking at the first miss
        if any(key not in ref for ref in ref_readers):
            logger.info("Skip utterance {}, missing targets".format(key))
            continue
        num_utts += 1
        references = [ref[key] for ref in ref_readers]
        spk_masks = compute_mask(mixture, references, args.mask)
        for spk_no, spk_mask in enumerate(spk_masks, start=1):
            out_path = os.path.join(args.dump_dir,
                                    '{}.s{}.wav'.format(key, spk_no))
            istft(out_path,
                  mixture * spk_mask,
                  **stft_kwargs,
                  fs=args.fs,
                  nsamps=nsamps)
    logger.info("Processed {} utterance!".format(num_utts))
def run(args):
    """Apply a ``FixedBeamformer`` and pass each result to ``istft``.

    The weights are taken from entry ``args.weight_key`` of the dictionary
    returned by ``loadmat(args.weights)``. Every spectrogram of
    ``args.wav_scp`` is beamformed and handed to ``istft`` together with the
    path ``<args.dst_dir>/<key>.wav``.

    Args:
        args: option namespace. Attributes read here: ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``weights``, ``weight_key`` and ``dst_dir``.

    Raises:
        KeyError: if ``args.weight_key`` is not in the loaded dictionary.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        transpose=False,
    )
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    weights_dict = loadmat(args.weights)
    # fail before any audio is processed when the requested entry is absent
    if args.weight_key not in weights_dict:
        raise KeyError("Weight key error: no \'{}\' in {}".format(
            args.weight_key, args.weights))
    weights = weights_dict[args.weight_key]

    beamformer = FixedBeamformer(weights)
    # stays 0 when the reader yields nothing
    num_utts = 0
    for num_utts, (key, stft_mat) in enumerate(reader, start=1):
        logger.info("Processing utterance {}".format(key))
        out_path = os.path.join(args.dst_dir, '{}.wav'.format(key))
        # do not normalize
        istft(out_path, beamformer.run(stft_mat), **stft_kwargs)
    logger.info("Processed {} utterances".format(num_utts))
Exemple #11
0
def run(args):
    """Turn spectral features back into waveforms.

    Features are read with the reader selected by ``args.fmt``. Log and
    power compression are undone when ``args.apply_log`` / ``args.apply_pow``
    are set. Without a phase reference the waveform comes from
    ``griffin_lim``; with ``args.phase_ref`` the phase of the reference
    spectrogram is attached to the features and ``istft`` is used. Results
    are written through ``WaveWriter`` into ``args.dump_dir``.

    Args:
        args: option namespace. Attributes read here: ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``fmt``, ``feat_scp``,
            ``phase_ref``, ``round_power_of_two``, ``dump_dir``, ``sr``,
            ``normalize``, ``apply_log``, ``apply_pow`` and ``epoches``.

    Raises:
        KeyError: if a phase reference is in use and lacks an utterance key.
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )

    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    feature_reader = reader_by_fmt[args.fmt](args.feat_scp)

    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info(f"Using phase reference from {args.phase_ref}")
    else:
        phase_reader = None

    with WaveWriter(args.dump_dir, fs=args.sr,
                    normalize=args.normalize) as writer:
        for key, spec in feature_reader:
            logger.info(f"Processing utterance {key}...")
            # if log, transform to linear
            if args.apply_log:
                spec = np.exp(spec)
            # if power spectrum, transform to magnitude spectrum
            if args.apply_pow:
                spec = np.sqrt(spec)
            if phase_reader is not None:
                if key not in phase_reader:
                    raise KeyError(f"Missing key {key} in phase reader")
                ref = phase_reader[key]
                # a 3-D reference contributes only its first entry
                if ref.ndim == 3:
                    ref = ref[0]
                unit_phase = np.exp(np.angle(ref) * 1j)
                samps = istft(spec * unit_phase, **stft_kwargs, norm=0.8)
            else:
                # griffin lim
                samps = griffin_lim(spec,
                                    epoches=args.epoches,
                                    transpose=True,
                                    norm=0.8,
                                    **stft_kwargs)
            writer.write(key, samps)
    logger.info(f"Processed {len(feature_reader)} utterance done")
Exemple #12
0
def run(args):
    """Run ``wpe`` on every utterance and write the multi-channel result.

    Each spectrogram from ``args.wav_scp`` is reordered to F x N x T, passed
    through ``wpe`` and reordered back to N x T x F. Every channel is then
    synthesised with ``istft``, the channels are stacked and written through
    ``WaveWriter``. Utterances for which ``wpe`` raises
    ``np.linalg.LinAlgError`` are logged and skipped.

    Args:
        args: option namespace. Attributes read here: ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``num_iters``,
            ``context``, ``taps``, ``delay``, ``wav_scp``,
            ``round_power_of_two``, ``dst_dir`` and ``samp_fs``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "num_iters": args.num_iters,
        "context": args.context,
        "taps": args.taps,
        "delay": args.delay
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_fs) as writer:
        for key, reverbed in spectrogram_reader:
            logger.info("Processing utt {}...".format(key))
            # N x T x F => F x N x T
            reverbed = np.transpose(reverbed, (2, 0, 1))
            try:
                # F x N x T
                dereverb = wpe(reverbed, **wpe_kwargs)
            except np.linalg.LinAlgError:
                # Logger.warn is a deprecated alias; warning() is the
                # supported spelling.
                logger.warning(
                    "{}: Failed cause LinAlgError in wpe".format(key))
                continue
            # F x N x T => N x T x F
            dereverb = np.transpose(dereverb, (1, 2, 0))
            # dump multi-channel
            samps = np.stack(
                [istft(spectra, **stft_kwargs) for spectra in dereverb])
            writer.write(key, samps)
            # show progress cause slow speed
            num_done += 1
            if not num_done % 100:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Run an offline or online mask-driven beamformer over the utterances.

    ``args.chunk_size <= 0`` selects an offline beamformer ("mvdr", "gevd"
    or "pmwf"); otherwise an online one ("mvdr" or "gevd") is used through
    ``do_online_beamform``. For every utterance with a mask entry the mask
    is capped at 1, brought to T x F, used to drive the beamformer, and the
    output is synthesised with ``istft`` and written through ``WaveWriter``.

    Args:
        args: option namespace. Attributes read here: ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``wav_scp``,
            ``round_power_of_two``, ``fmt``, ``mask_scp``, ``channels``,
            ``alpha``, ``chunk_size``, ``beamformer``, ``dst_dir``,
            ``samp_freq``, ``ban`` and ``mask``.

    Raises:
        RuntimeError: if a positive ``args.chunk_size`` is below 32.
        KeyError: if ``args.beamformer`` is not available in the chosen
            (offline or online) set.
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spec_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    masks = reader_by_fmt[args.fmt](args.mask_scp)

    num_bins = nfft(args.frame_len) // 2 + 1

    # both tables are fully constructed before one entry is selected
    offline_table = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins)
    }
    online_table = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        online = False
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = offline_table[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = online_table[args.beamformer]
        online = True
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spec_reader:
            if key not in masks:
                continue
            num_done += 1
            power = spec_reader.power(key)
            logger.info(
                "Processing utterance {}, signal power {:.2f}...".format(
                    key, 10 * np.log10(power + 1e-5)))
            # prefer T x F; cap values at 1
            speech_mask = np.minimum(masks[key], 1)
            # flip the mask when its first axis matches the F axis of
            # stft_mat, so that it ends up T x F
            _, num_freqs, _ = stft_mat.shape
            if speech_mask.shape[0] == num_freqs:
                speech_mask = np.transpose(speech_mask)
            # stft_enh, stft_mat: (N) x F x T
            if online:
                stft_enh = do_online_beamform(beamformer, speech_mask,
                                              stft_mat, args)
            else:
                stft_enh = beamformer.run(speech_mask,
                                          stft_mat,
                                          normalize=args.ban)
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            writer.write(key, istft(stft_enh, power=power, **stft_kwargs))
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_done, len(spec_reader)))
Exemple #14
0
def run(args):
    """Run a beamformer driven by target and optional interference masks.

    ``args.chunk_size <= 0`` selects an offline beamformer ("mvdr", "gevd",
    "pmwf-0" or "pmwf-1"); otherwise an online one ("mvdr" or "gevd") is
    used through ``do_online_beamform``. For every utterance with a target
    mask the masks are brought to T x F, optionally filtered with
    ``compute_vad_masks``, used to drive the beamformer, and the output is
    synthesised with ``istft`` and written through ``WaveWriter``.
    Utterances that raise ``np.linalg.LinAlgError`` are logged and skipped.

    Args:
        args: option namespace. Attributes read here: ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``wav_scp``,
            ``round_power_of_two``, ``fmt``, ``tgt_mask``, ``itf_mask``,
            ``pmwf_ref``, ``channels``, ``alpha``, ``chunk_size``,
            ``beamformer``, ``dst_dir``, ``samp_freq``, ``vad_proportion``,
            ``ban`` and ``mask``.

    Raises:
        RuntimeError: if a positive ``args.chunk_size`` is below 32.
        KeyError: if ``args.beamformer`` is not available in the chosen
            (offline or online) set.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # The interference reader must be built from args.itf_mask; building it
    # from args.tgt_mask would hand the target masks to the beamformer as
    # the interference estimate.
    itf_mask_reader = MaskReader[args.fmt](
        args.itf_mask) if args.itf_mask else None
    if itf_mask_reader is not None:
        logger.info(f"Using interfering masks from {args.itf_mask}")
    online = False
    num_bins = nfft(args.frame_len) // 2 + 1
    # a negative pmwf_ref means "no explicit reference channel"
    ref_channel = args.pmwf_ref if args.pmwf_ref >= 0 else None
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf-0": PmwfBeamformer(num_bins, beta=0, ref_channel=ref_channel),
        "pmwf-1": PmwfBeamformer(num_bins, beta=1, ref_channel=ref_channel)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key in tgt_mask_reader:
                power = spectrogram_reader.power(key)
                logger.info(
                    f"Processing utterance {key}, " +
                    f"signal power {10 * np.log10(power + 1e-5):.2f}...")
                # prefer T x F
                speech_mask = tgt_mask_reader[key]
                # constraint [0, 1]; only applied when there is no
                # interference mask
                if itf_mask_reader is None:
                    speech_mask = np.minimum(speech_mask, 1)
                    interf_mask = None
                else:
                    interf_mask = itf_mask_reader[key]
                # make sure speech_mask at shape T x F
                _, F, _ = stft_mat.shape
                # if in F x T
                if speech_mask.shape[0] == F:
                    speech_mask = np.transpose(speech_mask)
                    if interf_mask is not None:
                        interf_mask = np.transpose(interf_mask)
                # VAD filtering is active only for proportions strictly
                # between 0.5 and 1
                if 0.5 < args.vad_proportion < 1:
                    vad_mask, N = compute_vad_masks(stft_mat[0],
                                                    args.vad_proportion)
                    logger.info(f"Filtering {N} TF-masks...")
                    # flagged entries are replaced by a small constant
                    speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                    if interf_mask is not None:
                        interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
                # stft_enh, stft_mat: (N) x F x T
                try:
                    if not online:
                        stft_enh = beamformer.run(speech_mask,
                                                  stft_mat,
                                                  noise_mask=interf_mask,
                                                  normalize=args.ban)
                    else:
                        stft_enh = do_online_beamform(beamformer, speech_mask,
                                                      interf_mask, stft_mat,
                                                      args)
                except np.linalg.LinAlgError:
                    logger.error(f"Raise linalg error: {key}")
                    continue
                # masking beamformer output if necessary
                if args.mask:
                    stft_enh = stft_enh * np.transpose(speech_mask)
                samps = istft(stft_enh, power=power, **stft_kwargs)
                writer.write(key, samps)
                # counted only after a successful write
                num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")