Esempio n. 1
0
def run(args):
    """Run `auxiva` on every utterance in ``args.wav_scp`` and save the results.

    For each (key, spectrogram) pair yielded by the ``SpectrogramReader``,
    the spectrogram is passed to ``auxiva`` together with ``args.epochs``.
    Every slice along the first axis of the result is converted back to
    samples with ``istft`` and written to
    ``<args.dst_dir>/<key>.SRC<n>.wav`` (``n`` counts from 1).

    Args:
        args: Parsed options. Reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``epochs``, ``dst_dir``
            and ``fs``.
    """
    # The same options drive both the forward and the inverse transform.
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        transpose=True,  # F x T instead of T x F
    )
    reader = SpectrogramReader(args.wav_scp, **stft_conf)

    for utt_id, mixture in reader:
        logger.info("Processing utterance {}...".format(utt_id))
        sources = auxiva(mixture, args.epochs)
        num_sources = sources.shape[0]
        # Output files are numbered from 1, the array is indexed from 0.
        for src_no in range(1, num_sources + 1):
            source_stft = sources[src_no - 1]
            norm = reader.samp_norm(utt_id)
            waveform = istft(source_stft, **stft_conf, norm=norm)
            wav_name = "{}.SRC{:d}.wav".format(utt_id, src_no)
            wav_path = os.path.join(args.dst_dir, wav_name)
            write_wav(wav_path, waveform, fs=args.fs)

    total = len(reader)
    logger.info("Processed {:d} utterances".format(total))
Esempio n. 2
0
def run(args):
    """Apply per-utterance masks to spectrograms and write the waveforms.

    Iterates over the spectrograms read from ``args.wav_scp``. Utterances
    whose key is not found in the mask reader are skipped. For the others
    the mask (optionally transposed when ``args.transpose`` is set) is
    multiplied element-wise with the spectrogram, inverted with ``istft``
    and handed to a ``WaveWriter`` opened on ``args.dst_dir``.

    Args:
        args: Parsed options. Reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``mask_scp``, ``numpy``,
            ``transpose``, ``keep_length``, ``dst_dir`` and ``samp_freq``.

    Raises:
        ValueError: If the mask shape differs from the spectrogram shape.
    """
    # shape: T x F, complex
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    reader = SpectrogramReader(args.wav_scp, **stft_conf)

    # Masks come either from numpy files or from a script-style archive.
    if args.numpy:
        masks = NumpyReader(args.mask_scp)
    else:
        masks = ScriptReader(args.mask_scp)

    processed = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for utt_id, stft_mat in reader:
            if utt_id not in masks:
                continue
            processed += 1

            tf_mask = masks[utt_id]
            if args.transpose:
                tf_mask = np.transpose(tf_mask)
            logger.info("Processing utterance {}...".format(utt_id))

            if tf_mask.shape != stft_mat.shape:
                raise ValueError(
                    ("Dimention mismatch between mask and spectrogram"
                     "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                     ).format(tf_mask.shape, stft_mat.shape))

            # Only ask for the original length when it must be preserved.
            if args.keep_length:
                nsamps = reader.nsamps(utt_id)
            else:
                nsamps = None
            norm = reader.samp_norm(utt_id)

            masked = stft_mat * tf_mask
            waveform = istft(masked, **stft_conf, norm=norm, nsamps=nsamps)
            writer.write(utt_id, waveform)

    total = len(reader)
    logger.info("Processed {:d} utterances over {:d}".format(
        processed, total))
Esempio n. 3
0
def run(args):
    """Apply masks to spectrograms, optionally borrowing a reference phase.

    Iterates over the spectrograms read from ``args.wav_scp`` and skips
    utterances whose key is missing from the mask reader. The mask is
    transposed when its first dimension equals the spectrogram's second
    dimension. If ``args.phase_ref`` is set, the output is built from the
    spectrogram magnitude, the mask and the phase of the matching
    reference spectrogram; otherwise the mask multiplies the complex
    spectrogram directly. The result is inverted with ``istft`` and
    handed to a ``WaveWriter`` opened on ``args.dst_dir``.

    Args:
        args: Parsed options. Reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``round_power_of_two``, ``wav_scp``,
            ``phase_ref``, ``fmt`` ("numpy" or "kaldi"), ``mask_scp``,
            ``keep_length``, ``dst_dir`` and ``samp_freq``.

    Raises:
        ValueError: If mask and spectrogram shapes still differ after the
            optional transpose.
        KeyError: If ``args.fmt`` is neither "numpy" nor "kaldi".
    """
    # shape: T x F, complex
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )
    reader = SpectrogramReader(args.wav_scp,
                               **stft_conf,
                               round_power_of_two=args.round_power_of_two)

    # Optional second reader that supplies the phase for reconstruction.
    ref_reader = None
    if args.phase_ref:
        ref_reader = SpectrogramReader(
            args.phase_ref,
            **stft_conf,
            round_power_of_two=args.round_power_of_two)
        logger.info("Using phase reference from {}".format(args.phase_ref))

    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    reader_cls = reader_by_fmt[args.fmt]
    masks = reader_cls(args.mask_scp)

    processed = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for utt_id, stft_mat in reader:
            # stft_mat: T x F
            if utt_id not in masks:
                continue
            processed += 1

            tf_mask = masks[utt_id]
            # Make sure the mask is laid out as T x F.
            # NOTE(review): when T == F this transposes a mask that may
            # already be T x F -- confirm the intended layout for square input.
            _, num_bins = stft_mat.shape
            if tf_mask.shape[0] == num_bins:
                tf_mask = np.transpose(tf_mask)
            logger.info("Processing utterance {}...".format(utt_id))

            if tf_mask.shape != stft_mat.shape:
                raise ValueError(
                    ("Dimention mismatch between mask and spectrogram"
                     "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                     ).format(tf_mask.shape, stft_mat.shape))

            # Only ask for the original length when it must be preserved.
            if args.keep_length:
                nsamps = reader.nsamps(utt_id)
            else:
                nsamps = None
            norm = reader.samp_norm(utt_id)

            if ref_reader is None:
                enhanced = stft_mat * tf_mask
            else:
                # Keep the masked magnitude, take the phase from the reference.
                ref_angle = np.angle(ref_reader[utt_id])
                ref_phase = np.exp(ref_angle * 1j)
                enhanced = np.abs(stft_mat) * tf_mask * ref_phase

            waveform = istft(enhanced,
                             **stft_conf,
                             norm=norm,
                             nsamps=nsamps)
            writer.write(utt_id, waveform)

    total = len(reader)
    logger.info("Processed {:d} utterances over {:d}".format(
        processed, total))
def run(args):
    """Beamform every utterance that has a mask and write the result.

    Iterates over the spectrograms read from ``args.wav_scp`` and skips
    utterances whose key is missing from the mask reader. The mask
    (optionally transposed when ``args.trans`` is set) and the spectrogram
    are passed to the beamformer selected by ``args.beamformer``. When
    ``args.mask`` is set, the beamformer output is additionally multiplied
    by the transposed mask. The result is passed to ``istft`` together
    with the destination path ``<args.dst_dir>/<key>.wav``.

    Args:
        args: Parsed options. Reads ``frame_length``, ``frame_shift``,
            ``window``, ``center``, ``wav_scp``, ``mask_scp``, ``numpy``,
            ``beamformer`` ("mvdr", "gevd" or "pmwf"), ``trans``,
            ``postf``, ``mask``, ``dst_dir`` and ``samp_freq``.

    Raises:
        KeyError: If ``args.beamformer`` is not one of the supported names.
    """
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    reader = SpectrogramReader(args.wav_scp, **stft_conf)

    # Masks come either from numpy files or from a script-style archive.
    if args.numpy:
        masks = NumpyReader(args.mask_scp)
    else:
        masks = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # All three beamformers are constructed up front; one is then selected.
    beamformers = dict(
        mvdr=MvdrBeamformer(num_bins),
        gevd=GevdBeamformer(num_bins),
        pmwf=PmwfBeamformer(num_bins),
    )
    chosen = beamformers[args.beamformer]

    processed = 0
    for utt_id, stft_mat in reader:
        if utt_id not in masks:
            continue
        processed += 1

        norm = reader.samp_norm(utt_id)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            utt_id, norm))

        # prefer T x F
        tf_mask = masks[utt_id]
        if args.trans:
            tf_mask = np.transpose(tf_mask)

        # stft_enh, stft_mat: F x T
        stft_enh = chosen.run(tf_mask, stft_mat, normalize=args.postf)

        # Optionally mask the beamformer output as a post-processing step.
        if args.mask:
            stft_enh = stft_enh * np.transpose(tf_mask)

        # The return value of istft is unused here; presumably this variant
        # writes the file at the given path -- confirm against its definition.
        wav_path = os.path.join(args.dst_dir, '{}.wav'.format(utt_id))
        istft(wav_path,
              stft_enh,
              norm=norm,
              fs=args.samp_freq,
              **stft_conf)

    total = len(reader)
    logger.info("Processed {:d} utterances out of {:d}".format(
        processed, total))