def run(args):
    """Estimate CGMM speech masks for the utterances listed in ``args.wav_scp``.

    For every ``(key, stft)`` pair yielded by the spectrogram reader, a
    ``CgmmTrainer`` is trained for ``args.num_epochs`` epochs and the
    resulting masks are written as float32 through a ``NumpyWriter`` opened
    on ``args.dst_dir``. Utterances for which ``<dst_dir>/<key>.npy`` already
    exists are skipped. A ``RuntimeError`` raised while training or writing
    one utterance is logged as a warning and does not stop the loop.

    Args:
        args: parsed command-line namespace. Reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``init_mask``, ``dst_dir`` and ``num_epochs``.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # Optional external masks used to initialize the trainer
    init_mask_reader = ScriptReader(args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        for key, stft in spectrogram_reader:
            dst_path = os.path.join(args.dst_dir, "{}.npy".format(key))
            # Do not recompute masks that are already on disk
            if os.path.exists(dst_path):
                logger.info("Training utterance {} ... Skip".format(key))
                continue

            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info(
                    "Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, Ms=init_mask)
            try:
                speech_masks = trainer.train(args.num_epochs)
                num_done += 1
                writer.write(key, speech_masks.astype(np.float32))
                logger.info("Training utterance {} ... Done".format(key))
            except RuntimeError:
                # Logger.warn is a deprecated alias; use Logger.warning
                logger.warning(
                    "Training utterance {} ... Failed".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
 def __init__(self, data_dir):
     """Open ``feats.scp`` and ``spk2utt`` found under ``data_dir``.

     Raises:
         RuntimeError: if either file does not exist (``feats.scp`` is
             checked first).
     """
     feats_scp = os.path.join(data_dir, "feats.scp")
     spk2utt = os.path.join(data_dir, "spk2utt")
     for required in (feats_scp, spk2utt):
         if os.path.exists(required):
             continue
         raise RuntimeError("Missing {}!".format(required))
     self.reader = ScriptReader(feats_scp)
     self.spk2utt = parse_scps(spk2utt, num_tokens=-1)
def run(args):
    """Apply TF-masks to spectrograms and write the resynthesized audio.

    For each utterance of ``args.wav_scp`` that also has an entry in the
    mask reader, the mask (optionally transposed) is multiplied with the
    spectrogram, passed to ``istft`` and the samples are written through a
    ``WaveWriter`` opened on ``args.dst_dir``. Utterances without a mask
    are silently skipped.

    Args:
        args: parsed command-line namespace. Reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``mask_scp``, ``numpy``, ``dst_dir``, ``samp_freq``,
            ``transpose`` and ``keep_length``.

    Raises:
        ValueError: if a mask and its spectrogram differ in shape.
    """
    # shape: T x F, complex
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # Masks are stored either as .npy files or behind a Kaldi-style script
    mask_reader_cls = NumpyReader if args.numpy else ScriptReader
    mask_reader = mask_reader_cls(args.mask_scp)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in spectrogram_reader:
            if key not in mask_reader:
                continue
            num_done += 1
            mask = mask_reader[key]
            if args.transpose:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(key))
            if mask.shape != specs.shape:
                message = (
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                ).format(mask.shape, specs.shape)
                raise ValueError(message)
            # Only query the original length when it has to be restored
            nsamps = None
            if args.keep_length:
                nsamps = spectrogram_reader.nsamps(key)
            norm = spectrogram_reader.samp_norm(key)
            masked_specs = specs * mask
            samps = istft(
                masked_specs, **stft_kwargs, norm=norm, nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
Exemple #4
0
def run(args):
    """Apply TF-masks to spectrograms and write ``<dst_dir>/<key>.wav``.

    For each utterance of ``args.wav_scp`` that also has an entry in the
    mask reader, the mask (optionally transposed) is multiplied with the
    spectrogram and handed to ``istft`` together with the output path.
    Utterances without a mask are silently skipped.

    Args:
        args: parsed command-line namespace. Reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``mask_scp``, ``numpy``, ``dst_dir``, ``samp_freq``,
            ``transpose`` and ``keep_length``.

    Raises:
        ValueError: if a mask and its spectrogram differ in shape.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_utts = 0
    # Plain scalar: a trailing comma here would wrap the rate in a 1-tuple
    fs = args.samp_freq
    for key, specs in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        mask = mask_reader[key]
        if args.transpose:
            mask = np.transpose(mask)
        logger.info("Processing utterance {}...".format(key))
        if mask.shape != specs.shape:
            raise ValueError(
                "Dimention mismatch between mask and spectrogram"
                "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                .format(mask.shape, specs.shape))
        # Only query the original length when it has to be restored
        nsamps = spectrogram_reader.nsamps(
            key) if args.keep_length else None
        istft(os.path.join(args.dst_dir, "{}.wav".format(key)),
              specs * mask,
              **stft_kwargs,
              fs=fs,
              nsamps=nsamps)
    logger.info("Processed {} utterances".format(num_utts))
Exemple #5
0
def run(args):
    """Compute a d-vector for every entry of ``args.feats_scp``.

    Each feature matrix is passed to ``NnetComputer.compute`` and the result
    is written under its key through a ``NumpyWriter`` opened on
    ``args.dump_dir``.

    Args:
        args: parsed command-line namespace. Reads ``feats_scp``,
            ``checkpoint``, ``gpu`` and ``dump_dir``.
    """
    feats_reader = ScriptReader(args.feats_scp)
    computer = NnetComputer(args.checkpoint, args.gpu)
    with NumpyWriter(args.dump_dir) as writer:
        for utt_key, utt_feats in feats_reader:
            logger.info(
                "Compute dvector on utterance {}...".format(utt_key))
            writer.write(utt_key, computer.compute(utt_feats))
    total = len(feats_reader)
    logger.info("Compute over {:d} utterances".format(total))
Exemple #6
0
def run(args):
    """Copy every entry of a script/archive into ``.npy`` or ``.mat`` files.

    Args:
        args: parsed command-line namespace. Reads ``src`` ("scp" selects a
            ``ScriptReader``, anything else an ``ArchiveReader``),
            ``src_dec``, ``dst`` ("npy" or "mat"), ``dst_dir``, ``scp`` and
            ``trans`` (transpose each entry before writing).

    Raises:
        KeyError: if ``args.dst`` is neither "npy" nor "mat".
    """
    if args.src == "scp":
        src_reader = ScriptReader(args.src_dec)
    else:
        src_reader = ArchiveReader(args.src_dec)
    num_done = 0
    writers = {"npy": NumpyWriter, "mat": MatWriter}
    WriterImpl = writers[args.dst]
    with WriterImpl(args.dst_dir, args.scp) as writer:
        for key, mat in src_reader:
            out = np.transpose(mat) if args.trans else mat
            writer.write(key, out)
            num_done += 1
    logger.info(f"Copy {num_done} into directory {args.dst_dir}")
Exemple #7
0
def run(args):
    """Copy matrices or vectors from a script/archive into ``.npy``/``.mat``.

    Args:
        args: parsed command-line namespace. Reads ``input`` ("matrix"
            makes the readers run with ``matrix=True``), ``src`` ("scp"
            selects a ``ScriptReader``, anything else an ``ArchiveReader``),
            ``src_dec``, ``dst`` ("npy" or "mat"), ``dst_dir``, ``scp`` and
            ``trans`` (transpose each entry before writing).

    Raises:
        KeyError: if ``args.dst`` is neither "npy" nor "mat".
    """
    src_format = args.input == "matrix"
    if args.src == "scp":
        src_reader = ScriptReader(args.src_dec, matrix=src_format)
    else:
        src_reader = ArchiveReader(args.src_dec, matrix=src_format)
    num_done = 0
    writers = {"npy": NumpyWriter, "mat": MatWriter}
    WriterImpl = writers[args.dst]
    with WriterImpl(args.dst_dir, args.scp) as writer:
        for key, mat in src_reader:
            out = np.transpose(mat) if args.trans else mat
            writer.write(key, out)
            num_done += 1
    kind = "matrices" if src_format else "vectors"
    logger.info("Copy {0} {1} into directory {2}".format(
        num_done, kind, args.dst_dir))
Exemple #8
0
def run(args):
    """Reconstruct waveforms from spectral features with ``griffin_lim``.

    Each feature matrix read from ``args.feat_scp`` is optionally
    exponentiated (``apply_log``), mapped back from mel bins with the
    pseudo-inverse of the mel filter bank (``fbank``) and square-rooted
    (``apply_pow``) before being handed to ``griffin_lim``. The samples are
    written through a ``WaveWriter`` opened on ``args.dump_dir``.

    Args:
        args: parsed command-line namespace. Reads ``frame_len``,
            ``frame_hop``, ``frame_length``, ``window``, ``center``,
            ``epochs``, ``feat_scp``, ``fbank``, ``num_bins``, ``min_freq``,
            ``max_freq``, ``samp_freq``, ``dump_dir``, ``normalize``,
            ``apply_log`` and ``apply_pow``.

    Raises:
        RuntimeError: if the number of frequency bins does not match the
            FFT size derived from ``args.frame_length``.
    """
    # NOTE(review): this block reads both args.frame_len and
    # args.frame_length -- confirm the argument parser defines both.
    griffin_lim_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=True,
        epochs=args.epochs,
    )

    feature_reader = ScriptReader(args.feat_scp)

    if args.fbank:
        # N x F
        mel_weights = audio_lib.filters.mel(args.samp_freq,
                                            nfft(args.frame_length),
                                            n_mels=args.num_bins,
                                            fmin=args.min_freq,
                                            fmax=args.max_freq,
                                            htk=True)
        # F x N
        mel_inv_weights = np.linalg.pinv(mel_weights)

    wave_writer = WaveWriter(args.dump_dir,
                             fs=args.samp_freq,
                             normalize=args.normalize)
    with wave_writer as writer:
        for key, spec in feature_reader:
            # undo the log so the features are linear again
            if args.apply_log:
                spec = np.exp(spec)
            # map fbank (T x N) back to a spectrum, floored at EPSILON
            if args.fbank:
                linear = spec @ np.transpose(mel_inv_weights)
                spec = np.maximum(linear, EPSILON)
            # power spectrum -> magnitude spectrum
            if args.apply_pow:
                spec = np.sqrt(spec)
            expected_bins = nfft(args.frame_length) // 2
            if spec.shape[1] - 1 != expected_bins:
                raise RuntimeError("Seems missing --fbank options?")
            # griffin lim
            writer.write(key, griffin_lim(spec, **griffin_lim_kwargs))
    logger.info("Processed {:d} utterance done".format(len(feature_reader)))
def run(args):
    """Run a mask-based beamformer and write ``<dst_dir>/<key>.wav``.

    For each utterance of ``args.wav_scp`` that has an entry in the mask
    reader, the selected beamformer is run on the spectrogram and the
    enhanced STFT is passed to ``istft`` together with the output path.
    Utterances without a mask are silently skipped.

    Args:
        args: parsed command-line namespace. Reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``mask_scp``, ``numpy``, ``beamformer`` ("mvdr", "gevd" or
            "pmwf"), ``trans``, ``postf``, ``mask``, ``dst_dir`` and
            ``samp_freq``.

    Raises:
        KeyError: if ``args.beamformer`` is not a supported name.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader_cls = NumpyReader if args.numpy else ScriptReader
    mask_reader = mask_reader_cls(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # All three beamformers are built up front, then one is selected
    supported_beamformer = dict(
        mvdr=MvdrBeamformer(num_bins),
        gevd=GevdBeamformer(num_bins),
        pmwf=PmwfBeamformer(num_bins),
    )
    beamformer = supported_beamformer[args.beamformer]

    num_utts = 0
    for key, stft_mat in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        norm = spectrogram_reader.samp_norm(key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            key, norm))
        # prefer T x F
        speech_mask = mask_reader[key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask,
                                  stft_mat,
                                  normalize=args.postf)
        # masking beamformer output if necessary
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        wav_path = os.path.join(args.dst_dir, "{}.wav".format(key))
        istft(wav_path,
              stft_enh,
              norm=norm,
              fs=args.samp_freq,
              **stft_kwargs)
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_utts, len(spectrogram_reader)))
Exemple #10
0
def run(args):
    """Compute directional features from TF-masks and write them to an archive.

    For each utterance of ``args.wav_scp`` that has an entry in the mask
    reader, the masks are clipped to at most 1, a covariance matrix and a
    steer vector are obtained from an ``MvdrBeamformer``, and the output of
    ``directional_feats`` is written through an ``ArchiveWriter``.
    Utterances without a mask are reported with a warning and skipped.

    Args:
        args: parsed command-line namespace. Reads ``frame_length``,
            ``frame_shift``, ``round_power_of_two``, ``window``, ``center``,
            ``wav_scp``, ``mask_scp``, ``numpy``, ``dup_ark``, ``scp`` and
            ``trans``.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias; use Logger.warning
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            # upper-bound the mask values at 1
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(
                speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            # progress report every 1000 utterances
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
 def Reader(scp, t):
     """Return a ``NumpyReader`` on ``scp`` when ``t`` is "numpy", else a ``ScriptReader``."""
     if t == "numpy":
         return NumpyReader(scp)
     return ScriptReader(scp)
def run(args):
    """Run an offline or online mask-based beamformer and write the audio.

    ``args.chunk_size <= 0`` selects the offline beamformer; otherwise the
    online variant is used through ``do_online_beamform``. For each
    utterance of ``args.wav_scp`` that has an entry in the mask reader, the
    enhanced STFT is passed to ``istft`` and the samples are written through
    a ``WaveWriter`` opened on ``args.dst_dir``. Utterances without a mask
    are silently skipped.

    Args:
        args: parsed command-line namespace. Reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``round_power_of_two``,
            ``wav_scp``, ``mask_scp``, ``numpy``, ``channels``, ``alpha``,
            ``chunk_size``, ``beamformer``, ``dst_dir``, ``samp_freq``,
            ``trans``, ``ban`` and ``mask``.

    Raises:
        RuntimeError: if ``0 < args.chunk_size < 32``.
        KeyError: if ``args.beamformer`` is not available for the selected
            (offline/online) mode.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    mask_reader_cls = NumpyReader if args.numpy else ScriptReader
    mask_reader = mask_reader_cls(args.mask_scp)

    online = False
    num_bins = nfft(args.frame_length) // 2 + 1

    # Both tables are built up front, regardless of the selected mode
    supported_beamformer = dict(
        mvdr=MvdrBeamformer(num_bins),
        gevd=GevdBeamformer(num_bins),
        pmwf=PmwfBeamformer(num_bins),
    )
    supported_online_beamformer = dict(
        mvdr=OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        gevd=OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    )
    if args.chunk_size > 0:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))
    else:
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = supported_beamformer[args.beamformer]

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in mask_reader:
                continue
            num_done += 1
            power = spectrogram_reader.power(key)
            power_db = 10 * np.log10(power + 1e-5)
            logger.info(
                "Processing utterance {}, signal power {:.2f}...".format(
                    key, power_db))
            # prefer T x F
            speech_mask = mask_reader[key]
            # upper-bound the mask values at 1
            speech_mask = np.minimum(speech_mask, 1)
            if args.trans:
                speech_mask = np.transpose(speech_mask)
            # stft_enh, stft_mat: (N) x F x T
            if online:
                stft_enh = do_online_beamform(beamformer, speech_mask,
                                              stft_mat, args)
            else:
                stft_enh = beamformer.run(speech_mask,
                                          stft_mat,
                                          normalize=args.ban)
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            writer.write(key, istft(stft_enh, power=power, **stft_kwargs))
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_done, len(spectrogram_reader)))