コード例 #1
0
ファイル: compute_centroid.py プロジェクト: zhaoforever/setk
def run(args):
    """Compute centroid (mean) vectors and write one per key or speaker.

    The mode is selected by ``args.spk2utt``:

    * not given: every array read from ``args.npy_scp`` must be 2D; its mean
      over axis 0 is written under the entry's own key.
    * given: every referenced array must be 1D; the vectors of all utterances
      listed for a speaker are stacked and averaged, and the result is
      written under the speaker id.

    When ``args.normalize`` is set, each vector (each row in the 2D case) is
    divided by its L2 norm before averaging.

    Raises:
        RuntimeError: if an input array does not have the expected number
            of dimensions for the selected mode.
    """
    reader = NumpyReader(args.npy_scp)

    spk2utt = None
    if args.spk2utt:
        spk2utt = parse_scps(args.spk2utt, num_tokens=-1)

    def _utt_vector(uttid):
        # Fetch a single utterance vector, validate its rank and optionally
        # scale it to unit length.
        vec = reader[uttid]
        if vec.ndim != 1:
            raise RuntimeError(
                "--spk2utt is not None, expect input as vector, got {:d}"
                .format(vec.ndim))
        return vec / np.linalg.norm(vec) if args.normalize else vec

    with NumpyWriter(args.dump_dir, args.scp) as writer:
        if spk2utt is not None:
            # One centroid per speaker, averaged over its utterance vectors
            for spkid, uttlist in spk2utt.items():
                stacked = np.stack([_utt_vector(uttid) for uttid in uttlist])
                writer.write(spkid, np.mean(stacked, axis=0))
            num_speakers = len(spk2utt)
        else:
            # One centroid per entry, averaged over the rows of its matrix
            for key, mat in reader:
                if mat.ndim != 2:
                    raise RuntimeError(
                        "--spk2utt is None, so input ndarray must be 2D, got {:d}"
                        .format(mat.ndim))
                if args.normalize:
                    row_norms = np.linalg.norm(
                        mat, ord=2, axis=1, keepdims=True)
                    mat = mat / row_norms
                writer.write(key, np.mean(mat, axis=0))
            num_speakers = len(reader)
        logger.info("Processed {:d} speakers".format(num_speakers))
コード例 #2
0
def run(args):
    """Train a CGMM per utterance and dump the resulting speech masks.

    For every (key, stft) pair yielded by the spectrogram reader, a
    ``CgmmTrainer`` is run for ``args.num_epochs`` and the returned masks are
    written as float32 under ``key``. Utterances whose ``<key>.npy`` already
    exists in ``args.dst_dir`` are skipped, so an interrupted run can be
    resumed. If ``args.init_mask`` is given and contains the key, that mask
    is passed to the trainer as initialization.

    A ``RuntimeError`` raised while training or writing an utterance is
    logged as a failure and processing continues with the next utterance.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    init_mask_reader = ScriptReader(args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        for key, stft in spectrogram_reader:
            # Resume support: do not retrain what is already on disk
            if os.path.exists(
                    os.path.join(args.dst_dir, "{}.npy".format(key))):
                logger.info("Training utterance {} ... Skip".format(key))
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, Ms=init_mask)
            try:
                speech_masks = trainer.train(args.num_epochs)
                writer.write(key, speech_masks.astype(np.float32))
            except RuntimeError:
                # Logger.warn is a deprecated alias; use warning()
                logger.warning(
                    "Training utterance {} ... Failed".format(key))
            else:
                # Count only utterances that were trained AND written
                num_done += 1
                logger.info("Training utterance {} ... Done".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
コード例 #3
0
def run(args):
    """Build per-bin speaker label masks for mixtures and dump them.

    For each mixture spectrogram, bins whose level (in dB) is more than
    ``args.beta`` below the utterance maximum are marked as silence (-1).
    Every remaining bin gets the index of the speaker (in the order given
    by the comma-separated ``args.spks``) whose spectrogram value is
    largest at that bin. Masks are written as float32 arrays under the
    mixture key.

    Raises:
        RuntimeError: if fewer than two speaker scripts are given.
    """
    # shape: T x F
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,
        apply_abs=True,
    )
    spk_scps = args.spks.split(",")
    if len(spk_scps) < 2:
        raise RuntimeError("Please give at least 2 speakers")
    mix_reader = SpectrogramReader(args.mix, **stft_kwargs)
    spk_readers = [
        SpectrogramReader(scp, **stft_kwargs) for scp in spk_scps
    ]

    with NumpyWriter(args.dir) as writer:
        for key, mix in mix_reader:
            num_frames, num_bins = mix.shape
            masks = np.zeros_like(mix, dtype=np.float32)

            # Silence bins are labelled -1
            mix_db = 20 * np.log10(np.maximum(mix, EPSILON))
            threshold = np.max(mix_db) - args.beta
            silent = mix_db < threshold
            masks[silent] = -1
            silence_percent = np.sum(silent) * 100 / (num_frames * num_bins)
            logger.info("For {}, silence covered {:.2f}%".format(
                key, silence_percent))

            # Active bins are labelled with the index of the dominant speaker
            active = ~silent
            spk_stack = np.stack([reader[key] for reader in spk_readers])
            dominant = np.argmax(spk_stack, axis=0)
            masks[active] = dominant[active]
            writer.write(key, masks)
    logger.info("Processed {:d} utterances done".format(len(mix_reader)))
コード例 #4
0
def run(args):
    """Compute a d-vector for every utterance and dump it.

    Features are read from ``args.feats_scp``, passed through an
    ``NnetComputer`` built from ``args.checkpoint`` / ``args.gpu``, and each
    result is written to ``args.dump_dir`` under the utterance key.
    """
    reader = ScriptReader(args.feats_scp)
    nnet = NnetComputer(args.checkpoint, args.gpu)
    with NumpyWriter(args.dump_dir) as writer:
        for utt_key, utt_feats in reader:
            logger.info(
                "Compute dvector on utterance {}...".format(utt_key))
            writer.write(utt_key, nnet.compute(utt_feats))
    num_utts = len(reader)
    logger.info("Compute over {:d} utterances".format(num_utts))
コード例 #5
0
def run(args):
    """Train a CGMM per utterance and dump the estimated TF-masks.

    For every (key, stft) pair yielded by the spectrogram reader, a
    ``CgmmTrainer`` with ``args.num_classes`` classes is run for
    ``args.num_iters`` iterations. The masks are transposed from
    K x F x T to K x T x F, optionally permutation-aligned
    (``args.solve_permu``), reduced to the first class when
    ``args.num_classes == 2``, and written as float32 under ``key``.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. If ``args.init_mask`` is given (read in the format selected by
    ``args.fmt``) and contains the key, that mask is transposed to put
    frequency before time and used as initialization.

    A ``RuntimeError`` raised while processing an utterance is logged as a
    failure and processing continues with the next utterance.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    np.random.seed(args.seed)
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # Resume support: do not retrain what is already on disk
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                # T x F => F x T
                if init_mask.ndim == 2:
                    init_mask = np.transpose(init_mask)
                else:
                    # assumes a 3D (K x T x F) mask here - TODO confirm
                    init_mask = np.transpose(init_mask, (0, 2, 1))
                logger.info("Using external TF-mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft,
                                  args.num_classes,
                                  gamma=init_mask,
                                  update_alpha=args.update_alpha)
            try:
                masks = trainer.train(args.num_iters)
                # K x F x T => K x T x F
                masks = np.transpose(masks, (0, 2, 1))
                if args.solve_permu:
                    masks = permu_aligner(masks)
                    logger.info(
                        "Permutation alignment done on each frequency")
                if args.num_classes == 2:
                    # Two-class case: keep only the first class mask
                    masks = masks[0]
                writer.write(key, masks.astype(np.float32))
            except RuntimeError:
                # Logger.warn is a deprecated alias; use warning()
                logger.warning(f"Training utterance {key} ... Failed")
            else:
                # Count only utterances that were trained AND written
                num_done += 1
                logger.info(f"Training utterance {key} ... Done")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
コード例 #6
0
def run(args):
    """Train a CACGMM per utterance and dump the estimated masks.

    For every (key, stft) pair yielded by the spectrogram reader, a
    ``CacgmmTrainer`` with ``args.num_classes`` classes is run for
    ``args.num_epoches`` epochs. Unless the model was CGMM-initialized with
    exactly two classes, the masks are passed through the pb_bss
    permutation aligner. Results are written as float32 under ``key``.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. If ``args.init_mask`` is given (read in the format selected by
    ``args.fmt``) and contains the key, that mask is used as initialization.

    A ``numpy.linalg.LinAlgError`` raised while processing an utterance is
    logged as a failure and processing continues with the next utterance.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    n_fft = nfft(args.frame_len) if args.round_power_of_two else args.frame_len
    # now use pb_bss
    pb_perm_solver = load_module(pb_bss_align_url)
    aligner = pb_perm_solver.DHTVPermutationAlignment.from_stft_size(n_fft)

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # Resume support: do not retrain what is already on disk
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            # K x F x T
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external mask to initialize cacgmm")
            # stft: N x F x T
            trainer = CacgmmTrainer(stft,
                                    args.num_classes,
                                    gamma=init_mask,
                                    cgmm_init=args.cgmm_init)
            try:
                # EM progress
                masks = trainer.train(args.num_epoches)
                # align if needed
                if not args.cgmm_init or args.num_classes != 2:
                    masks = aligner(masks)
                    logger.info("Permutation align done for each frequency")
                writer.write(key, masks.astype(np.float32))
            except np.linalg.LinAlgError:
                # Logger.warn is a deprecated alias; use warning()
                logger.warning(f"Training utterance {key} ... Failed")
            else:
                # Count only utterances that were trained AND written
                num_done += 1
                logger.info(f"Training utterance {key} ... Done")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
コード例 #7
0
def run(args):
    """Train a CGMM per utterance and dump the transposed speech masks.

    For every (key, stft) pair yielded by the spectrogram reader, a
    ``CgmmTrainer`` is run for ``args.num_epoches`` epochs; the returned
    masks are transposed and written as float32 under ``key``.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. If ``args.init_mask`` is given (read in the format selected by
    ``args.fmt``) and contains the key, that mask (transposed first when
    ``args.trans_mask`` is set) is used as initialization.

    A ``RuntimeError`` raised while processing an utterance is logged as a
    failure and processing continues with the next utterance.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    np.random.seed(args.seed)
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # Resume support: do not retrain what is already on disk
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                if args.trans_mask:
                    init_mask = np.transpose(init_mask)
                logger.info("Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, gamma=init_mask)
            try:
                speech_masks = trainer.train(args.num_epoches)
                speech_masks = np.transpose(speech_masks)
                writer.write(key, speech_masks.astype(np.float32))
            except RuntimeError:
                # Logger.warn is a deprecated alias; use warning()
                logger.warning(f"Training utterance {key} ... Failed")
            else:
                # Count only utterances that were trained AND written
                num_done += 1
                logger.info(f"Training utterance {key} ... Done")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
コード例 #8
0
ファイル: apply_omlsa.py プロジェクト: yaozengwei/setk
def run(args):
    """Run the OMLSA suppressor over every utterance and dump the result.

    Each spectrogram read from ``args.wav_scp`` is passed to
    ``OMLSA.run`` to obtain a gain. Depending on ``args.output``:

    * ``"wave"``: the gain is applied to the STFT, the product is inverted
      with ``inverse_stft`` and written as audio at ``args.sr``;
    * anything else: the gain itself is written as a numpy array.

    ``args.conf`` optionally names a YAML file whose top-level mapping is
    forwarded to ``OMLSA`` as keyword arguments; a missing or empty config
    means default construction.

    Raises:
        ValueError: if ``args.sr`` is not 16000.
    """
    if args.sr != 16000:
        raise ValueError("Now only support audio in 16kHz")
    # shape: T x F, complex
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
    }
    # round_power_of_two is passed to the reader only; inverse_stft below is
    # called with the shared stft_kwargs, as in the original code.
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        **stft_kwargs,
        round_power_of_two=args.round_power_of_two)

    omlsa_conf = {}
    if args.conf:
        with open(args.conf, "r") as conf:
            # NOTE(review): yaml.full_load is not meant for untrusted input;
            # consider yaml.safe_load if the config only holds plain
            # scalars/lists/dicts - confirm before switching.
            # An empty YAML document loads as None; fall back to defaults
            # instead of failing on OMLSA(**None).
            omlsa_conf = yaml.full_load(conf) or {}
    suppressor = OMLSA(**omlsa_conf)

    if args.output == "wave":
        with WaveWriter(args.dst_dir, fs=args.sr) as writer:
            for key, stft in spectrogram_reader:
                logger.info(f"Processing utterance {key}...")
                gain = suppressor.run(stft)
                samps = inverse_stft(gain * stft, **stft_kwargs)
                writer.write(key, samps)
    else:
        with NumpyWriter(args.dst_dir) as writer:
            for key, stft in spectrogram_reader:
                logger.info(f"Processing utterance {key}...")
                gain = suppressor.run(stft)
                writer.write(key, gain)
    logger.info(f"Processed {len(spectrogram_reader):d} utterances done")