def run(args):
    """Average embeddings and dump one mean vector per key/speaker.

    The input layout is selected by ``args.spk2utt``:

    * not given: every entry read from ``args.npy_scp`` must be a 2D
      array; its rows are (optionally) scaled to unit L2 norm and then
      averaged over axis 0. The result is written under the entry's key.
    * given: the file is parsed with ``parse_scps`` and iterated as
      ``speaker -> utterance ids``. Each referenced entry must be a 1D
      vector; the (optionally normalised) vectors of one speaker are
      stacked and averaged over axis 0.

    Raises:
        RuntimeError: if an input array does not have the expected rank.
    """
    reader = NumpyReader(args.npy_scp)
    if args.spk2utt:
        spk2utt = parse_scps(args.spk2utt, num_tokens=-1)
    else:
        spk2utt = None

    with NumpyWriter(args.dump_dir, args.scp) as writer:
        if spk2utt is not None:
            # One output per speaker, pooled over the listed utterances.
            for spk, utts in spk2utt.items():
                vectors = []
                for utt in utts:
                    emb = reader[utt]
                    if not emb.ndim == 1:
                        raise RuntimeError(
                            "--spk2utt is not None, expect input as vector, got {:d}"
                            .format(emb.ndim))
                    if args.normalize:
                        emb = emb / np.linalg.norm(emb)
                    vectors.append(emb)
                stacked = np.stack(vectors)
                writer.write(spk, np.mean(stacked, axis=0))
            num_spks = len(spk2utt)
        else:
            # One output per input entry, pooled over the rows of its matrix.
            for key, embs in reader:
                if not embs.ndim == 2:
                    raise RuntimeError(
                        "--spk2utt is None, so input ndarray must be 2D, got {:d}"
                        .format(embs.ndim))
                if args.normalize:
                    row_norm = np.linalg.norm(embs,
                                              ord=2,
                                              axis=1,
                                              keepdims=True)
                    embs = embs / row_norm
                writer.write(key, np.mean(embs, axis=0))
            num_spks = len(reader)
        logger.info("Processed {:d} speakers".format(num_spks))
def run(args):
    """Train a CGMM per utterance and dump the estimated speech masks.

    For every ``(key, stft)`` pair yielded by the spectrogram reader built
    from ``args.wav_scp``, a ``CgmmTrainer`` is run for ``args.num_epochs``
    and the result is written as float32 under ``key``. Utterances whose
    ``<args.dst_dir>/<key>.npy`` already exists are skipped, so an
    interrupted run can be resumed. If ``args.init_mask`` is given and
    contains ``key``, that entry is passed to the trainer as the initial
    mask.

    A ``RuntimeError`` raised while training/writing one utterance is
    logged as a warning and processing continues with the next one.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    init_mask_reader = ScriptReader(
        args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        for key, stft in spectrogram_reader:
            # Keep results of a previous run: do not retrain existing outputs.
            if os.path.exists(
                    os.path.join(args.dst_dir, "{}.npy".format(key))):
                logger.info("Training utterance {} ... Skip".format(key))
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, Ms=init_mask)
            try:
                speech_masks = trainer.train(args.num_epochs)
                num_done += 1
                writer.write(key, speech_masks.astype(np.float32))
                logger.info("Training utterance {} ... Done".format(key))
            except RuntimeError:
                # Logger.warn is a deprecated alias (removed in Python 3.13);
                # assumes `logger` is a standard logging.Logger.
                logger.warning(
                    "Training utterance {} ... Failed".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Compute per-bin speaker-label masks for each mixture utterance.

    For every mixture spectrogram, bins whose level in dB lies more than
    ``args.beta`` below the utterance maximum are labelled ``-1``
    (silence). Every remaining bin is labelled with the index (in the
    order given by the comma-separated ``args.spks``) of the speaker
    reader holding the largest value at that bin. The float32 label map
    is written under the utterance key.

    Raises:
        RuntimeError: if fewer than two speaker scripts are given.
    """
    # shape: T x F
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,
        apply_abs=True,
    )
    spk_scps = args.spks.split(",")
    if len(spk_scps) < 2:
        raise RuntimeError("Please give at least 2 speakers")

    mix_reader = SpectrogramReader(args.mix, **stft_kwargs)
    spk_readers = [
        SpectrogramReader(scp, **stft_kwargs) for scp in spk_scps
    ]

    with NumpyWriter(args.dir) as writer:
        for key, mix in mix_reader:
            num_frames, num_bins = mix.shape
            masks = np.zeros_like(mix, dtype=np.float32)
            # sil: -1
            mix_db = 20 * np.log10(np.maximum(mix, EPSILON))
            silence = mix_db < (np.max(mix_db) - args.beta)
            masks[silence] = -1
            logger.info("For {}, silence covered {:.2f}%".format(
                key,
                np.sum(silence) * 100 / (num_frames * num_bins)))
            # for each speaker: pick the dominant one on non-silent bins
            active = np.logical_not(silence)
            spk_specs = np.stack([reader[key] for reader in spk_readers])
            dominant = np.argmax(spk_specs, axis=0)
            masks[active] = dominant[active]
            writer.write(key, masks)
    logger.info("Processed {:d} utterances done".format(len(mix_reader)))
def run(args):
    """Compute a d-vector for each feature entry and dump it.

    Features are read from ``args.feats_scp``, passed one utterance at a
    time through an ``NnetComputer`` built from ``args.checkpoint`` and
    ``args.gpu``, and the result is written to ``args.dump_dir`` under the
    utterance key.
    """
    reader = ScriptReader(args.feats_scp)
    nnet = NnetComputer(args.checkpoint, args.gpu)
    with NumpyWriter(args.dump_dir) as writer:
        for utt_key, utt_feats in reader:
            logger.info(
                "Compute dvector on utterance {}...".format(utt_key))
            writer.write(utt_key, nnet.compute(utt_feats))
    logger.info("Compute over {:d} utterances".format(len(reader)))
def run(args):
    """Train a CGMM per utterance and dump the estimated TF-masks.

    NumPy's global RNG is seeded with ``args.seed`` first. For every
    ``(key, stft)`` pair from the spectrogram reader, a ``CgmmTrainer``
    with ``args.num_classes`` classes is trained for ``args.num_iters``
    iterations. Utterances whose ``<args.dst_dir>/<key>.npy`` already
    exists are skipped, so an interrupted run can be resumed.

    If ``args.init_mask`` is given (read as ``args.fmt``: "numpy" or
    "kaldi") and contains ``key``, that mask is transposed on its last two
    axes and passed to the trainer as ``gamma``.

    The trained masks are transposed from K x F x T to K x T x F,
    optionally permutation-aligned (``args.solve_permu``) and written as
    float32; with exactly two classes only the first mask is kept.

    A ``RuntimeError`` raised while processing one utterance is logged as
    a warning and processing continues with the next one.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    np.random.seed(args.seed)
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # Keep results of a previous run: do not retrain existing outputs.
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                # T x F => F x T
                if init_mask.ndim == 2:
                    init_mask = np.transpose(init_mask)
                else:
                    init_mask = np.transpose(init_mask, (0, 2, 1))
                logger.info("Using external TF-mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft,
                                  args.num_classes,
                                  gamma=init_mask,
                                  update_alpha=args.update_alpha)
            try:
                masks = trainer.train(args.num_iters)
                # K x F x T => K x T x F
                masks = np.transpose(masks, (0, 2, 1))
                num_done += 1
                if args.solve_permu:
                    masks = permu_aligner(masks)
                    logger.info(
                        "Permutation alignment done on each frequency")
                if args.num_classes == 2:
                    # Two classes: only the first mask is stored.
                    masks = masks[0]
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except RuntimeError:
                # Logger.warn is a deprecated alias (removed in Python 3.13);
                # assumes `logger` is a standard logging.Logger.
                logger.warning(f"Training utterance {key} ... Failed")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Train a CACGMM per utterance and dump the estimated masks.

    For every ``(key, stft)`` pair from the spectrogram reader, a
    ``CacgmmTrainer`` with ``args.num_classes`` classes is trained for
    ``args.num_epoches``. Utterances whose ``<args.dst_dir>/<key>.npy``
    already exists are skipped, so an interrupted run can be resumed.

    If ``args.init_mask`` is given (read as ``args.fmt``: "numpy" or
    "kaldi") and contains ``key``, that mask is passed to the trainer as
    ``gamma``.

    Unless ``args.cgmm_init`` is set and ``args.num_classes == 2``, the
    masks are passed through the ``DHTVPermutationAlignment`` aligner
    loaded from ``pb_bss_align_url`` before being written as float32.

    A ``numpy.linalg.LinAlgError`` raised while processing one utterance
    is logged as a warning and processing continues with the next one.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    # The aligner is sized from the FFT length actually used by the STFT.
    n_fft = nfft(args.frame_len) if args.round_power_of_two else args.frame_len
    # now use pb_bss
    pb_perm_solver = load_module(pb_bss_align_url)
    aligner = pb_perm_solver.DHTVPermutationAlignment.from_stft_size(n_fft)

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # Keep results of a previous run: do not retrain existing outputs.
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            # K x F x T
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external mask to initialize cacgmm")
            # stft: N x F x T
            trainer = CacgmmTrainer(stft,
                                    args.num_classes,
                                    gamma=init_mask,
                                    cgmm_init=args.cgmm_init)
            try:
                # EM progress
                masks = trainer.train(args.num_epoches)
                # align if needed
                if not args.cgmm_init or args.num_classes != 2:
                    masks = aligner(masks)
                    logger.info("Permutation align done for each frequency")
                num_done += 1
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except np.linalg.LinAlgError:
                # Logger.warn is a deprecated alias (removed in Python 3.13);
                # assumes `logger` is a standard logging.Logger.
                logger.warning(f"Training utterance {key} ... Failed")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Train a CGMM per utterance and dump the transposed speech masks.

    NumPy's global RNG is seeded with ``args.seed`` first. For every
    ``(key, stft)`` pair from the spectrogram reader, a ``CgmmTrainer`` is
    trained for ``args.num_epoches``; the result is transposed and written
    as float32 under ``key``. Utterances whose
    ``<args.dst_dir>/<key>.npy`` already exists are skipped, so an
    interrupted run can be resumed.

    If ``args.init_mask`` is given (read as ``args.fmt``: "numpy" or
    "kaldi") and contains ``key``, that mask (transposed first when
    ``args.trans_mask`` is set) is passed to the trainer as ``gamma``.

    A ``RuntimeError`` raised while processing one utterance is logged as
    a warning and processing continues with the next one.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    np.random.seed(args.seed)
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # Keep results of a previous run: do not retrain existing outputs.
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                if args.trans_mask:
                    init_mask = np.transpose(init_mask)
                logger.info("Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, gamma=init_mask)
            try:
                speech_masks = trainer.train(args.num_epoches)
                num_done += 1
                # Stored with axes reversed relative to the trainer output.
                speech_masks = np.transpose(speech_masks)
                writer.write(key, speech_masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except RuntimeError:
                # Logger.warn is a deprecated alias (removed in Python 3.13);
                # assumes `logger` is a standard logging.Logger.
                logger.warning(f"Training utterance {key} ... Failed")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Run OMLSA on each utterance and dump enhanced audio or gains.

    An ``OMLSA`` suppressor is built from the YAML file ``args.conf`` when
    given, otherwise with its defaults. For every ``(key, stft)`` pair from
    the spectrogram reader the suppressor produces a gain:

    * ``args.output == "wave"``: ``gain * stft`` is inverted with
      ``inverse_stft`` and the samples are written via ``WaveWriter``
      (sample rate ``args.sr``);
    * any other value: the gain itself is written via ``NumpyWriter``.

    Raises:
        ValueError: if ``args.sr`` is not 16000.
    """
    if args.sr != 16000:
        raise ValueError("Now only support audio in 16kHz")
    # shape: T x F, complex
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )
    # round_power_of_two goes to the reader only; it is not forwarded to
    # inverse_stft below.
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    if not args.conf:
        suppressor = OMLSA()
    else:
        # NOTE(review): yaml.full_load is not the restricted safe loader;
        # consider yaml.safe_load if the config may come from an untrusted
        # source.
        with open(args.conf, "r") as conf:
            omlsa_conf = yaml.full_load(conf)
        suppressor = OMLSA(**omlsa_conf)

    dump_wave = args.output == "wave"
    if dump_wave:
        sink = WaveWriter(args.dst_dir, fs=args.sr)
    else:
        sink = NumpyWriter(args.dst_dir)
    with sink as writer:
        for key, stft in spectrogram_reader:
            logger.info(f"Processing utterance {key}...")
            gain = suppressor.run(stft)
            if dump_wave:
                samps = inverse_stft(gain * stft, **stft_kwargs)
                writer.write(key, samps)
            else:
                writer.write(key, gain)
    logger.info(f"Processed {len(spectrogram_reader):d} utterances done")