def run(args):
    """Write a speaker-label mask for every mixture utterance.

    Each bin of the mixture magnitude spectrogram is labelled with the index
    of the speaker spectrogram holding the largest value at that bin. Bins
    lying more than ``args.beta`` dB below the utterance maximum are marked
    as silence with -1.

    Reads from ``args``: ``mix``, ``spks`` (comma-separated scripts), ``dir``,
    ``beta`` and the STFT options.

    Raises:
        RuntimeError: if fewer than two speaker scripts are given.
    """
    # shape: T x F
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "apply_abs": True,
    }
    spk_scps = args.spks.split(",")
    if len(spk_scps) < 2:
        raise RuntimeError("Please give at least 2 speakers")
    mix_reader = SpectrogramReader(args.mix, **stft_kwargs)
    spk_reader = [SpectrogramReader(scp, **stft_kwargs) for scp in spk_scps]

    with NumpyWriter(args.dir) as writer:
        for key, mix in mix_reader:
            num_frames, num_bins = mix.shape
            masks = np.zeros_like(mix, dtype=np.float32)
            # silence bins are labelled -1
            mix_db = 20 * np.log10(np.maximum(mix, EPSILON))
            silence = mix_db < (np.max(mix_db) - args.beta)
            masks[silence] = -1
            logger.info("For {}, silence covered {:.2f}%".format(
                key, np.sum(silence) * 100 / (num_frames * num_bins)))
            # active bins take the index of the dominant speaker
            active = np.logical_not(silence)
            spk_specs = [reader[key] for reader in spk_reader]
            labels = np.argmax(np.stack(spk_specs), axis=0)
            masks[active] = labels[active]
            writer.write(key, masks)
    logger.info("Processed {:d} utterances done".format(len(mix_reader)))
def run(args):
    """Multiply each spectrogram by its TF-mask and write the resulting audio.

    Utterances without an entry in the mask reader are skipped. The mask is
    transposed first when ``args.transpose`` is set.

    Reads from ``args``: ``wav_scp``, ``mask_scp``, ``numpy``, ``transpose``,
    ``keep_length``, ``dst_dir``, ``samp_freq`` and the STFT options.

    Raises:
        ValueError: if mask and spectrogram shapes differ.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in spectrogram_reader:
            if key not in mask_reader:
                continue
            num_done += 1
            mask = mask_reader[key]
            if args.transpose:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            # the original sample count is only requested when asked for
            nsamps = None
            if args.keep_length:
                nsamps = spectrogram_reader.nsamps(key)
            norm = spectrogram_reader.samp_norm(key)
            samps = istft(specs * mask,
                          **stft_kwargs,
                          norm=norm,
                          nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Multiply each spectrogram by its TF-mask and resynthesize to wav files.

    For every utterance present in both readers, ``istft`` is called with the
    destination path ``<args.dst_dir>/<key>.wav`` and the masked spectrogram.
    Utterances without a mask are skipped.

    Reads from ``args``: ``wav_scp``, ``mask_scp``, ``numpy``, ``transpose``,
    ``keep_length``, ``dst_dir``, ``samp_freq`` and the STFT options.

    Raises:
        ValueError: if mask and spectrogram shapes differ.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_utts = 0
    # NOTE: no trailing comma here -- `args.samp_freq,` would build a 1-tuple
    # and pass a tuple as the sample rate.
    fs = args.samp_freq
    for key, specs in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        mask = mask_reader[key]
        if args.transpose:
            mask = np.transpose(mask)
        logger.info("Processing utterance {}...".format(key))
        if mask.shape != specs.shape:
            raise ValueError(
                "Dimention mismatch between mask and spectrogram"
                "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                .format(mask.shape, specs.shape))
        nsamps = spectrogram_reader.nsamps(
            key) if args.keep_length else None
        istft(os.path.join(args.dst_dir, "{}.wav".format(key)),
              specs * mask,
              **stft_kwargs,
              fs=fs,
              nsamps=nsamps)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """Compute a TF-mask for each utterance found in both speech and noise.

    The mask returned by ``compute_mask`` is clipped from above at
    ``args.cutoff`` (when positive), clipped from below at zero, and written
    to the archive. Utterances missing from the noise reader are skipped.

    Reads from ``args``: ``speech_scp``, ``noise_scp``, ``mask``, ``cutoff``,
    ``mask_ark``, ``scp`` and the STFT options.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
    }
    speech_reader = SpectrogramReader(args.speech_scp, **stft_kwargs)
    bnoise_reader = SpectrogramReader(args.noise_scp, **stft_kwargs)

    num_utts = 0
    cutoff = args.cutoff
    with ArchiveWriter(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in bnoise_reader:
                continue
            num_utts += 1
            noise = bnoise_reader[key]
            mask = compute_mask(speech, noise, args.mask)
            if cutoff > 0:
                # count before clipping so the log reports what was changed
                num_clipped = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_clipped:
                    logger.info("Clip {:d} items for utterance {}".format(
                        num_clipped, key))
            mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """Run a delay-and-sum beamformer towards one DoA on every utterance.

    Negative ``args.doa`` values are shifted by +180 before validation; the
    result must lie in [0, 180].

    Reads from ``args``: ``wav_scp``, ``linear_topo`` (comma-separated
    floats), ``doa``, ``speed``, ``fs``, ``dst_dir``, ``round_power_of_two``
    and the STFT options.

    Raises:
        RuntimeError: if the DoA is outside [0, 180] after the shift.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    topo = list(map(float, args.linear_topo.split(",")))
    # Only negative angles are shifted. The previous `> 0` test also shifted
    # a DoA of exactly 0, silently turning it into 180.
    doa = args.doa if args.doa >= 0 else 180 + args.doa
    if doa < 0 or doa > 180:
        raise RuntimeError("Illegal value for DoA: {:.2f}".format(args.doa))

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    beamformer = DSBeamformer(topo)
    logger.info("Initialize {:d} channel DSBeamformer".format(len(topo)))

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in spectrogram_reader:
            stft_enh = beamformer.run(doa,
                                      stft_src,
                                      c=args.speed,
                                      sample_rate=args.fs)
            power = spectrogram_reader.power(key)
            samps = istft(stft_enh, **stft_kwargs, power=power)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Apply fixed beamforming weights to every utterance.

    The weights loaded from ``args.weights`` are either 2-D (one beamformer
    for all utterances) or higher-dimensional (one beamformer per leading
    index, selected per utterance through the ``args.beam`` script).

    Reads from ``args``: ``wav_scp``, ``weights``, ``beam``, ``dst_dir``,
    ``round_power_of_two`` and the STFT options.

    Raises:
        RuntimeError: if there are multiple beams but ``args.beam`` is unset.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    # F x N or B x F x N
    weights = np.load(args.weights)
    if weights.ndim == 2:
        beamformer = FixedBeamformer(weights)
        beam_index = None
    else:
        beamformer = [FixedBeamformer(w) for w in weights]
        if not args.beam:
            raise RuntimeError(
                "--beam must be assigned, as there are multiple beams")
        # `int` converts each value; the previous `lambda x: int` returned
        # the type object itself.
        beam_index = ScpReader(args.beam, value_processor=int)

    with WaveWriter(args.dst_dir) as writer:
        for key, stft_mat in spectrogram_reader:
            logger.info(f"Processing utterance {key}...")
            # Branch on the beam table rather than on `beamformer`: a single
            # beamformer object is truthy too, while `beam_index` is None.
            if beam_index is not None:
                beam = beam_index[key]
                stft_enh = beamformer[beam].run(stft_mat)
            else:
                stft_enh = beamformer.run(stft_mat)
            norm = spectrogram_reader.maxabs(key)
            samps = inverse_stft(stft_enh, **stft_kwargs, norm=norm)
            writer.write(key, samps)
    logger.info(f"Processed {len(spectrogram_reader):d} utterances")
def run(args):
    """Separate each utterance with ``auxiva`` and write one wav per source.

    Output files are named ``<key>.SRC<n>.wav`` (1-based) under
    ``args.dst_dir``.

    Reads from ``args``: ``wav_scp``, ``epochs``, ``dst_dir``, ``fs`` and the
    STFT options.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": True  # F x T instead of T x F
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    for key, spectrogram in spectrogram_reader:
        logger.info("Processing utterance {}...".format(key))
        separated = auxiva(spectrogram, args.epochs)
        # the norm depends only on the utterance, so look it up once rather
        # than once per separated source
        norm = spectrogram_reader.samp_norm(key)
        for idx in range(separated.shape[0]):
            samps = istft(separated[idx], **stft_kwargs, norm=norm)
            write_wav(os.path.join(args.dst_dir,
                                   "{}.SRC{:d}.wav".format(key, idx + 1)),
                      samps,
                      fs=args.fs)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Apply TF-masks to spectrograms, optionally with a reference phase.

    Without ``args.phase_ref`` the complex spectrogram is multiplied by the
    mask. With it, the magnitude of the spectrogram is masked and combined
    with the phase of the reference spectrogram. Utterances without a mask
    are skipped.

    Reads from ``args``: ``wav_scp``, ``phase_ref``, ``fmt``, ``mask_scp``,
    ``keep_length``, ``dst_dir``, ``samp_freq``, ``round_power_of_two`` and
    the STFT options.

    Raises:
        ValueError: if mask and spectrogram shapes differ.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
    }
    reader_kwargs = dict(stft_kwargs,
                         round_power_of_two=args.round_power_of_two)
    spectrogram_reader = SpectrogramReader(args.wav_scp, **reader_kwargs)
    phase_reader = None
    if args.phase_ref:
        phase_reader = SpectrogramReader(args.phase_ref, **reader_kwargs)
        logger.info("Using phase reference from {}".format(args.phase_ref))
    mask_reader_cls = {"numpy": NumpyReader, "kaldi": ScriptReader}[args.fmt]
    mask_reader = mask_reader_cls(args.mask_scp)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in spectrogram_reader:
            # specs: T x F
            if key not in mask_reader:
                continue
            num_done += 1
            mask = mask_reader[key]
            # make sure the mask is T x F
            _, num_bins = specs.shape
            if mask.shape[0] == num_bins:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            nsamps = None
            if args.keep_length:
                nsamps = spectrogram_reader.nsamps(key)
            norm = spectrogram_reader.samp_norm(key)
            if phase_reader is None:
                masked = specs * mask
            else:
                # use phase from ref
                angle = np.angle(phase_reader[key])
                masked = np.abs(specs) * mask * np.exp(angle * 1j)
            samps = istft(masked, **stft_kwargs, norm=norm, nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Run a linear or circular SD beamformer on every utterance.

    The DoA is either one value for all utterances (``args.doa``) or looked up
    per utterance in the ``args.utt2doa`` script. In the per-utterance case,
    utterances with a missing or invalid DoA are skipped.

    Reads from ``args``: ``geometry``, ``linear_topo``, ``circular_radius``,
    ``circular_around``, ``circular_center``, ``utt2doa``, ``doa``,
    ``wav_scp``, ``speed``, ``sr``, ``dst_dir``, ``round_power_of_two`` and
    the STFT options.

    Raises:
        RuntimeError: if the global ``args.doa`` fails ``check_doa``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    if args.geometry == "linear":
        topo = str2tuple(args.linear_topo)
        beamformer = LinearSDBeamformer(topo)
        logger.info(f"Initialize LinearSDBeamformer for array: {topo}")
    else:
        beamformer = CircularSDBeamformer(args.circular_radius,
                                          args.circular_around,
                                          center=args.circular_center)
        logger.info(
            "Initialize CircularSDBeamformer for " +
            f"radius = {args.circular_radius}, center = {args.circular_center}"
        )

    utt2doa = None
    doa = None
    if args.utt2doa:
        utt2doa = ScpReader(args.utt2doa, value_processor=float)
        logger.info(f"Use --utt2doa={args.utt2doa} for each utterance")
    else:
        doa = args.doa
        # Fail fast: an invalid global DoA would otherwise be applied to
        # every utterance after being merely logged.
        if not check_doa(args.geometry, doa):
            raise RuntimeError(
                f"Invalid doa {doa:.2f} for {args.geometry} array")
        logger.info(f"Use --doa={doa:.2f} for all utterances")

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_src in spectrogram_reader:
            if utt2doa:
                if key not in utt2doa:
                    continue
                doa = utt2doa[key]
                if not check_doa(args.geometry, doa):
                    logger.info(f"Invalid DoA {doa:.2f} for utterance {key}")
                    continue
            stft_enh = beamformer.run(doa, stft_src, c=args.speed, sr=args.sr)
            done += 1
            norm = spectrogram_reader.maxabs(key)
            samps = inverse_stft(stft_enh, **stft_kwargs, norm=norm)
            writer.write(key, samps)
    logger.info(f"Processed {done} utterances over {len(spectrogram_reader)}")
def run(args):
    """Run a linear delay-and-sum beamformer on every utterance.

    The DoA is either one value for all utterances (``args.doa``) or looked up
    per utterance in the ``args.utt2doa`` script. Negative angles are shifted
    by +180; the result must lie in [0, 180]. In the per-utterance case,
    utterances with a missing or invalid DoA are skipped.

    Reads from ``args``: ``utt2doa``, ``doa``, ``wav_scp``, ``linear_topo``,
    ``speed``, ``fs``, ``dst_dir``, ``round_power_of_two`` and the STFT
    options.

    Raises:
        RuntimeError: if the global ``args.doa`` is out of range.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }

    utt2doa = None
    doa = None
    if not args.utt2doa:
        doa = args.doa
        if doa < 0:
            doa = 180 + doa
        if not 0 <= doa <= 180:
            raise RuntimeError(f"Invalid doa {doa:.2f} for --doa")
        logger.info(f"Use DoA {doa:.2f} for all utterances")
    else:
        utt2doa = ScpReader(args.utt2doa, value_processor=float)
        logger.info(f"Use utt2doa {args.utt2doa} for each utterance")

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    done = 0
    topo = str2tuple(args.linear_topo)
    beamformer = LinearDSBeamformer(topo)
    logger.info(f"Initialize channel LinearDSBeamformer for array: {topo}")

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in spectrogram_reader:
            if utt2doa:
                if key not in utt2doa:
                    continue
                doa = utt2doa[key]
                if doa < 0:
                    doa = 180 + doa
                if not 0 <= doa <= 180:
                    logger.info(f"Invalid doa {doa:.2f} for utterance {key}")
                    continue
            stft_enh = beamformer.run(doa, stft_src, c=args.speed, sr=args.fs)
            done += 1
            samps = inverse_stft(stft_enh,
                                 **stft_kwargs,
                                 norm=spectrogram_reader.maxabs(key))
            writer.write(key, samps)
    logger.info(f"Processed {done} utterances over {len(spectrogram_reader)}")
def run(args):
    """Apply fixed beamforming weights from a .mat file to every utterance.

    The weights are taken from entry ``args.weight_key`` of the dictionary
    returned by ``loadmat(args.weights)``. Output is not normalized.

    Reads from ``args``: ``wav_scp``, ``weights``, ``weight_key``,
    ``dump_dir``, ``round_power_of_two`` and the STFT options.

    Raises:
        KeyError: if ``args.weight_key`` is not in the loaded dictionary.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    weights_dict = loadmat(args.weights)
    weight_key = args.weight_key
    if weight_key not in weights_dict:
        raise KeyError("Weight key error: no \'{}\' in {}".format(
            weight_key, args.weights))
    beamformer = FixedBeamformer(weights_dict[weight_key])

    with WaveWriter(args.dump_dir) as writer:
        for key, stft_mat in spectrogram_reader:
            logger.info("Processing utterance {}...".format(key))
            enhanced = beamformer.run(stft_mat)
            # do not normalize
            writer.write(key, istft(enhanced, **stft_kwargs))
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Dereverberate every utterance with ``wpe`` and write one wav per channel.

    Output files are named ``<key>.CH<n>.wav`` (1-based) under
    ``args.dst_dir``, which is created if missing.

    Reads from ``args``: ``wav_scp``, ``taps``, ``delay``, ``iters``,
    ``context``, ``dst_dir``, ``samp_freq`` and the STFT options.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "taps": args.taps,
        "delay": args.delay,
        "iters": args.iters,
        "psd_context": args.context
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(args.dst_dir, exist_ok=True)

    for key, reverbed in spectrogram_reader:
        # A 2-D spectrogram cannot go through the 3-axis transpose below, so
        # give it a leading axis of size 1.
        if reverbed.ndim == 2:
            reverbed = reverbed[None, ...]
        # N x T x F => F x N x T
        reverbed = np.transpose(reverbed, [2, 0, 1])
        # F x N x T
        dereverb = wpe(reverbed, **wpe_kwargs)
        # F x N x T => N x T x F
        dereverb = np.transpose(dereverb, [1, 2, 0])
        # write for each channel
        for chid, spectra in enumerate(dereverb):
            samps = istft(spectra, **stft_kwargs)
            write_wav(os.path.join(args.dst_dir,
                                   "{}.CH{:d}.wav".format(key, chid + 1)),
                      samps,
                      fs=args.samp_freq)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Save a figure of the log-magnitude spectrogram of every utterance.

    Figures are written under ``args.cache_dir`` (created if missing), using
    the utterance key with dots replaced by dashes as the file name. For 3-D
    input, ``args.index`` selects one entry along the first axis when it is
    non-negative.

    Reads from ``args``: ``cache_dir``, ``wav_scp``, ``index``, ``cmap``,
    ``sr``, ``title`` and the STFT options.
    """
    cache_dir = Path(args.cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    reader_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "apply_abs": True,
        "apply_log": True,
        "transpose": True,
    }
    reader = SpectrogramReader(args.wav_scp, **reader_kwargs)

    for key, mat in reader:
        pick_one = mat.ndim == 3 and args.index >= 0
        if pick_one:
            mat = mat[args.index]
        dest = cache_dir / key.replace(".", "-")
        save_figure(key,
                    mat,
                    dest,
                    cmap=args.cmap,
                    hop=args.frame_hop,
                    sr=args.sr,
                    title=args.title)
def run(args):
    """Train a CGMM per utterance and save the speech masks as ``.npy`` files.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. A ``RuntimeError`` during training is logged and that utterance
    is left out.

    Reads from ``args``: ``wav_scp``, ``dst_dir``, ``num_epochs`` and the
    STFT options.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(args.dst_dir, exist_ok=True)

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    num_done = 0
    for key, stft in spectrogram_reader:
        if os.path.exists(os.path.join(args.dst_dir, "{}.npy".format(key))):
            logger.info("Training utterance {} ... Skip".format(key))
            continue
        # stft: N x F x T
        trainer = CgmmTrainer(stft)
        try:
            speech_masks = trainer.train(args.num_epochs)
            num_done += 1
            # np.save appends the ".npy" suffix to the path
            np.save(os.path.join(args.dst_dir, key),
                    speech_masks.astype(np.float32))
            logger.info("Training utterance {} ... Done".format(key))
        except RuntimeError:
            # `warn` is a deprecated alias of `warning`
            logger.warning("Training utterance {} ... Failed".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Train a CGMM per utterance, optionally initialized from a given mask.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. When ``args.init_mask`` is set and holds the utterance, that mask
    is passed to the trainer as ``Ms``. A ``RuntimeError`` during training is
    logged and that utterance is left out.

    Reads from ``args``: ``wav_scp``, ``init_mask``, ``dst_dir``,
    ``num_epochs`` and the STFT options.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    init_mask_reader = ScriptReader(args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        for key, stft in spectrogram_reader:
            if os.path.exists(
                    os.path.join(args.dst_dir, "{}.npy".format(key))):
                logger.info("Training utterance {} ... Skip".format(key))
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, Ms=init_mask)
            try:
                speech_masks = trainer.train(args.num_epochs)
                num_done += 1
                writer.write(key, speech_masks.astype(np.float32))
                logger.info("Training utterance {} ... Done".format(key))
            except RuntimeError:
                # `warn` is a deprecated alias of `warning`
                logger.warning("Training utterance {} ... Failed".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def run(args):
    """Compute a TF-mask for each utterance found in both readers.

    For 3-D spectrograms only the first entry along the leading axis is used.
    The mask is clipped from above at ``args.cutoff`` (when positive) and
    negative values are clipped to zero; both clippings are logged.
    Utterances missing from the denominator reader are logged and skipped.

    Reads from ``args``: ``speech_scp``, ``denorm_scp``, ``mask``, ``cutoff``,
    ``format`` ("kaldi" or "exraw"), ``mask_ark``, ``scp`` and the STFT
    options.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
    }
    speech_reader = SpectrogramReader(args.speech_scp, **stft_kwargs)
    denorm_reader = SpectrogramReader(args.denorm_scp, **stft_kwargs)

    num_utts = 0
    cutoff = args.cutoff
    WriterImpl = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}[args.format]

    with WriterImpl(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in denorm_reader:
                # `warn` is a deprecated alias of `warning`
                logger.warning(
                    "Missing bg-noise for utterance {}".format(key))
                continue
            num_utts += 1
            denorm = denorm_reader[key]
            mask = compute_mask(speech[0] if speech.ndim == 3 else speech,
                                denorm[0] if denorm.ndim == 3 else denorm,
                                args.mask)
            # iam, psm, psa
            if cutoff > 0:
                num_items = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_items:
                    percent = float(num_items) / mask.size
                    logger.info(
                        "Clip {:d}({:.2f}) items over {:.2f} for utterance {}"
                        .format(num_items, percent, cutoff, key))
            num_items = np.sum(mask < 0)
            # psm, psa
            if num_items:
                percent = float(num_items) / mask.size
                average = np.sum(mask[mask < 0]) / num_items
                logger.info(
                    "Clip {:d}({:.2f}, {:.2f}) items below zero for utterance {}"
                    .format(num_items, percent, average, key))
                mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """Compute a TF-mask for each utterance found in both clean and noisy sets.

    For 3-D spectrograms only the first entry along the leading axis is used.
    The mask is clipped from above at ``args.cutoff`` (when positive) and
    negative values are clipped to zero; both clippings are logged.
    Utterances missing from the noisy reader are logged and skipped.

    Reads from ``args``: ``clean_scp``, ``noisy_scp``, ``mask``, ``cutoff``,
    ``format`` ("kaldi" or "exraw"), ``mask_ark``, ``scp`` and the STFT
    options.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
    }
    clean_reader = SpectrogramReader(args.clean_scp, **stft_kwargs)
    noisy_reader = SpectrogramReader(args.noisy_scp, **stft_kwargs)

    num_utts = 0
    cutoff = args.cutoff
    WriterImpl = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}[args.format]

    with WriterImpl(args.mask_ark, args.scp) as writer:
        for key, clean in clean_reader:
            if key not in noisy_reader:
                # `warn` is a deprecated alias of `warning`
                logger.warning(f"Missing bg-noise for utterance {key}")
                continue
            num_utts += 1
            noisy = noisy_reader[key]
            mask = compute_mask(clean[0] if clean.ndim == 3 else clean,
                                noisy[0] if noisy.ndim == 3 else noisy,
                                args.mask)
            # iam, psm, psa
            if cutoff > 0:
                num_items = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_items:
                    percent = float(num_items) / mask.size
                    logger.info(
                        f"Clip {num_items:d}({percent:.2f}) items over " +
                        f"{cutoff:.2f} for utterance {key}")
            num_items = np.sum(mask < 0)
            # psm, psa
            if num_items:
                percent = float(num_items) / mask.size
                average = np.sum(mask[mask < 0]) / num_items
                logger.info(
                    f"Clip {num_items}({percent:.2f}, {average:.2f}) " +
                    f"items below zero for utterance {key}")
                mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info(f"Processed {num_utts} utterances")
def run(args):
    """Run a mask-based beamformer (mvdr, gevd or pmwf) on every utterance.

    Utterances without a speech mask are skipped. ``istft`` is called with
    the destination path ``<args.dst_dir>/<key>.wav`` and the enhanced
    spectrogram. When ``args.mask`` is set the beamformer output is also
    multiplied by the transposed speech mask.

    Reads from ``args``: ``wav_scp``, ``mask_scp``, ``numpy``, ``beamformer``,
    ``trans``, ``postf``, ``mask``, ``dst_dir``, ``samp_freq`` and the STFT
    options.

    Raises:
        KeyError: if ``args.beamformer`` is not a supported name.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # Map names to classes so that only the selected beamformer is built,
    # instead of constructing all three and discarding two.
    supported_beamformer = {
        "mvdr": MvdrBeamformer,
        "gevd": GevdBeamformer,
        "pmwf": PmwfBeamformer
    }
    beamformer = supported_beamformer[args.beamformer](num_bins)

    num_utts = 0
    for key, stft_mat in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        norm = spectrogram_reader.samp_norm(key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            key, norm))
        # prefer T x F
        speech_mask = mask_reader[key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask,
                                  stft_mat,
                                  normalize=args.postf)
        # masking beamformer output if necessary
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        istft(os.path.join(args.dst_dir, '{}.wav'.format(key)),
              stft_enh,
              norm=norm,
              fs=args.samp_freq,
              **stft_kwargs)
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_utts, len(spectrogram_reader)))
def run(args):
    """Enhance every multi-channel utterance with ``facted_wpd``.

    Utterances that raise ``np.linalg.LinAlgError`` are logged and skipped.
    With ``args.dump_mask`` the last-axis index 0 slice of the returned mask
    is also saved with ``np.save`` under ``args.dst_dir``.

    Reads from ``args``: ``wav_scp``, ``wpd_iters``, ``cgmm_iters``,
    ``update_alpha``, ``context``, ``taps``, ``delay``, ``dump_mask``,
    ``dst_dir``, ``sr``, ``round_power_of_two`` and the STFT options.

    Raises:
        RuntimeError: if a spectrogram is not 3-D.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    num_done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, obs in spectrogram_reader:
            logger.info(f"Processing utt {key}...")
            if obs.ndim != 3:
                raise RuntimeError(f"Expected 3D array, but got {obs.ndim}")
            try:
                # N x T x F => T x F
                tf_mask, wpd_enh = facted_wpd(obs,
                                              wpd_iters=args.wpd_iters,
                                              cgmm_iters=args.cgmm_iters,
                                              update_alpha=args.update_alpha,
                                              context=args.context,
                                              taps=args.taps,
                                              delay=args.delay)
            except np.linalg.LinAlgError:
                # `warn` is a deprecated alias of `warning`
                logger.warning(f"{key}: Failed cause LinAlgError in wpd")
                continue
            norm = spectrogram_reader.maxabs(key)
            # dump multi-channel
            samps = inverse_stft(wpd_enh, norm=norm, **stft_kwargs)
            writer.write(key, samps)
            if args.dump_mask:
                np.save(f"{args.dst_dir}/{key}", tf_mask[..., 0])
            # show progress cause slow speed
            num_done += 1
            if not num_done % 100:
                logger.info(f"Processed {num_done:d} utterances...")
    logger.info(
        f"Processed {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Compute directional features for every utterance.

    The steering vector index is either the same for all utterances
    (comma-separated ``args.doa_idx``; several indices yield features merged
    along the last axis) or looked up per utterance in ``args.utt2idx``.
    Utterances missing from ``args.utt2idx`` are logged and skipped.

    Reads from ``args``: ``wav_scp``, ``utt2idx``, ``doa_idx``, ``df_pair``
    (";"-separated "i,j" pairs), ``steer_vector``, ``dup_ark``, ``scp`` and
    the STFT options.

    Raises:
        RuntimeError: if no channel pair could be parsed.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    stft_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    if args.utt2idx:
        utt2idx = ScpReader(args.utt2idx, value_processor=int)
        doa_idx = None
        logger.info(f"Using --utt2idx={args.utt2idx}")
    else:
        utt2idx = None
        # parse once here rather than for every utterance
        doa_idx = [int(v) for v in args.doa_idx.split(",")]
        logger.info(f"Using --doa-idx={args.doa_idx}")

    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # Report the value that was actually parsed. The old message read
        # `args.pair`, which this function never uses elsewhere.
        raise RuntimeError(f"Bad configurations with --pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    # A x M x F
    steer_vector = np.load(args.steer_vector)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, stft in stft_reader:
            # sv: M x F
            if utt2idx is None:
                dfs = [
                    directional_feats(stft, steer_vector[i], df_pair=df_pair)
                    for i in doa_idx
                ]
                if len(dfs) == 1:
                    df = dfs[0]
                else:
                    # N x T x F
                    dfs = np.stack(dfs)
                    df = dfs.transpose(1, 0, 2).reshape(dfs.shape[1], -1)
            elif key in utt2idx:
                # stft: M x F x T
                df = directional_feats(stft,
                                       steer_vector[utt2idx[key]],
                                       df_pair=df_pair)
            else:
                # `warn` is a deprecated alias of `warning`
                logger.warning(f"Missing utt2idx for utterance {key}")
                continue
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(stft_reader):d}")
def run(args):
    """Train a CGMM per utterance and write the resulting TF-masks.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. An external mask, when available, is transposed (last two axes
    swapped) and passed to the trainer as ``gamma``. With ``args.solve_permu``
    the masks go through ``permu_aligner``; with two classes only the first
    mask is written. A ``RuntimeError`` during training is logged and that
    utterance is left out.

    Reads from ``args``: ``seed``, ``wav_scp``, ``fmt``, ``init_mask``,
    ``dst_dir``, ``num_classes``, ``update_alpha``, ``num_iters``,
    ``solve_permu`` and the STFT options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    np.random.seed(args.seed)
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                # T x F => F x T
                if init_mask.ndim == 2:
                    init_mask = np.transpose(init_mask)
                else:
                    init_mask = np.transpose(init_mask, (0, 2, 1))
                logger.info("Using external TF-mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft,
                                  args.num_classes,
                                  gamma=init_mask,
                                  update_alpha=args.update_alpha)
            try:
                masks = trainer.train(args.num_iters)
                # K x F x T => K x T x F
                masks = np.transpose(masks, (0, 2, 1))
                num_done += 1
                if args.solve_permu:
                    masks = permu_aligner(masks)
                    logger.info(
                        "Permutation alignment done on each frequency")
                if args.num_classes == 2:
                    masks = masks[0]
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except RuntimeError:
                # `warn` is a deprecated alias of `warning`
                logger.warning(f"Training utterance {key} ... Failed")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Dereverberate every utterance with WPE and write multi-channel audio.

    Uses ``nara_wpe.wpe.wpe_v8`` when ``args.nara_wpe`` is set, otherwise the
    local ``wpe``. 2-D input gets a leading axis of size 1 first. Utterances
    that raise ``np.linalg.LinAlgError`` are logged and skipped.

    Reads from ``args``: ``wav_scp``, ``num_iters``, ``context``, ``taps``,
    ``delay``, ``nara_wpe``, ``dst_dir``, ``sr``, ``round_power_of_two`` and
    the STFT options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "num_iters": args.num_iters,
        "context": args.context,
        "taps": args.taps,
        "delay": args.delay
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    # Optional dependency: import it once, and only when it was asked for,
    # instead of re-running the import statement for every utterance.
    if args.nara_wpe:
        from nara_wpe.wpe import wpe_v8

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.sr) as writer:
        for key, reverbed in spectrogram_reader:
            logger.info(f"Processing utt {key}...")
            if reverbed.ndim == 2:
                reverbed = reverbed[None, ...]
            # N x T x F => F x N x T
            reverbed = np.transpose(reverbed, (2, 0, 1))
            try:
                if args.nara_wpe:
                    dereverb = wpe_v8(reverbed,
                                      taps=args.taps,
                                      delay=args.delay,
                                      iterations=args.num_iters,
                                      psd_context=args.context)
                else:
                    dereverb = wpe(reverbed, **wpe_kwargs)
            except np.linalg.LinAlgError:
                # `warn` is a deprecated alias of `warning`
                logger.warning(f"{key}: Failed cause LinAlgError in wpe")
                continue
            # F x N x T => N x T x F
            dereverb = np.transpose(dereverb, (1, 2, 0))
            # dump multi-channel
            samps = np.stack(
                [inverse_stft(spectra, **stft_kwargs) for spectra in dereverb])
            writer.write(key, samps)
            # show progress cause slow speed
            num_done += 1
            if not num_done % 100:
                logger.info(f"Processed {num_done:d} utterances...")
    logger.info(
        f"Processed {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Train a CACGMM per utterance and write the resulting masks.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. Masks go through the pb_bss permutation aligner unless
    ``args.cgmm_init`` is set with exactly two classes. An
    ``np.linalg.LinAlgError`` during training is logged and that utterance is
    left out.

    Reads from ``args``: ``wav_scp``, ``fmt``, ``init_mask``, ``dst_dir``,
    ``num_classes``, ``cgmm_init``, ``num_epoches`` and the STFT options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = MaskReader[args.fmt](
        args.init_mask) if args.init_mask else None

    n_fft = nfft(args.frame_len) if args.round_power_of_two else args.frame_len
    # now use pb_bss
    pb_perm_solver = load_module(pb_bss_align_url)
    aligner = pb_perm_solver.DHTVPermutationAlignment.from_stft_size(n_fft)

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            # K x F x T
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external mask to initialize cacgmm")
            # stft: N x F x T
            trainer = CacgmmTrainer(stft,
                                    args.num_classes,
                                    gamma=init_mask,
                                    cgmm_init=args.cgmm_init)
            try:
                # EM progress
                masks = trainer.train(args.num_epoches)
                # align if needed
                if not args.cgmm_init or args.num_classes != 2:
                    masks = aligner(masks)
                    logger.info("Permutation align done for each frequency")
                num_done += 1
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except np.linalg.LinAlgError:
                # `warn` is a deprecated alias of `warning`
                logger.warning(f"Training utterance {key} ... Failed")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Separate each utterance with ``auxiva`` and write one wav per source.

    Output files are named ``<key>.src<n>.wav`` (1-based) under
    ``args.dst_dir``.

    Reads from ``args``: ``wav_scp``, ``epochs``, ``dst_dir``, ``fs``,
    ``round_power_of_two`` and the STFT options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": True  # F x T instead of T x F
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    for key, spectrogram in spectrogram_reader:
        logger.info(f"Processing utterance {key}...")
        separated = auxiva(spectrogram, args.epochs)
        norm = spectrogram_reader.maxabs(key)
        num_src = separated.shape[0]
        for src in range(1, num_src + 1):
            samps = inverse_stft(separated[src - 1], **stft_kwargs, norm=norm)
            fname = Path(args.dst_dir) / f"{key}.src{src}.wav"
            write_wav(fname, samps, fs=args.fs)
    logger.info(f"Processed {len(spectrogram_reader)} utterances")
def run(args):
    """Separate mixtures with masks computed from reference signals.

    For every mixture whose key is present in all reference readers,
    ``compute_mask`` returns one mask per speaker. Each masked mixture is
    resynthesized and written to ``<args.dump_dir>/spk<n>/<key>.wav``
    (1-based). Utterances with a missing reference are logged and skipped.

    Reads from ``args``: ``mask``, ``mix_scp``, ``ref_scp`` (comma-separated
    scripts), ``keep_length``, ``dump_dir``, ``fs``, ``round_power_of_two``
    and the STFT options.
    """
    # return complex result
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center
    }
    logger.info("Using mask: {}".format(args.mask.upper()))

    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    ref_scp_list = args.ref_scp.split(",")
    logger.info("Number of speakers: {:d}".format(len(ref_scp_list)))
    # References must use the same FFT-size setting as the mixture. They
    # previously fell back to the reader default for round_power_of_two.
    targets_reader = [
        SpectrogramReader(scp,
                          round_power_of_two=args.round_power_of_two,
                          **stft_kwargs) for scp in ref_scp_list
    ]

    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        if any(key not in reader for reader in targets_reader):
            logger.info("Skip utterance {}, missing targets".format(key))
            continue
        nsamps = mixture_reader.nsamps(key) if args.keep_length else None
        num_utts += 1
        targets_list = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, targets_list, args.mask)
        for index, mask in enumerate(spk_masks):
            samps = istft(mixture * mask, **stft_kwargs, nsamps=nsamps)
            write_wav(os.path.join(args.dump_dir,
                                   "spk{:d}/{}.wav".format(index + 1, key)),
                      samps,
                      fs=args.fs)
    logger.info("Processed {} utterance!".format(num_utts))
def run(args):
    """Separate mixtures with masks computed from reference signals.

    For every mixture whose key is present in all reference readers,
    ``compute_mask`` returns one mask per speaker. Each masked mixture is
    resynthesized and written to ``<args.dump_dir>/spk<n>/<key>.wav``
    (1-based). Utterances with a missing reference are logged and skipped.

    Reads from ``args``: ``mask``, ``mix_scp``, ``ref_scp`` (comma-separated
    scripts), ``keep_length``, ``dump_dir``, ``sr``, ``round_power_of_two``
    and the STFT options.
    """
    # return complex result
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center
    }
    logger.info(f"Using mask: {args.mask.upper()}")

    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    ref_scp_list = args.ref_scp.split(",")
    logger.info(f"Number of speakers: {len(ref_scp_list)}")
    # References must use the same FFT-size setting as the mixture. They
    # previously fell back to the reader default for round_power_of_two.
    targets_reader = [
        SpectrogramReader(scp,
                          round_power_of_two=args.round_power_of_two,
                          **stft_kwargs) for scp in ref_scp_list
    ]

    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        if any(key not in reader for reader in targets_reader):
            logger.info(f"Skip utterance {key}, missing targets")
            continue
        nsamps = mixture_reader.nsamps(key) if args.keep_length else None
        num_utts += 1
        targets_list = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, targets_list, args.mask)
        for index, mask in enumerate(spk_masks):
            samps = inverse_stft(mixture * mask, **stft_kwargs, nsamps=nsamps)
            write_wav(os.path.join(args.dump_dir,
                                   f"spk{index + 1}/{key}.wav"),
                      samps,
                      sr=args.sr)
    logger.info(f"Processed {num_utts} utterance")
def run(args):
    """Turn spectral features back into waveforms.

    Features are exponentiated when ``args.apply_log`` is set and
    square-rooted when ``args.apply_pow`` is set. The waveform then comes
    from ``griffin_lim`` or, when ``args.phase_ref`` is given, from the
    inverse STFT of the features combined with the reference phase (first
    entry of the leading axis for 3-D references).

    Reads from ``args``: ``fmt``, ``feat_scp``, ``phase_ref``, ``dump_dir``,
    ``sr``, ``normalize``, ``apply_log``, ``apply_pow``, ``epoches``,
    ``round_power_of_two`` and the STFT options.

    Raises:
        KeyError: if a feature key is missing from the phase reference.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
    }
    reader_cls = {"numpy": NumpyReader, "kaldi": ScriptReader}[args.fmt]
    feature_reader = reader_cls(args.feat_scp)

    phase_reader = None
    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info(f"Using phase reference from {args.phase_ref}")

    with WaveWriter(args.dump_dir, fs=args.sr,
                    normalize=args.normalize) as writer:
        for key, spec in feature_reader:
            logger.info(f"Processing utterance {key}...")
            # undo the log first, then the power
            if args.apply_log:
                spec = np.exp(spec)
            if args.apply_pow:
                spec = np.sqrt(spec)

            if phase_reader is not None:
                if key not in phase_reader:
                    raise KeyError(f"Missing key {key} in phase reader")
                ref = phase_reader[key]
                if ref.ndim == 3:
                    ref = ref[0]
                phase = np.exp(np.angle(ref) * 1j)
                samps = inverse_stft(spec * phase, **stft_kwargs, norm=0.8)
            else:
                # griffin lim
                samps = griffin_lim(spec,
                                    epoches=args.epoches,
                                    transpose=True,
                                    norm=0.8,
                                    **stft_kwargs)
            writer.write(key, samps)
    logger.info(f"Processed {len(feature_reader)} utterance done")
def run(args):
    """Compute an averaged ``gcc_phat_diag`` response for every utterance.

    One response is computed per channel pair in ``args.diag_pair`` and the
    results are averaged before being written to the archive.

    Reads from ``args``: ``diag_pair`` (";"-separated "i,j" pairs),
    ``wav_scp``, ``n``, ``d``, ``sr``, ``num_doa``, ``srp_ark``, ``scp`` and
    the STFT options.

    Raises:
        RuntimeError: if no channel pair could be parsed, or if the averaged
            result contains NaN.
    """
    srp_pair = [
        tuple(map(int, p.split(","))) for p in args.diag_pair.split(";")
    ]
    if not srp_pair:
        # Report the value that was actually parsed. The old message read
        # `args.pair`, which this function never uses elsewhere.
        raise RuntimeError("Bad configurations with --pair {}".format(
            args.diag_pair))
    logger.info("Compute gcc with {}".format(srp_pair))

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    num_done = 0
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            # stft_mat: N x T x F
            srp = [
                gcc_phat_diag(stft_mat[i],
                              stft_mat[j],
                              min(i, j) * np.pi * 2 / args.n,
                              args.d,
                              num_bins=num_ffts // 2 + 1,
                              sr=args.sr,
                              num_doa=args.num_doa) for (i, j) in srp_pair
            ]
            srp = sum(srp) / len(srp_pair)
            nan = np.sum(np.isnan(srp))
            if nan:
                # The old template "({:d}} items)" had a stray "}", so
                # .format() raised ValueError instead of this RuntimeError.
                raise RuntimeError("Matrix {} has nan ({:d} items)".format(
                    key, nan))
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processd {:d} utterances done".format(len(reader)))
def run(args):
    """Extract spectrogram features for every utterance into an archive.

    For 3-D features only the first entry along the leading axis is written.

    Reads from ``args``: ``wav_scp``, ``dup_ark``, ``scp``, ``apply_log``,
    ``apply_pow``, ``normalize`` and the STFT options.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "apply_log": args.apply_log,
        "apply_pow": args.apply_pow,
        "normalize": args.normalize,
        "apply_abs": True,
        "transpose": True  # T x F
    }
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, feats in reader:
            # default using ch1 in multi-channel case
            if feats.ndim == 3:
                feats = feats[0]
            writer.write(key, feats)
    logger.info("Process {:d} utterances".format(len(reader)))
def run(args):
    """Dereverberate every utterance with ``wpe`` and write multi-channel audio.

    2-D input gets a leading axis of size 1 first. Utterances that raise
    ``np.linalg.LinAlgError`` are logged and skipped.

    Reads from ``args``: ``wav_scp``, ``num_iters``, ``context``, ``taps``,
    ``delay``, ``dst_dir``, ``samp_fs``, ``round_power_of_two`` and the STFT
    options.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "num_iters": args.num_iters,
        "context": args.context,
        "taps": args.taps,
        "delay": args.delay
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_fs) as writer:
        for key, reverbed in spectrogram_reader:
            logger.info("Processing utt {}...".format(key))
            # A 2-D spectrogram cannot go through the 3-axis transpose
            # below, so give it a leading axis of size 1.
            if reverbed.ndim == 2:
                reverbed = reverbed[None, ...]
            # N x T x F => F x N x T
            reverbed = np.transpose(reverbed, (2, 0, 1))
            try:
                # F x N x T
                dereverb = wpe(reverbed, **wpe_kwargs)
            except np.linalg.LinAlgError:
                # `warn` is a deprecated alias of `warning`
                logger.warning(
                    "{}: Failed cause LinAlgError in wpe".format(key))
                continue
            # F x N x T => N x T x F
            dereverb = np.transpose(dereverb, (1, 2, 0))
            # dump multi-channel
            samps = np.stack(
                [istft(spectra, **stft_kwargs) for spectra in dereverb])
            writer.write(key, samps)
            # show progress cause slow speed
            num_done += 1
            if not num_done % 100:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))