def run(args):
    """Train a CgmmTrainer per utterance and dump the resulting speech masks.

    Iterates over ``SpectrogramReader(args.wav_scp)``; for every key whose
    ``<args.dst_dir>/<key>.npy`` does not exist yet, a ``CgmmTrainer`` is run
    for ``args.num_epochs`` and the result is written (as float32) through
    ``NumpyWriter(args.dst_dir)``.

    Args:
        args: parsed command line options. Attributes read here:
            ``frame_length``, ``frame_shift``, ``window``, ``center``,
            ``wav_scp``, ``init_mask``, ``dst_dir``, ``num_epochs``.

    A ``RuntimeError`` raised while training/writing one utterance is logged
    as a failure and does not stop the loop.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # optional external masks used to initialize the trainer
    init_mask_reader = ScriptReader(args.init_mask) if args.init_mask else None

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        for key, stft in spectrogram_reader:
            # do not redo utterances whose mask is already on disk
            mask_path = os.path.join(args.dst_dir, "{}.npy".format(key))
            if os.path.exists(mask_path):
                logger.info("Training utterance {} ... Skip".format(key))
                continue

            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, Ms=init_mask)
            try:
                speech_masks = trainer.train(args.num_epochs)
                writer.write(key, speech_masks.astype(np.float32))
                # count only after the mask has really been written, so a
                # failing write is not reported as both done and failed
                num_done += 1
                logger.info("Training utterance {} ... Done".format(key))
            except RuntimeError:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "Training utterance {} ... Failed".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))
def __init__(self, data_dir):
    """Set up ``self.reader`` and ``self.spk2utt`` from files in *data_dir*.

    Args:
        data_dir: directory expected to contain ``feats.scp`` and ``spk2utt``.

    Raises:
        RuntimeError: if either of the two files does not exist.
    """
    feats_scp = os.path.join(data_dir, "feats.scp")
    spk2utt = os.path.join(data_dir, "spk2utt")
    # both files are mandatory; report the first one that is missing
    for required in (feats_scp, spk2utt):
        if os.path.exists(required):
            continue
        raise RuntimeError("Missing {}!".format(required))
    self.reader = ScriptReader(feats_scp)
    self.spk2utt = parse_scps(spk2utt, num_tokens=-1)
def run(args):
    """Apply per-utterance masks to spectrograms and write the waveforms.

    For every key of ``SpectrogramReader(args.wav_scp)`` that is also present
    in the mask reader, the mask (optionally transposed) is multiplied with
    the spectrogram, passed to ``istft`` and the samples are written through
    ``WaveWriter(args.dst_dir, fs=args.samp_freq)``. Keys without a mask are
    skipped silently.

    Args:
        args: parsed command line options. Attributes read here:
            ``frame_length``, ``frame_shift``, ``window``, ``center``,
            ``wav_scp``, ``mask_scp``, ``numpy``, ``dst_dir``, ``samp_freq``,
            ``transpose``, ``keep_length``.

    Raises:
        ValueError: if a mask and its spectrogram differ in shape.
    """
    # shape: T x F, complex
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for utt_key, spectrogram in spectrogram_reader:
            if key_missing(mask_reader, utt_key):
                continue
            num_done += 1
            tf_mask = mask_reader[utt_key]
            if args.transpose:
                tf_mask = np.transpose(tf_mask)
            logger.info("Processing utterance {}...".format(utt_key))
            if tf_mask.shape != spectrogram.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(tf_mask.shape, spectrogram.shape))
            # only ask for the original length when it must be kept
            nsamps = None
            if args.keep_length:
                nsamps = spectrogram_reader.nsamps(utt_key)
            norm = spectrogram_reader.samp_norm(utt_key)
            samps = istft(
                spectrogram * tf_mask, **stft_kwargs, norm=norm, nsamps=nsamps)
            writer.write(utt_key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))


def key_missing(reader, key):
    """Return True when *key* is not contained in *reader*."""
    return not (key in reader)
def run(args):
    """Apply per-utterance masks to spectrograms and let ``istft`` write wavs.

    For every key of ``SpectrogramReader(args.wav_scp)`` that is also present
    in the mask reader, the mask (optionally transposed) is multiplied with
    the spectrogram and handed to ``istft`` together with the destination
    path ``<args.dst_dir>/<key>.wav``. Keys without a mask are skipped.

    Args:
        args: parsed command line options. Attributes read here:
            ``frame_length``, ``frame_shift``, ``window``, ``center``,
            ``wav_scp``, ``mask_scp``, ``numpy``, ``samp_freq``,
            ``transpose``, ``keep_length``, ``dst_dir``.

    Raises:
        ValueError: if a mask and its spectrogram differ in shape.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_utts = 0
    # NOTE: the original read `fs = args.samp_freq,` - the trailing comma
    # turned fs into a 1-tuple instead of the sample rate itself.
    fs = args.samp_freq
    for key, specs in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        mask = mask_reader[key]
        if args.transpose:
            mask = np.transpose(mask)
        logger.info("Processing utterance {}...".format(key))
        if mask.shape != specs.shape:
            raise ValueError(
                "Dimention mismatch between mask and spectrogram"
                "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                .format(mask.shape, specs.shape))
        nsamps = spectrogram_reader.nsamps(key) if args.keep_length else None
        istft(os.path.join(args.dst_dir, "{}.wav".format(key)),
              specs * mask,
              **stft_kwargs,
              fs=fs,
              nsamps=nsamps)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """Run ``NnetComputer.compute`` on every feature entry and dump the result.

    Args:
        args: parsed command line options. Attributes read here:
            ``feats_scp``, ``checkpoint``, ``gpu``, ``dump_dir``.
    """
    feats_reader = ScriptReader(args.feats_scp)
    computer = NnetComputer(args.checkpoint, args.gpu)
    with NumpyWriter(args.dump_dir) as writer:
        for utt_key, utt_feats in feats_reader:
            logger.info("Compute dvector on utterance {}...".format(utt_key))
            writer.write(utt_key, computer.compute(utt_feats))
    total = len(feats_reader)
    logger.info("Compute over {:d} utterances".format(total))
def run(args):
    """Copy every entry of the source reader into ``args.dst_dir``.

    The source is read with ``ScriptReader`` when ``args.src == "scp"`` and
    with ``ArchiveReader`` otherwise; the writer class is chosen by
    ``args.dst`` (``"npy"`` or ``"mat"``). Entries are transposed first when
    ``args.trans`` is set.

    Args:
        args: parsed command line options. Attributes read here:
            ``src_dec``, ``src``, ``dst``, ``dst_dir``, ``scp``, ``trans``.
    """
    reader_cls = ScriptReader if args.src == "scp" else ArchiveReader
    src_reader = reader_cls(args.src_dec)

    num_done = 0
    writers = {"npy": NumpyWriter, "mat": MatWriter}
    writer_cls = writers[args.dst]
    with writer_cls(args.dst_dir, args.scp) as writer:
        for key, entry in src_reader:
            writer.write(key, np.transpose(entry) if args.trans else entry)
            num_done += 1
    logger.info(f"Copy {num_done} into directory {args.dst_dir}")
def run(args):
    """Copy every matrix/vector of the source reader into ``args.dst_dir``.

    ``args.input == "matrix"`` is forwarded to the reader as ``matrix=`` and
    also selects the noun used in the final log line. The source is read with
    ``ScriptReader`` when ``args.src == "scp"`` and with ``ArchiveReader``
    otherwise; the writer class is chosen by ``args.dst``.

    Args:
        args: parsed command line options. Attributes read here:
            ``input``, ``src_dec``, ``src``, ``dst``, ``dst_dir``, ``scp``,
            ``trans``.
    """
    is_matrix = args.input == "matrix"
    if args.src == "scp":
        src_reader = ScriptReader(args.src_dec, matrix=is_matrix)
    else:
        src_reader = ArchiveReader(args.src_dec, matrix=is_matrix)

    num_done = 0
    writers = {"npy": NumpyWriter, "mat": MatWriter}
    writer_cls = writers[args.dst]
    with writer_cls(args.dst_dir, args.scp) as writer:
        for key, entry in src_reader:
            writer.write(key, np.transpose(entry) if args.trans else entry)
            num_done += 1
    if is_matrix:
        noun = "matrices"
    else:
        noun = "vectors"
    logger.info("Copy {0} {1} into directory {2}".format(
        num_done, noun, args.dst_dir))
def run(args):
    """Reconstruct waveforms from (log/power/fbank) features with Griffin-Lim.

    Each entry of ``ScriptReader(args.feat_scp)`` is optionally exponentiated
    (``args.apply_log``), mapped back from mel bins with the pseudo-inverse
    of the mel filter bank (``args.fbank``), square-rooted
    (``args.apply_pow``), then passed to ``griffin_lim`` and written through
    ``WaveWriter``.

    Args:
        args: parsed command line options. Attributes read here:
            ``frame_len``, ``frame_hop``, ``window``, ``center``, ``epochs``,
            ``feat_scp``, ``fbank``, ``num_bins``, ``min_freq``, ``max_freq``,
            ``samp_freq``, ``dump_dir``, ``normalize``, ``apply_log``,
            ``apply_pow``.

    Raises:
        RuntimeError: if the number of frequency bins of a (converted)
            feature does not match the FFT size derived from the frame
            length.
    """
    griffin_lim_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": True,
        "epochs": args.epochs
    }
    # NOTE(review): the original mixed `args.frame_len` (above) with
    # `args.frame_length` (mel filters / bin check). The same frame length
    # must drive both griffin_lim and the bin check, so the attribute already
    # required above is used everywhere - confirm against the option parser.
    fft_size = nfft(args.frame_len)
    # bin index of the highest frequency, i.e. (number of bins - 1)
    expected_last_bin = fft_size // 2

    feature_reader = ScriptReader(args.feat_scp)

    mel_inv_weights_t = None
    if args.fbank:
        mel_kwargs = {
            "n_mels": args.num_bins,
            "fmin": args.min_freq,
            "fmax": args.max_freq,
            "htk": True
        }
        # N x F
        mel_weights = audio_lib.filters.mel(args.samp_freq, fft_size,
                                            **mel_kwargs)
        # pinv gives F x N; transposed once here instead of per utterance
        mel_inv_weights_t = np.transpose(np.linalg.pinv(mel_weights))

    with WaveWriter(args.dump_dir,
                    fs=args.samp_freq,
                    normalize=args.normalize) as writer:
        for key, spec in feature_reader:
            # if log, transform to linear
            if args.apply_log:
                spec = np.exp(spec)
            # convert fbank to spectrum
            # feat: T x N
            if args.fbank:
                spec = np.maximum(spec @ mel_inv_weights_t, EPSILON)
            # if power spectrum, transform to magnitude spectrum
            if args.apply_pow:
                spec = np.sqrt(spec)
            if spec.shape[1] - 1 != expected_last_bin:
                raise RuntimeError("Seems missing --fbank options?")
            # griffin lim
            samps = griffin_lim(spec, **griffin_lim_kwargs)
            writer.write(key, samps)
    logger.info("Processed {:d} utterance done".format(len(feature_reader)))
def run(args):
    """Beamform every utterance that has a mask and let ``istft`` write it.

    The beamformer is picked by ``args.beamformer`` among ``"mvdr"``,
    ``"gevd"`` and ``"pmwf"``. For each key present in both the spectrogram
    reader and the mask reader, the beamformer output (optionally masked
    again when ``args.mask`` is set) is passed to ``istft`` together with the
    path ``<args.dst_dir>/<key>.wav``.

    Args:
        args: parsed command line options. Attributes read here:
            ``frame_length``, ``frame_shift``, ``window``, ``center``,
            ``wav_scp``, ``mask_scp``, ``numpy``, ``beamformer``, ``trans``,
            ``postf``, ``mask``, ``dst_dir``, ``samp_freq``.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # all supported beamformers are built up front, then one is selected
    candidates = dict(
        mvdr=MvdrBeamformer(num_bins),
        gevd=GevdBeamformer(num_bins),
        pmwf=PmwfBeamformer(num_bins),
    )
    beamformer = candidates[args.beamformer]

    num_utts = 0
    for utt_key, stft_mat in spectrogram_reader:
        if not (utt_key in mask_reader):
            continue
        num_utts += 1
        norm = spectrogram_reader.samp_norm(utt_key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            utt_key, norm))
        # prefer T x F
        speech_mask = mask_reader[utt_key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask, stft_mat, normalize=args.postf)
        # masking beamformer output if necessary
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        wav_path = os.path.join(args.dst_dir, '{}.wav'.format(utt_key))
        istft(wav_path, stft_enh, norm=norm, fs=args.samp_freq, **stft_kwargs)
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_utts, len(spectrogram_reader)))
def run(args):
    """Compute directional features per utterance and write them to an archive.

    For each key present in both ``SpectrogramReader(args.wav_scp)`` and the
    mask reader, the mask (optionally transposed, clipped to at most 1) is
    used by ``MvdrBeamformer`` to get a covariance matrix and a steer vector;
    ``directional_feats`` of the spectrogram and the transposed steer vector
    is then written through ``ArchiveWriter(args.dup_ark, args.scp)``.
    Utterances without a mask are reported with a warning.

    Args:
        args: parsed command line options. Attributes read here:
            ``frame_length``, ``frame_shift``, ``round_power_of_two``,
            ``window``, ``center``, ``wav_scp``, ``mask_scp``, ``numpy``,
            ``dup_ark``, ``scp``, ``trans``.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            # only the upper bound is enforced here
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            # progress report every 1000 utterances
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def Reader(scp, t):
    """Build the reader for *scp*: ``NumpyReader`` if ``t == "numpy"``,
    otherwise ``ScriptReader``."""
    if t == "numpy":
        return NumpyReader(scp)
    return ScriptReader(scp)
def run(args):
    """Beamform every utterance that has a mask, offline or chunk-wise online.

    ``args.chunk_size <= 0`` selects an offline beamformer (``"mvdr"``,
    ``"gevd"`` or ``"pmwf"``); otherwise an online one (``"mvdr"`` or
    ``"gevd"``) is used through ``do_online_beamform``. For each key present
    in both the spectrogram reader and the mask reader, the enhanced STFT
    (optionally masked again when ``args.mask`` is set) is inverted with
    ``istft`` and written through ``WaveWriter``.

    Args:
        args: parsed command line options. Attributes read here:
            ``frame_length``, ``frame_shift``, ``window``, ``center``,
            ``wav_scp``, ``round_power_of_two``, ``mask_scp``, ``numpy``,
            ``channels``, ``alpha``, ``chunk_size``, ``beamformer``,
            ``dst_dir``, ``samp_freq``, ``trans``, ``ban``, ``mask``.

    Raises:
        RuntimeError: if ``0 < args.chunk_size < 32``.
    """
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # both tables are built up front, exactly one entry is used afterwards
    offline_table = dict(
        mvdr=MvdrBeamformer(num_bins),
        gevd=GevdBeamformer(num_bins),
        pmwf=PmwfBeamformer(num_bins),
    )
    online_table = dict(
        mvdr=OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        gevd=OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    )

    offline = args.chunk_size <= 0
    if offline:
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = offline_table[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = online_table[args.beamformer]
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for utt_key, stft_mat in spectrogram_reader:
            if not (utt_key in mask_reader):
                continue
            num_done += 1
            power = spectrogram_reader.power(utt_key)
            logger.info(
                "Processing utterance {}, signal power {:.2f}...".format(
                    utt_key, 10 * np.log10(power + 1e-5)))
            # prefer T x F
            speech_mask = mask_reader[utt_key]
            # constraint [0, 1]
            speech_mask = np.minimum(speech_mask, 1)
            if args.trans:
                speech_mask = np.transpose(speech_mask)
            # stft_enh, stft_mat: (N) x F x T
            if offline:
                stft_enh = beamformer.run(speech_mask,
                                          stft_mat,
                                          normalize=args.ban)
            else:
                stft_enh = do_online_beamform(beamformer, speech_mask,
                                              stft_mat, args)
            # masking beamformer output if necessary
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            writer.write(utt_key, istft(stft_enh, power=power, **stft_kwargs))
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_done, len(spectrogram_reader)))