def test_KaldiReader(tmpdir, filetype):
    """Write two entries with ``file_writer_helper`` and read them back.

    The entries ``aaa`` and ``bbb`` are written through an ``ark,scp``
    wspecifier and then read with ``file_reader_helper`` both as data
    (``return_shape=False``) and as shapes (``return_shape=True``).

    Args:
        tmpdir: pytest temporary-directory fixture; every file produced by
            the test is created below it.
        filetype: File type passed to the writer/reader helpers.  When it
            contains ``'sound'`` the payload is written as ``(fs, int16 array)``.
    """
    ark = str(tmpdir.join('a.foo'))
    scp = str(tmpdir.join('a.scp'))
    # Keep the num-frames side file inside tmpdir so that the test does not
    # leave a stray ``out.txt`` in the current working directory.
    num_frames = str(tmpdir.join('out.txt'))
    fs = 16000
    is_sound = 'sound' in filetype

    with file_writer_helper(wspecifier=f'ark,scp:{ark},{scp}',
                            filetype=filetype,
                            write_num_frames=f'ark,t:{num_frames}',
                            compress=False,
                            compression_method=2,
                            pcm_format='wav') as writer:
        if is_sound:
            aaa = np.random.randint(-10, 10, 100, dtype=np.int16)
            bbb = np.random.randint(-10, 10, 50, dtype=np.int16)
            writer['aaa'] = fs, aaa
            writer['bbb'] = fs, bbb
        else:
            aaa = np.random.randn(10, 10)
            bbb = np.random.randn(13, 5)
            writer['aaa'] = aaa
            writer['bbb'] = bbb
        valid = {'aaa': aaa, 'bbb': bbb}

    def check(rspecifier, return_shape):
        """Read ``rspecifier`` and compare each entry with ``valid``."""
        for key, value in file_reader_helper(rspecifier,
                                             filetype=filetype,
                                             return_shape=return_shape):
            if is_sound:
                if not return_shape:
                    assert_scipy_wav_style(value)
                # Sound entries come back as a pair; index 1 is the
                # array (or its shape), index 0 is not checked here.
                value = value[1]
            expected = valid[key].shape if return_shape else valid[key]
            np.testing.assert_array_equal(value, expected)

    # The ark file is read directly for every filetype except plain 'sound'.
    read_ark = filetype != 'sound'

    # 1. Test ark read
    if read_ark:
        check(f'ark:{ark}', return_shape=False)

    # 2. Test scp read
    check(f'scp:{scp}', return_shape=False)

    # 3. Test ark shape read
    if read_ark:
        check(f'ark:{ark}', return_shape=True)

    # 4. Test scp shape read
    check(f'scp:{scp}', return_shape=True)
def main():
    """Normalize every input matrix with ``CMVN`` and write the result.

    Statistics are taken from ``args.stats_rspecifier_or_rxfilename``.  When
    that value contains ``":"`` it is read with ``file_reader_helper`` into a
    key -> stats mapping and each utterance id is passed to the ``CMVN``
    call; otherwise a single stats array is loaded from the file and stored
    under the key ``None``.
    """
    args = get_parser().parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=log_level, format=logfmt)
    logging.info(get_commandline_args())

    stats_source = args.stats_rspecifier_or_rxfilename
    is_rspecifier = ":" in stats_source

    if is_rspecifier:
        # "npy" is replaced by "hdf5" when the stats come from an rspecifier
        stats_filetype = (
            "hdf5" if args.stats_filetype == "npy" else args.stats_filetype
        )
        stats_dict = dict(file_reader_helper(stats_source, stats_filetype))
    elif args.stats_filetype == "mat":
        stats_dict = {None: kaldiio.load_mat(stats_source)}
    else:
        stats_dict = {None: numpy.load(stats_source)}

    cmvn = CMVN(
        stats=stats_dict,
        norm_means=args.norm_means,
        norm_vars=args.norm_vars,
        utt2spk=args.utt2spk,
        spk2utt=args.spk2utt,
        reverse=args.reverse,
    )

    writer_options = dict(
        filetype=args.out_filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    )
    with file_writer_helper(args.wspecifier, **writer_options) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # Sound data arrives as Tuple[int, ndarray]; only the
                # array part is normalized, the rate is dropped.
                _rate, mat = mat

            stats_key = utt if is_rspecifier else None
            writer[utt] = cmvn(mat, stats_key)
def main():
    """Write one ``"<utt> <d0>,<d1>,..."`` line per input entry to ``args.out``.

    Without a preprocessing config the reader is asked for shapes only
    (``return_shape=True``); with one, each matrix is loaded, transformed by
    ``Transformation`` and the shape of the result is reported.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=log_level, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is None:
        preprocessing = None
    else:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))

    # The matrix itself is not needed when nothing is preprocessed, so the
    # reader is asked to return only the shape in that case.
    # This makes sense only with filetype="hdf5".
    shape_only = preprocessing is None

    for utt, mat in file_reader_helper(args.rspecifier, args.filetype,
                                       return_shape=shape_only):
        if shape_only:
            if len(mat) == 2 and isinstance(mat[1], tuple):
                # Sound file: Tuple[int, Tuple[int, ...]] -> keep the shape
                _rate, mat = mat
            shape = mat
        else:
            if is_scipy_wav_style(mat):
                # Sound file: Tuple[int, ndarray] -> keep the array
                _rate, mat = mat
            mat = preprocessing(mat, uttid_list=utt)
            shape = mat.shape

        shape_str = ','.join(map(str, shape))
        args.out.write('{} {}\n'.format(utt, shape_str))
def main():
    """Synthesize a waveform for every input spectrogram with Griffin-Lim.

    Each entry read from ``args.rspecifier`` is optionally converted with
    ``logmelspc_to_linearspc`` (when ``args.n_mels`` is given), passed to
    ``griffin_lim`` and stored as ``<args.outdir>/<utt_id>.wav`` in int16.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    )
    logging.info(get_commandline_args())

    # Create the output directory; exist_ok avoids the check-then-create
    # race when several jobs share the same outdir.
    os.makedirs(args.outdir, exist_ok=True)

    # Scale factor applied to the griffin_lim output before the int16 cast
    int16_max = np.iinfo(np.int16).max

    for idx, (utt_id, lmspc) in enumerate(
            file_reader_helper(args.rspecifier, args.filetype), 1):
        if args.n_mels is not None:
            spc = logmelspc_to_linearspc(lmspc,
                                         fs=args.fs,
                                         n_mels=args.n_mels,
                                         n_fft=args.n_fft,
                                         fmin=args.fmin,
                                         fmax=args.fmax)
        else:
            # Without n_mels the input is used as the spectrogram as-is
            spc = lmspc

        y = griffin_lim(spc,
                        n_fft=args.n_fft,
                        n_shift=args.n_shift,
                        win_length=args.win_length,
                        window=args.window,
                        n_iters=args.iters)
        logging.info("(%d) %s", idx, utt_id)
        write(os.path.join(args.outdir, "%s.wav" % utt_id),
              args.fs,
              (y * int16_max).astype(np.int16))
def main():
    """Copy features from ``args.rspecifier`` to ``args.wspecifier``.

    Every entry is optionally transformed by ``Transformation`` and written
    with ``file_writer_helper``.  For the output file types ``"sound.hdf5"``
    and ``"sound"`` the entry is written as ``(rate, array)``; otherwise
    only the array is written.

    Raises:
        RuntimeError: If a sound output file type is requested but an input
            entry was not read as ``(rate, array)``, so that no sampling
            rate is available for it.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))
    else:
        preprocessing = None

    write_sound = args.out_filetype in ["sound.hdf5", "sound"]

    with file_writer_helper(
        args.wspecifier,
        filetype=args.out_filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    ) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            # Reset per utterance so that a rate from a previous entry can
            # never be reused, and a missing rate is detected explicitly.
            rate = None
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
                rate, mat = mat

            if preprocessing is not None:
                mat = preprocessing(mat, uttid_list=utt)

            # shape = (Time, Channel)
            if write_sound:
                if rate is None:
                    raise RuntimeError(
                        "out_filetype '{}' needs a sampling rate, but '{}' "
                        "was not read as (rate, array) with in_filetype "
                        "'{}'".format(args.out_filetype, utt, args.in_filetype)
                    )
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt] = (rate, mat)
            else:
                writer[utt] = mat
def main():
    """Generate waveforms from features with a trained WaveNet model.

    The model config (``model.conf``) and statistics (``stats.h5``) are read
    from the directory that contains ``args.model``.  Each feature matrix
    read from ``args.rspecifier`` is normalized with the stored statistics,
    passed to ``model.generate``, decoded from mu-law, filtered with the
    MLSA noise-shaping filter and saved as ``<args.outdir>/<utt_id>.wav``.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    # Create the output directory; exist_ok avoids the check-then-create
    # race when several jobs share the same outdir.
    os.makedirs(args.outdir, exist_ok=True)

    # load model config
    # NOTE: torch.load unpickles the file, so only trusted model
    # directories should be passed via args.model.
    model_dir = os.path.dirname(args.model)
    train_args = torch.load(os.path.join(model_dir, "model.conf"))

    # load statistics (read-only: the file is never modified here)
    scaler = StandardScaler()
    with h5py.File(os.path.join(model_dir, "stats.h5"), "r") as f:
        scaler.mean_ = f["/melspc/mean"][()]
        scaler.scale_ = f["/melspc/scale"][()]
        # TODO(kan-bayashi): include following info as default
        coef = f["/mlsa/coef"][()]
        alpha = f["/mlsa/alpha"][()]

    # define MLSA filter for noise shaping
    mlsa_filter = TimeInvariantMLSAFilter(
        coef=coef,
        alpha=alpha,
        n_shift=args.n_shift,
    )

    # define model and load parameters
    device = (
        torch.device("cuda") if torch.cuda.is_available()
        else torch.device("cpu")
    )
    model = WaveNet(
        n_quantize=train_args.n_quantize,
        n_aux=train_args.n_aux,
        n_resch=train_args.n_resch,
        n_skipch=train_args.n_skipch,
        dilation_depth=train_args.dilation_depth,
        dilation_repeat=train_args.dilation_repeat,
        kernel_size=train_args.kernel_size,
        upsampling_factor=train_args.upsampling_factor,
    )
    # Parameters are loaded on CPU first and moved to the device afterwards
    model.load_state_dict(torch.load(args.model, map_location="cpu")["model"])
    model.eval()
    model.to(device)

    for idx, (utt_id, lmspc) in enumerate(
            file_reader_helper(args.rspecifier, args.filetype), 1):
        logging.info("(%d) %s" % (idx, utt_id))

        # perform preprocessing
        # quantize initial seed waveform
        x = encode_mu_law(np.zeros((1)), mu=train_args.n_quantize)
        h = scaler.transform(lmspc)  # normalize features

        # convert to tensor
        x = torch.tensor(x, dtype=torch.long, device=device)  # (1,)
        h = torch.tensor(h, dtype=torch.float, device=device)  # (T, n_aux)

        # get length of waveform
        n_samples = (h.shape[0] - 1) * args.n_shift + args.n_fft

        # generate
        start_time = time.time()
        with torch.no_grad():
            y = model.generate(x, h, n_samples, interval=100)
        logging.info("generation speed = %s (sec / sample)" %
                     ((time.time() - start_time) / (len(y) - 1)))
        y = decode_mu_law(y, mu=train_args.n_quantize)

        # apply mlsa filter for noise shaping
        y = mlsa_filter(y)

        # save as .wav file
        write(
            os.path.join(args.outdir, "%s.wav" % utt_id),
            args.fs,
            (y * np.iinfo(np.int16).max).astype(np.int16),
        )
def main():
    """Accumulate CMVN statistics and write them out.

    The mode depends on the arguments:

    * ``args.wspecifier_or_wxfilename`` contains ``":"`` and ``args.spk2utt``
      is given: statistics are accumulated per speaker.
    * it contains ``":"`` without ``args.spk2utt``: per utterance.
    * otherwise: one global statistic stored under the key ``None`` and
      saved as ``npy`` or ``mat``.

    For each key the result is a float64 array whose row 0 holds the feature
    sums followed by the frame count, and whose row 1 holds the squared
    feature sums followed by ``0.0``.

    Raises:
        AssertionError: If no utterance was read from ``args.rspecifier``.
        RuntimeError: In global mode, if ``args.out_filetype`` is neither
            ``npy`` nor ``mat``.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    is_wspecifier = ":" in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info("Performing as speaker CMVN mode")
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Tolerate blank lines instead of failing to unpack
                        continue
                    # A speaker listed without utterances adds no entries
                    spk, utts = fields[0], fields[1:]
                    for utt in utts:
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]

        else:
            logging.info("Performing as utterance CMVN mode")

            def utt2spk(x):
                return x

        if args.out_filetype == "npy":
            logging.warning("--out-filetype npy is allowed only for "
                            "Global CMVN mode, changing to hdf5")
            args.out_filetype = "hdf5"

    else:
        logging.info("Performing as global CMVN mode")
        if args.spk2utt is not None:
            logging.warning("spk2utt is not used for global CMVN mode")

        def utt2spk(x):
            return None

        if args.out_filetype == "hdf5":
            logging.warning("--out-filetype hdf5 is not allowed for "
                            "Global CMVN mode, changing to npy")
            args.out_filetype = "npy"

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    # Stays 0 when the reader yields nothing, which the assert below catches
    idx = 0
    for idx, (utt, matrix) in enumerate(
            file_reader_helper(args.rspecifier, args.in_filetype), 1):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            rate, matrix = matrix

        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        # Reduce in float64 as well: squaring/summing in the input dtype
        # overflows for int16 data and loses precision for float32.
        sum_feats[spk] += matrix.sum(axis=0, dtype=np.float64)
        square_sum_feats[spk] += np.square(
            matrix, dtype=np.float64).sum(axis=0)
    logging.info("Processed {} utterances".format(idx))
    assert idx > 0, idx

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra slot along the first feature axis holds the count
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.0

        # You can get the mean and std as following,
        # >>> N = _cmvn_stats[0, -1]
        # >>> mean = _cmvn_stats[0, :-1] / N
        # >>> std = np.sqrt(_cmvn_stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = _cmvn_stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with file_writer_helper(args.wspecifier_or_wxfilename,
                                filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        matrix = cmvn_stats[None]
        if args.out_filetype == "npy":
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == "mat":
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError("Not supporting: --out-filetype {}".format(
                args.out_filetype))
def test_KaldiReader(tmpdir, filetype):
    """Check that written entries are read back unchanged.

    Entries ``aaa`` and ``bbb`` are written via ``file_writer_helper`` using
    an ``ark,scp`` wspecifier.  They are then read through
    ``file_reader_helper`` from the ark and the scp, first as data and then
    as shapes, and compared against what was written.

    Args:
        tmpdir: pytest temporary-directory fixture; all output files are
            placed below it.
        filetype: File type handed to the writer and reader helpers.  If it
            contains ``"sound"`` the entries are ``(fs, int16 array)`` pairs.
    """
    ark = str(tmpdir.join("a.foo"))
    scp = str(tmpdir.join("a.scp"))
    # Write the num-frames file below tmpdir rather than into the current
    # working directory, so running the test leaves nothing behind.
    num_frames = str(tmpdir.join("out.txt"))
    fs = 16000
    is_sound = "sound" in filetype

    with file_writer_helper(
        wspecifier=f"ark,scp:{ark},{scp}",
        filetype=filetype,
        write_num_frames=f"ark,t:{num_frames}",
        compress=False,
        compression_method=2,
        pcm_format="wav",
    ) as writer:
        if is_sound:
            aaa = np.random.randint(-10, 10, 100, dtype=np.int16)
            bbb = np.random.randint(-10, 10, 50, dtype=np.int16)
        else:
            aaa = np.random.randn(10, 10)
            bbb = np.random.randn(13, 5)
        valid = {"aaa": aaa, "bbb": bbb}

        for key, array in valid.items():
            writer[key] = (fs, array) if is_sound else array

    # The ark is read directly for every filetype except plain "sound";
    # the scp is always read.  The ark comes first to keep the order
    # "ark read, scp read, ark shape read, scp shape read".
    rspecifiers = [f"scp:{scp}"]
    if filetype != "sound":
        rspecifiers.insert(0, f"ark:{ark}")

    for return_shape in (False, True):
        for rspecifier in rspecifiers:
            for key, value in file_reader_helper(
                rspecifier, filetype=filetype, return_shape=return_shape
            ):
                if is_sound:
                    if not return_shape:
                        assert_scipy_wav_style(value)
                    # Index 1 of a sound entry is the array (or its shape)
                    value = value[1]
                expected = valid[key].shape if return_shape else valid[key]
                np.testing.assert_array_equal(value, expected)