def decode(args):
    """Run Tacotron2 decoding: synthesize a feature matrix per utterance.

    Reads the training config and trained parameters, then, for every
    utterance in the input json, converts its token ids to a tensor,
    optionally attaches a speaker embedding, runs inference, and writes
    the output features to a kaldi ark/scp pair.

    Args:
        args: Namespace from the decoding argument parser; uses at least
            model, model_conf, ngpu, json, out and maxlenratio.
    """
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # define model
    tacotron2 = Tacotron2(idim, odim, train_args)
    # the last token id is used as the <eos> symbol
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js.keys()):
            # token ids plus the appended <eos>
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                # fix: logging.warn is a deprecated alias of logging.warning
                logging.warning(
                    "output length reaches maximum length (%s)." % utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' % (
                    idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)
def read_dict_scp(file_or_fd):
    """ ScpLazyDict = read_dict_scp(file_or_fd)
    Returns ScpLazyDict with __getitem__ to read kaldi ark according to kaldi scp.
    file_or_fd : scp, gzipped scp, pipe or opened file descriptor.
    """
    fd = open_or_fd(file_or_fd)
    d = dict()
    try:
        for line in fd:
            # fix: split on the first space only so rxfiles containing
            # spaces do not raise; strip the trailing newline so the stored
            # path can be passed to the ark readers as-is
            key, rxfile = line.decode('utf-8').rstrip('\n').split(' ', 1)
            d[key] = rxfile
    finally:
        if fd is not file_or_fd:
            fd.close()
    return ScpLazyDict(d)
def read_mat_scp(file_or_fd):
    """ generator(key,mat) = read_mat_scp(file_or_fd)
    Returns generator of (key,matrix) tuples, read according to kaldi scp.
    file_or_fd : scp, gzipped scp, pipe or opened file descriptor.

    Iterate the scp:
    for key,mat in kaldi_io.read_mat_scp(file):
        ...

    Read scp to a 'dictionary':
    d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) }
    """
    fd = open_or_fd(file_or_fd)
    try:
        for line in fd:
            (key, rxfile) = line.split(' '.encode(), 1)
            # fix: lines are read as bytes, so the binary-header check must
            # compare against bytes too -- comparing bytes to the str '\0B'
            # is always False in python 3, which made this branch dead code
            if rxfile[:2] == b'\0B':
                mat = _read_mat_binary(rxfile)
            else:
                mat = read_mat(rxfile)
            yield key.decode(), mat
    finally:
        if fd is not file_or_fd:
            fd.close()
def decode(args): '''RUN DECODING''' # read training config with open(args.model_conf, 'rb') as f: logging.info('reading a model config file from ' + args.model_conf) idim, odim, train_args = pickle.load(f) # show argments for key in sorted(vars(args).keys()): logging.info('ARGS: ' + key + ': ' + str(vars(args)[key])) # define output activation function if hasattr(train_args, 'output_activation'): if train_args.output_activation is None: output_activation_fn = None elif hasattr(torch.nn.functional, train_args.output_activation): output_activation_fn = getattr(torch.nn.functional, train_args.output_activation) else: raise ValueError('there is no such an activation function. (%s)' % train_args.output_activation) else: output_activation_fn = None # define model tacotron2 = Tacotron2( idim=idim, odim=odim, spk_embed_dim=train_args.spk_embed_dim if hasattr( train_args, "spk_embed_dim") else None, embed_dim=train_args.embed_dim, elayers=train_args.elayers, eunits=train_args.eunits, econv_layers=train_args.econv_layers, econv_chans=train_args.econv_chans, econv_filts=train_args.econv_filts, dlayers=train_args.dlayers, dunits=train_args.dunits, prenet_layers=train_args.prenet_layers, prenet_units=train_args.prenet_units, postnet_layers=train_args.postnet_layers, postnet_chans=train_args.postnet_chans, postnet_filts=train_args.postnet_filts, adim=train_args.adim, aconv_chans=train_args.aconv_chans, aconv_filts=train_args.aconv_filts, output_activation_fn=output_activation_fn, cumulate_att_w=train_args.cumulate_att_w, use_batch_norm=train_args.use_batch_norm, use_concate=train_args.use_concate, dropout=train_args.dropout_rate, zoneout=train_args.zoneout_rate, threshold=args.threshold, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio, ) eos = str(tacotron2.idim - 1) # load trained model parameters logging.info('reading model parameters from ' + args.model) tacotron2.load_state_dict( torch.load(args.model, map_location=lambda storage, loc: storage)) tacotron2.eval() 
# Set gpu ngpu = args.ngpu if ngpu >= 1: gpu_id = range(ngpu) logging.info('gpu id: ' + str(gpu_id)) tacotron2.cuda() else: gpu_id = [-1] # read json data with open(args.json, 'rb') as f: js = json.load(f)['utts'] # chech direcitory outdir = os.path.dirname(args.out) if len(outdir) != 0 and not os.path.exists(outdir): os.makedirs(outdir) # check the use of embedding # TODO(kan-bayashi): need to remove in the future if hasattr(train_args, "spk_embed_dim"): if train_args.spk_embed_dim is not None: train_args.use_speaker_embedding = True else: train_args.use_speaker_embedding = False else: train_args.use_speaker_embedding = False # TODO(kan-bayashi): need to be fixed in pytorch v4 if not torch_is_old: torch.set_grad_enabled(False) # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python) arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % ( args.out, args.out) with kaldi_io_py.open_or_fd(arkscp, 'wb') as f: for idx, utt_id in enumerate(js.keys()): x = js[utt_id]['output'][0]['tokenid'].split() + [eos] x = np.fromiter(map(int, x), dtype=np.int64) x = torch.from_numpy(x) if args.ngpu > 0: x = x.cuda() # TODO(kan-bayashi): need to be fixed in pytorch v4 if torch_is_old: x = Variable(x, volatile=True) # get speaker embedding if train_args.use_speaker_embedding: spemb = kaldi_io_py.read_vec_flt( js[utt_id]['input'][1]['feat']) spemb = torch.from_numpy(spemb) # TODO(kan-bayashi): need to be fixed in pytorch v4 if torch_is_old: spemb = Variable(spemb, volatile=True) if args.ngpu > 0: spemb = spemb.cuda() else: spemb = None # decode and write outs, _, _ = tacotron2.inference(x, spemb) if outs.size(0) == x.size(0) * args.maxlenratio: logging.warn("output length reaches maximum length (%s)." % utt_id) logging.info( '(%d/%d) %s (size:%d->%d)' % (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0))) kaldi_io_py.write_mat(f, outs.data.cpu().numpy(), utt_id)
def main():
    """Command-line entry point: compute log-mel spectrograms for every wav
    listed in an scp file and write them to a kaldi ark/scp pair."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--fs', type=int, default=22050,
                        help='Sampling frequency')
    parser.add_argument('--fmax', type=int, default=None, nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin', type=int, default=None, nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels', type=int, default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft', type=int, default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift', type=int, default=512,
                        help='Shift length in point')
    parser.add_argument('--window', type=str, default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('scp', type=str,
                        help='WAV scp files')
    parser.add_argument('out', type=str,
                        help='Output file id')
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    )

    # load scp as a list of [utt_id, wav_path] pairs
    with open(args.scp, 'r') as scp_file:
        entries = []
        for raw_line in scp_file.readlines():
            entries.append(raw_line.replace('\n', '').split())

    # make sure the output directory exists
    target_dir = os.path.dirname(args.out)
    if len(target_dir) != 0 and not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)

    # extract feature and then write as ark with scp format
    with kaldi_io_py.open_or_fd(arkscp, 'wb') as ark_writer:
        total = len(entries)
        for count, (utt_id, wav_path) in enumerate(entries, 1):
            # sr=None keeps the file's native sampling rate
            audio, sample_rate = librosa.core.load(wav_path, sr=None)
            assert sample_rate == args.fs
            lmspc = logmelspectrogram(
                x=audio, fs=args.fs, n_mels=args.n_mels, n_fft=args.n_fft,
                n_shift=args.n_shift, window=args.window, fmin=args.fmin,
                fmax=args.fmax)
            logging.info("(%d/%d) %s" % (count, total, utt_id))
            kaldi_io_py.write_mat(ark_writer, lmspc, utt_id)
def encode(args):
    """Dump ASR encoder representations (e.g. for x-vector extraction).

    Loads a trained E2E ASR model and, for every feature listed in
    args.feats_in, writes the encoder representation to a kaldi ark/scp
    pair at args.feats_out. Runs utterance-by-utterance when
    args.batchsize == 0, otherwise in length-sorted batches.

    Args:
        args: Namespace with at least seed, model, model_conf, ngpu,
            batchsize, feats_in and feats_out.
    """
    # seed setting
    torch.manual_seed(args.seed)

    # read training config
    idim, odim, odim_adv, train_args = get_model_conf(args.model, args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args, odim_adv=odim_adv)
    model = Loss(e2e, train_args.mtlalpha)
    if train_args.rnnlm is not None:
        # set rnnlm. external rnnlm is used for recognition.
        # NOTE(review): `rnnlm` is not defined anywhere in this function, so
        # this branch raises NameError whenever train_args.rnnlm is set --
        # confirm where the rnnlm object should be built/loaded.
        model.predictor.rnnlm = rnnlm
    torch_load(args.model, model)
    e2e.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = range(args.ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()

    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.feats_out, args.feats_out)

    if args.batchsize == 0:
        # utterance-by-utterance processing
        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp, 'wb') as f, \
                    open(args.feats_in, 'rb') as f2:
                lines = f2.read().splitlines()
                for idx, line in enumerate(lines, 1):
                    line = line.strip().split()
                    name = line[0]
                    # fix: pass the utterance id as a lazy %-argument so a
                    # '%' inside the name cannot break the format string
                    logging.info('(%d/%d) decoding %s', idx, len(lines), name)
                    feat = kaldi_io_py.read_mat(line[1])
                    rep = e2e.erep(feat)
                    logging.info('Rep shape: %s', rep.shape)
                    kaldi_io_py.write_mat(f, rep, name)
    else:
        # python 2/3 compatible import of zip_longest
        try:
            from itertools import zip_longest as zip_longest
        except Exception:
            from itertools import izip_longest as zip_longest

        def grouper(n, iterable, fillvalue=None):
            # collect data into fixed-length chunks, padding with fillvalue
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # Create json object for batch processing
        logging.info("Creating json for batch processing...")
        js = {}
        with open(args.feats_in, 'rb') as f:
            lines = f.read().splitlines()
            for line in lines:
                line = line.strip().split()
                name = line[0]
                featpath = line[1]
                feat_shape = kaldi_io_py.read_mat(featpath).shape
                js[name] = {'feat': featpath, 'shape': feat_shape}

        # sort data by descending length so batches have similar sizes
        logging.info("Sorting data for batch processing...")
        keys = list(js.keys())
        feat_lens = [js[key]['shape'][0] for key in keys]
        sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
        keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            with kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
                for names in grouper(args.batchsize, keys, None):
                    # drop the fill values from the last (short) batch
                    names = [name for name in names if name]
                    feats = [
                        kaldi_io_py.read_mat(js[name]['feat'])
                        for name in names
                    ]
                    reps, replens = e2e.erep_batch(feats)
                    # fix: stray debug print replaced with logging
                    logging.info('Rep batch shape: %s, lens: %s',
                                 reps.shape, replens)
                    for i, rep in enumerate(reps):
                        name = names[i]
                        kaldi_io_py.write_mat(f, rep, name)
def tts_decode(args):
    """Run TTS decoding with a jointly trained ASR+TTS model.

    Loads the joint ASRTTSLoss checkpoint, then synthesizes features for
    every utterance in args.json and writes them to a kaldi ark/scp pair.

    NOTE(review): this function appears broken as written -- see the
    comments at the Tacotron2 construction below.
    """
    # read training config
    # idim, odim, train_args = get_model_conf(args.model, args.model_conf)
    # seed setting
    torch.manual_seed(args.seed)
    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # read training config
    with open(args.model_conf, "rb") as f:
        # NOTE(review): message is missing a trailing space before the path
        logging.info('reading a model config file from' + args.model_conf)
        idim_asr, odim_asr, train_args = pickle.load(f)

    # arguments are logged a second time here (duplicate of the loop above)
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    logging.info('reading model parameters from' + args.model)
    e2e_asr = E2E(idim_asr, odim_asr, train_args)
    logging.info(e2e_asr)
    asr_loss = Loss(e2e_asr, train_args.mtlalpha)

    # specify model architecture for TTS
    # reverse input and output dimension
    tts_loss = setup_tts_loss(odim_asr, idim_asr - 3, train_args)
    logging.info(tts_loss)

    # define loss
    model = ASRTTSLoss(asr_loss, tts_loss, train_args)

    def cpu_loader(storage, location):
        # map_location callback: keep all tensors on CPU when loading
        return storage

    def remove_dataparallel(state_dict):
        # strip the "module." prefix DataParallel adds to parameter names
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith("module."):
                k = k[7:]
            new_state_dict[k] = v
        return new_state_dict

    model.load_state_dict(
        remove_dataparallel(torch.load(args.model, map_location=cpu_loader)))

    # define model
    # NOTE(review): `idim` and `odim` are never defined in this function
    # (the get_model_conf call above is commented out), so this line raises
    # NameError. Also, `tacotron2` is built fresh here and then loaded from
    # args.model a second time below, duplicating the joint-model load
    # above -- confirm the intended source of the TTS dimensions/weights.
    tacotron2 = Tacotron2(idim, odim, train_args)
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js.keys()):
            # token ids plus the appended <eos>
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                # presumably input[1] holds the speaker-embedding feature
                # path -- verify against the data preparation script
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                # NOTE(review): logging.warn is a deprecated alias of
                # logging.warning
                logging.warn("output length reaches maximum length (%s)."
                             % utt_id)
            logging.info(
                '(%d/%d) %s (size:%d->%d)' % (
                    idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)
def main():
    """Command-line entry point: compute linear spectrograms for every wav
    listed in an scp file and write them to a kaldi ark/scp pair,
    optionally emitting an utt2num_frames file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--fs', type=int,
                        help='Sampling frequency')
    parser.add_argument('--n_fft', type=int, default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift', type=int, default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length', type=int, default=None, nargs='?',
                        help='Analisys window length in point')
    parser.add_argument('--window', type=str, default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write_utt2num_frames', type=strtobool, default=True,
                        help='Whether to write utt2num file')
    parser.add_argument('scp', type=str,
                        help='WAV scp files')
    parser.add_argument('out', type=str,
                        help='Output file id')
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    )

    # load scp; each line is "<utt_id> <path> [...]"
    with open(args.scp, 'r') as scp_file:
        entries = [raw.replace('\n', '').split() for raw in scp_file.readlines()]
    if len(entries[0]) != 2:
        # pipe-style scp: keep the utterance id and the second-to-last token
        normalized = []
        for fields in entries:
            normalized.append([fields[0], fields[-2]])
        entries = normalized

    # make sure the output directory exists
    out_dir = os.path.dirname(args.out)
    if len(out_dir) != 0 and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    if args.write_utt2num_frames:
        # suffix the utt2num_frames file with the numeric job id, if any
        suffix = args.out.split(".")[-1]
        if suffix.isdigit():
            job_id = "." + suffix
        else:
            job_id = ""
        utt2num_path = os.path.dirname(args.out) + "/utt2num_frames" + job_id
        arkscp = (
            'ark:| copy-feats --print-args=false --write-num-frames=ark,t:%s '
            'ark:- ark,scp:%s.ark,%s.scp') % (utt2num_path, args.out, args.out)
    else:
        arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
            args.out, args.out)

    # extract feature and then write as ark with scp format
    with kaldi_io_py.open_or_fd(arkscp, 'wb') as writer:
        total = len(entries)
        for count, (utt_id, wav_path) in enumerate(entries, 1):
            samples, rate = sf.read(wav_path)
            assert rate == args.fs
            spc = spectrogram(x=samples, fs=args.fs, n_fft=args.n_fft,
                              n_shift=args.n_shift,
                              win_length=args.win_length,
                              window=args.window)
            logging.info("(%d/%d) %s" % (count, total, utt_id))
            kaldi_io_py.write_mat(writer, spc, utt_id)