def test_tacotron2_multi_gpu_trainable(model_dict):
    """Smoke-test one optimizer step of Tacotron2 under DataParallel on 2 GPUs.

    Builds a dummy batch (with optional speaker embedding / spectrogram
    targets as configured by ``model_dict``), wraps the model in
    ``torch.nn.DataParallel`` and checks that forward/backward/step run.
    """
    ngpu = 2
    device_ids = list(range(ngpu))
    # geometry of the dummy batch
    bs = 10
    maxin_len = 10
    maxout_len = 10
    idim = 5
    odim = 10
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args()
    # some batch entries may be None (e.g. no speaker embedding) — keep them None
    batch = prepare_inputs(bs, idim, odim, maxin_len, maxout_len,
                           model_args['spk_embed_dim'], model_args['spc_dim'])
    batch = [None if t is None else t.cuda() for t in batch]

    # define model
    net = Tacotron2(idim, odim, Namespace(**model_args))
    net = torch.nn.DataParallel(net, device_ids)
    model = Tacotron2Loss(net, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())
    model.cuda()

    # a single training step must complete without error
    loss = model(*batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def test_tacotron2_multi_gpu_trainable():
    """Smoke-test one optimizer step of the default Tacotron2 under DataParallel."""
    ngpu = 2
    device_ids = list(range(ngpu))
    # geometry of the dummy batch
    bs = 2
    maxin_len = 10
    maxout_len = 10
    idim = 5
    odim = 10
    # build the batch and move every tensor onto the GPU before unpacking
    xs, ilens, ys, labels, olens = [
        t.cuda() for t in prepare_inputs(bs, idim, odim, maxin_len, maxout_len)]

    # define model
    model_args = make_model_args()
    loss_args = make_loss_args()
    net = Tacotron2(idim, odim, Namespace(**model_args))
    net = torch.nn.DataParallel(net, device_ids)
    model = Tacotron2Loss(net, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())
    model.cuda()

    # a single training step must complete without error
    loss = model(xs, ilens, ys, labels, olens)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def decode(args):
    """RUN DECODING.

    Loads a trained Tacotron2 model, synthesizes features for every
    utterance listed in ``args.json`` and writes them to Kaldi ark/scp
    files via a ``copy-feats`` pipe.

    :param Namespace args: decoding options (model, model_conf, json, out,
        ngpu, maxlenratio, ...)
    """
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # define model
    tacotron2 = Tacotron2(idim, odim, train_args)
    # eos symbol is the last token id (convention: vocabulary size - 1)
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js.keys()):
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                # fix: logging.warn is a deprecated alias (removed in py3.13)
                logging.warning(
                    "output length reaches maximum length (%s)." % utt_id)
            logging.info('(%d/%d) %s (size:%d->%d)' % (
                idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)
def test_tacotron2_with_speaker_embedding_trainable_and_decodable(
        model_dict, loss_dict):
    """Check that Tacotron2 with speaker embeddings can train one step and decode.

    Builds a random padded batch, runs one forward/backward/optimizer step
    through ``Tacotron2Loss``, then runs inference and attention extraction
    in no-grad mode and checks the attention tensor shape.
    """
    # setup batch
    bs = 2
    maxin_len = 10
    maxout_len = 10
    idim = 5
    odim = 10
    spk_embed_dim = 128
    # lengths sorted in descending order (required for length-sorted batching)
    ilens = np.sort(np.random.randint(1, maxin_len, bs))[::-1].tolist()
    olens = np.sort(np.random.randint(1, maxout_len, bs))[::-1].tolist()
    xs = pad_ndarray_list([np.random.randint(0, idim, l) for l in ilens], 0)
    ys = pad_ndarray_list([np.random.randn(l, odim) for l in olens], 0)
    xs = torch.from_numpy(xs).long()
    ys = torch.from_numpy(ys).float()
    spembs = torch.from_numpy(np.random.randn(bs, spk_embed_dim)).float()
    # stop-token targets: 1 from the last valid frame onward, 0 elsewhere
    # TODO(kan-bayashi): need to be modified in pytorch v4
    labels = ys.new(ys.size(0), ys.size(1)).zero_()
    for i, l in enumerate(olens):
        labels[i, l - 1:] = 1
    # legacy pytorch (<0.4) needs explicit Variable wrapping
    if torch_is_old:
        xs = Variable(xs)
        ys = Variable(ys)
        spembs = Variable(spembs)
        labels = Variable(labels)
    # define model
    model_args = make_model_args(spk_embed_dim=spk_embed_dim, **model_dict)
    loss_args = make_loss_args(**loss_dict)
    model = Tacotron2(idim, odim, **model_args)
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())
    # trainable: one forward pass plus one loss/backward/update step
    after, before, logits = model(xs, ilens, ys, spembs)
    loss = criterion(xs, ilens, ys, labels, olens, spembs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # decodable: disable grad tracking (volatile on old pytorch)
    if torch_is_old:
        xs.volatile = True
        ys.volatile = True
    else:
        # NOTE(review): set_grad_enabled(False) is never re-enabled here,
        # which leaks into later tests in the same process — verify intent
        torch.set_grad_enabled(False)
    model.eval()
    yhat, probs, att_ws = model.inference(xs[0][:ilens[0]], spembs[0])
    att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    # attention weights are expected to be (batch, max_olen, max_ilen)
    assert att_ws.shape[0] == bs
    assert att_ws.shape[1] == max(olens)
    assert att_ws.shape[2] == max(ilens)
def test_tacotron2_trainable_and_decodable(model_dict, loss_dict):
    """Run one CPU training step and one inference pass for Tacotron2."""
    # setup batch
    bs = 2
    maxin_len = 10
    maxout_len = 10
    idim = 5
    odim = 10
    xs, ilens, ys, labels, olens = prepare_inputs(
        bs, idim, odim, maxin_len, maxout_len)

    # define model
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args(**loss_dict)
    inference_args = make_inference_args()
    model = Tacotron2(idim, odim, Namespace(**model_args))
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())

    # optional speaker embeddings, only when the model is configured for them
    spk_dim = model_args['spk_embed_dim']
    spembs = None
    if spk_dim is not None:
        spembs = torch.from_numpy(np.random.randn(bs, spk_dim)).float()

    # trainable: one forward/backward/update step
    loss = criterion(xs, ilens, ys, labels, olens, spembs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # decodable
    model.eval()
    with torch.no_grad():
        spemb = spembs[0] if spembs is not None else None
        yhat, probs, att_ws = model.inference(
            xs[0][:ilens[0]], Namespace(**inference_args), spemb)
        att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    # attention weights are expected to be (batch, max_olen, max_ilen)
    assert att_ws.shape[0] == bs
    assert att_ws.shape[1] == max(olens)
    assert att_ws.shape[2] == max(ilens)
def setup_tts_loss(odim_asr, idim_tts, args):
    """Build a Tacotron2 model plus its loss from ``tts_*``-prefixed options.

    :param int odim_asr: ASR output dimension, used as the TTS input dimension
    :param int idim_tts: TTS output dimension
    :param Namespace args: full argument namespace carrying ``tts_*`` options
    :return: the configured Tacotron2Loss wrapping the Tacotron2 model
    """
    from argparse import Namespace

    # resolve the output activation by name from torch.nn.functional
    act_name = args.tts_output_activation
    if act_name is None:
        output_activation_fn = None
    elif hasattr(torch.nn.functional, act_name):
        output_activation_fn = getattr(torch.nn.functional, act_name)
    else:
        raise ValueError('there is no such an activation function. (%s)'
                         % args.tts_output_activation)

    # most Tacotron2 options are simply the same-named args with a tts_ prefix
    forwarded = [
        'spk_embed_dim', 'embed_dim', 'elayers', 'eunits',
        'econv_layers', 'econv_chans', 'econv_filts',
        'dlayers', 'dunits', 'prenet_layers', 'prenet_units',
        'postnet_layers', 'postnet_chans', 'postnet_filts',
        'adim', 'aconv_chans', 'aconv_filts',
        'cumulate_att_w', 'use_batch_norm', 'use_concate', 'monotonic',
    ]
    tts_kwargs = {name: getattr(args, 'tts_' + name) for name in forwarded}
    # these three do not follow the simple prefix naming scheme
    tts_kwargs['output_activation'] = output_activation_fn
    tts_kwargs['dropout'] = args.tts_dropout_rate
    tts_kwargs['zoneout'] = args.tts_zoneout_rate
    tts_args = Namespace(**tts_kwargs)

    e2e_tts = Tacotron2(idim=odim_asr, odim=idim_tts, args=tts_args)
    return Tacotron2Loss(model=e2e_tts,
                         use_masking=args.tts_use_masking,
                         bce_pos_weight=args.tts_bce_pos_weight)
def test_tacotron2_trainable_and_decodable(model_dict, loss_dict):
    """Train one step and decode once, with optional CBHG / speaker embedding."""
    # make args
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args(**loss_dict)
    inference_args = make_inference_args()

    # setup batch
    bs = 2
    maxin_len = 10
    maxout_len = 10
    idim = 5
    odim = 10
    # fixed auxiliary dimensions used when the corresponding feature is enabled
    if model_args['use_cbhg']:
        model_args['spc_dim'] = 129
    if model_args['use_speaker_embedding']:
        model_args['spk_embed_dim'] = 128
    xs, ilens, ys, labels, olens, spembs, spcs = prepare_inputs(
        bs, idim, odim, maxin_len, maxout_len,
        model_args['spk_embed_dim'], model_args['spc_dim'])

    # define model
    model = Tacotron2(idim, odim, Namespace(**model_args))
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())

    # trainable: one forward/backward/update step
    loss = criterion(xs, ilens, ys, labels, olens, spembs, spcs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # decodable
    model.eval()
    with torch.no_grad():
        spemb = spembs[0] if model_args['spk_embed_dim'] is not None else None
        model.inference(xs[0][:ilens[0]], Namespace(**inference_args), spemb)
        att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    # attention weights are expected to be (batch, max_olen, max_ilen)
    assert att_ws.shape[0] == bs
    assert att_ws.shape[1] == max(olens)
    assert att_ws.shape[2] == max(ilens)
def train(args):
    """RUN TRAINING.

    Reads data-json files, builds a Tacotron2 model and its loss, and runs
    a chainer Trainer loop with a pytorch backend (custom updater/evaluator,
    snapshotting, attention plotting and loss reporting).

    :param Namespace args: training options (paths, model hyperparameters,
        optimizer settings, ngpu, ...)
    """
    # seed setting
    torch.manual_seed(args.seed)
    # use deterministic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True
    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')
    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    # reverse input and output dimension (TTS: text tokens in, features out)
    idim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    if args.use_cbhg:
        args.spc_dim = int(valid_json[utts[0]]['input'][1]['shape'][1])
    if args.use_speaker_embedding:
        # NOTE(review): same json slot ['input'][1] as spc_dim above —
        # presumably use_cbhg and use_speaker_embedding are exclusive; confirm
        args.spk_embed_dim = int(valid_json[utts[0]]['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))
    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4, sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))
    # specify model architecture
    tacotron2 = Tacotron2(idim, odim, args)
    logging.info(tacotron2)
    # check the use of multi-gpu
    if args.ngpu > 1:
        tacotron2 = torch.nn.DataParallel(tacotron2,
                                          device_ids=list(range(args.ngpu)))
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * args.ngpu))
        args.batch_size *= args.ngpu
    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)
    # define loss
    model = Tacotron2Loss(tacotron2, args.use_masking, args.bce_pos_weight)
    reporter = model.reporter
    # Setup an optimizer
    optimizer = torch.optim.Adam(model.parameters(), args.lr, eps=args.eps,
                                 weight_decay=args.weight_decay)
    # FIXME: TOO DIRTY HACK
    # (gives the optimizer the attributes chainer's trainer machinery expects)
    setattr(optimizer, 'target', reporter)
    setattr(optimizer, 'serialize', lambda s: reporter.serialize(s))
    # Setup a converter
    converter = CustomConverter(True, args.use_speaker_embedding,
                                args.use_cbhg)
    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    # make minibatch list (variable length)
    train_batchset = make_batchset(train_json, args.batch_size, args.maxlen_in,
                                   args.maxlen_out, args.minibatches,
                                   args.batch_sort_key)
    valid_batchset = make_batchset(valid_json, args.batch_size, args.maxlen_in,
                                   args.maxlen_out, args.minibatches,
                                   args.batch_sort_key)
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    train_iter = chainer.iterators.MultiprocessIterator(
        TransformDataset(train_batchset, converter.transform),
        batch_size=1, n_processes=2, n_prefetch=8, maxtasksperchild=20)
    valid_iter = chainer.iterators.MultiprocessIterator(
        TransformDataset(valid_batchset, converter.transform),
        batch_size=1, repeat=False, shuffle=False,
        n_processes=2, n_prefetch=8, maxtasksperchild=20)
    # Set up a trainer
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter, device)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)
    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)
    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, converter, device))
    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))
    # Save best models
    trainer.extend(
        extensions.snapshot_object(tacotron2, 'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        # longest inputs first, so plots cover the hardest alignments
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        # DataParallel wraps the real model in .module
        if hasattr(tacotron2, "module"):
            att_vis_fn = tacotron2.module.calculate_all_attentions
        else:
            att_vis_fn = tacotron2.calculate_all_attentions
        trainer.extend(PlotAttentionReport(att_vis_fn, data,
                                           args.outdir + '/att_ws',
                                           converter=CustomConverter(
                                               False,
                                               args.use_speaker_embedding),
                                           device=device, reverse=True),
                       trigger=(1, 'epoch'))
    # Make a plot for training and validation values
    plot_keys = [
        'main/loss', 'validation/main/loss',
        'main/l1_loss', 'validation/main/l1_loss',
        'main/mse_loss', 'validation/main/mse_loss',
        'main/bce_loss', 'validation/main/bce_loss'
    ]
    trainer.extend(
        extensions.PlotReport(['main/l1_loss', 'validation/main/l1_loss'],
                              'epoch', file_name='l1_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/bce_loss', 'validation/main/bce_loss'],
                              'epoch', file_name='bce_loss.png'))
    if args.use_cbhg:
        plot_keys += [
            'main/cbhg_l1_loss', 'validation/main/cbhg_l1_loss',
            'main/cbhg_mse_loss', 'validation/main/cbhg_mse_loss'
        ]
        trainer.extend(
            extensions.PlotReport(
                ['main/cbhg_l1_loss', 'validation/main/cbhg_l1_loss'],
                'epoch', file_name='cbhg_l1_loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/cbhg_mse_loss', 'validation/main/cbhg_mse_loss'],
                'epoch', file_name='cbhg_mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(plot_keys, 'epoch', file_name='loss.png'))
    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(REPORT_INTERVAL,
                                                 'iteration')))
    report_keys = plot_keys[:]
    # prepend bookkeeping columns to the console report
    report_keys[0:0] = ['epoch', 'iteration', 'elapsed_time']
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(REPORT_INTERVAL, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=REPORT_INTERVAL))
    # Run the training
    trainer.run()
def decode(args): '''RUN DECODING''' # read training config with open(args.model_conf, 'rb') as f: logging.info('reading a model config file from ' + args.model_conf) idim, odim, train_args = pickle.load(f) # show argments for key in sorted(vars(args).keys()): logging.info('ARGS: ' + key + ': ' + str(vars(args)[key])) # define output activation function if hasattr(train_args, 'output_activation'): if train_args.output_activation is None: output_activation_fn = None elif hasattr(torch.nn.functional, train_args.output_activation): output_activation_fn = getattr(torch.nn.functional, train_args.output_activation) else: raise ValueError('there is no such an activation function. (%s)' % train_args.output_activation) else: output_activation_fn = None # define model tacotron2 = Tacotron2( idim=idim, odim=odim, spk_embed_dim=train_args.spk_embed_dim if hasattr( train_args, "spk_embed_dim") else None, embed_dim=train_args.embed_dim, elayers=train_args.elayers, eunits=train_args.eunits, econv_layers=train_args.econv_layers, econv_chans=train_args.econv_chans, econv_filts=train_args.econv_filts, dlayers=train_args.dlayers, dunits=train_args.dunits, prenet_layers=train_args.prenet_layers, prenet_units=train_args.prenet_units, postnet_layers=train_args.postnet_layers, postnet_chans=train_args.postnet_chans, postnet_filts=train_args.postnet_filts, adim=train_args.adim, aconv_chans=train_args.aconv_chans, aconv_filts=train_args.aconv_filts, output_activation_fn=output_activation_fn, cumulate_att_w=train_args.cumulate_att_w, use_batch_norm=train_args.use_batch_norm, use_concate=train_args.use_concate, dropout=train_args.dropout_rate, zoneout=train_args.zoneout_rate, threshold=args.threshold, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio, ) eos = str(tacotron2.idim - 1) # load trained model parameters logging.info('reading model parameters from ' + args.model) tacotron2.load_state_dict( torch.load(args.model, map_location=lambda storage, loc: storage)) tacotron2.eval() 
# Set gpu ngpu = args.ngpu if ngpu >= 1: gpu_id = range(ngpu) logging.info('gpu id: ' + str(gpu_id)) tacotron2.cuda() else: gpu_id = [-1] # read json data with open(args.json, 'rb') as f: js = json.load(f)['utts'] # chech direcitory outdir = os.path.dirname(args.out) if len(outdir) != 0 and not os.path.exists(outdir): os.makedirs(outdir) # check the use of embedding # TODO(kan-bayashi): need to remove in the future if hasattr(train_args, "spk_embed_dim"): if train_args.spk_embed_dim is not None: train_args.use_speaker_embedding = True else: train_args.use_speaker_embedding = False else: train_args.use_speaker_embedding = False # TODO(kan-bayashi): need to be fixed in pytorch v4 if not torch_is_old: torch.set_grad_enabled(False) # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python) arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % ( args.out, args.out) with kaldi_io_py.open_or_fd(arkscp, 'wb') as f: for idx, utt_id in enumerate(js.keys()): x = js[utt_id]['output'][0]['tokenid'].split() + [eos] x = np.fromiter(map(int, x), dtype=np.int64) x = torch.from_numpy(x) if args.ngpu > 0: x = x.cuda() # TODO(kan-bayashi): need to be fixed in pytorch v4 if torch_is_old: x = Variable(x, volatile=True) # get speaker embedding if train_args.use_speaker_embedding: spemb = kaldi_io_py.read_vec_flt( js[utt_id]['input'][1]['feat']) spemb = torch.from_numpy(spemb) # TODO(kan-bayashi): need to be fixed in pytorch v4 if torch_is_old: spemb = Variable(spemb, volatile=True) if args.ngpu > 0: spemb = spemb.cuda() else: spemb = None # decode and write outs, _, _ = tacotron2.inference(x, spemb) if outs.size(0) == x.size(0) * args.maxlenratio: logging.warn("output length reaches maximum length (%s)." % utt_id) logging.info( '(%d/%d) %s (size:%d->%d)' % (idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0))) kaldi_io_py.write_mat(f, outs.data.cpu().numpy(), utt_id)
def train(args):
    """RUN TRAINING.

    Legacy trainer: builds Tacotron2 from explicit keyword arguments,
    pickles the model config, and runs a chainer Trainer with serial
    iterators, npz snapshots and per-loss plot/report extensions.

    :param Namespace args: training options (paths, model hyperparameters,
        optimizer settings, ngpu, ...)
    """
    # seed setting
    torch.manual_seed(args.seed)
    # use deterministic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True
    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')
    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    # reverse input and output dimension (TTS: text tokens in, features out)
    idim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))
    # write model config (pickled tuple, read back by the legacy decode())
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.conf'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to' + model_conf)
        pickle.dump((idim, odim, args), f)
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))
    # define output activation function by name from torch.nn.functional
    if args.output_activation is None:
        output_activation_fn = None
    elif hasattr(torch.nn.functional, args.output_activation):
        output_activation_fn = getattr(torch.nn.functional,
                                       args.output_activation)
    else:
        raise ValueError('there is no such an activation function. (%s)'
                         % args.output_activation)
    # specify model architecture
    tacotron2 = Tacotron2(idim=idim,
                          odim=odim,
                          spk_embed_dim=args.spk_embed_dim,
                          embed_dim=args.embed_dim,
                          elayers=args.elayers,
                          eunits=args.eunits,
                          econv_layers=args.econv_layers,
                          econv_chans=args.econv_chans,
                          econv_filts=args.econv_filts,
                          dlayers=args.dlayers,
                          dunits=args.dunits,
                          prenet_layers=args.prenet_layers,
                          prenet_units=args.prenet_units,
                          postnet_layers=args.postnet_layers,
                          postnet_chans=args.postnet_chans,
                          postnet_filts=args.postnet_filts,
                          output_activation_fn=output_activation_fn,
                          adim=args.adim,
                          aconv_chans=args.aconv_chans,
                          aconv_filts=args.aconv_filts,
                          cumulate_att_w=args.cumulate_att_w,
                          use_batch_norm=args.use_batch_norm,
                          use_concate=args.use_concate,
                          dropout=args.dropout_rate,
                          zoneout=args.zoneout_rate)
    logging.info(tacotron2)
    # Set gpu
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        tacotron2.cuda()
    elif ngpu > 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        tacotron2 = torch.nn.DataParallel(tacotron2, device_ids=gpu_id)
        tacotron2.cuda()
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * ngpu))
        args.batch_size *= ngpu
    else:
        gpu_id = [-1]
    # define loss
    model = Tacotron2Loss(model=tacotron2,
                          use_masking=args.use_masking,
                          bce_pos_weight=args.bce_pos_weight)
    reporter = model.reporter
    # Setup an optimizer
    optimizer = torch.optim.Adam(model.parameters(), args.lr, eps=args.eps,
                                 weight_decay=args.weight_decay)
    # FIXME: TOO DIRTY HACK
    # (gives the optimizer the attributes chainer's trainer machinery expects)
    setattr(optimizer, 'target', reporter)
    setattr(optimizer, 'serialize', lambda s: reporter.serialize(s))
    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    # make minibatch list (variable length)
    train_batchset = make_batchset(train_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)
    valid_batchset = make_batchset(valid_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    train_iter = chainer.iterators.SerialIterator(train_batchset, 1)
    valid_iter = chainer.iterators.SerialIterator(valid_batchset, 1,
                                                  repeat=False, shuffle=False)
    # Set up a trainer
    converter = CustomConverter(gpu_id, True, args.use_speaker_embedding)
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)
    # Resume from a snapshot (restore both trainer state and model weights)
    if args.resume:
        logging.info('restored from %s' % args.resume)
        chainer.serializers.load_npz(args.resume, trainer)
        torch_load(args.outdir + '/model.ep.%d' % trainer.updater.epoch,
                   tacotron2)
        model = trainer.updater.model
    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, converter))
    # Take a snapshot for each specified epoch
    trainer.extend(
        extensions.snapshot(filename='snapshot.ep.{.updater.epoch}'),
        trigger=(1, 'epoch'))
    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        # longest inputs first, so plots cover the hardest alignments
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        trainer.extend(PlotAttentionReport(
            tacotron2, data, args.outdir + '/att_ws',
            CustomConverter(gpu_id, False, args.use_speaker_embedding),
            True), trigger=(1, 'epoch'))
    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport([
            'main/loss', 'validation/main/loss',
            'main/l1_loss', 'validation/main/l1_loss',
            'main/mse_loss', 'validation/main/mse_loss',
            'main/bce_loss', 'validation/main/bce_loss'
        ], 'epoch', file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/l1_loss', 'validation/main/l1_loss'],
                              'epoch', file_name='l1_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/bce_loss', 'validation/main/bce_loss'],
                              'epoch', file_name='bce_loss.png'))
    # Save model for each epoch
    trainer.extend(extensions.snapshot_object(tacotron2,
                                              'model.ep.{.updater.epoch}',
                                              savefun=torch_save),
                   trigger=(1, 'epoch'))
    # Save best models
    trainer.extend(
        extensions.snapshot_object(tacotron2, 'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'elapsed_time',
        'main/loss', 'main/l1_loss', 'main/mse_loss', 'main/bce_loss',
        'validation/main/loss', 'validation/main/l1_loss',
        'validation/main/mse_loss', 'validation/main/bce_loss'
    ]
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(100, 'iteration'))
    trainer.extend(extensions.ProgressBar())
    # Run the training
    trainer.run()
def tts_decode(args):
    """RUN DECODING.

    Loads a joint ASR+TTS model checkpoint, then synthesizes features for
    every utterance in ``args.json`` and writes them to Kaldi ark/scp files
    via a copy-feats pipe.

    :param Namespace args: decoding options (model, model_conf, json, out,
        ngpu, seed, maxlenratio, ...)
    """
    # read training config
    # idim, odim, train_args = get_model_conf(args.model, args.model_conf)
    # seed setting
    torch.manual_seed(args.seed)

    # show arguments
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # read training config (pickled (idim, odim, train_args) tuple)
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from' + args.model_conf)
        idim_asr, odim_asr, train_args = pickle.load(f)

    # NOTE(review): this second ARGS dump duplicates the one above
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    logging.info('reading model parameters from' + args.model)
    e2e_asr = E2E(idim_asr, odim_asr, train_args)
    logging.info(e2e_asr)
    asr_loss = Loss(e2e_asr, train_args.mtlalpha)

    # specify model architecture for TTS
    # reverse input and output dimension
    tts_loss = setup_tts_loss(odim_asr, idim_asr - 3, train_args)
    logging.info(tts_loss)

    # define loss
    model = ASRTTSLoss(asr_loss, tts_loss, train_args)

    def cpu_loader(storage, location):
        # force all tensors onto CPU regardless of where they were saved
        return storage

    def remove_dataparallel(state_dict):
        # strip the "module." prefix DataParallel adds to parameter names
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith("module."):
                k = k[7:]
            new_state_dict[k] = v
        return new_state_dict

    model.load_state_dict(
        remove_dataparallel(torch.load(args.model, map_location=cpu_loader)))

    # define model
    # NOTE(review): `idim` and `odim` are not defined anywhere in this
    # function (only idim_asr/odim_asr are) — this line raises NameError at
    # runtime; possibly the TTS dims (odim_asr, idim_asr - 3) were intended,
    # or the Tacotron2 inside tts_loss should be reused. Needs fixing.
    tacotron2 = Tacotron2(idim, odim, train_args)
    # eos symbol is the last token id (convention: vocabulary size - 1)
    eos = str(tacotron2.idim - 1)

    # load trained model parameters
    # NOTE(review): this loads args.model a second time, into tacotron2 —
    # confirm the checkpoint layout actually matches a bare Tacotron2
    logging.info('reading model parameters from ' + args.model)
    torch_load(args.model, tacotron2)
    tacotron2.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # read json data
    with open(args.json, 'rb') as f:
        js = json.load(f)['utts']

    # check directory
    outdir = os.path.dirname(args.out)
    if len(outdir) != 0 and not os.path.exists(outdir):
        os.makedirs(outdir)

    # write to ark and scp file (see https://github.com/vesis84/kaldi-io-for-python)
    arkscp = 'ark:| copy-feats --print-args=false ark:- ark,scp:%s.ark,%s.scp' % (
        args.out, args.out)
    with torch.no_grad(), kaldi_io_py.open_or_fd(arkscp, 'wb') as f:
        for idx, utt_id in enumerate(js.keys()):
            x = js[utt_id]['output'][0]['tokenid'].split() + [eos]
            x = np.fromiter(map(int, x), dtype=np.int64)
            x = torch.LongTensor(x).to(device)

            # get speaker embedding
            if train_args.use_speaker_embedding:
                spemb = kaldi_io_py.read_vec_flt(
                    js[utt_id]['input'][1]['feat'])
                spemb = torch.FloatTensor(spemb).to(device)
            else:
                spemb = None

            # decode and write
            outs, _, _ = tacotron2.inference(x, args, spemb)
            if outs.size(0) == x.size(0) * args.maxlenratio:
                # NOTE(review): logging.warn is a deprecated alias of
                # logging.warning (removed in python 3.13)
                logging.warn("output length reaches maximum length (%s)."
                             % utt_id)
            logging.info('(%d/%d) %s (size:%d->%d)' % (
                idx + 1, len(js.keys()), utt_id, x.size(0), outs.size(0)))
            kaldi_io_py.write_mat(f, outs.cpu().numpy(), utt_id)