def test_tacotron2_multi_gpu_trainable(model_dict):
    """Smoke-test one optimizer step of Tacotron2 wrapped in DataParallel."""
    # fixed toy dimensions keep the multi-GPU check fast
    n_gpus = 2
    batch_size, in_len, out_len = 10, 10, 10
    idim, odim = 5, 10

    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args()

    # build inputs and move every present tensor to GPU (some entries may be None)
    inputs = prepare_inputs(batch_size, idim, odim, in_len, out_len,
                            model_args['spk_embed_dim'], model_args['spc_dim'])
    inputs = tuple(x.cuda() if x is not None else None for x in inputs)

    # define model
    net = Tacotron2(idim, odim, Namespace(**model_args))
    net = torch.nn.DataParallel(net, list(range(n_gpus)))
    model = Tacotron2Loss(net, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())
    model.cuda()

    # trainable: a full forward/backward/update cycle must complete
    loss = model(*inputs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def test_tacotron2_multi_gpu_trainable():
    """Smoke-test one training step of DataParallel Tacotron2 (basic inputs)."""
    num_gpus = 2
    gpu_ids = list(range(num_gpus))
    batch_size = 2
    max_in, max_out = 10, 10
    idim, odim = 5, 10

    # prepare a toy batch and push every tensor to GPU
    xs, ilens, ys, labels, olens = tuple(
        t.cuda() for t in prepare_inputs(batch_size, idim, odim,
                                         max_in, max_out))

    # define model
    model_args = make_model_args()
    loss_args = make_loss_args()
    taco = Tacotron2(idim, odim, Namespace(**model_args))
    taco = torch.nn.DataParallel(taco, gpu_ids)
    model = Tacotron2Loss(taco, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())
    model.cuda()

    # trainable: forward, backward, and parameter update must all run
    loss = model(xs, ilens, ys, labels, olens)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def test_tacotron2_with_speaker_embedding_trainable_and_decodable(
        model_dict, loss_dict):
    """Check Tacotron2 with speaker embeddings trains and decodes (legacy torch)."""
    # setup batch
    bs = 2
    maxin_len = 10
    maxout_len = 10
    idim = 5
    odim = 10
    spk_embed_dim = 128
    # lengths sorted descending, as padded-sequence utilities expect
    ilens = np.sort(np.random.randint(1, maxin_len, bs))[::-1].tolist()
    olens = np.sort(np.random.randint(1, maxout_len, bs))[::-1].tolist()
    xs = pad_ndarray_list([np.random.randint(0, idim, l) for l in ilens], 0)
    ys = pad_ndarray_list([np.random.randn(l, odim) for l in olens], 0)
    xs = torch.from_numpy(xs).long()
    ys = torch.from_numpy(ys).float()
    spembs = torch.from_numpy(np.random.randn(bs, spk_embed_dim)).float()
    # stop-token targets: 1 from the last valid frame onward
    # TODO(kan-bayashi): need to be modified in pytorch v4
    labels = ys.new(ys.size(0), ys.size(1)).zero_()
    for i, l in enumerate(olens):
        labels[i, l - 1:] = 1
    # wrap tensors in Variable on pre-0.4 torch
    if torch_is_old:
        xs = Variable(xs)
        ys = Variable(ys)
        spembs = Variable(spembs)
        labels = Variable(labels)
    # define model
    model_args = make_model_args(spk_embed_dim=spk_embed_dim, **model_dict)
    loss_args = make_loss_args(**loss_dict)
    model = Tacotron2(idim, odim, **model_args)
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())
    # trainable: forward through model and criterion, then one update step
    after, before, logits = model(xs, ilens, ys, spembs)
    loss = criterion(xs, ilens, ys, labels, olens, spembs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # decodable
    # NOTE(review): set_grad_enabled(False) is process-global and is never
    # re-enabled in this test — confirm later tests are unaffected
    if torch_is_old:
        xs.volatile = True
        ys.volatile = True
    else:
        torch.set_grad_enabled(False)
    model.eval()
    yhat, probs, att_ws = model.inference(xs[0][:ilens[0]], spembs[0])
    att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    # attention weights are (batch, max output length, max input length)
    assert att_ws.shape[0] == bs
    assert att_ws.shape[1] == max(olens)
    assert att_ws.shape[2] == max(ilens)
def test_tacotron2_trainable_and_decodable(model_dict, loss_dict):
    """Run one optimizer step, then inference and attention computation."""
    # setup a toy batch
    batch_size, max_in, max_out = 2, 10, 10
    idim, odim = 5, 10
    xs, ilens, ys, labels, olens = prepare_inputs(
        batch_size, idim, odim, max_in, max_out)

    # define model
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args(**loss_dict)
    inference_args = make_inference_args()
    model = Tacotron2(idim, odim, Namespace(**model_args))
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())

    # optional speaker embeddings
    spembs = None
    if model_args['spk_embed_dim'] is not None:
        spembs = torch.from_numpy(
            np.random.randn(batch_size, model_args['spk_embed_dim'])).float()

    # trainable: forward/backward/update must complete
    loss = criterion(xs, ilens, ys, labels, olens, spembs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # decodable: inference and attention run without gradients
    model.eval()
    with torch.no_grad():
        spemb = spembs[0] if model_args['spk_embed_dim'] is not None else None
        yhat, probs, att_ws = model.inference(
            xs[0][:ilens[0]], Namespace(**inference_args), spemb)
        att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    assert att_ws.shape[0] == batch_size
    assert att_ws.shape[1] == max(olens)
    assert att_ws.shape[2] == max(ilens)
def setup_tts_loss(odim_asr, idim_tts, args):
    """Build a Tacotron2Loss from the tts_*-prefixed command-line options.

    :param int odim_asr: dimension fed into the TTS model as its input
    :param int idim_tts: dimension produced by the TTS model as its output
    :param Namespace args: parsed options carrying tts_* settings
    :return: Tacotron2Loss wrapping a freshly constructed Tacotron2
    """
    from argparse import Namespace

    # resolve output activation: None, or an attribute of torch.nn.functional
    if args.tts_output_activation is None:
        output_activation_fn = None
    elif hasattr(torch.nn.functional, args.tts_output_activation):
        output_activation_fn = getattr(torch.nn.functional,
                                       args.tts_output_activation)
    else:
        raise ValueError('there is no such an activation function. (%s)' %
                         args.tts_output_activation)

    # translate the tts_* option names into Tacotron2's own option names
    taco_cfg = {
        'spk_embed_dim': args.tts_spk_embed_dim,
        'embed_dim': args.tts_embed_dim,
        'elayers': args.tts_elayers,
        'eunits': args.tts_eunits,
        'econv_layers': args.tts_econv_layers,
        'econv_chans': args.tts_econv_chans,
        'econv_filts': args.tts_econv_filts,
        'dlayers': args.tts_dlayers,
        'dunits': args.tts_dunits,
        'prenet_layers': args.tts_prenet_layers,
        'prenet_units': args.tts_prenet_units,
        'postnet_layers': args.tts_postnet_layers,
        'postnet_chans': args.tts_postnet_chans,
        'postnet_filts': args.tts_postnet_filts,
        'output_activation': output_activation_fn,
        'adim': args.tts_adim,
        'aconv_chans': args.tts_aconv_chans,
        'aconv_filts': args.tts_aconv_filts,
        'cumulate_att_w': args.tts_cumulate_att_w,
        'use_batch_norm': args.tts_use_batch_norm,
        'use_concate': args.tts_use_concate,
        'dropout': args.tts_dropout_rate,
        'zoneout': args.tts_zoneout_rate,
        'monotonic': args.tts_monotonic,
    }
    e2e_tts = Tacotron2(idim=odim_asr, odim=idim_tts,
                        args=Namespace(**taco_cfg))
    return Tacotron2Loss(model=e2e_tts,
                         use_masking=args.tts_use_masking,
                         bce_pos_weight=args.tts_bce_pos_weight)
def test_tacotron2_trainable_and_decodable(model_dict, loss_dict):
    """Train one step and decode, covering CBHG / speaker-embedding branches."""
    # make args
    model_args = make_model_args(**model_dict)
    loss_args = make_loss_args(**loss_dict)
    inference_args = make_inference_args()

    # setup a toy batch; enable optional branches with fixed dimensions
    batch_size, max_in, max_out = 2, 10, 10
    idim, odim = 5, 10
    if model_args['use_cbhg']:
        model_args['spc_dim'] = 129
    if model_args['use_speaker_embedding']:
        model_args['spk_embed_dim'] = 128
    xs, ilens, ys, labels, olens, spembs, spcs = prepare_inputs(
        batch_size, idim, odim, max_in, max_out,
        model_args['spk_embed_dim'], model_args['spc_dim'])

    # define model
    model = Tacotron2(idim, odim, Namespace(**model_args))
    criterion = Tacotron2Loss(model, **loss_args)
    optimizer = torch.optim.Adam(model.parameters())

    # trainable: one full forward/backward/update cycle
    loss = criterion(xs, ilens, ys, labels, olens, spembs, spcs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # decodable: inference and attention computation without gradients
    model.eval()
    with torch.no_grad():
        spemb = spembs[0] if model_args['spk_embed_dim'] is not None else None
        model.inference(xs[0][:ilens[0]], Namespace(**inference_args), spemb)
        att_ws = model.calculate_all_attentions(xs, ilens, ys, spembs)
    assert att_ws.shape[0] == batch_size
    assert att_ws.shape[1] == max(olens)
    assert att_ws.shape[2] == max(ilens)
def train(args):
    '''RUN TRAINING

    Builds a Tacotron2 model from parsed command-line options, wires it
    into a chainer-style Trainer with evaluation/snapshot/plot extensions,
    and runs the training loop.

    :param Namespace args: parsed command-line options
    '''
    # seed setting
    torch.manual_seed(args.seed)

    # use determinisitic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())

    # reverse input and output dimension (TTS consumes the json's "output")
    idim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    if args.use_cbhg:
        args.spc_dim = int(valid_json[utts[0]]['input'][1]['shape'][1])
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        # FIX: added missing space so the path is not fused onto "to"
        logging.info('writing a model config file to ' + model_conf)
        f.write(json.dumps((idim, odim, vars(args)),
                           indent=4, sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    tacotron2 = Tacotron2(idim, odim, args)
    logging.info(tacotron2)

    # check the use of multi-gpu
    if args.ngpu > 1:
        tacotron2 = torch.nn.DataParallel(tacotron2,
                                          device_ids=list(range(args.ngpu)))
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * args.ngpu))
        args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    tacotron2 = tacotron2.to(device)

    # define loss
    model = Tacotron2Loss(tacotron2, args.use_masking, args.bce_pos_weight)
    reporter = model.reporter

    # Setup an optimizer
    optimizer = torch.optim.Adam(model.parameters(), args.lr, eps=args.eps,
                                 weight_decay=args.weight_decay)

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, 'target', reporter)
    setattr(optimizer, 'serialize', lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(True, args.use_speaker_embedding,
                                args.use_cbhg)

    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']

    # make minibatch list (variable length)
    train_batchset = make_batchset(train_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)
    valid_batchset = make_batchset(valid_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)

    # hack to make batchsze argument as 1
    # actual bathsize is included in a list
    train_iter = chainer.iterators.MultiprocessIterator(
        TransformDataset(train_batchset, converter.transform),
        batch_size=1, n_processes=2, n_prefetch=8, maxtasksperchild=20)
    valid_iter = chainer.iterators.MultiprocessIterator(
        TransformDataset(valid_batchset, converter.transform),
        batch_size=1, repeat=False, shuffle=False,
        n_processes=2, n_prefetch=8, maxtasksperchild=20)

    # Set up a trainer
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter, device)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, converter, device))

    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=(1, 'epoch'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(tacotron2, 'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        # unwrap DataParallel to reach the attention hook
        if hasattr(tacotron2, "module"):
            att_vis_fn = tacotron2.module.calculate_all_attentions
        else:
            att_vis_fn = tacotron2.calculate_all_attentions
        # NOTE(review): this converter gets only two args (no use_cbhg) —
        # presumably it relies on a default; confirm against CustomConverter
        trainer.extend(PlotAttentionReport(
            att_vis_fn, data, args.outdir + '/att_ws',
            converter=CustomConverter(False, args.use_speaker_embedding),
            device=device, reverse=True), trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    plot_keys = [
        'main/loss', 'validation/main/loss', 'main/l1_loss',
        'validation/main/l1_loss', 'main/mse_loss',
        'validation/main/mse_loss', 'main/bce_loss',
        'validation/main/bce_loss'
    ]
    trainer.extend(
        extensions.PlotReport(['main/l1_loss', 'validation/main/l1_loss'],
                              'epoch', file_name='l1_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/bce_loss', 'validation/main/bce_loss'],
                              'epoch', file_name='bce_loss.png'))
    if args.use_cbhg:
        plot_keys += [
            'main/cbhg_l1_loss', 'validation/main/cbhg_l1_loss',
            'main/cbhg_mse_loss', 'validation/main/cbhg_mse_loss'
        ]
        trainer.extend(
            extensions.PlotReport(
                ['main/cbhg_l1_loss', 'validation/main/cbhg_l1_loss'],
                'epoch', file_name='cbhg_l1_loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/cbhg_mse_loss', 'validation/main/cbhg_mse_loss'],
                'epoch', file_name='cbhg_mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(plot_keys, 'epoch', file_name='loss.png'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(REPORT_INTERVAL, 'iteration')))
    report_keys = plot_keys[:]
    report_keys[0:0] = ['epoch', 'iteration', 'elapsed_time']
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(REPORT_INTERVAL, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=REPORT_INTERVAL))

    # Run the training
    trainer.run()
def train(args):
    '''RUN TRAINING

    Builds a Tacotron2 model directly from command-line options, wires it
    into a chainer Trainer with evaluation/snapshot/plot extensions, and
    runs the training loop.

    :param Namespace args: parsed command-line options
    '''
    # seed setting
    torch.manual_seed(args.seed)

    # use determinisitic computation or not
    if args.debugmode < 1:
        torch.backends.cudnn.deterministic = False
        logging.info('torch cudnn deterministic is disabled')
    else:
        torch.backends.cudnn.deterministic = True

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())

    # reverse input and output dimension (TTS consumes the json's "output")
    idim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.conf'
    with open(model_conf, 'wb') as f:
        # FIX: added missing space so the path is not fused onto "to"
        logging.info('writing a model config file to ' + model_conf)
        # NOTE(review): pickles the full args namespace; only unpickle this
        # file from trusted sources
        pickle.dump((idim, odim, args), f)
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # define output activation function
    if args.output_activation is None:
        output_activation_fn = None
    elif hasattr(torch.nn.functional, args.output_activation):
        output_activation_fn = getattr(torch.nn.functional,
                                       args.output_activation)
    else:
        raise ValueError('there is no such an activation function. (%s)' %
                         args.output_activation)

    # specify model architecture
    tacotron2 = Tacotron2(idim=idim,
                          odim=odim,
                          spk_embed_dim=args.spk_embed_dim,
                          embed_dim=args.embed_dim,
                          elayers=args.elayers,
                          eunits=args.eunits,
                          econv_layers=args.econv_layers,
                          econv_chans=args.econv_chans,
                          econv_filts=args.econv_filts,
                          dlayers=args.dlayers,
                          dunits=args.dunits,
                          prenet_layers=args.prenet_layers,
                          prenet_units=args.prenet_units,
                          postnet_layers=args.postnet_layers,
                          postnet_chans=args.postnet_chans,
                          postnet_filts=args.postnet_filts,
                          output_activation_fn=output_activation_fn,
                          adim=args.adim,
                          aconv_chans=args.aconv_chans,
                          aconv_filts=args.aconv_filts,
                          cumulate_att_w=args.cumulate_att_w,
                          use_batch_norm=args.use_batch_norm,
                          use_concate=args.use_concate,
                          dropout=args.dropout_rate,
                          zoneout=args.zoneout_rate)
    logging.info(tacotron2)

    # Set gpu
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        tacotron2.cuda()
    elif ngpu > 1:
        gpu_id = range(ngpu)
        logging.info('gpu id: ' + str(gpu_id))
        tacotron2 = torch.nn.DataParallel(tacotron2, device_ids=gpu_id)
        tacotron2.cuda()
        logging.info('batch size is automatically increased (%d -> %d)' % (
            args.batch_size, args.batch_size * ngpu))
        args.batch_size *= ngpu
    else:
        gpu_id = [-1]

    # define loss
    model = Tacotron2Loss(model=tacotron2,
                          use_masking=args.use_masking,
                          bce_pos_weight=args.bce_pos_weight)
    reporter = model.reporter

    # Setup an optimizer
    optimizer = torch.optim.Adam(model.parameters(), args.lr, eps=args.eps,
                                 weight_decay=args.weight_decay)

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, 'target', reporter)
    setattr(optimizer, 'serialize', lambda s: reporter.serialize(s))

    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']

    # make minibatch list (variable length)
    train_batchset = make_batchset(train_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)
    valid_batchset = make_batchset(valid_json, args.batch_size,
                                   args.maxlen_in, args.maxlen_out,
                                   args.minibatches, args.batch_sort_key)

    # hack to make batchsze argument as 1
    # actual bathsize is included in a list
    train_iter = chainer.iterators.SerialIterator(train_batchset, 1)
    valid_iter = chainer.iterators.SerialIterator(valid_batchset, 1,
                                                  repeat=False, shuffle=False)

    # Set up a trainer
    converter = CustomConverter(gpu_id, True, args.use_speaker_embedding)
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info('restored from %s' % args.resume)
        chainer.serializers.load_npz(args.resume, trainer)
        torch_load(args.outdir + '/model.ep.%d' % trainer.updater.epoch,
                   tacotron2)
        model = trainer.updater.model

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, converter))

    # Take a snapshot for each specified epoch
    trainer.extend(
        extensions.snapshot(filename='snapshot.ep.{.updater.epoch}'),
        trigger=(1, 'epoch'))

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        trainer.extend(PlotAttentionReport(
            tacotron2, data, args.outdir + '/att_ws',
            CustomConverter(gpu_id, False, args.use_speaker_embedding), True),
            trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport([
            'main/loss', 'validation/main/loss', 'main/l1_loss',
            'validation/main/l1_loss', 'main/mse_loss',
            'validation/main/mse_loss', 'main/bce_loss',
            'validation/main/bce_loss'
        ], 'epoch', file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/l1_loss', 'validation/main/l1_loss'],
                              'epoch', file_name='l1_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/mse_loss', 'validation/main/mse_loss'],
                              'epoch', file_name='mse_loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/bce_loss', 'validation/main/bce_loss'],
                              'epoch', file_name='bce_loss.png'))

    # Save model for each epoch
    trainer.extend(
        extensions.snapshot_object(tacotron2, 'model.ep.{.updater.epoch}',
                                   savefun=torch_save),
        trigger=(1, 'epoch'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(tacotron2, 'model.loss.best',
                                   savefun=torch_save),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'elapsed_time', 'main/loss', 'main/l1_loss',
        'main/mse_loss', 'main/bce_loss', 'validation/main/loss',
        'validation/main/l1_loss', 'validation/main/mse_loss',
        'validation/main/bce_loss'
    ]
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(100, 'iteration'))
    trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()