def main():
    batch_size = 64
    tacotron = Tacotron(batch_size=batch_size)
    # Count training examples to determine how many full batches fit in one pass.
    num_lines = sum(1 for _ in open("training/train.txt"))
    max_index = num_lines // batch_size
    index = 0  # initialize once, before the loop, so batches actually advance
    while True:
        inputs, linear_targets, mel_targets = get_data(index, batch_size)
        tacotron.train(inputs, linear_targets, mel_targets)
        index += 1
        if index >= max_index:
            index = 0  # wrap around and start the next epoch
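# The loop above leans on a `get_data` helper that is not shown. A minimal
# sketch of what it might look like, assuming training/train.txt lists one
# example per line as "text|linear_path|mel_path" -- the metadata layout and
# file format here are assumptions, not part of the original snippet.
import numpy as np

def get_data(index, batch_size):
    """Hypothetical batch loader: returns (inputs, linear_targets, mel_targets)
    for the index-th batch listed in training/train.txt."""
    with open("training/train.txt") as f:
        lines = f.read().splitlines()
    batch = lines[index * batch_size:(index + 1) * batch_size]
    inputs, linear_targets, mel_targets = [], [], []
    for line in batch:
        text, linear_path, mel_path = line.split("|")
        inputs.append(text)
        linear_targets.append(np.load(linear_path))
        mel_targets.append(np.load(mel_path))
    return inputs, linear_targets, mel_targets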
def main():
    g = Tacotron()
    print("Training Graph loaded")
    with g.graph.as_default():
        # Training
        sv = tf.train.Supervisor(logdir=hp.logdir, save_model_secs=0)
        with sv.managed_session() as sess:
            for epoch in range(1, hp.num_epochs + 1):
                if sv.should_stop():
                    break
                for step in tqdm(range(g.num_batch), total=g.num_batch,
                                 ncols=70, leave=False, unit='b'):
                    sess.run(g.train_op)

                # Write checkpoint files at every epoch
                gs = sess.run(g.global_step)
                sv.saver.save(
                    sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))
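# `tf.train.Supervisor` is a deprecated TF1 API. A rough sketch of the same
# loop on `MonitoredTrainingSession` -- an assumption, not a drop-in
# replacement: it checkpoints on a timer rather than once per epoch, and it
# requires the graph to define a global step (which this one does, via
# g.global_step).
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def main():
    g = Tacotron()
    with g.graph.as_default():
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=hp.logdir,
                save_checkpoint_secs=600) as sess:
            for epoch in range(1, hp.num_epochs + 1):
                for _ in range(g.num_batch):
                    sess.run(g.train_op)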
def main():
    config = ConfigXT()
    load = FileXT(config.audio_path)
    print(colored('Preprocessing audio for ', 'blue', attrs=['bold']) + load.basename)
    data = preprocess.preprocess(load.filename, config.speaker, config, verbose=False)
    dataloader = dataprocess.load_infer(data)

    model = Tacotron(config)
    model.load_state_dict(
        torch.load(config.model_path, map_location='cpu')['state_dict'])
    model = set_device(model, config.device)
    model.eval()

    print(colored('Generating mel-spectrogram with ', 'blue', attrs=['bold'])
          + config.model_path)
    mel = []
    y_prev = set_device(torch.zeros(1, config.mel_size, 1), config.device)
    with torch.no_grad():  # inference only; no autograd graph needed
        for batch in tqdm(dataloader, leave=False, ascii=True):
            # Each batch supplies its own previous-frame tensor.
            x, y_prev, _ = set_device(batch, config.device)
            y_gen, _ = model(x, y_prev)
            mel.append(y_gen.data)
            y_prev = y_gen[..., -1].unsqueeze(-1)  # carry last frame forward
    mel = torch.cat(mel, dim=-1)

    if config.vocoder == 'wavernn':
        wave = wavernn_infer(mel, config)
    elif config.vocoder == 'waveglow':
        wave = waveglow_infer(mel, config)

    savename = (config.model_path.replace('.pt', '_')
                + FileXT(config.vocoder_path).basestem
                + '_speaker' + str(config.speaker) + '_' + load.basename)
    torchaudio.save(savename, wave, config.sample_rate)
    print(colored('Audio generated to ', 'blue', attrs=['bold']) + savename)
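# This script (and the training script further below) relies on a `set_device`
# helper that moves a module, a tensor, or a batch of tensors to the target
# device. A minimal sketch of such a helper -- an assumption; the real
# ConfigXT/FileXT codebase may implement it differently.
import torch

def set_device(obj, device):
    """Hypothetical helper: recursively move a module, tensor, or (nested)
    collection of tensors to `device` and return it."""
    if isinstance(obj, (list, tuple)):
        return type(obj)(set_device(item, device) for item in obj)
    return obj.to(device)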
# From https://pypi.org/project/tacotron/
import torch
import soundfile as sf
from univoc import Vocoder
from tacotron import load_cmudict, text_to_id, Tacotron

# download pretrained weights for the vocoder (and optionally move to GPU)
vocoder = Vocoder.from_pretrained(
    "https://github.com/bshall/UniversalVocoding/releases/download/v0.2/univoc-ljspeech-7mtpaq.pt"
).cuda()

# download pretrained weights for tacotron (and optionally move to GPU)
tacotron = Tacotron.from_pretrained(
    "https://github.com/bshall/Tacotron/releases/download/v0.1/tacotron-ljspeech-yspjx3.pt"
).cuda()

# load the CMU pronouncing dictionary
cmudict = load_cmudict()

text = "Your glasses are in Bangladesh."

# convert text to phone ids
x = torch.LongTensor(text_to_id(text, cmudict)).unsqueeze(0).cuda()

# synthesize audio
with torch.no_grad():
    mel, _ = tacotron.generate(x)
    wav, sr = vocoder.generate(mel.transpose(1, 2))

# write the waveform to disk ("generated.wav" is a placeholder filename)
sf.write("generated.wav", wav, sr)
def main():
    tacotron = Tacotron(1, is_training=False)
    text_input = input()
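# The snippet above stops right after reading the text. A hedged sketch of how
# it might continue -- `synthesize` and `save_wav` are hypothetical names, not
# confirmed methods of this Tacotron class.
def main():
    tacotron = Tacotron(1, is_training=False)  # batch size 1 for inference
    text_input = input()
    # Hypothetical continuation: assumed API, not part of the original snippet.
    wav = tacotron.synthesize(text_input)
    save_wav(wav, "output.wav")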
def train_model(args):
    with open("tacotron/config.toml") as file:
        cfg = toml.load(file)

    tensorboard_path = Path("tensorboard") / args.checkpoint_dir
    checkpoint_dir = Path(args.checkpoint_dir)
    writer = SummaryWriter(tensorboard_path)

    tacotron = Tacotron(**cfg["model"]).cuda()
    optimizer = optim.Adam(tacotron.parameters(), lr=cfg["train"]["optimizer"]["lr"])
    scaler = amp.GradScaler()
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer=optimizer,
        milestones=cfg["train"]["scheduler"]["milestones"],
        gamma=cfg["train"]["scheduler"]["gamma"],
    )

    if args.resume is not None:
        global_step = load_checkpoint(
            tacotron=tacotron,
            optimizer=optimizer,
            scaler=scaler,
            scheduler=scheduler,
            load_path=args.resume,
        )
    else:
        global_step = 0

    root_path = Path(args.dataset_dir)
    text_path = Path(args.text_path)

    dataset = TTSDataset(root_path, text_path)
    sampler = samplers.RandomSampler(dataset)
    batch_sampler = BucketBatchSampler(
        sampler=sampler,
        batch_size=cfg["train"]["batch_size"],
        drop_last=True,
        sort_key=dataset.sort_key,
        bucket_size_multiplier=cfg["train"]["bucket_size_multiplier"],
    )
    collate_fn = partial(
        pad_collate, reduction_factor=cfg["model"]["decoder"]["reduction_factor"])
    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=collate_fn,
        num_workers=cfg["train"]["n_workers"],
        pin_memory=True,
    )

    n_epochs = cfg["train"]["n_steps"] // len(loader) + 1
    start_epoch = global_step // len(loader) + 1

    for epoch in range(start_epoch, n_epochs + 1):
        average_loss = 0
        for i, (mels, texts, mel_lengths, text_lengths,
                attn_flag) in enumerate(tqdm(loader), 1):
            mels, texts = mels.cuda(), texts.cuda()

            optimizer.zero_grad()

            with amp.autocast():
                ys, alphas = tacotron(texts, mels)
                loss = F.l1_loss(ys, mels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_norm_(tacotron.parameters(), cfg["train"]["clip_grad_norm"])
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            global_step += 1

            # Running mean of the loss over the epoch.
            average_loss += (loss.item() - average_loss) / i

            if global_step % cfg["train"]["checkpoint_interval"] == 0:
                save_checkpoint(
                    tacotron=tacotron,
                    optimizer=optimizer,
                    scaler=scaler,
                    scheduler=scheduler,
                    step=global_step,
                    checkpoint_dir=checkpoint_dir,
                )

            if attn_flag:
                index = attn_flag[0]
                alpha = alphas[index, :text_lengths[index], :mel_lengths[index] // 2]
                alpha = alpha.detach().cpu().numpy()
                y = ys[index, :, :].detach().cpu().numpy()
                log_alignment(alpha, y, cfg["preprocess"], writer, global_step)

        writer.add_scalar("loss", average_loss, global_step)
        print(f"epoch {epoch} : loss {average_loss:.4f} : {scheduler.get_last_lr()}")
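# `pad_collate` is not shown. A sketch of what it plausibly does, given that it
# is partially applied with `reduction_factor`: pad texts to the batch maximum
# and pad mels so their length is a multiple of the decoder's reduction factor.
# The per-item layout (mel, text, attn_flag) is an assumption about TTSDataset.
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch, reduction_factor=2):
    """Hypothetical collate function matching the loader's 5-tuple output."""
    mels, texts, flags = zip(*batch)
    mel_lengths = [m.size(-1) for m in mels]
    text_lengths = [t.size(0) for t in texts]

    # Round the padded mel length up to a multiple of the reduction factor,
    # since the decoder emits reduction_factor frames per step.
    max_mel = max(mel_lengths)
    max_mel = (max_mel + reduction_factor - 1) // reduction_factor * reduction_factor

    padded_mels = torch.zeros(len(mels), mels[0].size(0), max_mel)
    for i, m in enumerate(mels):
        padded_mels[i, :, :m.size(-1)] = m

    padded_texts = pad_sequence(texts, batch_first=True)
    attn_flag = [i for i, f in enumerate(flags) if f]
    return padded_mels, padded_texts, mel_lengths, text_lengths, attn_flag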
def main():
    config = ConfigXT()
    config_basename = FileXT(config.file).basename
    print("Configuration file: %s" % (config_basename))

    checkpoint_path = config.checkpoint_path
    if not config.test_run:
        checkpoint_path = FileXT(config.checkpoint_path, '').create_path()
        config.save(os.path.join(checkpoint_path, config_basename))
        writer = SummaryWriter(checkpoint_path)

    dataloader = dataprocess.load_train(config)
    model = Tacotron(config)
    model = set_device(model, config.device)

    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learn_rate,
                                 weight_decay=config.weight_decay)
    scheduler = StepLR(optimizer,
                       step_size=len(dataloader.train) * config.step_size,
                       gamma=config.factor)

    losses = []
    loss_train = LossLog()
    loss_valid = LossLog()
    for epoch in range(config.stop_epoch):
        # Train loop
        model.train()
        for batch in tqdm(dataloader.train, leave=False, ascii=True):
            x, y_prev, y = set_device(batch, config.device)

            optimizer.zero_grad()
            y_gen, y_decoder_gen = model(x, y_prev)
            # Supervise both the post-net output and the raw decoder output.
            loss = criterion(y_gen, y) + criterion(y_decoder_gen, y)
            loss.backward()

            if config.clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), config.clip_grad_norm)

            optimizer.step()
            scheduler.step()

            loss_train.add(loss.item(), y[0].size(0))
            if not config.test_run:
                writer.add_scalar('train/l1_loss', loss.item(),
                                  loss_train.iteration)

        # Validation loop
        model.eval()
        with torch.no_grad():  # no gradients needed for validation
            for batch in tqdm(dataloader.valid, leave=False, ascii=True):
                x, y_prev, y = set_device(batch, config.device)
                y_gen, y_decoder_gen = model(x, y_prev)
                loss = criterion(y_gen, y) + criterion(y_decoder_gen, y)

                loss_valid.add(loss.item(), y[0].size(0))
                if not config.test_run:
                    writer.add_scalar('valid/l1_loss', loss.item(),
                                      loss_valid.iteration)

        learn_rate = scheduler.get_last_lr()[0]
        print("[Epoch %d/%d] [loss train: %.5f] [loss valid: %.5f] [lr: %.5f]"
              % (epoch, config.stop_epoch, loss_train.avg(), loss_valid.avg(),
                 learn_rate))

        losses.append([loss_train.avg(), loss_valid.avg()])
        loss_train.reset()
        loss_valid.reset()

        if not config.test_run:
            loss_savename = os.path.join(checkpoint_path, 'loss.pt')
            torch.save(losses, loss_savename)

            savename = os.path.join(checkpoint_path, 'latest_checkpoint.pt')
            save_checkpoint(savename, model, optimizer, learn_rate,
                            loss_train.iteration)

            if epoch % config.save_epoch == 0:
                savename = os.path.join(checkpoint_path,
                                        'epoch' + str(epoch) + '.pt')
                save_checkpoint(savename, model, optimizer, learn_rate,
                                loss_train.iteration)
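# `LossLog` is an unshown helper. A minimal sketch consistent with how it is
# used above (add(loss, batch_size), avg(), reset(), and an `iteration`
# counter that keeps growing across epochs) -- the real class may differ.
class LossLog:
    """Hypothetical running-loss tracker matching the usage above."""

    def __init__(self):
        self.iteration = 0  # total batches seen; never reset
        self.reset()

    def reset(self):
        self.total = 0.0
        self.count = 0

    def add(self, loss, batch_size):
        self.total += loss * batch_size
        self.count += batch_size
        self.iteration += 1

    def avg(self):
        return self.total / max(self.count, 1)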
def train_model(cfg):
    tensorboard_path = Path(utils.to_absolute_path("tensorboard")) / cfg.checkpoint_dir
    checkpoint_dir = Path(utils.to_absolute_path(cfg.checkpoint_dir))
    writer = SummaryWriter(tensorboard_path)

    tacotron = Tacotron(**cfg.model).cuda()
    optimizer = optim.Adam(tacotron.parameters(), lr=cfg.train.optimizer.lr)
    scaler = amp.GradScaler()
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer=optimizer,
        milestones=cfg.train.scheduler.milestones,
        gamma=cfg.train.scheduler.gamma,
    )

    if cfg.resume:
        resume_path = utils.to_absolute_path(cfg.resume)
        global_step = load_checkpoint(
            tacotron=tacotron,
            optimizer=optimizer,
            scaler=scaler,
            scheduler=scheduler,
            load_path=resume_path,
        )
    else:
        global_step = 0

    root_path = Path(utils.to_absolute_path(cfg.dataset_dir))
    text_path = Path(utils.to_absolute_path(cfg.text_path))

    dataset = TTSDataset(root_path, text_path)
    sampler = samplers.RandomSampler(dataset)
    batch_sampler = BucketBatchSampler(
        sampler=sampler,
        batch_size=cfg.train.batch_size,
        drop_last=True,
        sort_key=dataset.sort_key,
        bucket_size_multiplier=cfg.train.bucket_size_multiplier,
    )
    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=pad_collate,
        num_workers=cfg.train.n_workers,
        pin_memory=True,
    )

    n_epochs = cfg.train.n_steps // len(loader) + 1
    start_epoch = global_step // len(loader) + 1

    for epoch in range(start_epoch, n_epochs + 1):
        average_loss = 0
        for i, (mels, texts, mel_lengths, text_lengths, attn_flag) in enumerate(
            tqdm(loader), 1
        ):
            mels, texts = mels.cuda(), texts.cuda()

            optimizer.zero_grad()

            with amp.autocast():
                ys, alphas = tacotron(texts, mels)
                loss = F.l1_loss(ys, mels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            grad_norm = clip_grad_norm_(tacotron.parameters(), cfg.train.clip_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            global_step += 1

            average_loss += (loss.item() - average_loss) / i

            if global_step % cfg.train.checkpoint_interval == 0:
                save_checkpoint(
                    tacotron=tacotron,
                    optimizer=optimizer,
                    scaler=scaler,
                    scheduler=scheduler,
                    step=global_step,
                    checkpoint_dir=checkpoint_dir,
                )

            if attn_flag:
                index = attn_flag[0]
                alpha = alphas[index, : text_lengths[index], : mel_lengths[index] // 2]
                alpha = alpha.detach().cpu().numpy()
                y = ys[index, :, :].detach().cpu().numpy()
                log_alignment(alpha, y, cfg.preprocess, writer, global_step)

        writer.add_scalar("loss", average_loss, global_step)
        print(
            f"epoch {epoch} : average loss {average_loss:.4f} : {scheduler.get_last_lr()}"
        )
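# Both train_model variants above call save_checkpoint/load_checkpoint without
# showing them. A minimal sketch of a matching pair; the file-naming scheme is
# an assumption.
import torch
from pathlib import Path

def save_checkpoint(tacotron, optimizer, scaler, scheduler, step, checkpoint_dir):
    """Hypothetical checkpoint writer matching the call sites above."""
    checkpoint_dir = Path(checkpoint_dir)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    torch.save(
        {
            "tacotron": tacotron.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scaler": scaler.state_dict(),
            "scheduler": scheduler.state_dict(),
            "step": step,
        },
        checkpoint_dir / f"model-{step}.pt",  # naming scheme is an assumption
    )

def load_checkpoint(tacotron, optimizer, scaler, scheduler, load_path):
    """Hypothetical checkpoint loader; returns the saved global step."""
    checkpoint = torch.load(load_path, map_location="cpu")
    tacotron.load_state_dict(checkpoint["tacotron"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    scaler.load_state_dict(checkpoint["scaler"])
    scheduler.load_state_dict(checkpoint["scheduler"])
    return checkpoint["step"]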