def test_train_step():
    """Train a GST Tacotron for ten steps on random data and check all parameters moved.

    A batch of 8 random character sequences, mel/linear spectrograms, lengths
    and speaker ids is generated. A deep copy of the freshly built model is
    kept as a reference. After ten Adam steps on the sum of the masked L1 mel
    loss, the masked L1 linear loss and the stop-token BCE-with-logits loss,
    every parameter must differ from its reference copy.
    """
    input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
    input_lengths = torch.randint(100, 129, (8, )).long().to(device)
    input_lengths[-1] = 128
    mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
    linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device)
    mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
    mel_lengths[-1] = 120
    stop_targets = torch.zeros(8, 120, 1).float().to(device)
    speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

    # Mark the frames at or beyond each sample's own length as stop frames.
    # (Writing to every row on each iteration, as before, made all rows use
    # the shortest length in the batch.)
    for sample_idx, mel_length in enumerate(mel_lengths):
        stop_targets[sample_idx, int(mel_length.item()):, 0] = 1.0

    # Group frames into chunks of c.r; a chunk is a stop chunk if any of its
    # frames is a stop frame.
    stop_targets = stop_targets.view(input_dummy.shape[0],
                                     stop_targets.size(1) // c.r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

    criterion = L1LossMasked(seq_len_norm=False).to(device)
    criterion_st = nn.BCEWithLogitsLoss().to(device)
    model = Tacotron(
        num_chars=32,
        num_speakers=5,
        gst=True,
        postnet_output_dim=c.audio['num_freq'],
        decoder_output_dim=c.audio['num_mels'],
        r=c.r,
        memory_size=c.memory_size
    ).to(device)
    model.train()
    print(model)
    print(" > Num parameters for Tacotron GST model:%s" %
          (count_parameters(model)))

    # Sanity check: the reference copy starts out identical to the model.
    model_ref = copy.deepcopy(model)
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        assert (param - param_ref).sum() == 0, param

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    for _ in range(10):
        mel_out, linear_out, align, stop_tokens = model.forward(
            input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
        optimizer.zero_grad()
        loss = criterion(mel_out, mel_spec, mel_lengths)
        stop_loss = criterion_st(stop_tokens, stop_targets)
        loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
        loss.backward()
        optimizer.step()

    # check parameter changes (no parameter is exempted in this test)
    for count, (param, param_ref) in enumerate(
            zip(model.parameters(), model_ref.parameters())):
        assert (param != param_ref).any(), \
            "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
def test_in_out(self):
    """Check L1LossMasked on constant and random inputs, with and without padding.

    Four cases are exercised: identical input/target (loss 0), all-ones vs
    all-zeros at full length (loss 1), the same with corrupted padded frames
    (loss still 1), and a random input compared with itself with corrupted
    padded frames (loss still 0).

    Failure messages read the scalar with ``output.item()``, matching the
    assertions themselves; indexing a 0-dim tensor with ``[0]`` would raise
    and hide the real assertion failure.
    """
    # test input == target
    layer = L1LossMasked()
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.ones(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    assert output.item() == 0.0

    # test input != target
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())

    # test if padded values of input makes any difference
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.arange(5, 9)).long()
    # Offset of -100 wherever sequence_mask is 0, and 0 wherever it is 1
    # (presumably 1 marks valid frames -- confirm against sequence_mask).
    mask = (
        (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
    output = layer(dummy_input + mask, dummy_target, dummy_length)
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())

    # Random input compared with itself: the offset frames must not contribute.
    dummy_input = T.rand(4, 8, 128).float()
    dummy_target = dummy_input.detach()
    dummy_length = (T.arange(5, 9)).long()
    mask = (
        (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
    output = layer(dummy_input + mask, dummy_target, dummy_length)
    assert output.item() == 0, "0 vs {}".format(output.item())
def test_train_step(self):
    """Run five Adam steps on a Tacotron with random data and check parameters moved.

    The loss is the equal-weight average of the masked L1 loss on the mel
    output and on the linear output. After training, every parameter except
    those at positions 139 and 59 must differ from a deep copy taken before
    training.
    """
    char_ids = torch.randint(0, 24, (8, 128)).long().to(device)
    mel_spec = torch.rand(8, 30, c.num_mels).to(device)
    linear_spec = torch.rand(8, 30, c.num_freq).to(device)
    mel_lengths = torch.randint(20, 30, (8, )).long().to(device)

    criterion = L1LossMasked().to(device)
    model = Tacotron(c.embedding_size, c.num_freq, c.num_mels, c.r).to(device)
    model.train()

    # Reference snapshot; it must start out identical to the live model.
    model_ref = copy.deepcopy(model)
    for live, frozen in zip(model.parameters(), model_ref.parameters()):
        assert (live - frozen).sum() == 0, live

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    for _ in range(5):
        mel_out, linear_out, align = model.forward(char_ids, mel_spec)
        optimizer.zero_grad()
        mel_loss = criterion(mel_out, mel_spec, mel_lengths)
        loss = 0.5 * mel_loss + 0.5 * criterion(linear_out, linear_spec,
                                                mel_lengths)
        loss.backward()
        optimizer.step()

    # check parameter changes
    param_pairs = zip(model.parameters(), model_ref.parameters())
    for position, (live, frozen) in enumerate(param_pairs):
        # ignore pre-higway layer since it works conditional
        if position in [139, 59]:
            continue
        assert (live != frozen).any(
        ), "param {} with shape {} not updated!! \n{}\n{}".format(
            position, live.shape, live, frozen)
def test_train_step(self):
    """Run five Adam steps on a Tacotron with a stop-token head and check parameters moved.

    The loss is the sum of the masked L1 mel loss, the masked L1 linear loss
    and the BCE loss on the stop tokens. On every step the stop tokens are
    checked to lie in [0, 1]. After training, every parameter except those at
    positions 145 and 59 must differ from a deep copy taken before training.
    """
    input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
    mel_spec = torch.rand(8, 30, c.num_mels).to(device)
    linear_spec = torch.rand(8, 30, c.num_freq).to(device)
    mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
    stop_targets = torch.zeros(8, 30, 1).float().to(device)

    # Mark the frames at or beyond each sample's own length as stop frames.
    # (Writing to every row on each iteration, as before, made all rows use
    # the shortest length in the batch.)
    for sample_idx, mel_length in enumerate(mel_lengths):
        stop_targets[sample_idx, int(mel_length.item()):, 0] = 1.0

    # Group frames into chunks of c.r; a chunk is a stop chunk if any of its
    # frames is a stop frame.
    stop_targets = stop_targets.view(input_dummy.shape[0],
                                     stop_targets.size(1) // c.r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

    criterion = L1LossMasked().to(device)
    criterion_st = nn.BCELoss().to(device)
    model = Tacotron(c.embedding_size, c.num_freq, c.num_mels, c.r).to(device)
    model.train()

    # Sanity check: the reference copy starts out identical to the model.
    model_ref = copy.deepcopy(model)
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        assert (param - param_ref).sum() == 0, param

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    for _ in range(5):
        mel_out, linear_out, align, stop_tokens = model.forward(
            input_dummy, mel_spec)
        # BCELoss needs probabilities, so the stop tokens must be in [0, 1].
        assert stop_tokens.data.max() <= 1.0
        assert stop_tokens.data.min() >= 0.0
        optimizer.zero_grad()
        loss = criterion(mel_out, mel_spec, mel_lengths)
        stop_loss = criterion_st(stop_tokens, stop_targets)
        loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
        loss.backward()
        optimizer.step()

    # check parameter changes
    for count, (param, param_ref) in enumerate(
            zip(model.parameters(), model_ref.parameters())):
        # ignore pre-higway layer since it works conditional
        if count not in [145, 59]:
            assert (param != param_ref).any(
            ), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
def test_in_out(self):
    """Check L1LossMasked on constant inputs, with and without padded frames.

    Three cases are exercised: identical input/target (loss 0), all-ones vs
    all-zeros at full length (loss 1), and the same with corrupted padded
    frames (loss still 1).

    Failure messages read the scalar with ``output.item()``, matching the
    assertions themselves; indexing a 0-dim tensor with ``[0]`` would raise
    and hide the real assertion failure.
    """
    layer = L1LossMasked()

    # Identical input and target.
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.ones(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    assert output.item() == 0.0

    # Ones vs zeros, all sequences at full length.
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())

    # Ones vs zeros with lengths 5..8; input is offset by -100 wherever
    # sequence_mask is 0 (presumably the padded frames -- confirm against
    # sequence_mask), which must not change the loss.
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.arange(5, 9)).long()
    mask = (
        (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
    output = layer(dummy_input + mask, dummy_target, dummy_length)
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())
def test_in_out(self):
    """Check L1LossMasked through the legacy ``T.autograd.Variable`` wrapper.

    Covers identical input/target (loss 0), ones vs zeros at full length
    (loss 1), and ones vs zeros with shorter lengths where the input is
    offset by -100 wherever ``_sequence_mask`` is 0 (loss still 1).
    """
    wrap = T.autograd.Variable
    loss_fn = L1LossMasked()

    # Case 1: input equals target.
    inp = wrap(T.ones(4, 8, 128).float())
    tgt = wrap(T.ones(4, 8, 128).float())
    lengths = wrap((T.ones(4) * 8).long())
    output = loss_fn(inp, tgt, lengths)
    # NOTE(review): the next two shape assertions are kept exactly as written;
    # a first dimension of size 0 looks inconsistent with reading
    # output.data[0] right after -- confirm against the targeted torch version.
    assert output.shape[0] == 0
    assert len(output.shape) == 1
    assert output.data[0] == 0.0

    # Case 2: ones vs zeros, every sequence at full length.
    inp = wrap(T.ones(4, 8, 128).float())
    tgt = wrap(T.zeros(4, 8, 128).float())
    lengths = wrap((T.ones(4) * 8).long())
    output = loss_fn(inp, tgt, lengths)
    assert output.data[0] == 1.0, "1.0 vs {}".format(output.data[0])

    # Case 3: ones vs zeros, lengths 5..8, input offset where the mask is 0.
    inp = wrap(T.ones(4, 8, 128).float())
    tgt = wrap(T.zeros(4, 8, 128).float())
    lengths = wrap((T.arange(5, 9)).long())
    offset = (_sequence_mask(lengths).float() - 1.0) * 100.0
    mask = offset.unsqueeze(2)
    output = loss_fn(inp + mask, tgt, lengths)
    assert output.data[0] == 1.0, "1.0 vs {}".format(output.data[0])
def main(args):  #pylint: disable=redefined-outer-name
    """Build the model, optimizers and losses, optionally restore a checkpoint, and train.

    Args:
        args: command-line namespace. ``restore_path`` is read throughout;
            ``rank`` and ``group_id`` are read when more than one GPU is
            used; ``restore_step`` is written (checkpoint step, or 0 when
            not restoring).

    Relies on module-level names visible in this file's scope: ``c`` (config),
    ``num_gpus``, ``use_cuda`` and ``OUT_PATH``.
    """
    # Audio processor
    ap = AudioProcessor(**c.audio)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    if c.use_speaker_embedding:
        speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
        if args.restore_path:
            # Reuse the mapping stored next to the checkpoint being restored.
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping
                       for speaker in speakers), ("As of now you, you cannot "
                                                  "introduce new speakers to "
                                                  "a previously trained model.")
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        # The stopnet gets its own optimizer over its own parameters.
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    # L1 losses for the Tacotron variants, MSE losses for any other model.
    if c.loss_masking:
        criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"
                                                  ] else MSELossMasked()
    else:
        criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
                                               ] else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization path below.
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  #pylint: disable=broad-except
            # Any loading failure falls back to partial initialization.
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        # Always train with the configured learning rate.
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # No best loss is restored anywhere above, so start from infinity.
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler, ap,
                                        global_step, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
                            epoch)
        print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        # Model selection uses the validation loss when evaluation is enabled.
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
# Build the model, load the weights stored under the 'model' key of the
# checkpoint at MODEL_FILE, print the checkpoint's step and switch the model
# to eval mode.
model = setup_model(num_chars, num_speakers, C)
checkpoint = torch.load(MODEL_FILE)
model.load_state_dict(checkpoint['model'])
print(checkpoint['step'])
model.eval()
if use_cuda:
    model = model.cuda()

# ### Generate model outputs
import pickle

# Accumulators; nothing is appended to them in the part of the loop visible
# here (NOTE(review): the loop body presumably continues past this excerpt).
file_idxs = []
losses = []
postnet_losses = []
criterion = L1LossMasked()
for data in tqdm(loader):
    # setup input data
    # Each batch is indexed positionally, fields 0 through 7.
    text_input = data[0]
    text_lengths = data[1]
    speaker_names = data[2]
    # The linear input is only taken for the Tacotron variants.
    linear_input = data[3] if C.model in ["Tacotron", "TacotronGST"] else None
    mel_input = data[4]
    mel_lengths = data[5]
    stop_targets = data[6]
    item_idx = data[7]

    if C.use_speaker_embedding:
        # Look each speaker name up in speaker_mapping.
        speaker_ids = [
            speaker_mapping[speaker_name] for speaker_name in speaker_names
        ]