def main():
    project_path = str(Path(__file__).resolve().parents[0])
    tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    train_dataset = ParallelLanguageDataset(
        project_path + "/data/raw/en/train.txt",
        project_path + "/data/raw/fr/train.txt",
        tokenizer,
        max_seq_length,
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )
    valid_dataset = ParallelLanguageDataset(
        project_path + "/data/raw/en/val.txt",
        project_path + "/data/raw/fr/val.txt",
        tokenizer,
        max_seq_length,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )

    model = LanguageTransformer(
        tokenizer.vocab_size,
        d_model,
        nhead,
        num_encoder_layers,
        num_decoder_layers,
        dim_feedforward,
        max_seq_length,
        pos_dropout,
        trans_dropout,
    ).to("cpu")
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_normal_(p)

    optim = ScheduledOptim(
        Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        d_model, n_warmup_steps)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    train_losses, val_losses = train(train_loader, valid_loader, model, optim, criterion, num_epochs)
def main(**kwargs):
    project_path = str(Path(__file__).resolve().parents[0])
    # train_dataset = ParallelLanguageDataset(project_path + '/data/processed/en/train.pkl',
    #                                         project_path + '/data/processed/fr/train.pkl',
    #                                         kwargs['num_tokens'], kwargs['max_seq_length'])
    # train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)
    # valid_dataset = ParallelLanguageDataset(project_path + '/data/processed/en/val.pkl',
    #                                         project_path + '/data/processed/fr/val.pkl',
    #                                         kwargs['num_tokens'], kwargs['max_seq_length'])
    # valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)
    train_dataset = TranslationDataset(
        project_path + '/data_enru/processed/en/train.pkl',
        project_path + '/data_enru/processed/ru/train.pkl',
        kwargs['num_tokens'], kwargs['max_seq_length'])
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)
    valid_dataset = TranslationDataset(
        project_path + '/data_enru/processed/en/val.pkl',
        project_path + '/data_enru/processed/ru/val.pkl',
        kwargs['num_tokens'], kwargs['max_seq_length'])
    valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)

    model = LanguageTransformer(
        kwargs['vocab_size'], kwargs['d_model'], kwargs['nhead'],
        kwargs['num_encoder_layers'], kwargs['num_decoder_layers'],
        kwargs['dim_feedforward'], kwargs['max_seq_length'],
        kwargs['pos_dropout'], kwargs['trans_dropout']).to('cuda')
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_normal_(p)

    optim = ScheduledOptim(
        Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        kwargs['d_model'], kwargs['n_warmup_steps'])
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    train_losses, val_losses = train(train_loader, valid_loader, model, optim, criterion, kwargs['num_epochs'])
def train(self, maxEpoch):
    if use_cuda:
        print('use cuda')
        self.model = self.model.cuda()
        self.model.use_cuda = True
        self.model.tensor = torch.cuda.LongTensor
    self.model.train()

    # opt = optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09)
    # opt = optim.Adam(self.model.parameters())
    opt = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, self.model.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        config.d_model, config.n_warmup_steps)

    for ep in range(maxEpoch):
        start = time.time()
        print(ep)
        indices = np.random.permutation(len(self.ds_train.idData))
        batches = pack(indices, 64)

        accLoss = 0
        n_word_total = 0
        n_word_correct = 0
        for batch in tqdm(batches):
            opt.zero_grad()
            idLines = [self.ds_train.idData[b] for b in batch]
            loss, n_correct, ts = self.model.getLoss(idLines)

            # backward and update
            loss.backward()
            opt.step_and_update_lr()

            # keep info
            accLoss += loss.item()
            non_pad_mask = ts.ne(config.pad_id)
            n_word = non_pad_mask.sum().item()
            n_word_total += n_word
            n_word_correct += n_correct

        loss_per_word = accLoss / n_word_total
        accuracy = n_word_correct / n_word_total
        print('  - (Train)   ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  ppl=math.exp(min(loss_per_word, 100)),
                  accu=100 * accuracy,
                  elapse=(time.time() - start) / 60))

        self.evaluate()
def train_process(opt):
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.batch_size = opt.b
    device = torch.device('cuda' if opt.cuda else 'cpu')

    data_class = SingleTurnDialog.load_class('OpenSubtitles')
    data_arg = Storage()
    data_arg.file_id = opt.datapath
    data_arg.min_vocab_times = 20

    def load_dataset(data_arg, wvpath, embedding_size):
        dm = data_class(**data_arg)
        return dm

    opt.n_position = 100
    dm = load_dataset(data_arg, None, opt.n_position)

    opt.n_src_vocab = dm.valid_vocab_len
    opt.n_trg_vocab = dm.valid_vocab_len
    opt.n_vocab_size = dm.valid_vocab_len
    opt.src_pad_idx = 0
    opt.trg_pad_idx = 0
    opt.pad_idx = 0

    model = transformer_model(opt, device).to(device)
    n_steps = 0
    optimizer_ = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
    if opt.restore is not None:
        checkpoint = torch.load(opt.restore)
        model.load_state_dict(checkpoint['net'])
        n_steps = checkpoint['n_steps']
        optimizer_.load_state_dict(checkpoint['opt'])
    optimizer = ScheduledOptim(optimizer_, opt.lr, opt.d_model, opt.n_warmup_steps, n_steps)

    dl = cotk.dataloader.OpenSubtitles(opt.datapath, min_vocab_times=data_arg.min_vocab_times)
    train(model, dm, optimizer, device, opt, dl)
def make_model(classes, n_warmup_steps, n_encoder=2, d_dim=256, dropout=0.1,
               l_byte=1500, byte_range=256, h_groups=8):
    # Only 40 bytes are used for classification; l_byte is set to 1500 here simply because that is
    # the MTU. Preprocess.py already truncates/pads every packet to 40 bytes.
    model = SAN(classes, n_encoder, d_dim, dropout, l_byte, byte_range, h_groups)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        d_dim, n_warmup_steps)
    return model.to(torch.device('cuda')), optimizer
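# Note: the examples in this collection assume a ScheduledOptim wrapper around Adam that applies the
# inverse square-root warmup schedule from "Attention Is All You Need". The exact class differs per
# repository (some variants also take an initial lr or a resume step, see other snippets below); the
# following is only a minimal illustrative sketch, not any particular project's implementation.
class ScheduledOptimSketch:
    """Hypothetical minimal warmup scheduler:
    lr = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = d_model ** -0.5
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0

    def zero_grad(self):
        # Delegate gradient clearing to the wrapped optimizer.
        self._optimizer.zero_grad()

    def step_and_update_lr(self):
        # Update the learning rate of every param group, then take an optimizer step.
        self.n_current_steps += 1
        lr = self.init_lr * min(self.n_current_steps ** -0.5,
                                self.n_current_steps * self.n_warmup_steps ** -1.5)
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr
        self._optimizer.step()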
    fusion, out_prob, cross_att = net_model(audio_inputs, video_inputs, args.threshold)
    labels = labels.cpu().data.numpy()
    x_labels = out_prob.cpu().data.numpy()
    acc = compute_acc(labels, x_labels, nb_batch)
    print('[test]acc: ', acc)
    return acc


if __name__ == "__main__":
    args = parser.parse_args()
    print("args: ", args)

    # model and optimizer
    model_name = args.model_name
    if model_name == "PSP":
        net_model = psp_net(128, 512, 128, 29)
    else:
        raise NotImplementedError
    net_model.to(device)

    optimizer = optim.Adam(net_model.parameters(), lr=1e-3)
    optimizer = ScheduledOptim(optimizer)

    if args.train:
        train(args, net_model, optimizer)
    else:
        test_acc = test(args, net_model, model_path=args.trained_model_path)
        print("[test] accuracy: ", test_acc)
def main():
    torch_num_threads = 25
    torch.set_num_threads(torch_num_threads)

    ''' Main function '''
    parser = argparse.ArgumentParser()
    # parser.add_argument('-data', required=True)
    parser.add_argument('-torch_threads', type=int, default=25)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=8)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=8)
    parser.add_argument('-d_inner_hid', type=int, default=8)
    parser.add_argument('-n_warmup_steps', type=int, default=3)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='model')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    # use social network; need features or deepwalk embeddings as initial input
    parser.add_argument('-network', type=int, default=0)
    parser.add_argument('-pos_emb', type=int, default=1)
    parser.add_argument('-warmup', type=int, default=3)  # warmup epochs
    parser.add_argument('-notes', default='')
    parser.add_argument('-data_name', default='twitter')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.network = (opt.network == 1)
    opt.pos_emb = (opt.pos_emb == 1)
    print(opt.notes)

    #========= Preparing DataLoader =========#
    train_data = DataLoader(opt.data_name, data=0, load_dict=True, batch_size=opt.batch_size,
                            cuda=opt.cuda, loadNE=opt.network)
    valid_data = DataLoader(opt.data_name, data=1, batch_size=opt.batch_size,
                            cuda=opt.cuda, loadNE=opt.network)
    test_data = DataLoader(opt.data_name, data=2, batch_size=opt.batch_size,
                           cuda=opt.cuda, loadNE=opt.network)

    opt.user_size = train_data.user_size
    if opt.network:
        opt.net = train_data._adj_list
        opt.net_dict = train_data._adj_dict_list
        opt.embeds = train_data._embeds

    #========= Preparing Model =========#
    # print(opt)
    decoder = RNNModel('GRUCell', opt)
    RLLearner = RRModel(decoder)
    # print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(RLLearner.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(user_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(user_size)
        weight[Constants.PAD] = 0
        weight[Constants.EOS] = 1
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(train_data.user_size)

    if opt.cuda:
        decoder = decoder.cuda()
        RLLearner = RLLearner.cuda()
        crit = crit.cuda()

    train(RLLearner, train_data, valid_data, test_data, crit, optimizer, opt)
def train(opt): """ dataset preparation """ opt.select_data = opt.select_data.split('-') opt.batch_ratio = opt.batch_ratio.split('-') train_dataset = Batch_Balanced_Dataset(opt) AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD) valid_dataset = hierarchical_dataset(root=opt.valid_data, opt=opt) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=opt.batch_size, # 'True' to check training progress with validation function. shuffle=True, num_workers=int(opt.workers), collate_fn=AlignCollate_valid, pin_memory=True) print('-' * 80) """ model configuration """ if 'Transformer' in opt.SequenceModeling: converter = TransformerLabelConverter(opt.character) elif 'CTC' in opt.Prediction: converter = CTCLabelConverter(opt.character) else: converter = AttnLabelConverter(opt.character) opt.num_class = len(converter.character) if opt.rgb: opt.input_channel = 3 model = Model(opt) print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction) # weight initialization for name, param in model.named_parameters(): if 'localization_fc2' in name: print(f'Skip {name} as it is already initialized') continue try: if 'bias' in name: init.constant_(param, 0.0) elif 'weight' in name: init.kaiming_normal_(param) except Exception as e: # for batchnorm. if 'weight' in name: param.data.fill_(1) continue """ setup loss """ if 'Transformer' in opt.SequenceModeling: criterion = transformer_loss elif 'CTC' in opt.Prediction: criterion = torch.nn.CTCLoss(zero_infinity=True).cuda() else: # ignore [GO] token = ignore index 0 criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda() # loss averager loss_avg = Averager() # filter that only require gradient decent filtered_parameters = [] params_num = [] for p in filter(lambda p: p.requires_grad, model.parameters()): filtered_parameters.append(p) params_num.append(np.prod(p.size())) print('Trainable params num : ', sum(params_num)) # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())] # setup optimizer if opt.adam: optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999)) elif 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim: optimizer = optim.Adam(filtered_parameters, betas=(0.9, 0.98), eps=1e-09) optimizer_schedule = ScheduledOptim(optimizer, opt.d_model, opt.n_warmup_steps) else: optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps) print("Optimizer:") print(optimizer) """ final options """ # print(opt) with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file: opt_log = '------------ Options -------------\n' args = vars(opt) for k, v in args.items(): opt_log += f'{str(k)}: {str(v)}\n' opt_log += '---------------------------------------\n' print(opt_log) opt_file.write(opt_log) """ start training """ start_iter = 0 start_time = time.time() best_accuracy = -1 best_norm_ED = 1e+6 pickle.load = partial(pickle.load, encoding="latin1") pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1") if opt.load_weights != '' and check_isfile(opt.load_weights): # load pretrained weights but ignore layers that don't match in size checkpoint = torch.load(opt.load_weights, pickle_module=pickle) if type(checkpoint) == dict: pretrain_dict = checkpoint['state_dict'] else: pretrain_dict = checkpoint model_dict = 
model.state_dict() pretrain_dict = { k: v for k, v in pretrain_dict.items() if k in model_dict and model_dict[k].size() == v.size() } model_dict.update(pretrain_dict) model.load_state_dict(model_dict) print("Loaded pretrained weights from '{}'".format(opt.load_weights)) del checkpoint torch.cuda.empty_cache() if opt.continue_model != '': print(f'loading pretrained model from {opt.continue_model}') checkpoint = torch.load(opt.continue_model) print(checkpoint.keys()) model.load_state_dict(checkpoint['state_dict']) start_iter = checkpoint['step'] + 1 print('continue to train start_iter: ', start_iter) if 'optimizer' in checkpoint.keys(): optimizer.load_state_dict(checkpoint['optimizer']) for state in optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() if 'best_accuracy' in checkpoint.keys(): best_accuracy = checkpoint['best_accuracy'] if 'best_norm_ED' in checkpoint.keys(): best_norm_ED = checkpoint['best_norm_ED'] del checkpoint torch.cuda.empty_cache() # data parallel for multi-GPU model = torch.nn.DataParallel(model).cuda() model.train() print("Model size:", count_num_param(model), 'M') if 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim: optimizer_schedule.n_current_steps = start_iter for i in tqdm(range(start_iter, opt.num_iter)): for p in model.parameters(): p.requires_grad = True cpu_images, cpu_texts = train_dataset.get_batch() image = cpu_images.cuda() if 'Transformer' in opt.SequenceModeling: text, length, text_pos = converter.encode(cpu_texts, opt.batch_max_length) elif 'CTC' in opt.Prediction: text, length = converter.encode(cpu_texts) else: text, length = converter.encode(cpu_texts, opt.batch_max_length) batch_size = image.size(0) if 'Transformer' in opt.SequenceModeling: preds = model(image, text, tgt_pos=text_pos) target = text[:, 1:] # without <s> Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) elif 'CTC' in opt.Prediction: preds = model(image, text).log_softmax(2) preds_size = torch.IntTensor([preds.size(1)] * batch_size) preds = preds.permute(1, 0, 2) # to use CTCLoss format cost = criterion(preds, text, preds_size, length) else: preds = model(image, text) target = text[:, 1:] # without [GO] Symbol cost = criterion(preds.view(-1, preds.shape[-1]), target.contiguous().view(-1)) model.zero_grad() cost.backward() if 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim: optimizer_schedule.step_and_update_lr() elif 'Transformer' in opt.SequenceModeling: optimizer.step() else: # gradient clipping with 5 (Default) torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) optimizer.step() loss_avg.add(cost) # validation part if i > 0 and (i + 1) % opt.valInterval == 0: elapsed_time = time.time() - start_time print( f'[{i+1}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}' ) # for log with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a') as log: log.write( f'[{i+1}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n' ) loss_avg.reset() model.eval() with torch.no_grad(): valid_loss, current_accuracy, current_norm_ED, preds, gts, infer_time, length_of_data = validation( model, criterion, valid_loader, converter, opt) model.train() for pred, gt in zip(preds[:5], gts[:5]): if 'Transformer' in opt.SequenceModeling: pred = pred[:pred.find('</s>')] gt = gt[:gt.find('</s>')] elif 'Attn' in opt.Prediction: pred = pred[:pred.find('[s]')] gt = gt[:gt.find('[s]')] print(f'{pred:20s}, gt: 
{gt:20s}, {str(pred == gt)}') log.write( f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}\n') valid_log = f'[{i+1}/{opt.num_iter}] valid loss: {valid_loss:0.5f}' valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}' print(valid_log) log.write(valid_log + '\n') # keep best accuracy model if current_accuracy > best_accuracy: best_accuracy = current_accuracy state_dict = model.module.state_dict() save_checkpoint( { 'best_accuracy': best_accuracy, 'state_dict': state_dict, }, False, f'./saved_models/{opt.experiment_name}/best_accuracy.pth' ) if current_norm_ED < best_norm_ED: best_norm_ED = current_norm_ED state_dict = model.module.state_dict() save_checkpoint( { 'best_norm_ED': best_norm_ED, 'state_dict': state_dict, }, False, f'./saved_models/{opt.experiment_name}/best_norm_ED.pth' ) # torch.save( # model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth') best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}' print(best_model_log) log.write(best_model_log + '\n') # save model per 1000 iter. if (i + 1) % 1000 == 0: state_dict = model.module.state_dict() optimizer_state_dict = optimizer.state_dict() save_checkpoint( { 'state_dict': state_dict, 'optimizer': optimizer_state_dict, 'step': i, 'best_accuracy': best_accuracy, 'best_norm_ED': best_norm_ED, }, False, f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')
        dim_c=args.dim_c,
        hidden_size1=hidden_size1,
        hidden_size2=hidden_size2,
        hidden_size3=hidden_size3,
        dropout=args.dropout,
        use_selu=args.use_selu,
        model_path=args.model_path,
        n_warmup_steps=args.n_warmup_steps,
        comp_eff=args.comp_eff)
    dim_embedding = args.dim_u + args.dim_s1 + args.dim_s2 + args.dim_s3

    ## Optimizer
    optimizer_ttime = ScheduledOptim(
        torch.optim.Adam(TTime_combine.parameters(), betas=(0.9, 0.98), eps=1e-9, amsgrad=False),
        args.lr, dim_embedding, args.n_warmup_steps)

    ## Preparing the data
    trainfiles = list(filter(lambda x: x.endswith(".h5"), sorted(os.listdir(args.trainpath))))
    validfiles = list(filter(lambda x: x.endswith(".h5"), sorted(os.listdir(args.validpath))))

    train_dataloader = DataLoader(args.trainpath)
    print("Loading the training data...")
    train_dataloader.read_files(trainfiles)
    valid_dataloader = DataLoader(args.validpath)
    print("Loading the validation data...")
    valid_dataloader.read_files(validfiles)

    train_slot_size = np.array(
def train(hp):
    train_loader, valset, collate_fn = prepare_dataloaders(hparams)
    # device = torch.device('cuda' if hp.cuda else 'cpu')
    device = torch.device('cuda')
    '''
    model = Transformer(n_src_vocab=hp.n_src_vocab, len_max_seq=hp.len_max_seq,
                        d_word_vec=hp.d_word_vec, d_model=hp.d_model,
                        d_inner=hp.d_inner, n_layers=hp.n_layers, n_head=hp.n_head,
                        d_k=hp.d_k, d_v=hp.d_v, dropout=hp.dropout).to(device)
    '''
    model = make_model().to('cuda')
    print(model)

    # model_opt = NoamOpt(512, 1, 400, torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9))
    model_opt = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        hp.d_model, hp.n_warmup_steps)
    '''
    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        hp.d_model, hp.n_warmup_steps)
    '''

    try:
        checkpoint = torch.load(os.path.join(hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n--------model restored at step %d--------\n" % args.restore_step)
    except:
        print("\n--------Start New Training--------\n")
        if not os.path.exists(hp.checkpoint_path):
            os.mkdir(hp.checkpoint_path)

    model.train()
    lambda_l1 = 0.3
    l1 = 0
    for epoch in range(hp.epochs):
        train_loader, valset, collate_fn = prepare_dataloaders(hparams)
        for i, data in enumerate(train_loader):
            # current_step = i + hp.restore_step + epoch * len(dataloader) + 1
            n_iter = i + 0 + epoch * len(train_loader) + 1
            # print(data[0].shape)
            # print(data[1].shape)
            # print(data[2].shape)
            # print(data[3].shape)

            # optimizer.zero_grad()
            model_opt.zero_grad()

            try:
                # mel_input = np.concatenate((np.zeros([hp.batch_size, hp.num_mels, 1], dtype=np.float32), data[2][:, :, 1:]), axis=2)
                mel_input = np.concatenate(
                    (np.zeros([hp.batch_size, hp.num_mels, 1], dtype=np.float32), data[2][:, :, 1:]), axis=2)
            except:
                raise TypeError("not same dimension")
            # mel_input = np.concatenate((np.zeros([hp.batch_size, hp.num_mels, 1], dtype=np.float32), data[2][:, :, 1:]), axis=2)

            if use_cuda:
                text_padded = Variable(data[0].type(torch.cuda.LongTensor), requires_grad=False).cuda()
                text_zeroone = Variable(data[1].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                # print(text_zeroone)
                # text_zeronne = torch.ones(text_zeroone.shape).cuda() - text_zeroone
                en_mask = make_src_mask(text_padded, 0)
                # mel_padded = Variable(torch.from_numpy(mel_input).type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                mel_zeroone = Variable(data[4].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                # print(mel_zeroone)
                mel_zeroone = torch.ones(mel_zeroone.shape).cuda() - mel_zeroone
                de_mask = make_tgt_mask(mel_zeroone, 0)
                mel_truth = Variable(data[2].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                mel_input = Variable(data[3].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
            else:
                text_padded = Variable(torch.from_numpy(data[0]).type(torch.LongTensor), requires_grad=False)
                text_zeroone = Variable(torch.from_numpy(data[1]).type(torch.FloatTensor), requires_grad=False)
                mel_padded = Variable(torch.from_numpy(mel_input).type(torch.FloatTensor), requires_grad=False)
                mel_zeroone = Variable(torch.from_numpy(data[3]).type(torch.FloatTensor), requires_grad=False)
                mel_truth = Variable(torch.from_numpy(data[2]).type(torch.FloatTensor), requires_grad=False)

            '''
            print('epoch:', epoch)
            print('text_padded:', text_padded.shape)
            print('mel_padded:', mel_input.shape)
            print('text_zeroonr', text_zeroone.shape)
            print('mel_zeroone', mel_zeroone.shape)
            print('en_mask', en_mask)
            print('de_mask', de_mask[0, 0, :])
            '''

            # l2_regularization = torch.Tensor(0)
            # for param in model.parameters():
            #     l2_regularization += torch.norm(param, 2)
            frame, frame_post, stop = model(text_padded, mel_input, en_mask, de_mask)
            # print("0", model.decoder.layers[0].en_attn.attn[0, 0, :, :])
            # print("1", model.decoder.layers[1].en_attn.attn[0, 2])

            if n_iter % 500 == 0:
                for layer in range(4):
                    for h in range(hp.n_head):
                        alignment = model.decoder.layers[layer].en_attn.attn[0, h].cpu().data.numpy()
                        tag = "alignment_layer{}_head{}".format(layer, h)
                        writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), n_iter)
                        # plot.plot_alignment(model.decoder.layers[layer].en_attn.attn[0, h],
                        #                     os.path.join(hp.checkpoint_path, 'step-%d-layer-%d-head-%d-align.png' % (current_step, layer + 1, h + 1)),
                        #                     info='%s, %s, %s, step=%d, loss=%.5f' % ('transformer', 'mo', time_string(), current_step, 0))
            # print(model.decoder.layers[0].en_attn.attn[0, 1])

            '''
            before_loss = criterion(before, mel_truth)
            post_loss = criterion(post, mel_truth)
            gate_loss = nn.BCEWithLogitsLoss()(stop, mel_zeroone)
            loss = before_loss + post_loss + gate_loss
            '''
            # l2_regularization = torch.Tensor(0)
            # for param in model.parameters():
            #     l2_regularization += torch.norm(param, 2)
            # print(l2_regularization)
            before = nn.MSELoss()(frame.transpose(-2, -1), mel_truth)
            post = nn.MSELoss()(frame_post, mel_truth)
            gate = nn.BCEWithLogitsLoss()(stop, mel_zeroone)
            loss = before + post + gate
            '''
            for param in model.parameters():
                l1 = l1 + param.abs().sum()
            loss = loss + lambda_l1 * l1
            '''

            loss.backward()
            # nn.utils.clip_grad_norm(model.parameters(), 1.)
            # optimizer.step_and_update_lr()
            nn.utils.clip_grad_norm_(model.parameters(), 1.)
            model_opt.step_and_update_lr()

            if i % 1000 == 0:
                writer.add_scalar('Train/before', before, n_iter)
                writer.add_scalar('Train/post', post, n_iter)
                writer.add_scalar('Train/gate', gate, n_iter)
                writer.add_scalar('Train/all', loss, n_iter)
            '''
            if i % 100 == 0:
                for i in range(len(dec_enc_attn_list)):
                    for j in range(0, hp.batch_size * hp.n_head, hp.batch_size):
                        # print('dec_enc_attn:', dec_enc_attn_list[i][j].shape)
                        plot.plot_alignment(dec_enc_attn_list[i][j],
                                            os.path.join(hp.checkpoint_path, 'step-%d-layer-%d-head-%d-align.png' % (niter, i + 1, j / hp.batch_size + 1)),
                                            info='%s, %s, %s, step= %d, loss=%.5f' % ('transformer', 'mo', time_string(), niter, loss))
                params = list(model.named_parameters())
                print(params)
            '''
            '''