def init_training(args):
    """Initialize the training process."""
    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    transformer = Transformer(args, vocab)

    # if fine-tuning, restore pretrained weights
    if args.finetune:
        print("[Finetune] %s" % args.finetune_model_path)
        transformer.load_state_dict(torch.load(args.finetune_model_path))

    # vocab_mask zeroes out the padding token in the loss
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt[constants.PAD_WORD]] = 0

    # summed cross-entropy loss (reduction='sum' replaces the deprecated size_average=False)
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask, reduction='sum')

    if args.cuda:
        transformer = transformer.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    if args.optimizer == "Warmup_Adam":
        optimizer = ScheduledOptim(
            torch.optim.Adam(transformer.get_trainable_parameters(),
                             betas=(0.9, 0.98), eps=1e-09),
            args.d_model, args.n_warmup_steps)
    elif args.optimizer == "Adam":
        optimizer = torch.optim.Adam(
            params=transformer.get_trainable_parameters(),
            lr=args.lr, betas=(0.9, 0.98), eps=1e-8)
    elif args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(
            params=transformer.get_trainable_parameters(), lr=args.lr)

    # multi-GPU
    if torch.cuda.device_count() > 1:
        print("[Multi GPU] using", torch.cuda.device_count(), "GPUs\n")
        transformer = nn.DataParallel(transformer)

    return vocab, transformer, optimizer, cross_entropy_loss
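# The snippets in this file wrap the inner optimizer in a project-local ScheduledOptim
# whose implementation is not shown. Below is a minimal sketch of what such a wrapper
# typically looks like, assuming the inverse-square-root warmup schedule from
# "Attention Is All You Need" (lr = d_model^-0.5 * min(step^-0.5, step * n_warmup_steps^-1.5)).
# The class name and body are illustrative only, not the actual optim.ScheduledOptim.
class ScheduledOptimSketch:
    """Warmup-then-decay learning-rate wrapper around an inner optimizer (sketch)."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = d_model ** -0.5
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        # update the learning rate, then take an optimizer step
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return min(self.n_steps ** -0.5,
                   self.n_steps * self.n_warmup_steps ** -1.5)

    def _update_learning_rate(self):
        self.n_steps += 1
        lr = self.init_lr * self._get_lr_scale()
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr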
args.n_warmup_steps = args.n_warmup_steps if args.n_warmup_steps != 0 else training_data._stop_step

# ##############################################################################
# Build model
# ##############################################################################
import model
from optim import ScheduledOptim

vae = model.VAE(args)
if use_cuda:
    vae = vae.cuda()

criterion = torch.nn.CrossEntropyLoss()

optimizer = ScheduledOptim(
    torch.optim.Adam(vae.parameters(), betas=(0.9, 0.98), eps=1e-09),
    args.embed_dim, args.n_warmup_steps, vae.parameters(), args.clip)

# ##############################################################################
# Training
# ##############################################################################
import time

from tqdm import tqdm

train_loss = []


def repackage_hidden(h):
    # Detach hidden states from their history so backprop does not reach through
    # earlier batches.
    if type(h) == Variable:
        return Variable(h.data)
    else:
        # recurse over tuples/lists of hidden states (the else branch was truncated
        # in the original; this is the standard recursive completion)
        return tuple(repackage_hidden(v) for v in h)
model = Transformer(device=device, d_feature=train_data.sig_len, d_model=d_model,
                    d_inner=d_inner, n_layers=num_layers, n_head=num_heads,
                    d_k=64, d_v=64, dropout=dropout, class_num=class_num)
model = model.to(device)

optimizer = ScheduledOptim(
    Adam(filter(lambda x: x.requires_grad, model.parameters()),
         betas=(0.9, 0.98), eps=1e-09),
    d_model, warm_steps)

train_accs = []
valid_accs = []
eva_indis = []
train_losses = []
valid_losses = []

for epoch_i in range(epoch):
    print('[ Epoch', epoch_i, ']')
    start = time.time()
    train_loss, train_acc, cnt = train_epoch(train_loader, device, model,
                                             optimizer, len(train_data))
    print('  - (Training)   loss: {loss: 8.5f}, accuracy: {accu:3.3f} %, '
          'elapse: {elapse:3.3f} min'.format(
              # format arguments reconstructed from the placeholder names; the
              # original call was truncated here
              loss=train_loss, accu=100 * train_acc,
              elapse=(time.time() - start) / 60))
class LocalGenTrainer:
    def __init__(self, writer, model, device, args):
        self.model = model
        self.energy_fn = LocalEnergyCE(model, args)
        self.device = device

        # Adam optimizer with the run's hyper-parameters
        self.optim = Adam(self.model.parameters(), lr=args.lr,
                          betas=args.betas, weight_decay=args.weight_decay)
        # self.optim = SGD(self.model.parameters(), lr=lr, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, init_lr=args.lr,
                                             n_warmup_steps=args.n_warmup_steps,
                                             steps_decay_scale=args.steps_decay_scale)

        self.log_freq = args.log_interval
        self.writer = writer
        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def step(self, data):
        seq, coords, start_id, res_counts = data
        seq = seq.to(self.device)                 # (N, L)
        coords = coords.to(self.device)           # (N, L, 3)
        start_id = start_id.to(self.device)       # (N, L)
        res_counts = res_counts.to(self.device)   # (N, 3)

        loss_r, loss_angle, loss_profile, loss_start_id, loss_res_counts = \
            self.energy_fn.forward(seq, coords, start_id, res_counts)
        return loss_r, loss_angle, loss_profile, loss_start_id, loss_res_counts

    def train(self, epoch, data_loader, flag='Train'):
        for i, data in tqdm(enumerate(data_loader)):
            loss_r, loss_angle, loss_profile, loss_start_id, loss_res_counts = self.step(data)
            loss = loss_r + loss_angle + loss_profile + loss_start_id + loss_res_counts

            if flag == 'Train':
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            len_data_loader = len(data_loader)
            log_freq = self.log_freq if flag == 'Train' else 1
            if i % log_freq == 0:
                self.writer.add_scalar(f'{flag}/profile_loss', loss_profile.item(),
                                       epoch * len_data_loader + i)
                self.writer.add_scalar(f'{flag}/coords_radius_loss', loss_r.item(),
                                       epoch * len_data_loader + i)
                self.writer.add_scalar(f'{flag}/coords_angle_loss', loss_angle.item(),
                                       epoch * len_data_loader + i)
                self.writer.add_scalar(f'{flag}/start_id_loss', loss_start_id.item(),
                                       epoch * len_data_loader + i)
                self.writer.add_scalar(f'{flag}/res_counts_loss', loss_res_counts.item(),
                                       epoch * len_data_loader + i)
                self.writer.add_scalar(f'{flag}/total_loss', loss.item(),
                                       epoch * len_data_loader + i)
                print(f'{flag} epoch {epoch} Iter: {i} '
                      f'profile_loss: {loss_profile.item():.3f} '
                      f'coords_radius_loss: {loss_r.item():.3f} '
                      f'coords_angle_loss: {loss_angle.item():.3f} '
                      f'start_id_loss: {loss_start_id.item():.3f} '
                      f'res_counts_loss: {loss_res_counts.item():.3f} '
                      f'total_loss: {loss.item():.3f} ')

    def test(self, epoch, data_loader, flag='Test'):
        # Run one evaluation pass with gradients disabled, reusing the train() loop.
        self.model.eval()
        torch.set_grad_enabled(False)
        self.train(epoch, data_loader, flag=flag)
        self.model.train()
        torch.set_grad_enabled(True)
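# Hypothetical wiring of LocalGenTrainer above. SummaryWriter is a stand-in for
# whatever logger the project actually uses (the class only requires add_scalar),
# and model, args, train_loader, valid_loader, args.epochs are placeholders.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/local_gen')   # assumed log directory
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trainer = LocalGenTrainer(writer, model, device, args)

for epoch in range(args.epochs):          # args.epochs assumed to exist
    trainer.train(epoch, train_loader)    # backward + step_and_update_lr per batch
    trainer.test(epoch, valid_loader)     # gradient-free pass through the same loop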
cuda=use_cuda)
args.n_warmup_steps = args.n_warmup_steps if args.n_warmup_steps != 0 else training_data.sents_size // args.batch_size

# ##############################################################################
# Build model
# ##############################################################################
from model import BiLSTM_Cut
from optim import ScheduledOptim

model = BiLSTM_Cut(args)
if use_cuda:
    model = model.cuda()

optimizer = ScheduledOptim(
    torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
    args.lstm_hsz, args.n_warmup_steps)

criterion = torch.nn.CrossEntropyLoss()

# ##############################################################################
# Training
# ##############################################################################
import time

from tqdm import tqdm


def evaluate():
    model.eval()
    corrects = eval_loss = 0
    _size = validation_data.sents_size
# ##############################################################################
# Build model
# ##############################################################################
import model
from optim import ScheduledOptim
from modelp import densenet161

densenet = model.DenseNet(args)
if use_cuda:
    densenet = densenet.cuda()

optimizer = ScheduledOptim(
    torch.optim.SGD(densenet.parameters(), lr=args.lr, momentum=args.momentum,
                    weight_decay=args.weight_decay),
    args.epochs, args.lr)

criterion = torch.nn.CrossEntropyLoss()

# ##############################################################################
# Training
# ##############################################################################
import time

from tqdm import tqdm
from torch.autograd import Variable

train_loss = []
valid_loss = []
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(self.vocab, config.train_data_path,
                               config.batch_size, single_pass=False, mode='train')
        time.sleep(10)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'models')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        model_state_dict = self.model.state_dict()
        state = {
            'iter': iter,
            'current_loss': running_avg_loss,
            'optimizer': self.optimizer._optimizer.state_dict(),
            "model": model_state_dict
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_path):
        device = torch.device('cuda' if use_cuda else 'cpu')
        self.model = Model(config.vocab_size, config.vocab_size,
                           config.max_enc_steps, config.max_dec_steps,
                           d_k=config.d_k, d_v=config.d_v, d_model=config.d_model,
                           d_word_vec=config.emb_dim, d_inner=config.d_inner_hid,
                           n_layers=config.n_layers, n_head=config.n_head,
                           dropout=config.dropout).to(device)

        self.optimizer = ScheduledOptim(
            optim.Adam(filter(lambda x: x.requires_grad, self.model.parameters()),
                       betas=(0.9, 0.98), eps=1e-09),
            config.d_model, config.n_warmup_steps)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters())
        # count every element of every parameter tensor (param[0].nelement() would only
        # count the first slice of each tensor and undercount the model size)
        total_params = sum(param.nelement() for param in params)
        print('The Number of params of model: %.3f million' % (total_params / 1e6))  # million

        start_iter, start_loss = 0, 0
        if model_path is not None:
            state = torch.load(model_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer._optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer._optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_lens, enc_pos, enc_padding_mask, enc_batch_extend_vocab, \
            extra_zeros, c_t, coverage = get_input_from_batch(batch, use_cuda, transformer=True)
        dec_batch, dec_lens, dec_pos, dec_padding_mask, max_dec_len, tgt_batch = \
            get_output_from_batch(batch, use_cuda, transformer=True)

        self.optimizer.zero_grad()
        pred = self.model(enc_batch, enc_pos, dec_batch, dec_pos)
        gold_probs = torch.gather(pred, -1, tgt_batch.unsqueeze(-1)).squeeze()
        batch_loss = -torch.log(gold_probs + config.eps)
        batch_loss = batch_loss * dec_padding_mask

        sum_losses = torch.sum(batch_loss, 1)
        batch_avg_loss = sum_losses / dec_lens
        loss = torch.mean(batch_avg_loss)

        loss.backward()
        # update parameters and advance the warmup learning rate together
        self.optimizer.step_and_update_lr()

        return loss.item(), 0.

    def run(self, n_iters, model_path=None):
        iter, running_avg_loss = self.setup_train(model_path)
        start = time.time()
        interval = 100

        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss, cove_loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % interval == 0:
                self.summary_writer.flush()
                print('step: %d, second: %.2f , loss: %f, cover_loss: %f' %
                      (iter, time.time() - start, loss, cove_loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
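# Hypothetical entry point for the Train class above; the iteration count is a
# placeholder, not a value taken from the project's config.
if __name__ == '__main__':
    trainer = Train()
    # resume from a checkpoint by passing model_path, or start fresh with None
    trainer.run(n_iters=500000, model_path=None)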
cuda=use_cuda, evaluation=True)

# ##############################################################################
# Build model
# ##############################################################################
from model import Model
from optim import ScheduledOptim

model = Model(args)
if use_cuda:
    model = model.cuda()

optimizer = ScheduledOptim(
    torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98),
                     eps=1e-09, weight_decay=args.l2),
    args.lr)

# ##############################################################################
# Training
# ##############################################################################
import time

from tqdm import tqdm

import const

train_loss = []
valid_loss = []
accuracy = []
def train(args, model, train_iter, eval_iter=None):
    if args.use_cuda:
        model = model.cuda(args.device_no)
    model.train()
    # model = torch.nn.DataParallel(model, device_ids=(0,1,2))
    # train_data = load_data('train.txt')

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters())),
        args.learning_rate, args.warmup_steps)

    loss_list = []
    eval_loss_list = []

    # with torch.cuda.device(device_num):
    batch_count = 0
    running_loss = 0
    # start = time.time()
    while batch_count < args.training_steps:
        for inputs, targets in train_iter:
            if batch_count >= args.training_steps:
                break
            # input is a masked sequence
            # target holds the original word at each masked position; other positions are filled with -1
            # e.g.
            #   input:  [101, 2342, 6537, 104, 104, 4423]
            #   target: [-1, -1, -1, 10281, 8213, -1]
            logger.debug(f'inputs: {inputs}')
            logger.debug(f'targets: {targets}')
            batch_count += 1

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs, labels=inputs)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            logger.debug(f'model outputs: {outputs}')
            running_loss += loss.item()

            # write loss log
            if batch_count % args.log_interval == 0 or \
                    (batch_count < args.warmup_steps and
                     batch_count % int(args.log_interval / 10) == 0):
                if batch_count <= args.warmup_steps:
                    loss_list.append(running_loss / args.log_interval * 10)
                    logger.info('Batch:%6d, loss: %.6f [%s]' %
                                (batch_count, running_loss / args.log_interval * 10,
                                 time.strftime("%D %H:%M:%S")))
                else:
                    loss_list.append(running_loss / args.log_interval)
                    logger.info('Batch:%6d, loss: %.6f [%s]' %
                                (batch_count, running_loss / args.log_interval,
                                 time.strftime("%D %H:%M:%S")))
                running_loss = 0

            # save model & curve
            if batch_count % args.checkpoint_interval == 0:
                if eval_iter is not None:
                    eval_loss = eval(args, model, eval_iter)
                    eval_loss_list.append(eval_loss)
                    if eval_loss <= min(eval_loss_list) and args.save_best_checkpoint:
                        path = os.path.join(args.checkpoint_save_path, "model",
                                            f"{args.model_type}-best")
                        if not os.path.exists(path):
                            os.makedirs(path)
                        model.save_pretrained(path)
                        logger.info('Best model saved in %s' % path)
                if args.save_normal_checkpoint:
                    path = os.path.join(args.checkpoint_save_path, "tmp",
                                        f"{args.model_type}-{batch_count}")
                    if not os.path.exists(path):
                        os.makedirs(path)
                    model.save_pretrained(path)
                    logger.info('Model saved in %s' % path)
                    curve_info = {
                        "train_loss_list": loss_list,
                        "eval_loss_list": eval_loss_list
                    }
                    with open(path + f'/{args.model_type}-{batch_count}-loss.pkl', 'wb+') as file:
                        pickle.dump(curve_info, file)

    return loss_list
evaluation=True, cuda=use_cuda)

args.enc_vocab_size = data['dict']['src_size']
args.dec_vocab_size = data['dict']['tgt_size']
args.n_warmup_steps = args.n_warmup_steps if args.n_warmup_steps != 0 else training_data._stop_step

# ##############################################################################
# Build model
# ##############################################################################
model = Transformer(args)

optimizer = ScheduledOptim(
    torch.optim.Adam(model.get_trainable_parameters(),
                     betas=(0.9, 0.98), eps=1e-09),
    args.d_model, args.n_warmup_steps)


def get_criterion(vocab_size):
    # summed cross-entropy that ignores padding via a zero class weight
    # (reduction='sum' replaces the deprecated size_average=False)
    weight = torch.ones(vocab_size)
    weight[const.PAD] = 0
    return torch.nn.CrossEntropyLoss(weight, reduction='sum')


crit = get_criterion(args.dec_vocab_size)

if use_cuda:
    model = model.cuda()
    crit = crit.cuda()
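# A minimal sketch of how crit, model, and the warmup-wrapped optimizer above are
# typically combined in one training step. The batch layout (src, src_pos, tgt,
# tgt_pos, gold) and the model's forward signature are assumptions for illustration,
# not this project's actual interfaces.
def train_step(batch):
    src, src_pos, tgt, tgt_pos, gold = batch   # hypothetical batch layout

    optimizer.zero_grad()
    logits = model(src, src_pos, tgt, tgt_pos)                  # (B, L, V)
    loss = crit(logits.view(-1, logits.size(-1)), gold.view(-1))
    loss.backward()
    optimizer.step_and_update_lr()   # step Adam and advance the warmup schedule
    return loss.item()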