def train_eval(self):
    params = filter(lambda p: p.requires_grad, self.model.parameters())
    optimizer = Optimizer(params, self.args)
    task_dev_acc_dict = dict()
    task_test_err_dict = dict()
    for ep in range(1, self.args.epoch + 1):
        for task_id, train_data in enumerate(self.train_set):
            task_name = get_task(task_id)
            print(f'training {task_name} task ...')
            task_train_loss, task_train_acc = self.train_iter(ep, task_id, train_data, optimizer)
            task_dev_acc = self.eval(task_id, self.dev_set[task_id])
            if task_id not in task_dev_acc_dict or task_dev_acc_dict[task_id] < task_dev_acc:
                task_dev_acc_dict[task_id] = task_dev_acc
                task_test_acc = self.eval(task_id, self.test_set[task_id])
                task_test_err_dict[task_id] = 1 - task_test_acc
            logger.info(
                '[Epoch %d][Task %s] train loss: %.4f, lr: %f, Train ACC: %.4f, Dev ACC: %.4f, '
                'Best Dev ACC: %.4f, Best Test ERR: %.4f' % (
                    ep, task_name, task_train_loss, optimizer.get_lr(), task_train_acc,
                    task_dev_acc, task_dev_acc_dict[task_id], task_test_err_dict[task_id]))

        for tid, test_err in task_test_err_dict.items():
            logger.info('[Epoch %d][Task %s] Test Err: %.4f' % (ep, get_task(tid), test_err))
        all_task_err = list(task_test_err_dict.values())
        logger.info('Avg Test Err: %.4f' % np.mean(all_task_err))

def train(self):
    params = filter(lambda p: p.requires_grad, self.model.parameters())
    optimizer = Optimizer(params, self.args)
    patient = 0
    best_dev_acc, best_test_acc = 0, 0
    for ep in range(1, self.args.epoch + 1):
        train_loss, train_acc = self.train_iter(ep, self.train_set, optimizer)
        dev_acc = self.eval(self.val_set)
        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            test_acc = self.eval(self.test_set)
            if test_acc > best_test_acc:
                best_test_acc = test_acc
            patient = 0
        else:
            patient += 1

        logger.info(
            '[Epoch %d] train loss: %.4f, lr: %f, Train ACC: %.4f, Dev ACC: %.4f, '
            'Best Dev ACC: %.4f, Best Test ACC: %.4f, patient: %d' % (
                ep, train_loss, optimizer.get_lr(), train_acc, dev_acc,
                best_dev_acc, best_test_acc, patient))

        if patient >= self.args.patient:
            break

    logger.info('Final Test ACC: %.4f' % best_test_acc)
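
# The two training loops above construct `Optimizer(params, args)` and call
# `step()`, `get_lr()` and (in later snippets) `lr_decay()` on it. The wrapper
# itself is not shown in this section; the class below is only a minimal sketch
# of such a wrapper, assuming it adapts a plain torch optimizer and that the
# learning rate arrives as `args.lr` -- the project's real class may differ.
import torch


class Optimizer(object):
    def __init__(self, params, args):
        self._lr = args.lr  # assumption: lr is passed via args.lr
        # assumption: a plain Adam optimizer behind the wrapper
        self.optim = torch.optim.Adam(params, lr=self._lr)

    def step(self):
        self.optim.step()

    def get_lr(self):
        return self._lr

    def lr_decay(self, factor):
        # multiply every parameter group's learning rate by `factor`
        self._lr *= factor
        for group in self.optim.param_groups:
            group['lr'] = self._lr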

def __init__(self, input_size, hidden_size, output_size, num_layers=1,
             optimizer_type='Adagrad', lr=.01, weight_decay=0,
             momentum=0, eps=1e-6, loss_type='TOP1',
             clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
             batch_size=50, use_cuda=True, time_sort=False, pretrained=None):
    """
    The GRU4REC model

    Args:
        input_size (int): dimension of the gru input variables
        hidden_size (int): dimension of the gru hidden units
        output_size (int): dimension of the gru output variables
        num_layers (int): the number of layers in the GRU
        optimizer_type (str): optimizer type for GRU weights
        lr (float): learning rate for the optimizer
        weight_decay (float): weight decay for the optimizer
        momentum (float): momentum for the optimizer
        eps (float): eps for the optimizer
        loss_type (str): type of the loss function to use
        clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
        dropout_input (float): dropout probability for the input layer
        dropout_hidden (float): dropout probability for the hidden layer
        batch_size (int): mini-batch size
        use_cuda (bool): whether you want to use cuda or not
        time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
        pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
    """
    # Initialize the GRU layer
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.batch_size = batch_size
    self.use_cuda = use_cuda
    if pretrained is None:
        self.gru = GRU(input_size, hidden_size, output_size, num_layers,
                       dropout_input=dropout_input,
                       dropout_hidden=dropout_hidden,
                       use_cuda=use_cuda,
                       batch_size=batch_size)
    else:
        self.gru = pretrained

    # Initialize the optimizer
    self.optimizer_type = optimizer_type
    self.weight_decay = weight_decay
    self.momentum = momentum
    self.lr = lr
    self.eps = eps
    self.optimizer = Optimizer(self.gru.parameters(),
                               optimizer_type=optimizer_type,
                               lr=lr,
                               weight_decay=weight_decay,
                               momentum=momentum,
                               eps=eps)

    # Initialize the loss function
    self.loss_type = loss_type
    self.loss_fn = LossFunction(loss_type, use_cuda)

    # gradient clipping (optional)
    self.clip_grad = clip_grad

    # etc
    self.time_sort = time_sort
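
# `LossFunction(loss_type, use_cuda)` is constructed above but its body is not
# shown here. Below is a minimal sketch of the TOP1 loss that the default
# loss_type='TOP1' refers to (from the GRU4Rec paper), assuming the usual
# in-batch negative sampling layout where `logits` has shape (batch, batch) and
# the diagonal holds each session's positive item score. The project's actual
# LossFunction may organize this differently.
import torch


def top1_loss(logits):
    # positive score of each row, broadcast against all sampled scores
    positives = logits.diag().view(-1, 1).expand_as(logits)
    # sigmoid(neg - pos) pushes negatives below positives;
    # sigmoid(neg^2) regularizes the magnitude of the sampled scores
    loss = torch.sigmoid(logits - positives) + torch.sigmoid(logits ** 2)
    return loss.mean()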

def train_eval(self):
    train_loader = DataLoader(self.train_set, batch_size=self.args.batch_size, shuffle=True)
    self.args.max_step = self.args.epoch * (len(train_loader) // self.args.update_step)
    print('max step:', self.args.max_step)
    optimizer = Optimizer(filter(lambda p: p.requires_grad, self.model.parameters()), self.args)
    best_dev_metric, best_test_metric = dict(), dict()
    patient = 0
    for ep in range(1, 1 + self.args.epoch):
        train_loss = 0.
        self.model.train()
        t1 = time.time()
        train_right, train_pred, train_gold = 0, 0, 0
        for i, batcher in enumerate(train_loader):
            batch = batch_variable(batcher, self.vocabs)
            batch.to_device(self.args.device)
            pred_score = self.model(batch.wd_ids, batch.ch_ids, batch.tag_ids, batch.bert_inps)
            loss = self.calc_loss(pred_score, batch.ner_ids)
            loss_val = loss.data.item()
            train_loss += loss_val

            sent_lens = batch.wd_ids.gt(0).sum(dim=1)
            gold_res = self.ner_gold(batch.ner_ids, sent_lens, self.vocabs['ner'])
            pred_res = self.ner_pred(pred_score, sent_lens, self.vocabs['ner'])
            nb_right, nb_pred, nb_gold = self.calc_acc(pred_res, gold_res, return_prf=False)
            train_right += nb_right
            train_pred += nb_pred
            train_gold += nb_gold
            train_p, train_r, train_f = self.calc_prf(train_right, train_pred, train_gold)

            if self.args.update_step > 1:
                loss = loss / self.args.update_step
            loss.backward()
            if (i + 1) % self.args.update_step == 0 or (i == self.args.max_step - 1):
                nn_utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.model.parameters()),
                                         max_norm=self.args.grad_clip)
                optimizer.step()
                self.model.zero_grad()

            logger.info('[Epoch %d] Iter%d time cost: %.2fs, lr: %.6f, train loss: %.3f, P: %.3f, R: %.3f, F: %.3f' % (
                ep, i + 1, (time.time() - t1), optimizer.get_lr(), loss_val, train_p, train_r, train_f))

        dev_metric = self.evaluate('dev')
        if dev_metric['f'] > best_dev_metric.get('f', 0):
            best_dev_metric = dev_metric
            test_metric = self.evaluate('test')
            if test_metric['f'] > best_test_metric.get('f', 0):
                # check_point = {'model': self.model.state_dict(), 'settings': args}
                # torch.save(check_point, self.args.model_chkp)
                best_test_metric = test_metric
            patient = 0
        else:
            patient += 1

        logger.info('[Epoch %d] train loss: %.4f, lr: %f, patient: %d, dev_metric: %s, test_metric: %s' % (
            ep, train_loss, optimizer.get_lr(), patient, best_dev_metric, best_test_metric))

        # if patient >= (self.args.patient // 2 + 1):  # decay the lr when dev performance has not improved for several epochs
        #     optimizer.lr_decay(0.95)
        if patient >= self.args.patient:  # early stopping
            break

    logger.info('Final Metric: %s' % best_test_metric)
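
# `self.calc_prf(train_right, train_pred, train_gold)` above turns the running
# entity counts into precision / recall / F1. A minimal sketch of such a helper
# under that assumption (the class's own version may guard against empty counts
# differently):
def calc_prf(nb_right, nb_pred, nb_gold):
    p = nb_right / nb_pred if nb_pred > 0 else 0.
    r = nb_right / nb_gold if nb_gold > 0 else 0.
    f = 2 * p * r / (p + r) if p + r > 0 else 0.
    return p, r, f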

def train_eval(self):
    train_loader = DataLoader(self.train_set, batch_size=self.args.batch_size, shuffle=True)
    self.args.max_step = self.args.epoch * (len(train_loader) // self.args.update_step)
    print('max step:', self.args.max_step)
    optimizer = Optimizer(filter(lambda p: p.requires_grad, self.model.parameters()), self.args)
    best_dev_metric, best_test_metric = dict(), dict()
    patient = 0
    for ep in range(1, 1 + self.args.epoch):
        train_loss = 0.
        self.model.train()
        t1 = time.time()
        train_head_acc, train_rel_acc, train_total_head = 0, 0, 0
        for i, batcher in enumerate(train_loader):
            batch = batch_variable(batcher, self.vocabs)
            batch.to_device(self.args.device)
            head_score, rel_score = self.model(batch.wd_ids, batch.ch_ids, batch.tag_ids)
            loss = self.calc_loss(head_score, rel_score, batch.head_ids, batch.rel_ids, batch.wd_ids.gt(0))
            loss_val = loss.data.item()
            train_loss += loss_val

            head_acc, rel_acc, total_head = self.calc_acc(head_score, rel_score, batch.head_ids, batch.rel_ids)
            train_head_acc += head_acc
            train_rel_acc += rel_acc
            train_total_head += total_head

            if self.args.update_step > 1:
                loss = loss / self.args.update_step
            loss.backward()
            if (i + 1) % self.args.update_step == 0 or (i == self.args.max_step - 1):
                nn_utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.model.parameters()),
                                         max_norm=self.args.grad_clip)
                optimizer.step()
                self.model.zero_grad()

            logger.info('[Epoch %d] Iter%d time cost: %.2fs, lr: %.6f, train loss: %.3f, head acc: %.3f, rel acc: %.3f' % (
                ep, i + 1, (time.time() - t1), optimizer.get_lr(), loss_val,
                train_head_acc / train_total_head, train_rel_acc / train_total_head))

        dev_metric = self.evaluate('dev')
        if dev_metric['uf'] > best_dev_metric.get('uf', 0):
            best_dev_metric = dev_metric
            test_metric = self.evaluate('test')
            if test_metric['uf'] > best_test_metric.get('uf', 0):
                # check_point = {'model': self.model.state_dict(), 'settings': args}
                # torch.save(check_point, self.args.model_chkp)
                best_test_metric = test_metric
            patient = 0
        else:
            patient += 1

        logger.info('[Epoch %d] train loss: %.4f, lr: %f, patient: %d, dev_metric: %s, test_metric: %s' % (
            ep, train_loss, optimizer.get_lr(), patient, best_dev_metric, best_test_metric))

        # if patient == (self.args.patient // 2 + 1):  # decay the lr when dev performance has not improved for several epochs
        #     optimizer.lr_decay(0.95)
        if patient >= self.args.patient:  # early stopping
            break

    logger.info('Final Metric: %s' % best_test_metric)
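
# `self.calc_acc(head_score, rel_score, batch.head_ids, batch.rel_ids)` above is
# assumed to return the number of correctly predicted heads, the number of tokens
# whose head and relation are both correct, and the total number of scored tokens.
# A minimal sketch under those assumptions; the real helper may apply a padding
# mask and may index rel_score by the predicted heads first.
def calc_acc(head_score, rel_score, gold_heads, gold_rels):
    pred_heads = head_score.argmax(dim=-1)
    # assumption: rel_score is (batch, seq_len, n_rels) after head selection
    pred_rels = rel_score.argmax(dim=-1)
    head_correct = pred_heads.eq(gold_heads)
    rel_correct = head_correct & pred_rels.eq(gold_rels)
    total = gold_heads.numel()
    return head_correct.sum().item(), rel_correct.sum().item(), total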

def train(model, train_data, dev_data, test_data, args, word_vocab, extwd_vocab, lbl_vocab):
    args.max_step = args.epoch * ((len(train_data) + args.batch_size - 1) // (args.batch_size * args.update_steps))
    optimizer = Optimizer(filter(lambda p: p.requires_grad, model.parameters()), args)
    best_dev_acc, best_test_acc = 0, 0
    patient = 0
    for ep in range(1, 1 + args.epoch):
        model.train()
        train_loss = 0.
        start_time = time.time()
        for i, batch_data in enumerate(batch_iter(train_data, args.batch_size, True)):
            batcher = batch_variable(batch_data, word_vocab, extwd_vocab, lbl_vocab)
            batcher = (x.to(args.device) for x in batcher)
            sent1, sent2, extsent1, extsent2, gold_lbl = batcher
            pred = model((sent1, sent2), (extsent1, extsent2))
            loss = criterion(pred, gold_lbl)
            if args.update_steps > 1:
                loss = loss / args.update_steps
            loss_val = loss.data.item()
            train_loss += loss_val
            loss.backward()

            if (i + 1) % args.update_steps == 0 or (i == args.max_step - 1):
                nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()),
                                         max_norm=args.grad_clip)
                optimizer.step()
                model.zero_grad()

            train_acc = calc_acc(pred, gold_lbl) / len(batch_data)
            logger.info('Iter%d time cost: %.2fs, lr: %.8f, train loss: %.3f, train acc: %.3f' % (
                i + 1, (time.time() - start_time), optimizer.get_lr(), loss_val, train_acc))

        train_loss /= len(train_data)
        dev_acc = eval(model, dev_data, args, word_vocab, extwd_vocab, lbl_vocab)
        logger.info('[Epoch %d] train loss: %.3f, lr: %f, DEV ACC: %.3f' % (
            ep, train_loss, optimizer.get_lr(), dev_acc))

        if dev_acc > best_dev_acc:
            patient = 0
            best_dev_acc = dev_acc
            test_acc = eval(model, test_data, args, word_vocab, extwd_vocab, lbl_vocab)
            logger.info('Test ACC: %.3f' % test_acc)
            if test_acc > best_test_acc:
                best_test_acc = test_acc
        else:
            patient += 1

        if patient > args.patient:
            break

    logger.info('Final Test ACC: %.3f' % best_test_acc)
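
# `calc_acc(pred, gold_lbl)` above is divided by the batch size, so it is assumed
# to return the raw number of correct predictions rather than a ratio. A minimal
# sketch under that assumption:
def calc_acc(pred, gold_lbl):
    return pred.argmax(dim=-1).eq(gold_lbl).sum().item()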

def __init__(self, input_size, if_embedding, embedding_size, hidden_size, output_size,
             num_layers=1, optimizer_type='Adagrad', lr=.01, weight_decay=0,
             momentum=0, eps=1e-6, loss_type='TOP1',
             clip_grad=-1, dropout_input=.0, dropout_hidden=.5,
             batch_size=50, use_cuda=True, cuda_id=1, compress=False,
             time_sort=False, pretrained=None):
    """
    The GRU4REC model

    Args:
        input_size (int): dimension of the gru input variables
        if_embedding (bool): whether to use an embedding layer for the input items
        embedding_size (int): dimension of the item embeddings
        hidden_size (int): dimension of the gru hidden units
        output_size (int): dimension of the gru output variables
        num_layers (int): the number of layers in the GRU
        optimizer_type (str): optimizer type for GRU weights
        lr (float): learning rate for the optimizer
        weight_decay (float): weight decay for the optimizer
        momentum (float): momentum for the optimizer
        eps (float): eps for the optimizer
        loss_type (str): type of the loss function to use
        clip_grad (float): clip the gradient norm at clip_grad. No clipping if clip_grad = -1
        dropout_input (float): dropout probability for the input layer
        dropout_hidden (float): dropout probability for the hidden layer
        batch_size (int): mini-batch size
        use_cuda (bool): whether you want to use cuda or not
        cuda_id (int): index of the GPU to run on when use_cuda is True
        compress (str): path to a YAML compression schedule for distiller; False disables compression
        time_sort (bool): whether to ensure the order of sessions is chronological (default: False)
        pretrained (modules.layer.GRU): pretrained GRU layer, if it exists (default: None)
    """
    # Initialize the GRU layer
    self.input_size = input_size
    self.if_embedding = if_embedding
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.batch_size = batch_size
    self.use_cuda = use_cuda
    self.cuda_id = cuda_id
    # must specify cuda_id or it will be torch.cuda.current_device()
    self.device = torch.device('cuda:%d' % cuda_id if use_cuda else 'cpu')
    print(self.device)
    if pretrained is None:
        self.gru = GRU(input_size, if_embedding, embedding_size, hidden_size, output_size, num_layers,
                       dropout_input=dropout_input,
                       dropout_hidden=dropout_hidden,
                       batch_size=batch_size,
                       use_cuda=use_cuda,
                       cuda_id=cuda_id)
    else:
        self.gru = pretrained

    # Initialize the optimizer
    self.optimizer_type = optimizer_type
    self.weight_decay = weight_decay
    self.momentum = momentum
    self.lr = lr
    self.eps = eps

    self.compress = compress
    self.compression_scheduler = None
    if self.compress:
        # Create a CompressionScheduler and configure it from a YAML schedule file
        self.compression_scheduler = distiller.config.file_config(self.gru, None, self.compress)

    self.optimizer = Optimizer(self.gru.parameters(),
                               optimizer_type=optimizer_type,
                               lr=lr,
                               weight_decay=weight_decay,
                               momentum=momentum,
                               eps=eps)

    # Initialize the loss function
    self.loss_type = loss_type
    self.loss_fn = LossFunction(loss_type, use_cuda, cuda_id)

    # gradient clipping (optional)
    self.clip_grad = clip_grad

    # etc
    self.time_sort = time_sort
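
# Illustrative construction of the model whose __init__ is shown above, assuming
# (as the docstring suggests) that it belongs to a class named GRU4REC. The item
# count, embedding size, and schedule path are placeholders, not values from the
# original project.
n_items = 37483  # placeholder vocabulary size
model = GRU4REC(input_size=n_items, if_embedding=True, embedding_size=128,
                hidden_size=100, output_size=n_items, batch_size=50,
                loss_type='TOP1', use_cuda=True, cuda_id=0,
                compress='schedules/prune.yaml')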