def train(self):
    """Train ``self.model`` on batches drawn from ``self.dataset``.

    Runs ``self.epochs`` epochs; every 50 steps evaluates on one validation
    batch and prints loss/metrics plus a textual progress bar.  The final
    model is saved to ``arguments.output_dir``.
    """
    self.model.to(DEVICE)
    # weight_decay is a coefficient on the regularization term; the
    # regularization term reflects model complexity, so a large weight_decay
    # makes complex models contribute more to the loss.
    optimizer = optim.Adam(self.model.parameters(),
                           lr=self.learning_rate,
                           weight_decay=0.0005)
    # schedule = ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.1, patience=100, eps=1e-4, verbose=True)
    total_size = math.ceil(self.dataset.get_train_length() / self.batch_size)
    for epoch in range(self.epochs):
        for step in range(self.dataset.get_step() // self.epochs):
            self.model.train()
            # Same effect as optimizer.zero_grad() here.
            self.model.zero_grad()
            x_train, y_train = self.dataset.next_train_batch()
            x_val, y_val = self.dataset.next_validation_batch()
            batch = tuple(t.to(DEVICE) for t in create_batch_iter(
                mode='train', X=x_train, y=y_train).dataset.tensors)
            b_input_ids, b_input_mask, b_labels, b_out_masks = batch
            bert_encode = self.model(b_input_ids, b_input_mask)
            loss = self.model.loss_fn(bert_encode=bert_encode,
                                      tags=b_labels,
                                      output_mask=b_out_masks)
            loss.backward()
            # gradient clipping
            # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
            optimizer.step()
            # schedule.step(loss)
            if step % 50 == 0:
                self.model.eval()
                eval_loss, eval_acc, eval_f1 = 0, 0, 0
                with torch.no_grad():
                    # FIX: the batch was previously moved to DEVICE twice;
                    # a single transfer is sufficient.
                    batch = tuple(t.to(DEVICE) for t in create_batch_iter(
                        mode='dev', X=x_val, y=y_val).dataset.tensors)
                    input_ids, input_mask, label_ids, output_mask = batch
                    bert_encode = self.model(input_ids, input_mask)
                    eval_los = self.model.loss_fn(bert_encode=bert_encode,
                                                  tags=label_ids,
                                                  output_mask=output_mask)
                    eval_loss = eval_los + eval_loss
                    predicts = self.model.predict(bert_encode, output_mask)
                    # Flatten labels and drop the -1 padding positions.
                    label_ids = label_ids.view(1, -1)
                    label_ids = label_ids[label_ids != -1]
                    self.model.acc_f1(predicts, label_ids)
                    self.model.class_report(predicts, label_ids)
                    # FIX: print a plain float rather than a tensor repr.
                    print('eval_loss: ', eval_loss.item())
                print("-" * 50)
                progress = ("█" * int(step * 25 / total_size)).ljust(25)
                print("step {}".format(step))
                print("epoch [{}] |{}| {}/{}\n\tloss {:.2f}".format(
                    epoch, progress, step, total_size, loss.item()))
    save_model(self.model, arguments.output_dir)
def save_model(self, network, path, name=None, overwrite=False):
    """Persist *network* to *path*.

    Delegates to the module-level ``save_model`` helper.  ``name`` and
    ``overwrite`` are accepted for interface compatibility but are not
    forwarded to the helper.
    """
    save_model(model=network, output_dir=path)
def fit(model, training_iter, eval_iter, num_train_steps, device, n_gpu, verbose=1):
    """Train *model*, evaluating every ``args.eval_steps`` optimizer steps.

    Keeps the best checkpoint (by eval F1) in ``args.output_dir`` and logs
    scalars to TensorBoard on the main process.
    """
    # ------------------ TensorBoard visualisation ------------------------
    if args.local_rank in [-1, 0]:
        TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
        tb_writer = SummaryWriter('log/%s' % TIMESTAMP)

    # --------------------- optimizer -------------------------
    # No weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    t_total = num_train_steps
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)  # int(t_total*args.warmup_proportion)

    # --------------------- fp16 mixed precision -------------------------
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # --------------------- model setup ----------------------
    model.to(device)
    tr_loss, logging_loss = 0.0, 0.0

    # ------------------------ training loop ------------------------------
    best_f1 = 0
    # start = time.time()
    global_step = 0
    set_seed(args, n_gpu)  # reproducibility (even between python 2 and 3)
    bar = tqdm(range(t_total), total=t_total)
    nb_tr_examples, nb_tr_steps = 0, 0
    for step in bar:
        model.train()
        batch = tuple(t.to(device) for t in next(training_iter))
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            # XLM does not use segment ids.
            'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
            'labels': batch[3],
        }
        encode = model(**inputs)[0]  # first element holds the predictions
        loss = model.loss_fn(encode, labels=inputs['labels'])
        if n_gpu > 1:
            loss = loss.mean()  # average across GPUs under DataParallel
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        else:
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        tr_loss += loss.item()
        train_loss = round(
            tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += inputs['input_ids'].size(0)
        nb_tr_steps += 1

        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # update learning rate schedule
            optimizer.zero_grad()
            global_step += 1

        if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

        if args.local_rank in [-1, 0] and args.do_eval and \
                (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
            # ----------------------- evaluation ----------------------------
            model.eval()
            y_predicts, y_labels = [], []
            eval_loss, eval_acc, eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for _, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    # XLM does not use segment ids.
                    'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                    'labels': batch[3],
                }
                with torch.no_grad():
                    encode = model(**inputs)[0]  # predictions
                    eval_los = model.loss_fn(encode, labels=inputs['labels'])
                    predicts = model.predict(encode)  # .detach().cpu().numpy()
                    nb_eval_examples += inputs['input_ids'].size(0)
                    nb_eval_steps += 1
                    eval_loss += eval_los.mean().item()
                    y_predicts.append(torch.from_numpy(predicts))
                    # Flatten labels and drop the -1 padding positions.
                    labels = inputs['labels'].view(1, -1)
                    labels = labels[labels != -1]
                    y_labels.append(labels)
            eval_loss = eval_loss / nb_eval_steps
            eval_predicted = torch.cat(y_predicts, dim=0).cpu().numpy()
            eval_labeled = torch.cat(y_labels, dim=0).cpu().numpy()
            eval_f1 = model.acc_rec_f1(eval_predicted, eval_labeled)  # eval_acc, eval_rec,
            logger.info(
                '\n\nglobal_step %d - train_loss: %4f - eval_loss: %4f - eval_f1:%4f\n'
                % (global_step, train_loss, eval_loss, eval_f1))
            # Keep only the best checkpoint by F1.
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)
            if args.local_rank in [-1, 0]:
                tb_writer.add_scalar('train_loss', train_loss, step)  # .item()
                tb_writer.add_scalar('eval_loss', eval_loss, step)  # .item() / count
                tb_writer.add_scalar('eval_f1', eval_f1, step)  # eval_acc
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
    if args.local_rank in [-1, 0]:
        tb_writer.close()
def train(self, train_source, train_target, dev_source, dev_target):
    """Train ``self.model`` with AdamW, evaluating on dev every 10 steps.

    The output directory is wiped at the start; the best checkpoint by dev
    accuracy is written to ``self.args.output_dir`` during training.
    """
    if os.path.exists(self.args.output_dir) is True:
        shutil.rmtree(self.args.output_dir)
    train_dataloader = create_batch_iter(mode='train', X=train_source,
                                         y=train_target,
                                         batch_size=self.args.BATCH)
    dev_dataloader = create_batch_iter(mode='dev', X=dev_source,
                                       y=dev_target,
                                       batch_size=self.args.BATCH)
    self.model.to(DEVICE)
    # Optimizer setup: no weight decay on biases / LayerNorm parameters.
    param_optimizer = list(self.model.named_parameters())
    no_decay = list(['bias', 'LayerNorm.bias', 'LayerNorm.weight'])
    optimizer_grouped_parameters = list([
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}])
    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=self.args.learning_rate)
    total_size = math.ceil(len(train_source) / self.args.BATCH)
    best_acc = 0
    for epoch in range(self.args.EPOCHS):
        for train_step, train_batch in enumerate(
                tqdm(train_dataloader, desc='Train_Iteration')):
            self.model.train()
            self.model.zero_grad()
            train_batch = tuple(t.to(DEVICE) for t in train_batch)
            t_input_ids, t_input_mask, t_labels, t_out_masks = train_batch
            t_bert_encode = self.model(t_input_ids, t_input_mask)
            loss = self.model.loss_fn(bert_encode=t_bert_encode,
                                      tags=t_labels,
                                      output_mask=t_out_masks)
            loss.backward()
            # gradient clipping
            # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
            optimizer.step()
            if train_step % 10 == 0:
                self.model.eval()
                eval_loss = 0
                for dev_step, dev_batch in enumerate(dev_dataloader):
                    dev_batch = tuple(t.to(DEVICE) for t in dev_batch)
                    d_input_ids, d_input_mask, d_label_ids, d_output_mask = dev_batch
                    with torch.no_grad():
                        d_bert_encode = self.model(d_input_ids, d_input_mask)
                        eval_loss += self.model.loss_fn(
                            bert_encode=d_bert_encode,
                            tags=d_label_ids,
                            output_mask=d_output_mask)
                        predicts = self.model.predict(d_bert_encode,
                                                      d_output_mask)
                    # Flatten labels and drop the -1 padding positions.
                    d_label_ids = d_label_ids.view(1, -1)
                    d_label_ids = d_label_ids[d_label_ids != -1]
                    eval_acc, eval_f1 = self.model.acc_f1(predicts, d_label_ids)
                    # Keep only the best checkpoint by dev accuracy.
                    if eval_acc > best_acc:
                        best_acc = eval_acc
                        save_model(self.model, self.args.output_dir)
                    self.model.class_report(predicts, d_label_ids)
                logger.info("\n>step {}".format(train_step))
                logger.info("\n>epoch [{}] {}/{}\n\tloss {:.2f}".format(
                    epoch, train_step, total_size, loss.item()))
    # FIX: the original guard `if self.args.output_dir is False:` could never
    # be true (output_dir is a path string), so the fallback save was dead
    # code.  Save a final checkpoint if no best model was ever written.
    if not os.path.exists(self.args.output_dir):
        save_model(self.model, self.args.output_dir)
def main():
    """Entry point: train a video-prediction model and validate each epoch.

    Checkpoints are written per epoch; validation losses are averaged and
    logged, and predicted rollouts are optionally saved as GIFs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='data/bair')
    parser.add_argument('--model_path', type=str, default='model/bair')
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--horizon', type=int, default=10)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--cpu_workers', type=int, default=4)
    parser.add_argument('--gpu_id', type=int, default=0)
    parser.add_argument('--model_name', type=str, default='cdna')
    parser.add_argument('--start_point', type=int, default=0)
    parser.add_argument('--no-gif', dest='save_gif', action='store_false')
    parser.set_defaults(save_gif=True)
    args = parser.parse_args()

    setup_seed(args.seed)
    device = 'cuda:%d' % args.gpu_id if torch.cuda.device_count() > 0 else 'cpu'
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # dataset setup
    train_set = VideoDataset(args.data_path, 'train', args.horizon,
                             fix_start=False)
    val_set = VideoDataset(args.data_path, 'val', args.horizon,
                           fix_start=True)
    config = train_set.get_config()
    H, W, C = config['observations']
    A = config['actions'][0]
    T = args.horizon
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.cpu_workers)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size,
                                             num_workers=args.cpu_workers)

    # model setup
    if args.model_name == 'cdna':
        model = CDNA(T, H, W, C, A)
    elif args.model_name == 'etd':
        model = ETD(H, W, C, A, T, 5)
    elif args.model_name == 'etds':
        model = ETDS(H, W, C, A, T, 5)
    elif args.model_name == 'etdm':
        model = ETDM(H, W, C, A, T, 5)
    elif args.model_name == 'etdsd':
        model = ETDSD(H, W, C, A, T, 5)
    else:
        # FIX: previously an unknown model name left `model` unbound and
        # crashed later with NameError; fail fast with a clear message.
        raise ValueError('unknown model_name: {}'.format(args.model_name))
    model.to(device)
    model_path = os.path.join(args.model_path,
                              '{}_{}'.format(args.model_name, args.horizon))
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    if args.start_point > 0:
        load_model(model,
                   os.path.join(model_path, '{}_{}.pt'.format(
                       args.model_name, args.start_point)),
                   eval_mode=False)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)

    # tensorboard
    writer = SummaryWriter()
    step = 0
    epoch = args.start_point
    while epoch < args.start_point + args.epoch:
        # FIX: explicitly switch to training mode each epoch (validation
        # below puts the model in eval mode).
        model.train()
        for j, data in enumerate(train_loader):
            observations = data['observations']
            actions = data['actions']
            # B x T ==> T x B
            observations = torch.transpose(observations, 0, 1).to(device)
            actions = torch.transpose(actions, 0, 1).to(device)
            predicted_observations = model(observations[0], actions)
            loss = mse_loss(observations,
                            predicted_observations) / args.batch_size
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
            opt.step()
            opt.zero_grad()
            # add summary
            if step % 100 == 0:
                writer.add_scalar('loss', loss.item(), global_step=step)
                writer.add_video('video',
                                 predicted_observations.permute(1, 0, 2, 3, 4),
                                 global_step=step, fps=10)
            step += 1
        epoch += 1
        save_model(model,
                   os.path.join(model_path,
                                '{}_{}.pt'.format(args.model_name, epoch)))
        gif_path = os.path.join(model_path, 'val_{}'.format(epoch))
        if not os.path.exists(gif_path):
            os.makedirs(gif_path)
        losses = []
        videos = []
        # FIX: validation previously ran in training mode with gradient
        # tracking enabled; use eval mode and torch.no_grad() (this also
        # makes the old post-loop opt.zero_grad() unnecessary).
        model.eval()
        with torch.no_grad():
            for j, data in enumerate(val_loader):
                observations = data['observations']
                actions = data['actions']
                # B x T ==> T x B
                observations = torch.transpose(observations, 0, 1).to(device)
                actions = torch.transpose(actions, 0, 1).to(device)
                predicted_observations = model(observations[0], actions)
                video = torch.cat([
                    observations[0, 0].unsqueeze(0),
                    predicted_observations[0:T - 1, 0]
                ])  # tensor[T, C, H, W]
                videos.append(video.unsqueeze(0).detach())
                if args.save_gif:
                    torch_save_gif(os.path.join(gif_path, "{}.gif".format(j)),
                                   video.detach().cpu(), fps=10)
                loss = mse_loss(observations,
                                predicted_observations).item() / args.batch_size
                losses.append(loss)
        videos = torch.cat(videos, 0)
        writer.add_video('val_video', videos, global_step=epoch, fps=10)
        print("-" * 50)
        print("In epoch {}, loss in val set is {}".format(
            epoch, np.mean(losses)))
        print("-" * 50)
def main():
    """Project entry point: parse hyperparameters, then train and evaluate
    a BERT tagging network, keeping the best checkpoint by eval F1."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=50, type=int,
                        help="train epochs")
    parser.add_argument("-b", "--BATCH", default=8, type=int,
                        help="batch size")
    args = parser.parse_args()

    # ------------------ device selection ----------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # ------------------ data preprocessing ----------------------
    dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
    network = Net.from_pretrained(arguments.bert_model,
                                  num_tag=len(arguments.labels)).to(device)
    logger.info('\n预处理结束!!!\n')

    # --------------------- optimizer -------------------------
    # No weight decay on biases and LayerNorm parameters.
    param_optimizer = list(network.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    t_total = int(dataset.get_train_length()
                  / arguments.gradient_accumulation_steps
                  / args.BATCH * args.EPOCHS)

    # --------------------- fp16 half precision ---------------------------
    if arguments.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=arguments.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if arguments.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=arguments.loss_scale)
    # ------------------------ fp32 ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=arguments.learning_rate,
                             warmup=arguments.warmup_proportion,
                             t_total=t_total)

    # --------------------- model setup ----------------------
    if arguments.fp16:
        network.half()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    best_f1 = 0
    start = time.time()
    global_step = 0

    for e in range(args.EPOCHS):
        network.train()
        for step in range(dataset.get_step() // args.EPOCHS):
            x_train, y_train = dataset.next_train_batch()
            batch = create_batch_iter(mode='train', X=x_train,
                                      y=y_train).dataset.tensors
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = network(input_ids, segment_ids, input_mask)
            train_loss = network.loss_fn(bert_encode=bert_encode,
                                         tags=label_ids,
                                         output_mask=output_mask)
            if arguments.gradient_accumulation_steps > 1:
                train_loss = train_loss / arguments.gradient_accumulation_steps
            if arguments.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()
            if (step + 1) % arguments.gradient_accumulation_steps == 0:
                def warmup_linear(x, warmup=0.002):
                    if x < warmup:
                        return x / warmup
                    return 1.0 - x
                # modify learning rate with the special warm up BERT uses
                lr_this_step = arguments.learning_rate * warmup_linear(
                    global_step / t_total, arguments.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            predicts = network.predict(bert_encode, output_mask)
            # Flatten labels and drop the -1 padding positions.
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            label_ids = label_ids.cpu()
            train_acc, f1 = network.acc_f1(predicts, label_ids)
            logger.info(
                "\n train_acc: %f - train_loss: %f - f1: %f - using time: %f - step: %d \n"
                % (train_acc, train_loss.item(), f1,
                   (time.time() - start), step))

        # ----------------------- evaluation ----------------------------
        network.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for eval_step in range(dataset.get_step() // args.EPOCHS):
                x_val, y_val = dataset.next_validation_batch()
                batch = create_batch_iter(mode='dev', X=x_val,
                                          y=y_val).dataset.tensors
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, output_mask = batch
                bert_encode = network(input_ids, segment_ids, input_mask).cpu()
                eval_los = network.loss_fn(bert_encode=bert_encode,
                                           tags=label_ids,
                                           output_mask=output_mask)
                eval_loss = eval_los + eval_loss
                count += 1
                predicts = network.predict(bert_encode, output_mask)
                y_predicts.append(predicts)
                # Flatten labels and drop the -1 padding positions.
                label_ids = label_ids.view(1, -1)
                label_ids = label_ids[label_ids != -1]
                y_labels.append(label_ids)
            eval_predicted = torch.cat(y_predicts, dim=0).cpu()
            eval_labeled = torch.cat(y_labels, dim=0).cpu()
            print('eval:')
            print(eval_predicted.numpy().tolist())
            print(eval_labeled.numpy().tolist())
            eval_acc, eval_f1 = network.acc_f1(eval_predicted, eval_labeled)
            network.class_report(eval_predicted, eval_labeled)
            logger.info(
                '\n\nEpoch %d - train_loss: %4f - eval_loss: %4f - train_acc:%4f - eval_acc:%4f - eval_f1:%4f\n'
                % (e + 1, train_loss.item(), eval_loss.item() / count,
                   train_acc, eval_acc, eval_f1))
            # Keep only the best checkpoint by F1.
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(network, arguments.output_dir)
            if e % 1 == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item() / count)
                eval_accuracy.append(eval_acc)