def _train(start_iteration, model, optimizer, device, train_dataloader, test_dataloader, args):
    train_loss = deque(maxlen=args.log_freq)
    test_loss = deque(maxlen=args.log_freq)
    model = model.to(device)
    start_time = time.perf_counter()
    # NOTE: these iterators are consumed directly below, so the dataloaders
    # are assumed to yield an effectively unbounded stream of batches.
    test_iter = iter(test_dataloader)
    train_iter = iter(train_dataloader)
    loss_func = partial(_loss_func, model=model, device=device)
    oclr = OneCycleLR(optimizer, args.learning_rate, pct_start=0.01,
                      total_steps=1_000_000, cycle_momentum=False,
                      last_epoch=start_iteration - 2)
    for iteration in range(start_iteration, 1 + args.num_training_steps):
        loss = loss_func(train_iter)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        oclr.step()
        train_loss.append(loss.detach())

        if iteration % (10 * args.log_freq) == 0:
            ckpt = f'checkpoint_{iteration:07d}.pt'
            print('Saving checkpoint', ckpt)
            torch.save(
                {
                    'iteration': iteration,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'args': args,
                }, ckpt)

        if iteration % 20 == 0:
            with torch.no_grad():
                model.eval()
                test_loss.append(loss_func(test_iter).detach())
                model.train()

        if iteration % args.log_freq == 0:
            avg_train_loss = sum(train_loss).item() / len(train_loss)
            avg_test_loss = sum(test_loss).item() / len(test_loss)
            end_time = time.perf_counter()
            duration, start_time = end_time - start_time, end_time
            lr = oclr.get_last_lr()[0]
            with torch.no_grad():
                model.eval()
                cat = random.randrange(0, len(dataset.categories))
                sample = generate(model, device, cat)
                model.train()
            train_sample = next(train_iter)[0, :]
            test_sample = next(test_iter)[0, :]
            plot_encoded_figure(train_sample[:, 0].tolist(), train_sample[0, 2], 'train_sample.png')
            plot_encoded_figure(test_sample[:, 0].tolist(), test_sample[0, 2], 'test_sample.png')
            plot_encoded_figure(sample, cat, 'random_sample.png')
            print(f'Iteration {iteration:07d} '
                  f'Train loss {avg_train_loss:.3f} '
                  f'Test loss {avg_test_loss:.3f} '
                  f'LR {lr:.3e} Duration {duration:.3f}')
            if args.use_wandb:
                wandb.log({
                    'iteration': iteration,
                    'train loss': avg_train_loss,
                    'test loss': avg_test_loss,
                    'duration': duration,
                    'learning rate': lr,
                    'train sample': wandb.Image('train_sample.png'),
                    'test sample': wandb.Image('test_sample.png'),
                    'random sample': wandb.Image('random_sample.png'),
                })
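The loop above only ever calls `loss_func(train_iter)`, so everything batch-specific lives in `_loss_func`, which is not shown here. A minimal sketch of a compatible helper, assuming each batch is a LongTensor of token ids and a shifted next-token cross-entropy objective (both assumptions, not confirmed by the source):

import torch
import torch.nn.functional as F

def _loss_func(data_iter, model, device):
    # Hypothetical sketch: the real batch layout and objective are not
    # shown in this file.
    batch = next(data_iter).to(device)        # [batch, seq] token ids (assumed)
    logits = model(batch[:, :-1])             # predict each token from its prefix
    return F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),  # [batch * (seq - 1), vocab]
        batch[:, 1:].reshape(-1))             # targets shifted by one position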
def train(args, training_features, model, tokenizer):
    """Train the model."""
    wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"),
               config=args, name=args.run_name)
    wandb.watch(model)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    else:
        amp = None

    # Model recovery: pick up from the latest checkpoint in output_dir.
    recover_step = utils.get_max_epoch_model(args.output_dir)
    # if recover_step:
    #     model_recover_checkpoint = os.path.join(args.output_dir, "model.{}.bin".format(recover_step))
    #     logger.info(" ** Recover model checkpoint in %s ** ", model_recover_checkpoint)
    #     model_state_dict = torch.load(model_recover_checkpoint, map_location='cpu')
    #     optimizer_recover_checkpoint = os.path.join(args.output_dir, "optim.{}.bin".format(recover_step))
    #     checkpoint_state_dict = torch.load(optimizer_recover_checkpoint, map_location='cpu')
    #     checkpoint_state_dict['model'] = model_state_dict
    # else:
    checkpoint_state_dict = None

    model.to(args.device)
    model, optimizer = prepare_for_training(args, model, checkpoint_state_dict, amp=amp)

    if args.n_gpu == 0 or args.no_cuda:
        per_node_train_batch_size = args.per_gpu_train_batch_size * args.gradient_accumulation_steps
    else:
        per_node_train_batch_size = args.per_gpu_train_batch_size * args.n_gpu * args.gradient_accumulation_steps

    train_batch_size = per_node_train_batch_size * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)
    global_step = recover_step if recover_step else 0

    if args.num_training_steps == -1:
        args.num_training_steps = int(args.num_training_epochs * len(training_features) / train_batch_size)
    if args.warmup_portion:
        args.num_warmup_steps = args.warmup_portion * args.num_training_steps

    if args.scheduler == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.num_warmup_steps,
            num_training_steps=args.num_training_steps,
            last_epoch=-1)
    elif args.scheduler == "constant":
        scheduler = get_constant_schedule(optimizer, last_epoch=-1)
    elif args.scheduler == "1cycle":
        scheduler = OneCycleLR(optimizer,
                               max_lr=args.learning_rate,
                               total_steps=args.num_training_steps,
                               pct_start=args.warmup_portion,
                               anneal_strategy=args.anneal_strategy,
                               final_div_factor=1e4,
                               last_epoch=-1)
    else:
        raise ValueError("Unknown scheduler: {}".format(args.scheduler))

    if checkpoint_state_dict:
        scheduler.load_state_dict(checkpoint_state_dict["lr_scheduler"])

    train_dataset = utils.Seq2seqDatasetForBert(
        features=training_features,
        max_source_len=args.max_source_seq_length,
        max_target_len=args.max_target_seq_length,
        vocab_size=tokenizer.vocab_size,
        cls_id=tokenizer.cls_token_id,
        sep_id=tokenizer.sep_token_id,
        pad_id=tokenizer.pad_token_id,
        mask_id=tokenizer.mask_token_id,
        random_prob=args.random_prob,
        keep_prob=args.keep_prob,
        offset=train_batch_size * global_step,
        num_training_instances=train_batch_size * args.num_training_steps,
    )

    logger.info("Check dataset:")
    for i in range(5):
        source_ids, target_ids, pseudo_ids, num_source_tokens, num_target_tokens = train_dataset[i]
        logger.info("Instance-%d" % i)
        logger.info("Source tokens = %s" % " ".join(tokenizer.convert_ids_to_tokens(source_ids)))
        logger.info("Target tokens = %s" % " ".join(tokenizer.convert_ids_to_tokens(target_ids)))
    logger.info("Model = %s" % str(model))

    # Train!
logger.info(" ***** Running training ***** *") logger.info(" Num examples = %d", len(training_features)) logger.info(" Num Epochs = %.2f", len(train_dataset) / len(training_features)) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Batch size per node = %d", per_node_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", train_batch_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", args.num_training_steps) if args.num_training_steps <= global_step: logger.info( "Training is done. Please use a new dir or clean this dir!") else: # The training features are shuffled train_sampler = SequentialSampler(train_dataset) \ if args.local_rank == -1 else DistributedSampler(train_dataset, shuffle=False) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=per_node_train_batch_size // args.gradient_accumulation_steps, collate_fn=utils.batch_list_to_batch_tensors) train_iterator = tqdm.tqdm(train_dataloader, initial=global_step, desc="Iter (loss=X.XXX, lr=X.XXXXXXX)", disable=args.local_rank not in [-1, 0]) model.train() model.zero_grad() tr_loss, logging_loss = 0.0, 0.0 for step, batch in enumerate(train_iterator): batch = tuple(t.to(args.device) for t in batch) inputs = { 'source_ids': batch[0], 'target_ids': batch[1], 'pseudo_ids': batch[2], 'num_source_tokens': batch[3], 'num_target_tokens': batch[4] } loss = model(**inputs) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training train_iterator.set_description( 'Iter (loss=%5.3f) lr=%9.7f' % (loss.item(), scheduler.get_last_lr()[0])) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() logging_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: wandb.log( { 'lr': scheduler.get_last_lr()[0], 'loss': logging_loss / args.logging_steps }, step=global_step) logger.info(" Step [%d ~ %d]: %.2f", global_step - args.logging_steps, global_step, logging_loss) logging_loss = 0.0 if args.local_rank in [-1, 0] and args.save_steps > 0 and \ (global_step % args.save_steps == 0 or global_step == args.num_training_steps): save_path = os.path.join(args.output_dir, "ckpt-%d" % global_step) os.makedirs(save_path, exist_ok=True) model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(save_path) # optim_to_save = { # "optimizer": optimizer.state_dict(), # "lr_scheduler": scheduler.state_dict(), # } # if args.fp16: # optim_to_save["amp"] = amp.state_dict() # torch.save( # optim_to_save, os.path.join(args.output_dir, 'optim.{}.bin'.format(global_step))) logger.info("Saving model checkpoint %d into %s", global_step, save_path) wandb.save(f'{save_path}/*')
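The `loss / args.gradient_accumulation_steps` scaling in the loop above is what makes N small backward passes equivalent to one large-batch update: the accumulated, scaled gradients sum to the gradient of the full-batch average loss. A self-contained sketch of the same pattern (toy model and data, not from the source):

import torch

torch.manual_seed(0)
model = torch.nn.Linear(4, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
x, y = torch.randn(8, 4), torch.randn(8, 1)

accum = 4
opt.zero_grad()
for xb, yb in zip(x.chunk(accum), y.chunk(accum)):
    loss = torch.nn.functional.mse_loss(model(xb), yb)
    (loss / accum).backward()  # scale so the summed grads average out
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
opt.step()                     # one optimizer step per accumulation window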
def train(args, writer):
    # 1. Data processing
    # Get the predefined fields and the split training dataset.
    # Each row of train_dataset is a torchtext.data.Example object whose
    # 'id', 'category' and 'news_text' attributes hold the values from the
    # corresponding CSV row. Nothing is numericalized yet; that happens
    # when the iterator is built.
    fields, train_dataset = build_and_cache_dataset(args, mode='train')
    # NEWS_TEXT and CATEGORY will hold vocabularies and are used later
    # when the iterator is constructed.
    ID, CATEGORY, NEWS_TEXT = fields

    # Pre-trained word vectors.
    vectors = Vectors(name=args.embed_path, cache=args.data_dir)
    # import gensim
    # word2vec = gensim.models.KeyedVectors.load_word2vec_format(args.embed_path, binary=True)

    # Build the dataset vocabulary and load the pre-trained vectors.
    # The vocabulary is stored as a Vocab object on the NEWS_TEXT field:
    # stoi maps words to indices and vectors is the matching embedding
    # matrix (the word mapped to 0 owns the first row of vectors, etc.).
    NEWS_TEXT.build_vocab(
        train_dataset,                        # build the vocabulary from the training set
        max_size=args.vocab_size,             # maximum vocabulary size
        vectors=vectors,                      # pull the matching rows from the pre-trained vectors
        unk_init=torch.nn.init.xavier_normal_,
    )
    # Build the label vocabulary, stored as a Vocab object on CATEGORY.
    CATEGORY.build_vocab(train_dataset)

    # Instantiate the model.
    model = TextClassifier(
        vocab_size=len(NEWS_TEXT.vocab),      # vocabulary size of the training split
        output_dim=args.num_labels,           # number of classes
        pad_idx=NEWS_TEXT.vocab.stoi[NEWS_TEXT.pad_token],  # index of '<pad>' in stoi
        dropout=args.dropout,
    )
    # Initialize the embedding matrix from NEWS_TEXT.vocab.vectors.
    # NOTE: Embedding.from_pretrained is a classmethod that returns a new
    # layer; it does not modify the existing one in place, so copy instead.
    model.embedding.weight.data.copy_(NEWS_TEXT.vocab.vectors)

    # Build the training iterator. This is where each Example's news_text
    # attribute is numericalized. Sentences within a batch are padded to
    # the longest sentence in that batch, and batch.news_text also records
    # the true length of each sentence.
    bucket_iterator = BucketIterator(
        train_dataset,
        batch_size=args.train_batch_size,
        sort_within_batch=True,               # sort inside each batch
        shuffle=True,                         # 2. shuffle across batches
        sort_key=lambda x: len(x.news_text),  # 1. sort by sentence length; x is one Example
        device=args.device,                   # move batches to the GPU
    )

    # 2. Training
    model.to(args.device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    # Learning rate changes over the course of training.
    scheduler = OneCycleLR(optimizer,
                           max_lr=args.learning_rate * 10,
                           epochs=args.num_train_epochs,
                           steps_per_epoch=len(bucket_iterator))
    global_step = 0
    model.zero_grad()
    # trange(i) is shorthand for tqdm(range(i)).
    train_trange = trange(0, args.num_train_epochs, desc="Train epoch")
    for _ in train_trange:
        epoch_iterator = tqdm(bucket_iterator, desc='Training')  # progress bar
        # One forward and backward pass per batch, then a parameter update.
        for step, batch in enumerate(epoch_iterator):
            model.train()
            # news_text: the whole batch as one tensor; each *column* is one
            # numericalized sentence [word1_idx, word2_idx, word3_idx, ...]
            # news_text_lengths: the true length of every sentence in the batch
            news_text, news_text_lengths = batch.news_text
            # print(batch.news_text)
            # print(news_text.shape)
            # print(news_text_lengths)
            category = batch.category  # label tensor
            # Forward pass
            preds = model(news_text, news_text_lengths)
            loss = criterion(preds, category)
            # Backward pass
            loss.backward()
            # Log the per-batch loss to tensorboard.
            writer.add_scalar('Train/Loss', loss.item(), global_step)
            # Log the per-batch learning rate to tensorboard.
            writer.add_scalar('Train/lr', scheduler.get_last_lr()[0], global_step)
            # NOTE: the optimizer must step before the scheduler.
            optimizer.step()
            scheduler.step()
            # Clear gradients before the next batch; the original only
            # zeroed them once before the loop, which would let them
            # accumulate across batches.
            optimizer.zero_grad()
            # Count how many batches have been used for updates.
            global_step += 1

            # Evaluate every args.logging_steps batches.
            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # Returns a dict with loss, precision, recall and f1 score.
                results = evaluate(args, model, CATEGORY.vocab, NEWS_TEXT.vocab)
                # Log the evaluation metrics to tensorboard.
                for key, value in results.items():
                    writer.add_scalar("Eval/{}".format(key), value, global_step)
            # Save the model every args.save_steps batches.
            if args.save_steps > 0 and global_step % args.save_steps == 0:
                save_model(args, model, optimizer, scheduler, global_step)
    writer.close()
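TextClassifier's definition is not shown, but the `(news_text, news_text_lengths)` calling convention together with `sort_within_batch=True` is the usual setup for `pack_padded_sequence`, which lets the RNN skip `<pad>` positions. A sketch of how such a model can consume the lengths (the class below is illustrative, not the project's actual TextClassifier):

import torch
import torch.nn as nn

class TinyClassifier(nn.Module):
    def __init__(self, vocab_size, output_dim, pad_idx, embed_dim=100, hidden=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden)
        self.fc = nn.Linear(hidden, output_dim)

    def forward(self, text, lengths):
        # text: [seq_len, batch] -- BucketIterator puts one sentence per column
        embedded = self.embedding(text)
        # lengths are sorted descending thanks to sort_within_batch=True,
        # so enforce_sorted=True is safe; lengths must live on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), enforce_sorted=True)
        _, (hidden, _) = self.lstm(packed)
        return self.fc(hidden[-1])  # logits for nn.CrossEntropyLoss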
class Detector(object):
    def __init__(self, cfg):
        self.device = cfg["device"]
        self.model = Models().get_model(cfg["network"])
        self.model.to(self.device)
        params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = AdamW(params, lr=0.00001)
        self.lr_scheduler = OneCycleLR(
            self.optimizer,
            max_lr=1e-4,
            epochs=cfg["nepochs"],
            steps_per_epoch=169,   # len(dataloader) / accumulation_steps
            div_factor=25,         # for initial lr, default: 25
            final_div_factor=1e3,  # for final lr, default: 1e4
        )

    def fit(self, data_loader, accumulation_steps=4, wandb=None):
        self.model.train()
        avg_loss = MetricLogger('scalar')
        total_loss = MetricLogger('dict')
        lr_log = MetricLogger('list')
        self.optimizer.zero_grad()
        device = self.device
        for i, (images, targets) in enumerate(data_loader):
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.detach().item()
            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)
            losses.backward()
            # Step the optimizer (and scheduler) once per accumulation window.
            if (i + 1) % accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()
                if self.lr_scheduler is not None:
                    self.lr_scheduler.step()
                    lr_log.update(self.lr_scheduler.get_last_lr())
            print(f"\rTrain iteration: [{i+1}/{len(data_loader)}]", end="")
            avg_loss.update(loss_value)
            total_loss.update(loss_dict)
        print()
        return {"train_avg_loss": avg_loss.avg}, total_loss.avg

    def mixup_fit(self, data_loader, accumulation_steps=4, wandb=None):
        self.model.train()
        torch.cuda.empty_cache()
        avg_loss = MetricLogger('scalar')
        total_loss = MetricLogger('dict')
        self.optimizer.zero_grad()
        device = self.device
        for i, (batch1, batch2) in enumerate(data_loader):
            images1, targets1 = batch1
            images2, targets2 = batch2
            images = mixup_images(images1, images2)
            targets = merge_targets(targets1, targets2)
            del images1, images2, targets1, targets2, batch1, batch2
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.detach().item()
            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)
            losses.backward()
            if (i + 1) % accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()
                if self.lr_scheduler is not None:
                    self.lr_scheduler.step()
            print(f"\rTrain iteration: [{i+1}/674]", end="")  # 674: hard-coded batch count
            avg_loss.update(loss_value)
            total_loss.update(loss_dict)
        print()
        return {"train_avg_loss": avg_loss.avg}, total_loss.avg

    def evaluate(self, val_dataloader):
        device = self.device
        torch.cuda.empty_cache()
        self.model.eval()
        mAp_logger = MetricLogger('list')
        with torch.no_grad():
            for j, batch in enumerate(val_dataloader):
                print(f"\rValidation: [{j+1}/{len(val_dataloader)}]", end="")
                images, targets = batch
                del batch
                images = [img.to(device) for img in images]
                # Targets stay on the CPU; only the boxes are compared below.
                predictions = self.model(images)
                for i, pred in enumerate(predictions):
                    probas = pred["scores"].detach().cpu().numpy()
                    mask = probas > 0.6
                    preds = pred["boxes"].detach().cpu().numpy()[mask]
                    gts = targets[i]["boxes"].detach().cpu().numpy()
                    score, scores = map_score(gts, preds,
                                              thresholds=[.5, .55, .6, .65, .7, .75])
                    mAp_logger.update(scores)
        print()
        return {"validation_mAP_score": mAp_logger.avg}

    def get_checkpoint(self):
        self.model.eval()
        checkpoint = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }
        # if self.lr_scheduler:
        #     checkpoint['lr_scheduler_state_dict'] = self.lr_scheduler.state_dict()
        return checkpoint

    def load_checkpoint(self, checkpoint):
        self.model.eval()
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
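Note that get_checkpoint deliberately omits the scheduler state (the commented-out lines above), so a restored run restarts the OneCycleLR schedule from its initial learning rate. A hypothetical save/restore round trip; the cfg values and file name below are placeholders, not from the source:

import torch

cfg = {"device": "cuda", "network": "fasterrcnn", "nepochs": 10}  # placeholder config
detector = Detector(cfg)

# ... train with detector.fit(...) ...
torch.save(detector.get_checkpoint(), "detector_ckpt.pt")

# Later, e.g. for inference or to resume training:
state = torch.load("detector_ckpt.pt", map_location=cfg["device"])
detector.load_checkpoint(state)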