def _create_optimizer(self, sgd): optimizer = AdamW( self._model.parameters(), lr=getattr(sgd, "pytt_lr", sgd.alpha), eps=sgd.eps, betas=(sgd.b1, sgd.b2), weight_decay=getattr(sgd, "pytt_weight_decay", 0.0), ) optimizer.zero_grad() return optimizer
def _create_optimizer(self, sgd): optimizer = AdamW( self._model.parameters(), lr=getattr(sgd, "pytt_lr", sgd.alpha), eps=sgd.eps, betas=(sgd.b1, sgd.b2), weight_decay=getattr(sgd, "pytt_weight_decay", 0.0), ) if getattr(sgd, "pytt_use_swa", False): optimizer = SWA(optimizer, swa_start=1, swa_freq=10, swa_lr=sgd.alpha) optimizer.zero_grad() return optimizer
def main(self):
    """Assemble data loaders, vocabulary, model, optimizer, scheduler and
    loss, then launch training via ``self.forward()``."""
    # Build the three dataset splits.
    self.trainLoader = MyDataLoader(self, mode='train').getdata()
    self.validLoader = MyDataLoader(self, mode='valid').getdata()
    self.testLoader = MyDataLoader(self, mode='test').getdata()
    # Vocabulary and pre-trained embedding matrix pickled under data_dir.
    self.word_dict = pkl.load(open(os.path.join(self.data_dir, 'word_dict.pkl'), 'rb'))
    self.vocab_size = len(self.word_dict)
    self.embedding_matrix = pkl.load(open(os.path.join(self.data_dir, 'emb.pkl'), 'rb'))
    self.model = myLSTM(self, device=self.device).to(self.device)
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    # NOTE(review): this grouping is built but never passed to AdamW below
    # (which receives self.model.parameters() instead), and both groups use
    # weight_decay 0 — confirm whether the grouping was meant to be used.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0}
    ]
    self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=float(self.adam_epsilon), weight_decay=1e-6)
    # Linear warmup over warmup_steps, then linear decay to 0 at t_total.
    self.scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=self.warmup_steps, t_total=self.epoch_size * self.trainLoader.__len__())
    self.criterion = nn.CrossEntropyLoss()
    self.forward()
def _create_optimizer(self, sgd): optimizer = AdamW( self._model.parameters(), lr=sgd.alpha, betas=(sgd.b1, sgd.b2), ) return optimizer
def tpu_training_loop(model, loader, device, context):
    """ Called by torch_xla_py.data_parallel. This function is executed on
    each core of the TPU once per epoch.

    Returns the mean per-step training loss (a Python float).
    NOTE(review): relies on free globals (`args`, `pbar_device`,
    `pbar_steps`, `num_train_optimization_steps`) defined elsewhere in
    the file.
    """
    param_optimizer = list(model.named_parameters())
    # Standard BERT recipe: no weight decay for biases / LayerNorm params.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # One optimizer and scheduler per TPU core. Both objects are saved in
    # `context` to be reused the next epoch.
    optimizer = context.getattr_or(
        'optimizer',
        AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=tuple(args.betas)))
    # Derive warmup info: either a proportion of total steps or an absolute count.
    if args.warmup_proportion is not None:
        warmup_steps = int(args.warmup_proportion * num_train_optimization_steps + 0.5)
    elif args.warmup_steps is not None:
        warmup_steps = args.warmup_steps
    else:
        raise Exception('What is the warmup?? Specify either warmup proportion or steps')
    scheduler = context.getattr_or(
        'scheduler',
        WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps))
    tr_loss = None
    pbar = None
    if str(pbar_device) == str(device):
        # All threads are in sync. Use progress bar only on one of them.
        pbar = tqdm(total=int(pbar_steps), desc=f"device {device}", dynamic_ncols=True)
    tracker = tpu_xm.RateTracker()
    model.train()
    for step, batch in enumerate(loader):
        input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
        outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
        loss = outputs[0]
        if args.gradient_accumulation_steps > 1:
            # Scale so accumulated gradients average over the micro-batches.
            loss = loss / args.gradient_accumulation_steps
        loss.sum().backward()  # for multiple tensors
        tracker.add(args.train_batch_size)
        # Accumulate the unscaled loss for the epoch-level average.
        tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps
        if pbar is not None:
            pbar.update(1)
            # pbar.set_description(desc=f'LR: {scheduler.get_lr()}')
        if (step + 1) % args.gradient_accumulation_steps == 0:
            # tpu_xm.optimizer_step also handles cross-core gradient reduction.
            tpu_xm.optimizer_step(optimizer)
            prev_lr = scheduler.get_last_lr()[0]
            scheduler.step()
            curr_lr = scheduler.get_last_lr()[0]
            if args.track_learning_rate:
                if pbar is not None:
                    pbar.set_description(f"Prev LR: {prev_lr} Curr LR: {curr_lr}")
            optimizer.zero_grad()
    # `.item()` requires a trip from TPU to CPU, which is very slow.
    # Use it only once per epoch.
    return tr_loss.sum().item() / step
def __call__(self, model, device, args): log = self._logger # Prepare optimizer and schedule (linear warmup and decay) optimization_steps = (len(self.train_dataloader) * args.epochs) // args.gradient_accumulation_steps no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=optimization_steps) # Train log.info(f"Training Started with parameters {args}") model.zero_grad() global_step = 1 for epoch in trange(args.epochs, desc="Epoch"): for step, batch in enumerate(tqdm(self.train_dataloader)): model.train() batch = tuple(t.to(device) for t in batch) # Send data to target device model_input = {'input_ids': batch[0], # word ids 'attention_mask': batch[1], # input mask 'token_type_ids': batch[2], # segment ids 'labels': batch[3]} # labels outputs = model(**model_input) train_loss = outputs[0] if args.gradient_accumulation_steps > 1: train_loss = train_loss / args.gradient_accumulation_steps train_loss.backward() # Accumulates the gradient before optimize the model if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm) # grad clip optimizer.step() scheduler.step() model.zero_grad() # Steps necessary to run the trained model into validation data set if (step + 1) % args.eval_steps == 0 and not args.eval_per_epoch: self.evaluate_on_val_set(epoch, global_step, optimization_steps, model, device, scheduler, args) global_step += 1 if args.eval_per_epoch: self.evaluate_on_val_set(epoch, global_step, optimization_steps, model, device, scheduler, args)
def setup_optim(named_params, learning_rate, adam_epsilon, warmup_steps, num_train_optim_steps):
    """Create the AdamW optimizer plus linear-warmup scheduler.

    ``named_params`` is typically ``model.named_parameters()``. Biases and
    LayerNorm parameters are exempted from weight decay, per the standard
    BERT training recipe.
    """
    skip_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    params = list(named_params)

    def _exempt(name):
        # True for parameters that should NOT receive weight decay.
        return any(tag in name for tag in skip_decay)

    groups = [
        {'params': [p for n, p in params if not _exempt(n)], 'weight_decay': 0.01},
        {'params': [p for n, p in params if _exempt(n)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(groups, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optim_steps)
    return optimizer, scheduler
def train():
    """Wire up loss, optimizer and scheduler on a BertFineTuning instance
    and launch training on the BERT train/valid splits."""
    tuner = BertFineTuning()
    tuner.criterion = nn.CrossEntropyLoss()
    tuner.optimizer = AdamW(tuner.parameters_main)
    tuner.scheduler = torch.optim.lr_scheduler.MultiStepLR(tuner.optimizer, milestones=[])
    loaders = MultiLoader()
    print("Training in progress ...")
    tuner.train(run, loaders.bert_train_split, loaders.bert_valid_split)
def get_scheduler_and_optimizer(self, parameters, train_tensor_data, logger):
    """Create the AdamW optimizer and linear-warmup scheduler for training.

    Parameters:
        parameters: dict of hyperparameters — uses ``train_batch_size``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``warmup_proportion`` and ``learning_rate``.
        train_tensor_data: training dataset; only its length is used.
        logger: logger used to report the derived step counts.

    Returns:
        (optimizer, scheduler) tuple.
    """
    model = self.model
    # Total optimizer updates = (examples / batch / accumulation) * epochs.
    num_train_optimization_steps = (int(
        len(train_tensor_data)
        / parameters["train_batch_size"]
        / parameters["gradient_accumulation_steps"])
        * parameters["num_train_epochs"])
    num_warmup_steps = int(
        num_train_optimization_steps * parameters["warmup_proportion"])

    # No weight decay for biases and LayerNorm params (standard BERT recipe).
    # (Fix: dropped the redundant `[n for n in param_optimizer]` re-copy.)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    # correct_bias=False reproduces the original BertAdam behavior.
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=parameters["learning_rate"],
        correct_bias=False,
    )
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=num_warmup_steps,
        t_total=num_train_optimization_steps,
    )
    logger.info(" Num optimization steps = %d", num_train_optimization_steps)
    logger.info(" Num warmup steps = %d", num_warmup_steps)
    return optimizer, scheduler
def get_bert_optimizer(models, type_optimization, learning_rate, fp16=False):
    """ Optimizes the network with AdamWithDecay.

    Parameters:
        models: iterable of modules whose parameters are optimized.
        type_optimization: key into ``patterns_optimizer`` selecting which
            parameter-name patterns are trainable.
        learning_rate: AdamW learning rate.
        fp16: wrap the optimizer for mixed-precision training when True.

    Raises:
        ValueError: if ``type_optimization`` is not a known pattern set.
    """
    if type_optimization not in patterns_optimizer:
        # Fix: previously this only printed and then fell through to an
        # opaque KeyError on the patterns lookup below. Fail fast instead.
        raise ValueError('Error. Type optimizer must be one of %s' %
                         (str(patterns_optimizer.keys())))
    parameters_with_decay = []
    parameters_with_decay_names = []
    parameters_without_decay = []
    parameters_without_decay_names = []
    # Biases and norm scale/shift parameters are exempt from weight decay.
    no_decay = ['bias', 'gamma', 'beta']
    patterns = patterns_optimizer[type_optimization]
    for model in models:
        for n, p in model.named_parameters():
            # Only parameters matching the selected patterns are optimized.
            if any(t in n for t in patterns):
                if any(t in n for t in no_decay):
                    parameters_without_decay.append(p)
                    parameters_without_decay_names.append(n)
                else:
                    parameters_with_decay.append(p)
                    parameters_with_decay_names.append(n)
    print('The following parameters will be optimized WITH decay:')
    print(ellipse(parameters_with_decay_names, 5, ' , '))
    print('The following parameters will be optimized WITHOUT decay:')
    print(ellipse(parameters_without_decay_names, 5, ' , '))
    optimizer_grouped_parameters = [
        {
            'params': parameters_with_decay,
            'weight_decay': 0.01
        },
        {
            'params': parameters_without_decay,
            'weight_decay': 0.0
        },
    ]
    # correct_bias=False reproduces BertAdam's behavior.
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      correct_bias=False)
    if fp16:
        optimizer = fp16_optimizer_wrapper(optimizer)
    return optimizer
def train():
    """Configure a BertFineTuning run (loss, AdamW optimizer, LR schedule)
    and train for 100 epochs on the BERT train/valid splits."""
    fine_tuner = BertFineTuning()
    fine_tuner.criterion = nn.CrossEntropyLoss()
    fine_tuner.optimizer = AdamW(fine_tuner.parameters_main)
    fine_tuner.scheduler = torch.optim.lr_scheduler.MultiStepLR(fine_tuner.optimizer, milestones=[])
    data = MultiLoader()
    print("Training in progress ...")
    fine_tuner.train(model_config,
                     data.bert_train_split,
                     data.bert_valid_split,
                     epochs=100,
                     print_every=100,
                     validate_at_epoch=0)
def setup_optim(named_params, learning_rate, adam_epsilon, warmup_steps, num_train_optim_steps):
    """Build the AdamW optimizer and warmup scheduler for training.

    Parameters:
        - named_params: iterable of (name, parameter) pairs, e.g.
          ``model.named_parameters()``
        - learning_rate: float, recommended 3e-5
        - adam_epsilon: float, recommended 1e-8
        - warmup_steps: int, training steps before regular learning is used
        - num_train_optim_steps: int, total batches over all epochs

    Returns:
        - optimizer: AdamW optimizer
        - scheduler: WarmupLinearSchedule
    """
    # Names matching any of these tags are exempt from weight decay.
    exempt_tags = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decayed, undecayed = [], []
    for name, param in list(named_params):
        bucket = undecayed if any(tag in name for tag in exempt_tags) else decayed
        bucket.append(param)
    grouped = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': undecayed, 'weight_decay': 0.0},
    ]
    optimizer = AdamW(grouped, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optim_steps)
    return optimizer, scheduler
def train(args, train_dataset, model, tokenizer):
    """ Train the model.

    Standard pytorch-transformers fine-tuning loop: AdamW with linear
    warmup/decay, optional fp16 via apex, optional DDP, gradient
    accumulation and clipping, periodic evaluation and checkpointing.

    Returns:
        (global_step, average training loss) tuple.

    Fix: inside the accumulation branch, ``optimizer.step()`` is now called
    BEFORE ``scheduler.step()`` — the original order advanced the LR
    schedule first, which skips the initial warmup LR and triggers the
    PyTorch >=1.1 "scheduler before optimizer" warning.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are exempt from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # FIX: update weights before advancing LR
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def train(self, train_category, dev_category, train_news, dev_news, tokenizer, Net=None, model=None):
    """Fine-tune a BERT-based classifier on the news dataset.

    Builds train/dev data loaders, instantiates the model variant named by
    ``self.arguments.model_name`` (unless ``model`` is given), trains with
    AdamW + linear warmup, periodically evaluates on the dev set, and saves
    the weights/config of the model with the best dev AUC.
    """
    # Remove any stale config so "config file exists" reliably signals
    # that a checkpoint was saved during this run.
    if os.path.exists(self.arguments.output_config_file) is True:
        os.remove(self.arguments.output_config_file)
    logger.info('>>train.shape: {} | dev.shape: {}'.format(
        train_category.shape, dev_category.shape))
    train_dataloader, train_examples_len = Util.load_data(
        news=train_news,
        category=train_category,
        data_type='train',
        label_list=self.arguments.label_list,
        max_length=self.arguments.max_seq_length,
        tokenizer=tokenizer,
        batch_size=self.arguments.BATCH)
    dev_dataloader, dev_examples_len = Util.load_data(
        news=dev_news,
        category=dev_category,
        data_type='dev',
        label_list=self.arguments.label_list,
        max_length=self.arguments.max_seq_length,
        tokenizer=tokenizer,
        batch_size=self.arguments.BATCH)
    num_train_optimization_steps = int(
        train_examples_len / self.arguments.BATCH /
        self.arguments.gradient_accumulation_steps) * self.arguments.EPOCHS

    # Model preparation: pick the network variant by name.
    logger.info("model name is {}".format(self.arguments.model_name))
    if model is None:
        if self.arguments.model_name == "BertOrigin":
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir)
        elif self.arguments.model_name == 'BertHAN':
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir)
        elif self.arguments.model_name == "BertCNN":
            filter_sizes = [
                int(val) for val in self.arguments.filter_sizes.split()
            ]
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                n_filters=self.arguments.filter_num,
                filter_sizes=filter_sizes,
                cache_dir=self.arguments.cache_dir)
        elif self.arguments.model_name == "BertATT":
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir)
        elif self.arguments.model_name == "BertRCNN":
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir,
                rnn_hidden_size=self.arguments.rnn_hidden_size,
                num_layers=self.arguments.num_layers,
                bidirectional=self.arguments.bidirectional,
                dropout=self.arguments.dropout)
        elif self.arguments.model_name == "BertCNNPlus":
            filter_sizes = [
                int(val) for val in self.arguments.filter_sizes.split()
            ]
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir,
                n_filters=self.arguments.filter_num,
                filter_sizes=filter_sizes)
    model.to(DEVICE)

    # Optimizer preparation.
    """ 优化器准备 """
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm parameters are exempt from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=self.arguments.learning_rate,
                      correct_bias=False)
    # PyTorch scheduler.
    # NOTE(review): warmup_proportion is passed where warmup_steps (a step
    # count) is expected — confirm the intended value.
    scheduler = WarmupLinearSchedule(
        optimizer=optimizer,
        warmup_steps=self.arguments.warmup_proportion,
        t_total=num_train_optimization_steps)

    # Loss-function preparation: label smoothing or plain cross-entropy.
    """ 损失函数准备 """
    if self.arguments.use_label_smoothing:
        criterion = NMTCriterion(
            label_smoothing=self.arguments.label_smoothing)
    else:
        criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(DEVICE)

    best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0
    for epoch in range(int(self.arguments.EPOCHS)):
        if early_stop_times >= self.arguments.early_stop * (
                train_examples_len // self.arguments.BATCH):
            break

        logger.info(f'---------------- Epoch: {epoch + 1:02} ----------')

        for step, batch in enumerate(
                tqdm(train_dataloader, desc="Iteration")):
            model.train()
            if self.arguments.label_smoothing:
                criterion.train()
            batch = tuple(t.to(DEVICE) for t in batch)
            _, input_ids, input_mask, segment_ids, label_ids = batch
            logits = model(input_ids, segment_ids, input_mask, labels=None)
            loss = criterion(inputs=logits,
                             labels=label_ids,
                             normalization=1.0,
                             reduce=False)

            # Correction: scale the loss when accumulating gradients.
            if self.arguments.gradient_accumulation_steps > 1:
                loss = loss / self.arguments.gradient_accumulation_steps
            loss.backward(torch.ones_like(loss))
            # NOTE(review): scheduler.step() runs every micro-batch, before
            # optimizer.step() — confirm this LR-advance rate is intended.
            scheduler.step()
            if (step + 1) % self.arguments.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                if global_step % self.arguments.print_step == 0 and global_step != 0:
                    dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate(
                        model,
                        dev_dataloader,
                        criterion,
                        DEVICE,
                        self.arguments.label_list,
                        args=self.arguments)
                    logger.info('\n>>>dev report: \n{}'.format(dev_report))
                    # Track the best accuracy seen so far.
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                    # Checkpoint on best AUC; otherwise count toward early stop.
                    if dev_auc > best_auc:
                        best_auc = dev_auc
                        # Save the model (unwrap DataParallel if needed).
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        torch.save(model_to_save.state_dict(),
                                   self.arguments.output_model_file)
                        with open(self.arguments.output_config_file, 'w') as f:
                            f.write(model_to_save.config.to_json_string())
                        early_stop_times = 0
                    else:
                        early_stop_times += 1
    # Fallback: if no checkpoint was saved during training, save the final model.
    if os.path.exists(self.arguments.output_config_file) is False:
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(),
                   self.arguments.output_model_file)
        with open(self.arguments.output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_feats['data']) // opt.batchSize * opt.max_epoch optimizer = AdamW( optimizer_grouped_parameters, lr=opt.lr, correct_bias=False ) # To reproduce BertAdam specific behavior set correct_bias=False scheduler = WarmupLinearSchedule( optimizer, warmup_steps=int(opt.warmup_proportion * num_train_optimization_steps), t_total=num_train_optimization_steps) # PyTorch scheduler # prepare_inputs_for_bert(sentences, word_lengths) def decode(data_feats, data_tags, data_class, output_path): data_index = np.arange(len(data_feats)) losses = [] TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0 TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0 with open(output_path, 'w') as f:
def main():
    """End-to-end multi-label BERT fine-tuning driver: load and split the
    raw data, build datasets/loaders, create the model, optimizer and
    scheduler, wire up callbacks, and run the Trainer."""
    # **************************** Basic setup ***********************
    logger = init_logger(log_name=config['model']['arch'], log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    # NOTE(review): this produces e.g. "cuda: 0" (with a space) or "cuda: cpu"
    # when no GPU is configured — confirm downstream code accepts this format.
    device = f"cuda: {config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'}"
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** Data preparation ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    # Read the dataset and split into train/valid.
    targets, sentences = DT.read_data(raw_data_path=config['data']['raw_data_path'],
                                      preprocessor=EnglishPreProcessor(),
                                      is_train=True)
    train, valid = DT.train_val_split(X=sentences, y=targets, save=True, shuffle=True, stratify=False,
                                      valid_size=config['train']['valid_size'],
                                      train_path=config['data']['train_file_path'],
                                      valid_path=config['data']['valid_file_path'])
    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])
    # Training examples.
    train_dataset = CreateDataset(data=train,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='train')
    # Validation examples.
    valid_dataset = CreateDataset(data=valid,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='valid')
    # Training data loader.
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=True,
                              drop_last=False,
                              pin_memory=False)
    # Validation data loader.
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=False,
                              drop_last=False,
                              pin_memory=False)

    # **************************** Model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # ************************** Optimizer *************************
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm parameters are exempt from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_train_steps = int(
        len(train_dataset.examples) / config['train']['batch_size'] /
        config['train']['gradient_accumulation_steps'] * config['train']['epochs'])
    # t_total: total number of training steps for the learning rate schedule
    # warmup: portion of t_total for the warmup
    # NOTE(review): optimizer_grouped_parameters is built above but AdamW is
    # given no weight-decay grouping here — confirm which was intended.
    optimizer = AdamW(optimizer_grouped_parameters, lr=config['train']['learning_rate'])
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=config['train']['warmup_steps'],
                                     t_total=num_train_steps)

    # **************************** Callbacks ***********************
    logger.info("initializing callbacks")
    # Model checkpointing.
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['output']['checkpoint_dir'],
                                       mode=config['callbacks']['mode'],
                                       monitor=config['callbacks']['monitor'],
                                       save_best_only=config['callbacks']['save_best_only'],
                                       arch=config['model']['arch'],
                                       logger=logger)
    # Training-progress monitoring (figures).
    train_monitor = TrainingMonitor(file_dir=config['output']['figure_dir'],
                                    arch=config['model']['arch'])
    # Learning-rate schedule callback.
    lr_scheduler = BertLR(optimizer=optimizer,
                          learning_rate=config['train']['learning_rate'],
                          t_total=num_train_steps,
                          warmup=config['train']['warmup_steps'])

    # **************************** Training ***********************
    logger.info('training model....')
    train_configs = {
        'model': model,
        'logger': logger,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'resume': config['train']['resume'],
        'epochs': config['train']['epochs'],
        'n_gpu': config['train']['n_gpu'],
        'gradient_accumulation_steps': config['train']['gradient_accumulation_steps'],
        'epoch_metrics': [F1Score(average='micro', task_type='binary'), MultiLabelReport(id2label=id2label)],
        'batch_metrics': [AccuracyThresh(thresh=0.5)],
        'criterion': BCEWithLogLoss(),
        'model_checkpoint': model_checkpoint,
        'training_monitor': train_monitor,
        'lr_scheduler': lr_scheduler,
        'early_stopping': None,
        'verbose': 1
    }
    trainer = Trainer(train_configs=train_configs)
    # Fit the model.
    trainer.train(train_data=train_loader, valid_data=valid_loader)
    # Release GPU memory.
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] optimizer = AdamW(optimizer_grouped_parameters, lr=params['lr']) scheduler = WarmupLinearScheduleNonZero(optimizer, warmup_steps=10000, t_total=200000) start_iter_id = 0 if params['start_path']: pretrained_dict = torch.load(params['start_path']) if not params['continue']: if 'model_state_dict' in pretrained_dict: pretrained_dict = pretrained_dict['model_state_dict'] model_dict = dialog_encoder.state_dict() pretrained_dict = {
def main():
    """Pre-train a vision-language BERT model on Conceptual Captions.

    Supports single-GPU, multi-GPU (DataParallel) and distributed (apex DDP)
    training, optional resume from a checkpoint, per-epoch validation, and
    TensorBoard logging performed only by the default (rank-0) process.
    """
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")  # Init distributed backend for sychronizing nodes/GPUs
    # Only the rank-0 process (or any process when not distributed) logs/saves.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}")

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Output dirs
    # NOTE(review): assumes config_file looks like "<dir>/<name>.json" with at
    # least one "/" — a bare filename would make split("/")[1] raise IndexError.
    timestamp = args.config_file.split("/")[1].split(".")[0]
    save_path = os.path.join(args.output_dir, timestamp)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if default_gpu:
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # save all the hidden parameters.
        with open(os.path.join(save_path, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    cache = 5000
    # Per-forward-pass batch size: total batch is split over accumulation
    # steps (and over replicas when distributed).
    args.train_batch_size = args.train_batch_size // args.grad_acc_steps
    if dist.is_available() and args.local_rank != -1:
        num_replicas = dist.get_world_size()
        args.train_batch_size = args.train_batch_size // num_replicas
        args.num_workers = args.num_workers // num_replicas
        cache = cache // num_replicas

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Datasets
    tokenizer = AutoTokenizer.from_pretrained(config.bert_model, do_lower_case=config.do_lower_case)
    train_dataset = ConceptCapLoaderTrain(args.annotations_path, args.features_path, tokenizer,
                                          seq_len=args.max_seq_length,
                                          batch_size=args.train_batch_size,
                                          num_workers=args.num_workers,
                                          local_rank=args.local_rank,
                                          objective=args.objective,
                                          cache=cache,
                                          add_global_imgfeat=config.add_global_imgfeat,
                                          num_locs=config.num_locs)
    valid_dataset = ConceptCapLoaderVal(args.annotations_path, args.features_path, tokenizer,
                                        seq_len=args.max_seq_length,
                                        batch_size=args.train_batch_size,
                                        num_workers=2,
                                        objective=args.objective,
                                        add_global_imgfeat=config.add_global_imgfeat,
                                        num_locs=config.num_locs)

    # Task details
    task_names = ["Conceptual_Caption"]
    task_ids = ["TASK0"]
    task2num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size}

    # Logging
    logdir = os.path.join(args.logdir, timestamp)
    if default_gpu:
        tb_logger = tbLogger(logdir, save_path, task_names, task_ids, task2num_iters, args.grad_acc_steps)
    else:
        tb_logger = None

    # Model
    if args.from_pretrained:
        # Temporarily force 2 token types so the HF checkpoint loads, then
        # resize the type embedding table back to the configured size.
        type_vocab_size = config.type_vocab_size
        config.type_vocab_size = 2
        model = BertForVLPreTraining.from_pretrained(args.from_pretrained, config=config,
                                                     default_gpu=default_gpu, from_hf=True)
        # Resize type embeddings
        model.bert.embeddings.token_type_embeddings = \
            model._get_resized_embeddings(model.bert.embeddings.token_type_embeddings, type_vocab_size)
        config.type_vocab_size = type_vocab_size
    else:
        model = BertForVLPreTraining(config)

    # Optimization details
    freeze_layers(model)
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    bert_weight_name = json.load(open("config/" + args.from_pretrained + "_weight_name.json", "r"))
    if not args.from_pretrained:
        # Fresh model: two groups, with/without weight decay.
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay},
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
    else:
        # Pretrained model: one group per parameter; pretrained BERT weights
        # (name matched after stripping the 12-char module prefix) get a 10x
        # smaller learning rate.
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate
                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.0}]
                if not any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": args.weight_decay}]
        if default_gpu:
            print(len(list(model.named_parameters())), len(optimizer_grouped_parameters))
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon,
                      betas=args.adam_betas)
    num_train_optimization_steps = int(
        train_dataset.num_dataset / args.train_batch_size / args.grad_acc_steps
    ) * args.num_train_epochs
    # Explicit warmup_steps wins; otherwise derive from the proportion.
    # (NOTE(review): warmup_steps == 0 also falls through to the proportion.)
    warmup_steps = args.warmup_steps or args.warmup_proportion * num_train_optimization_steps
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)

    # Resume training
    start_iter_id, global_step, start_epoch, tb_logger, _ = \
        resume(args.resume_file, model, optimizer, scheduler, tb_logger)

    # Move to GPU(s); optimizer state tensors restored from a checkpoint must
    # be moved as well.
    model.cuda()
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Save starting model
    save(save_path, logger, -1, model, optimizer, scheduler, global_step, tb_logger, default_gpu, -1)

    # Print summary
    if default_gpu:
        summary_parameters(model, logger)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", train_dataset.num_dataset)
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

    # Train
    for epoch_id in range(start_epoch, int(args.num_train_epochs)):
        model.train()
        for step, batch in enumerate(train_dataset):
            iter_id = start_iter_id + step + (epoch_id * len(train_dataset))
            # Last batch element is dropped (batch[:-1]) before moving to GPU
            # — presumably a non-tensor field; verify against the loader.
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_match, \
                image_feat, image_loc, image_cls, obj_labels, obj_confs, \
                attr_labels, attr_confs, image_attrs, image_label, image_mask = batch
            if args.objective == 1:
                # Ignore labels (setting them to -1) for mismatched caption-image pairs
                image_label = image_label * (is_match == 0).long().unsqueeze(1)
                image_label[image_label == 0] = -1
                lm_label_ids = lm_label_ids * (is_match == 0).long().unsqueeze(1)
                lm_label_ids[lm_label_ids == 0] = -1
            masked_loss_t, masked_loss_v, pair_match_loss = model(input_ids, image_feat, image_loc, segment_ids,
                                                                  input_mask, image_mask, lm_label_ids, image_label,
                                                                  image_cls, obj_labels, obj_confs, attr_labels,
                                                                  attr_confs, image_attrs, is_match)
            if args.objective == 2:
                # Objective 2 drops the image-text matching loss (kept at 0 so
                # the sum below stays well-defined on all replicas).
                pair_match_loss = pair_match_loss * 0
            loss = masked_loss_t + masked_loss_v + pair_match_loss
            if n_gpu > 1:
                # DataParallel returns one loss per GPU; average them.
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                pair_match_loss = pair_match_loss.mean()
            if args.grad_acc_steps > 1:
                loss = loss / args.grad_acc_steps
            loss.backward()
            if (step + 1) % args.grad_acc_steps == 0:
                # Clip gradient
                if args.clip_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                if default_gpu:
                    tb_logger.step_train_CC(epoch_id, iter_id,
                                            float(masked_loss_t), float(masked_loss_v), float(pair_match_loss),
                                            optimizer.param_groups[0]["lr"], "TASK0", "train")
            if (step % (20 * args.grad_acc_steps) == 0) and step != 0 and default_gpu:
                tb_logger.showLossTrainCC()

        # Do the evaluation
        torch.set_grad_enabled(False)
        numBatches = len(valid_dataset)
        model.eval()
        for step, batch in enumerate(valid_dataset):
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_match, \
                image_feat, image_loc, image_cls, obj_labels, obj_confs, \
                attr_labels, attr_confs, image_attrs, image_label, image_mask = batch
            batch_size = input_ids.size(0)
            masked_loss_t, masked_loss_v, pair_match_loss = model(input_ids, image_feat, image_loc, segment_ids,
                                                                  input_mask, image_mask, lm_label_ids, image_label,
                                                                  image_cls, obj_labels, obj_confs, attr_labels,
                                                                  attr_confs, image_attrs, is_match)
            loss = masked_loss_t + masked_loss_v + pair_match_loss
            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                pair_match_loss = pair_match_loss.mean()
            if default_gpu:
                # NOTE(review): iter_id here is the last *training* iteration
                # id of the epoch, reused for every validation batch.
                tb_logger.step_val_CC(epoch_id, iter_id,
                                      float(masked_loss_t), float(masked_loss_v), float(pair_match_loss),
                                      "TASK0", batch_size, "val")
                sys.stdout.write("%d / %d \r" % (step, numBatches))
                sys.stdout.flush()
        if default_gpu:
            tb_logger.showLossValCC()
        torch.set_grad_enabled(True)
        # Checkpoint once per epoch (loss is the last validation batch's loss).
        save(save_path, logger, epoch_id, model, optimizer, scheduler, global_step, tb_logger, default_gpu, loss)

    if default_gpu:
        tb_logger.txt_close()
def main():
    """Fine-tune ViLBERT (or a single-stream baseline) on a downstream task.

    Parses CLI arguments plus ``task_config.yml``, builds per-parameter
    optimizer groups, supports several LR schedulers, resumes from a
    checkpoint, and runs the train/evaluate loop with per-epoch saving.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_base_6layer_6conect.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=20,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--train_iter_multiplier",
        default=1.0,
        type=float,
        help="multiplier for the multi-task training.",
    )
    parser.add_argument(
        "--train_iter_gap",
        default=4,
        type=int,
        help="forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    # NOTE(review): type=bool on argparse does not parse "False" as False —
    # any non-empty string is truthy; this flag is effectively always True
    # when passed on the command line.
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Whether to lower case the input text. "
        "True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument(
        "--seed", type=int, default=0, help="random seed for initialization"
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=16,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--save_name", default="", type=str, help="save name for training."
    )
    parser.add_argument(
        "--in_memory",
        default=False,
        type=bool,
        help="whether use chunck for parallel training.",
    )
    parser.add_argument(
        "--optim", default="AdamW", type=str, help="what to use for the optimization."
    )
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.",
    )
    parser.add_argument(
        "--vision_scratch",
        action="store_true",
        help="whether pre-trained the image or not.",
    )
    parser.add_argument(
        "--evaluation_interval", default=1, type=int, help="evaluate very n epoch."
    )
    parser.add_argument(
        "--lr_scheduler",
        default="mannul",
        type=str,
        help="whether use learning rate scheduler.",
    )
    parser.add_argument(
        "--baseline",
        action="store_true",
        help="whether use single stream baseline.",
    )
    parser.add_argument(
        "--resume_file", default="", type=str, help="Resume from checkpoint"
    )
    parser.add_argument(
        "--dynamic_attention",
        action="store_true",
        help="whether use dynamic attention.",
    )
    parser.add_argument(
        "--clean_train_sets",
        default=True,
        type=bool,
        help="whether clean train sets for multitask data.",
    )
    parser.add_argument(
        "--visual_target",
        default=0,
        type=int,
        help="which target to use for visual branch. \
        0: soft label, \
        1: regress the feature, \
        2: NCE loss.",
    )
    args = parser.parse_args()
    with open("task_config.yml", "r") as f:
        task_cfg = edict(yaml.safe_load(f))

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Pick the model family; imports are deferred so only one family's modules
    # are loaded.
    if args.baseline:
        from pytorch_transformers.modeling_bert import BertConfig
        from src.models.basebert import BaseBertForVLTasks
    else:
        from src.models.vilbert import BertConfig
        from src.models.vilbert import VILBertForVLTasks

    name = task_cfg["name"]
    task_lr = task_cfg["lr"]
    base_lr = task_lr
    # NOTE(review): base_lr == task_lr, so loss_scale is always 1.0 here.
    loss_scale = task_lr / base_lr

    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    # Output path derived from the config file name plus optional suffix.
    timeStamp = (
        args.config_file.split("/")[1].split(".")[0]
        + prefix
    )
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r")
    )

    # Devices
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        )
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16
        )
    )

    # Only rank-0 (or the single process) saves/logs.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    # load dataset
    task_batch_size, task_num_iters, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets(
        args, task_cfg
    )

    logdir = os.path.join(savePath, "logs")
    tbLogger = utils.tbLogger(
        logdir,
        savePath,
        task_num_iters,
        args.gradient_accumulation_steps,
    )

    # Visual target 0 uses the 1601-way detector class distribution; other
    # targets regress the 2048-d feature (or use NCE).
    if args.visual_target == 0:
        config.v_target_size = 1601
        config.visual_target = args.visual_target
    else:
        config.v_target_size = 2048
        config.visual_target = args.visual_target

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # NOTE(review): the two dict initialisations below are immediately
    # overwritten — dead code kept as-is.
    task_ave_iter = {}
    task_stop_controller = {}
    task_ave_iter = int(
        task_cfg["num_epoch"]
        * task_num_iters
        * args.train_iter_multiplier
        / args.num_train_epochs
    )
    task_stop_controller = utils.TaskStopOnPlateau(
        mode="max",
        patience=1,
        continue_threshold=0.005,
        cooldown=1,
        threshold=0.001,
    )
    median_num_iter = task_ave_iter
    num_train_optimization_steps = (
        median_num_iter * args.num_train_epochs // args.gradient_accumulation_steps
    )
    num_labels = task_datasets_train.num_labels

    if args.dynamic_attention:
        config.dynamic_attention = True

    model = VILBertForVLTasks.from_pretrained(
        args.from_pretrained,
        config=config,
        num_labels=num_labels,
        default_gpu=default_gpu,
    )

    task_losses = LoadLosses(args, task_cfg)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    # Optionally freeze the textual-stream embeddings and the first
    # `args.freeze` encoder layers.
    if args.freeze != -1:
        bert_weight_name_filtered = []
        # NOTE(review): this loop shadows the outer `name` (task name) read
        # from task_cfg above — confirm `name` is not used again afterwards.
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            # key[12:] strips the model-prefix before matching the BERT
            # weight-name list — presumably "bert.encoder." sized; verify.
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False
        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    # One optimizer group per trainable parameter: task heads ("vil_") get a
    # fixed 1e-4 LR; backbone gets base_lr (or 1e-4 for non-pretrained vision
    # weights when --vision_scratch); decay is 0 for no_decay params.
    optimizer_grouped_parameters = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if "vil_" in key:
                lr = 1e-4
            else:
                if args.vision_scratch:
                    if key[12:] in bert_weight_name:
                        lr = base_lr
                    else:
                        lr = 1e-4
                else:
                    lr = base_lr
            if any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [
                    {"params": [value], "lr": lr, "weight_decay": 0.0}
                ]
            if not any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [
                    {"params": [value], "lr": lr, "weight_decay": 0.01}
                ]
    if default_gpu:
        print(len(list(model.named_parameters())), len(optimizer_grouped_parameters))

    # choose optimizer
    if args.optim == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False)
    elif args.optim == "RAdam":
        optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr)

    # choose scheduler
    warmpu_steps = args.warmup_proportion * num_train_optimization_steps
    if args.lr_scheduler == "warmup_linear":
        warmup_scheduler = WarmupLinearSchedule(
            optimizer, warmup_steps=warmpu_steps, t_total=num_train_optimization_steps
        )
    else:
        warmup_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmpu_steps)

    lr_reduce_list = np.array([5, 7])
    if args.lr_scheduler == "automatic":
        lr_scheduler = ReduceLROnPlateau(
            optimizer, mode="max", factor=0.2, patience=1, cooldown=1, threshold=0.001
        )
    elif args.lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingLR(
            optimizer, T_max=median_num_iter * args.num_train_epochs
        )
    elif args.lr_scheduler == "cosine_warm":
        lr_scheduler = CosineAnnealingWarmRestarts(
            optimizer, T_0=median_num_iter * args.num_train_epochs
        )
    elif args.lr_scheduler == "mannul":

        def lr_lambda_fun(epoch):
            # Multiply LR by 0.2 at each milestone in lr_reduce_list.
            return pow(0.2, np.sum(lr_reduce_list <= epoch))

        lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun)

    # Resume from checkpoint (strips an optional DataParallel "module."
    # prefix from the saved state dict).
    startIterID = 0
    global_step = 0
    start_epoch = 0
    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace("module.", "", 1)] = checkpoint[
                    "model_state_dict"
                ][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler_state_dict"])
        # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict'])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        start_epoch = int(checkpoint["epoch_id"]) + 1
        task_stop_controller = checkpoint["task_stop_controller"]
        tbLogger = checkpoint["tb_logger"]
        del checkpoint

    model.to(device)
    print("`==============`MODEL=============")
    print(next(model.parameters()).is_cuda)#False

    # Optimizer state restored on CPU must follow the model to the GPU.
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        print("***** Running training *****")
        print(" Num Iters: ", task_num_iters)
        print(" Batch size: ", task_batch_size)
        print(" Num steps: %d" % num_train_optimization_steps)

    task_iter_train = None
    task_count = 0
    for epochId in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch", ncols=100):
        model.train()
        for step in range(median_num_iter):
            iterId = startIterID + step + (epochId * median_num_iter)
            first_task = True
            is_forward = False
            # Skip most iterations while the stop-controller says the task has
            # plateaued, but still forward every `train_iter_gap`-th iteration.
            if (not task_stop_controller.in_stop) or (
                iterId % args.train_iter_gap == 0
            ):
                is_forward = True
            if is_forward:
                loss, score = ForwardModelsTrain(
                    args,
                    task_cfg,
                    device,
                    task_count,
                    task_iter_train,
                    task_dataloader_train,
                    model,
                    task_losses,
                )
                loss = loss * loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # Manual LR warmup for fp16 (bypasses the scheduler).
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion,
                        )
                        for param_group in optimizer.param_groups:
                            param_group["lr"] = lr_this_step
                    if first_task and (
                        global_step < warmpu_steps
                        or args.lr_scheduler == "warmup_linear"
                    ):
                        warmup_scheduler.step()
                    optimizer.step()
                    model.zero_grad()
                    if first_task:
                        global_step += 1
                        first_task = False
                    if default_gpu:
                        tbLogger.step_train(
                            epochId,
                            iterId,
                            float(loss),
                            float(score),
                            optimizer.param_groups[0]["lr"],
                            "train",
                        )
            if "cosine" in args.lr_scheduler and global_step > warmpu_steps:
                lr_scheduler.step()
            if (
                step % (20 * args.gradient_accumulation_steps) == 0
                and step != 0
                and default_gpu
            ):
                tbLogger.showLossTrain()
            # decided whether to evaluate on SNLI tasks.
            if (iterId != 0 and iterId % task_num_iters == 0) or (
                epochId == args.num_train_epochs - 1 and step == median_num_iter - 1
            ):
                evaluate(
                    args,
                    task_dataloader_val,
                    task_stop_controller,
                    task_cfg,
                    device,
                    model,
                    task_losses,
                    epochId,
                    default_gpu,
                    tbLogger,
                )

        if args.lr_scheduler == "automatic":
            # NOTE(review): `val_scores` is not defined anywhere in this
            # function — this branch would raise NameError. Presumably
            # evaluate() was meant to return it; confirm before enabling
            # --lr_scheduler automatic.
            lr_scheduler.step(sum(val_scores.values()))
            logger.info("best average score is %3f" % lr_scheduler.best)
        elif args.lr_scheduler == "mannul":
            lr_scheduler.step()

        if epochId in lr_reduce_list:
            # reset the task_stop_controller once the lr drop
            task_stop_controller._reset()

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model it-self
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin"
            )
            output_checkpoint = os.path.join(savePath, "pytorch_ckpt_latest.tar")
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save(
                {
                    "model_state_dict": model_to_save.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "warmup_scheduler_state_dict": warmup_scheduler.state_dict(),
                    # 'lr_scheduler_state_dict': lr_scheduler.state_dict(),
                    "global_step": global_step,
                    "epoch_id": epochId,
                    "task_stop_controller": task_stop_controller,
                    "tb_logger": tbLogger,
                },
                output_checkpoint,
            )
    tbLogger.txt_close()
def main():
    """Train a BERT masked-LM on pre-generated training data.

    Reads epoch shards produced by ``pregenerate_training_data.py``, patches a
    ``[NOI]`` token into the tokenizer vocabulary, and runs the standard
    AdamW + linear-warmup training loop with periodic checkpointing.

    Fixes applied in review:
      * the uncased-model check used ``== 'bert-base-uncased' or
        'bert-large-uncased'`` which is always truthy (a non-empty string
        literal); replaced with a membership test,
      * the tokenizer-patching retry loop used a bare ``except:`` which also
        swallowed ``KeyboardInterrupt``/``SystemExit``; narrowed to
        ``except Exception``,
      * ``optimizer.step()`` is now called before ``scheduler.step()`` —
        stepping the LR scheduler first skips the initial LR value and is
        the order PyTorch explicitly warns against.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help="Store training data as on-disc memmaps to massively reduce memory usage"
    )
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--wp", type=bool, default=False, help="if train on wp")
    parser.add_argument(
        '--from_scratch', action='store_true',
        help='do not load prtrain model, only random initialize')
    parser.add_argument("--output_step", type=int, default=100000,
                        help="Number of step to save model")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    # Count how many epoch shards actually exist; if fewer than requested we
    # loop over the available data.
    samples_per_epoch = []
    num_data_epochs = args.epochs
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break

    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    args.output_mode = "classification"

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # Per-forward-pass batch size after splitting over accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Patch a [NOI] token into the vocabulary, retrying on transient failures
    # (e.g. download errors). NOTE: a persistent failure still loops forever —
    # consider bounding the retries.
    while True:
        try:
            tokenizer = BertTokenizer.from_pretrained(
                args.bert_model, do_lower_case=args.do_lower_case)
            if tokenizer._noi_token is None:
                tokenizer._noi_token = '[NOI]'
                # FIX: the original condition was
                # `args.bert_model == 'bert-base-uncased' or 'bert-large-uncased'`,
                # which is always True because the bare string literal is truthy;
                # cased models therefore never reached the [unused1] branch.
                if args.bert_model in ('bert-base-uncased', 'bert-large-uncased'):
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused0]')
                else:
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused1]')
                # else:
                #     raise ValueError("No clear choice for insert NOI for tokenizer type {}".format(args.model_name_or_path))
                tokenizer.ids_to_tokens[1] = '[NOI]'
                logger.info("Adding [NOI] to the vocabulary 1")
        except Exception:  # FIX: was a bare `except:` — don't trap KeyboardInterrupt/SystemExit
            continue
        break

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    if args.from_scratch:
        # NOTE(review): BertForMaskedLM normally requires a config argument —
        # confirm this zero-arg construction is supported by the local class.
        model = BertForMaskedLM()
    else:
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay on biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory,
            args=args)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            outputs = model(
                input_ids,
                segment_ids,
                input_mask,
                lm_label_ids,
            )
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # FIX: optimizer.step() must precede scheduler.step(); the
                # original order skipped the first learning-rate value.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.output_step == 0 and args.local_rank in [
                        -1, 0
                ]:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir,
                                              'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

        if args.local_rank in [-1, 0]:
            # Save model checkpoint at the end of every epoch.
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model, 'module'
            ) else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
            logger.info("PROGRESS: {}%".format(
                round(100 * (epoch + 1) / args.epochs, 4)))
            logger.info("EVALERR: {}%".format(tr_loss))

    # Save a trained model
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
def prepare_optimizer_and_scheduler(
    args: Namespace,
    model: nn.Module,
    num_batches: int,
) -> Tuple[AdamW, WarmupLinearSchedule]:
    """Configure BERT's AdamW optimizer and WarmupLinearSchedule LR scheduler.

    Splits trainable parameters into groups along two independent axes:
      * weight decay: `bias` / `LayerNorm.weight` parameters get no decay;
      * learning rate: classifier-head parameters (`classifier`, `crf`,
        `lstm`) use ``args.classifier_lr``, everything else (the BERT
        encoder) uses ``args.learning_rate``.

    Args:
        args: parsed CLI namespace; reads gradient_accumulation_steps,
            num_train_epochs, local_rank, learning_rate, classifier_lr,
            warmup_proportion.
        model: the model whose trainable parameters are optimized.
        num_batches: number of batches per epoch, used to derive t_total.

    Returns:
        (optimizer, scheduler) tuple.
    """
    t_total = (num_batches // args.gradient_accumulation_steps *
               args.num_train_epochs)
    if args.local_rank != -1:
        # Under distributed training each worker only sees 1/world_size
        # of the optimization steps.
        t_total = t_total // torch.distributed.get_world_size()
    logger.info("  Total optimization steps = %d", t_total)
    # Prepare optimizer: only parameters that require gradients.
    param_optimizer = list(
        filter(lambda p: p[1].requires_grad, model.named_parameters()))
    no_decay = ['bias', 'LayerNorm.weight']
    higher_lr = ['classifier', 'crf', 'lstm']

    def is_classifier_param(param_name: str) -> bool:
        # True for parameters of the task head (gets the higher LR).
        return any(hl in param_name for hl in higher_lr)

    def ignore_in_weight_decay(param_name: str) -> bool:
        # True for parameters that should not be weight-decayed.
        return any(nd in param_name for nd in no_decay)

    # BUGFIX: the original enumerated only three of the four
    # (no_decay x classifier) combinations, so parameters that were both
    # classifier params AND decay-exempt (e.g. `classifier.bias`) landed
    # in no group and were never updated by the optimizer. The fourth
    # group below covers them.
    optimizer_grouped_parameters = [
        {
            'params': [
                p for name, p in param_optimizer
                if not ignore_in_weight_decay(name)
                and not is_classifier_param(name)
            ],
            'weight_decay': 0.01
        },
        {
            'params': [
                p for name, p in param_optimizer
                if not ignore_in_weight_decay(name)
                and is_classifier_param(name)
            ],
            'weight_decay': 0.01,
            'lr': args.classifier_lr
        },
        {
            'params': [
                p for name, p in param_optimizer
                if ignore_in_weight_decay(name)
                and not is_classifier_param(name)
            ],
            'weight_decay': 0.0
        },
        {
            'params': [
                p for name, p in param_optimizer
                if ignore_in_weight_decay(name)
                and is_classifier_param(name)
            ],
            'weight_decay': 0.0,
            'lr': args.classifier_lr
        },
    ]
    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    num_warmup_steps = t_total * args.warmup_proportion
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=num_warmup_steps,
                                     t_total=t_total)
    return optimizer, scheduler
def train(train_task_name, model, tokenizer):
    """Fine-tune `model` on the pre-loaded training examples.

    Reads many module-level globals (train_examples, use_multiprocessing,
    per_gpu_train_batch_size, n_gpus, max_steps, gradient_accumulation_steps,
    num_train_epochs, warmup_proportion, learning_rate, device, cache_dir)
    — assumes they are defined at import time; TODO confirm.
    Saves a checkpoint after every epoch and a loss curve at the end.
    """
    set_seed(42)  # for reproductibility
    # prepare training dataset
    train_features = convert_to_input_features_helper(train_examples,
                                                      tokenizer,
                                                      use_multiprocessing)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)  # !!!! no minus 1
    train_dataset = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
    # total batch size = per-GPU size scaled by the number of GPUs
    train_batch_size = per_gpu_train_batch_size * max(1, n_gpus)
    train_sampler = SequentialSampler(train_dataset)  # was random sampler
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)
    if max_steps > 0:
        t_total = max_steps
        # NOTE(review): `num_trian_epochs` (typo) is assigned but never read;
        # the loop below still iterates the global `num_train_epochs`.
        num_trian_epochs = max_steps // len(
            train_dataloader) // gradient_accumulation_steps + 1
    else:
        t_total = len(
            train_dataloader) // gradient_accumulation_steps * num_train_epochs
    # prepare optimizer and schedule (linear warmup and decay);
    # bias and LayerNorm weights are exempt from weight decay
    warmup_steps = int(t_total * warmup_proportion)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=t_total)
    if n_gpus > 1:
        print('*********** using multi gpu! ************')
        model = torch.nn.DataParallel(model)
    logger.info("***** Running %s *****", 'training')
    logger.info("  Num examples = %d", len(train_dataloader))
    logger.info("  Batch size per gpu = %d", per_gpu_train_batch_size)
    logger.info("  Total batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", t_total)
    # visualization
    # train
    max_grad_norm = 1
    epoch = 0  # for visualization loss-epoch
    global_step = 0
    tr_loss, loging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(num_train_epochs), desc='Epoch')
    saved_loss = []  # per-batch loss history for the loss-step graph
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)  # unpack dict
            loss, logits = outputs[:2]  # model outputs are in tuple
            # record the UNscaled loss before gradient-accumulation scaling
            saved_loss.append(loss.detach().cpu().numpy().item())
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            # print("\r%f" % loss, end='')  # delete this
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # call optimizer before scheduler
                model.zero_grad()
                global_step += 1
            if max_steps > 0 and global_step > max_steps:
                epoch_iterator.close()
                break
        epoch += 1
        # save model at each epoch
        output_model_dir = os.path.join(cache_dir, 'epoch_{}'.format(epoch))
        if not os.path.exists(output_model_dir):
            os.makedirs(output_model_dir)
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # take care of distributed/parallel training
        model_to_save.save_pretrained(output_model_dir)
        tokenizer.save_pretrained(output_model_dir)
        # NOTE(review): `stats` and `output_dir` are not defined in this
        # function — this raises NameError unless both exist as globals;
        # `output_model_dir` was probably intended. TODO confirm.
        torch.save(stats, os.path.join(output_dir, 'training_args.bin'))
        logger.info('Saving model at epoch %d to %s' %
                    (epoch, output_model_dir))
        # evaluation using saved model
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break
    # draw and save loss-step graph
    save_loss(saved_loss, global_step)
def _train_one_epoch(model, loader, device, context):
    """Run one training epoch on a single TPU core.

    Called by torch_xla_py.data_parallel; executed on each core of the
    TPU once per epoch. The optimizer and scheduler are created once and
    cached on `context` so later epochs reuse them. Returns the mean
    per-step loss of the epoch as a Python float.
    """
    # model parameters: bias/LayerNorm entries are exempt from weight decay
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # one optimizer and scheduler per TPU core. Both objects are saved in
    # `context` to be reused the next epoch
    optimizer = context.getattr_or(
        'optimizer',
        AdamW(optimizer_grouped_parameters,
              lr=args.learning_rate,
              eps=args.adam_epsilon,
              betas=tuple(args.betas)))
    scheduler = context.getattr_or(
        'scheduler',
        WarmupLinearSchedule(optimizer,
                             warmup_steps=warmup_updates,
                             t_total=total_num_updates))
    # restart  # TODO: scheduler reset to 0 each epoch
    # Fast-forward the cached scheduler to where the previous run stopped.
    scheduler.step(args.scheduler_last_epoch)
    logging.info(f'Restarting scheduler LR to: {scheduler.get_last_lr()}')
    tr_loss = None  # running sum of unscaled per-step losses
    tracker = tpu_xm.RateTracker()
    model.train()
    for step, batch in loader:
        input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
        outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
        loss = outputs[0]
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tracker.add(args.per_tpu_train_batch_size)
        # accumulate the UNscaled loss (multiply the accumulation factor
        # back out); initialized on the first step
        tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps
        if (step + 1) % args.gradient_accumulation_steps == 0:
            # xla's optimizer_step also handles the cross-core gradient sync
            tpu_xm.optimizer_step(optimizer)
            scheduler.step()
            optimizer.zero_grad()
            # logging.info(f'  Adjusted scheduler LR to {scheduler.get_last_lr()}')
    # since checkpointing happens each epoch, we only need to save the
    # scheduler state at end of each epoch
    logging.info(f'Scheduler last_epoch {scheduler.last_epoch}')
    return tr_loss.item(
    ) / step  # `.item()` requires a trip from TPU to CPU, which is very slow. Use it only once per epoch=
def main(config):
    """Train and evaluate the LCF-ATEPC model (joint aspect-term extraction
    and aspect polarity classification) on the dataset named in `config`.

    Chooses the pretrained BERT and data directory from `config.dataset`,
    builds the optimizer and eval pipeline, then runs the nested `train()`
    closure and returns its best-metrics list
    [max_apc_test_acc, max_apc_test_f1, max_ate_test_f1].
    """
    args = config
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # effective per-step batch size after accumulation
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    processor = ATEPCProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1  # +1 for the padding/ignore label
    # dataset name -> data directory
    datasets = {
        'camera': "atepc_datasets/camera",
        'car': "atepc_datasets/car",
        'phone': "atepc_datasets/phone",
        'notebook': "atepc_datasets/notebook",
        'laptop': "atepc_datasets/laptop",
        'restaurant': "atepc_datasets/restaurant",
        'twitter': "atepc_datasets/twitter",
        'mixed': "atepc_datasets/mixed",
    }
    # dataset name -> matching pretrained BERT checkpoint
    pretrained_bert_models = {
        'camera': "bert-base-chinese",
        'car': "bert-base-chinese",
        'phone': "bert-base-chinese",
        'notebook': "bert-base-chinese",
        'laptop': "bert-base-uncased",
        'restaurant': "bert-base-uncased",
        'twitter': "bert-base-uncased",
        'mixed': "bert-base-multilingual-uncased",
    }
    args.bert_model = pretrained_bert_models[args.dataset]
    args.data_dir = datasets[args.dataset]

    def convert_polarity(examples):
        # Remap polarity label 2 -> 1 in place (binary scheme for the
        # Chinese datasets).
        for i in range(len(examples)):
            polarities = []
            for polarity in examples[i].polarity:
                if polarity == 2:
                    polarities.append(1)
                else:
                    polarities.append(polarity)
            examples[i].polarity = polarities

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=True)
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_test_examples(args.data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    bert_base_model = BertModel.from_pretrained(args.bert_model)
    bert_base_model.config.num_labels = num_labels
    # Chinese datasets use the remapped binary polarity scheme
    if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
        convert_polarity(train_examples)
        convert_polarity(eval_examples)
        model = LCF_ATEPC(bert_base_model, args=args)
    else:
        model = LCF_ATEPC(bert_base_model, args=args)
    for arg in vars(args):
        logger.info('>>> {0}: {1}'.format(arg, getattr(args, arg)))
    model.to(device)
    # NOTE(review): both groups use the same weight_decay, and AdamW is
    # also given a global weight_decay — the grouping is redundant here.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.00001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.00001
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      weight_decay=0.00001)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    all_spc_input_ids = torch.tensor([f.input_ids_spc for f in eval_features],
                                     dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    all_polarities = torch.tensor([f.polarities for f in eval_features],
                                  dtype=torch.long)
    all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                 dtype=torch.long)
    all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_spc_input_ids, all_input_mask,
                              all_segment_ids, all_label_ids, all_polarities,
                              all_valid_ids, all_lmask_ids)
    # Run prediction for full data
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    def evaluate(eval_ATE=True, eval_APC=True):
        """Evaluate on the test set; returns (apc_result dict, ate F1)."""
        # evaluate
        apc_result = {'max_apc_test_acc': 0, 'max_apc_test_f1': 0}
        ate_result = 0
        y_true = []
        y_pred = []
        n_test_correct, n_test_total = 0, 0
        test_apc_logits_all, test_polarities_all = None, None
        model.eval()
        # label ids start at 1; 0 is reserved (padding)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask in eval_dataloader:
            input_ids_spc = input_ids_spc.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            polarities = polarities.to(device)
            l_mask = l_mask.to(device)
            with torch.no_grad():
                ate_logits, apc_logits = model(input_ids_spc,
                                               segment_ids,
                                               input_mask,
                                               valid_ids=valid_ids,
                                               polarities=polarities,
                                               attention_mask_label=l_mask)
            if eval_APC:
                polarities = model.get_batch_polarities(polarities)
                n_test_correct += (torch.argmax(
                    apc_logits, -1) == polarities).sum().item()
                n_test_total += len(polarities)
                # accumulate logits/targets across batches for the F1 score
                if test_polarities_all is None:
                    test_polarities_all = polarities
                    test_apc_logits_all = apc_logits
                else:
                    test_polarities_all = torch.cat(
                        (test_polarities_all, polarities), dim=0)
                    test_apc_logits_all = torch.cat(
                        (test_apc_logits_all, apc_logits), dim=0)
            if eval_ATE:
                if not args.use_bert_spc:
                    label_ids = model.get_batch_token_labels_bert_base_indices(
                        label_ids)
                ate_logits = torch.argmax(F.log_softmax(ate_logits, dim=2),
                                          dim=2)
                ate_logits = ate_logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                # rebuild per-sentence tag sequences, stopping at the
                # sentinel label (len(label_list)); j == 0 skips [CLS]
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j] == len(label_list):
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map.get(label_ids[i][j], 'O'))
                            temp_2.append(label_map.get(ate_logits[i][j], 'O'))
        if eval_APC:
            test_acc = n_test_correct / n_test_total
            if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
                # binary polarity scheme for the Chinese datasets
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1],
                                   average='macro')
            else:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1, 2],
                                   average='macro')
            test_acc = round(test_acc * 100, 2)
            test_f1 = round(test_f1 * 100, 2)
            apc_result = {
                'max_apc_test_acc': test_acc,
                'max_apc_test_f1': test_f1
            }
        if eval_ATE:
            # pull the F1 column out of seqeval's text report
            report = classification_report(y_true, y_pred, digits=4)
            tmps = report.split()
            ate_result = round(float(tmps[7]) * 100, 2)
        return apc_result, ate_result

    def save_model(path):
        # Save a trained model and the associated configuration,
        # Take care of the storage!
        os.makedirs(path, exist_ok=True)
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        model_to_save.save_pretrained(path)
        tokenizer.save_pretrained(path)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": True,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config, open(os.path.join(path, "config.json"), "w"))
        logger.info('save model to: {}'.format(path))

    def train():
        """Run the training loop; returns best [apc_acc, apc_f1, ate_f1]."""
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_spc_input_ids = torch.tensor(
            [f.input_ids_spc for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        all_polarities = torch.tensor([f.polarities for f in train_features],
                                      dtype=torch.long)
        train_data = TensorDataset(all_spc_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_polarities, all_valid_ids,
                                   all_lmask_ids)
        train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        max_apc_test_acc = 0
        max_apc_test_f1 = 0
        max_ate_test_f1 = 0
        global_step = 0
        for epoch in range(int(args.num_train_epochs)):
            logger.info('#' * 80)
            # NOTE(review): format string has 2 placeholders but 3 args;
            # args.data_dir is silently dropped.
            logger.info('Train {} Epoch{}'.format(args.seed, epoch + 1,
                                                  args.data_dir))
            logger.info('#' * 80)
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask = batch
                # joint loss: aspect-term extraction + polarity classification
                loss_ate, loss_apc = model(input_ids_spc, segment_ids,
                                           input_mask, label_ids, polarities,
                                           valid_ids, l_mask)
                loss = loss_ate + loss_apc
                loss.backward()
                nb_tr_examples += input_ids_spc.size(0)
                nb_tr_steps += 1
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.eval_steps == 0:
                    if epoch >= args.num_train_epochs - 2 or args.num_train_epochs <= 2:
                        # evaluate only in last 2 epochs
                        apc_result, ate_result = evaluate(
                            eval_ATE=not args.use_bert_spc)
                        # apc_result, ate_result = evaluate()
                        path = '{0}/{1}_{2}_apcacc_{3}_apcf1_{4}_atef1_{5}'.format(
                            args.output_dir, args.dataset,
                            args.local_context_focus,
                            round(apc_result['max_apc_test_acc'], 2),
                            round(apc_result['max_apc_test_f1'], 2),
                            round(ate_result, 2))
                        if apc_result['max_apc_test_acc'] > max_apc_test_acc:
                            max_apc_test_acc = apc_result['max_apc_test_acc']
                        if apc_result['max_apc_test_f1'] > max_apc_test_f1:
                            max_apc_test_f1 = apc_result['max_apc_test_f1']
                        if ate_result > max_ate_test_f1:
                            max_ate_test_f1 = ate_result
                        # NOTE(review): the maxima were just updated above,
                        # so this strict-greater condition can never be true
                        # and save_model() is never called. TODO confirm
                        # intended behavior (check before updating maxima?).
                        if apc_result['max_apc_test_acc'] > max_apc_test_acc or \
                                apc_result['max_apc_test_f1'] > max_apc_test_f1 or \
                                ate_result > max_ate_test_f1:
                            save_model(path)
                        current_apc_test_acc = apc_result['max_apc_test_acc']
                        current_apc_test_f1 = apc_result['max_apc_test_f1']
                        current_ate_test_f1 = round(ate_result, 2)
                        logger.info('*' * 80)
                        logger.info('Train {} Epoch{}, Evaluate for {}'.format(
                            args.seed, epoch + 1, args.data_dir))
                        logger.info(
                            f'APC_test_acc: {current_apc_test_acc}(max: {max_apc_test_acc}) '
                            f'APC_test_f1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                        )
                        if args.use_bert_spc:
                            logger.info(
                                f'ATE_test_F1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                                f' (Unreliable since `use_bert_spc` is "True".)'
                            )
                        else:
                            logger.info(
                                f'ATE_test_f1: {current_ate_test_f1}(max:{max_ate_test_f1})'
                            )
                        logger.info('*' * 80)
        return [max_apc_test_acc, max_apc_test_f1, max_ate_test_f1]

    return train()
def main():
    """Pretrain BertForMaskedLM on pregenerated epoch shards.

    Parses CLI args, sets up (optionally distributed / fp16-amp) training,
    loops over the pregenerated epoch files, and saves the fine-tuned
    model and tokenizer to --output_dir at the end.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()
    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"
    # Count how many pregenerated epoch shards are actually on disk.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        # for-else: all requested epochs have data files
        num_data_epochs = args.epochs
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # effective per-step batch size after accumulation
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over
        # limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )
    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    # We don't need to manually call model.half() following Apex's recommend
    # if args.fp16:
    #     model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Prepare optimizer: bias/LayerNorm parameters are decay-exempt
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            # from apex.optimizers import FP16_Optimizer
            # from apex.optimizers import FusedAdam
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # This below line of code is the main upgrade of Apex Fp16
        # implementation. I chose opt_leve="01" because it's recommended
        # for typical use by Apex. We can make it configured
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        # We don't need to use FP16_Optimizer wrapping over FusedAdam as
        # well. Now Apex supports all Pytorch Optimizer
        # optimizer = FusedAdam(optimizer_grouped_parameters,
        #                       lr=args.learning_rate,
        #                       bias_correction=False,
        #                       max_grad_norm=1.0)
        # if args.loss_scale == 0:
        #     optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        # else:
        #     optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
        # else:
        # optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                outputs = model(input_ids,
                                attention_mask=input_mask,
                                masked_lm_labels=lm_label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # I depricate FP16_Optimizer's backward func and
                    # replace as Apex document
                    # optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1
    # Save a trained model (rank 0 only under distributed training)
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
def main():
    """Zero-shot emotion classification via an ensemble of entailment models.

    Loads three fine-tuned BERT sequence classifiers (RTE/MNLI/FEVER),
    runs each over the fixed test set, reports per-model accuracies, then
    evaluates an ensemble of their softmaxed logits. Evaluation only — the
    training-related args and optimizer construction are vestigial.
    """
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=256,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    # if args.server_ip and args.server_port:
    #     # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    #     import ptvsd
    #     print("Waiting for debugger attach")
    #     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    #     ptvsd.wait_for_attach()
    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mnli-mm": MnliMismatchedProcessor,
        # "mrpc": MrpcProcessor,
        # "sst-2": Sst2Processor,
        # "sts-b": StsbProcessor,
        # "qqp": QqpProcessor,
        # "qnli": QnliProcessor,
        "rte": RteProcessor
        # "wnli": WnliProcessor,
    }
    output_modes = {
        # "cola": "classification",
        # "mnli": "classification",
        # "mrpc": "classification",
        # "sst-2": "classification",
        # "sts-b": "regression",
        # "qqp": "classification",
        # "qnli": "classification",
        "rte": "classification"
        # "wnli": "classification",
    }
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()  #[0,1]
    num_labels = len(label_list)
    train_examples = None
    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(
            args.local_rank))
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_unseen_acc = 0.0
    max_dev_unseen_acc = 0.0
    max_dev_seen_acc = 0.0
    max_overall_acc = 0.0
    '''load test set'''
    seen_types = set()
    test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_emotion_test(
        '/export/home/Dataset/Stuttgart_Emotion/unify-emotion-datasets-master/zero-shot-split/test.txt',
        seen_types)
    # Tokenize once with the RTE model's tokenizer; assumes all three
    # checkpoints share a compatible vocabulary — TODO confirm.
    test_features = convert_examples_to_features(
        test_examples, label_list, args.max_seq_length,
        BertTokenizer.from_pretrained(
            '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnRTE',
            do_lower_case=args.do_lower_case), output_mode)
    test_all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                      dtype=torch.long)
    test_all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                       dtype=torch.long)
    test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                        dtype=torch.long)
    test_all_label_ids = torch.tensor([f.label_id for f in test_features],
                                      dtype=torch.long)
    test_data = TensorDataset(test_all_input_ids, test_all_input_mask,
                              test_all_segment_ids, test_all_label_ids)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=args.eval_batch_size)
    '''
    start evaluate on test set after this epoch
    '''
    modelpaths = [
        '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnRTE',
        '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnMNLI',
        '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnFEVER'
    ]
    pred_probs_ensemble = 0.0
    for i, modelpath in enumerate(modelpaths):
        # pretrain_model_dir = '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnRTE' #FineTuneOnCombined'# FineTuneOnMNLI
        model = BertForSequenceClassification.from_pretrained(
            modelpath, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            modelpath, do_lower_case=args.do_lower_case)
        if args.fp16:
            model.half()
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # Prepare optimizer (NOTE(review): never stepped — eval-only code)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        model.eval()
        logger.info("***** Running testing *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        test_loss = 0
        nb_test_steps = 0
        preds = []
        print('Testing...')
        for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model(input_ids,
                               segment_ids,
                               input_mask,
                               labels=None)
            logits = logits[0]
            # accumulate logits for the whole test set in preds[0]
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)
        # print('preds:', preds)
        preds = preds[0]
        # column 0 = entailment probability
        pred_probs_i = softmax(preds, axis=1)[:, 0]
        pred_binary_labels_harsh = []
        pred_binary_labels_loose = []
        # NOTE(review): this loop reuses `i` and clobbers the outer
        # enumerate index, so the `if i == 0` ensemble branch below almost
        # never fires; the code only works because 0.0 + array broadcasts.
        # TODO confirm / rename the inner index.
        for i in range(preds.shape[0]):
            # harsh: require a 0.1 margin for the entailment class
            if preds[i][0] > preds[i][1] + 0.1:
                pred_binary_labels_harsh.append(0)
            else:
                pred_binary_labels_harsh.append(1)
            if preds[i][0] > preds[i][1]:
                pred_binary_labels_loose.append(0)
            else:
                pred_binary_labels_loose.append(1)
        seen_acc, unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred(
            pred_probs_i, pred_binary_labels_harsh, pred_binary_labels_loose,
            test_label_list, test_hypo_seen_str_indicator,
            test_hypo_2_type_index, seen_types)
        print('seen:', seen_acc, 'unseen:', unseen_acc)
        print('\n\n this model preds over\n\n\n')
        # accumulate this model's softmaxed logits into the ensemble
        if i == 0:
            pred_probs_ensemble = softmax(preds, axis=1)
        else:
            pred_probs_ensemble += softmax(preds, axis=1)
    # renormalize the summed probabilities, then score the ensemble
    pred_probs_ensemble = softmax(pred_probs_ensemble, axis=1)
    pred_probs = pred_probs_ensemble[:, 0]
    pred_binary_labels_harsh = []
    pred_binary_labels_loose = []
    for i in range(preds.shape[0]):
        if pred_probs_ensemble[i][0] > pred_probs_ensemble[i][1] + 0.1:
            pred_binary_labels_harsh.append(0)
        else:
            pred_binary_labels_harsh.append(1)
        if pred_probs_ensemble[i][0] > pred_probs_ensemble[i][1]:
            pred_binary_labels_loose.append(0)
        else:
            pred_binary_labels_loose.append(1)
    seen_acc, unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred(
        pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose,
        test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index,
        seen_types)
    if unseen_acc > max_test_unseen_acc:
        max_test_unseen_acc = unseen_acc
    print('\n\n\t test seen_f1 & unseen_f1:', seen_acc, unseen_acc,
          ' max_test_unseen_f1:', max_test_unseen_acc, '\n')
def main():
    """Command-line entry point: run BERT pre-training (masked-LM + next-sentence
    objectives, via ``BertForPreTraining``) over a raw text corpus.

    Saves a checkpoint per epoch under ``output_dir/epoch_<n>`` and keeps the
    best-loss model under ``output_dir/best``; per-1000-step average losses are
    dumped as JSON under ``output_dir/losses``.  Supports single-GPU, DataParallel,
    apex DDP (``--local_rank``) and apex fp16 training.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--cuda_device',
                        default='0',
                        type=str,
                        help="Which GPU card to target")
    args = parser.parse_args()

    # Restrict visible GPUs before any CUDA initialization happens.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Per-forward-pass batch size; the effective batch size stays at the
    # user-requested value via gradient accumulation.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed all RNG sources for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    # Only rank 0 (or the single process) creates the output directory.
    if not os.path.exists(args.output_dir) and (
            args.local_rank == -1 or torch.distributed.get_rank() == 0):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        # Standard BERT recipe: no weight decay on biases and LayerNorm weights.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            # Wrap in FP16_Optimizer for loss scaling (dynamic when
            # --loss_scale is 0, otherwise static).
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
        else:
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_dataset))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # NOTE(review): "best" here is the lowest *running* (last <=1000 steps)
        # loss, not a global average — confirm this is intended.
        best_avg_loss = 10000000
        best_dir = os.path.join(args.output_dir, 'best')
        loss_dir = os.path.join(args.output_dir, 'losses')
        os.mkdir(loss_dir)
        os.mkdir(best_dir)

        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            losses = []
            tr_loss = 0
            running_loss = 0
            curr_step = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                outputs = model(input_ids, segment_ids, input_mask,
                                lm_label_ids, is_next)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # FP16_Optimizer scales the loss before backward.
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                running_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Only update weights every `gradient_accumulation_steps` batches.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1
                curr_step += 1
                # Every 1000 batches: log/record average loss and checkpoint.
                if curr_step % 1000 == 0:
                    logger.info('Average loss after %d steps is: %.5f' %
                                (curr_step, running_loss / 1000))
                    losses.append(running_loss / 1000)
                    # Save a trained model
                    if args.do_train and (args.local_rank == -1 or
                                          torch.distributed.get_rank() == 0):
                        logger.info(
                            "** ** * Saving fine - tuned model ** ** * ")
                        model_to_save = model.module if hasattr(
                            model, 'module'
                        ) else model  # Take care of distributed/parallel training
                        if running_loss < best_avg_loss:
                            best_avg_loss = running_loss
                            model_to_save.save_pretrained(best_dir)
                            tokenizer.save_pretrained(best_dir)
                        curr_dir = os.path.join(args.output_dir,
                                                'epoch_%d' % epoch)
                        if not os.path.exists(curr_dir):
                            os.mkdir(curr_dir)
                        model_to_save.save_pretrained(curr_dir)
                        tokenizer.save_pretrained(curr_dir)
                    running_loss = 0.0
            # End-of-epoch checkpoint (same logic as the periodic save above).
            if args.do_train and (args.local_rank == -1
                                  or torch.distributed.get_rank() == 0):
                logger.info("** ** * Saving fine - tuned model ** ** * ")
                model_to_save = model.module if hasattr(
                    model, 'module'
                ) else model  # Take care of distributed/parallel training
                if running_loss < best_avg_loss:
                    best_avg_loss = running_loss
                    model_to_save.save_pretrained(best_dir)
                    tokenizer.save_pretrained(best_dir)
                curr_dir = os.path.join(args.output_dir, 'epoch_%d' % epoch)
                if not os.path.exists(curr_dir):
                    os.mkdir(curr_dir)
                model_to_save.save_pretrained(curr_dir)
                tokenizer.save_pretrained(curr_dir)
            # Persist the per-1000-step loss curve for this epoch as JSON.
            with open(os.path.join(loss_dir, 'epoch_%d' % epoch),
                      'w+',
                      encoding='utf-8') as fp:
                json.dump(losses, fp)
def _prepare_optimizer(self, learning_rate, loss_scale, warmup_proportion,
                       num_train_optimization_steps):
    """Initialize the optimizer (and, for fp16, the warmup schedule).

    Arguments:
        learning_rate {float} -- The initial learning rate for Adam
        loss_scale {float} -- Loss scaling to improve fp16 numeric stability.
            Only used when fp16 set to True.
            0 (default value): dynamic loss scaling.
            Positive power of 2: static loss scaling value.
        warmup_proportion {float} -- Proportion of training to perform linear
            learning rate warmup for. E.g., 0.1 = 10%% of training
        num_train_optimization_steps {int} -- Number of optimization steps

    Returns:
        tuple -- (optimizer, warmup_linear); ``warmup_linear`` is None in the
        non-fp16 path.
    """
    param_optimizer = list(self.model.named_parameters())
    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    # FIX: the previous code copied the list without filtering, so the pooler
    # parameters were silently kept; apply the intended filter.
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # Standard BERT recipe: no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if self.fp16:
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        # Wrap for loss scaling: dynamic when loss_scale == 0, else static.
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=loss_scale)
        # NOTE(review): this schedule is not attached to the optimizer here;
        # presumably the fp16 training loop applies it manually — confirm.
        warmup_linear = WarmupLinearSchedule(
            warmup_steps=warmup_proportion,
            t_total=num_train_optimization_steps)
    else:
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=learning_rate,
        )
        warmup_linear = None
    return optimizer, warmup_linear
def main():
    """Command-line entry point: fine-tune a BERT entailment classifier on
    Wikipedia-category examples and evaluate zero-shot situation typing.

    Every 200 training iterations the model is evaluated on the LORELEI
    zero-shot dev and test splits via two-phase prediction
    (``evaluate_situation_zeroshot_TwpPhasePred``); the best unseen-class
    scores are tracked and printed.  Note: no checkpoint is written — results
    are reported via stdout/logging only.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=256,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    # if args.server_ip and args.server_port:
    #     # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    #     import ptvsd
    #     print("Waiting for debugger attach")
    #     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    #     ptvsd.wait_for_attach()

    # Only the RTE processor is active; the other GLUE tasks are kept for reference.
    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mnli-mm": MnliMismatchedProcessor,
        # "mrpc": MrpcProcessor,
        # "sst-2": Sst2Processor,
        # "sts-b": StsbProcessor,
        # "qqp": QqpProcessor,
        # "qnli": QnliProcessor,
        "rte": RteProcessor
        # "wnli": WnliProcessor,
    }

    output_modes = {
        # "cola": "classification",
        # "mnli": "classification",
        # "mrpc": "classification",
        # "sst-2": "classification",
        # "sts-b": "regression",
        # "qqp": "classification",
        # "qnli": "classification",
        "rte": "classification"
        # "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # Per-forward-pass batch size; effective batch size preserved via accumulation.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed all RNG sources for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels() #[0,1]
    num_labels = len(label_list)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        # NOTE(review): hard-coded corpus path and 100000-example cap — should
        # arguably come from --data_dir; confirm before reuse.
        train_examples, seen_types = processor.get_examples_Wikipedia_train('/export/home/Dataset/wikipedia/parsed_output/tokenized_wiki/tokenized_wiki2categories.txt', 100000) #train_pu_half_v1.txt
        # seen_classes=[0,2,4,6,8]
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(args.local_rank))

    # model = BertForSequenceClassification.from_pretrained(args.bert_model,
    #     cache_dir=cache_dir,
    #     num_labels=num_labels)
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    pretrain_model_dir = 'bert-base-uncased' #FineTuneOnCombined'# FineTuneOnMNLI
    model = BertForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)

    if args.fp16:
        model.half()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # Standard BERT recipe: no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        # Wrap for loss scaling: dynamic when --loss_scale is 0, else static.
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_unseen_acc = 0.0
    max_dev_unseen_acc = 0.0
    max_dev_seen_acc = 0.0
    max_overall_acc = 0.0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)

        '''load dev set'''
        eval_examples, eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index = processor.get_examples_situation_test('/export/home/Dataset/LORELEI/zero-shot-split/dev.txt', seen_types)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)

        eval_all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        eval_all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        eval_all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        eval_all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask, eval_all_segment_ids, eval_all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        '''load test set'''
        test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_situation_test('/export/home/Dataset/LORELEI/zero-shot-split/test.txt', seen_types)
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer, output_mode)

        test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        # print('train all_label_ids:', all_label_ids)
        # exit(0)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        iter_co = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                logits = model(input_ids, segment_ids, input_mask, labels=None)
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits[0].view(-1, num_labels), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                # NOTE(review): optimizer steps every batch here, so
                # gradient_accumulation_steps only rescales the loss — confirm.
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1
                # Evaluate on dev and test every 200 iterations.
                if iter_co % 200 == 0:
                    '''
                    start evaluate on dev set after this epoch
                    '''
                    model.eval()
                    logger.info("***** Running evaluation *****")
                    logger.info(" Num examples = %d", len(eval_examples))
                    logger.info(" Batch size = %d", args.eval_batch_size)

                    eval_loss = 0
                    nb_eval_steps = 0
                    preds = []
                    print('Evaluating...')
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            logits = model(input_ids, segment_ids, input_mask, labels=None)
                        logits = logits[0]

                        loss_fct = CrossEntropyLoss()
                        tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_steps += 1
                        # Accumulate logits across batches into one array.
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

                    eval_loss = eval_loss / nb_eval_steps
                    preds = preds[0]

                    '''
                    preds: size*2 (entail, not_entail)
                    wenpeng added a softxmax so that each row is a prob vec
                    '''
                    pred_probs = softmax(preds,axis=1)[:,0]
                    pred_binary_labels_harsh = []
                    pred_binary_labels_loose = []
                    # "harsh" requires the entail logit to beat not-entail by a
                    # 0.1 margin; "loose" is a plain argmax.
                    for i in range(preds.shape[0]):
                        if preds[i][0]>preds[i][1]+0.1:
                            pred_binary_labels_harsh.append(0)
                        else:
                            pred_binary_labels_harsh.append(1)
                        if preds[i][0]>preds[i][1]:
                            pred_binary_labels_loose.append(0)
                        else:
                            pred_binary_labels_loose.append(1)

                    seen_acc, unseen_acc = evaluate_situation_zeroshot_TwpPhasePred(pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose, eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index, seen_types)
                    # result = compute_metrics('F1', preds, all_label_ids.numpy())
                    loss = tr_loss/nb_tr_steps if args.do_train else None
                    # test_acc = mean_f1#result.get("f1")

                    if unseen_acc > max_dev_unseen_acc:
                        max_dev_unseen_acc = unseen_acc
                    print('\ndev seen_f1 & unseen_f1:', seen_acc,unseen_acc, ' max_dev_unseen_f1:', max_dev_unseen_acc, '\n')

                    # if seen_acc+unseen_acc > max_overall_acc:
                    #     max_overall_acc = seen_acc + unseen_acc
                    # if seen_acc > max_dev_seen_acc:
                    #     max_dev_seen_acc = seen_acc

                    '''
                    start evaluate on test set after this epoch
                    '''
                    model.eval()
                    logger.info("***** Running testing *****")
                    logger.info(" Num examples = %d", len(test_examples))
                    logger.info(" Batch size = %d", args.eval_batch_size)

                    test_loss = 0
                    nb_test_steps = 0
                    preds = []
                    print('Testing...')
                    for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            logits = model(input_ids, segment_ids, input_mask, labels=None)
                        logits = logits[0]
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

                    # eval_loss = eval_loss / nb_eval_steps
                    preds = preds[0]
                    pred_probs = softmax(preds,axis=1)[:,0]
                    pred_binary_labels_harsh = []
                    pred_binary_labels_loose = []
                    for i in range(preds.shape[0]):
                        if preds[i][0]>preds[i][1]+0.1:
                            pred_binary_labels_harsh.append(0)
                        else:
                            pred_binary_labels_harsh.append(1)
                        if preds[i][0]>preds[i][1]:
                            pred_binary_labels_loose.append(0)
                        else:
                            pred_binary_labels_loose.append(1)

                    seen_acc, unseen_acc = evaluate_situation_zeroshot_TwpPhasePred(pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index, seen_types)
                    # result = compute_metrics('F1', preds, all_label_ids.numpy())
                    # loss = tr_loss/nb_tr_steps if args.do_train else None
                    # test_acc = mean_f1#result.get("f1")

                    if unseen_acc > max_test_unseen_acc:
                        max_test_unseen_acc = unseen_acc
                    print('\n\n\t test seen_f1 & unseen_f1:', seen_acc,unseen_acc, ' max_test_unseen_f1:', max_test_unseen_acc, '\n')
def main():
    """Entry point for multimodal (ViLBERT-style) pretraining on Conceptual Captions.

    Pipeline: parse CLI args -> set up device/distributed backend -> build
    tokenizer and train/val loaders -> build model and per-parameter optimizer
    groups -> optimizer + linear-warmup scheduler -> (optionally) resume from a
    checkpoint -> epoch loop of training, validation, and per-epoch checkpointing.

    Relies on module-level imports (argparse, os, sys, json, random, np, torch,
    dist, logger, utils, tokenizers, loaders, AdamW, WarmupLinearSchedule) that
    are outside this block's view.
    """
    parser = argparse.ArgumentParser()

    # ------------------------------------------------------------------
    # Required parameters
    # ------------------------------------------------------------------
    parser.add_argument(
        "--file_path",
        default="data/conceptual_caption/",
        type=str,
        help="The input train corpus.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-base-uncased, roberta-base, roberta-large, ",
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, roberta-base",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        # required=True,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        type=str,
        default="config/bert_base_6layer_6conect.json",
        help="The config file which specified the model details.",
    )

    # ------------------------------------------------------------------
    # Other parameters
    # ------------------------------------------------------------------
    parser.add_argument(
        "--max_seq_length",
        default=36,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument(
        "--train_batch_size",
        default=512,
        type=int,
        help="Total batch size for training.",
    )
    parser.add_argument(
        "--learning_rate",
        default=1e-4,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    # NOTE(review): help text below looks copy-pasted from --num_train_epochs;
    # this is actually the epoch index to resume training from (also typed
    # float, later cast with int() in the epoch loop).
    parser.add_argument(
        "--start_epoch",
        default=0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument(
        "--img_weight", default=1, type=float, help="weight for image loss"
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--on_memory",
        action="store_true",
        help="Whether to load train samples into memory or use disk",
    )
    parser.add_argument(
        "--do_lower_case",
        type=bool,
        default=True,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="random seed for initialization"
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--dynamic_attention",
        action="store_true",
        help="whether use dynamic attention.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=25,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--save_name", default="", type=str, help="save name for training."
    )
    parser.add_argument(
        "--baseline",
        action="store_true",
        help="Wheter to use the baseline model (single bert).",
    )
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.",
    )
    parser.add_argument(
        "--distributed",
        action="store_true",
        help="whether use chunck for parallel training.",
    )
    parser.add_argument(
        "--without_coattention", action="store_true", help="whether pair loss."
    )
    parser.add_argument(
        "--visual_target",
        default=0,
        type=int,
        help="which target to use for visual branch. \
        0: soft label, \
        1: regress the feature, \
        2: NCE loss.",
    )
    parser.add_argument(
        "--objective",
        default=0,
        type=int,
        help="which objective to use \
        0: with ICA loss, \
        1: with ICA loss, for the not aligned pair, no masking objective, \
        2: without ICA loss, do not sample negative pair.",
    )
    parser.add_argument(
        "--num_negative", default=255, type=int, help="num of negative to use"
    )
    parser.add_argument(
        "--resume_file", default="", type=str, help="Resume from checkpoint"
    )
    parser.add_argument(
        "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer."
    )

    args = parser.parse_args()

    # Pick the model implementation: single-stream baseline BERT vs. two-stream
    # ViLBERT. Imported lazily so only the selected package is required.
    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BertForMultiModalPreTraining
    else:
        from vilbert.vilbert import BertForMultiModalPreTraining, BertConfig

    # Output directory name derived from the config file stem plus an optional
    # user-supplied suffix, e.g. save/bert_base_6layer_6conect-myrun.
    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    timeStamp = args.config_file.split("/")[1].split(".")[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)

    # List of pretrained BERT weight names; used below for freezing layers and
    # for applying a reduced LR to pretrained weights.
    # NOTE(review): file handle from open() is never closed here.
    bert_weight_name = json.load(
        open("config/" + args.from_pretrained + "_weight_name.json", "r"))

    # ------------------------------------------------------------------
    # Device / distributed setup
    # ------------------------------------------------------------------
    if args.local_rank == -1 or args.no_cuda:
        # Single-process (possibly multi-GPU via DataParallel) mode.
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        # Distributed mode: one process per GPU.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # `default_gpu` marks the single process (rank 0, or the only process) that
    # performs logging, directory creation, and checkpoint saving.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)

    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    # Effective per-step batch size shrinks with gradient accumulation and,
    # below, with the number of distributed replicas.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    cache = 5000
    if dist.is_available() and args.local_rank != -1:
        num_replicas = dist.get_world_size()
        args.train_batch_size = args.train_batch_size // num_replicas
        args.num_workers = args.num_workers // num_replicas
        cache = cache // num_replicas

    # Seed all RNG sources for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Tokenizer family is chosen by substring match on the model name.
    if "roberta" in args.bert_model:
        tokenizer = RobertaTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

    # ------------------------------------------------------------------
    # Data loaders
    # ------------------------------------------------------------------
    num_train_optimization_steps = None
    train_dataset = ConceptCapLoaderTrain(
        args.file_path,
        tokenizer,
        args.bert_model,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        visual_target=args.visual_target,
        num_workers=args.num_workers,
        local_rank=args.local_rank,
        objective=args.objective,
        cache=cache,
    )
    validation_dataset = ConceptCapLoaderVal(
        args.file_path,
        tokenizer,
        args.bert_model,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        visual_target=args.visual_target,
        num_workers=2,
        objective=args.objective,
    )

    # NOTE(review): args.num_train_epochs/start_epoch are floats, so this
    # product is a float, later passed as warmup_steps/t_total to the scheduler
    # — confirm the scheduler tolerates non-integer step counts.
    num_train_optimization_steps = int(
        train_dataset.num_dataset / args.train_batch_size /
        args.gradient_accumulation_steps) * (args.num_train_epochs -
                                             args.start_epoch)

    task_names = ["Conceptual_Caption"]
    task_ids = ["TASK0"]
    task_num_iters = {
        "TASK0": train_dataset.num_dataset / args.train_batch_size
    }

    logdir = os.path.join("logs", timeStamp)
    if default_gpu:
        # TensorBoard-style logger; only the rank-0 process writes logs.
        tbLogger = utils.tbLogger(
            logdir,
            savePath,
            task_names,
            task_ids,
            task_num_iters,
            args.gradient_accumulation_steps,
        )

    # Visual prediction head size: 1601 object classes for soft-label targets,
    # 2048-d features otherwise.
    if args.visual_target == 0:
        config.v_target_size = 1601
        config.visual_target = args.visual_target
    else:
        config.v_target_size = 2048
        config.visual_target = args.visual_target

    if "roberta" in args.bert_model:
        config.model = "roberta"

    # Frozen textual layers cannot extend past the first co-attention layer.
    if args.freeze > config.t_biattention_id[0]:
        config.fixed_t_layer = config.t_biattention_id[0]

    if args.without_coattention:
        config.with_coattention = False

    if args.dynamic_attention:
        config.dynamic_attention = True

    # ------------------------------------------------------------------
    # Model construction
    # ------------------------------------------------------------------
    if args.from_pretrained:
        model = BertForMultiModalPreTraining.from_pretrained(
            args.from_pretrained, config=config, default_gpu=default_gpu)
    else:
        model = BertForMultiModalPreTraining(config)

    # Parameter-name substrings exempt from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        # Freeze embedding weights plus encoder layers up to args.freeze.
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            # key[12:] strips the model-prefix so names line up with the
            # entries in the weight-name JSON — presumably a fixed-length
            # prefix like "bert.encoder" / "module." chain; TODO confirm.
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    # ------------------------------------------------------------------
    # Optimizer parameter groups
    # ------------------------------------------------------------------
    if not args.from_pretrained:
        # Training from scratch: two groups, decayed vs. non-decayed.
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
    else:
        # Fine-tuning: one group per parameter, with pretrained BERT weights
        # getting a 10x smaller learning rate.
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate
                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{
                        "params": [value],
                        "lr": lr,
                        "weight_decay": 0.0
                    }]
                if not any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{
                        "params": [value],
                        "lr": lr,
                        "weight_decay": 0.01
                    }]
        if default_gpu:
            print(len(list(model.named_parameters())),
                  len(optimizer_grouped_parameters))

    # set different parameters for vision branch and lanugage branch.
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            bias_correction=False,
            max_grad_norm=1.0,
        )
        # loss_scale == 0 selects dynamic loss scaling (see --loss_scale help).
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            eps=args.adam_epsilon,
            betas=(0.9, 0.98),
        )

    # Linear warmup for warmup_proportion of training, then linear decay.
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=args.warmup_proportion * num_train_optimization_steps,
        t_total=num_train_optimization_steps,
    )

    startIterID = 0
    global_step = 0

    # ------------------------------------------------------------------
    # Optional resume from checkpoint
    # ------------------------------------------------------------------
    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        # Strip a leading "module." (DataParallel/DDP wrapper) from state-dict
        # keys so they match the unwrapped model.
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace(
                    "module.", "", 1)] = checkpoint["model_state_dict"][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        del checkpoint

    model.cuda()

    # Optimizer state was loaded on CPU (map_location="cpu"); move any tensors
    # (e.g. Adam moments) to the GPU alongside the model.
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.fp16:
        model.half()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", train_dataset.num_dataset)
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

    # ------------------------------------------------------------------
    # Training / validation loop
    # ------------------------------------------------------------------
    for epochId in range(int(args.start_epoch), int(args.num_train_epochs)):
        model.train()
        for step, batch in enumerate(train_dataset):
            iterId = startIterID + step + (epochId * len(train_dataset))
            image_ids = batch[-1]
            # Last element is image ids (kept on CPU); everything else goes to GPU.
            batch = tuple(
                t.cuda(device=device, non_blocking=True) for t in batch[:-1])

            input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask = (
                batch)

            if args.objective == 1:
                # Objective 1: for misaligned image/text pairs (is_next != 0),
                # disable both masking objectives by setting labels to the
                # ignore index -1.
                image_label = image_label * (is_next == 0).long().unsqueeze(1)
                image_label[image_label == 0] = -1

                lm_label_ids = lm_label_ids * (is_next == 0).long().unsqueeze(1)
                lm_label_ids[lm_label_ids == 0] = -1

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            if args.objective == 2:
                # Objective 2: drop the image-caption alignment (ICA) loss.
                next_sentence_loss = next_sentence_loss * 0

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                # DataParallel returns per-GPU losses; average them.
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                # FP16_Optimizer scales the loss before backward.
                optimizer.backward(loss)
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses.
                    # NOTE(review): `warmup_linear` is not defined in this
                    # block — presumably imported/defined elsewhere in the
                    # file; verify it exists before running with --fp16.
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion,
                    )
                    for param_group in optimizer.param_groups:
                        param_group["lr"] = lr_this_step

                # NOTE(review): scheduler.step() is called before
                # optimizer.step(); PyTorch >= 1.1 expects the opposite order
                # and will warn — confirm intended for the pinned version.
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                if default_gpu:
                    tbLogger.step_train_CC(
                        epochId,
                        iterId,
                        float(masked_loss_t),
                        float(masked_loss_v),
                        float(next_sentence_loss),
                        optimizer.param_groups[0]["lr"],
                        "TASK0",
                        "train",
                    )

            if (step % (20 * args.gradient_accumulation_steps) == 0
                    and step != 0 and default_gpu):
                tbLogger.showLossTrainCC()

        # Do the evaluation
        torch.set_grad_enabled(False)
        numBatches = len(validation_dataset)
        model.eval()
        for step, batch in enumerate(validation_dataset):
            image_ids = batch[-1]
            batch = tuple(
                t.cuda(device=device, non_blocking=True) for t in batch[:-1])

            input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask = (
                batch)

            batch_size = input_ids.size(0)
            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            if default_gpu:
                tbLogger.step_val_CC(
                    epochId,
                    float(masked_loss_t),
                    float(masked_loss_v),
                    float(next_sentence_loss),
                    "TASK0",
                    batch_size,
                    "val",
                )
                sys.stdout.write("%d / %d \r" % (step, numBatches))
                sys.stdout.flush()

        if default_gpu:
            # NOTE(review): ave_score is computed but never used afterwards.
            ave_score = tbLogger.showLossValCC()

        torch.set_grad_enabled(True)

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model it-self

            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            output_checkpoint = os.path.join(
                savePath, "pytorch_ckpt_" + str(epochId) + ".tar")
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save(
                {
                    "model_state_dict": model_to_save.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    "global_step": global_step,
                },
                output_checkpoint,
            )
    if default_gpu:
        tbLogger.txt_close()