def backward_pytorch(d_output: Activations, sgd: Optimizer = None) -> None:
    # Closure over `self`, `y_var`, and `FINE_TUNE` from the enclosing scope.
    y_for_bwd = []
    dy_for_bwd = []
    if d_output.has_lh:
        dy_for_bwd.append(xp2torch(d_output.lh))
        y_for_bwd.append(y_var[0])
    if d_output.has_po:
        dy_for_bwd.append(xp2torch(d_output.po))
        y_for_bwd.append(y_var[1])
    if d_output.has_ah:
        raise ValueError("Gradients on all hidden states not supported yet.")
    if d_output.has_aa:
        raise ValueError("Gradients on all attentions not supported yet.")
    if FINE_TUNE:
        torch.autograd.backward(y_for_bwd, grad_tensors=dy_for_bwd)
        if sgd is not None:
            if self._optimizer is None:
                self._optimizer = self._create_optimizer(sgd)
            if getattr(self, "_lr_schedule", None) is None:
                self._lr_schedule = WarmupLinearSchedule(
                    self._optimizer, warmup_steps=50, t_total=500)
            if sgd.max_grad_norm:
                torch.nn.utils.clip_grad.clip_grad_norm_(
                    self._model.parameters(), sgd.max_grad_norm)
            optimizer = self._optimizer
            self._lr_schedule.step()
            optimizer.step()
            optimizer.zero_grad()
    return None
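# A self-contained illustration (not from the source) of the pattern above:
# torch.autograd.backward can seed the backward pass with one explicit upstream
# gradient per output tensor, which is exactly what backward_pytorch does with
# y_for_bwd / dy_for_bwd.
import torch

x = torch.randn(3, requires_grad=True)
y1 = x * 2   # first output head
y2 = x ** 2  # second output head
dy1 = torch.ones(3)          # upstream gradient for y1
dy2 = torch.full((3,), 0.5)  # upstream gradient for y2
torch.autograd.backward([y1, y2], grad_tensors=[dy1, dy2])
print(x.grad)  # 2*dy1 + 2*x*dy2, accumulated across both heads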
def main(self):
    self.trainLoader = MyDataLoader(self, mode='train').getdata()
    self.validLoader = MyDataLoader(self, mode='valid').getdata()
    self.testLoader = MyDataLoader(self, mode='test').getdata()
    self.word_dict = pkl.load(open(os.path.join(self.data_dir, 'word_dict.pkl'), 'rb'))
    self.vocab_size = len(self.word_dict)
    self.embedding_matrix = pkl.load(open(os.path.join(self.data_dir, 'emb.pkl'), 'rb'))
    self.model = myLSTM(self, device=self.device).to(self.device)
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0}
    ]
    # Note: `optimizer_grouped_parameters` is built but never used; the optimizer
    # is handed `self.model.parameters()` with a single global weight decay.
    self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate,
                           eps=float(self.adam_epsilon), weight_decay=1e-6)
    self.scheduler = WarmupLinearSchedule(self.optimizer,
                                          warmup_steps=self.warmup_steps,
                                          t_total=self.epoch_size * len(self.trainLoader))
    self.criterion = nn.CrossEntropyLoss()
    self.forward()
def tpu_training_loop(model, loader, device, context):
    """Called by torch_xla_py.data_parallel. This function is executed on
    each core of the TPU once per epoch."""
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # One optimizer and scheduler per TPU core. Both objects are saved in
    # `context` to be reused the next epoch.
    optimizer = context.getattr_or(
        'optimizer',
        AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
              eps=args.adam_epsilon, betas=tuple(args.betas)))
    # Derive warmup info
    if args.warmup_proportion is not None:
        warmup_steps = int(args.warmup_proportion * num_train_optimization_steps + 0.5)
    elif args.warmup_steps is not None:
        warmup_steps = args.warmup_steps
    else:
        raise ValueError('Specify either a warmup proportion or a warmup step count.')
    scheduler = context.getattr_or(
        'scheduler',
        WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                             t_total=num_train_optimization_steps))
    tr_loss = None
    pbar = None
    if str(pbar_device) == str(device):
        # All threads are in sync: use the progress bar on only one of them
        pbar = tqdm(total=int(pbar_steps), desc=f"device {device}",
                    dynamic_ncols=True)
    tracker = tpu_xm.RateTracker()
    model.train()
    for step, batch in enumerate(loader):
        input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
        outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
        loss = outputs[0]
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.sum().backward()  # sum() in case the loss is a multi-element tensor
        tracker.add(args.train_batch_size)
        tr_loss = (loss * args.gradient_accumulation_steps if step == 0
                   else tr_loss + loss * args.gradient_accumulation_steps)
        if pbar is not None:
            pbar.update(1)
        if (step + 1) % args.gradient_accumulation_steps == 0:
            tpu_xm.optimizer_step(optimizer)
            prev_lr = scheduler.get_last_lr()[0]
            scheduler.step()
            curr_lr = scheduler.get_last_lr()[0]
            if args.track_learning_rate and pbar is not None:
                pbar.set_description(f"Prev LR: {prev_lr} Curr LR: {curr_lr}")
            optimizer.zero_grad()
    # `.item()` requires a trip from TPU to CPU, which is very slow;
    # use it only once per epoch.
    return tr_loss.sum().item() / step
def setup_optim(named_params, learning_rate, adam_epsilon, warmup_steps,
                num_train_optim_steps):
    param_optimizer = list(named_params)  # e.g. model.named_parameters()
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optim_steps)
    return optimizer, scheduler
def get_scheduler_and_optimizer(self, parameters, train_tensor_data, logger):
    model = self.model
    num_train_optimization_steps = (
        int(len(train_tensor_data)
            / parameters["train_batch_size"]
            / parameters["gradient_accumulation_steps"])
        * parameters["num_train_epochs"])
    num_warmup_steps = int(num_train_optimization_steps * parameters["warmup_proportion"])

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=parameters["learning_rate"],
                      correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=num_warmup_steps,
                                     t_total=num_train_optimization_steps)
    logger.info(" Num optimization steps = %d", num_train_optimization_steps)
    logger.info(" Num warmup steps = %d", num_warmup_steps)
    return optimizer, scheduler
def get_scheduler(params, optimizer, len_train_data, logger):
    batch_size = params["train_batch_size"]
    grad_acc = params["gradient_accumulation_steps"]
    epochs = params["num_train_epochs"]
    num_train_steps = int(len_train_data / batch_size / grad_acc) * epochs
    num_warmup_steps = int(num_train_steps * params["warmup_proportion"])
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=num_warmup_steps,
                                     t_total=num_train_steps)
    logger.info(" Num optimization steps = %d", num_train_steps)
    logger.info(" Num warmup steps = %d", num_warmup_steps)
    return scheduler
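# A quick usage sketch (`optimizer` and `logger` assumed to exist) showing the
# step arithmetic get_scheduler relies on: with 10,000 examples, batch size 32,
# 2 accumulation steps and 3 epochs, int(10000 / 32 / 2) * 3 == 468 updates,
# and a 0.1 warmup proportion yields int(468 * 0.1) == 46 warmup steps.
params = {"train_batch_size": 32, "gradient_accumulation_steps": 2,
          "num_train_epochs": 3, "warmup_proportion": 0.1}
scheduler = get_scheduler(params, optimizer, len_train_data=10000, logger=logger)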
def __call__(self, model, device, args):
    log = self._logger
    # Prepare optimizer and schedule (linear warmup and decay)
    optimization_steps = (len(self.train_dataloader) * args.epochs) // args.gradient_accumulation_steps
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=optimization_steps)

    # Train
    log.info(f"Training started with parameters {args}")
    model.zero_grad()
    global_step = 1
    for epoch in trange(args.epochs, desc="Epoch"):
        for step, batch in enumerate(tqdm(self.train_dataloader)):
            model.train()
            batch = tuple(t.to(device) for t in batch)  # send data to the target device
            model_input = {'input_ids': batch[0],       # word ids
                           'attention_mask': batch[1],  # input mask
                           'token_type_ids': batch[2],  # segment ids
                           'labels': batch[3]}          # labels
            outputs = model(**model_input)
            train_loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps
            train_loss.backward()  # accumulate gradients before optimizing the model
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)  # gradient clipping
                optimizer.step()
                scheduler.step()
                model.zero_grad()
            # Run the model being trained against the validation data set
            if (step + 1) % args.eval_steps == 0 and not args.eval_per_epoch:
                self.evaluate_on_val_set(epoch, global_step, optimization_steps,
                                         model, device, scheduler, args)
            global_step += 1
        if args.eval_per_epoch:
            self.evaluate_on_val_set(epoch, global_step, optimization_steps,
                                     model, device, scheduler, args)
def setup_optim(named_params, learning_rate, adam_epsilon, warmup_steps,
                num_train_optim_steps):
    """Set up the optimizer and scheduler for the model's training loop.

    Parameters:
    - named_params: iterable of the model's named parameters
    - learning_rate: float, between 0 and 1; 3e-5 recommended
    - adam_epsilon: float, between 0 and 1; 1e-8 recommended
    - warmup_steps: int, number of training steps before the regular schedule is used
    - num_train_optim_steps: int, total number of batches over all epochs

    Returns:
    - optimizer: AdamW optimizer
    - scheduler: WarmupLinearSchedule
    """
    param_optimizer = list(named_params)  # e.g. model.named_parameters()
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optim_steps)
    return optimizer, scheduler
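# A minimal usage sketch for setup_optim; `model`, `train_loader` and
# `num_epochs` are assumed to exist and are not part of the source.
num_updates = len(train_loader) * num_epochs  # one optimizer step per batch
optimizer, scheduler = setup_optim(model.named_parameters(),
                                   learning_rate=3e-5,
                                   adam_epsilon=1e-8,
                                   warmup_steps=int(0.1 * num_updates),
                                   num_train_optim_steps=num_updates)
for epoch in range(num_epochs):
    for batch in train_loader:
        loss = model(**batch)[0]
        loss.backward()
        optimizer.step()
        scheduler.step()  # WarmupLinearSchedule advances once per update
        optimizer.zero_grad()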
def main():
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initialize the distributed backend that synchronizes nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(f"device: {device} n_gpu: {n_gpu}, "
                f"distributed training: {bool(args.local_rank != -1)}")

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Output dirs
    timestamp = args.config_file.split("/")[1].split(".")[0]
    save_path = os.path.join(args.output_dir, timestamp)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if default_gpu:
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # Save all the hidden parameters
        with open(os.path.join(save_path, "command.txt"), "w") as f:
            print(args, file=f)
            print("\n", file=f)
            print(config, file=f)

    cache = 5000
    args.train_batch_size = args.train_batch_size // args.grad_acc_steps
    if dist.is_available() and args.local_rank != -1:
        num_replicas = dist.get_world_size()
        args.train_batch_size = args.train_batch_size // num_replicas
        args.num_workers = args.num_workers // num_replicas
        cache = cache // num_replicas

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Datasets
    tokenizer = AutoTokenizer.from_pretrained(config.bert_model,
                                              do_lower_case=config.do_lower_case)
    train_dataset = ConceptCapLoaderTrain(args.annotations_path, args.features_path,
                                          tokenizer, seq_len=args.max_seq_length,
                                          batch_size=args.train_batch_size,
                                          num_workers=args.num_workers,
                                          local_rank=args.local_rank,
                                          objective=args.objective, cache=cache,
                                          add_global_imgfeat=config.add_global_imgfeat,
                                          num_locs=config.num_locs)
    valid_dataset = ConceptCapLoaderVal(args.annotations_path, args.features_path,
                                        tokenizer, seq_len=args.max_seq_length,
                                        batch_size=args.train_batch_size,
                                        num_workers=2, objective=args.objective,
                                        add_global_imgfeat=config.add_global_imgfeat,
                                        num_locs=config.num_locs)

    # Task details
    task_names = ["Conceptual_Caption"]
    task_ids = ["TASK0"]
    task2num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size}

    # Logging
    logdir = os.path.join(args.logdir, timestamp)
    if default_gpu:
        tb_logger = tbLogger(logdir, save_path, task_names, task_ids,
                             task2num_iters, args.grad_acc_steps)
    else:
        tb_logger = None

    # Model
    if args.from_pretrained:
        type_vocab_size = config.type_vocab_size
        config.type_vocab_size = 2
        model = BertForVLPreTraining.from_pretrained(args.from_pretrained,
                                                     config=config,
                                                     default_gpu=default_gpu,
                                                     from_hf=True)
        # Resize type embeddings
        model.bert.embeddings.token_type_embeddings = \
            model._get_resized_embeddings(model.bert.embeddings.token_type_embeddings,
                                          type_vocab_size)
        config.type_vocab_size = type_vocab_size
    else:
        model = BertForVLPreTraining(config)

    # Optimization details
    freeze_layers(model)
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    bert_weight_name = json.load(
        open("config/" + args.from_pretrained + "_weight_name.json", "r"))
    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay},
            {"params": [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
    else:
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                # Pretrained BERT weights get a 10x smaller learning rate
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate
                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr, "weight_decay": 0.0}]
                if not any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr,
                         "weight_decay": args.weight_decay}]
        if default_gpu:
            print(len(list(model.named_parameters())),
                  len(optimizer_grouped_parameters))
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon, betas=args.adam_betas)
    num_train_optimization_steps = int(
        train_dataset.num_dataset / args.train_batch_size / args.grad_acc_steps
    ) * args.num_train_epochs
    warmup_steps = args.warmup_steps or args.warmup_proportion * num_train_optimization_steps
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    # Resume training
    start_iter_id, global_step, start_epoch, tb_logger, _ = \
        resume(args.resume_file, model, optimizer, scheduler, tb_logger)

    # Move to GPU(s)
    model.cuda()
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Save the starting model
    save(save_path, logger, -1, model, optimizer, scheduler, global_step,
         tb_logger, default_gpu, -1)

    # Print summary
    if default_gpu:
        summary_parameters(model, logger)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", train_dataset.num_dataset)
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

    # Train
    for epoch_id in range(start_epoch, int(args.num_train_epochs)):
        model.train()
        for step, batch in enumerate(train_dataset):
            iter_id = start_iter_id + step + (epoch_id * len(train_dataset))
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_match, \
                image_feat, image_loc, image_cls, obj_labels, obj_confs, \
                attr_labels, attr_confs, image_attrs, image_label, image_mask = batch

            if args.objective == 1:
                # Ignore labels (setting them to -1) for mismatched caption-image pairs
                image_label = image_label * (is_match == 0).long().unsqueeze(1)
                image_label[image_label == 0] = -1
                lm_label_ids = lm_label_ids * (is_match == 0).long().unsqueeze(1)
                lm_label_ids[lm_label_ids == 0] = -1

            masked_loss_t, masked_loss_v, pair_match_loss = model(
                input_ids, image_feat, image_loc, segment_ids, input_mask,
                image_mask, lm_label_ids, image_label, image_cls, obj_labels,
                obj_confs, attr_labels, attr_confs, image_attrs, is_match)

            if args.objective == 2:
                pair_match_loss = pair_match_loss * 0

            loss = masked_loss_t + masked_loss_v + pair_match_loss
            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                pair_match_loss = pair_match_loss.mean()
            if args.grad_acc_steps > 1:
                loss = loss / args.grad_acc_steps
            loss.backward()

            if (step + 1) % args.grad_acc_steps == 0:
                # Clip gradients
                if args.clip_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.clip_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                if default_gpu:
                    tb_logger.step_train_CC(epoch_id, iter_id,
                                            float(masked_loss_t),
                                            float(masked_loss_v),
                                            float(pair_match_loss),
                                            optimizer.param_groups[0]["lr"],
                                            "TASK0", "train")
            if (step % (20 * args.grad_acc_steps) == 0) and step != 0 and default_gpu:
                tb_logger.showLossTrainCC()

        # Do the evaluation
        torch.set_grad_enabled(False)
        numBatches = len(valid_dataset)
        model.eval()
        for step, batch in enumerate(valid_dataset):
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_match, \
                image_feat, image_loc, image_cls, obj_labels, obj_confs, \
                attr_labels, attr_confs, image_attrs, image_label, image_mask = batch
            batch_size = input_ids.size(0)
            masked_loss_t, masked_loss_v, pair_match_loss = model(
                input_ids, image_feat, image_loc, segment_ids, input_mask,
                image_mask, lm_label_ids, image_label, image_cls, obj_labels,
                obj_confs, attr_labels, attr_confs, image_attrs, is_match)
            loss = masked_loss_t + masked_loss_v + pair_match_loss
            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                pair_match_loss = pair_match_loss.mean()
            if default_gpu:
                tb_logger.step_val_CC(epoch_id, iter_id, float(masked_loss_t),
                                      float(masked_loss_v), float(pair_match_loss),
                                      "TASK0", batch_size, "val")
                sys.stdout.write("%d / %d \r" % (step, numBatches))
                sys.stdout.flush()
        if default_gpu:
            tb_logger.showLossValCC()
        torch.set_grad_enabled(True)
        save(save_path, logger, epoch_id, model, optimizer, scheduler,
             global_step, tb_logger, default_gpu, loss)

    if default_gpu:
        tb_logger.txt_close()
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--file_path", default="data/conceptual_caption/", type=str,
                        help="The input train corpus.")
    parser.add_argument("--from_pretrained", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: "
                             "bert-base-uncased, roberta-base, roberta-large.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: "
                             "bert-base-uncased, bert-large-uncased, roberta-base.")
    parser.add_argument("--output_dir", default="save", type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--config_file", type=str,
                        default="config/bert_base_6layer_6conect.json",
                        help="The config file which specifies the model details.")

    # Other parameters
    parser.add_argument("--max_seq_length", default=36, type=int,
                        help="The maximum total input sequence length after WordPiece "
                             "tokenization. Sequences longer than this will be truncated, "
                             "and sequences shorter than this will be padded.")
    parser.add_argument("--train_batch_size", default=512, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=1e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--start_epoch", default=0, type=float,
                        help="Epoch to start training from.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate "
                             "warmup for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--img_weight", default=1, type=float,
                        help="Weight for the image loss.")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--on_memory", action="store_true",
                        help="Whether to load train samples into memory or use disk.")
    parser.add_argument("--do_lower_case", type=bool, default=True,
                        help="Whether to lower case the input text. True for uncased "
                             "models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on GPUs.")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for initialization.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing "
                             "a backward/update pass.")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit float precision instead of 32-bit.")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used "
                             "when fp16 is set to True. 0 (default value): dynamic loss "
                             "scaling. Positive power of 2: static loss scaling value.")
    parser.add_argument("--dynamic_attention", action="store_true",
                        help="Whether to use dynamic attention.")
    parser.add_argument("--num_workers", type=int, default=25,
                        help="Number of workers in the dataloader.")
    parser.add_argument("--save_name", default="", type=str,
                        help="Save name for training.")
    parser.add_argument("--baseline", action="store_true",
                        help="Whether to use the baseline model (single BERT).")
    parser.add_argument("--freeze", default=-1, type=int,
                        help="Up to which layer the textual stream of ViLBERT should be fixed.")
    parser.add_argument("--distributed", action="store_true",
                        help="Whether to use chunks for parallel training.")
    parser.add_argument("--without_coattention", action="store_true",
                        help="Whether to disable the pair loss (co-attention).")
    parser.add_argument("--visual_target", default=0, type=int,
                        help="Which target to use for the visual branch. "
                             "0: soft label, 1: regress the feature, 2: NCE loss.")
    parser.add_argument("--objective", default=0, type=int,
                        help="Which objective to use. 0: with ICA loss; "
                             "1: with ICA loss, no masking objective for the "
                             "non-aligned pairs; 2: without ICA loss, do not "
                             "sample negative pairs.")
    parser.add_argument("--num_negative", default=255, type=int,
                        help="Number of negatives to use.")
    parser.add_argument("--resume_file", default="", type=str,
                        help="Resume from checkpoint.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for the Adam optimizer.")
    args = parser.parse_args()

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BertForMultiModalPreTraining
    else:
        from vilbert.vilbert import BertForMultiModalPreTraining, BertConfig

    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    timeStamp = args.config_file.split("/")[1].split(".")[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(
        open("config/" + args.from_pretrained + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initialize the distributed backend that synchronizes nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info("device: {} n_gpu: {}, distributed training: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              bool(args.local_rank != -1),
                                              args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)

    if default_gpu:
        # Save all the hidden parameters
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)
            print("\n", file=f)
            print(config, file=f)

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    cache = 5000
    if dist.is_available() and args.local_rank != -1:
        num_replicas = dist.get_world_size()
        args.train_batch_size = args.train_batch_size // num_replicas
        args.num_workers = args.num_workers // num_replicas
        cache = cache // num_replicas

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if "roberta" in args.bert_model:
        tokenizer = RobertaTokenizer.from_pretrained(args.bert_model,
                                                     do_lower_case=args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                  do_lower_case=args.do_lower_case)

    train_dataset = ConceptCapLoaderTrain(
        args.file_path, tokenizer, args.bert_model,
        seq_len=args.max_seq_length, batch_size=args.train_batch_size,
        visual_target=args.visual_target, num_workers=args.num_workers,
        local_rank=args.local_rank, objective=args.objective, cache=cache)
    validation_dataset = ConceptCapLoaderVal(
        args.file_path, tokenizer, args.bert_model,
        seq_len=args.max_seq_length, batch_size=args.train_batch_size,
        visual_target=args.visual_target, num_workers=2,
        objective=args.objective)

    num_train_optimization_steps = int(
        train_dataset.num_dataset / args.train_batch_size /
        args.gradient_accumulation_steps
    ) * (args.num_train_epochs - args.start_epoch)

    task_names = ["Conceptual_Caption"]
    task_ids = ["TASK0"]
    task_num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size}

    logdir = os.path.join("logs", timeStamp)
    if default_gpu:
        tbLogger = utils.tbLogger(logdir, savePath, task_names, task_ids,
                                  task_num_iters, args.gradient_accumulation_steps)

    if args.visual_target == 0:
        config.v_target_size = 1601
    else:
        config.v_target_size = 2048
    config.visual_target = args.visual_target
    if "roberta" in args.bert_model:
        config.model = "roberta"
    if args.freeze > config.t_biattention_id[0]:
        config.fixed_t_layer = config.t_biattention_id[0]
    if args.without_coattention:
        config.with_coattention = False
    if args.dynamic_attention:
        config.dynamic_attention = True

    if args.from_pretrained:
        model = BertForMultiModalPreTraining.from_pretrained(
            args.from_pretrained, config=config, default_gpu=default_gpu)
    else:
        model = BertForMultiModalPreTraining(config)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False
        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01},
            {"params": [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
    else:
        # Set different parameters for the vision branch and the language branch
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate
                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr, "weight_decay": 0.0}]
                if not any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr, "weight_decay": 0.01}]
        if default_gpu:
            print(len(list(model.named_parameters())),
                  len(optimizer_grouped_parameters))

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon,
                          betas=(0.9, 0.98))
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=args.warmup_proportion * num_train_optimization_steps,
        t_total=num_train_optimization_steps)

    startIterID = 0
    global_step = 0
    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace("module.", "", 1)] = \
                    checkpoint["model_state_dict"][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        del checkpoint

    model.cuda()
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", train_dataset.num_dataset)
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

    for epochId in range(int(args.start_epoch), int(args.num_train_epochs)):
        model.train()
        for step, batch in enumerate(train_dataset):
            iterId = startIterID + step + (epochId * len(train_dataset))
            image_ids = batch[-1]
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_next, \
                image_feat, image_loc, image_target, image_label, image_mask = batch

            if args.objective == 1:
                # Ignore labels (set to -1) for mismatched caption-image pairs
                image_label = image_label * (is_next == 0).long().unsqueeze(1)
                image_label[image_label == 0] = -1
                lm_label_ids = lm_label_ids * (is_next == 0).long().unsqueeze(1)
                lm_label_ids[lm_label_ids == 0] = -1

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids, image_feat, image_loc, segment_ids, input_mask,
                image_mask, lm_label_ids, image_label, image_target, is_next)

            if args.objective == 2:
                next_sentence_loss = next_sentence_loss * 0
            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss
            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # Modify the learning rate with the special warmup BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group["lr"] = lr_this_step
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if default_gpu:
                    tbLogger.step_train_CC(epochId, iterId,
                                           float(masked_loss_t),
                                           float(masked_loss_v),
                                           float(next_sentence_loss),
                                           optimizer.param_groups[0]["lr"],
                                           "TASK0", "train")
            if (step % (20 * args.gradient_accumulation_steps) == 0
                    and step != 0 and default_gpu):
                tbLogger.showLossTrainCC()

        # Do the evaluation
        torch.set_grad_enabled(False)
        numBatches = len(validation_dataset)
        model.eval()
        for step, batch in enumerate(validation_dataset):
            image_ids = batch[-1]
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_next, \
                image_feat, image_loc, image_target, image_label, image_mask = batch
            batch_size = input_ids.size(0)
            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids, image_feat, image_loc, segment_ids, input_mask,
                image_mask, lm_label_ids, image_label, image_target, is_next)
            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss
            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()
            if default_gpu:
                tbLogger.step_val_CC(epochId, float(masked_loss_t),
                                     float(masked_loss_v),
                                     float(next_sentence_loss),
                                     "TASK0", batch_size, "val")
                sys.stdout.write("%d / %d \r" % (step, numBatches))
                sys.stdout.flush()
        if default_gpu:
            ave_score = tbLogger.showLossValCC()
        torch.set_grad_enabled(True)

        if default_gpu:
            # Save the fine-tuned model
            logger.info("** ** * Saving fine-tuned model ** ** * ")
            model_to_save = (model.module if hasattr(model, "module")
                             else model)  # only save the model itself
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            output_checkpoint = os.path.join(
                savePath, "pytorch_ckpt_" + str(epochId) + ".tar")
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save({"model_state_dict": model_to_save.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "scheduler_state_dict": scheduler.state_dict(),
                        "global_step": global_step},
                       output_checkpoint)
    if default_gpu:
        tbLogger.txt_close()
def _prepare_optimizer(self, learning_rate, loss_scale, warmup_proportion,
                       num_train_optimization_steps):
    """Initialize the optimizer.

    Arguments:
        learning_rate {float} -- The initial learning rate for Adam
        loss_scale {float} -- Loss scaling to improve fp16 numeric stability.
            Only used when fp16 is set to True. 0 (default value): dynamic
            loss scaling. Positive power of 2: static loss scaling value.
        warmup_proportion {float} -- Proportion of training to perform linear
            learning rate warmup for, e.g. 0.1 = 10% of training
        num_train_optimization_steps {int} -- Number of optimization steps

    Returns:
        Optimizer -- The optimizer to use while training
    """
    param_optimizer = list(self.model.named_parameters())
    # Hack to remove the pooler, which is not used
    # (it would otherwise produce a None grad that breaks apex)
    param_optimizer = [n for n in param_optimizer]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if self.fp16:
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
        # Note: a proportion (not a step count) is passed as warmup_steps here,
        # and no optimizer is attached; see the note below.
        warmup_linear = WarmupLinearSchedule(
            warmup_steps=warmup_proportion,
            t_total=num_train_optimization_steps)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
        warmup_linear = None
    return optimizer, warmup_linear
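# In pytorch-transformers, WarmupLinearSchedule takes the optimizer as its
# first argument and interprets warmup_steps as an absolute step count, so the
# fp16 branch above passes a proportion where steps are expected and attaches
# no optimizer. A hedged correction, assuming the docstring's intent:
warmup_steps = int(warmup_proportion * num_train_optimization_steps)
warmup_linear = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)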
def train(args, train_dataset, model, tokenizer):
    """Train the model."""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps
                   * args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex "
                              "to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank, find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # added here for reproducibility (even between Python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels': batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always a tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                scheduler.step()  # update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate on a single GPU, otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss',
                                         (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir,
                                              'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
model.load_state_dict(load_my_state_dict(model, pretrained))
# Freeze the early stages of the pretrained backbone
for name, param in model.named_parameters():
    if ('module.conv1' in name or 'module.bn1' in name or 'layer1' in name
            or 'layer2' in name or 'layer3' in name):
        param.requires_grad = False

# Create an instance of the optimizer (AdamW) and the learning rate scheduler
optimizer = optim.AdamW(model.parameters(), lr=args.init_lr)

# Select options per parser args
if args.scheduler == 'WarmupLinearSchedule':
    num_train_optimization_steps = len(train_loader) * args.num_train_epochs
    args.warmup_steps = args.warmup_proportion * num_train_optimization_steps
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)
if args.scheduler == 'ReduceLROnPlateau':
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, threshold=1e-6)

transform = nn.Sequential(
    kornia.augmentation.RandomAffine((-20., 20.), translate=(0.1, 0.1)))

# Start model training
best_loss = np.inf
total_step = len(train_loader)
early_stop = 0
running_loss = 0.0
for epoch in range(args.num_train_epochs):
    tr_loss = 0
    model.train()
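# The two schedulers above are stepped differently: WarmupLinearSchedule
# advances once per optimizer update, while ReduceLROnPlateau is stepped once
# per epoch with a validation metric. A sketch of both call sites (`validate`
# and `valid_loader` are hypothetical helpers, not from the source):
for epoch in range(args.num_train_epochs):
    for batch in train_loader:
        # ... forward / backward ...
        optimizer.step()
        if args.scheduler == 'WarmupLinearSchedule':
            scheduler.step()        # per-update linear warmup/decay
        optimizer.zero_grad()
    val_loss = validate(model, valid_loader)
    if args.scheduler == 'ReduceLROnPlateau':
        scheduler.step(val_loss)    # per-epoch, metric-driven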
def train(self, train_category, dev_category, train_news, dev_news, tokenizer,
          Net=None, model=None):
    if os.path.exists(self.arguments.output_config_file) is True:
        os.remove(self.arguments.output_config_file)
    logger.info('>>train.shape: {} | dev.shape: {}'.format(
        train_category.shape, dev_category.shape))

    train_dataloader, train_examples_len = Util.load_data(
        news=train_news, category=train_category, data_type='train',
        label_list=self.arguments.label_list,
        max_length=self.arguments.max_seq_length,
        tokenizer=tokenizer, batch_size=self.arguments.BATCH)
    dev_dataloader, dev_examples_len = Util.load_data(
        news=dev_news, category=dev_category, data_type='dev',
        label_list=self.arguments.label_list,
        max_length=self.arguments.max_seq_length,
        tokenizer=tokenizer, batch_size=self.arguments.BATCH)
    num_train_optimization_steps = int(
        train_examples_len / self.arguments.BATCH /
        self.arguments.gradient_accumulation_steps) * self.arguments.EPOCHS

    # Prepare the model
    logger.info("model name is {}".format(self.arguments.model_name))
    if model is None:
        if self.arguments.model_name in ("BertOrigin", "BertHAN", "BertATT"):
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir)
        elif self.arguments.model_name == "BertCNN":
            filter_sizes = [int(val) for val in self.arguments.filter_sizes.split()]
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                n_filters=self.arguments.filter_num,
                filter_sizes=filter_sizes,
                cache_dir=self.arguments.cache_dir)
        elif self.arguments.model_name == "BertRCNN":
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir,
                rnn_hidden_size=self.arguments.rnn_hidden_size,
                num_layers=self.arguments.num_layers,
                bidirectional=self.arguments.bidirectional,
                dropout=self.arguments.dropout)
        elif self.arguments.model_name == "BertCNNPlus":
            filter_sizes = [int(val) for val in self.arguments.filter_sizes.split()]
            model = Net.from_pretrained(
                pretrained_model_name_or_path=self.arguments.bert_model_dir,
                num_labels=self.arguments.num_labels,
                cache_dir=self.arguments.cache_dir,
                n_filters=self.arguments.filter_num,
                filter_sizes=filter_sizes)
    model.to(DEVICE)

    # Prepare the optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # To reproduce BertAdam-specific behavior, set correct_bias=False
    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=self.arguments.learning_rate,
                      correct_bias=False)
    # PyTorch scheduler. Note: warmup_proportion is passed where
    # WarmupLinearSchedule expects an absolute step count.
    scheduler = WarmupLinearSchedule(
        optimizer=optimizer,
        warmup_steps=self.arguments.warmup_proportion,
        t_total=num_train_optimization_steps)

    # Prepare the loss function
    if self.arguments.use_label_smoothing:
        criterion = NMTCriterion(label_smoothing=self.arguments.label_smoothing)
    else:
        criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(DEVICE)

    best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0
    for epoch in range(int(self.arguments.EPOCHS)):
        if early_stop_times >= self.arguments.early_stop * (
                train_examples_len // self.arguments.BATCH):
            break
        logger.info(f'---------------- Epoch: {epoch + 1:02} ----------')
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            model.train()
            if self.arguments.label_smoothing:
                criterion.train()
            batch = tuple(t.to(DEVICE) for t in batch)
            _, input_ids, input_mask, segment_ids, label_ids = batch
            logits = model(input_ids, segment_ids, input_mask, labels=None)
            loss = criterion(inputs=logits, labels=label_ids,
                             normalization=1.0, reduce=False)

            # Correction for gradient accumulation
            if self.arguments.gradient_accumulation_steps > 1:
                loss = loss / self.arguments.gradient_accumulation_steps
            loss.backward(torch.ones_like(loss))
            scheduler.step()
            if (step + 1) % self.arguments.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if global_step % self.arguments.print_step == 0 and global_step != 0:
                dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate(
                    model, dev_dataloader, criterion, DEVICE,
                    self.arguments.label_list, args=self.arguments)
                logger.info('\n>>>dev report: \n{}'.format(dev_report))
                # Track the best accuracy
                if dev_acc > best_acc:
                    best_acc = dev_acc
                # Track the best AUC and checkpoint on improvement
                if dev_auc > best_auc:
                    best_auc = dev_auc
                    # Save the model
                    model_to_save = model.module if hasattr(model, 'module') else model
                    torch.save(model_to_save.state_dict(),
                               self.arguments.output_model_file)
                    with open(self.arguments.output_config_file, 'w') as f:
                        f.write(model_to_save.config.to_json_string())
                    early_stop_times = 0
                else:
                    early_stop_times += 1

    if os.path.exists(self.arguments.output_config_file) is False:
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), self.arguments.output_model_file)
        with open(self.arguments.output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
def _train_one_epoch(model, loader, device, context):
    """Called by torch_xla_py.data_parallel. This function is executed on
    each core of the TPU once per epoch."""
    # Model parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # One optimizer and scheduler per TPU core. Both objects are saved in
    # `context` to be reused the next epoch.
    optimizer = context.getattr_or(
        'optimizer',
        AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
              eps=args.adam_epsilon, betas=tuple(args.betas)))
    scheduler = context.getattr_or(
        'scheduler',
        WarmupLinearSchedule(optimizer, warmup_steps=warmup_updates,
                             t_total=total_num_updates))
    # Restart.  TODO: the scheduler is reset to 0 each epoch.
    scheduler.step(args.scheduler_last_epoch)
    logging.info(f'Restarting scheduler LR to: {scheduler.get_last_lr()}')

    tr_loss = None
    tracker = tpu_xm.RateTracker()
    model.train()
    for step, batch in loader:
        input_ids, input_mask, segment_ids, lm_label_ids, _ = batch
        outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
        loss = outputs[0]
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tracker.add(args.per_tpu_train_batch_size)
        tr_loss = (loss * args.gradient_accumulation_steps if step == 0
                   else tr_loss + loss * args.gradient_accumulation_steps)
        if (step + 1) % args.gradient_accumulation_steps == 0:
            tpu_xm.optimizer_step(optimizer)
            scheduler.step()
            optimizer.zero_grad()
    # Since checkpointing happens each epoch, we only need to save the
    # scheduler state at the end of each epoch.
    logging.info(f'Scheduler last_epoch {scheduler.last_epoch}')
    # `.item()` requires a trip from TPU to CPU, which is very slow;
    # use it only once per epoch.
    return tr_loss.item() / step
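# Since the TODO above notes the scheduler restarts from 0 each epoch, an
# alternative (a sketch, not the source's approach) is to checkpoint and
# restore the scheduler state together with the model between epochs:
torch.save({'model': model.state_dict(),
            'scheduler': scheduler.state_dict()}, 'checkpoint.pt')
# ...later, before training resumes:
ckpt = torch.load('checkpoint.pt')
model.load_state_dict(ckpt['model'])
scheduler.load_state_dict(ckpt['scheduler'])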
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: # Prepare data loader train_examples = read_swag_examples(os.path.join( args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, token_types_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join( args.data_dir, 'val.csv'), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = 
torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / global_step } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
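# Several of these loops call an accuracy(logits, label_ids) helper without
# defining it. A minimal sketch consistent with how it is used (a per-batch
# count that callers accumulate and then divide by nb_eval_examples):
import numpy as np

def accuracy(logits: np.ndarray, labels: np.ndarray) -> int:
    # argmax over the class dimension, then count correct predictions
    preds = np.argmax(logits, axis=1)
    return int((preds == labels).sum())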
except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=params.learning_rate, bias_correction=False, max_grad_norm=1.0) scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: # optimizer = AdamW(model.parameters(), lr=params.learning_rate, correct_bias=False) optimizer = AdamW(optimizer_grouped_parameters, lr=params.learning_rate, correct_bias=False) train_steps_per_epoch = params.train_size // params.batch_size scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps_per_epoch, t_total=params.epoch_num * train_steps_per_epoch) # Train and evaluate the model logging.info("Starting training for {} epoch(s)".format(params.epoch_num)) train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, tagger_model_dir, args.restore_dir)
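# For reference, WarmupLinearSchedule(optimizer, warmup_steps, t_total) from
# pytorch-transformers is itself a LambdaLR with the multiplier below -- a
# minimal sketch, useful when mixing it with other LambdaLR schedules as the
# snippet above does:
from torch.optim.lr_scheduler import LambdaLR

def warmup_linear_lambda(warmup_steps: int, t_total: int):
    def lr_lambda(step: int) -> float:
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))  # linear warmup to 1.0
        # linear decay back to 0.0 at t_total
        return max(0.0, float(t_total - step) / float(max(1.0, t_total - warmup_steps)))
    return lr_lambda

# e.g. scheduler = LambdaLR(optimizer, lr_lambda=warmup_linear_lambda(100, 1000))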
def main(): args = parse_args() # Devices if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}") # Load config config = BertConfig.from_json_file(args.config_file) # Load task config with open(args.tasks_config_file, "r") as f: task_cfg = edict(yaml.safe_load(f)) task_id = args.task.strip() task = "TASK" + task_id task_name = task_cfg[task]["name"] base_lr = task_cfg[task]["lr"] if task_cfg[task].get("fusion_method", None): # VL-BERT pooling for VQA config.fusion_method = task_cfg[task]["fusion_method"] # Output dirs if args.save_name: prefix = "-" + args.save_name else: prefix = "" timestamp = (task_name + "_" + args.config_file.split("/")[1].split(".")[0] + prefix) save_path = os.path.join(args.output_dir, timestamp) if default_gpu: if not os.path.exists(save_path): os.makedirs(save_path) # save all the hidden parameters. with open(os.path.join(save_path, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) # Seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) # Dataset batch_size, task2num_iters, dset_train, dset_val, dl_train, dl_val = LoadDataset(args, config, task_cfg, args.task) # Logging logdir = os.path.join(args.logdir, timestamp) tb_logger = tbLogger(logdir, save_path, [task_name], [task], task2num_iters, args.grad_acc_steps) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Model if "roberta" in args.bert_model: config.model = "roberta" model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config, task_cfg=task_cfg, task_ids=[task]) if task_cfg[task].get("embed_clf", None): logger.info('Initializing classifier weight for %s from pretrained word embeddings...' 
% task) answers_word_embed = [] for k, v in model.state_dict().items(): if 'bert.embeddings.word_embeddings.weight' in k: word_embeddings = v.detach().clone() break for answer, label in sorted(dset_train.ans2label.items()): a_tokens = dset_train._tokenizer.tokenize(answer) a_ids = dset_train._tokenizer.convert_tokens_to_ids(a_tokens) if len(a_ids): a_word_embed = (torch.stack([word_embeddings[a_id] for a_id in a_ids], dim=0)).mean(dim=0) else: a_tokens = dset_train._tokenizer.tokenize("<unk>") a_id = dset_train._tokenizer.convert_tokens_to_ids(a_tokens)[0] a_word_embed = word_embeddings[a_id] answers_word_embed.append(a_word_embed) answers_word_embed_tensor = torch.stack(answers_word_embed, dim=0) for name, module in model.named_modules(): if name.endswith('clfs_dict.%s.logit_fc.3' % task): module.weight.data = answers_word_embed_tensor.to(device=module.weight.data.device) # Optimization details freeze_layers(model) criterion = LoadLoss(task_cfg, args.task) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.0}] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": args.weight_decay}] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, eps=args.adam_epsilon, betas=args.adam_betas, correct_bias=args.adam_correct_bias) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr) num_train_optim_steps = (task2num_iters[task] * args.num_train_epochs // args.grad_acc_steps) warmup_steps = args.warmup_steps or args.warmup_proportion * num_train_optim_steps if args.lr_scheduler == "warmup_linear": scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optim_steps) else: scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_steps) # Resume training start_iter_id, global_step, start_epoch, tb_logger, max_score = \ resume(args.resume_file, model, optimizer, scheduler, tb_logger) # Move to GPU(s) model.to(device) for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Save starting model save(save_path, logger, -1, model, optimizer, scheduler, global_step, tb_logger, default_gpu) # Print summary if default_gpu: summary_parameters(model, logger) print("***** Running training *****") print(" Num Iters: ", task2num_iters[task]) print(" Batch size: ", batch_size) print(" Num steps: %d" % num_train_optim_steps) # Train for epoch_id in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch"): model.train() for step, batch in enumerate(dl_train): iter_id = start_iter_id + step + (epoch_id * len(dl_train)) loss, score = ForwardModelsTrain(config, task_cfg, device, task, batch, model, criterion) if args.grad_acc_steps > 1: loss = loss / args.grad_acc_steps loss.backward() if (step + 1) % args.grad_acc_steps == 0: # Clip gradient if args.clip_grad_norm > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) optimizer.step() if global_step < warmup_steps or args.lr_scheduler == "warmup_linear": scheduler.step() model.zero_grad() global_step += 1 if default_gpu: tb_logger.step_train(epoch_id, iter_id, float(loss), float(score), optimizer.param_groups[0]["lr"], task, "train") if (step % (20 * args.grad_acc_steps) == 0) and step != 0 and default_gpu: tb_logger.showLossTrain() # Decide whether to evaluate task if iter_id != 0 and iter_id % task2num_iters[task] == 0: score = evaluate(config, dl_val, task_cfg, device, task, model, criterion, epoch_id, default_gpu, tb_logger) if score > max_score: max_score = score save(save_path, logger, epoch_id, model, optimizer, scheduler, global_step, tb_logger, default_gpu, max_score) save(save_path, logger, epoch_id, model, optimizer, scheduler, global_step, tb_logger, default_gpu, max_score) tb_logger.txt_close()
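# freeze_layers(model) is called above but not defined in this snippet. A
# hypothetical sketch of what such a helper typically does -- the prefix names
# below are illustrative assumptions, not the author's actual list:
def freeze_layers(model, frozen_prefixes=("bert.embeddings",)):
    for name, param in model.named_parameters():
        if any(name.startswith(prefix) for prefix in frozen_prefixes):
            param.requires_grad = False  # excluded from optimizer updates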
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--training_data_path", default=None, type=str, required=True, help="The training data path") parser.add_argument("--validation_data_path", default=None, type=str, required=True, help="The validation data path") parser.add_argument( "--mcq_model", default=None, type=str, required=True, help="choose one from the list: bert-mcq-parallel-max, " "bert-mcq-weighted-sum, bert-mcq-concat, bert-mcq-mac, or add roberta instead of bert" ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese, roberta-base, roberta-large" ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--max_grad_norm", default=None, type=float, help="Max gradient norm.") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--dropout", default=0.0, type=float, help="dropout") parser.add_argument( "--eval_freq", default=0, type=int, help="Evaluation steps frequency. Default is at the end of each epoch. " "You can also increase the frequency") parser.add_argument( '--tie_weights_weighted_sum', action='store_true', help="Whether to tie the weights for the weighted sum model") parser.add_argument('--max_number_premises', type=int, default=None, help="Number of premise sentences to use at max") parser.add_argument('--num_labels', type=int, default=3, help="Number of labels") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument('--with_score', action='store_true', help="Knowledge with score is provided") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) # true batch size args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) with open(os.path.join(args.output_dir, "mcq_inputs.json"), 'w') as f: json.dump(vars(args), f, indent=2) stdout_handler = prepare_global_logging(args.output_dir, False) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if "roberta" in args.bert_model: tokenizer = RobertaTokenizer.from_pretrained( "roberta-large", do_lower_case=args.do_lower_case) logger.info("Type of Tokenizer : ROBERTA") else: tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) logger.info("Type of Tokenizer : BERT") data_reader = None if args.mcq_model == 'bert-mcq-parallel-max': model = BertMCQParallel.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'bert-mcq-concat': model = BertMCQConcat.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQConcatReader() elif args.mcq_model == 'bert-mcq-weighted-sum': model = BertMCQWeightedSum.from_pretrained( args.bert_model, tie_weights=args.tie_weights_weighted_sum, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'bert-mcq-simple-sum': model = BertMCQSimpleSum.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'bert-mcq-mac': model = BertMCQMAC.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'roberta-mcq-parallel-max': model = RoBertaMCQParallel.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-concat': model = RoBertaMCQConcat.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQConcatReader() elif args.mcq_model == 'roberta-mcq-weighted-sum': model = RoBertaMCQWeightedSum.from_pretrained( args.bert_model, tie_weights=args.tie_weights_weighted_sum, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-ws-score': model = RoBertaMCQWeightedSumScore.from_pretrained( args.bert_model, tie_weights=args.tie_weights_weighted_sum, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelScoreReader() elif args.mcq_model == 'roberta-mcq-simple-sum': model = RoBertaMCQSimpleSum.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-ss-score': model = RoBertaMCQSimpleSumScore.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) 
data_reader = RoBertaMCQParallelScoreReader() elif args.mcq_model == 'roberta-mcq-mac': model = RoBertaMCQMAC.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-conv3d': model = RoBertaMCQConv3d.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() else: logger.error(f"Invalid MCQ model name {args.mcq_model}") exit(0) if args.do_train: # Prepare data loader # get data loader for train/dev train_data = data_reader.read(args.training_data_path, tokenizer, args.max_seq_length, args.max_number_premises) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = data_reader.read(args.validation_data_path, tokenizer, args.max_seq_length, args.max_number_premises) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # num_train_optimization_steps, dividing by effective batch size t_total = (len(train_dataloader) // args.gradient_accumulation_steps) * args.num_train_epochs num_train_optimization_steps = ( len(train_dataloader) // args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare optimizer # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) model.to(device) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1 and not args.no_cuda: model = torch.nn.DataParallel(model) global_step = 0 number_of_batches_per_epoch = len(train_dataloader) if args.eval_freq > 0: steps_to_eval = args.eval_freq else: steps_to_eval = number_of_batches_per_epoch logger.info("***** Training *****") logger.info(" num examples = %d", len(train_data)) logger.info(" batch size = %d", args.train_batch_size) logger.info(" num steps = %d", num_train_optimization_steps) logger.info(" number of Gpus= %d", n_gpu) logger.info("***** Evaluation *****") logger.info(" num examples = %d", len(eval_data)) logger.info(" batch size = %d", args.eval_batch_size) best_acc = 0.0 best_epoch = 1 for epoch_index in trange(int(args.num_train_epochs), desc="Epoch"): epoch_start_time = time.time() model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 tq = tqdm(train_dataloader, desc="Iteration") acc = 0 for step, batch in enumerate(tq): batch = tuple(t.to(device) for t in batch) if not args.with_score: input_ids, segment_ids, input_mask, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, label_ids) else: input_ids, segment_ids, input_mask, scores, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, scores, label_ids) loss = outputs[0] logits = outputs[1] logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_accuracy = accuracy(logits, label_ids) acc += tmp_accuracy if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if args.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() if args.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 tq.set_description( _get_loss_accuracy(tr_loss / nb_tr_steps, acc / nb_tr_examples)) # TODO: always eval on last batch # For now select the batch_size appropriately if (((step + 1) % steps_to_eval == 0) or (step+1)==number_of_batches_per_epoch )\ and args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 etq = tqdm(eval_dataloader, desc="Validating") for batch in etq: batch = tuple(t.to(device) for t in batch) with torch.no_grad(): if not args.with_score: input_ids, segment_ids, input_mask, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, label_ids) else: input_ids, segment_ids, input_mask, scores, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, scores, label_ids) tmp_eval_loss = outputs[0] logits = outputs[1] logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 etq.set_description( _get_loss_accuracy( eval_loss / nb_eval_steps, eval_accuracy / nb_eval_examples)) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples logger.info(f"epoch, step | {epoch_index}, {step}") 
logger.info(" | Training | Validation") logger.info("accuracy | %.4f" % (acc / nb_tr_examples) + " | %.4f" % eval_accuracy) logger.info("loss | %.4f" % (tr_loss / nb_tr_steps) + " | %.4f" % eval_loss) best_acc = max(best_acc, eval_accuracy) if eval_accuracy == best_acc: best_epoch = (epoch_index, step) logger.info( "best validation performance so far %.4f: " % best_acc + ", best epoch: " + str(best_epoch) + ". saving current model to " + args.output_dir) # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) model.train() epoch_end_time = time.time() logger.info( f"time it took to finish the epoch {epoch_index} of {args.num_train_epochs} is " + _show_runtime(epoch_end_time - epoch_start_time)) # Does this even make sense to output? result = { 'eval_accuracy': best_acc, 'global_step': global_step, 'best_epoch': best_epoch } cleanup_global_logging(stdout_handler) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(): # **************************** basic setup *********************** logger = init_logger(log_name=config['model']['arch'], log_dir=config['output']['log_dir']) logger.info(f"seed is {config['train']['seed']}") device = f"cuda:{config['train']['n_gpu'][0]}" if len(config['train']['n_gpu']) else 'cpu' seed_everything(seed=config['train']['seed'],device=device) logger.info('starting to load data from disk') id2label = {value: key for key, value in config['label2id'].items()} # **************************** data generation *********************** DT = DataTransformer(logger = logger,seed = config['train']['seed']) # read the dataset and split it targets,sentences = DT.read_data(raw_data_path = config['data']['raw_data_path'], preprocessor = EnglishPreProcessor(), is_train = True) train, valid = DT.train_val_split(X = sentences,y = targets,save=True,shuffle=True,stratify=False, valid_size = config['train']['valid_size'], train_path = config['data']['train_file_path'], valid_path = config['data']['valid_file_path']) tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'], do_lower_case=config['train']['do_lower_case']) # train train_dataset = CreateDataset(data = train, tokenizer = tokenizer, max_seq_len = config['train']['max_seq_len'], seed = config['train']['seed'], example_type = 'train') # valid valid_dataset = CreateDataset(data= valid, tokenizer = tokenizer, max_seq_len = config['train']['max_seq_len'], seed = config['train']['seed'], example_type = 'valid') # load the training dataset train_loader = DataLoader(dataset = train_dataset, batch_size = config['train']['batch_size'], num_workers = config['train']['num_workers'], shuffle = True, drop_last = False, pin_memory = False) # validation dataset valid_loader = DataLoader(dataset = valid_dataset, batch_size = config['train']['batch_size'], num_workers = config['train']['num_workers'], shuffle = False, drop_last = False, pin_memory = False) # **************************** model *********************** logger.info("initializing model") model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'], cache_dir=config['output']['cache_dir'], num_classes = len(id2label)) # ************************** optimizer ************************* param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] num_train_steps = int( len(train_dataset.examples) / config['train']['batch_size'] / config['train']['gradient_accumulation_steps'] * config['train']['epochs']) # t_total: total number of training steps for the learning rate schedule # warmup: portion of t_total for the warmup optimizer = AdamW(optimizer_grouped_parameters, lr = config['train']['learning_rate']) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=config['train']['warmup_steps'], t_total=num_train_steps) # **************************** callbacks *********************** logger.info("initializing callbacks") # model checkpointing model_checkpoint = ModelCheckpoint(checkpoint_dir = config['output']['checkpoint_dir'], mode = config['callbacks']['mode'], monitor = config['callbacks']['monitor'], save_best_only = config['callbacks']['save_best_only'], arch = config['model']['arch'], logger = logger) # monitor the training process train_monitor = TrainingMonitor(file_dir = config['output']['figure_dir'], arch = config['model']['arch']) # learning-rate schedule lr_scheduler = 
BertLR(optimizer = optimizer, learning_rate = config['train']['learning_rate'], t_total = num_train_steps, warmup = config['train']['warmup_steps']) # **************************** training model *********************** logger.info('training model....') train_configs = { 'model': model, 'logger': logger, 'optimizer': optimizer, 'scheduler': scheduler, 'resume': config['train']['resume'], 'epochs': config['train']['epochs'], 'n_gpu': config['train']['n_gpu'], 'gradient_accumulation_steps': config['train']['gradient_accumulation_steps'], 'epoch_metrics':[F1Score(average='micro',task_type='binary'),MultiLabelReport(id2label = id2label)], 'batch_metrics':[AccuracyThresh(thresh=0.5)], 'criterion': BCEWithLogLoss(), 'model_checkpoint': model_checkpoint, 'training_monitor': train_monitor, 'lr_scheduler': lr_scheduler, 'early_stopping': None, 'verbose': 1 } trainer = Trainer(train_configs=train_configs) # fit the model trainer.train(train_data = train_loader,valid_data=valid_loader) # free GPU memory if len(config['train']['n_gpu']) > 0: torch.cuda.empty_cache()
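# Worked example of the num_train_steps computation above, with illustrative
# numbers: 10,000 training examples, batch_size 32, gradient_accumulation_steps 2,
# 5 epochs:
num_train_steps = int(10_000 / 32 / 2 * 5)  # = 781 optimizer updates in total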
[p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_feats['data']) // opt.batchSize * opt.max_epoch optimizer = AdamW( optimizer_grouped_parameters, lr=opt.lr, correct_bias=False ) # To reproduce BertAdam specific behavior set correct_bias=False scheduler = WarmupLinearSchedule( optimizer, warmup_steps=int(opt.warmup_proportion * num_train_optimization_steps), t_total=num_train_optimization_steps) # PyTorch scheduler # prepare_inputs_for_bert(sentences, word_lengths) def decode(data_feats, data_tags, data_class, output_path): data_index = np.arange(len(data_feats)) losses = [] TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0 TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0 with open(output_path, 'w') as f: for j in range(0, len(data_index), opt.test_batchSize): if opt.testing: words, tags, raw_tags, classes, raw_classes, lens, line_nums = data_reader.get_minibatch_with_class( data_feats,
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model) # We don't need to manually call model.half() following Apex's recommendation # if args.fp16: # model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: # from apex.optimizers import FP16_Optimizer # from apex.optimizers import FusedAdam from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) # The line below is the main upgrade to the Apex fp16 implementation. opt_level="O1" is chosen # because Apex recommends it for typical use; it could be made configurable. model, optimizer = amp.initialize(model, optimizer, opt_level="O1") # We don't need to use FP16_Optimizer wrapping over FusedAdam as well. 
Now Apex supports any PyTorch optimizer. # optimizer = FusedAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # bias_correction=False, # max_grad_norm=1.0) # if args.loss_scale == 0: # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) # else: # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) # else: # optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, attention_mask=input_mask, masked_lm_labels=lm_label_ids) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: # FP16_Optimizer's backward() is deprecated here and replaced per the Apex docs # optimizer.backward(loss) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
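# save_pretrained() and from_pretrained() round-trip, so the checkpoint written
# above can be reloaded directly -- a minimal sketch, assuming the same classes:
model = BertForMaskedLM.from_pretrained(str(args.output_dir))
tokenizer = BertTokenizer.from_pretrained(str(args.output_dir))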
'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_train_steps) # Now we do the actual training. In each epoch, we present the model with all training data and compute the loss on the training set and the development set. We save the model whenever the development loss improves. We end training when we haven't seen an improvement of the development loss for a specific number of epochs (the patience). # # Optionally, we use gradient accumulation to accumulate the gradient over several training steps. This is useful when we want to use a larger batch size than our current GPU allows; a short sketch of the pattern follows this snippet. import os from tqdm import trange from tqdm import tqdm from sklearn.metrics import classification_report, precision_recall_fscore_support OUTPUT_DIR = "/tmp/" MODEL_FILE_NAME = "pytorch_model.bin"
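# A minimal sketch of the gradient-accumulation pattern described above,
# assuming model, optimizer, scheduler, device, and train_dataloader as defined
# earlier, with the batch layout used by the surrounding loops:
ACCUMULATION_STEPS = 4  # effective batch size = batch_size * ACCUMULATION_STEPS

model.train()
optimizer.zero_grad()
for step, batch in enumerate(train_dataloader):
    input_ids, segment_ids, input_mask, label_ids = (t.to(device) for t in batch)
    loss = model(input_ids, segment_ids, input_mask, label_ids)[0]
    (loss / ACCUMULATION_STEPS).backward()  # scale so the summed gradients average out
    if (step + 1) % ACCUMULATION_STEPS == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()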
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_corpus", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--cuda_device', default='0', type=str, help="Which GPU card to target") args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir) and ( args.local_rank == -1 or torch.distributed.get_rank() == 0): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_corpus) train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) best_avg_loss = 10000000 best_dir = os.path.join(args.output_dir, 'best') loss_dir = os.path.join(args.output_dir, 'losses') os.mkdir(loss_dir) os.mkdir(best_dir) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): losses = [] tr_loss = 0 running_loss = 0 curr_step = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() running_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 curr_step += 1 if curr_step % 1000 == 0: logger.info('Average loss after %d steps is: %.5f' % (curr_step, running_loss / 1000)) losses.append(running_loss / 1000) # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info( "** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training if running_loss < best_avg_loss: best_avg_loss = running_loss model_to_save.save_pretrained(best_dir) tokenizer.save_pretrained(best_dir) curr_dir = os.path.join(args.output_dir, 'epoch_%d' % epoch) if not os.path.exists(curr_dir): os.mkdir(curr_dir) model_to_save.save_pretrained(curr_dir) tokenizer.save_pretrained(curr_dir) running_loss = 0.0 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training if running_loss < best_avg_loss: best_avg_loss = running_loss model_to_save.save_pretrained(best_dir) tokenizer.save_pretrained(best_dir) curr_dir = os.path.join(args.output_dir, 'epoch_%d' % epoch) if not os.path.exists(curr_dir): os.mkdir(curr_dir) model_to_save.save_pretrained(curr_dir) tokenizer.save_pretrained(curr_dir) with open(os.path.join(loss_dir, 'epoch_%d' % epoch), 'w+', encoding='utf-8') as fp: json.dump(losses, fp)
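# The per-epoch loss curves dumped above can be read back for inspection --
# a minimal sketch, assuming the same loss_dir layout:
import json, os
with open(os.path.join(loss_dir, 'epoch_0'), encoding='utf-8') as fp:
    epoch0_losses = json.load(fp)  # list of running-average losses, one per 1000 steps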
def prepare_optimizer_and_scheduler( args: Namespace, model: nn.Module, num_batches: int, ) -> Tuple[AdamW, WarmupLinearSchedule]: """Configures BERT's AdamW optimizer and WarmupLinearSchedule learning rate scheduler. Divides parameters into two learning rate groups, with higher learning rate for non-BERT parameters (classifier model).""" t_total = (num_batches // args.gradient_accumulation_steps * args.num_train_epochs) if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() logger.info(" Total optimization steps = %d", t_total) # Prepare optimizer param_optimizer = list( filter(lambda p: p[1].requires_grad, model.named_parameters())) no_decay = ['bias', 'LayerNorm.weight'] higher_lr = ['classifier', 'crf', 'lstm'] def is_classifier_param(param_name: str) -> bool: return any(hl in param_name for hl in higher_lr) def ignore_in_weight_decay(param_name: str) -> bool: return any(nd in param_name for nd in no_decay) optimizer_grouped_parameters = [ { 'params': [ p for name, p in param_optimizer if not ignore_in_weight_decay(name) and not is_classifier_param(name) ], 'weight_decay': 0.01 }, { 'params': [ p for name, p in param_optimizer if not ignore_in_weight_decay(name) and is_classifier_param(name) ], 'weight_decay': 0.01, 'lr': args.classifier_lr }, { 'params': [ p for name, p in param_optimizer if ignore_in_weight_decay(name) and not is_classifier_param(name) ], 'weight_decay': 0.0 }, { 'params': [ p for name, p in param_optimizer if ignore_in_weight_decay(name) and is_classifier_param(name) ], 'weight_decay': 0.0, 'lr': args.classifier_lr }, ] # To reproduce BertAdam specific behavior set correct_bias=False optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) num_warmup_steps = t_total * args.warmup_proportion scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=t_total) return optimizer, scheduler
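# Hypothetical usage of the helper above, assuming args carries learning_rate,
# classifier_lr, warmup_proportion, gradient_accumulation_steps and
# num_train_epochs, and that train_dataloader is already built. Note the fourth
# parameter group covers no-decay classifier params (e.g. classifier biases),
# which would otherwise be left out of the optimizer entirely.
optimizer, scheduler = prepare_optimizer_and_scheduler(
    args, model, num_batches=len(train_dataloader))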
def main(): if os.path.exists(args.save_dir) and os.listdir(args.save_dir): if not args.overwrite and input("Save directory ({}) already exists and is not empty, rewrite the files?(Y/N)\n".format(args.save_dir)) not in ("Y","y"): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.save_dir)) if not os.path.exists(args.save_dir): os.makedirs(args.save_dir, exist_ok=True) # train and eval if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) logger.info("gradient_accumulation_steps {}".format(args.gradient_accumulation_steps)) train_batch_size = args.train_batch_size // args.gradient_accumulation_steps # prepare train data if not os.path.exists(args.data_dir): raise ValueError("data dir does not exist") # load train data train_loader=FeatureLoader(os.path.join(args.data_dir, "{}-features-train".format(args.encoder)), batch_size=train_batch_size) # load valid data eval_loader=FeatureLoader(os.path.join(args.data_dir, "{}-features-valid".format(args.encoder)), batch_size=args.eval_batch_size) # configure model running parameters t_total = args.max_steps #num_train_epochs = args.max_steps // (len(train_features) // args.gradient_accumulation_steps) + 1 # Prepare model model = get_inference_net( model_path=model_path[args.encoder], name=args.encoder) logger.info("{}".format(model)) model.to(device) if n_gpu>1: from torch import nn model = nn.DataParallel(model) # Prepare optimizer no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) # begin training logger.info("total train steps: {}, n_gpu:{}".format(t_total, n_gpu)) def _eval_model(model): logger.info("***** Running evaluation *****") eval_dataloader = eval_loader.load_data(args.eval_load_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 i=0 for input_ids, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, label_ids) logits = model(input_ids, segment_ids) i+=args.eval_batch_size logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples return eval_accuracy, eval_loss def _train_model(): logger.info("***** Running training *****") best_acc=0.0 train_step=0 step=0 tr_loss=0.0 train_dataloader = iter(train_loader.load_data(args.train_load_size) ) model.train() while train_step< t_total: try: batch=next(train_dataloader) except StopIteration: train_dataloader=iter(train_loader.load_data(args.train_load_size) ) batch=next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, segment_ids, label_ids = batch input_ids=input_ids.to(device) 
segment_ids=segment_ids.to(device) label_ids=label_ids.to(device) loss = model(input_ids, segment_ids, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss+=loss.item() step+=1 if step % args.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() train_step += 1 #verbose after training steps if train_step%args.train_verbose==0: logger.info("training steps: #{}, loss={}".format(train_step, tr_loss/args.train_verbose)) tr_loss=0.0 # save Model if train_step%args.eval_step_size==0: eval_acc,eval_loss=_eval_model(model) if eval_acc>best_acc: saveModel(model, train_step) best_acc=eval_acc logger.info("training step:{}, eval_accuracy:{}, eval_loss:{}, best eval_acc:{}".format( train_step, eval_acc, eval_loss, best_acc) ) #begin training model _train_model()
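# A self-contained sketch of the accumulation/step pattern used in _train_model
# above, with the PyTorch >= 1.1 ordering (optimizer.step() before
# scheduler.step()). All objects here are toy stand-ins, not the real loaders.
import torch
from torch import nn

def accumulation_demo(accumulation_steps: int = 4, total_updates: int = 10):
    model = nn.Linear(4, 2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda s: 1.0)
    updates, micro_step = 0, 0
    while updates < total_updates:
        x = torch.randn(8, 4)
        loss = model(x).pow(2).mean() / accumulation_steps  # scale per micro-batch
        loss.backward()  # gradients accumulate across micro-batches
        micro_step += 1
        if micro_step % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            updates += 1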
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--wp", type=bool, default=False, help="whether to train on wp")
    parser.add_argument('--from_scratch', action='store_true',
                        help='do not load the pretrained model, use random initialization only')
    parser.add_argument("--output_step", type=int, default=100000,
                        help="Number of steps between model checkpoints")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    num_data_epochs = args.epochs
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) "
                  f"than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training "
                  "diversity may be negatively impacted.")
            num_data_epochs = i
            break

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)
    args.output_mode = "classification"

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # The original retried forever on a bare `except:`; bound the retries and
    # surface the error instead.
    tokenizer = None
    for attempt in range(5):
        try:
            tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                      do_lower_case=args.do_lower_case)
            break
        except Exception as exc:
            logger.warning("Loading tokenizer failed (attempt %d): %s", attempt + 1, exc)
    if tokenizer is None:
        raise RuntimeError("Could not load tokenizer after 5 attempts")
    if tokenizer._noi_token is None:
        tokenizer._noi_token = '[NOI]'
    # `== 'bert-base-uncased' or 'bert-large-uncased'` was always truthy;
    # membership is the intended test.
    if args.bert_model in ('bert-base-uncased', 'bert-large-uncased'):
        tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused0]')
    else:
        tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused1]')
    tokenizer.ids_to_tokens[1] = '[NOI]'
    logger.info("Adding [NOI] to the vocabulary at id 1")

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / args.train_batch_size
                                       / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    if args.from_scratch:
        # BertForMaskedLM needs a config even for random init; building it from
        # the named model keeps the architecture consistent (assumes BertConfig
        # is imported alongside BertForMaskedLM).
        model = BertForMaskedLM(BertConfig.from_pretrained(args.bert_model))
    else:
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=args.pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            reduce_memory=args.reduce_memory,
                                            args=args)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps  # running mean, for progress displays
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Optimizer first, then the learning rate schedule (PyTorch >= 1.1 ordering).
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.output_step == 0 and args.local_rank in [-1, 0]:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
        if args.local_rank in [-1, 0]:
            # Save an end-of-epoch checkpoint (same layout as the periodic one above)
            output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
        logger.info("PROGRESS: {}%".format(round(100 * (epoch + 1) / args.epochs, 4)))
        logger.info("EVALERR: {}%".format(tr_loss))

    # Save a trained model
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
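# The periodic and end-of-epoch checkpoint blocks above repeat the same
# unwrap-and-save steps; a small helper like this (hypothetical, not part of
# the original script) would keep the two call sites in sync.
import os
import torch

def save_checkpoint(model, tokenizer, args, output_dir):
    """Unwrap DataParallel/DDP if needed and write model, tokenizer, and args."""
    os.makedirs(output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.save(args, os.path.join(output_dir, 'training_args.bin'))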
def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step,
         eval_every, early_stop, lr, weight_decay, lr_decay_in_layers,
         wd_decay_in_layers, max_length, max_title_rate, content_head_rate,
         batch_size, lr_scheduler_type, input_pattern, clean_method,
         warmup_rate, classifier_dropout, classifier_active, seed):
    arg_name_value_pairs = deepcopy(locals())
    prefix = time.strftime('%Y%m%d_%H%M')
    logger = logging.getLogger('default')
    formatter = logging.Formatter("%(asctime)s %(message)s")
    if log_in_file:
        handler1 = logging.FileHandler(prefix + '.log')
        handler1.setFormatter(formatter)
        handler1.setLevel(logging.DEBUG)
        logger.addHandler(handler1)
    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    handler2.setLevel(logging.DEBUG)
    logger.addHandler(handler2)
    logger.setLevel(logging.DEBUG)
    for arg_name, arg_value in arg_name_value_pairs.items():
        logger.info(f'{arg_name}: {arg_value}')

    global tokenizer
    if lm_type == 'bert':
        tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt'))
    else:
        tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model'))
    global PAD, PAD_t, CLS_t, SEP_t
    PAD_t = '<pad>'
    CLS_t = '<cls>'
    SEP_t = '<sep>'
    PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0]
    logger.info(f'padding token is {PAD}')

    processed_train = preprocess(os.path.join(data_path, 'Train_DataSet.csv'),
                                 os.path.join(data_path, 'Train_DataSet_Label.csv'),
                                 tokenizer, max_length, input_pattern, clean_method,
                                 max_title_rate, content_head_rate, logger)
    processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'),
                                False, tokenizer, max_length, input_pattern,
                                clean_method, max_title_rate, content_head_rate, logger)

    logger.info('seed everything and create model')
    seed_everything(seed)
    no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight']
    if lm_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, summary_last_dropout=classifier_dropout)
        if classifier_active == 'relu':
            model.sequence_summary.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['transformer.mask_emb', 'transformer.word_embedding.weight']
        model_layer_names += [f'transformer.layer.{i}.'
                              for i in range(model.config.n_layer)]
        model_layer_names += ['sequence_summary.summary', 'logits_proj']
    else:
        model = BertForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout)
        if classifier_active == 'relu':
            model.bert.pooler.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['bert.embeddings']
        model_layer_names += ['bert.encoder.layer.{}.'.format(i)
                              for i in range(model.config.num_hidden_layers)]
        model_layer_names += ['bert.pooler', 'classifier']

    # Layer-wise learning rate and weight decay: layer names are enumerated in
    # reverse, so the top of the network keeps the base values and earlier
    # layers get geometrically decayed ones. (The original had a doubled
    # `optimizer = optimizer =` assignment here.)
    optimizer = AdamW(
        [{'params': [p for n, p in model.named_parameters()
                     if layer_name in n and not any(nd in n for nd in no_decay)],
          'lr': lr * (lr_decay_in_layers ** i),
          'weight_decay': weight_decay * (wd_decay_in_layers ** i)}
         for i, layer_name in enumerate(model_layer_names[::-1])] +
        [{'params': [p for n, p in model.named_parameters()
                     if layer_name in n and any(nd in n for nd in no_decay)],
          'lr': lr * (lr_decay_in_layers ** i),
          'weight_decay': .0}
         for i, layer_name in enumerate(model_layer_names[::-1])])

    if lr_scheduler_type == 'linear':
        lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_rate,
                                            t_total=total_step)
    elif lr_scheduler_type == 'constant':
        lr_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_rate)
    else:
        raise ValueError(f'unknown lr_scheduler_type: {lr_scheduler_type}')

    model_state_0 = deepcopy(model.state_dict())
    optimizer_state_0 = deepcopy(optimizer.state_dict())
    test_iter = get_data_iter(processed_test, batch_size * 4, collect_test_func, shuffle=False)
    pred = np.zeros((len(processed_test), 3))
    val_scores = []
    for fold_idx, (train_idx, val_idx) in enumerate(
            KFold(n_splits=n_fold, shuffle=True, random_state=seed).split(processed_train)):
        model.load_state_dict(model_state_0)
        optimizer.load_state_dict(optimizer_state_0)
        # Re-create the scheduler so each fold starts from a fresh warmup.
        if lr_scheduler_type == 'linear':
            lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_rate,
                                                t_total=total_step)
        elif lr_scheduler_type == 'constant':
            lr_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_rate)
        else:
            raise ValueError(f'unknown lr_scheduler_type: {lr_scheduler_type}')
        train_iter = get_data_iter([processed_train[i] for i in train_idx],
                                   batch_size, collect_func)
        val_iter = get_data_iter([processed_train[i] for i in val_idx],
                                 batch_size * 4, collect_func, shuffle=False)
        best_model, best_score = training(model=model, optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          train_iter=train_iter, val_iter=val_iter,
                                          total_step=total_step, tokenizer=tokenizer,
                                          usegpu=usegpu, eval_every=eval_every,
                                          logger=logger, early_stop=early_stop,
                                          fold_idx=fold_idx)
        model.load_state_dict(best_model)
        val_scores.append(best_score)
        pred += predict(model, test_iter, usegpu)

    logger.info(f'average: {np.mean(val_scores):.6f}')
    pred = pred / n_fold
    prob_df = pd.DataFrame()
    submit = pd.DataFrame()
    submit['id'] = [i['id'] for i in processed_test]
    submit['label'] = pred.argmax(-1)
    prob_df['id'] = [i['id'] for i in processed_test]
    prob_df['0'] = pred[:, 0]
    prob_df['1'] = pred[:, 1]
    prob_df['2'] = pred[:, 2]
    submit.to_csv(f'submit_{prefix}.csv', index=False)
    prob_df.to_csv(f'probability_{prefix}.csv', index=False)
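# Sanity check for the layer-wise decay used above: layer names are enumerated
# in reverse, so the classifier (last name) keeps the base lr while the
# embeddings (first name) get the most-decayed one. Toy values, not taken from
# the original run.
def layerwise_lrs(base_lr=2e-5, decay=0.95,
                  names=('emb', 'layer.0', 'layer.1', 'clf')):
    return {name: base_lr * decay ** i for i, name in enumerate(names[::-1])}

lrs = layerwise_lrs()
assert lrs['clf'] == 2e-5               # top of the network: full learning rate
assert lrs['emb'] == 2e-5 * 0.95 ** 3   # bottom: smallest learning rate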
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--from_pretrained", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="save", type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--config_file", default="config/bert_base_6layer_6conect.json", type=str,
                        help="The config file which specifies the model details.")
    parser.add_argument("--num_train_epochs", default=20, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--train_iter_multiplier", default=1.0, type=float,
                        help="multiplier for the multi-task training.")
    parser.add_argument("--train_iter_gap", default=4, type=int,
                        help="forward every n iterations if the validation score is not improving "
                        "over the last 3 epochs; -1 means training will stop.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--do_lower_case", default=True, type=bool,
                        help="Whether to lower case the input text. True for uncased models, "
                        "False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed", type=int, default=0, help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--num_workers", type=int, default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument("--save_name", default="", type=str, help="save name for training.")
    parser.add_argument("--in_memory", default=False, type=bool,
                        help="whether to use chunks for parallel training.")
    parser.add_argument("--optim", default="AdamW", type=str,
                        help="what to use for the optimization.")
    parser.add_argument("--freeze", default=-1, type=int,
                        help="up to which layer the textual stream of vilbert should be frozen.")
    parser.add_argument("--vision_scratch", action="store_true",
                        help="whether to train the image stream from scratch.")
    parser.add_argument("--evaluation_interval", default=1, type=int,
                        help="evaluate every n epochs.")
    parser.add_argument("--lr_scheduler", default="mannul", type=str,
                        help="which learning rate scheduler to use.")
    parser.add_argument("--baseline", action="store_true",
                        help="whether to use the single-stream baseline.")
    parser.add_argument("--resume_file", default="", type=str, help="Resume from checkpoint")
    parser.add_argument("--dynamic_attention", action="store_true",
                        help="whether to use dynamic attention.")
    parser.add_argument("--clean_train_sets", default=True, type=bool,
                        help="whether to clean train sets for multitask data.")
    parser.add_argument("--visual_target", default=0, type=int,
                        help="which target to use for the visual branch: "
                        "0: soft label, 1: regress the feature, 2: NCE loss.")
    args = parser.parse_args()

    with open("task_config.yml", "r") as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_transformers.modeling_bert import BertConfig
        from src.models.basebert import BaseBertForVLTasks
    else:
        from src.models.vilbert import BertConfig
        from src.models.vilbert import VILBertForVLTasks

    name = task_cfg["name"]
    task_lr = task_cfg["lr"]
    base_lr = task_lr
    loss_scale = task_lr / base_lr

    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    timeStamp = args.config_file.split("/")[1].split(".")[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(open("config/" + args.bert_model + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    # load dataset
    (task_batch_size, task_num_iters, task_datasets_train, task_datasets_val,
     task_dataloader_train, task_dataloader_val) = LoadDatasets(args, task_cfg)

    logdir = os.path.join(savePath, "logs")
    tbLogger = utils.tbLogger(logdir, savePath, task_num_iters,
                              args.gradient_accumulation_steps)

    if args.visual_target == 0:
        config.v_target_size = 1601
    else:
        config.v_target_size = 2048
    config.visual_target = args.visual_target

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_ave_iter = int(task_cfg["num_epoch"] * task_num_iters
                        * args.train_iter_multiplier / args.num_train_epochs)
    task_stop_controller = utils.TaskStopOnPlateau(mode="max", patience=1,
                                                   continue_threshold=0.005,
                                                   cooldown=1, threshold=0.001)
    median_num_iter = task_ave_iter
    num_train_optimization_steps = (median_num_iter * args.num_train_epochs
                                    // args.gradient_accumulation_steps)
    num_labels = task_datasets_train.num_labels

    if args.dynamic_attention:
        config.dynamic_attention = True
    model = VILBertForVLTasks.from_pretrained(args.from_pretrained, config=config,
                                              num_labels=num_labels,
                                              default_gpu=default_gpu)

    task_losses = LoadLosses(args, task_cfg)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False
        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    optimizer_grouped_parameters = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if "vil_" in key:
                lr = 1e-4
            else:
                if args.vision_scratch:
                    if key[12:] in bert_weight_name:
                        lr = base_lr
                    else:
                        lr = 1e-4
                else:
                    lr = base_lr
            if any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.0}]
            else:
                optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.01}]
    if default_gpu:
        print(len(list(model.named_parameters())), len(optimizer_grouped_parameters))

    # choose optimizer
    if args.optim == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False)
    elif args.optim == "RAdam":
        optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr)

    # choose scheduler (the original spelled this variable `warmpu_steps`)
    warmup_steps = args.warmup_proportion * num_train_optimization_steps
    if args.lr_scheduler == "warmup_linear":
        warmup_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                                t_total=num_train_optimization_steps)
    else:
        warmup_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_steps)

    lr_reduce_list = np.array([5, 7])
    if args.lr_scheduler == "automatic":
        lr_scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.2,
                                         patience=1, cooldown=1, threshold=0.001)
    elif args.lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingLR(optimizer, T_max=median_num_iter * args.num_train_epochs)
    elif args.lr_scheduler == "cosine_warm":
        lr_scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=median_num_iter * args.num_train_epochs)
    elif args.lr_scheduler == "mannul":
        def lr_lambda_fun(epoch):
            return pow(0.2, np.sum(lr_reduce_list <= epoch))
        lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun)

    startIterID = 0
    global_step = 0
    start_epoch = 0
    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace("module.", "", 1)] = checkpoint["model_state_dict"][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler_state_dict"])
        # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict'])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        start_epoch = int(checkpoint["epoch_id"]) + 1
        task_stop_controller = checkpoint["task_stop_controller"]
        tbLogger = checkpoint["tb_logger"]
        del checkpoint

    model.to(device)
    print("============== MODEL ==============")
    print(next(model.parameters()).is_cuda)  # sanity check: should be True after model.to(device) on a GPU
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        print("***** Running training *****")
        print("  Num Iters: ", task_num_iters)
        print("  Batch size: ", task_batch_size)
        print("  Num steps: %d" % num_train_optimization_steps)

    task_iter_train = None
    task_count = 0
    for epochId in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch", ncols=100):
        model.train()
        for step in range(median_num_iter):
            iterId = startIterID + step + (epochId * median_num_iter)
            first_task = True
            is_forward = False
            if (not task_stop_controller.in_stop) or (iterId % args.train_iter_gap == 0):
                is_forward = True
            if is_forward:
                loss, score = ForwardModelsTrain(args, task_cfg, device, task_count,
                                                 task_iter_train, task_dataloader_train,
                                                 model, task_losses)
                loss = loss * loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # Manual warmup for fp16; note that `warmup_linear` and
                        # `args.learning_rate` are assumed to exist in the
                        # surrounding module, neither is defined in this snippet.
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group["lr"] = lr_this_step
                    # Optimizer first, then the warmup schedule (PyTorch >= 1.1 ordering).
                    optimizer.step()
                    if first_task and (global_step < warmup_steps
                                       or args.lr_scheduler == "warmup_linear"):
                        warmup_scheduler.step()
                    model.zero_grad()
                    if first_task:
                        global_step += 1
                        first_task = False
                    if default_gpu:
                        tbLogger.step_train(epochId, iterId, float(loss), float(score),
                                            optimizer.param_groups[0]["lr"], "train")
            if "cosine" in args.lr_scheduler and global_step > warmup_steps:
                lr_scheduler.step()
            if (step % (20 * args.gradient_accumulation_steps) == 0
                    and step != 0 and default_gpu):
                tbLogger.showLossTrain()
            # decide whether to evaluate on SNLI tasks
            if (iterId != 0 and iterId % task_num_iters == 0) or (
                    epochId == args.num_train_epochs - 1 and step == median_num_iter - 1):
                # The original discarded evaluate()'s return value but read
                # `val_scores` below; capturing it here is the assumed intent.
                val_scores = evaluate(args, task_dataloader_val, task_stop_controller,
                                      task_cfg, device, model, task_losses, epochId,
                                      default_gpu, tbLogger)

        if args.lr_scheduler == "automatic":
            lr_scheduler.step(sum(val_scores.values()))
            logger.info("best average score is %.3f" % lr_scheduler.best)
        elif args.lr_scheduler == "mannul":
            lr_scheduler.step()

        if epochId in lr_reduce_list:
            # reset the task_stop_controller once the lr drops
            task_stop_controller._reset()

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine-tuned model ** ** * ")
            model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself
            output_model_file = os.path.join(savePath, "pytorch_model_" + str(epochId) + ".bin")
            output_checkpoint = os.path.join(savePath, "pytorch_ckpt_latest.tar")
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save({
                "model_state_dict": model_to_save.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "warmup_scheduler_state_dict": warmup_scheduler.state_dict(),
                # "lr_scheduler_state_dict": lr_scheduler.state_dict(),
                "global_step": global_step,
                "epoch_id": epochId,
                "task_stop_controller": task_stop_controller,
                "tb_logger": tbLogger,
            }, output_checkpoint)
    tbLogger.txt_close()
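# A quick check of the "mannul" LambdaLR schedule above: with
# lr_reduce_list = [5, 7], the multiplier is 0.2 ** (number of thresholds
# passed), i.e. 1.0 for epochs 0-4, 0.2 for epochs 5-6, and 0.04 from epoch 7.
import numpy as np

def lr_multiplier(epoch, lr_reduce_list=np.array([5, 7])):
    return pow(0.2, int(np.sum(lr_reduce_list <= epoch)))

assert lr_multiplier(4) == 1.0
assert lr_multiplier(5) == 0.2
assert abs(lr_multiplier(7) - 0.04) < 1e-12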
def train(train_task_name, model, tokenizer):
    set_seed(42)  # for reproducibility

    # prepare training dataset
    train_features = convert_to_input_features_helper(train_examples, tokenizer,
                                                      use_multiprocessing)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)  # note: no minus 1
    train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    # total batch size
    train_batch_size = per_gpu_train_batch_size * max(1, n_gpus)
    train_sampler = SequentialSampler(train_dataset)  # was RandomSampler
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

    if max_steps > 0:
        t_total = max_steps
        # Fix of the original `num_trian_epochs` typo: a fresh local name is
        # used because rebinding `num_train_epochs` would shadow the global
        # read in the else branch.
        num_epochs_to_run = max_steps // len(train_dataloader) // gradient_accumulation_steps + 1
    else:
        t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
        num_epochs_to_run = num_train_epochs

    # prepare optimizer and schedule (linear warmup and decay)
    warmup_steps = int(t_total * warmup_proportion)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    if n_gpus > 1:
        print('*********** using multi gpu! ************')
        model = torch.nn.DataParallel(model)

    logger.info("***** Running %s *****", 'training')
    logger.info("  Num examples = %d", len(train_dataloader))
    logger.info("  Batch size per gpu = %d", per_gpu_train_batch_size)
    logger.info("  Total batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", t_total)

    # train
    max_grad_norm = 1
    epoch = 0  # for visualization of loss per epoch
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(num_epochs_to_run), desc='Epoch')
    saved_loss = []
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels': batch[3]}
            outputs = model(**inputs)  # unpack dict
            loss, logits = outputs[:2]  # model outputs are in a tuple
            saved_loss.append(loss.detach().cpu().numpy().item())
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # call the optimizer before the scheduler
                model.zero_grad()
                global_step += 1
            if max_steps > 0 and global_step > max_steps:
                epoch_iterator.close()
                break
        epoch += 1
        # save model at each epoch
        output_model_dir = os.path.join(cache_dir, 'epoch_{}'.format(epoch))
        if not os.path.exists(output_model_dir):
            os.makedirs(output_model_dir)
        # take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_model_dir)
        tokenizer.save_pretrained(output_model_dir)
        # `stats` and `output_dir` were undefined in the original; saving the
        # collected losses next to the checkpoint is the assumed intent.
        torch.save(saved_loss, os.path.join(output_model_dir, 'training_loss.bin'))
        logger.info('Saving model at epoch %d to %s', epoch, output_model_dir)
        # evaluation uses the saved model
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break

    # draw and save the loss-step graph
    save_loss(saved_loss, global_step)
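# `save_loss` is referenced above but not defined in this snippet; a minimal
# version consistent with the call signature might look like this
# (hypothetical sketch, assumes matplotlib is available).
import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt

def save_loss(saved_loss, global_step, path='loss_curve.png'):
    """Plot the per-batch losses collected during training and save to disk."""
    plt.figure()
    plt.plot(range(len(saved_loss)), saved_loss)
    plt.xlabel('batch')
    plt.ylabel('loss')
    plt.title(f'training loss ({global_step} optimizer steps)')
    plt.savefig(path)
    plt.close()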