def test_warmup_cosine_hard_restart_scheduler(self): scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10 ) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] self.assertEqual(len(lrs[0]), 1) self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10 ) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
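For reference, the schedule that the test above asserts against can be reproduced standalone. This is a minimal sketch, not part of the test suite; the dummy nn.Linear model and the base LR of 10.0 are assumptions chosen only so the printed values line up with the test's expected numbers.

import torch
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

# Dummy model/optimizer; lr=10.0 only so the printed values mirror the test above.
model = torch.nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=10.0)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)

for step in range(10):
    optimizer.step()
    scheduler.step()
    # Prints approximately 5.0, 10.0, 8.5, 5.0, 1.5, 10.0, 8.5, 5.0, 1.5, 0.0
    print(step + 1, round(scheduler.get_last_lr()[0], 2))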
def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, t_total: int): """ Returns the correct learning rate scheduler """ scheduler = scheduler.lower() if scheduler == 'constantlr': return transformers.get_constant_schedule(optimizer) elif scheduler == 'warmupconstant': return transformers.get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) elif scheduler == 'warmuplinear': return transformers.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) elif scheduler == 'warmupcosine': return transformers.get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) elif scheduler == 'warmupcosinewithhardrestarts': return transformers.get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) else: raise ValueError("Unknown scheduler {}".format(scheduler))
def get_scheduler(optimizer, scheduler: str, warmup_steps: int, num_total: int): assert scheduler in [ "constantlr", "warmuplinear", "warmupconstant", "warmupcosine", "warmupcosinewithhardrestarts" ], ('scheduler should be one of ["constantlr","warmuplinear","warmupconstant","warmupcosine","warmupcosinewithhardrestarts"]' ) if scheduler == 'constantlr': return transformers.get_constant_schedule(optimizer) elif scheduler == 'warmupconstant': return transformers.get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) elif scheduler == 'warmuplinear': return transformers.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_total) elif scheduler == 'warmupcosine': return transformers.get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_total) elif scheduler == 'warmupcosinewithhardrestarts': return transformers.get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_total)
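A hedged usage sketch for the get_scheduler helper above; the toy parameter, SGD optimizer, and step counts are placeholder assumptions rather than values from the original project. Note that, unlike the _get_scheduler variant before it, this function does not lowercase its argument, so the scheduler name must already be lowercase.

import torch
import transformers  # used inside get_scheduler above

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=2e-5)
scheduler = get_scheduler(optimizer, scheduler="warmupcosinewithhardrestarts",
                          warmup_steps=100, num_total=1000)
for _ in range(1000):
    optimizer.step()
    scheduler.step()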
def augmentation_train(self, DATA_FILE, BATCH_SIZE=2, EPOCHS=1, LEARNING_RATE=0.01, WARMUP_STEPS=100, MAX_SEQ_LEN=100, MODEL_NAME='gpt2'): """ DATA_FILE: Path of the data file BATCH_SIZE: Batch size EPOCHS: Number of augmentation epochs LEARNING_RATE: Learning Rate WARMUP_STEPS: Warm up steps MAX_SEQ_LEN: Maximum sequence length of each text MODEL_NAME: For now, only gpt2 is supported; more will be added in the future. """ TOKENIZER, MODEL = load_models() LOADER = get_data_loader(DATA_FILE) DEVICE = 'cpu' if torch.cuda.is_available(): DEVICE = 'cuda' model = MODEL.to(DEVICE) model.train() optimizer = AdamW(model.parameters(), lr=LEARNING_RATE) scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1) model = train(EPOCHS, LOADER, BATCH_SIZE, TOKENIZER, MODEL, DEVICE, optimizer, scheduler) self.MODEL = model
def configure_optimizers( self): # Scheduler can be swapped for one without hard restarts optimizer = transformers.AdamW(self.parameters(), lr=self.learning_rate) warmup_steps = self.steps_per_epoch // 3 # The first third of an epoch is warmup, to speed up training total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps # We use the default of 1 hard restart scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, warmup_steps, total_steps) return [optimizer], [scheduler]
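Since total_steps above is counted in optimizer steps while PyTorch Lightning, by default, steps schedulers returned as a plain list once per epoch, the dict form (used in later snippets here) would advance the schedule every batch instead. A sketch under that assumption, reusing the same attributes as the snippet above:

def configure_optimizers(self):
    optimizer = transformers.AdamW(self.parameters(), lr=self.learning_rate)
    warmup_steps = self.steps_per_epoch // 3
    total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
    scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer, warmup_steps, total_steps)
    # 'interval': 'step' makes Lightning step the schedule after every optimizer step.
    return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]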
def _initialise_lr_scheduler(self, optimizer): num_batches = len(self.datasets['train']) // self.hparams.batch_size num_training_steps = num_batches // self.hparams.accumulate_grad_batches * self.hparams.max_epochs warmup_steps = int(num_training_steps * self.hparams.warmup_proportion) if self.hparams.learning_rate_scheduler == 'linear_with_warmup': scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps) elif self.hparams.learning_rate_scheduler == 'cosine_with_hard_restarts_warmup': scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps, num_cycles=1) elif self.hparams.learning_rate_scheduler == 'cosine_schedule_with_warmup': scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps) elif self.hparams.learning_rate_scheduler == 'constant_schedule_with_warmup': scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) elif self.hparams.learning_rate_scheduler == 'cosine_annealing_warm_restarts': scheduler = CosineAnnealingWarmRestarts(optimizer, warmup_steps) elif self.hparams.learning_rate_scheduler == 'reduce_on_plateau': scheduler = ReduceLROnPlateau(optimizer) elif self.hparams.learning_rate_scheduler == 'constant': scheduler = StepLR(optimizer, 10, gamma=1.0) else: raise ValueError( f'learning_rate_scheduler needs to be one of ' f'linear_with_warmup, cosine_with_hard_restarts_warmup, cosine_schedule_with_warmup, ' f'constant_schedule_with_warmup, cosine_annealing_warm_restarts, reduce_on_plateau, ' f'constant. ' f'Given: {self.hparams.learning_rate_scheduler}') logger.info(f'SCHEDULER: {self.hparams.learning_rate_scheduler} ' f'num_batches={num_batches} ' f'num_training_steps={num_training_steps} ' f'warmup_steps={warmup_steps}') return { 'scheduler': scheduler, 'monitor': 'valid_loss', 'interval': 'step', 'frequency': 1 }
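For context, the dict returned by _initialise_lr_scheduler is normally consumed from configure_optimizers. A minimal sketch of that wiring, where self.hparams.learning_rate is an assumed hyperparameter name rather than one taken from the original module:

def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.learning_rate)
    # Lightning accepts a dict with 'optimizer' and 'lr_scheduler' keys;
    # the scheduler dict built above carries interval/frequency/monitor.
    return {"optimizer": optimizer,
            "lr_scheduler": self._initialise_lr_scheduler(optimizer)}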
def _get_lr_scheduler(self, optimizer): lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=self._num_warmup_steps, num_training_steps=self._get_num_training_steps(), num_cycles=self._num_cycles) scheduler = { 'scheduler': lr_scheduler, 'interval': 'step', 'frequency': self.trainer.accumulate_grad_batches, 'monitor': 'Loss/valid'} return scheduler
def configure_optimizers(self): optimizer = torch.optim.AdamW(self.parameters(), lr=self.train_config["learning_rate"]) scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=10, num_training_steps=5000, num_cycles=10) schedulers = [{ 'scheduler': scheduler, 'interval': 'step', 'frequency': 1 }] return [optimizer], schedulers
def configure_optimizers(self): # optimizer = torch.optim.Adam(self.parameters(), lr=self.config['lr']) optimizer = transformers.AdamW( self.parameters(), lr=self.config['lr']) #, weight_decay=0.01 scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=350, num_training_steps=3000, num_cycles=1) schedulers = [{ 'scheduler': scheduler, 'interval': 'step', 'frequency': 1 }] return [optimizer], schedulers
def get_model(model_args, training_args): '''Function that returns the tokenizer, model_config, model, optimizer, scaler, and scheduler''' # Load pretrained model and tokenizer model_config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, use_fast=True, ) if model_args.use_custom_model == 'ConvModel': model = ConvModel(model_args.config_name, model_config, model_args.tokenizer_name) elif model_args.use_custom_model == 'QueryAttentionModel': model = QueryAttentionModel(model_args.config_name, model_config, model_args.tokenizer_name) elif model_args.use_custom_model == 'QAConvModelV1': model = QAConvModelV1(model_args.config_name, model_config, model_args.tokenizer_name) elif model_args.use_custom_model == 'QAConvModelV2': model = QAConvModelV2(model_args.config_name, model_config, model_args.tokenizer_name) else: model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=model_config, ) if model_args.use_pretrained_model: pretrained_model = torch.load( f'/opt/ml/output/{model_args.model_name_or_path}/{model_args.model_name_or_path}.pt' ) pretrained_model_state = deepcopy(pretrained_model.state_dict()) model.load_state_dict(pretrained_model_state) del pretrained_model optimizer = AdamW(model.parameters(), lr=training_args.learning_rate) scaler = GradScaler() scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=1000, num_training_steps=12820, num_cycles=2) return tokenizer, model_config, model, optimizer, scaler, scheduler
def _get_lr_scheduler(self, optimizer): total_steps = len(self.train_dataloader()) * self.trainer.max_epochs training_steps = total_steps // self.trainer.accumulate_grad_batches lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=self.num_warmup_steps, num_training_steps=training_steps, num_cycles=self.num_cycles) scheduler = { 'scheduler': lr_scheduler, 'interval': 'step', 'frequency': self.trainer.accumulate_grad_batches, 'monitor': 'Loss/valid' } return scheduler
def get_optimizer_scheduler(args, model, training_steps): # param_optimizer = list(model.named_parameters()) # no_decay = ['bias', 'gamma', 'beta'] # optimizer_grouped_parameters = [ # {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], # 'weight_decay_rate': 0.01}, # {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], # 'weight_decay_rate': 0.0}, # ] param_optimizer = list(model.named_parameters()) other_parameters = [(n, p) for n, p in param_optimizer if 'crf' not in n] no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in other_parameters if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in other_parameters if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}, {'params':[p for n, p in param_optimizer if 'crf.transitions' == n], 'lr':3e-2} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) if args.avg_steps: scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=training_steps, num_cycles=int(args.max_epoches/args.avg_steps) ) else: scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=training_steps ) return optimizer, scheduler
parser.add_argument('--warmup', default=300, type=int, action='store', help='Number of warmup steps to run') parser.add_argument('--model_name', default='mymodel.pt', type=str, action='store', help='Name of the model file') parser.add_argument('--data_file', default='mydata.csv', type=str, action='store', help='Name of the data file') parser.add_argument('--batch', type=int, default=32, action='store', help='Batch size') parser.add_argument('--learning_rate', default=3e-5, type=float, action='store', help='Learning rate for the model') parser.add_argument('--max_len', default=200, type=int, action='store', help='Maximum length of sequence') args = parser.parse_args() BATCH_SIZE = args.batch EPOCHS = args.epoch LEARNING_RATE = args.learning_rate WARMUP_STEPS = args.warmup MAX_SEQ_LEN = args.max_len MODEL_NAME = args.model_name DATA_FILE = args.data_file TOKENIZER, MODEL = load_models() LOADER = get_data_loader(DATA_FILE) DEVICE = 'cpu' if torch.cuda.is_available(): DEVICE = 'cuda' model = MODEL.to(DEVICE) model.train() optimizer = AdamW(model.parameters(), lr=LEARNING_RATE) scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1) model = train(EPOCHS, LOADER, BATCH_SIZE, TOKENIZER, MODEL, DEVICE) save_model(model, MODEL_NAME)
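Note that in current transformers releases this schedule handles num_training_steps=-1 poorly: the cosine progress saturates right after warmup and the learning rate drops to zero. If a decay over the whole run is the intent, a finite step count can be derived from the loader. The following is a sketch under that assumption, reusing the names above; the single cycle and the no-gradient-accumulation step count are also assumptions.

# Assumed alternative, not in the original script: give the schedule a real horizon.
TOTAL_STEPS = len(LOADER) * EPOCHS  # optimizer steps, assuming no gradient accumulation
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=WARMUP_STEPS,
    num_training_steps=TOTAL_STEPS, num_cycles=1)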
if args.model_name == 'one-tag': print('one-tag') model = IOB_ONE(args) elif args.model_name == 'tag-lo': model = IOB_LO(args) elif args.model_name == 'tag-hi': model = IOB_HI(args) model.to(args.device) optimizer = AdamW(model.parameters(), lr=float(args.lr)) if args.scheduler == 'linear': from transformers import get_linear_schedule_with_warmup scheduler = get_linear_schedule_with_warmup(optimizer, args.num_warmup_steps, args.num_training_steps) elif args.scheduler == 'cyclic_cosine': from transformers import get_cosine_with_hard_restarts_schedule_with_warmup scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, args.num_warmup_steps, args.num_training_steps, int(args.epochs / 2)) elif args.scheduler == 'plateau': raise NotImplementedError # Train ------------------------------------------------------------------- stats = utils.TraningStats() model.train() for epoch in range(1, args.epochs + 1): stats.epoch = epoch logger.info('*** Epoch %s starts ***', epoch) train(args, ds, model, optimizer, scheduler, stats)
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) if args.cyclic_scheduler: scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=5) else: scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to global_step of last saved checkpoint from model path try: global_step = int( args.model_name_or_path.split("-")[-1].split("/")[0]) except ValueError: global_step = 0 epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=False, ) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) # print(json.dumps({**logs, **{"step": global_step}})) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): 
os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if global_step > args.max_steps > 0: epoch_iterator.close() break if global_step > args.max_steps > 0: train_iterator.close() break return global_step, tr_loss / global_step
def __init__(self, model_cls, tokenizer=None, sampling_function=None, scoring_function=None, **params): """ Initialize PointwiseGDCTrainer. Args: model (torch.model): pi_theta(x) Policy to be trained e.g. Hugging Face transformer GPT2 model with value head orig_model (torch.model): original model before any training: a(x) equation (1) in the paper. e.g. Hugging Face transformer GPT2 original model ref_model (torch.model): q(x) a reference/proposal model to calculate off policy DPG tokenizer (transformes.Tokenizer): tokenizer to pass to the sampling function sampling_function: function that returns samples given a model, tokenizer, scoring function scoring_function: b(x) function that given a text returns a score. params (dict or None): DPG parameters for training. Can include following keys: 'lr' (float): Adam or AdamW learning rate 'batch_size' (int): Number of samples per optimisation step, default: 256 'forward_batch_size' (int): Number of samples forward passed through model at a time, default: 16 'dpg_epochs' (int): Number of optimisation epochs per batch of samples, default: 4 """ super().__init__(tokenizer, sampling_function, scoring_function) self.params = self.default_params self.params.update(params) # double check q_update_criterion assert self.params['q_update_criterion'] in ['interval', 'tvd', "kld"] # make sure interval is specified assert 'q_update_interval' in self.params, "you need to specify an interval to update q" # init models self.model = model_cls.from_pretrained( self.params['lm_name'], attn_pdrop=self.params['dropout'], summary_first_dropout=self.params['dropout']).to( self.params['gpt2_device']) # original model "a" the one combined with "b" to generate the EBM self.orig_model = model_cls.from_pretrained(self.params['lm_name']).to( self.params['gpt2_orig_device']) self.orig_model.eval() self.ref_model = model_cls.from_pretrained(self.params['lm_name']).to( self.params['gpt2_ref_device']) self.ref_model.eval() # allow bootstrapping learning rate if "auto" is given if self.params['lr'] == "auto": self.bootstrap_learning_rate() self.params["optimizer"] = self.params.get("optimizer", "Adam") if self.params["optimizer"].lower() == "adamw": self.optimizer = AdamW(self.model.parameters(), lr=self.params['lr'], amsgrad=False) else: self.optimizer = Adam(self.model.parameters(), lr=self.params['lr'], amsgrad=False) self.is_policy_eval = False # choosing scheduler based on params scheduler_ = self.params['scheduler'] assert scheduler_ in [ 'cosine', 'constant', 'linear', 'cosine_restarts' ], "unknown scheduler: {}".format(self.params['scheduler']) if scheduler_ == 'constant': self.scheduler = get_constant_schedule_with_warmup( self.optimizer, self.params['warmup_steps']) elif scheduler_ == 'cosine': print("Cosine scheduler...") self.scheduler = get_cosine_schedule_with_warmup( self.optimizer, self.params['warmup_steps'], self.params['steps'] // self.params['batch_size']) elif scheduler_ == 'linear': self.scheduler = get_linear_schedule_with_warmup( self.optimizer, self.params['warmup_steps']) elif scheduler_ == 'cosine_restarts': self.scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( self.optimizer, self.params['warmup_steps'], self.params['steps'] // self.params['batch_size'], num_cycles=self.params['num_restart_cycles']) ## freezing ref_model, orig_model for p1, p2 in zip(self.ref_model.parameters(), self.orig_model.parameters()): p1.requires_grad = False p2.requires_grad = False # will hold all values of P(x) / q(x) for estimating TVD self.Z_moving_average 
= 0 self.iter = 0 self.min_kld = float("inf") self.min_tvd = float("inf") ### compute gradient accumulation steps self.params['gradient_accumulation_steps'] = self.params[ 'batch_size'] // self.params['forward_batch_size'] ### bootstrap Z bstrp_steps = self.params.get('z_bootstrap_steps', 0) if bstrp_steps > 0: self.bootstrap_z(steps=bstrp_steps)
def __init__(self, model_cls, tokenizer, sampling_function, features, **params): """ Initialize GDCTrainer. Args: model (torch.model): pi_theta(x) Policy to be trained e.g. Hugging Face transformer GPT2 model with value head orig_model (torch.model): original model before any training: a(x) in the equation above. e.g. Hugging Face transformer GPT2 original model ref_model (torch.model): q(x) a reference modelto calculate off policy DPG tokenizer (transformes.Tokenizer): tokenizer to pass to the sampling function sampling_function: function that returns samples given a model, tokenizer, scoring function features: phi(x) a list of functions that that detect a set of features lambdas: lambdas vector, where each lamda corerspond to a feature. params (dict or None): DPG parameters for training. Can include following keys: 'lr' (float): Adam learning rate, default: 1.41e-5 'batch_size' (int): Number of samples per optimization step, default: 256 'forward_batch_size' (int): Number of samples forward passed through model at a time. Should be set according to available GPU memory. This is used in combination with gradient accumulation to obtain a larger batch size, default: 16 'dpg_epochs' (int): Number of optimization epochs per batch of samples, default: 4 """ self.params = self.default_params self.params.update(params) # we pass the sampling method to be able to use it for for the calculaltion of TVD # TVD has to be calculated on a new batch than this used to optimize pi_theta self.sampling_function = sampling_function self.features = features self.lambdas = {k: 0.0 for k in features} # initialize lambdas with 0 self.desired_moments = params['desired_moments'] self.tokenizer = tokenizer # double check q_update_criterion assert self.params['q_update_criterion'] in ['interval', 'tvd', "kld"] # make sure interval is specified assert 'q_update_interval' in self.params, "you need to specify an interval to update q" # init models self.model = model_cls.from_pretrained( self.params['lm_name'], attn_pdrop=self.params['dropout'], summary_first_dropout=self.params['dropout']).to( self.params['gpt2_device']) # original model "a" the one combined with "b" to generate the EBM self.orig_model = model_cls.from_pretrained(self.params['lm_name']).to( self.params['gpt2_orig_device']) self.orig_model.eval() self.ref_model = model_cls.from_pretrained(self.params['lm_name']).to( self.params['gpt2_ref_device']) self.ref_model.eval() self.optimizer = Adam(self.model.parameters(), lr=self.params['lr'], amsgrad=False) # choosing scheduler based on params scheduler_ = self.params['scheduler'] assert scheduler_ in [ 'cosine', 'constant', 'linear', 'cosine_restarts' ], "unknown scheduler: {}".format(self.params['scheduler']) if scheduler_ == 'constant': self.scheduler = get_constant_schedule_with_warmup( self.optimizer, self.params['warmup_steps']) elif scheduler_ == 'cosine': print("Cosine scheduler...") self.scheduler = get_cosine_schedule_with_warmup( self.optimizer, self.params['warmup_steps'], self.params['steps'] // self.params['batch_size']) elif scheduler_ == 'linear': self.scheduler = get_linear_schedule_with_warmup( self.optimizer, self.params['warmup_steps']) elif scheduler_ == 'cosine_restarts': self.scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( self.optimizer, self.params['warmup_steps'], self.params['steps'] // self.params['batch_size'], num_cycles=self.params['num_restart_cycles']) ## freezing ref_model, orig_model as we only train pi_theta for p1, p2 in zip(self.ref_model.parameters(), 
self.orig_model.parameters()): p1.requires_grad = False p2.requires_grad = False # will hold all values of P(x) / q(x) for estimating TVD self.Z_moving_average = 0 self.iter = 0 self.min_kld = float("inf") self.min_tvd = float("inf") self.is_policy_eval = False ### compute gradient accumulation steps self.params['gradient_accumulation_steps'] = self.params[ 'batch_size'] // self.params['forward_batch_size'] #### Compute lambdas self.compute_optimal_lambdas( sample_size=self.params["moment_matching_sample_size"]) ### bootstrap Z bstrp_steps = self.params.get('z_bootstrap_steps', 0) if bstrp_steps > 0: self.bootstrap_z(steps=bstrp_steps)
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] # warmup_proportion * t_total optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps*t_total, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_accuracy = 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # batch (question+options+explanation) [16,65], batch_mrc (option_inputs) [16,5,65], batch_mask_mrc (mask of option_inputs) [16,5,65], batch_segment_mrc [16,5,65], prompt_length (length of question+options) [16,], # total_length (length of option_input) [16,], label_mrc the correct option [16,] batch, batch_mrc, batch_mask_mrc, batch_segment_mrc, prompt_lengths, total_lengths, labels_mrc = batch max_length = torch.max(total_lengths).item()# maximum length of option_input batch = batch[:, :max_length]#batch=16*58 # max_length = args.block_size # let max_seq_length be the length of the longest sentence in the batch, and drop anything beyond max_seq_length inputs, labels = (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) batch_size = batch_mrc.size(0) # the loop below seems pointless? while True: end_flag = False for i in range(batch_size): if batch_mrc[i, 0, -1] != 0: end_flag = True if batch_mrc[i, 1, -1] != 0: end_flag = True if end_flag: break else: batch_mrc = batch_mrc[:, :, :-1] max_seq_length = batch_mrc.size(2) batch_mask_mrc = batch_mask_mrc[:, :, :max_seq_length] batch_segment_mrc = batch_segment_mrc[:, :, :max_seq_length] labels_mrc = labels_mrc.to(args.device) batch_mrc = batch_mrc.to(args.device) batch_mask_mrc = batch_mask_mrc.to(args.device) batch_segment_mrc = batch_segment_mrc.to(args.device) total_lengths = torch.tensor(total_lengths) # what is this step for? 
attention_mask = torch.arange(max_length).expand(len(total_lengths), max_length) < total_lengths.unsqueeze(1) attention_mask = attention_mask[:, :max_length] attention_mask = attention_mask.to(args.device) # total_lengths = total_lengths.to(args.device) # for idx in range(len(prompt_lengths)): # labels[idx, :prompt_lengths[idx]] = cross_entropy_ignore_index # print(attention) model.train() # input_ids is option_inputs [16,58], attention_mask [16,58], attention_mask_mrc [16,5,41] (unclear what it was truncated by), token_type_ids_mrc [16,5,41], labels is the original option_input sequence [16,58] outputs = model(input_ids=inputs, attention_mask=attention_mask, attention_mask_mrc=batch_mask_mrc, token_type_ids_mrc=batch_segment_mrc,\ input_ids_mrc=batch_mrc, labels=labels, labels_mrc=labels_mrc) # loss = outputs[0] # model outputs are always tuple in transformers (see doc) # print(outputs[0], outputs[1], outputs[2]) # mse mrc lm loss = 0.3 * outputs[0] + 0.6 * outputs[1] + 0.1 * outputs[2] # add the two losses TODO different weights # loss = outputs[1] # loss = outputs[1] + outputs[2] if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss assert args.save_steps == args.logging_steps, "Save steps must equal logging steps." if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint print("{} {}".format(results['mrc_accuracy'], best_accuracy)) if results['mrc_accuracy'] > best_accuracy: output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) best_accuracy = results['mrc_accuracy'] if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break # result = evaluate(args, model, tokenizer) if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def __init__(self, model_cls, tokenizer, sampling_function, scoring_function, **params): """ Initialize PPOTrainer. Args: model (torch.model): Hugging Face transformer GPT2 model with value head ref_model (torch.model): Hugging Face transformer GPT2 refrence model used for KL penalty params (dict or None): PPO parameters for training. Can include following keys: 'lr' (float): Adam learning rate, default: 1.41e-5 'batch_size' (int): Number of samples per optimisation step, default: 256 'forward_batch_size' (int): Number of samples forward passed through model at a time, default: 16 'ppo_epochs' (int): Number of optimisation epochs per batch of samples, default: 4 'gamma' (float)): Gamma parameter for advantage calculation, default: 1. 'lam' (float): Lambda parameter for advantage calcualation, default: 0.95 'cliprange_value' (float): Range for clipping values in loss calculation, default: 0.2 'cliprange' (float): Range for clipping in PPO policy gradient loss, default: 0.2 'vf_coef' (float): Scaling factor for value loss, default: 0.1 'adap_kl_ctrl' (bool): Use adaptive KL control, otherwise linear, default: True 'init_kl_coef' (float): Initial KL penalty coefficient (used for adaptive and linear control), default: 0.2 'target' (float): Target KL value for adaptive KL control, default: 6.0 'horizon' (float): Horizon for adaptive KL control, default: 10000 """ super().__init__(tokenizer=tokenizer, sampling_function=sampling_function, scoring_function=scoring_function) self.params = self.default_params self.params.update(params) self.ref_model = model_cls.from_pretrained(self.params['lm_name']).to(self.params['gpt2_orig_device']) self.ref_model.eval() self.orig_model = self.ref_model self.model = model_cls.from_pretrained(self.params['lm_name'], attn_pdrop=self.params['dropout'], summary_first_dropout=self.params['dropout']).to(self.params['gpt2_device']) self.optimizer = Adam(self.model.parameters(), lr=self.params['lr']) self.is_policy_eval = True ### create LR scheduler scheduler_ = self.params['scheduler'] assert scheduler_ in ['cosine', 'constant', 'linear', 'cosine_restarts'], "unknown scheduler: {}".format(self.params ['scheduler']) if scheduler_ == 'constant': self.scheduler = get_constant_schedule_with_warmup(self.optimizer, self.params['warmup_steps']) elif scheduler_ == 'cosine': print("Cosine scheduler...") self.scheduler = get_cosine_schedule_with_warmup(self.optimizer, self.params['warmup_steps'], self.params['steps']//self.params['batch_size']) elif scheduler_ == 'linear': self.scheduler = get_linear_schedule_with_warmup(self.optimizer, self.params['warmup_steps']) elif scheduler_ == 'cosine_restarts': self.scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, self.params['warmup_steps'], self.params['steps'] // self.params['batch_size'], num_cycles=self.params['num_restart_cycles']) self.kl_ctl = AdaptiveKLController(self.params['init_kl_coef'], self.params['target'], self.params['horizon']) self.iter= 0
def train_question_generation( model, train_dataset, tokenizer, num_train_epochs, train_batch_size, learning_rate, device, adam_epsilon=1e-8, logging_steps=None, logging_dir=None, gradient_accumulation_steps=1, max_grad_norm=1.0, weight_decay=0.0, warmup_steps=0, output_dir=None, max_steps=-1, num_cycles=1.0, evaluate_during_training=False, eval_dataset=None, eval_batch_size=8, generation_during_training=False, generation_dataset=None, generation_hyperparameters=None, save_steps=-1, verbose=0, ): """ This function trains models on the train_dataset, eval_dataset being optional. INPUTS: - model: Torch model, model to train. - train_dataset: Torch TensorDataset, used for training. - tokenizer: Torch tokenizer object, tokenizer used for preprocessing. - num_train_epochs: int, number of epochs for training. - train_batch_size: int, size of mini batch. - learning_rate: int, learning rate. - device: torch cuda object, describing the device on which the training will be done. - adam_epsilon: float, epsilon parameter for optimizer AdamW. - logging_steps: float, number of steps for evaluation. - logging_dir: str, name of the directory in which the logs will be written. - gradient_accumulation_steps: int, number of step before retropropagation. - max_grad_norm: float, maximum norm for gradient. - weights_decay: float, weights_decay parameter for optimizer. - warmup_steps: int, number of steps used for warmup. - output_dir: str, directory to save output. - max_steps: int, maximum number of step per epochs, -1 for None. - num_cycles: float, number of cycle for warmup. - evaluate_during_training: bool, saying whether to evaluate. - eval_dataset: Torch TensorDataset, to provide for evaluation. - eval_batch_size: int, batch size for evaluation dataset. - generation_during_training: bool, saying whether to generate some question as examples. - generation_dataset: TensorDataset, will be used for generation in generation_during_training=True. - generation_hyperparameters: dictionary, containing hyperparameters used for generation. - save_steps; int, number of steps between each checkpoint. - verbose: int, 0 for no verbose, 1 for displaying. OUTPUTS: - train_loss_history: list of floats, loss history. - val_loss_history: list of floats, validation loss history. 
""" train_loss = [] assert not (logging_steps > 0 and eval_dataset is None ), "logging_steps > 0 but no eval_dataset provided" train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size) if logging_steps is None: logging_steps = len(train_dataloader) // (gradient_accumulation_steps * 5) if max_steps > 0: t_total = max_steps num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader) // gradient_accumulation_steps * num_train_epochs no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{'params': [p for n, p in model.named_parameters() \ if not any(nd in n for nd in no_decay)], 'weight_decay_rate': weight_decay}, {'params': [p for n, p in model.named_parameters() \ if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] optimizer = AdamW( optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon, ) scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total, num_cycles=num_cycles, ) # Train print("***** Running training *****") print(" Num examples = %d" % len(train_dataset)) print(" Num Epochs = %d" % num_train_epochs) print(" Batch size = %d" % train_batch_size) print( " Total train batch size (w. parallel, distributed & accumulation) = %d" % (train_batch_size * gradient_accumulation_steps)) print(" Gradient Accumulation steps = %d" % gradient_accumulation_steps) print(" Total optimization steps = %d" % t_total) if logging_dir is not None: if not os.path.exists(logging_dir): os.makedirs(logging_dir) logging_file = os.path.join(logging_dir, "logs.txt") with open(logging_file, "w") as writer: writer.write("***** Running training *****\n") writer.write(" Num examples = %d\n" % len(train_dataset)) writer.write(" Num Epochs = %d\n" % num_train_epochs) writer.write(" Batch size = %d\n" % train_batch_size) writer.write( " Total train batch size (w. 
parallel, distributed & accumulation) = %d\n" % (train_batch_size * gradient_accumulation_steps)) writer.write(" Gradient Accumulation steps = %d\n" % gradient_accumulation_steps) writer.write(" Total optimization steps = %d\n" % t_total) writer.write("\n") global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_loss_history = [] val_loss_history = [] start_train = time.time() for epoch in range(num_train_epochs): print(f"Epoch: {epoch + 1} / {num_train_epochs}") start_epoch = time.time() epoch_iterator = train_dataloader for step, batch in enumerate(epoch_iterator): if global_step > t_total: break ######## TRAINING STEP ######## model.train() # Transfer input data to device batch = tuple(t.to(device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], # 'token_type_ids' : batch[2], 'decoder_input_ids': batch[3], 'decoder_attention_mask': batch[4], 'labels': batch[5], } optimizer.zero_grad() outputs = model(**inputs) loss = outputs[0] if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() tr_loss += loss.item() train_loss.append(loss.item()) ######## LOGGING RESULTS ######## if (step + 1) % gradient_accumulation_steps == 0: accumulated_loss = np.sum( train_loss[-gradient_accumulation_steps:]) if verbose > 0: print("lr: {:.10f}".format(scheduler.get_lr()[0]), "loss: {:.6f}".format(accumulated_loss), " -- step:", global_step, "/", t_total) if logging_dir is not None: with open(logging_file, "a") as writer: writer.write(" ".join([ "lr: {:.10f}".format(scheduler.get_lr()[0]), "loss: {:.6f}".format(accumulated_loss), " -- step:", str(global_step), "/", str(t_total), "\n" ])) torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if (logging_steps > 0 and global_step > 0) and (global_step % logging_steps == 0): print("\nEval") if evaluate_during_training: # Log metrics dict_print = { 'step': global_step, 'lr': scheduler.get_lr()[0], 'tr_loss': (tr_loss - logging_loss) / logging_steps } result_eval = evaluate_question_generation( model=model, eval_dataset=eval_dataset, tokenizer=tokenizer, device=device, eval_batch_size=eval_batch_size, generation=generation_during_training, generation_dataset=generation_dataset, generation_hyperparameters= generation_hyperparameters, logging_dir=logging_dir, verbose=1, ) for key, value in result_eval.items(): dict_print['eval_{}'.format(key)] = value train_loss_history.append( (tr_loss - logging_loss) / logging_steps) val_loss_history.append(result_eval['val_loss']) print('Evaluation:', dict_print) logging_loss = tr_loss if logging_dir is not None: with open(logging_file, "a") as writer: writer.write("\nEval\n") for key in sorted(dict_print.keys()): writer.write(" %s = %s\n" % (key, str(dict_print[key]))) writer.write("\n") ######## SAVING MODEL ######## if (save_steps > 0 and global_step > 0) and (global_step % save_steps == 0): print("\nSave") # Save model checkpoint if not os.path.exists(output_dir): os.makedirs(output_dir) save_model_dir = os.path.join( output_dir, 'checkpoint-{}'.format(global_step)) os.makedirs(save_model_dir) model.save_pretrained(save_model_dir) print("Saving model checkpoint to %s" % save_model_dir) if logging_dir is not None: with open(logging_file, "a") as writer: writer.write("\nSave\n") writer.write("Saving model checkpoint to %s\n\n" % save_model_dir) end_epoch = time.time() print( f'Epoch {epoch + 1}/{num_train_epochs}, time = {end_epoch - 
start_epoch} secs' ) end_train = time.time() print("Train took:", end_train - start_train) return train_loss_history, val_loss_history
def prepaire_D_scheduler(optimizer, epoch_num, train_num): warmup_steps = int(0.5 * train_num) total_steps = train_num * epoch_num - warmup_steps # print(total_steps, warmup_steps) return get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps,last_epoch=-1)
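A hedged usage sketch for prepaire_D_scheduler; the toy linear module, the AdamW settings, and the epoch/step counts are placeholder assumptions, and the scheduler is stepped once per batch as the step-based counts imply.

import torch

D = torch.nn.Linear(128, 1)
optimizer_D = torch.optim.AdamW(D.parameters(), lr=2e-5)
scheduler_D = prepaire_D_scheduler(optimizer_D, epoch_num=3, train_num=500)
for _ in range(3 * 500):
    optimizer_D.step()
    scheduler_D.step()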
logging_dir=args.logdir, logging_steps=args.logsteps, logging_first_step=False, #max_steps=20100, save_steps=args.savesteps, save_total_limit=5, seed=SEED) optimizer = AdamW(model.parameters(), lr=0.00006, betas=(0.9, 0.999), weight_decay=args.weightdecay) if args.scheduler == "cosine": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmupsteps, num_training_steps=len(dataset) * args.epochs, num_cycles=1.0, ) else: scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmupsteps, num_training_steps=len(dataset) * args.epochs, ) trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=dataset, prediction_loss_only=True,
def train(self): train_dataset = BERTDANN(self.args, self.tokenizer, 'train') train_dataloader = DataLoader(dataset=train_dataset, sampler=SequentialSampler(train_dataset), batch_size=self.args.train_batch_size) valid_dataset = BERTDANN(self.args, self.tokenizer, 'valid') valid_dataloader = DataLoader(dataset=valid_dataset, sampler=RandomSampler(valid_dataset), batch_size=self.args.eval_batch_size) if self.args.max_steps > 0: t_total = self.args.max_steps self.args.num_train_epochs = self.args.max_steps // (len( train_dataloader) // self.args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // self.args.gradient_accumulation_steps * self.args.num_train_epochs no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': self.args.weight_decay }, { 'params': [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=t_total) scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.args.max_steps, num_cycles=self.args.num_train_epochs) loss_fct = nn.CrossEntropyLoss() logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataloader)) logger.info(" Num train data = %d", len(train_dataset)) logger.info(" Num valid data = %d", len(valid_dataset)) logger.info(" Num Epochs = %d", self.args.num_train_epochs) logger.info(" Total train batch size = %d", self.args.train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) logger.info(" Logging steps = %d", self.args.logging_steps) logger.info(" Saving steps = %d", self.args.logging_steps) logger.info(" mmd length scale 'l' = %d", 1) logger.info(" mmd alpha = %d", 1) df_args = pd.DataFrame.from_dict([{ 'initial lr': self.args.learning_rate, "Num examples": len(train_dataloader), 'num_data(movie/sports)': f'{len(train_dataset)} ({len(train_dataset.movie_data)}/{len(train_dataset.target)})', 'Max_Epochs': self.args.num_train_epochs, 'train_batch_size': self.args.train_batch_size, 'Total_optimization_steps': t_total, 'logging_steps': self.args.logging_steps, 'scheduler': 'get_linear_schedule_with_warmup', 'warmup_steps': self.args.warmup_steps, 'num_cycles': self.args.num_train_epochs }]) if not os.path.exists(self.args.third_sentiment_classifier_output): os.makedirs(self.args.third_sentiment_classifier_output) df_args.to_csv( f'{self.args.third_sentiment_classifier_output}/args.csv', sep='\t') self.model.zero_grad() best_valid_loss = float('inf') global_step = 0 early_cnt = 0 training_stats = [] try: for epoch_idx in range(int(self.args.num_train_epochs)): logger.info( f"========== {epoch_idx + 1} : {self.args.num_train_epochs} ==========" ) epoch_iterator = tqdm(train_dataloader, desc="Iteration") epoch_train_loss, epoch_valid_loss = 0, 0 epoch_valid_accuracy, valid_cnt = 0, 0 for step, batch in enumerate(epoch_iterator): self.model.train() optimizer.zero_grad() # sentiment classifier | source domain data batch_sentiment = tuple( t.to(self.device) for t in batch[0]) inputs = { 'input_ids': batch_sentiment[0], 'attention_mask': batch_sentiment[1], 
'output_hidden_states': True } labels = batch_sentiment[2] source_outputs = self.model(**inputs) pooled_output = source_outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss_sentiment = loss_fct(logits.view(-1, 2), labels.view(-1)) # MMD loss| source domain data source_representation = source_outputs[0][:, 0, :] # MMD loss| target domain data batch_mmd = tuple(t.to(self.device) for t in batch[1]) inputs = { 'input_ids': batch_mmd[0], 'attention_mask': batch_mmd[1] } target_outputs = self.model(**inputs) target_representation = target_outputs[0][:, 0, :] loss_mmd = mmd_squared(source_representation, target_representation) loss = loss_sentiment + loss_mmd if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps loss.backward() if (step + 1) % self.args.gradient_accumulation_steps == 0: epoch_train_loss += loss.item() torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule self.model.zero_grad() global_step += 1 if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0: valid_loss, valid_accuracy = self.evaluate( valid_dataloader, "valid") epoch_valid_loss += valid_loss epoch_valid_accuracy += valid_accuracy valid_cnt += 1 training_stats.append({ 'epoch': epoch_idx + 1, 'training_loss': epoch_train_loss / (step + 1), 'valid_loss': valid_loss, 'valid_accuracy': valid_accuracy, 'steps': global_step, 'lr': scheduler.get_last_lr()[0] }) if self.args.save_steps > 0 and global_step % self.args.save_steps == 0: if valid_loss < best_valid_loss: best_valid_loss = valid_loss self.save_model(optimizer, best_valid_loss) early_cnt = 0 else: early_cnt += 1 if early_cnt > self.args.early_cnt - 1: logger.info('training session has been early stopped') df_stats = pd.DataFrame(data=training_stats, ) df_stats = df_stats.set_index('epoch') df_stats.to_csv( f'{self.args.third_sentiment_classifier_output}/stats.csv', sep='\t', index=True) break if 0 < self.args.max_steps < global_step: epoch_iterator.close() break if early_cnt > self.args.early_cnt - 1: break if 0 < self.args.max_steps < global_step: df_stats = pd.DataFrame(data=training_stats, ) df_stats = df_stats.set_index('epoch') df_stats.to_csv( f'{self.args.third_sentiment_classifier_output}/stats.csv', sep='\t', index=True) break # epoch_train_loss = epoch_train_loss / global_step # epoch_valid_loss = epoch_valid_loss / valid_cnt # epoch_valid_accuracy = epoch_valid_accuracy / valid_cnt # if epoch_valid_loss < best_valid_loss: # best_valid_loss = epoch_valid_loss # self.save_model(optimizer, best_valid_loss) # logger.info(" %s : %s | %s = %s", 'EPOCH', epoch_idx + 1, 'train_loss', epoch_train_loss) # logger.info(" %s : %s | %s = %s", 'EPOCH', epoch_idx + 1, 'valid_loss', epoch_valid_loss) # logger.info(" %s : %s | %s = %s", 'EPOCH', epoch_idx + 1, 'valid_accuracy', epoch_valid_accuracy) except KeyboardInterrupt as e: logger.info(e) df_stats = pd.DataFrame(data=training_stats) df_stats = df_stats.set_index('epoch') df_stats.to_csv( f'{self.args.third_sentiment_classifier_output}/stats.csv', sep='\t', index=True) return except Exception as e: logger.info(e) df_stats = pd.DataFrame(data=training_stats) df_stats = df_stats.set_index('epoch') df_stats.to_csv( f'{self.args.third_sentiment_classifier_output}/stats.csv', sep='\t', index=True) return
def train(args, model, tokenizer): """ Train the model """ if xm.is_master_ordinal(): tb_writer = SummaryWriterP(args.output_dir) def summary_write(*args, **kwargs): if xm.is_master_ordinal(): tb_writer.add_scalar(*args, **kwargs) args.train_batch_size = args.per_gpu_train_batch_size #* max(1, args.n_gpu) train_dataloader = build_dataloader(args, tokenizer) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if p.requires_grad and not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if p.requires_grad and any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] # Scale learning rate to num cores #args.learning_rate = args.learning_rate * xm.xrt_world_size() if args.sgd: optimizer = SGD(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) warmup_steps = args.warmup_samples // (args.train_batch_size * xm.xrt_world_size()) if args.lr_decay: scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) elif args.lr_cosine: scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total, num_cycles=args.num_train_epochs) else: scheduler = WarmupZeroSchedule(optimizer, warmup_steps=warmup_steps) # Train! tracker = xm.RateTracker() log_info("***** Running training *****") log_info(" Num Epochs = %d", args.num_train_epochs) log_info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) log_info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (xm.xrt_world_size() if args.local_rank != -1 else 1)) log_info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) log_info(" Total optimization steps = %d", t_total) try: with open(os.path.join(args.model_name_or_path, 'step.txt'), 'r') as c: global_step = int(c.readline()) except OSError as e: global_step = 0 moving_loss = MovingLoss(10000 // args.logging_steps) train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=not xm.is_master_ordinal()) try: for epoch in train_iterator: p_train_dataloader = pl.ParallelLoader(train_dataloader, [args.device]) epoch_iterator = tqdm(p_train_dataloader.per_device_loader( args.device), total=len(train_dataloader), desc="Iteration", disable=not xm.is_master_ordinal()) model.train() for step, batch in enumerate(epoch_iterator): optimizer.zero_grad() inputs, labels = mask_tokens( batch, tokenizer, args) if args.mlm else (batch, batch) outputs = model( inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) xm.optimizer_step(optimizer, barrier=True) scheduler.step() global_step += 1 tracker.add(args.train_batch_size) if args.logging_steps > 0 and global_step % args.logging_steps == 0: ls = loss.item( ) # weird. if you call loss.item() only in one process, the whole thing hangs. So call on every and log in one. moving_loss.add(ls) summary_write('lr', scheduler.get_last_lr()[0], global_step) epoch_iterator.set_postfix( MovingLoss=f'{moving_loss.loss:.2f}', Perplexity= f'{torch.exp(torch.tensor(moving_loss.loss)):.2f}') if args.save_steps > 0 and global_step % args.save_steps == 0: save_state(args, model, tokenizer, global_step) #if step >= 1023: # TPU seems to like consistent epoch lenght # epoch_iterator.close() # break if args.max_steps > 0 and step > args.max_steps: epoch_iterator.close() break # evaluate once in an epoch if args.evaluate_during_training: results = evaluate(args, model, tokenizer, f"checkpoint-{global_step}") log_info(f"Eval {results}") for key, value in results.items(): summary_write("eval_{}".format(key), value, global_step) # reload dataset every args.reload_data_file epochs if args.reload_data_file and (epoch + 1) % args.reload_data_file == 0: train_dataloader = build_dataloader(args, tokenizer) # that's very slow on TPU #print_sample(model, tokenizer, args.device, args) except (KeyboardInterrupt, SystemExit): save_state(args, model, tokenizer, global_step) raise save_state(args, model, tokenizer, global_step) return global_step, moving_loss.loss
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) if args.learning_rate_schedule == 'constant': scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps) elif args.learning_rate_schedule == 'linear': scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) elif args.learning_rate_schedule == 'cosine': scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) elif args.learning_rate_schedule == 'cosine_hard_restarts': scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) else: raise ValueError( "Invalid learning rate schedule. The available schedules are 'linear', 'cosine', 'cosine_hard_restarts' and 'constant'" "More details at https://huggingface.co/transformers/main_classes/optimizer_schedules.html." ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert" ]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device) }) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): 
tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) import json with open(os.path.join(output_dir, "config.json")) as json_file: config_to_update = json.load(json_file) #adding number of non interaction layers to config config_to_update[ "non_interaction_layers"] = args.non_interaction_layers with open(os.path.join(output_dir, "config.json"), 'w') as json_file: json.dump(config_to_update, json_file) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(self, train_path: str, valid_path: str, types_path: str, input_reader_cls: BaseInputReader): args = self.args train_label, valid_label = 'train', 'valid' self._logger.info("Datasets: %s, %s" % (train_path, valid_path)) self._logger.info("Model type: %s" % args.model_type) # create log csv files self._init_train_logging(train_label) self._init_eval_logging(valid_label) # read datasets input_reader = input_reader_cls(types_path, args.bio_path, self._tokenizer, self._logger) input_reader.read({train_label: train_path, valid_label: valid_path}) self._log_datasets(input_reader) train_dataset = input_reader.get_dataset(train_label) train_sample_count = train_dataset.document_count updates_epoch = train_sample_count // args.train_batch_size updates_total = updates_epoch * args.epochs steps_before_rel = int(updates_total * self.args.before_rel) validation_dataset = input_reader.get_dataset(valid_label) self._logger.info("Updates per epoch: %s" % updates_epoch) self._logger.info("Updates total: %s" % updates_total) self._logger.info("Updates before relation: %s" % steps_before_rel) # create model model_class = models.get_model(self.args.model_type) # load model if args.model_type == 'table_filling': model = model_class.from_pretrained( self.args.model_path, cache_dir=self.args.cache_path, tokenizer=self._tokenizer, # table_filling model parameters relation_labels=input_reader.relation_label_count, entity_labels=input_reader.entity_label_count, att_hidden=self.args.att_hidden, prop_drop=self.args.prop_drop, entity_label_embedding=self.args.entity_label_embedding, freeze_transformer=self.args.freeze_transformer, device=self._device) # if self._device.type != 'cpu': # torch.distributed.init_process_group(backend='nccl', world_size=3, init_method='...') # model = torch.nn.parallel.DistributedDataParallel(model) model.to(self._device) # model.to(f'cuda:{model.device_ids[0]}') # create optimizer optimizer_params = self._get_optimizer_params(model) optimizer = AdamW(optimizer_params, lr=args.lr, weight_decay=args.weight_decay, correct_bias=False) # other_optimizer_params = self._get_optimizer_params([]) # create scheduler if args.scheduler == 'constant': scheduler = transformers.get_constant_schedule(optimizer) elif args.scheduler == 'constant_warmup': scheduler = transformers.get_constant_schedule_with_warmup( optimizer, num_warmup_steps=args.lr_warmup * updates_total) elif args.scheduler == 'linear_warmup': scheduler = transformers.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.lr_warmup * updates_total, num_training_steps=updates_total) elif args.scheduler == 'cosine_warmup': scheduler = transformers.get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.lr_warmup * updates_total, num_training_steps=updates_total) elif args.scheduler == 'cosine_warmup_restart': scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.lr_warmup * updates_total, num_training_steps=updates_total, num_cycles=args.num_cycles) # create loss function rel_criterion = torch.nn.CrossEntropyLoss(reduction='none') entity_criterion = torch.nn.CrossEntropyLoss(reduction='none') if args.model_type == 'table_filling': compute_loss = TableLoss(rel_criterion, entity_criterion, model, optimizer, scheduler, args.max_grad_norm) # eval validation set if args.init_eval: self._eval(model, compute_loss, validation_dataset, input_reader, 0, updates_epoch) # train for epoch in range(args.epochs): # train epoch self._train_epoch(model, compute_loss, 
optimizer, train_dataset, updates_epoch, epoch, input_reader.context_size, input_reader.entity_label_count, input_reader.relation_label_count, input_reader._start_entity_label, steps_before_rel) # eval validation sets if not args.final_eval or (epoch == args.epochs - 1): ner_acc, rel_acc, rel_ner_acc = self._eval( model, compute_loss, validation_dataset, input_reader, epoch, updates_epoch) if args.save_best: extra = dict(epoch=epoch, updates_epoch=updates_epoch, epoch_iteration=0) self._save_best(model=model, optimizer=optimizer if self.args.save_optimizer else None, accuracy=ner_acc[2], iteration=epoch * updates_epoch, label='ner_micro_f1', extra=extra) # save final model extra = dict(epoch=args.epochs, updates_epoch=updates_epoch, epoch_iteration=0) global_iteration = args.epochs * updates_epoch self._save_model( self._save_path, model, global_iteration, optimizer=optimizer if self.args.save_optimizer else None, extra=extra, include_iteration=False, name='final_model') self._logger.info("Logged in: %s" % self._log_path) self._logger.info("Saved in: %s" % self._save_path)
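# In the snippet above the warmup length is a fraction of all updates
# (num_warmup_steps = args.lr_warmup * updates_total). A small sketch of that computation
# with illustrative values; note the product is a float, so casting to int keeps the
# scheduler's step comparison unambiguous.
train_sample_count = 12_000
train_batch_size = 32
epochs = 20
lr_warmup = 0.1                                            # warm up for 10% of training

updates_epoch = train_sample_count // train_batch_size     # 375 optimizer updates per epoch
updates_total = updates_epoch * epochs                     # 7500 updates overall
num_warmup_steps = int(lr_warmup * updates_total)          # 750 warmup steps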
def main(args): torch.manual_seed(5) if args.local_rank is not None: local_rank = args.local_rank print(f"Using GPU ID {local_rank}") torch.cuda.set_device(local_rank) device = f"cuda:{local_rank}" dist.init_process_group(backend="nccl", init_method="env://") # An alternative method to perform distributed training. # dist.init_process_group(backend="nccl", init_method="file:///temp/parallel_comm", \ # world_size=args.world_size, rank=local_rank) else: local_rank = None device = "cpu" if args.no_cuda else "cuda" # Create Model if local_rank is not None: model = models.ModelNSP(args.pretrained_class).cuda(local_rank) else: model = models.ModelNSP(args.pretrained_class).to(device) model.core_model.output_past = False if args.test: model.eval() else: model.train() print(f"Number of parameters: {count_parameters(model):,}") print(f"Gradient Accumulation Steps: {args.accumulation_steps}") tokenizer = getattr(transformers, args.tokenizer).from_pretrained(args.pretrained_class) if "gpt2" in args.tokenizer.lower(): # this enables us to do batched training, GPT2 wasn't trained with a padding token. tokenizer.add_special_tokens({"pad_token": "<PAD>"}) model.core_model.resize_token_embeddings(len(tokenizer)) criterion = nn.CrossEntropyLoss() # the pretrained model has been fairly optimized, while the NSP head has been randomly initialized. # using different learning rates helps speed up training. specific_learning_rates = [{"params": model.core_model.parameters(), "lr": args.core_lr, "correct_bias": False}, {"params": model.nsp_head.parameters(), "lr": args.head_lr, "correct_bias": False}] optimizer = transformers.AdamW(specific_learning_rates, lr=args.core_lr, correct_bias=False) fp16 = args.fp16 if fp16: model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt, keep_batchnorm_fp32=True) if local_rank is not None: print(f"Device is set to {device}!") else: print("Let's use", torch.cuda.device_count(), "GPUs!") if local_rank is not None: model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank]) else: model = nn.DataParallel(model) print("Passed model distribution stage!") if args.saved_model: sd = torch.load(args.saved_model, map_location=device) model.load_state_dict(sd) model.to(device) # Create Dataset data = dataset.NextSentenceDataset(args.dataset, tokenizer, data_frac=args.data_frac, max_seq_length=args.max_seq_length, test=args.test, skip_frac=args.skip_frac) if local_rank is not None: sampler = torch.utils.data.distributed.DistributedSampler(data, \ num_replicas=args.world_size, rank=local_rank) shuffle = False else: sampler = None shuffle = True dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=shuffle, num_workers=0, \ sampler=sampler, pin_memory=True) test_scores = [] accumulation_steps = args.accumulation_steps num_training_steps = len(dataloader) // accumulation_steps * args.epochs print(f"Total Training Steps: {num_training_steps}") scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=250, num_training_steps=num_training_steps) # Also try # scheduler = ReduceLROnPlateau(optimizer, "max", patience=10, verbose=True) # Train for epoch in range(args.epochs): running_loss = 0.0 running_accuracy = 0.0 ticks = 0.0 number_of_batches = len(dataloader) for train_batch_num, example in enumerate(dataloader): input_ids = torch.stack(example[0], dim=0).transpose(0, 1) token_type_ids = torch.stack(example[1], dim=0).transpose(0, 1) attention_mask = torch.stack(example[2], dim=0).transpose(0, 1) labels = 
example[3] if local_rank is not None: input_ids = input_ids.cuda(non_blocking=True) token_type_ids = token_type_ids.cuda(non_blocking=True) attention_mask = attention_mask.cuda(non_blocking=True) labels = labels.cuda(non_blocking=True) else: input_ids = input_ids.cuda() token_type_ids = token_type_ids.cuda() attention_mask = attention_mask.cuda() labels = labels.cuda() output, loss = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels) output_probs = output.softmax(dim=-1) predictions = torch.argmax(output_probs, dim=1) loss = loss.mean(dim=0) loss = loss / accumulation_steps running_loss += loss.item() accuracy = accuracy_score(predictions.detach().cpu().numpy(), labels.detach().cpu().numpy()) if args.test: test_scores.append(accuracy) running_accuracy += accuracy ticks += 1.0 if not args.test: if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (train_batch_num) % accumulation_steps == 0: if fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0) else: torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() scheduler.step() model.zero_grad() if ((train_batch_num * args.batch_size) % 500==0 and train_batch_num>0): for param_group in optimizer.param_groups: print("LR:", param_group['lr']) acc = (running_accuracy / ticks) loss = (running_loss / ticks) * accumulation_steps progress = train_batch_num / number_of_batches print(f"[Epoch {epoch+1}: {progress*100:.2f}%] Accuracy: {acc}, Loss: {loss}") running_loss = 0.0 running_accuracy = 0.0 ticks = 0.0 if args.test: print(f"Final test accuracy: {np.mean(test_scores)}") if not args.test and (local_rank==0 or local_rank is None): save_path = f"trained_models/ft_{args.model}_{args.pretrained_class}_{args.core_lr}_{args.head_lr}.pth" print(f"Saving model to {save_path}") torch.save(model.state_dict(), save_path)
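# The snippet above only steps the scheduler on optimizer updates, so num_training_steps is
# counted in optimizer steps, not in batches. A minimal sketch of that bookkeeping
# (batch counts and epochs are illustrative):
batches_per_epoch = 1_000
accumulation_steps = 4
epochs = 3

optimizer_steps_per_epoch = batches_per_epoch // accumulation_steps   # 250
num_training_steps = optimizer_steps_per_epoch * epochs               # 750, passed to the scheduler

# Inside the loop the pattern is:
#     loss = loss / accumulation_steps
#     loss.backward()
#     if batch_idx % accumulation_steps == 0:
#         optimizer.step()
#         scheduler.step()      # one scheduler step per optimizer step
#         model.zero_grad()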
def train(config, args, device, logger): # Dataset if config['model']['share_architecture'] in ['ocnli']: train_dataset = OCNLIDataset( config['dataset'], split='trainval' if args.no_validate else 'train', overfit=args.overfit, tensor_type='np' ) if not args.no_validate: val_dataset = OCNLIDataset( config['dataset'], split='val', overfit=args.overfit, tensor_type='np' ) task_name2int = {'ocnli': TASK_NAME2INT['ocnli']} elif config['model']['share_architecture'] == 'ocemotion': train_dataset = OCEMOTIONDataset( config['dataset'], split='trainval' if args.no_validate else 'train', overfit=args.overfit, tensor_type='np' ) if not args.no_validate: val_dataset = OCEMOTIONDataset( config['dataset'], split='val', overfit=args.overfit, tensor_type='np' ) task_name2int = {'ocemotion': TASK_NAME2INT['ocemotion']} elif config['model']['share_architecture'] == 'tnews': train_dataset = TNEWSDataset( config['dataset'], split='trainval' if args.no_validate else 'train', overfit=args.overfit, tensor_type='np' ) if not args.no_validate: val_dataset = TNEWSDataset( config['dataset'], split='val', overfit=args.overfit, tensor_type='np' ) task_name2int = {'tnews': TASK_NAME2INT['tnews']} else: train_dataset = NLPCJointDataset( config['dataset'], split='trainval' if args.no_validate else 'train', overfit=args.overfit, tensor_type='np' ) if not args.no_validate: val_dataset = NLPCJointDataset( config['dataset'], split='val', overfit=args.overfit, tensor_type='np' ) task_name2int = { 'ocnli': TASK_NAME2INT['ocnli'], 'ocemotion': TASK_NAME2INT['ocemotion'], 'tnews': TASK_NAME2INT['tnews'] } logger.info( 'Training set number of samples: {}'.format(len(train_dataset)) ) if not args.no_validate: logger.info( 'Validation set number of samples: {}'.format(len(val_dataset)) ) assert( config['solver']['batch_size'] % config['solver']['accumulation_steps'] == 0 ) actual_batch_size = ( config['solver']['batch_size'] // config['solver']['accumulation_steps'] ) logger.info('Acture batch size: {}'.format(actual_batch_size)) logger.info( 'Gradient accumulation steps: {}' .format(config['solver']['accumulation_steps']) ) logger.info( 'Effective batch size: {}'.format(config['solver']['batch_size']) ) train_dataloader = DataLoader( train_dataset, batch_size=actual_batch_size, shuffle=True, num_workers=args.cpu_workers, collate_fn=collate_fn_with_padding ) if not args.no_validate: val_dataloader = DataLoader( val_dataset, batch_size=actual_batch_size * 4, shuffle=False, num_workers=args.cpu_workers, collate_fn=collate_fn_with_padding ) # Model model = NLPCModel(config['model']).to(device) if -1 not in args.gpu_ids: model = nn.DataParallel(model, args.gpu_ids) if args.load_pthpath != "": model_state_dict, _ = load_checkpoint(args.load_pthpath) if isinstance(model, nn.DataParallel): model.module.load_state_dict(model_state_dict) else: model.load_state_dict(model_state_dict) logger.info( 'Loaded model checkpoint from {}.'.format(args.load_pthpath) ) # loss criterion = NLPCLoss(config['model'], task_name2int, 'train', device) if not args.no_validate: val_criterion = NLPCLoss(config['model'], task_name2int, 'val', device) # Weight decay if 'no_decay' in config['solver'].keys(): no_decay = config['solver']['no_decay'] else: no_decay = [] transformer_params = [ item for item in list(model.named_parameters()) if 'transformer' in item[0] ] not_transformer_params = [ item for item in list(model.named_parameters()) if 'transformer' not in item[0] ] grouped_parameters = [ # non-transformer and need decay { 'params': [ p for n, p in 
not_transformer_params if not any(nd in n for nd in no_decay) ], 'weight_decay': config['solver']['weight_decay'], "lr": config['solver']['initial_lr'] }, # transformer and need decay { 'params': [ p for n, p in transformer_params if not any(nd in n for nd in no_decay) ], 'weight_decay': config['solver']['transformer_weight_decay'], 'lr': ( config['solver']['transformer_initial_lr'] if 'transformer_initial_lr' in config['solver'] else config['solver']['initial_lr'] ) }, # non-transformer and need not decay { 'params': [ p for n, p in not_transformer_params if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0, 'lr': config['solver']['initial_lr'] }, # transformer and need not decay { 'params': [ p for n, p in transformer_params if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0, 'lr': ( config['solver']['transformer_initial_lr'] if 'transformer_initial_lr' in config['solver'] else config['solver']['initial_lr'] ) } ] if 'task_weights' in config['model'] \ and config['model']['task_weights'] == 'uct': grouped_parameters.append( { 'params': criterion.parameters(), 'weight_decay': 0.0, 'lr': ( config['solver']['uct_initial_lr'] if 'uct_initial_lr' in config['solver'] else config['solver']['initial_lr'] ) } ) # Optimizer if config['solver']['optimizer'] == 'AdamW': optimizer = AdamW( grouped_parameters, lr=config["solver"]["initial_lr"], weight_decay=config['solver']['weight_decay'] ) else: raise ValueError( 'optimizer {} not support now.' .format(config['solver']['optimizer']) ) # Learning rate schedule total_steps = ( math.ceil( len(train_dataloader) / config['solver']['accumulation_steps'] ) * config['solver']['num_epochs'] if 'num_epochs' in config['solver'] else config['solver']['total_steps'] ) warmup_steps = ( math.ceil(total_steps * config['solver']['warmup_fraction']) if 'warmup_fraction' in config['solver'] else config['solver']['warmup_steps'] ) validation_steps = ( config['solver']['validation_steps'] if 'validation_steps' in config['solver'] else math.ceil( len(train_dataloader) / config['solver']['accumulation_steps'] ) ) if not args.no_validate else total_steps logger.info('Total steps: {}'.format(total_steps)) logger.info('Warmup_steps: {}'.format(warmup_steps)) if not args.no_validate: logger.info('Validation steps: {}'.format(validation_steps)) if config['solver']['lr_schedule'] == 'warmup_linear': scheduler = get_linear_schedule_with_warmup( optimizer, warmup_steps, total_steps ) elif config['solver']['lr_schedule'] == 'warmup_cosine': scheduler = get_cosine_schedule_with_warmup( optimizer, warmup_steps, total_steps ) elif config['solver']['lr_schedule'] == 'warmup_cosine_with_hard_restarts': num_cycles = config['solver']['num_cycles'] scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, warmup_steps, total_steps, num_cycles=num_cycles ) else: raise ValueError( 'Learning rate schedule {} not support not.' 
.format(config['solver']['lr_schedule']) ) # Setup before training summary_writer = SummaryWriter(logdir=args.save_dirpath) checkpoint_manager = CheckpointManager( model, optimizer, args.save_dirpath, overwrite=True, config=config ) accumulation_steps = config['solver']['accumulation_steps'] forward_steps = 0 optimizer_steps = 0 loss = [] if not args.no_validate: best_score = float('-inf') # Evaluate before training if loaded pretrained model if not args.no_validate and args.load_pthpath != "": model.eval() val_losses, val_report = evaluate( model, val_dataloader, val_criterion, device, task_name2int ) val_score = val_report['competition_score'] logger.info('Step {} evaluate result:'.format(optimizer_steps)) for k, v in val_losses.items(): logger.info(' {} = {:.6f}'.format(k, v)) if k == 'val_loss': summary_writer.add_scalar( "val/loss", v, global_step=optimizer_steps ) else: summary_writer.add_scalar( "val/" + k, v, global_step=optimizer_steps ) for k, v in val_report.items(): logger.info(' {} = {:.6f}'.format(k, v)) summary_writer.add_scalar( "val/" + k, v, global_step=optimizer_steps ) # Training loop model.train() train_iterator = iter(train_dataloader) for _ in range(int(math.ceil(total_steps / validation_steps))): for _ in tqdm(range(validation_steps * accumulation_steps)): try: batch = next(train_iterator) except StopIteration: train_iterator = iter(train_dataloader) if args.overfit: break else: batch = next(train_iterator) for key in batch: batch[key] = batch[key].to(device) batch_output = model(batch) batch_loss_output = criterion( batch_output, batch['target'], batch['task_type_id'] ) if isinstance(batch_loss_output, torch.Tensor): batch_loss = batch_loss_output / accumulation_steps batch_loss.backward() loss.append(batch_loss.detach().cpu().numpy()) elif ( isinstance(batch_loss_output, dict) and 'task_weights' in config['model'] and config['model']['task_weights'] in ['uct', 'dtp'] ): batch_loss = batch_loss_output['loss'] / accumulation_steps batch_loss.backward() loss.append(batch_loss.detach().cpu().numpy()) else: raise ValueError() forward_steps += 1 if forward_steps % accumulation_steps == 0: optimizer_steps += 1 loss = np.sum(loss) summary_writer.add_scalar( "train/loss", loss, global_step=optimizer_steps ) loss = [] if isinstance(batch_loss_output, dict) \ and 'task_weights' in config['model'] \ and config['model']['task_weights'] == 'uct': for task_name in task_name2int: summary_writer.add_scalar( "train/weight_" + task_name, batch_loss_output["weight_" + task_name], global_step=optimizer_steps ) if isinstance(batch_loss_output, dict) \ and 'task_weights' in config['model'] \ and config['model']['task_weights'] == 'dtp': for task_name in task_name2int: summary_writer.add_scalar( "train/running_kpi_" + task_name, batch_loss_output["running_kpi_" + task_name], global_step=optimizer_steps ) summary_writer.add_scalar( "train/lr", optimizer.param_groups[0]["lr"], global_step=optimizer_steps ) summary_writer.add_scalar( "train/transformer_lr", optimizer.param_groups[1]["lr"], global_step=optimizer_steps ) if 'task_weights' in config['model'] \ and config['model']['task_weights'] == 'uct': summary_writer.add_scalar( "train/uct_lr", optimizer.param_groups[-1]["lr"], global_step=optimizer_steps ) if config['solver']['max_grad_norm'] > 0: clip_grad_norm_( model.parameters(), config['solver']['max_grad_norm'] ) optimizer.step() optimizer.zero_grad() scheduler.step() torch.cuda.empty_cache() if optimizer_steps >= total_steps: break # Evaluate on validation set. 
if not args.no_validate: model.eval() val_losses, val_report = evaluate( model, val_dataloader, val_criterion, device, task_name2int ) val_score = val_report['competition_score'] logger.info('Step {} evaluate result:'.format(optimizer_steps)) for k, v in val_losses.items(): logger.info(' {} = {:.6f}'.format(k, v)) if k == 'val_loss': summary_writer.add_scalar( "val/loss", v, global_step=optimizer_steps ) else: summary_writer.add_scalar( "val/" + k, v, global_step=optimizer_steps ) for k, v in val_report.items(): logger.info(' {} = {:.6f}'.format(k, v)) summary_writer.add_scalar( "val/" + k, v, global_step=optimizer_steps ) if val_score > best_score: checkpoint_manager.step() logger.info( ' Validation best score update from {:.6f} to {:.6f}. ' 'Saved checkpoint to {}'.format( best_score, val_score, args.save_dirpath + 'checkpoint.pth' ) ) best_score = val_score else: logger.info( ' Validation best score not updated since {:.6f}. ' 'No checkpoint saved.'.format(best_score) ) model.train() torch.cuda.empty_cache() summary_writer.flush() # Save the final model if no validate if args.no_validate: checkpoint_manager.step() logger.info( 'Saved final checkpoint to {}'.format( args.save_dirpath + 'checkpoint.pth' ) ) summary_writer.close()
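# The snippet above gives the transformer and non-transformer parameter groups different base
# learning rates and then logs each group's current lr. The warmup/cosine schedules in
# transformers are LambdaLR schedules, so every group is scaled by the same multiplier and the
# ratio between groups is preserved. A small sketch with illustrative values:
import torch
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

head = torch.nn.Parameter(torch.zeros(1))
encoder = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([
    {"params": [head], "lr": 1e-3},        # e.g. randomly initialized task head
    {"params": [encoder], "lr": 2e-5},     # e.g. pretrained transformer
])
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=10, num_training_steps=100, num_cycles=2)

for step in range(5):
    optimizer.step()
    scheduler.step()
    lrs = [group["lr"] for group in optimizer.param_groups]
    print(step, lrs)   # both groups warm up in lockstep, keeping the 1e-3 : 2e-5 ratio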
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, betas = (args.beta1, args.beta2), eps=args.adam_epsilon) # Set the learning rate schedule if args.lr_scheduler == 'linear': scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) elif args.lr_scheduler == 'cosine': scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) elif args.lr_scheduler == 'cosine_restart': scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'start_positions': batch[3], 'end_positions': batch[4]} if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def single_fit(self, train_ds, dev_ds, cv=None): ## data_iter batch_size = self.n_gpu * self.per_train_batch_size train_sampler = RandomSampler(train_ds) train_iter = DataLoader(train_ds, sampler=train_sampler, batch_size=batch_size) dev_sampler = SequentialSampler(dev_ds) dev_iter = DataLoader(dev_ds, sampler=dev_sampler, batch_size=batch_size) ## model model = BertologyForTokenClassification( model_name_or_path=self.model_name_or_path, num_labels=self.num_labels, cache_dir=self.cache_dir, device=self.device, bert_dropout=self.bert_dropout, lstm_dropout=self.lstm_dropout, classifier_type=self.classifier_type, num_layers=self.num_layers, lstm_hidden_size=self.lstm_hidden_size) model.to(self.device) ## optimizer no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if 'CRF' in self.classifier_type: step_lr = ['GRU', 'LSTM'] optimizer_grouped_parameters.append({ 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in step_lr) ], 'weight_decay': 0.0, 'lr': self.learning_rate * 10 }) t_total = len( train_iter) // self.gradient_accumulation_steps * self.max_epochs warmup_steps = t_total * self.warmup optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate) if self.schedule_type == "linear": scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) elif self.schedule_type == "cosine": scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) elif self.schedule_type == "constant": scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) elif self.schedule_type == "cosine_restarts": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) if self.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.n_gpu > 1: model = torch.nn.DataParallel(model) tb_writer = SummaryWriter() def train_fn(engine, batch): model.train() optimizer.zero_grad() batch = tuple(t.to(self.device) for t in batch) labels = batch[3] inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "labels": labels, "is_nested": self.is_nested } loss, sequence_tags = model(**inputs) if not self.is_nested: score = ( sequence_tags == labels).float().detach().cpu().numpy() condition_1 = (labels != self.label_list.index("O") ).detach().cpu().numpy() condition_2 = (labels != self.label_list.index("<PAD>") ).detach().cpu().numpy() patten = np.logical_and(condition_1, condition_2) score = score[patten].mean() else: ''' y_pred = sequence_tags.detach().cpu().numpy() labels_np = labels.detach().cpu().numpy() score = ((y_pred > self.multi_label_threshold) == (labels_np > 0)).mean() ''' score = ((sequence_tags > self.multi_label_threshold) == ( labels > 0)).float().detach().cpu().numpy() condition_1 = (labels != self.label_list.index("O") ).detach().cpu().numpy() condition_2 = (labels != self.label_list.index("<PAD>") ).detach().cpu().numpy() patten = np.logical_and(condition_1, condition_2) score = score[patten].mean() if self.n_gpu > 1: loss = loss.mean() ## tensorboard global_step = global_step_from_engine(engine)( engine, engine.last_event_name) tb_writer.add_scalar('learning_rate', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('train_loss', loss.item(), global_step) tb_writer.add_scalar('train_score', score.item(), global_step) loss.backward() optimizer.step() scheduler.step() model.zero_grad() return loss.item(), score.item() trainer = Engine(train_fn) RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, 'score') def eval_fn(engine, batch): model.eval() batch = tuple(t.to(self.device) for t in batch) with torch.no_grad(): optimizer.zero_grad() labels = batch[3] inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "labels": labels, "is_nested": self.is_nested } loss, sequence_tags = model(**inputs) if not self.is_nested: score = (sequence_tags == labels ).float().detach().cpu().numpy() condition_1 = (labels != self.label_list.index("O") ).detach().cpu().numpy() condition_2 = (labels != self.label_list.index("<PAD>") ).detach().cpu().numpy() patten = np.logical_and(condition_1, condition_2) score = score[patten].mean() else: score = ((sequence_tags > self.multi_label_threshold) == ( labels > 0)).float().detach().cpu().numpy() ''' y_pred = sequence_tags.detach().cpu().numpy() labels_np = labels.detach().cpu().numpy() score = ((y_pred > self.multi_label_threshold) == (labels_np > 0)).mean() ''' condition_1 = (labels != self.label_list.index("O") ).detach().cpu().numpy() condition_2 = (labels != self.label_list.index("<PAD>") ).detach().cpu().numpy() patten = np.logical_and(condition_1, condition_2) score = score[patten].mean() if self.n_gpu > 1: loss = loss.mean() ## tensorboard global_step = global_step_from_engine(trainer)( engine, engine.last_event_name) tb_writer.add_scalar('dev_loss', loss.item(), global_step) tb_writer.add_scalar('dev_score', score.item(), global_step) return loss.item(), score.item() dev_evaluator = Engine(eval_fn) RunningAverage(output_transform=lambda x: x[0]).attach( dev_evaluator, 'loss') 
RunningAverage(output_transform=lambda x: x[1]).attach( dev_evaluator, 'score') pbar = ProgressBar(persist=True, bar_format="") pbar.attach(trainer, ['loss', 'score']) pbar.attach(dev_evaluator, ['loss', 'score']) def score_fn(engine): loss = engine.state.metrics['loss'] score = engine.state.metrics['score'] return score / (loss + 1e-12) handler = EarlyStopping(patience=self.patience, score_function=score_fn, trainer=trainer) dev_evaluator.add_event_handler(Events.COMPLETED, handler) @trainer.on(Events.EPOCH_COMPLETED) def log_dev_results(engine): dev_evaluator.run(dev_iter) dev_metrics = dev_evaluator.state.metrics avg_score = dev_metrics['score'] avg_loss = dev_metrics['loss'] logger.info( "Validation Results - Epoch: {} Avg score: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_score, avg_loss)) l = self.model_name_or_path.split('/') if len(l) > 1: model_name = l[-1] else: model_name = self.model_name_or_path def model_score(engine): score = engine.state.metrics['score'] return score model_prefix = "bertology_{}".format(self.classifier_type.lower()) \ if cv is None else "bertology_{}_cv_{}".format(self.classifier_type.lower(), cv) checkpointer = ModelCheckpoint( self.output_dir, model_prefix, n_saved=self.n_saved, create_dir=True, score_name="model_score", score_function=model_score, global_step_transform=global_step_from_engine(trainer), require_empty=False) dev_evaluator.add_event_handler( Events.COMPLETED, checkpointer, {model_name: model.module if hasattr(model, 'module') else model}) # Clear cuda cache between training/testing def empty_cuda_cache(engine): torch.cuda.empty_cache() import gc gc.collect() trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache) dev_evaluator.add_event_handler(Events.COMPLETED, empty_cuda_cache) # save config @trainer.on(Events.COMPLETED) def save_config(engine): torch.save(self, os.path.join(self.output_dir, 'fit_args.pkl')) trainer.run(train_iter, max_epochs=self.max_epochs)
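# Several of the snippets above log the current learning rate with scheduler.get_lr()[0];
# recent PyTorch versions warn against calling get_lr() outside of step() and recommend
# get_last_lr() instead (the TPU snippet already uses it). A minimal sketch:
import torch
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=5e-5)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=10, num_training_steps=100)

optimizer.step()
scheduler.step()
current_lr = scheduler.get_last_lr()[0]      # preferred accessor for logging
same_lr = optimizer.param_groups[0]["lr"]    # equivalent and always available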