def adding_lr_decay_handler(optimizer, trainer, model_parameters):
    lr_decay_epoch_frequency = model_parameters['lr_decay_epoch_frequency']
    lr_decay = model_parameters['lr_decay']
    step_scheduler = StepLR(optimizer, step_size=lr_decay_epoch_frequency, gamma=lr_decay)
    scheduler = LRScheduler(step_scheduler)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)
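A minimal, hedged sketch of how a helper like adding_lr_decay_handler might be exercised; the model, placeholder update function and hyper-parameter values below are illustrative assumptions, not taken from the snippet above.

import torch
from torch.optim.lr_scheduler import StepLR
from ignite.engine import Engine, Events
from ignite.contrib.handlers import LRScheduler  # ignite.handlers in newer releases

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
trainer = Engine(lambda engine, batch: None)  # placeholder update function
adding_lr_decay_handler(optimizer, trainer,
                        {'lr_decay_epoch_frequency': 2, 'lr_decay': 0.5})
trainer.run([0, 1, 2], max_epochs=4)  # StepLR drops the lr by gamma every `step_size` scheduler steps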
def find_lr_add_one_cycle(engine: Engine, lr_finder: LRFinder, optimizer: Optimizer):
    train_dl = engine.state.dataloader
    lr_finder.range_test(train_dl, num_iter=1000)
    max_lr = lr_finder.lr_suggestion
    lr_finder.reset()
    one_cycle_scheduler = LRScheduler(
        OneCycleLR(optimizer, max_lr, train_dl=train_dl, num_epochs=engine.state.max_epochs))
    engine.add_event_handler(Events.ITERATION_STARTED, one_cycle_scheduler)
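Presumably a handler like this is registered once, up front; a hedged one-line sketch (the lr_finder, optimizer and LRFinder/OneCycleLR implementations are whatever project-local objects the snippet already assumes):

engine.add_event_handler(Events.STARTED, find_lr_add_one_cycle, lr_finder, optimizer)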
def schedule_lr(self, optimizer, name, params, warmup_start=None, warmup_end=None, warmup_duration=None):
    if name is None:
        return None
    lr_scheduler = self._get_lr_scheduler(name)(optimizer, **params)
    if warmup_start and warmup_end and warmup_duration:
        scheduler = create_lr_scheduler_with_warmup(lr_scheduler,
                                                    warmup_start_value=warmup_start,
                                                    warmup_end_value=warmup_end,
                                                    warmup_duration=warmup_duration)
    else:
        scheduler = LRScheduler(lr_scheduler)
    self.trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)
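A hedged, self-contained sketch of the warmup branch above, assuming ignite's create_lr_scheduler_with_warmup (exposed from ignite.contrib.handlers in older releases, ignite.handlers in newer ones); the concrete torch scheduler and values are illustrative.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from ignite.contrib.handlers import create_lr_scheduler_with_warmup

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
torch_scheduler = CosineAnnealingLR(optimizer, T_max=50)
scheduler = create_lr_scheduler_with_warmup(torch_scheduler,
                                            warmup_start_value=1e-5,
                                            warmup_end_value=0.1,
                                            warmup_duration=5)
# attached exactly as schedule_lr() does:
# trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)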
def _run(self, engine):
    engine.state.state_cache = _StateCacher(self._memory_cache, cache_dir=self._cache_dir)
    engine.state.state_cache.store("model", self._model.state_dict())
    engine.state.state_cache.store("optimizer", self._optimizer.state_dict())

    self._history = {"lr": [], "loss": []}
    self._best_loss = None
    self._diverge_flag = False

    # attach loss and lr logging
    if not engine.has_event_handler(self._log_lr_and_loss):
        engine.add_event_handler(Events.ITERATION_COMPLETED, self._log_lr_and_loss)

    # attach LRScheduler to engine. can be done only after engine.run was called because of num_iter
    required_epochs = self.num_iter / len(engine.state.dataloader)
    if engine.state.max_epochs < required_epochs:
        engine.state.max_epochs = int(np.ceil(required_epochs))

    self._logger.debug("Running LR finder for {} iterations".format(self.num_iter))

    # Initialize the proper learning rate policy
    if self._step_mode.lower() == "exp":
        self._lr_schedule = LRScheduler(_ExponentialLR(self._optimizer, self._end_lr, self.num_iter))
    else:
        self._lr_schedule = LRScheduler(_LinearLR(self._optimizer, self._end_lr, self.num_iter))

    if not engine.has_event_handler(self._lr_schedule):
        engine.add_event_handler(Events.ITERATION_COMPLETED, self._lr_schedule, self.num_iter)
    if not engine.has_event_handler(self._reached_num_iterations):
        engine.add_event_handler(Events.ITERATION_COMPLETED, self._reached_num_iterations)
def _run(self, trainer, optimizer, output_transform, num_iter, end_lr, step_mode, smooth_f, diverge_th):
    self._history = {"lr": [], "loss": []}
    self._best_loss = None
    self._diverge_flag = False

    # attach LRScheduler to trainer.
    if num_iter is None:
        num_iter = trainer.state.epoch_length * trainer.state.max_epochs
    else:
        max_iter = trainer.state.epoch_length * trainer.state.max_epochs
        if num_iter > max_iter:
            warnings.warn(
                "Desired num_iter {} is unreachable with the current run setup of {} iteration "
                "({} epochs)".format(num_iter, max_iter, trainer.state.max_epochs),
                UserWarning,
            )

    if not trainer.has_event_handler(self._reached_num_iterations):
        trainer.add_event_handler(Events.ITERATION_COMPLETED, self._reached_num_iterations, num_iter)

    # attach loss and lr logging
    if not trainer.has_event_handler(self._log_lr_and_loss):
        trainer.add_event_handler(Events.ITERATION_COMPLETED, self._log_lr_and_loss,
                                  output_transform, smooth_f, diverge_th)

    self.logger.debug("Running LR finder for {} iterations".format(num_iter))

    # Initialize the proper learning rate policy
    if step_mode.lower() == "exp":
        self._lr_schedule = LRScheduler(_ExponentialLR(optimizer, end_lr, num_iter))
    else:
        start_lr = optimizer.param_groups[0]["lr"]
        self._lr_schedule = PiecewiseLinear(
            optimizer, param_name="lr", milestones_values=[(0, start_lr), (num_iter, end_lr)]
        )

    if not trainer.has_event_handler(self._lr_schedule):
        trainer.add_event_handler(Events.ITERATION_COMPLETED, self._lr_schedule, num_iter)
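For comparison, a hedged sketch of how recent ignite releases expose this flow through FastaiLRFinder; the attach/run/lr_suggestion usage below follows the documented API, but exact version details may differ.

import torch
from ignite.engine import create_supervised_trainer
from ignite.handlers import FastaiLRFinder

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
criterion = torch.nn.MSELoss()
trainer = create_supervised_trainer(model, optimizer, criterion)
data = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(20)]

lr_finder = FastaiLRFinder()
to_save = {"model": model, "optimizer": optimizer}
with lr_finder.attach(trainer, to_save=to_save) as trainer_with_lr_finder:
    trainer_with_lr_finder.run(data)
print(lr_finder.lr_suggestion())  # suggested lr; lr_finder.plot() additionally needs matplotlib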
def _test_setup_common_training_handlers(
    dirname, device, rank=0, local_rank=0, distributed=False, lr_scheduler=None, save_handler=None
):
    lr = 0.01
    step_size = 100
    gamma = 0.5
    num_iters = 100
    num_epochs = 10

    model = DummyModel().to(device)
    if distributed and "cuda" in device:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank,], output_device=local_rank)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    if lr_scheduler is None:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
    elif isinstance(lr_scheduler, str) and lr_scheduler == "ignite|LRScheduler":
        from ignite.contrib.handlers import LRScheduler

        lr_scheduler = LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma))
    elif isinstance(lr_scheduler, str) and lr_scheduler == "ignite":
        from ignite.contrib.handlers import PiecewiseLinear

        milestones_values = [(0, 0.0), (step_size, lr), (num_iters * (num_epochs - 1), 0.0)]
        lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)
    else:
        raise ValueError(f"Unknown lr_scheduler: {lr_scheduler}")

    def update_fn(engine, batch):
        optimizer.zero_grad()
        x = torch.tensor([batch], requires_grad=True, device=device)
        y_pred = model(x)
        loss = y_pred.mean()
        loss.backward()
        optimizer.step()
        return loss

    train_sampler = None
    if distributed and idist.get_world_size() > 1:
        train_sampler = MagicMock(spec=DistributedSampler)
        train_sampler.set_epoch = MagicMock()

    trainer = Engine(update_fn)
    setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save={"model": model, "optimizer": optimizer},
        save_every_iters=75,
        output_path=dirname,
        save_handler=save_handler,
        lr_scheduler=lr_scheduler,
        with_gpu_stats=False,
        output_names=["batch_loss",],
        with_pbars=True,
        with_pbar_on_iters=True,
        log_every_iters=50,
    )

    data = [i * 0.1 for i in range(num_iters)]
    trainer.run(data, max_epochs=num_epochs)

    # check handlers
    handlers = trainer._event_handlers[Events.ITERATION_COMPLETED]
    for cls in [
        TerminateOnNan,
    ]:
        assert any([isinstance(h[0], cls) for h in handlers]), f"{handlers}"
    assert "batch_loss" in trainer.state.metrics

    # Check saved checkpoint
    if rank == 0:
        if save_handler is not None:
            dirname = save_handler.dirname
        checkpoints = list(os.listdir(dirname))
        assert len(checkpoints) == 1
        for v in [
            "training_checkpoint",
        ]:
            assert any([v in c for c in checkpoints])

    # Check LR scheduling
    assert optimizer.param_groups[0]["lr"] <= lr * gamma ** (
        num_iters * num_epochs / step_size
    ), f"{optimizer.param_groups[0]['lr']} vs {lr * gamma ** (num_iters * num_epochs / step_size)}"
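A hedged example of how this test helper might be invoked to exercise the LRScheduler-wrapper branch; the output directory and device are illustrative, not taken from the test suite.

_test_setup_common_training_handlers("/tmp/ignite-ckpts", "cpu", lr_scheduler="ignite|LRScheduler")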
def train():
    os.environ['CUDA_VISIBLE_DEVICES'] = '7'
    parser = ArgumentParser()
    parser.add_argument('--gpt2', action='store_true', help="use gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="uer/gpt2-chinese-cluecorpussmall",
                        help="Path or URL of the model")
    parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step")
    parser.add_argument('--pretrained', action='store_true', help="If False train from scratch")
    parser.add_argument("--data_path", type=str, default="data/autocloze.json",
                        help="Path or url of the dataset.")
    parser.add_argument("--train_path", type=str, default="data/toy_train.txt",
                        help="Path of the train dataset for dist dataset.")
    parser.add_argument("--valid_path", type=str, default="data/toy_valid.txt",
                        help="Path of the valid dataset for dist dataset.")
    # --------------------------------------------------------------
    parser.add_argument("--dataset_cache", type=str, default="dataset_zh", help="Path or url of the dataset cache")
    parser.add_argument('--log_file', '-log_file', type=str, default="", help="Output logs to a file under this path")
    parser.add_argument("--num_workers", type=int, default=8, help="Number of subprocesses for data loading")
    parser.add_argument("--n_epochs", type=int, default=40, help="Number of training epochs")
    parser.add_argument("--train_batch_size", type=int, default=1, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=1, help="Batch size for validation")
    parser.add_argument("--max_history", type=int, default=15, help="Number of previous exchanges to keep in history")
    parser.add_argument("--scheduler", type=str, default="noam", choices=['noam', 'linear'], help="method of optim")
    parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps")
    parser.add_argument("--valid_steps", type=int, default=5000, help="Perform validation every X steps")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=64,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    print('cuda ', torch.cuda.is_available())

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    '''if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    '''
    args.device = torch.device("cuda")
    print('device ', args.device)

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    # model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel
    # config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config
    model_class = GPT2LMHeadModel
    config_class = GPT2Config
    tokenizer_class = BertTokenizer
    print('pretrained:', args.pretrained)
    if args.pretrained:
        print("----------------pretrained")
        tokenizer = BertTokenizer.from_pretrained(args.model_checkpoint, do_lower_case=True)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
        model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall", from_tf=True)
    # print('generate')
    # print(text_generator("这是很久之前的事情了", max_length=100, do_sample=True))
    # args.device = torch.device("cuda", 2)
    model.to(args.device)
    optimizer = AdamW([{'params': model.parameters(), 'initial_lr': args.lr}], lr=args.lr, correct_bias=True)

    logger.info("Prepare datasets")
    loader_class = build_dist_loaders if not args.data_path else build_dataloaders
    train_loader, val_loader, train_sampler, valid_sampler = loader_class(args, tokenizer, logger)
    logger.info("Prepare datasets ends")

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
        model = model.module
    # if isinstance(model, torch.nn.DataParallel):
    # print('params:', params_count(model))
    # tokens_embed = model.transformer.get_input_embeddings()

    # Training function and trainer
    def update(engine, batch):
        input_ids, token_type_ids, lm_labels = tuple(input_tensor.to(args.device) for input_tensor in batch)
        # for i in range(input_ids.size()[0]):
        #     for j in range(input_ids.size()[1]):
        #         if input_ids[i, j] == -1:
        #             input_ids[i, j] = -100
        #         if lm_labels[i, j] == -1:
        #             lm_labels[i, j] = -100
        # one = torch.tensor(-100)
        # input_ids = torch.where(input_ids == -1, one, input_ids)
        # lm_labels = torch.where(lm_labels == -1, one, lm_labels)
        # print('traindata', input_ids, lm_labels)
        # lm_labels = input_ids
        r'''input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        inputs_embeds = tokens_embed(input_ids) * math.sqrt(tokens_embed.embedding_dim)'''
        model.train()
        # (lm_loss), *_ = model(inputs_embeds=inputs_embeds, labels=lm_labels, return_dict=0)
        (lm_loss), *_ = model(input_ids=input_ids, labels=lm_labels, return_dict=False)
        # print('lm_loss', lm_loss)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item(), optimizer.param_groups[0]['lr']

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    cntepoch = 0

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            input_ids, token_type_ids, lm_labels = tuple(input_tensor.to(args.device) for input_tensor in batch)
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # one = torch.tensor(-100)
            # input_ids = torch.where(input_ids == -1, one, input_ids)
            # print('validdata', input_ids, lm_labels)
            # lm_labels = input_ids
            r'''input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            inputs_embeds = tokens_embed(input_ids) * math.sqrt(tokens_embed.embedding_dim)'''
            # lm_logits, *_ = model(inputs_embeds=inputs_embeds, return_dict=0)
            lm_logits, *_ = model(input_ids=input_ids, return_dict=False)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted
        cntepoch += 1
        torch.save(args, tb_logger.writer.logdir + '_%s/model_training_args.bin' % (str(cntepoch)))

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Evaluation during training
    @trainer.on(Events.ITERATION_STARTED)
    def log_iterations(engine):
        # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0:
        if engine.state.iteration % args.valid_steps == 0:
            evaluator.run(val_loader)

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # noam decrease the learning rate
    # model_size = model.config.n_embd
    model_size = args.n_emd
    noam_lambda = lambda step: (
        model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * args.warmup_steps ** (-1.5)))
    noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0], x[1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints
    # And save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True, mininterval=2)
        pbar.attach(trainer, metric_names=["loss", "lr"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                              another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=6)
        # save model after evaluation
        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                    {'mymodel': getattr(model, 'module', model)})
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.logdir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1],
                  os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
def fit(self, train_dataset: TileFeaturesDataset, val_dataset: TileFeaturesDataset = None,
        epochs: int = 10, batch_size: int = 10, num_workers: int = 10,
        evaluate_every: int = 300, save_every: int = 1000):
    """
    Args:
        train_dataset: The dataset object for training data
        val_dataset: The dataset object for validation data, optional
        epochs: number of epochs to train the network
        batch_size: batch size for the network
        num_workers: number of workers for the network
        evaluate_every: every how many steps to run evaluation
        save_every: every how many steps to save the model

    Returns:
        a trained pytorch model
    """
    # create data loaders
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    if val_dataset is not None:
        val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    else:
        val_data_loader = None

    # create the loss criterion
    criterion = MultiheadLoss(self.losses, use_log=self.log_loss, weights=self.losses_weights).to(self.device)

    # create tensorboard writer
    writer = create_summary_writer(self.model, train_data_loader, log_dir=TENSORBOARD_DIR)

    def multihead_loss_func(y_pred, y):
        return criterion(y_pred[1], torch.split(y, 1, dim=1))[0]

    def multihead_output_transform(x, y, y_pred, *args):
        embedding, output = y_pred
        y_pred_tensor = torch.stack(output).squeeze(2).transpose(0, 1)
        y_tensor = y
        data = x
        with torch.no_grad():
            loss, multi_losses = criterion(output, torch.split(y, 1, dim=1))
        return data, embedding, loss, multi_losses, y_pred_tensor, y_tensor

    eval_metrics = {
        'rmse': RootMeanSquaredError(),
        # 'corr': DistanceCorrelation(),
        # 'embedding_data': EmbeddingData()
    }
    train_metrics = {
        'rmse': RootMeanSquaredError()
        # , 'corr': DistanceCorrelation()
    }

    trainer = create_supervised_trainer(
        self.model, self.optimizer, multihead_loss_func,
        device=self.device, output_transform=multihead_output_transform)
    for name, metric in train_metrics.items():  # Calculate metrics also on trainer
        metric.attach(trainer, name)
    evaluator = create_supervised_evaluator(
        self.model, metrics=eval_metrics,
        device=self.device, output_transform=multihead_output_transform)

    if self.model_save_path is not None:  # do we want to use it? from Ignite
        checkpoint_handler = ModelCheckpoint(self.model_save_path, 'checkpoint',
                                             save_interval=save_every, n_saved=10,
                                             require_empty=False, create_dir=True)

    pbar = ProgressBar()
    # RunningAverage(output_transform=lambda x: x[2])
    pbar.attach(trainer)

    scheduler = LRScheduler(self.step_scheduler)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': self.model})

    @trainer.on(Events.EPOCH_STARTED)
    def init_state_params(engine):
        engine.state.plusplus_ex, engine.state.plusminus_ex = [None] * self.n_features, [None] * self.n_features
        engine.state.minusminus_ex, engine.state.minusplus_ex = [None] * self.n_features, [None] * self.n_features

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        writer.add_scalar('General/LR', scheduler.get_param(), global_step=engine.state.iteration)
        _, embedding, loss, multi_losses, y_pred_tensor, y_tensor = engine.state.output
        images_batch, features_batch = engine.state.batch
        plusplus_ex, plusminus_ex = engine.state.plusplus_ex, engine.state.plusminus_ex
        minusminus_ex, minusplus_ex = engine.state.minusminus_ex, engine.state.minusplus_ex
        writer.add_scalar('General/Train Loss', loss, global_step=engine.state.iteration)
        feat_diff = (y_pred_tensor - y_tensor)  # / y_tensor + 1
        feat_sum = y_pred_tensor + y_tensor
        for j in range(self.n_features):
            writer.add_scalar(f'Multiple Losses/{self.feature_names[j]}', multi_losses[j],
                              global_step=engine.state.iteration)
            for i in range(len(images_batch)):
                itm_diff, itm_sum = feat_diff[i][j].item(), feat_sum[i][j].item()
                itm_pred, itm_actual = y_pred_tensor[i][j].item(), y_tensor[i][j].item()
                ex = TrainExample(images_batch[i], predicted=itm_pred, actual=itm_actual,
                                  sum=itm_sum, diff=itm_diff)
                if minusminus_ex[j] is None or minusminus_ex[j].sum > itm_sum:
                    engine.state.minusminus_ex[j] = ex
                elif plusminus_ex[j] is None or plusminus_ex[j].diff < itm_diff:
                    engine.state.plusminus_ex[j] = ex
                elif minusplus_ex[j] is None or minusplus_ex[j].diff > itm_diff:
                    engine.state.minusplus_ex[j] = ex
                elif plusplus_ex[j] is None or plusplus_ex[j].sum < itm_sum:
                    engine.state.plusplus_ex[j] = ex

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        global_step = engine.state.iteration
        metrics = engine.state.metrics  # already attached to the trainer engine to save
        # can add more metrics here
        add_metrics_to_tensorboard(metrics, writer, self.feature_names, global_step, log_str="train")
        # plot min-max examples
        plusplus_ex, plusminus_ex = engine.state.plusplus_ex, engine.state.plusminus_ex
        minusminus_ex, minusplus_ex = engine.state.minusminus_ex, engine.state.minusplus_ex
        for j in range(self.n_features):
            if plusplus_ex[j] is None:
                continue
            writer.add_figure(tag=f"{self.feature_names[j]}/plusplus",
                              figure=build_example_image_figure(plusplus_ex[j]), global_step=global_step)
            writer.add_figure(tag=f"{self.feature_names[j]}/plusminus",
                              figure=build_example_image_figure(plusminus_ex[j]), global_step=global_step)
            writer.add_figure(tag=f"{self.feature_names[j]}/minusminus",
                              figure=build_example_image_figure(minusminus_ex[j]), global_step=global_step)
            writer.add_figure(tag=f"{self.feature_names[j]}/minusplus",
                              figure=build_example_image_figure(minusplus_ex[j]), global_step=global_step)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_validation_results(engine):
        global_step = engine.state.iteration
        if global_step % evaluate_every == 0:
            evaluator.run(val_data_loader)
            metrics = evaluator.state.metrics
            # can add more metrics here
            add_metrics_to_tensorboard(metrics, writer, self.feature_names, global_step, log_str="validation")
            # add_embedding_visualization(writer, metrics, global_step)
        if global_step % save_every == 0:
            self.save_trained_model(f"{self.model_save_path}/{global_step}_model.pth")

    trainer.run(train_data_loader, max_epochs=epochs)
    return self.model
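A hedged sketch of how the fit() method above might be driven; the owning class and its constructor are assumptions, only the fit() signature comes from the snippet.

# embedder = TileEmbedder(...)                      # hypothetical owner of fit()
# train_ds = TileFeaturesDataset(...)               # dataset types from the signature
# val_ds = TileFeaturesDataset(...)
# trained_model = embedder.fit(train_ds, val_dataset=val_ds, epochs=20,
#                              batch_size=32, num_workers=4,
#                              evaluate_every=300, save_every=1000)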
#                              9.5608062744141,
#                              7.8698215484619,
#                              9.5168733596802,
#                              10.373730659485,
#                              6.6616044044495,
#                              10.260489463806,
#                              10.287888526917,
#                              10.289801597595,
#                              10.405355453491,
#                              10.138095855713])
# loss_fn = torch.nn.CrossEntropyLoss(ignore_index=255, weight=weight)
loss_fn = OHEMLoss(ignore_index=255, numel_frac=1 / 16)
loss_fn = loss_fn.cuda()

scheduler = LRScheduler(
    PolyLR(optimizer, args.learning_rate, total_steps=args.epochs * len(train_loader) - 1000))
scheduler = create_lr_scheduler_with_warmup(scheduler, 0, args.learning_rate, 1000)

original_optimizer = optimizer
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.checkpoint:
    amp.load_state_dict(checkpoint['amp'])

if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_segmentation_trainer(
def create_lr_finder(
    model,
    criterion,
    optim_fn=optim.SGD,
    create_engine=None,
    lr_init=1e-11,
    lr_final=10,
    optim_fn_kwargs=None,
    device=None,
    non_blocking=False,
):
    """
    create_lr_finder(
        model,
        criterion,
        optim_fn=optim.SGD,
        create_engine=None,
        lr_init=1e-11,
        lr_final=10,
        optim_fn_kwargs=None,
        device=None,
    )

    Parameters
    ----------
    model : nn.Module
    criterion : nn.Loss
    optim_fn : torch.optim instance
        Default: optim.SGD
    lr_init : float
    lr_final : float
    optim_fn_kwargs : dict (optional)
    device : torch.device

    Returns
    -------
    find_lr : callable

    Example
    -------
    >>> model = nn.Sequential(nn.Linear(5, 2), nn.ReLU(), nn.Linear(2, 2))
    >>> model_parameter = next(model.parameters())
    >>> # initial value for model_parameter:
    >>> print(model_parameter)
    >>> ## <some tensor>
    >>> criterion = nn.CrossEntropyLoss()
    >>> find_lr = create_lr_finder(model, criterion)
    >>> # plotting does not require GUI
    >>> output = find_lr(loader, plot_fpath="./lr_finder_plot.pdf")
    >>> # the original model's parameters are not modified!
    >>> print(model_parameter)
    >>> ## <the same tensor>
    >>> print(output.keys())
    >>> ## ('lr', 'batch_loss')
    >>> print(len(output["lr"]))
    >>> ## <number of iterations>

    Notes
    -----
    See this article
    https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html
    for what might be a better implementation: exponential smoothing and runs
    over a single epoch only. Maybe look at this one too:
    https://forums.fast.ai/t/automated-learning-rate-suggester/44199/15
    which talks about going the final step and choosing an lr automagically.
    """
    from copy import deepcopy

    # Old code:
    # new_model = deepcopy(model)
    if hasattr(model, "_args"):
        new_model = type(model)(*model._args)
    else:
        new_model = deepcopy(model)

    if create_engine is None:
        create_engine = create_lr_finder_engine

    if optim_fn_kwargs is None:
        optim_fn_kwargs = {}
    elif isinstance(optim_fn_kwargs, dict):
        optim_fn_kwargs = {key: value for key, value in optim_fn_kwargs.items() if key != "lr"}

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    new_model = new_model.to(device)
    optimizer = optim_fn(new_model.parameters(), lr=lr_init, **optim_fn_kwargs)

    lr_finder = create_engine(
        new_model,
        optimizer,
        criterion,
        device=device,
        non_blocking=non_blocking,
    )

    exp_scheduler = ExponentialLR(optimizer, gamma=1.1)
    scheduler = LRScheduler(exp_scheduler, save_history=True)
    lr_finder.add_event_handler(Events.ITERATION_COMPLETED, scheduler)

    logger = Logger()

    @lr_finder.on(Events.ITERATION_STARTED)
    def log_lr(engine):
        logger("lr", scheduler.get_param())

    @lr_finder.on(Events.ITERATION_COMPLETED)
    def log_batch_loss(engine):
        logger("batch_loss", engine.state.output)

    @lr_finder.on(Events.ITERATION_COMPLETED)
    def terminate_maybe(engine):
        loss_upper_bound = logger["batch_loss"][0] * 100
        if engine.state.output >= loss_upper_bound:
            engine.terminate()
            return
        if scheduler.get_param() > lr_final:
            engine.terminate()
            return

    @lr_finder.on(Events.COMPLETED)
    def attach_logger(engine):
        if not hasattr(engine.state, "logger"):
            setattr(engine.state, "logger", logger)

    def _get_smoothed_data(output, lr_min, lr_max):
        df = pd.DataFrame(output)
        df["log_lr"] = np.log(df.lr.values)
        df["log_loss"] = np.log(df.batch_loss.values)
        smoothed_log_loss = (
            df.set_index("log_lr")["log_loss"].rolling(10, center=True).mean().reset_index()
        )
        df["lr_smooth"] = np.exp(smoothed_log_loss.log_lr)
        df["batch_loss_smooth"] = np.exp(smoothed_log_loss.log_loss)
        df = df.dropna()
        df = df.loc[(df.lr >= lr_min) & (df.lr <= lr_max)]
        return df

    def _plot_helper(plot_fpath, df, lr_min, lr_max, figsize=None):
        import matplotlib.pyplot as plt

        if figsize is None:
            figsize = (8, 5)
        fig, ax = plt.subplots(1, 1, figsize=figsize)
        ax.plot(df.lr, df.batch_loss, label="unsmoothed")
        ax.plot(df.lr_smooth, df.batch_loss_smooth, label="smooth")
        ax.set_xscale("log")
        ax.set_yscale("log")
        ax.set_xlim(lr_min, lr_max)
        ax.set_xlabel("lr value")
        ax.set_ylabel("batch loss")
        ax.legend()
        fig.savefig(plot_fpath)
        del fig, ax
        return

    def _auto_lr_finder(output):
        from scipy.ndimage import gaussian_filter1d

        lr_vec = np.array(output["lr"])
        loss_vec = np.array(output["batch_loss"])
        idx = np.argmin(loss_vec)
        lr_centre = lr_vec[idx]
        lr_min = np.maximum(lr_centre / 100, np.min(lr_vec))
        lr_max = np.minimum(lr_centre * 1000, np.max(lr_vec))
        lr_values = lr_vec[(lr_vec >= lr_min) & (lr_vec <= lr_max)]
        batch_loss = loss_vec[(lr_vec >= lr_min) & (lr_vec <= lr_max)]
        batch_loss_sm = gaussian_filter1d(batch_loss, 1)
        d_batch_loss_sm = gaussian_filter1d(batch_loss, 1, order=1)
        idx_min = np.argmin(batch_loss_sm)
        idx_dec = np.argmin(d_batch_loss_sm[:idx_min])
        lr_star = lr_values[idx_dec]
        if lr_star > 1:
            print("warning: found lr_star > 1. returning 1e-2")
            lr_star = 1e-2
        return lr_star

    def find_lr(dataloader, max_epochs=100, plot_fpath=None, figsize=None):
        """
        find_lr(dataloader, max_epochs=100, plot_fpath=False)

        Parameters
        ----------
        dataloader : torch.utils.data.DataLoader
            dataloader
        max_epochs : int
            upper bound on number of epochs for which to run
        plot_fpath : string
            location of saved plot

        Returns
        -------
        output : dict
            Has keys 'lr' and 'batch_loss'.
        """
        final_state = lr_finder.run(dataloader, max_epochs)
        output = final_state.logger.log
        if isinstance(plot_fpath, str):
            lr_vec = output["lr"]
            loss_vec = output["batch_loss"]
            idx = np.argmin(loss_vec)
            lr_centre = lr_vec[idx]
            lr_min = np.maximum(lr_centre / 100, np.min(lr_vec))
            lr_max = np.minimum(lr_centre * 1000, np.max(lr_vec))
            df = _get_smoothed_data(output, lr_min, lr_max)
            _plot_helper(plot_fpath, df, lr_min, lr_max, figsize=figsize)
        lr_star = _auto_lr_finder(output)
        return lr_star

    return find_lr
def run_once(self):
    # self.log_path = 'log/%s/' % self.dataset
    # self.model_name = 'efficientnet-b0_MSI_{0}fold_random_tile_patch'.format(self.fold_idx)
    # self.log_dir = self.log_path + self.model_name
    log_dir = self.log_dir

    check_manual_seed(self.seed)
    train_pairs, valid_pairs = dataset.prepare_PAIP2020_PANDA(self.fold_idx)
    print(len(train_pairs))
    print(len(valid_pairs))

    train_augmentors = self.train_augmentors()
    train_dataset = dataset.DatasetSerial(train_pairs[:], self.tile_size, self.num_tile, train_mode=True)

    infer_augmentors = self.infer_augmentors()  # HACK at has_aux
    infer_dataset = dataset.DatasetSerial(valid_pairs[:], self.tile_size, self.num_tile, train_mode=False)

    train_loader = data.DataLoader(train_dataset,
                                   num_workers=self.nr_procs_train,
                                   batch_size=self.train_batch_size,
                                   shuffle=True, drop_last=True)
    valid_loader = data.DataLoader(infer_dataset,
                                   num_workers=self.nr_procs_valid,
                                   batch_size=self.infer_batch_size,
                                   shuffle=True, drop_last=False)

    # --------------------------- Training Sequence
    if self.logging:
        check_log_dir(log_dir)
    # device = 'cuda'

    # networks
    input_chs = 3  # TODO: dynamic config
    # ### VGGNet
    net = EfficientNet.from_pretrained('efficientnet-b0', num_classes=2)
    # net = DenseNet(3, 2)

    # load pre-trained models
    net = torch.nn.DataParallel(net).to(device)
    if self.load_network:
        saved_state = torch.load(self.save_net_path)
        net.load_state_dict(saved_state)

    # optimizers
    optimizer = optim.Adam(net.parameters(), lr=self.init_lr)
    scheduler = StepLR(optimizer, self.lr_steps, gamma=0.1)
    scheduler = LRScheduler(scheduler)

    #
    trainer = Engine(lambda engine, batch: self.train_step(net, batch, optimizer, device))
    valider = Engine(lambda engine, batch: self.infer_step(net, batch, device))

    infer_output = ['prob', 'true']

    ##
    if self.logging:
        checkpoint_handler = ModelCheckpoint(log_dir, self.chkpts_prefix,
                                             save_interval=1, n_saved=100, require_empty=False)
        # adding handlers using `trainer.add_event_handler` method API
        trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                                  handler=checkpoint_handler, to_save={'net': net})

    timer = Timer(average=True)
    timer.attach(trainer, start=Events.EPOCH_STARTED, resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED, step=Events.ITERATION_COMPLETED)
    timer.attach(valider, start=Events.EPOCH_STARTED, resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED, step=Events.ITERATION_COMPLETED)

    # attach running average metrics computation
    # decay of EMA to 0.95 to match tensorpack default
    # TODO: refactor this
    RunningAverage(alpha=0.95, output_transform=lambda x: x['acc']).attach(trainer, 'acc')
    RunningAverage(alpha=0.95, output_transform=lambda x: x['loss']).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])
    pbar.attach(valider)

    # # early stopping
    # def score_function(engine):
    #     val_acc = engine.state.metrics["valid-acc"]
    #     return val_acc
    # early_stopping_handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)

    # adding handlers using `trainer.on` decorator API
    @trainer.on(Events.EXCEPTION_RAISED)
    def handle_exception(engine, e):
        if isinstance(e, KeyboardInterrupt) and (engine.state.iteration > 1):
            engine.terminate()
            warnings.warn('KeyboardInterrupt caught. Exiting gracefully.')
            checkpoint_handler(engine, {'net_exception': net})
        else:
            raise e

    # writer for tensorboard logging
    tfwriter = None  # HACK temporary
    if self.logging:
        tfwriter = SummaryWriter(log_dir)
        json_log_file = log_dir + '/stats.json'
        with open(json_log_file, 'w') as json_file:
            json.dump({}, json_file)  # create empty file

    ### TODO refactor again
    log_info_dict = {
        'logging': self.logging,
        'optimizer': optimizer,
        'tfwriter': tfwriter,
        'json_file': json_log_file,
        'nr_classes': self.nr_classes,
        'metric_names': infer_output,
        'infer_batch_size': self.infer_batch_size,  # too cumbersome
    }
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_train_ema_results, log_info_dict)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, inference, valider, valid_loader, log_info_dict)
    valider.add_event_handler(Events.ITERATION_COMPLETED, accumulate_outputs)

    # Setup is done. Now let's run the training
    trainer.run(train_loader, self.nr_epochs)
    return
def attach_handlers(run, model, optimizer, learning_rule, trainer, evaluator, train_loader, val_loader, params):
    # Metrics
    UnitConvergence(model[0], learning_rule.norm).attach(trainer.engine, 'unit_conv')

    # Tqdm logger
    pbar = ProgressBar(persist=True, bar_format=config.IGNITE_BAR_FORMAT)
    pbar.attach(trainer.engine, metric_names='all')
    tqdm_logger = TqdmLogger(pbar=pbar)
    # noinspection PyTypeChecker
    tqdm_logger.attach_output_handler(
        evaluator.engine,
        event_name=Events.COMPLETED,
        tag="validation",
        global_step_transform=global_step_from_engine(trainer.engine),
    )

    # Evaluator
    evaluator.attach(trainer.engine, Events.EPOCH_COMPLETED(every=100), train_loader, val_loader)

    # Learning rate scheduling
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                     lr_lambda=lambda epoch: 1 - epoch / params['epochs'])
    lr_scheduler = LRScheduler(lr_scheduler)
    trainer.engine.add_event_handler(Events.EPOCH_COMPLETED, lr_scheduler)

    # Model checkpointing
    mc_handler = ModelCheckpoint(config.MODELS_DIR, run.replace('/', '-'), n_saved=1, create_dir=True,
                                 require_empty=False, global_step_transform=global_step_from_engine(trainer.engine))
    trainer.engine.add_event_handler(Events.EPOCH_COMPLETED, mc_handler, {'m': model})

    # Create a TensorBoard logger
    tb_logger = TensorboardLogger(log_dir=os.path.join(config.TENSORBOARD_DIR, run))
    images, labels = next(iter(train_loader))
    tb_logger.writer.add_graph(copy.deepcopy(model).cpu(), images)
    tb_logger.writer.add_hparams(params, {})
    # noinspection PyTypeChecker
    tb_logger.attach_output_handler(
        evaluator.engine,
        event_name=Events.COMPLETED,
        tag="validation",
        metric_names="all",
        global_step_transform=global_step_from_engine(trainer.engine),
    )
    # noinspection PyTypeChecker
    tb_logger.attach_output_handler(
        trainer.engine,
        event_name=Events.EPOCH_COMPLETED,
        tag="train",
        metric_names=["unit_conv"]
    )
    input_shape = tuple(next(iter(train_loader))[0].shape[1:])
    tb_logger.attach(trainer.engine,
                     log_handler=WeightsImageHandler(model, input_shape),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(trainer.engine, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.EPOCH_STARTED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=WeightsScalarHandler(model, layer_names=['linear1', 'linear2']),
    #                  event_name=Events.EPOCH_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=WeightsHistHandler(model, layer_names=['linear1', 'linear2']),
    #                  event_name=Events.EPOCH_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=ActivationsHistHandler(model, layer_names=['batch_norm', 'repu']),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=NumActivationsScalarHandler(model, layer_names=['repu']),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=ActivationsScalarHandler(model, reduction=torch.mean,
    #                                                       layer_names=['batch_norm', 'repu']),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=ActivationsScalarHandler(model, reduction=torch.std,
    #                                                       layer_names=['batch_norm', 'repu']),
    #                  event_name=Events.ITERATION_COMPLETED)

    return tb_logger
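A self-contained, hedged sketch of the per-epoch LambdaLR decay used above; the tiny model and no-op update step are placeholders, not the project's objects.

import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import LRScheduler  # ignite.handlers in newer releases

epochs = 10
model = torch.nn.Linear(2, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
lambda_lr = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda e: 1 - e / epochs)
scheduler = LRScheduler(lambda_lr)

trainer = Engine(lambda engine, batch: None)
trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)
trainer.run(range(3), max_epochs=5)  # lr shrinks linearly toward 0 as epochs complete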
model = model.to(device)

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=args.learning_rate,
    weight_decay=args.weight_decay,
    momentum=0.9,
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

scheduler = LRScheduler(
    torch.optim.lr_scheduler.OneCycleLR(optimizer, args.learning_rate,
                                        steps_per_epoch=len(train_loader), epochs=args.epochs))

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_classification_trainer(
    model,
    optimizer,
    loss_fn,
    device=device,
    use_f16=True,
)
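A hedged, self-contained sketch of the OneCycleLR pattern above: torch's OneCycleLR expects one step per batch, so the LRScheduler wrapper is attached to an iteration-level event; all values below are illustrative.

import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import LRScheduler  # ignite.handlers in newer releases

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
steps_per_epoch, epochs = 10, 3
one_cycle = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1,
                                                steps_per_epoch=steps_per_epoch, epochs=epochs)
scheduler = LRScheduler(one_cycle)

trainer = Engine(lambda engine, batch: None)
trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler)
trainer.run(range(steps_per_epoch), max_epochs=2)  # stays within the cycle's total_steps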
def range_test(self, train_loader, val_loader=None, end_lr=10, num_iter=100,
               step_mode="exp", smooth_f=0.05, diverge_th=5, suggestion=True):
    """Performs the learning rate range test.

    Arguments:
        train_loader (torch.utils.data.DataLoader): the training set data loader.
        val_loader (torch.utils.data.DataLoader, optional): if `None` the range test
            will only use the training loss. When given a data loader, the model is
            evaluated after each iteration on that dataset and the evaluation loss
            is used. Note that in this mode the test takes significantly longer but
            generally produces more precise results. Default: None.
        end_lr (float, optional): the maximum learning rate to test. Default: 10.
        num_iter (int, optional): the number of iterations over which the test
            occurs. Default: 100.
        step_mode (str, optional): one of the available learning rate policies,
            linear or exponential ("linear", "exp"). Default: "exp".
        smooth_f (float, optional): the loss smoothing factor within the [0, 1[
            interval. Disabled if set to 0, otherwise the loss is smoothed using
            exponential smoothing. Default: 0.05.
        diverge_th (int, optional): the test is stopped when the loss surpasses the
            threshold: diverge_th * best_loss. Default: 5.
        suggestion (bool, optional): whether to compute the suggested learning rate
            (minimal grad) and store it into {lr_finder_name}.lr_suggestion.
            Default: True.
    """
    self.logger.info("Learning rate search started")

    # Reset test results
    self.history = {"lr": [], "loss": []}
    self.best_loss = None

    # Initialize the proper learning rate policy
    if step_mode.lower() == "exp":
        lr_schedule = LRScheduler(ExponentialLR(self.optimizer, end_lr, num_iter))
    elif step_mode.lower() == "linear":
        lr_schedule = LRScheduler(LinearLR(self.optimizer, end_lr, num_iter))
    else:
        raise ValueError(f"expected one of (exp, linear), got {step_mode}")

    if smooth_f < 0 or smooth_f >= 1:
        raise ValueError("smooth_f is outside the range [0, 1[")

    trainer = create_supervised_trainer(self.model, self.optimizer, self.criterion,
                                        self.device, non_blocking=True)

    # if val_loader is provided, the average loss across the entire validation set is
    # computed after every training iteration: accurate but very slow
    if val_loader:
        evaluator = create_supervised_evaluator(self.model,
                                                metrics={"Loss": Loss(self.criterion)},
                                                device=self.device, non_blocking=True)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, lambda engine: evaluator.run(val_loader))

    # log the loss at the end of every train iteration
    def log_lr_and_loss(finder):
        loss = evaluator.state.metrics["Loss"] if val_loader else trainer.state.output
        lr = lr_schedule.lr_scheduler.get_lr()[0]
        finder.history["lr"].append(lr)
        if trainer.state.iteration == 1:
            finder.best_loss = loss
        else:
            if smooth_f > 0:
                loss = smooth_f * loss + (1 - smooth_f) * finder.history["loss"][-1]
            if loss < finder.best_loss:
                finder.best_loss = loss
        finder.history["loss"].append(loss)

    trainer.add_event_handler(Events.ITERATION_COMPLETED, lambda engine: log_lr_and_loss(self))

    # increase lr with every iteration
    trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_schedule)

    # Check if the loss has diverged; if it has, stop the trainer
    def loss_diverged(engine: Engine, finder):
        if finder.history["loss"][-1] > diverge_th * finder.best_loss:
            engine.terminate()
            finder.logger.info("Stopping early, the loss has diverged")

    trainer.add_event_handler(Events.ITERATION_COMPLETED, lambda engine: loss_diverged(engine, self))

    # run lr finder
    trainer.run(train_loader, 999)

    if suggestion:
        self.lr_suggestion = self._suggestion()

    self.logger.info("Learning rate search finished. See the graph with {finder_name}.plot()")
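A self-contained, hedged sketch of the core pattern range_test() implements: wrap a torch scheduler in ignite's LRScheduler, grow the lr every iteration, and record (lr, loss) pairs. torch's ExponentialLR stands in for the custom ExponentialLR(optimizer, end_lr, num_iter) used above; everything else below is illustrative.

import torch
from ignite.engine import Engine, Events, create_supervised_trainer
from ignite.contrib.handlers import LRScheduler

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)
criterion = torch.nn.MSELoss()
trainer = create_supervised_trainer(model, optimizer, criterion)

lr_schedule = LRScheduler(torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=1.2))
history = {"lr": [], "loss": []}

@trainer.on(Events.ITERATION_COMPLETED)
def record(engine):
    history["lr"].append(optimizer.param_groups[0]["lr"])
    history["loss"].append(engine.state.output)

trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_schedule)

data = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(20)]
trainer.run(data, max_epochs=1)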
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default=DATA_FOLDER, help="Path of the dataset.")
    parser.add_argument("--image_path", type=str, default=IMG_FOLDER, help="Path of the images.")
    parser.add_argument("--images_feature_path", type=str, default=IMG_FEATURE_FOLDER, help="Path of the images.")
    parser.add_argument("--dataset_cache", type=str, default=DATA_CACHE,
                        help="Path of the dataset cache_no_pretrained")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model")
    parser.add_argument('--dhead_gpt2', action='store_true', default=False, help="use double head gpt2")
    parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step")
    parser.add_argument('--pretrained', action='store_true', default=True, help="If False train from scratch")
    parser.add_argument("--num_candidates", type=int, default=1, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=3, help="Number of previous turns to keep in history")
    parser.add_argument("--max_length", type=int, default=256, help="Max length of input sentence")
    parser.add_argument("--train_batch_size", type=int, default=58, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=32, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=9,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--scheduler", type=str, default="linear", choices=['noam', 'linear'], help="method of optim")
    parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)")
    parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps")
    parser.add_argument("--lm_coef", type=float, default=2.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=50, help="Number of training epochs")
    parser.add_argument("--num_workers", type=int, default=0, help="Number of subprocesses for data loading")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="O1",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = BertTokenizer
    config_class = GPT2Config  # GPT2Config if "gpt2" in args.model_checkpoint else OpenAIGPTConfig
    model_class = GPT2LMHeadModel  # GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    if args.pretrained:
        tokenizer = tokenizer_class.from_pretrained(MODEL_CHECKPOINT, do_lower_case=False)
        # tokenizer = tokenizer_class(vocab_file=VOCAB_PATH, do_lower_case=True)
        model = model_class.from_pretrained(MODEL_CHECKPOINT)
    else:
        tokenizer = tokenizer_class(vocab_file=VOCAB_PATH, do_lower_case=False)
        tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
        config = config_class.from_json_file(CONFIG_PATH)
        model = model_class(config)
    model.to(args.device)
    # Add special tokens if they are not already added
    # add_special_tokens_(model, tokenizer)
    # optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    optimizer = AdamW([{'params': model.parameters(), 'initial_lr': args.lr}], lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = build_dataloader(args, tokenizer, logger)

    def update(engine, batch):
        model.train()
        batch = tuple(torch.tensor(input_data).to(args.device) if idx not in [2, 3] else input_data
                      for idx, input_data in enumerate(batch))
        input_ids, token_type_ids, input_images, image_ids, lm_labels, mc_token_ids, mc_labels = batch
        if args.dhead_gpt2:
            (lm_loss), (mc_loss), *_ = model(input_ids, token_type_ids=token_type_ids,
                                             mc_token_ids=mc_token_ids, mc_labels=mc_labels, lm_labels=lm_labels)
            loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        else:
            (lm_loss), *_ = model(input_ids, labels=lm_labels, token_type_ids=token_type_ids,
                                  input_images=input_images, image_ids=image_ids)
            loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()  # , optimizer.param_groups[0]['lr']

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            if args.dhead_gpt2:
                lm_logits, mc_logits, *_ = model(
                    input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
                )
                lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
                lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
                return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
            else:
                lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids)
                lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
                lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
                return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    # trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Noam decrease of the learning rate (or linear decrease from lr to zero when --scheduler linear)
    model_size = args.n_emd
    noam_lambda = lambda step: (
        model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * args.warmup_steps ** (-1.5)))
    noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints
    # and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
        #                                                       another_engine=trainer),
        #                  event_name=Events.EPOCH_COMPLETED)
        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', n_saved=None)
        trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
                  os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
    else:
        logger.info('Current model CANNOT BEAT the previous best model, previous best accuracy is %.5f',
                    best_res['acc'])


def score_function(engine):
    return engine.state.metrics['accuracy']


if not args.evaluation_mode:
    '''If current run is training'''
    train_data_loader, _, _, _ = get_pytorch_dataloader(args, train_file_name_prefix, shuffle=True)
    optimizer = Adam(model.parameters(), lr=args.lr)
    '''Learning rate decays every 5 epochs'''
    optimizer_scheduler = StepLR(optimizer, step_size=5, gamma=0.5)
    scheduler = LRScheduler(optimizer_scheduler)

    trainer = Engine(train)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, lambda _: evaluator.run(dev_data_loader))

    pbar = ProgressBar(persist=True, desc='Training')
    pbar.attach(trainer, metric_names=["loss"])
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=args.loss_log_interval),
                              lambda engine: logger.info('Loss at iteration %d is %.5f',
                                                         engine.state.iteration, engine.state.metrics['loss']))

    early_stop_handler = EarlyStopping(patience=args.patience, score_function=score_function, trainer=trainer)
    evaluator.add_event_handler(Events.COMPLETED,