def _setup_common_training_handlers( trainer: Engine, to_save: Optional[Mapping] = None, save_every_iters: int = 1000, output_path: Optional[str] = None, lr_scheduler: Optional[Union[ParamScheduler, _LRScheduler]] = None, with_gpu_stats: bool = False, output_names: Optional[Iterable[str]] = None, with_pbars: bool = True, with_pbar_on_iters: bool = True, log_every_iters: int = 100, stop_on_nan: bool = True, clear_cuda_cache: bool = True, save_handler: Optional[Union[Callable, BaseSaveHandler]] = None, **kwargs: Any, ) -> None: if output_path is not None and save_handler is not None: raise ValueError( "Arguments output_path and save_handler are mutually exclusive. Please, define only one of them" ) if stop_on_nan: trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) if lr_scheduler is not None: if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler): trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: cast(_LRScheduler, lr_scheduler).step()) elif isinstance(lr_scheduler, LRScheduler): trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_scheduler) else: trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler) if torch.cuda.is_available() and clear_cuda_cache: trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache) if to_save is not None: if output_path is None and save_handler is None: raise ValueError( "If to_save argument is provided then output_path or save_handler arguments should be also defined" ) if output_path is not None: save_handler = DiskSaver(dirname=output_path, require_empty=False) checkpoint_handler = Checkpoint(to_save, cast(Union[Callable, BaseSaveHandler], save_handler), filename_prefix="training", **kwargs) trainer.add_event_handler( Events.ITERATION_COMPLETED(every=save_every_iters), checkpoint_handler) if with_gpu_stats: GpuInfo().attach( trainer, name="gpu", event_name=Events.ITERATION_COMPLETED( every=log_every_iters) # type: ignore[arg-type] ) if output_names is not None: def output_transform(x: Any, index: int, name: str) -> Any: if isinstance(x, Mapping): return x[name] elif isinstance(x, Sequence): return x[index] elif isinstance(x, (torch.Tensor, numbers.Number)): return x else: raise TypeError( "Unhandled type of update_function's output. " f"It should either mapping or sequence, but given {type(x)}" ) for i, n in enumerate(output_names): RunningAverage(output_transform=partial(output_transform, index=i, name=n), epoch_bound=False).attach(trainer, n) if with_pbars: if with_pbar_on_iters: ProgressBar(persist=False).attach( trainer, metric_names="all", event_name=Events.ITERATION_COMPLETED(every=log_every_iters)) ProgressBar(persist=True, bar_format="").attach(trainer, event_name=Events.EPOCH_STARTED, closing_event_name=Events.COMPLETED)
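For orientation, a minimal usage sketch of the public wrapper in ignite.contrib.engines.common that forwards to this helper; the trainer, model, optimizer, lr_scheduler and the output path below are placeholders, not part of the snippet above.

from ignite.contrib.engines import common

common.setup_common_training_handlers(
    trainer,                                   # an existing ignite Engine (assumed)
    to_save={"model": model, "optimizer": optimizer},
    save_every_iters=1000,
    output_path="/tmp/checkpoints",            # mutually exclusive with save_handler
    lr_scheduler=lr_scheduler,                 # stepped on iteration events as shown above
    output_names=["loss"],                     # running averages exposed to the progress bar
    with_pbars=True,
    log_every_iters=100,
)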
def _test(save_history): tensor = torch.ones([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0.001) max_epochs = 25 lr_max_value = 0.4 num_iterations_per_epoch = 128 num_iterations = max_epochs * num_iterations_per_epoch warmup_duration = 5 * num_iterations_per_epoch cooldown_duration = 5 * num_iterations_per_epoch scheduler_1 = LinearCyclicalScheduler( optimizer, "lr", start_value=lr_max_value, end_value=lr_max_value * 0.9, cycle_size=(num_iterations - warmup_duration - cooldown_duration) * 2, ) scheduler_2 = LinearCyclicalScheduler( optimizer, "lr", start_value=lr_max_value, end_value=0.0, cycle_size=cooldown_duration * 2 ) lr_scheduler = ConcatScheduler( schedulers=[scheduler_1, scheduler_2], durations=[num_iterations - warmup_duration - cooldown_duration], save_history=False, ) lr_values = [None] * num_iterations scheduler = create_lr_scheduler_with_warmup( lr_scheduler, warmup_start_value=0.0, warmup_end_value=lr_max_value, warmup_duration=warmup_duration, save_history=save_history, output_simulated_values=lr_values, ) state_dict = scheduler.state_dict() trainer = Engine(lambda engine, batch: None) @trainer.on(Events.ITERATION_COMPLETED) def save_lr(engine): lrs.append(optimizer.param_groups[0]["lr"]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) data = [0] * num_iterations_per_epoch for _ in range(2): lrs = [] trainer.run(data, max_epochs=max_epochs) assert lrs == pytest.approx([v for i, v in lr_values]) if save_history: param_history = trainer.state.param_history["lr"] assert lrs == pytest.approx([v[0] for v in param_history]) trainer.state.param_history = None scheduler.load_state_dict(state_dict)
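The output_simulated_values argument exercised above can also be used on its own to inspect a warmup schedule before any training run; a hedged sketch with made-up hyper-parameters (import paths follow the older ignite.contrib.handlers layout used by these tests).

import torch
from ignite.contrib.handlers import CosineAnnealingScheduler, create_lr_scheduler_with_warmup

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.0)

cosine = CosineAnnealingScheduler(optimizer, "lr", start_value=0.1, end_value=0.0, cycle_size=900)
simulated = [None] * 1000  # one slot per event, filled with (event_index, lr) pairs
create_lr_scheduler_with_warmup(
    cosine,
    warmup_start_value=0.0,
    warmup_end_value=0.1,
    warmup_duration=100,
    output_simulated_values=simulated,
)
print(simulated[:3], simulated[-1])  # quick sanity check of the warmup ramp and the final lr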
def test_concat_scheduler_3_schedulers(): tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.5, cycle_size=20) scheduler_2 = LinearCyclicalScheduler(optimizer, "lr", start_value=0.5, end_value=0.45, cycle_size=10) scheduler_3 = LinearCyclicalScheduler(optimizer, "lr", start_value=0.5, end_value=0.0, cycle_size=20) durations = [10, 5] concat_scheduler = ConcatScheduler( schedulers=[scheduler_1, scheduler_2, scheduler_3], durations=durations, save_history=True ) state_dict = concat_scheduler.state_dict() data = [0] * 10 max_epochs = 2 simulated_values = ConcatScheduler.simulate_values( num_events=len(data) * max_epochs, schedulers=[scheduler_1, scheduler_2, scheduler_3], durations=durations ) def save_lr(engine): lrs.append(optimizer.param_groups[0]["lr"]) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, concat_scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) for _ in range(2): lrs = [] trainer.run(data, max_epochs=max_epochs) assert lrs == list( map( pytest.approx, [ # Cycle 1 of the first LinearCyclicalScheduler 1.0, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, # Cycle 1 of the second LinearCyclicalScheduler 0.5, 0.49, 0.48, 0.47, 0.46, # Cycle 1 of the third LinearCyclicalScheduler 0.5, 0.45, 0.4, 0.35, 0.3, ], ) ) state_lrs = trainer.state.param_history["lr"] assert len(state_lrs) == len(lrs) # Unpack singleton lists assert [group[0] for group in state_lrs] == lrs assert lrs == pytest.approx([v for i, v in simulated_values]) concat_scheduler.load_state_dict(state_dict) trainer.state.param_history = None
def train(args): device = torch.device("cuda" if args.cuda else "cpu") train_loader = check_dataset(args) transformer = TransformerNet().to(device) optimizer = Adam(transformer.parameters(), args.lr) mse_loss = torch.nn.MSELoss() vgg = Vgg16(requires_grad=False).to(device) style_transform = transforms.Compose( [transforms.ToTensor(), transforms.Lambda(lambda x: x.mul(255))]) style = utils.load_image(args.style_image, size=args.style_size) style = style_transform(style) style = style.repeat(args.batch_size, 1, 1, 1).to(device) features_style = vgg(utils.normalize_batch(style)) gram_style = [utils.gram_matrix(y) for y in features_style] running_avgs = OrderedDict() def step(engine, batch): x, _ = batch x = x.to(device) n_batch = len(x) optimizer.zero_grad() y = transformer(x) x = utils.normalize_batch(x) y = utils.normalize_batch(y) features_x = vgg(x) features_y = vgg(y) content_loss = args.content_weight * mse_loss(features_y.relu2_2, features_x.relu2_2) style_loss = 0.0 for ft_y, gm_s in zip(features_y, gram_style): gm_y = utils.gram_matrix(ft_y) style_loss += mse_loss(gm_y, gm_s[:n_batch, :, :]) style_loss *= args.style_weight total_loss = content_loss + style_loss total_loss.backward() optimizer.step() return { "content_loss": content_loss.item(), "style_loss": style_loss.item(), "total_loss": total_loss.item(), } trainer = Engine(step) checkpoint_handler = ModelCheckpoint( args.checkpoint_model_dir, "checkpoint", n_saved=10, require_empty=False, create_dir=True, ) progress_bar = Progbar(loader=train_loader, metrics=running_avgs) trainer.add_event_handler( event_name=Events.EPOCH_COMPLETED(every=args.checkpoint_interval), handler=checkpoint_handler, to_save={"net": transformer}, ) trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=progress_bar) trainer.run(train_loader, max_epochs=args.epochs)
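The Progbar used above is a project-local helper; a hedged sketch of equivalent wiring with ignite's built-in ProgressBar plus RunningAverage over the dict returned by step (an alternative, not the original example's code).

from ignite.contrib.handlers import ProgressBar
from ignite.metrics import RunningAverage

for key in ("content_loss", "style_loss", "total_loss"):
    # engine.state.output is the dict returned by step(); track a running average of each entry
    RunningAverage(output_transform=lambda out, k=key: out[k]).attach(trainer, key)

ProgressBar(persist=True).attach(trainer, metric_names=["content_loss", "style_loss", "total_loss"])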
class Señalizador(nn.Module): def __init__(self, train_data, validation_data, test_data): super(type(self), self).__init__() self.conv1 = torch.nn.Conv2d(kernel_size=7, in_channels=3, out_channels=18, bias=True) self.conv2 = torch.nn.Conv2d(kernel_size=5, in_channels=18, out_channels=18, bias=True) self.conv3 = torch.nn.Conv2d(kernel_size=3, in_channels=18, out_channels=36) self.conv4 = torch.nn.Conv2d(kernel_size=3, in_channels=36, out_channels=64) self.conv5 = torch.nn.Conv2d(kernel_size=3, in_channels=64, out_channels=64, bias=True) self.conv6 = torch.nn.Conv2d(kernel_size=3, in_channels=64, out_channels=128, bias=True) self.conv7 = torch.nn.Conv2d(kernel_size=3, in_channels=128, out_channels=254, padding=1, bias=True) self.mpool = torch.nn.MaxPool2d(kernel_size=2) self.activation = torch.nn.ReLU() self.linear1 = torch.nn.Linear(in_features=254, out_features=128) self.linear2 = torch.nn.Linear(in_features=128, out_features=16) self.linear3 = torch.nn.Linear(in_features=16, out_features=4) self.dropout = torch.nn.Dropout(p=0.2) self.train_data = train_data self.validation_data = validation_data self.test_data = test_data self.device = torch.device('cuda:0') self.optimizer = torch.optim.Adam(self.parameters(), lr=5e-3) self.criterion = torch.nn.CrossEntropyLoss(reduction='sum') self = self.to(self.device) self.loaders() def forward(self, x): x = self.mpool(self.activation(self.conv1(x))) x = self.mpool(self.activation(self.conv2(x))) x = self.activation(self.conv3(x)) x = self.mpool(self.activation(self.conv4(x))) x = self.mpool(self.activation(self.conv5(x))) x = self.mpool(self.activation(self.conv6(x))) x = self.mpool(self.activation(self.conv7(x))) x = x.view(-1, self.linear1.in_features) x = self.activation(self.linear1(x)) x = self.activation(self.linear2(x)) x = self.linear3(x) return x def loaders(self): self.train_loader = DataLoader(self.train_data, shuffle=True, batch_size=32) self.valid_loader = DataLoader(self.validation_data, shuffle=False, batch_size=256) self.test_loader = DataLoader(self.test_data, shuffle=False, batch_size=512) print("Loaders initialized") def load_checkpoint(self, dir): self.load_state_dict(torch.load(dir)) def train_one_step(self, engine, batch): self.optimizer.zero_grad() x, y = batch x, y = x.to(self.device), y.to(self.device) yhat = self.forward(x) loss = self.criterion(yhat, y) loss.backward() self.optimizer.step() del x del y torch.cuda.empty_cache() return loss.item() # This output can later be read as trainer.state.output def evaluate_one_step(self, engine, batch): with torch.no_grad(): x, y = batch x, y = x.to(self.device), y.to(self.device) yhat = self.forward(x) del x loss = self.criterion(yhat, y) torch.cuda.empty_cache() return yhat, y def train_epochs(self, max_epochs): self.trainer = Engine(self.train_one_step) self.evaluator = Engine(self.evaluate_one_step) self.metrics = {'Loss': Loss(self.criterion), 'Acc': Accuracy()} for name, metric in self.metrics.items(): metric.attach(self.evaluator, name) with SummaryWriter( log_dir="/tmp/tensorboard/Transform" + str(type(self))[17:len(str(type(self))) - 2]) as writer: @self.trainer.on(Events.EPOCH_COMPLETED(every=1)) # Every epoch def log_results(engine): # Evaluate on the training set self.eval() self.evaluator.run(self.train_loader) writer.add_scalar("train/loss", self.evaluator.state.metrics['Loss'], engine.state.epoch) writer.add_scalar("train/accy", self.evaluator.state.metrics['Acc'], engine.state.epoch) # Evaluate on the validation set self.evaluator.run(self.valid_loader)
writer.add_scalar("valid/loss", self.evaluator.state.metrics['Loss'], engine.state.epoch) writer.add_scalar("valid/accy", self.evaluator.state.metrics['Acc'], engine.state.epoch) self.train() # Guardo el mejor modelo en validación best_model_handler = ModelCheckpoint( dirname='.', require_empty=False, filename_prefix="best", n_saved=1, score_function=lambda engine: -engine.state.metrics['Loss'], score_name="val_loss") # Lo siguiente se ejecuta cada ves que termine el loop de validación self.evaluator.add_event_handler( Events.COMPLETED, best_model_handler, { f'Transform{str(type(self))[17:len(str(type(self)))-2]}': model }) self.trainer.run(self.train_loader, max_epochs=max_epochs) def test(self, confussion, report): self.eval() test_targets = np.array(self.test_data.targets) prediction_test = [] for mbdata, label in self.test_loader: mbdata = mbdata.to(self.device) logits = self.forward(mbdata).to("cpu") prediction_test.append(logits.argmax(dim=1).detach().numpy()) del mbdata del logits torch.cuda.empty_cache() prediction_test = np.concatenate(prediction_test) cm = confusion_matrix(test_targets, prediction_test) if (confussion): display(cm) if (report): print(classification_report(test_targets, prediction_test)) self.train() return cm
def test_piecewiselinear(milestones_as_np_int): tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) milestones_values = [(5, 0.5), (15, 1.0), (25, 0.0), (35, 1.0), (40, 0.5)] if milestones_as_np_int: milestones_values = [(np.int64(t), v) for t, v in milestones_values] scheduler = PiecewiseLinear(optimizer, "lr", milestones_values=milestones_values) state_dict = scheduler.state_dict() def save_lr(engine): lrs.append(optimizer.param_groups[0]["lr"]) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) for _ in range(2): lrs = [] trainer.run([0] * 25, max_epochs=2) assert lrs == list( map( pytest.approx, [ 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ], ) ) scheduler.load_state_dict(state_dict)
def attach(self, engine: Engine) -> None: """ Args: engine: Ignite Engine, it can be a trainer, validator or evaluator. """ engine.add_event_handler(IterationEvents.MODEL_COMPLETED, self)
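IterationEvents.MODEL_COMPLETED is a custom event, so this attach only works on engines that registered it beforehand; a hedged, self-contained sketch of how such an event enum is typically registered and fired (the enum and step function here are illustrative, not the library's actual definitions).

from ignite.engine import Engine, EventEnum

class IterationEvents(EventEnum):
    MODEL_COMPLETED = "model_completed"

def step(engine, batch):
    # ... forward pass, loss, backward ...
    engine.fire_event(IterationEvents.MODEL_COMPLETED)

engine = Engine(step)
engine.register_events(*IterationEvents)  # must happen before handlers can use the custom event
engine.add_event_handler(IterationEvents.MODEL_COMPLETED, lambda e: print("model step done"))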
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Add progressbar with loss RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") ProgressBar(persist=True).attach(trainer, metric_names=['loss']) # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(dataloader) * args.n_epochs) scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.n_warmup) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Save checkpoints and training config checkpoint_handler = ModelCheckpoint(args.log_dir, 'checkpoint', save_interval=1, n_saved=5) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': model}) torch.save(args, os.path.join(args.log_dir, 'training_args.bin')) trainer.run(dataloader, max_epochs=args.n_epochs)
def train(self): train_input_ids, train_token_type_ids, \ train_attention_mask, train_label_ids = self.get_X_y_ids(self.train_path) dev_input_ids, dev_token_type_ids, \ dev_attention_mask, dev_label_ids = self.get_X_y_ids(self.dev_path) train_ds = TensorDataset(train_input_ids, train_token_type_ids, train_attention_mask, train_label_ids) dev_ds = TensorDataset(dev_input_ids, dev_token_type_ids, dev_attention_mask, dev_label_ids) batch_size = self.n_gpu * self.per_gpu_batch_size train_iter = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True) dev_iter = DataLoader(dev_ds, batch_size=batch_size, shuffle=True, drop_last=True) model = CLS_Model(vocab_size=self.bert_tokenizer.vocab_size, embed_size=self.embed_size, num_labels=len(self.label_list), dense_layer_type=self.dense_layer_type, dropout=self.dropout, embed_type=self.embed_type, max_len=self.max_seq_len, model_name_or_path=self.model_name_or_path, vector_file=self.vector_file) model.to(self.device) if self.n_gpu > 1: model = torch.nn.DataParallel(model) logger.info("model.named_parameters()") for n, p in model.named_parameters(): logger.info(n) parameters = [{ "params": [p for n, p in model.named_parameters() if "bert" in n], "lr": self.bert_lr }, { "params": [p for n, p in model.named_parameters() if "bert" not in n], "lr": self.normal_lr }] optimizer = torch.optim.AdamW(parameters, lr=self.normal_lr) tb_writer = SummaryWriter() def train_fn(engine, batch): model.train() optimizer.zero_grad() batch = tuple(t.to(self.device) for t in batch) labels = batch[3] inputs = { "input_ids": batch[0], "token_type_ids": batch[1], "attention_mask": batch[2], "label_ids": labels } loss, sequence_tags = model(**inputs) score = f1_score(labels.detach().cpu().numpy(), y_pred=sequence_tags.detach().cpu().numpy(), average="macro") if self.n_gpu > 1: loss = loss.mean() ## tensorboard global_step = global_step_from_engine(engine)( engine, engine.last_event_name) # tb_writer.add_scalar('learning_rate', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('train_loss', loss.item(), global_step) tb_writer.add_scalar('train_score', score, global_step) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 3.0) optimizer.step() return loss.item(), score trainer = Engine(train_fn) RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, 'score') def dev_fn(engine, batch): model.eval() optimizer.zero_grad() with torch.no_grad(): batch = tuple(t.to(self.device) for t in batch) labels = batch[3] inputs = { "input_ids": batch[0], "token_type_ids": batch[1], "attention_mask": batch[2], "label_ids": labels } loss, sequence_tags = model(**inputs) score = f1_score(labels.detach().cpu().numpy(), y_pred=sequence_tags.detach().cpu().numpy(), average="macro") if self.n_gpu > 1: loss = loss.mean() ## tensorboard global_step = global_step_from_engine(engine)( engine, engine.last_event_name) # tb_writer.add_scalar('learning_rate', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('dev_loss', loss.item(), global_step) tb_writer.add_scalar('dev_score', score, global_step) return loss.item(), score dev_evaluator = Engine(dev_fn) RunningAverage(output_transform=lambda x: x[0]).attach( dev_evaluator, 'loss') RunningAverage(output_transform=lambda x: x[1]).attach( dev_evaluator, 'score') pbar = ProgressBar(persist=True, bar_format="") pbar.attach(trainer, ['loss', 'score']) pbar.attach(dev_evaluator, ['loss', 'score']) def score_fn(engine): loss = 
engine.state.metrics['loss'] score = engine.state.metrics['score'] ''' if score < 0.5: logger.info("Too low to learn!") trainer.terminate() ''' return score / (loss + 1e-12) handler = EarlyStopping(patience=self.patience, score_function=score_fn, trainer=trainer) dev_evaluator.add_event_handler(Events.COMPLETED, handler) @trainer.on(Events.EPOCH_COMPLETED) def log_dev_results(engine): dev_evaluator.run(dev_iter) dev_metrics = dev_evaluator.state.metrics avg_score = dev_metrics['score'] avg_loss = dev_metrics['loss'] logger.info( "Validation Results - Epoch: {} Avg score: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_score, avg_loss)) def model_score(engine): score = engine.state.metrics['score'] return score checkpointer = ModelCheckpoint( self.output_dir, "cmed_qq", n_saved=self.n_saved, create_dir=True, score_name="model_score", score_function=model_score, global_step_transform=global_step_from_engine(trainer), require_empty=False) dev_evaluator.add_event_handler(Events.COMPLETED, checkpointer, { self.model_name: model.module if hasattr(model, 'module') else model }) # Clear cuda cache between training/testing def empty_cuda_cache(engine): torch.cuda.empty_cache() import gc gc.collect() trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache) dev_evaluator.add_event_handler(Events.COMPLETED, empty_cuda_cache) trainer.run(train_iter, max_epochs=self.max_epochs)
def test_linear_scheduler(): with pytest.raises(ValueError): LinearCyclicalScheduler({}, 'lr', 1, 0, cycle_size=0) with pytest.raises(ValueError): LinearCyclicalScheduler({}, 'lr', 1, 0, cycle_size=1) tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) scheduler = LinearCyclicalScheduler(optimizer, 'lr', 1, 0, 10) lrs = [] def save_lr(engine): lrs.append(optimizer.param_groups[0]['lr']) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) trainer.run([0] * 10, max_epochs=2) assert lrs == list( map( pytest.approx, [ # Cycle 1 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8, # Cycle 2 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8, ])) optimizer = torch.optim.SGD([tensor], lr=0) scheduler = LinearCyclicalScheduler(optimizer, 'lr', 1, 0, 10, cycle_mult=2) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) lrs = [] trainer.run([0] * 10, max_epochs=3) assert lrs == list( map( pytest.approx, [ # Cycle 1 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8, # Cycle 2 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, ])) # With float cycle_size optimizer = torch.optim.SGD([tensor], lr=0) scheduler = LinearCyclicalScheduler(optimizer, 'lr', start_value=1.2, end_value=0.2, cycle_size=10.00000012, cycle_mult=1.0) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) lrs = [] trainer.run([0] * 10, max_epochs=2) assert lrs == list( map( pytest.approx, [ # Cycle 1 1.2, 1.0, 0.8, 0.6, 0.4, 0.2, 0.4, 0.6, 0.8, 1.0, # Cycle 2 1.2, 1.0, 0.8, 0.6, 0.4, 0.2, 0.4, 0.6, 0.8, 1.0, ]))
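The same triangular schedules can be sanity-checked without building an Engine at all via the simulate_values classmethod, much like ConcatScheduler.simulate_values is used elsewhere in these tests; a hedged sketch whose keyword names assume ignite's ParamScheduler.simulate_values signature.

from ignite.contrib.handlers import LinearCyclicalScheduler

values = LinearCyclicalScheduler.simulate_values(
    num_events=20, param_name="lr", start_value=1.0, end_value=0.0, cycle_size=10
)
# values is a list of [event_index, lr] pairs; the first cycle should match the test above
print([lr for _, lr in values[:10]])  # expected: 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8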
def test_concat_scheduler_two_linear(): tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=0.0, end_value=0.1, cycle_size=2) scheduler_2 = LinearCyclicalScheduler(optimizer, "lr", start_value=0.2, end_value=1.0, cycle_size=2) durations = [ 5, ] concat_scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations, save_history=True) assert concat_scheduler.get_param() == 0.0 data = [0] * 10 max_epochs = 2 simulated_values = ConcatScheduler.simulate_values( num_events=len(data) * max_epochs, schedulers=[scheduler_1, scheduler_2], durations=durations) lrs = [] def save_lr(engine): lrs.append(optimizer.param_groups[0]['lr']) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, concat_scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) trainer.run(data, max_epochs=max_epochs) assert lrs == list( map( pytest.approx, [ # first LinearCyclicalScheduler 0.0, 0.1, 0.0, 0.1, 0.0, # second LinearCyclicalScheduler 0.2, 1.0, 0.2, 1.0, 0.2, 1.0, 0.2, 1.0, 0.2, 1.0, 0.2, 1.0, 0.2, 1.0, 0.2, ])) state_lrs = trainer.state.param_history['lr'] assert len(state_lrs) == len(lrs) # Unpack singleton lists assert [group[0] for group in state_lrs] == lrs assert lrs == pytest.approx([v for i, v in simulated_values])
class TrajPredEngine: def __init__(self, net, optim, train_loader, val_loader, args): self.net = net self.args = args self.pretrainEpochs = args["pretrainEpochs"] self.trainEpochs = args["trainEpochs"] self.optim = optim self.train_loader = train_loader self.val_loader = val_loader self.cuda = args['cuda'] self.device = args['device'] self.dsId = self.args['dsId'] self.n_iterations = max(len(train_loader), len(train_loader) / args["batch_size"]) ## training metrics to keep track of, consider making a metrics class # remember to 0 these out self.avg_trn_loss = 0 self.metrics = {"Avg train loss": 0, "Avg val loss": 0} ## validation metrics self.avg_val_loss = 0 self.val_batch_count = 1 # only if using maneuvers self.avg_lat_acc = 0 self.avg_lon_acc = 0 self.trainer = None self.evaluator = None self.makeTrainer() self.save_name = args['name'] # testing stuff wow need 2 clean this so bad self.lossVals = torch.zeros(self.args['out_length']).cuda( self.device) if self.cuda else torch.zeros(self.args['out_length']) self.counts = torch.zeros(self.args['out_length']).cuda( self.device) if self.cuda else torch.zeros(self.args['out_length']) self.lastTestLoss = 0 self.writer = None self.log_dir = args['log_dir'] self.tensorboard = args['tensorboard'] def netPred(self, batch): raise NotImplementedError def saveModel(self, engine): os.makedirs(self.args['modelLoc'], exist_ok=True) name = os.path.join(self.args['modelLoc'], self.args['name']) torch.save(self.net.state_dict(), name) print("Model saved {}.".format(name)) def train_a_batch(self, engine, batch): self.net.train_flag = True epoch = engine.state.epoch _, _, _, _, _, _, _, fut, op_mask = batch fut_pred = self.netPred(batch) if self.cuda: fut = fut.cuda(self.device) op_mask = op_mask.cuda(self.device) if epoch < self.pretrainEpochs: if self.args["pretrain_loss"] == 'MSE': l = maskedMSE(fut_pred, fut, op_mask, device=self.device) elif self.args['pretrain_loss'] == 'NLL': l = maskedNLL(fut_pred, fut, op_mask, device=self.device) else: l = maskedMSE(fut_pred, fut, op_mask, device=self.device) else: if self.args["train_loss"] == 'MSE': l = maskedMSE(fut_pred, fut, op_mask, device=self.device) elif self.args['train_loss'] == 'NLL': l = maskedNLL(fut_pred, fut, op_mask, device=self.device) else: l = maskedNLL(fut_pred, fut, op_mask, device=self.device) # if self.args['nll_only']: # l = maskedNLL(fut_pred, fut, op_mask) # else: # if epoch < self.pretrainEpochs: # l = maskedMSE(fut_pred, fut, op_mask) # else: # l = maskedNLL(fut_pred, fut, op_mask) # Backprop and update weights # if l.item() != l.item(): # print(l.item()) # exit(1) # return 1 self.optim.zero_grad() l.backward() self.optim.step() # Track average train loss: self.avg_trn_loss += l.item() self.metrics["Avg train loss"] += l.item() / 100.0 if self.writer: self.writer.add_scalar( "{}epoch/trainingloss".format(engine.state.epoch), l.item(), engine.state.iteration) return l.item() def eval_a_batch(self, engine, batch): self.net.train_flag = False epoch = engine.state.epoch _, _, _, _, _, _, _, fut, op_mask = batch fut_pred = self.netPred(batch) if self.cuda: fut = fut.cuda(self.device) op_mask = op_mask.cuda(self.device) # Forward pass if epoch < self.pretrainEpochs: if self.args["pretrain_loss"] == 'MSE': l = maskedMSE(fut_pred, fut, op_mask, device=self.device) elif self.args['pretrain_loss'] == 'NLL': l = maskedNLL(fut_pred, fut, op_mask, device=self.device) else: l = maskedMSE(fut_pred, fut, op_mask, device=self.device) else: if self.args["train_loss"] == 'MSE': l = maskedMSE(fut_pred, 
fut, op_mask, device=self.device) elif self.args['train_loss'] == 'NLL': l = maskedNLL(fut_pred, fut, op_mask, device=self.device) else: l = maskedNLL(fut_pred, fut, op_mask, device=self.device) # if self.args['nll_only']: # l = maskedNLL(fut_pred, fut, op_mask) # else: # if epoch_num < pretrainEpochs: # l = maskedMSE(fut_pred, fut, op_mask) # else: # l = maskedNLL(fut_pred, fut, op_mask) self.avg_val_loss += l.item() self.metrics["Avg val loss"] += l.item() / (self.val_batch_count * 100.0) self.val_batch_count += 1 return fut_pred, fut def validate(self, engine): self.evaluator.run(self.val_loader) max_epochs = self.args["pretrainEpochs"] + self.args["trainEpochs"] # if not self.eval_only: print("{}/{} Epochs in dataset{}".format(engine.state.epoch, max_epochs, self.dsId)) # print(max((engine.state.epoch / max_epochs) * 100,1)) print("EPOCH {}: Train loss: {} Val loss: {}\n".format( engine.state.epoch, self.metrics["Avg train loss"], self.metrics["Avg val loss"])) # else: # print("EPOCH {}: Test loss: {}\n".format(engine.state.epoch, self.metrics["Avg val loss"])) if self.writer: self.writer.add_scalar("training_avg_loss", self.metrics['Avg train loss'], engine.state.epoch) self.writer.add_scalar("validating_avg_loss", self.metrics['Avg val loss'], engine.state.epoch) self.metrics["Avg train loss"] = 0 self.metrics["Avg val loss"] = 0 def zeroMetrics(self, engine): self.val_batch_count = 1 self.metrics["Avg val loss"] = 0 def zeroTrainLoss(self, engine): self.metrics["Avg train loss"] = 0 def zeroValLoss(self, engine): self.metrics["Avg val loss"] = 0 def makeTrainer(self): self.trainer = Engine(self.train_a_batch) self.evaluator = Engine(self.eval_a_batch) pbar = ProgressBar(persist=True, postfix=self.metrics) pbar.attach(self.trainer) pbar.attach(self.evaluator) ## attach hooks self.trainer.add_event_handler(Events.EPOCH_COMPLETED, self.validate) self.trainer.add_event_handler(Events.ITERATION_COMPLETED, self.zeroMetrics) self.trainer.add_event_handler(Events.COMPLETED, self.saveModel) # zero out metrics for next epoch def create_summary_writer(self, model, data_loader, log_dir): writer = SummaryWriter(logdir=log_dir) data_loader_iter = iter(data_loader) b = next(data_loader_iter) b = tuple(x.cuda(self.device) for x in b) try: writer.add_graph(model, b[:7]) except Exception as e: print("Failed to save model graph: {}".format(e)) return writer def start(self): max_epochs = self.args["pretrainEpochs"] + self.args["trainEpochs"] if self.tensorboard: self.writer = self.create_summary_writer(self.net, self.train_loader, self.log_dir) # @self.trainer.on(Events.ITERATION_COMPLETED) # def log_training_loss(engine): # iter = (engine.state.iteration - 1) % len(self.train_loader) + 1 # if iter % 10 == 0: # self.writer.add_scalar("training/loss", engine.state.output, engine.state.iteration) # if not self.eval_only: self.trainer.run(self.train_loader, max_epochs=max_epochs) # else: # self.trainer.run(self.train_loader, max_epochs=1) if self.tensorboard: self.writer.close() def test_a_batch(self, engine, batch): _, _, _, _, _, _, _, fut, op_mask, _, _, _, _ = batch # Initialize Variables if self.cuda: fut = fut.cuda(self.device) op_mask = op_mask.cuda(self.device) if self.args["train_loss"] == 'NLL': # Forward pass if self.args['use_maneuvers']: fut_pred, lat_pred, lon_pred = self.netPred(batch) l, c = maskedNLLTest(fut_pred, lat_pred, lon_pred, fut, op_mask, device=self.device, cuda=self.args.cuda) else: fut_pred = self.netPred(batch) l, c = maskedNLLTest(fut_pred, 0, 0, fut, op_mask, 
device=self.device, use_maneuvers=False, cuda=self.cuda) else: # Forward pass if self.args['use_maneuvers']: fut_pred, lat_pred, lon_pred = self.netPred(batch) fut_pred_max = torch.zeros_like(fut_pred[0]) for k in range(lat_pred.shape[0]): lat_man = torch.argmax(lat_pred[k, :]).detach() lon_man = torch.argmax(lon_pred[k, :]).detach() indx = lon_man * 3 + lat_man fut_pred_max[:, k, :] = fut_pred[indx][:, k, :] l, c = maskedMSETest(fut_pred_max, fut, op_mask, device=self.device) else: fut_pred = self.netPred(batch) l, c = maskedMSETest(fut_pred, fut, op_mask, device=self.device) self.lossVals += l.detach() self.lastTestLoss = l.detach() self.counts += c.detach() def eval(self, test_loader): self.test_batch_size = len(test_loader) tester = Engine(self.test_a_batch) pbar = ProgressBar(persist=True, postfix=self.metrics) pbar.attach(tester) print('evaluating on dataset{}...'.format(self.dsId)) tester.run(test_loader) if (self.args["train_loss"]) == "NLL": nll_loss = self.lossVals / self.counts nll_loss[nll_loss != nll_loss] = 0 print("NLL:") print("Last Test loss: " + str(self.lastTestLoss.mean().item())) print("Avg Test loss: " + str(nll_loss.mean().item())) else: rmse = torch.pow(self.lossVals / self.counts, 0.5) * .3048 # converting from feet to meters rmse[torch.isnan(rmse)] = 0 # self.lastTestLoss = torch.pow(self.lastTestLoss, 0.5) * .3048 # print(self.lastTestLoss) seq_loss = rmse.tolist() seq_loss = [x for x in seq_loss if x != 0] print("RMSE:") print(rmse) print("Last Test loss: " + str(seq_loss[-1])) print("Avg Test loss: " + str(rmse.mean().item()))
def train(args): tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False) args.num_embeddings = len( tokenizer.vocab ) # We need this to create the model at next line (number of embeddings to use) model = TransformerWithLMHead(args) model.to(args.device) optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad)) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders( args, tokenizer) # Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original def mask_tokens(inputs): labels = inputs.clone() masked_indices = torch.bernoulli( torch.full(labels.shape, args.mlm_probability)).byte() labels[~masked_indices] = -1 # We only compute loss on masked tokens indices_replaced = torch.bernoulli(torch.full( labels.shape, 0.8)).byte() & masked_indices inputs[indices_replaced] = tokenizer.vocab[ "[MASK]"] # 80% of the time, replace masked input tokens with [MASK] indices_random = torch.bernoulli(torch.full( labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long, device=args.device) inputs[indices_random] = random_words[ indices_random] # 10% of the time, replace masked input tokens with random word return inputs, labels def update(engine, batch): model.train() inputs = batch.transpose(0, 1).contiguous().to(args.device) inputs, labels = mask_tokens(inputs) if args.mlm else (inputs, inputs) logits, loss = model(inputs, labels=labels) loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) def inference(engine, batch): model.eval() with torch.no_grad(): inputs = batch.transpose(0, 1).contiguous().to(args.device) inputs, labels = mask_tokens(inputs) if args.mlm else ( inputs, inputs) # Prepare masked input/labels if we use masked LM logits = model(inputs) shift_logits = logits[:-1] if not args.mlm else logits shift_labels = labels[1:] if not args.mlm else labels return shift_logits.view(-1, logits.size(-1)), shift_labels.view(-1) evaluator = Engine(inference) trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_every > 0: trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: evaluator.run(val_loader) if engine.state.iteration % args.eval_every == 0 else None) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine schedule cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(train_loader) * args.n_epochs) scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.n_warmup) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we average distributed metrics using average_distributed_scalar metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))} metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) # Let's convert 
sub-word perplexities into word perplexities. If you need details: http://sjmielke.com/comparing-perplexities.htm metrics["average_word_ppl"] = MetricsLambda( lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words), metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train if args.local_rank in [-1, 0]: checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving( trainer, evaluator, metrics, model, optimizer, args) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs)
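To make the MetricsLambda conversion above concrete, a tiny worked example with invented corpus sizes (the real numbers come from val_loader.dataset.numel() and valid_num_words).

import math

avg_subword_nll = 3.0                         # hypothetical average NLL per sub-word token
n_subwords, n_words = 1_200_000, 1_000_000    # hypothetical validation corpus sizes

subword_ppl = math.exp(avg_subword_nll)                        # ~20.1
word_ppl = math.exp(avg_subword_nll * n_subwords / n_words)    # ~36.6: same model, word-level scale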
def attach( self, trainer: Engine, to_save: Mapping, output_transform: Callable = lambda output: output, num_iter: Optional[int] = None, end_lr: float = 10.0, step_mode: str = "exp", smooth_f: float = 0.05, diverge_th: float = 5.0, ) -> Any: """Attaches lr_finder to a given trainer. It also resets model and optimizer at the end of the run. Usage: .. code-block:: python to_save = {"model": model, "optimizer": optimizer} with lr_finder.attach(trainer, to_save=to_save) as trainer_with_lr_finder: trainer_with_lr_finder.run(dataloader)` Args: trainer: lr_finder is attached to this trainer. Please, keep in mind that all attached handlers will be executed. to_save: dictionary with optimizer and other objects that needs to be restored after running the LR finder. For example, `to_save={'optimizer': optimizer, 'model': model}`. All objects should implement `state_dict` and `load_state_dict` methods. output_transform: function that transforms the trainer's `state.output` after each iteration. It must return the loss of that iteration. num_iter: number of iterations for lr schedule between base lr and end_lr. Default, it will run for `trainer.state.epoch_length * trainer.state.max_epochs`. end_lr: upper bound for lr search. Default, 10.0. step_mode: "exp" or "linear", which way should the lr be increased from optimizer's initial lr to `end_lr`. Default, "exp". smooth_f: loss smoothing factor in range `[0, 1)`. Default, 0.05 diverge_th: Used for stopping the search when `current loss > diverge_th * best_loss`. Default, 5.0. Returns: trainer_with_lr_finder (trainer used for finding the lr) Note: lr_finder cannot be attached to more than one trainer at a time. """ if not isinstance(to_save, Mapping): raise TypeError( f"Argument to_save should be a mapping, but given {type(to_save)}" ) Checkpoint._check_objects(to_save, "state_dict") Checkpoint._check_objects(to_save, "load_state_dict") if "optimizer" not in to_save: raise ValueError("Mapping to_save should contain 'optimizer' key") if not isinstance(to_save["optimizer"], torch.optim.Optimizer): raise TypeError( f"Object to_save['optimizer'] should be torch optimizer, but given {type(to_save['optimizer'])}" ) if smooth_f < 0 or smooth_f >= 1: raise ValueError("smooth_f is outside the range [0, 1]") if diverge_th < 1: raise ValueError("diverge_th should be larger than 1") if step_mode not in ["exp", "linear"]: raise ValueError( f"step_mode should be 'exp' or 'linear', but given {step_mode}" ) if num_iter is not None: if not isinstance(num_iter, int): raise TypeError( f"if provided, num_iter should be an integer, but give {num_iter}" ) if num_iter <= 0: raise ValueError( f"if provided, num_iter should be positive, but give {num_iter}" ) # store to_save with tempfile.TemporaryDirectory() as tmpdirname: obj = {k: o.state_dict() for k, o in to_save.items()} # add trainer obj["trainer"] = trainer.state_dict() cache_filepath = Path(tmpdirname) / "ignite_lr_finder_cache.pt" torch.save(obj, cache_filepath.as_posix()) optimizer = to_save["optimizer"] # Attach handlers if not trainer.has_event_handler(self._run): trainer.add_event_handler( Events.STARTED, self._run, optimizer, output_transform, num_iter, end_lr, step_mode, smooth_f, diverge_th, ) if not trainer.has_event_handler(self._warning): trainer.add_event_handler(Events.COMPLETED, self._warning) if not trainer.has_event_handler(self._reset): trainer.add_event_handler(Events.COMPLETED, self._reset) yield trainer self._detach(trainer) # restore to_save and reset trainer's state obj = 
torch.load(cache_filepath.as_posix()) trainer.load_state_dict(obj["trainer"]) for k, o in obj.items(): if k in to_save: to_save[k].load_state_dict(o)
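For completeness, a hedged end-to-end sketch of how this context-manager style attach is normally consumed with ignite's FastaiLRFinder; model, optimizer, trainer and dataloader are placeholders, and plot() needs matplotlib.

from ignite.handlers import FastaiLRFinder

lr_finder = FastaiLRFinder()
to_save = {"model": model, "optimizer": optimizer}

with lr_finder.attach(trainer, to_save=to_save, end_lr=10.0) as trainer_with_lr_finder:
    trainer_with_lr_finder.run(dataloader)   # runs the LR range test, then restores state

print(lr_finder.get_results())               # {"lr": [...], "loss": [...]}
print(lr_finder.lr_suggestion())             # lr near the steepest loss descent
lr_finder.plot()                             # optional visual check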
def train(self): # Get self.configs bn = self.config.bn name = self.config.name load_from_ckpt = self.config.load_from_ckpt lr = self.config.lr epochs = self.config.epochs wd = self.config.weight_decay if self.config.restart and not self.is_debug: mod_ckpt, op_ckpt = self._load_ckpt("reg_ckpt") else: mod_ckpt = op_ckpt = None # get datasets for training and testing def w_init_fn(worker_id): return np.random.seed(np.random.get_state()[1][0] + worker_id) # Load Datasets and DataLoader dset = get_dataset(self.config.dataset) transforms = T.ToTensor() train_dataset = dset(self.config, transforms, train=True) test_dataset = dset(self.config, transforms, train=False) train_loader = DataLoader( train_dataset, batch_size=bn, shuffle=True, num_workers=0 if self.is_debug else self.config.n_workers, worker_init_fn=w_init_fn) test_loader = DataLoader( test_dataset, batch_size=bn, shuffle=True, num_workers=0 if self.is_debug else self.config.n_workers, worker_init_fn=w_init_fn) eval_loader = DataLoader( test_dataset, batch_size=bn, num_workers=0 if self.is_debug else self.config.n_workers, worker_init_fn=w_init_fn) # model model = Model(self.config) self.logger.info( f"Number of trainable parameters in model is {sum(p.numel() for p in model.parameters())}" ) if self.config.restart and mod_ckpt is not None: self.logger.info("Load pretrained parameters and resume training.") model.load_state_dict(mod_ckpt) model.cuda(self.device) wandb.watch(model, log="all") # optimizer optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd) if self.config.restart and op_ckpt is not None: self.logger.info("Load state_dict of optimizer.") optimizer.load_state_dict(op_ckpt) if self.perc_loss: self.vgg = PerceptualVGG() self.vgg.cuda(self.device) else: self.vgg = None n_epoch_train = self.config.epochs start_it = 0 start_epoch = 0 if self.config.restart and op_ckpt is not None: start_it = list( optimizer.state_dict()["state"].values())[-1]["step"] start_epoch = int(np.floor(start_it / len(train_loader))) assert self.config.epochs > start_epoch n_epoch_train = self.config.epochs - start_epoch def train_step(engine, batch): model.train() original = batch["images"].cuda(self.device) tps_param_dic = tps_parameters(original.shape[0], self.config.scal, self.config.tps_scal, self.config.rot_scal, self.config.off_scal, self.config.scal_var, self.config.augm_scal) coord, vector = make_input_tps_param(tps_param_dic) coord, vector = coord.cuda(self.device), vector.cuda(self.device) image_spatial_t, _ = ThinPlateSpline(original, coord, vector, original.shape[3], self.device) image_appearance_t = K.ColorJitter(self.config.brightness, self.config.contrast, self.config.saturation, self.config.hue)(original) # Zero out gradients rec, ssp, asp, mu, heat_map = model(original, image_spatial_t, image_appearance_t, coord, vector) loss, rec_loss, equiv_loss = total_loss( original, rec, ssp, asp, mu, coord, vector, self.device, self.config.L_mu, self.config.L_cov, self.config.scal, self.config.l_2_scal, self.config.l_2_threshold, self.vgg) # fixme compute keypoint metrics if available optimizer.zero_grad() loss.backward() optimizer.step() out_dict = { "loss": loss.item(), "rec_loss": rec_loss.item(), "equiv_loss": equiv_loss.item() } return out_dict def eval_step(engine, batch): model.eval() with torch.no_grad(): original = batch["images"].cuda(self.device) tps_param_dic = tps_parameters( original.shape[0], self.config.scal, self.config.tps_scal, self.config.rot_scal, self.config.off_scal, self.config.scal_var, 
self.config.augm_scal) coord, vector = make_input_tps_param(tps_param_dic) coord, vector = coord.cuda(self.device), vector.cuda( self.device) image_spatial_t, _ = ThinPlateSpline(original, coord, vector, original.shape[3], self.device) image_appearance_t = K.ColorJitter(self.config.brightness, self.config.contrast, self.config.saturation, self.config.hue)(original) # Zero out gradients rec, ssp, asp, mu, heat_map = model(original, image_spatial_t, image_appearance_t, coord, vector) loss, rec_loss, equiv_loss = total_loss( original, rec, ssp, asp, mu, coord, vector, self.device, self.config.L_mu, self.config.L_cov, self.config.scal, self.config.l_2_scal, self.config.l_2_threshold) metric_ssim = ssim(original, rec) metric_psnr = psnr(original, rec) # fixme keypoint metrics return { "loss": loss.item(), "rec_loss": rec_loss.item(), "equiv_loss": equiv_loss.item(), "ssim": float(metric_ssim), "psnr": float(metric_psnr) } def eval_visual(engine, eval_batch): model.eval() with torch.no_grad(): original = eval_batch["images"].cuda(self.device) tps_param_dic = tps_parameters( original.shape[0], self.config.scal, self.config.tps_scal, self.config.rot_scal, self.config.off_scal, self.config.scal_var, self.config.augm_scal) coord, vector = make_input_tps_param(tps_param_dic) coord, vector = coord.cuda(self.device), vector.cuda( self.device) image_spatial_t, _ = ThinPlateSpline(original, coord, vector, original.shape[3], self.device) image_appearance_t = K.ColorJitter(self.config.brightness, self.config.contrast, self.config.saturation, self.config.hue)(original) # Zero out gradients rec, ssp, asp, mu, heat_map = model(original, image_spatial_t, image_appearance_t, coord, vector) img_grid = make_img_grid(image_appearance_t, image_spatial_t, rec, original, mus=mu, n_logged=6) wandb.log({ "Evaluation image logs": wandb.Image(img_grid, caption=f"Image logs on test set.") }) self.logger.info("Initialize engines...") trainer = Engine(train_step) evaluator = Engine(eval_step) test_img_generator = Engine(eval_visual) self.logger.info("Finish engine initialization...") # checkpointing n_saved = 10 ckpt_handler = ModelCheckpoint(self.dirs["ckpt"], "reg_ckpt", n_saved=n_saved, require_empty=False) save_dict = {"model": model, "optimizer": optimizer} trainer.add_event_handler( Events.ITERATION_COMPLETED(every=self.config.ckpt_intervall), ckpt_handler, save_dict) pbar = ProgressBar(ascii=True) pbar.attach(trainer, output_transform=lambda x: x) pbar.attach(evaluator, output_transform=lambda x: x) @trainer.on(Events.ITERATION_COMPLETED(every=self.config.log_intervall) ) def log(engine): it = engine.state.iteration wandb.log({"iteration": it}) # log losses for key in engine.state.output: wandb.log({key: engine.state.output[key]}) batch = engine.state.batch model.eval() original = batch["images"].cuda(self.device) with torch.no_grad(): tps_param_dic = tps_parameters( original.shape[0], self.config.scal, self.config.tps_scal, self.config.rot_scal, self.config.off_scal, self.config.scal_var, self.config.augm_scal) coord, vector = make_input_tps_param(tps_param_dic) coord, vector = coord.cuda(self.device), vector.cuda( self.device) image_spatial_t, _ = ThinPlateSpline(original, coord, vector, original.shape[3], self.device) image_appearance_t = K.ColorJitter(self.config.brightness, self.config.contrast, self.config.saturation, self.config.hue)(original) rec, ssp, asp, mu, heat_map = model(original, image_spatial_t, image_appearance_t, coord, vector) img_grid = make_img_grid(image_appearance_t, image_spatial_t, rec, 
original, mus=mu, n_logged=6) wandb.log({ "Training image logs": wandb.Image(img_grid, caption=f"Image logs after {it} train steps.") }) # metrics for training Average(output_transform=lambda x: x["loss"]).attach( trainer, "loss-epoch_avg") Average(output_transform=lambda x: x["rec_loss"]).attach( trainer, "rec_loss-epoch_avg") Average(output_transform=lambda x: x["equiv_loss"]).attach( trainer, "equiv_loss-epoch_avg") # metrics during evaluation Average(output_transform=lambda x: x["loss"]).attach( evaluator, "loss-eval") Average(output_transform=lambda x: x["rec_loss"]).attach( evaluator, "rec_loss-eval") Average(output_transform=lambda x: x["equiv_loss"]).attach( evaluator, "equiv_loss-eval") Average(output_transform=lambda x: x["psnr"]).attach( evaluator, "psnr-eval") Average(output_transform=lambda x: x["ssim"]).attach( evaluator, "ssim-eval") @trainer.on(Events.EPOCH_COMPLETED(every=self.config.metric_at_epochs)) def metrics(engine): self.logger.info( f"Computing metrics after epoch #{engine.state.epoch}") batch_size = eval_loader.batch_size bs = 20 if self.is_debug else (int( 8000 / batch_size) if len(test_dataset) > 8000 else len(eval_loader)) evaluator.run(eval_loader, max_epochs=1, epoch_length=bs) [ wandb.log({key: evaluator.state.metrics[key]}) for key in evaluator.state.metrics ] @trainer.on( Events.ITERATION_COMPLETED(every=self.config.test_img_intervall)) def make_test_grid(engine): test_img_generator.run(test_loader, max_epochs=1, epoch_length=1) @trainer.on(Events.EPOCH_COMPLETED) def log_train_avg(engine): wandb.log({"epoch": engine.state.epoch}) [ wandb.log({key: engine.state.metrics[key]}) for key in engine.state.metrics ] @trainer.on(Events.STARTED) def set_start_it(engine): self.logger.info( f'Engine starting from iteration {start_it}, epoch {start_epoch}' ) engine.state.iteration = start_it engine.state.epoch = start_epoch # run everything n_step_per_epoch = 10 if self.is_debug else len(train_loader) self.logger.info("Start training...") trainer.run(train_loader, max_epochs=n_epoch_train, epoch_length=n_step_per_epoch) self.logger.info("End training.")
def attach(self, engine: Engine): if self._name is None: self.logger = engine.logger return engine.add_event_handler(Events.STARTED, self)
def test_lr_scheduler(torch_lr_scheduler_cls, kwargs): if torch_lr_scheduler_cls is None: return tensor = torch.zeros([1], requires_grad=True) optimizer1 = torch.optim.SGD([tensor], lr=0.01) optimizer2 = torch.optim.SGD([tensor], lr=0.01) optimizer3 = torch.optim.SGD([tensor], lr=0.01) opt_state_dict1 = optimizer1.state_dict() opt_state_dict2 = optimizer2.state_dict() opt_state_dict3 = optimizer3.state_dict() torch_lr_scheduler1 = torch_lr_scheduler_cls(optimizer=optimizer1, **kwargs) scheduler1 = LRScheduler(torch_lr_scheduler1) state_dict1 = scheduler1.state_dict() torch_lr_scheduler2 = torch_lr_scheduler_cls(optimizer=optimizer2, **kwargs) with pytest.warns(UserWarning, match=r"the first lr value from the optimizer, otherwise it is will be skipped"): scheduler2 = LRScheduler(torch_lr_scheduler2, use_legacy=True) state_dict2 = scheduler2.state_dict() torch_lr_scheduler3 = torch_lr_scheduler_cls(optimizer=optimizer3, **kwargs) state_dict3 = torch_lr_scheduler3.state_dict() def dummy_update(engine, batch): optimizer1.step() optimizer2.step() optimizer3.step() trainer = Engine(dummy_update) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler1) @trainer.on(Events.ITERATION_STARTED) def save_lr1(engine): lrs1.append(optimizer1.param_groups[0]["lr"]) @trainer.on(Events.ITERATION_STARTED) def save_lr2(engine): lrs2.append(optimizer2.param_groups[0]["lr"]) @trainer.on(Events.ITERATION_STARTED) def save_true_lr(engine): lrs_true.append(optimizer3.param_groups[0]["lr"]) @trainer.on(Events.ITERATION_COMPLETED) def torch_lr_scheduler_step(engine): torch_lr_scheduler3.step() trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler2) for _ in range(2): lrs1 = [] lrs2 = [] lrs_true = [] data = [0] * 10 max_epochs = 2 trainer.run(data, max_epochs=max_epochs) assert lrs_true == pytest.approx(lrs1), f"{_}: {lrs_true} ({len(lrs_true)}) vs {lrs1} ({len(lrs1)})" assert lrs_true == pytest.approx(lrs2), f"{_}: {lrs_true} ({len(lrs_true)}) vs {lrs2} ({len(lrs2)})" optimizer1.load_state_dict(opt_state_dict1) scheduler1.load_state_dict(state_dict1) optimizer2.load_state_dict(opt_state_dict2) scheduler2.load_state_dict(state_dict2) optimizer3.load_state_dict(opt_state_dict3) torch_lr_scheduler3.load_state_dict(state_dict3) optimizer4 = torch.optim.SGD([tensor], lr=0.01) torch_lr_scheduler4 = torch_lr_scheduler_cls(optimizer=optimizer4, **kwargs) simulated_values = LRScheduler.simulate_values(num_events=len(data) * max_epochs, lr_scheduler=torch_lr_scheduler4) assert lrs1 == pytest.approx([v for i, v in simulated_values]) assert lrs2 == pytest.approx([v for i, v in simulated_values])
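Outside of the parametrized test, wrapping a stock PyTorch scheduler looks like this; a hedged sketch that mirrors the non-legacy path above (attached on ITERATION_STARTED), with a toy parameter instead of a real model.

import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import LRScheduler

tensor = torch.zeros([1], requires_grad=True)
optimizer = torch.optim.SGD([tensor], lr=0.01)
torch_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

trainer = Engine(lambda engine, batch: optimizer.step())
trainer.add_event_handler(Events.ITERATION_STARTED, LRScheduler(torch_scheduler))
trainer.run([0] * 10, max_epochs=2)  # lr: 0.01 for the first 5 iterations, 0.005 for the next 5, ...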
def train(): config_file = "configs/train_multihead_config.json" config = Config.from_json_file(config_file) ec_coef = 1 sc_coef = 1 # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(config)) # Initialize distributed training if needed config.distributed = (config.local_rank != -1) if config.distributed: torch.cuda.set_device(config.local_rank) config.device = torch.device("cuda", config.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") tokenizer_class = OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) model_class = OpenAIGPTMultiHeadModel model = model_class.from_pretrained(config.model_checkpoint) tokenizer.set_special_tokens(SPECIAL_TOKENS) model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(config.device) optimizer = OpenAIAdam(model.parameters(), lr=config.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if config.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) if config.distributed: model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) # Training function and trainer def update(engine, batch): model.train() # input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch) input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, sc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple( input_tensor.to(config.device) for input_tensor in batch) lm_loss, emotion_loss, sentence_loss = model(input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, sc_labels, token_type_ids, token_emotion_ids, token_action_ids) loss = (lm_loss * config.lm_coef + emotion_loss * ec_coef + sentence_loss * sc_coef) / config.gradient_accumulation_steps if config.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) if engine.state.iteration % config.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple(input_tensor.to(config.device) for input_tensor in batch) input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, \ sc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) model_outputs = model(input_ids, ec_token_ids, sc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids, 
token_action_ids=token_action_ids) lm_logits, mc_logits = model_outputs[0], model_outputs[2] # So we can also use GPT2 outputs lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, sc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if config.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if config.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if config.distributed: trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if config.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=config.log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, { 'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) tokenizer.save_vocabulary(tb_logger.writer.log_dir) # Run the training trainer.run(train_loader, max_epochs=config.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) 
if config.local_rank in [-1, 0] and config.n_epochs > 0: os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
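# Hedged alternative to indexing the private `_saved` list above: recent ignite
# releases (assumption: ignite >= 0.4) expose a `last_checkpoint` attribute on
# ModelCheckpoint that gives the path of the most recently saved checkpoint.
# Reuses `checkpoint_handler`, `config`, `tb_logger` and WEIGHTS_NAME from the snippet above.
if config.local_rank in [-1, 0] and config.n_epochs > 0:
    os.rename(checkpoint_handler.last_checkpoint,
              os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))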
def attach(self, engine: Engine, name: str, usage: Union[str, MetricUsage] = EpochWise()) -> None: usage = self._check_usage(usage) # recursively attach all its dependencies (partially) self._internal_attach(engine, usage) # attach only handler on EPOCH_COMPLETED engine.add_event_handler(usage.COMPLETED, self.completed, name)
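# Minimal usage sketch for the attach() method above (assumes ignite >= 0.4, where the
# string usages "epoch_wise" / "batch_wise" are accepted in place of a MetricUsage
# instance; the identity step function is a stand-in for a real inference step).
from ignite.engine import Engine
from ignite.metrics import Accuracy

evaluator = Engine(lambda engine, batch: batch)     # step must return (y_pred, y)
Accuracy().attach(evaluator, "accuracy")            # default EpochWise usage
Accuracy().attach(evaluator, "running_accuracy", usage="batch_wise")  # recomputed every iteration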
loss.backward() model_opt.step() model_opt.zero_grad() return loss.item() / targ.shape[0] def inference(engine, batch): model_par.eval() inp, targ = batch out = model_par.forward(inp) return out, targ trainer = Engine(training_update_function) trainer.add_event_handler(Events.ITERATION_COMPLETED, best_checkpointer, {'best_model': model}) evaluator = Engine(inference) @trainer.on(Events.ITERATION_COMPLETED) def track_results(trainer): results['best_loss'] = min(results['best_loss'], trainer.state.output) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): if (trainer.state.iteration + 1) % 1 == 0: print("Epoch[{}] Iteration[{}] Loss: {:.8f}".format(trainer.state.epoch, trainer.state.iteration + 1, trainer.state.output)) trainer.run(train_loader, max_epochs=5)
def train(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='persona_comet_weak_label_preprocessed', help="Path or url of the dataset cache") parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model") parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training") parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history") parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient") parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") parser.add_argument("--num_beams", type=int, default=5, help="Number of beams for comet expansion") parser.add_argument("--test_run_num", type=int, default=-1, help="Datapoints to run with in a test run") parser.add_argument("--exp_name", type=str, default="", required=True, help="Provide an experiment name") parser.add_argument("--do_train", action='store_true', help="Do training") parser.add_argument("--do_eval", action='store_true', help="Do Evaluation") parser.add_argument("--no_persona", action='store_true', help="No Persona Evaluation") parser.add_argument("--no_comet_persona", action='store_true', help="No Persona Evaluation") parser.add_argument("--uniform_prior", action='store_true', help="Uniform prior") parser.add_argument("--log_dir", type=str, default="", required=True, help="Provide a log dir") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
# logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) print( "Running process {}".format(args.local_rank) ) # This plays the role of logger.warning: it will be printed by all distributed processes print("Arguments: {}".format(pformat(args))) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') print("Prepare tokenizer, pretrained model and optimizer.") tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer # can't use AutoTokenizer because checkpoint could be a Path tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel if args.do_eval and not args.do_train: print('Loading model from checkpoint {}'.format(args.model_checkpoint)) # model = model_class.from_pretrained(args.model_checkpoint) # model.to(args.device) model = LatentMarginalizedModel(args, generator_class=model_class) model.to(args.device) # Add special tokens if they are not already added add_special_tokens_(model, tokenizer) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) print("Prepare datasets") train_dataset = PersonaChatDataset(args, tokenizer, split='train') if args.do_eval: val_dataset = PersonaChatDataset(args, tokenizer, split='valid') train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset) train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate_dialog, pin_memory=True) if args.do_eval: val_loader = DataLoader(val_dataset, shuffle=False, batch_size=args.valid_batch_size, collate_fn=collate_dialog, pin_memory=True) # train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) input_ids, token_type_ids, lm_labels, mc_token_ids, mc_labels, persona, history = batch (lm_loss), (mc_loss) = model(input_ids=input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, persona=persona, history=history) loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item(), lm_loss.item(), mc_loss.item(), math.exp( lm_loss.item()) trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with
torch.no_grad(): batch = tuple( input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch # print(tokenizer.decode(input_ids[0, -1, :].tolist())) # if we dont send labels to model, it doesnt return losses lm_logits, mc_logits, *_ = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, ) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Make sure distributed data samplers split the dataset nicely between the distributed processes # if args.distributed: # trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) # evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss") RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lm_loss") RunningAverage(output_transform=lambda x: x[2]).attach(trainer, "mc_loss") RunningAverage(output_transform=lambda x: x[3], alpha=0.01).attach(trainer, "perplexity") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train def print_model_save(engine): print("Training complete. Saving Model.") def print_validation(engine): print("Model saved. 
Starting validation.") if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss", "lm_loss", "mc_loss", "perplexity"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args.model_checkpoint, args.exp_name) log_dir = os.path.join(args.log_dir, log_dir) print("Logging at log dir: {}".format(log_dir)) # tb stuff # tb_logger = TensorboardLogger(log_dir) # tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) # tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) # save model checkpoints checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler(Events.EPOCH_COMPLETED, print_model_save) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" takes care of distributed encapsulation torch.save(args, log_dir + '/model_training_args.bin') # getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) tokenizer.save_pretrained(log_dir) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, print_validation) if args.do_eval: trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Run the training if args.do_train: trainer.run(train_loader, max_epochs=args.n_epochs) if args.do_eval and not args.do_train: print('Running only Evaluation. No Training.') evaluator.run(val_loader) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0 and args.do_train: os.rename( os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner)
def create_train_and_validation_engines(train_func, val_func=None, device='cpu'): """ Helper function for creating an ignite Engine object with helpful defaults. This sets up an Engine that has four handlers attached to it: - prepare_batch: before a batch is passed to train_func or val_func, this function runs, moving every item in the batch (which is a dictionary) to the appropriate device ('cpu' or 'cuda'). - book_keeping: sets up some dictionaries that are used for bookkeeping so one can easily track the epoch and iteration losses for both training and validation. - add_to_iter_history: records the iteration, epoch, and past iteration losses into the dictionaries set up by book_keeping. - clear_iter_history: resets the current iteration history of losses after moving the current iteration history into past iteration history. Args: train_func (func): Function that provides the closure for training for a single batch. val_func (func, optional): Function that provides the closure for validating a single batch. Defaults to None. device (str, optional): Device to move tensors to. Defaults to 'cpu'. """ # Set up engines for training and validation trainer = Engine(train_func) trainer.register_events(*ValidationEvents) trainer.register_events(*BackwardsEvents) validator = None if val_func is None else Engine(val_func) # Before a batch starts, the items should be float and moved to the # correct device, for both training and validation. Checks to make # sure "cuda" is available if user requested cuda. device = device if torch.cuda.is_available() else 'cpu' device = torch.device(device) def prepare_batch(engine): batch = engine.state.batch for key in batch: if torch.is_tensor(batch[key]): batch[key] = batch[key].float().to(device) engine.state.batch = batch # Set up stuff for bookkeeping as training progresses. def book_keeping(engine): engine.state.epoch_history = {} engine.state.iter_history = {} engine.state.past_iter_history = {} def add_to_iter_history(engine): for key in engine.state.output: if key not in engine.state.iter_history: engine.state.iter_history[key] = [] if key not in engine.state.past_iter_history: engine.state.past_iter_history[key] = [] engine.state.iter_history[key].append( engine.state.output[key] ) engine.state.past_iter_history[key].append( engine.state.iter_history[key] ) def clear_iter_history(engine): engine.state.iter_history = {} trainer.add_event_handler( Events.ITERATION_STARTED, prepare_batch) trainer.add_event_handler( Events.STARTED, book_keeping) trainer.add_event_handler( Events.ITERATION_COMPLETED, add_to_iter_history) trainer.add_event_handler( Events.EPOCH_STARTED, clear_iter_history) if validator is not None: validator.add_event_handler( Events.ITERATION_STARTED, prepare_batch) validator.add_event_handler( Events.STARTED, book_keeping) validator.add_event_handler( Events.ITERATION_COMPLETED, add_to_iter_history) validator.add_event_handler( Events.EPOCH_STARTED, clear_iter_history) return trainer, validator
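# Hedged usage sketch for create_train_and_validation_engines above: the helper expects
# closures whose output is a dict of scalars so add_to_iter_history can key into it.
# The toy closures and data below are assumptions for illustration only.
import torch

def train_func(engine, batch):
    return {"loss": float(batch["x"].mean())}

def val_func(engine, batch):
    return {"loss": float(batch["x"].mean())}

trainer, validator = create_train_and_validation_engines(train_func, val_func, device="cpu")
data = [{"x": torch.randn(4)} for _ in range(8)]
trainer.run(data, max_epochs=2)
print(sorted(trainer.state.iter_history.keys()))  # ['loss']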
def attach(self, engine: Engine): engine.add_event_handler( Events.ITERATION_COMPLETED(every=self.interval), self)
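# Hedged sketch of a handler class built around the attach() pattern above, using a
# filtered event so the handler fires every `interval` iterations (class name and
# print body are assumptions).
from ignite.engine import Engine, Events

class IterationPrinter:
    def __init__(self, interval: int = 10):
        self.interval = interval

    def __call__(self, engine: Engine) -> None:
        print(f"iter {engine.state.iteration}: output={engine.state.output}")

    def attach(self, engine: Engine) -> None:
        engine.add_event_handler(Events.ITERATION_COMPLETED(every=self.interval), self)

trainer = Engine(lambda engine, batch: batch)
IterationPrinter(interval=5).attach(trainer)
trainer.run(list(range(20)), max_epochs=1)  # prints at iterations 5, 10, 15, 20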
def train(): parser = ArgumentParser() parser.add_argument('--gpt2', action='store_true', help="use gpt2") parser.add_argument("--model_checkpoint", type=str, default="config/cgpt/", help="Path or URL of the model") parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step") parser.add_argument('--pretrained', action='store_true', help="If False train from scratch") parser.add_argument("--data_path", type=str, default="", help="Path or url of the dataset. ") parser.add_argument( "--train_path", type=str, default= "/Users/sunhongchao/Documents/craft/09_Dialogue/corpus/chitchat/gpt-chinese/toy_train.txt", help="Path of the train dataset for dist dataset. ") parser.add_argument( "--valid_path", type=str, default= "/Users/sunhongchao/Documents/craft/09_Dialogue/corpus/chitchat/gpt-chinese/toy_valid.txt", help="Path of the valid dataset for dist dataset. ") parser.add_argument("--dataset_cache", type=str, default="dataset_cache", help="Path or url of the dataset cache") parser.add_argument('--log_file', '-log_file', type=str, default="", help="Output logs to a file under this path") parser.add_argument("--num_workers", type=int, default=8, help="Number of subprocesses for data loading") parser.add_argument("--n_epochs", type=int, default=70, help="Number of training epochs") parser.add_argument("--train_batch_size", type=int, default=2, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=2, help="Batch size for validation") parser.add_argument("--max_history", type=int, default=15, help="Number of previous exchanges to keep in history") parser.add_argument("--scheduler", type=str, default="noam", choices=['noam', 'linear'], help="method of optim") parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)") parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps") parser.add_argument("--valid_steps", type=int, default=5000, help="Perfom validation every X steps") parser.add_argument("--gradient_accumulation_steps", type=int, default=64, help="Accumulate gradients on several steps") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
# logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", args.local_rank) logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning" ) model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config tokenizer_class = BertTokenizer if args.pretrained: tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint, do_lower_case=True) model = model_class.from_pretrained(args.model_checkpoint) else: tokenizer = tokenizer_class(os.path.join(args.model_checkpoint, "vocab.txt"), do_lower_case=True) config = config_class.from_json_file( os.path.join(args.model_checkpoint, CONFIG_NAME)) model = model_class(config) model.to(args.device) optimizer = AdamW([{ 'params': model.parameters(), 'initial_lr': args.lr }], lr=args.lr, correct_bias=True) logger.info("Prepare datasets") loader_class = build_dist_loaders if not args.data_path else build_dataloaders train_loader, val_loader, train_sampler, valid_sampler = loader_class( args, tokenizer, logger) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) # Training function and trainer def update(engine, batch): input_ids, token_type_ids, lm_labels = tuple( input_tensor.to(args.device) for input_tensor in batch) model.train() (lm_loss), *_ = model(input_ids, labels=lm_labels, token_type_ids=token_type_ids) loss = lm_loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item(), optimizer.param_groups[0]['lr'] trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): input_ids, token_type_ids, lm_labels = tuple( input_tensor.to(args.device) for input_tensor in batch) # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return lm_logits_flat_shifted, lm_labels_flat_shifted evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: 
evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Evaluation during training @trainer.on(Events.ITERATION_STARTED) def log_iterations(engine): # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0: if engine.state.iteration % args.valid_steps == 0: evaluator.run(val_loader) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # noam decrease the learning rate # model_size = model.config.n_embd model_size = args.n_emd noam_lambda = lambda step: (model_size**(-0.5) * min( (step + 1)**(-0.5), (step + 1) * args.warmup_steps**(-1.5))) noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step) scheduler = LRScheduler(noam_scheduler) if args.scheduler == "linear": scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss") RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0], x[1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints # And save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True, mininterval=2) pbar.attach(trainer, metric_names=["loss", "lr"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=None) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=3) # save model after evaluation evaluator.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" take care of distributed encapsulation torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file( os.path.join(tb_logger.writer.logdir, CONFIG_NAME)) tokenizer.save_vocabulary(tb_logger.writer.logdir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint # (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and 
args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
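# Standalone sketch of the Noam schedule wired up above: the learning rate rises
# linearly for `warmup_steps`, then decays proportionally to step**-0.5 (the model
# size, warmup and base LR values below are illustrative assumptions).
import torch
from torch.optim.lr_scheduler import LambdaLR

model_size, warmup_steps, base_lr = 768, 5000, 5e-5
optimizer = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=base_lr)
noam_lambda = lambda step: model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * warmup_steps ** (-1.5))
scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda)
for step in (0, warmup_steps - 1, 10 * warmup_steps - 1):
    print(step, base_lr * noam_lambda(step))  # effective learning rate at a few milestones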
def training(config, local_rank=None, with_mlflow_logging=False, with_plx_logging=False): if not getattr(config, "use_fp16", True): raise RuntimeError("This training script uses by default fp16 AMP") set_seed(config.seed + local_rank) torch.cuda.set_device(local_rank) device = "cuda" torch.backends.cudnn.benchmark = True train_loader = config.train_loader train_sampler = getattr(train_loader, "sampler", None) assert ( train_sampler is not None ), "Train loader of type '{}' " "should have attribute 'sampler'".format( type(train_loader)) assert hasattr(train_sampler, "set_epoch") and callable( train_sampler.set_epoch ), "Train sampler should have a callable method `set_epoch`" train_eval_loader = config.train_eval_loader val_loader = config.val_loader model = config.model.to(device) optimizer = config.optimizer model, optimizer = amp.initialize( model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=1, ) model = DDP(model, delay_allreduce=True) criterion = config.criterion.to(device) prepare_batch = getattr(config, "prepare_batch", _prepare_batch) non_blocking = getattr(config, "non_blocking", True) # Setup trainer accumulation_steps = getattr(config, "accumulation_steps", 1) model_output_transform = getattr(config, "model_output_transform", lambda x: x) def train_update_function(engine, batch): model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) y_pred = model(x) y_pred = model_output_transform(y_pred) loss = criterion(y_pred, y) if isinstance(loss, Mapping): assert "supervised batch loss" in loss loss_dict = loss output = {k: v.item() for k, v in loss_dict.items()} loss = loss_dict["supervised batch loss"] / accumulation_steps else: output = {"supervised batch loss": loss.item()} with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss: scaled_loss.backward() if engine.state.iteration % accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return output output_names = getattr( config, "output_names", [ "supervised batch loss", ], ) trainer = Engine(train_update_function) common.setup_common_distrib_training_handlers( trainer, train_sampler, to_save={ "model": model, "optimizer": optimizer }, save_every_iters=1000, output_path=config.output_path.as_posix(), lr_scheduler=config.lr_scheduler, with_gpu_stats=True, output_names=output_names, with_pbars=True, with_pbar_on_iters=with_mlflow_logging, log_every_iters=1, ) # Setup evaluators num_classes = config.num_classes cm_metric = ConfusionMatrix(num_classes=num_classes) val_metrics = { "IoU": IoU(cm_metric), "mIoU_bg": mIoU(cm_metric), } if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict): val_metrics.update(config.val_metrics) model_output_transform = getattr(config, "model_output_transform", lambda x: x) evaluator_args = dict( model=model, metrics=val_metrics, device=device, non_blocking=non_blocking, prepare_batch=prepare_batch, output_transform=lambda x, y, y_pred: ( model_output_transform(y_pred), y, ), ) train_evaluator = create_supervised_evaluator(**evaluator_args) evaluator = create_supervised_evaluator(**evaluator_args) if dist.get_rank() == 0 and with_mlflow_logging: ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator) ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator) def run_validation(_): train_evaluator.run(train_eval_loader) evaluator.run(val_loader) if getattr(config, "start_by_validation", False): trainer.add_event_handler(Events.STARTED, run_validation) trainer.add_event_handler( 
Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation) trainer.add_event_handler(Events.COMPLETED, run_validation) score_metric_name = "mIoU_bg" if hasattr(config, "es_patience"): common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) if dist.get_rank() == 0: tb_logger = common.setup_tb_logging( config.output_path.as_posix(), trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }, ) if with_mlflow_logging: common.setup_mlflow_logging( trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }, ) if with_plx_logging: common.setup_plx_logging( trainer, optimizer, evaluators={ "training": train_evaluator, "validation": evaluator }, ) common.save_best_model_by_val_score( config.output_path.as_posix(), evaluator, model, metric_name=score_metric_name, trainer=trainer, ) # Log train/val predictions: tb_logger.attach( evaluator, log_handler=predictions_gt_images_handler( img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation", ), event_name=Events.EPOCH_COMPLETED, ) log_train_predictions = getattr(config, "log_train_predictions", False) if log_train_predictions: tb_logger.attach( train_evaluator, log_handler=predictions_gt_images_handler( img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="training", ), event_name=Events.EPOCH_COMPLETED, ) trainer.run(train_loader, max_epochs=config.num_epochs)
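# Roughly what common.add_early_stopping_by_val_score wires up (an assumption, not a
# guaranteed equivalence), expressed with ignite's EarlyStopping handler directly;
# reuses `trainer` and `evaluator` from the snippet above.
from ignite.engine import Events
from ignite.handlers import EarlyStopping

def score_fn(engine):
    return engine.state.metrics["mIoU_bg"]  # higher is better; stop when it plateaus

es_handler = EarlyStopping(patience=10, score_function=score_fn, trainer=trainer)
evaluator.add_event_handler(Events.COMPLETED, es_handler)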
def main(cfg): """ Performs training, validation and testing. """ assert isdir(cfg.data_dir), \ '`data_dir` must be a valid path.' cfg.cuda = torch.cuda.is_available() \ and not cfg.no_cuda cfg.model_dir = os.getcwd() # setting random seed for reproducibility if cfg.seed: set_random_seed(cfg) device = torch.device('cuda' if cfg.cuda else 'cpu') os.makedirs(cfg.model_dir, exist_ok=True) label2id = create_label2id(cfg) cfg.num_labels = len(label2id) xlmr = create_pretrained(cfg.model_type, cfg.force_download) # creating dataset split loaders datasets = create_dataset(cfg, xlmr, label2id) train_dataset, valid_dataset = datasets def compute_loss(batch): """ Computes the forward pass and returns the cross entropy loss. """ inputs, labels = [ torch.from_numpy(tensor).to(device).long() for tensor in batch ] logits = model(inputs) logits = logits.view(-1, logits.size(-1)) labels = labels.view(-1) loss = torch.nn.functional.cross_entropy(logits, labels, ignore_index=-1) return loss def train_step(engine, batch): """ Propagates the inputs forward and updates the parameters. """ step = engine.state.iteration model.train() loss = compute_loss(batch) backward(loss) if cfg.clip_grad_norm is not None: clip_grad_norm(cfg.clip_grad_norm) if step % cfg.grad_accum_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() # restoring the averaged loss across steps loss *= cfg.grad_accum_steps return loss.item() def eval_step(engine, batch): """ Propagates the inputs forward without storing any gradients. """ model.eval() with torch.no_grad(): loss = compute_loss(batch) return loss.item() def backward(loss): """ Backpropagates the loss in either mixed or normal precision mode. """ if cfg.fp16: with amp.scale_loss(loss, optimizer) as sc: sc.backward() else: loss.backward() def clip_grad_norm(max_norm): """ Applies gradient clipping. 
""" if cfg.fp16: params = amp.master_params(optimizer) else: params = model.parameters() torch.nn.utils.clip_grad_norm_(params, max_norm) trainer = Engine(train_step) validator = Engine(eval_step) checkpoint = ModelCheckpoint( cfg.model_dir, cfg.model_type, n_saved=5, save_as_state_dict=True, score_function=lambda e: -e.state.metrics['loss']) last_ckpt_path = cfg.ckpt_path if last_ckpt_path is not None: msg = 'Loading state from {}' print(msg.format(basename(last_ckpt_path))) last_state = torch.load(last_ckpt_path, map_location=device) model = create_model(xlmr, len(label2id), cfg) model = model.to(device) del xlmr.model optimizer = create_optimizer(cfg, model) scheduler = create_scheduler(cfg, optimizer, len(train_dataset)) # using apex if required and loading its state if cfg.fp16: model, optimizer = amp.initialize(model, optimizer, opt_level='O2') if last_ckpt_path is not None and \ 'amp' in last_state: amp.load_state_dict(last_state['amp']) if last_ckpt_path is not None: model.load_state_dict(last_state['model']) optimizer.load_state_dict(last_state['optimizer']) scheduler.load_state_dict(last_state['scheduler']) checkpoint_dict = { 'model': model, 'optimizer': optimizer, 'scheduler': scheduler } if cfg.fp16: checkpoint_dict['amp'] = amp validator.add_event_handler(Events.COMPLETED, checkpoint, checkpoint_dict) metric = RunningAverage(output_transform=lambda x: x) metric.attach(trainer, 'loss') metric.attach(validator, 'loss') pbar = ProgressBar() pbar.attach(trainer, metric_names=['loss']) history_path = join(cfg.model_dir, 'history.json') history = collections.defaultdict(list) headers = ['epoch', 'train_loss', 'valid_loss'] if exists(history_path): with open(history_path, 'r') as fh: history = json.load(fh) def record_history(results): """ Records the results to the history. """ for header in headers: history[header].append(results[header]) with open(history_path, 'w') as fh: json.dump(history, fh) @trainer.on(Events.EPOCH_COMPLETED) def print_results(engine): """ Logs the training results. """ validator.run(valid_dataset) record_history({ 'epoch': engine.state.epoch, 'train_loss': engine.state.metrics['loss'], 'valid_loss': validator.state.metrics['loss'] }) data = list(zip(*[history[h] for h in headers])) table = tabulate(data, headers, floatfmt='.3f') print(table.split('\n')[-1]) data = list(zip(*[history[h] for h in headers])) print() print(cfg.pretty()) print() print('***** Running training *****') print() print(tabulate(data, headers, floatfmt='.3f')) trainer.run(train_dataset, cfg.max_epochs)
def _test(duration_vals_as_np_int): scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10) scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10) durations = [10] if duration_vals_as_np_int: durations = [np.int64(t) for t in durations] concat_scheduler = ConcatScheduler( schedulers=[scheduler_1, scheduler_2], durations=durations, save_history=True ) state_dict = concat_scheduler.state_dict() data = [0] * 10 max_epochs = 2 simulated_values = ConcatScheduler.simulate_values( num_events=len(data) * max_epochs, schedulers=[scheduler_1, scheduler_2], durations=durations ) def save_lr(engine): lrs.append(optimizer.param_groups[0]["lr"]) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, concat_scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) for _ in range(2): lrs = [] trainer.run(data, max_epochs=max_epochs) assert lrs == list( map( pytest.approx, [ # Cycle 1 of the LinearCyclicalScheduler 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8, # Cycle 1 of the CosineAnnealingScheduler 0.0, 0.02447174185242318, 0.09549150281252627, 0.20610737385376332, 0.3454915028125263, 0.5, 0.6545084971874737, 0.7938926261462365, 0.9045084971874737, 0.9755282581475768, ], ) ) state_lrs = trainer.state.param_history["lr"] assert len(state_lrs) == len(lrs) # Unpack singleton lists assert [group[0] for group in state_lrs] == lrs assert lrs == pytest.approx([v for i, v in simulated_values]) concat_scheduler.load_state_dict(state_dict) trainer.state.param_history = None
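# Hedged standalone sketch of previewing a composite schedule with simulate_values,
# as the test above does, without running an Engine (import path assumes the
# ignite.contrib.handlers layout of ignite 0.4.x).
import torch
from ignite.contrib.handlers import ConcatScheduler, CosineAnnealingScheduler, LinearCyclicalScheduler

optimizer = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.0)
s1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
s2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)
values = ConcatScheduler.simulate_values(num_events=20, schedulers=[s1, s2], durations=[10])
print(values[:3])  # (event_index, lr) pairs, starting with lr = 1.0, 0.8, 0.6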
def run(output_path, config): device = "cuda" local_rank = config['local_rank'] distributed = backend is not None if distributed: torch.cuda.set_device(local_rank) device = "cuda" rank = dist.get_rank() if distributed else 0 # Rescale batch_size and num_workers ngpus_per_node = torch.cuda.device_count() ngpus = dist.get_world_size() if distributed else 1 batch_size = config['batch_size'] // ngpus num_workers = int((config['num_workers'] + ngpus_per_node - 1) / ngpus_per_node) train_labelled_loader, test_loader = \ get_train_test_loaders(path=config['data_path'], batch_size=batch_size, distributed=distributed, num_workers=num_workers) model = get_model(config['model']) model = model.to(device) if distributed: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank, ], output_device=local_rank) optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=config['momentum'], weight_decay=config['weight_decay'], nesterov=True) criterion = nn.CrossEntropyLoss().to(device) le = len(train_labelled_loader) milestones_values = [ (0, 0.0), (le * config['num_warmup_epochs'], config['learning_rate']), (le * config['num_epochs'], 0.0) ] lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values) def _prepare_batch(batch, device, non_blocking): x, y = batch return (convert_tensor(x, device=device, non_blocking=non_blocking), convert_tensor(y, device=device, non_blocking=non_blocking)) def process_function(engine, labelled_batch): x, y = _prepare_batch(labelled_batch, device=device, non_blocking=True) model.train() # Supervised part y_pred = model(x) loss = criterion(y_pred, y) optimizer.zero_grad() loss.backward() optimizer.step() return { 'batch loss': loss.item(), } trainer = Engine(process_function) if not hasattr(lr_scheduler, "step"): trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler) else: trainer.add_event_handler(Events.ITERATION_STARTED, lambda engine: lr_scheduler.step()) metric_names = [ 'batch loss', ] def output_transform(x, name): return x[name] for n in metric_names: # We compute running average values on the output (batch loss) across all devices RunningAverage(output_transform=partial(output_transform, name=n), epoch_bound=False, device=device).attach(trainer, n) if rank == 0: checkpoint_handler = ModelCheckpoint(dirname=output_path, filename_prefix="checkpoint", save_interval=1000) trainer.add_event_handler(Events.ITERATION_COMPLETED, checkpoint_handler, {'model': model, 'optimizer': optimizer}) ProgressBar(persist=True, bar_format="").attach(trainer, event_name=Events.EPOCH_STARTED, closing_event_name=Events.COMPLETED) if config['display_iters']: ProgressBar(persist=False, bar_format="").attach(trainer, metric_names=metric_names) tb_logger = TensorboardLogger(log_dir=output_path) tb_logger.attach(trainer, log_handler=tbOutputHandler(tag="train", metric_names=metric_names), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=tbOptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED) metrics = { "accuracy": Accuracy(device=device if distributed else None), "loss": Loss(criterion, device=device if distributed else None) } evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine, val_interval): if engine.state.epoch % val_interval == 0: torch.cuda.synchronize() 
train_evaluator.run(train_labelled_loader) evaluator.run(test_loader) trainer.add_event_handler(Events.EPOCH_STARTED, run_validation, val_interval=3) trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1) if rank == 0: if config['display_iters']: ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator) ProgressBar(persist=False, desc="Test evaluation").attach(evaluator) tb_logger.attach(train_evaluator, log_handler=tbOutputHandler(tag="train", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.COMPLETED) tb_logger.attach(evaluator, log_handler=tbOutputHandler(tag="test", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.COMPLETED) # Store the best model def default_score_fn(engine): score = engine.state.metrics['accuracy'] return score score_function = default_score_fn if not hasattr(config, "score_function") else config.score_function best_model_handler = ModelCheckpoint(dirname=output_path, filename_prefix="best", n_saved=3, score_name="val_accuracy", score_function=score_function) evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {'model': model, }) trainer.run(train_labelled_loader, max_epochs=config['num_epochs']) if rank == 0: tb_logger.close()
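# Worked example of the per-process batch-size / num_workers rescaling used above
# (the global values 512 and 16 are illustrative assumptions): each of 4 GPUs gets
# 128 samples per step and a ceil-divided share of the data-loading workers.
ngpus = 4
per_process_batch_size = 512 // ngpus                     # 128
per_process_num_workers = int((16 + ngpus - 1) / ngpus)   # 4
print(per_process_batch_size, per_process_num_workers)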
def test_linear_scheduler(): with pytest.raises(TypeError, match=r"Argument optimizer should be torch.optim.Optimizer"): LinearCyclicalScheduler({}, "lr", 1, 0, cycle_size=0) tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0.0) with pytest.raises(ValueError, match=r"Argument cycle_size should be positive and larger than 1"): LinearCyclicalScheduler(optimizer, "lr", 1, 0, cycle_size=0) with pytest.raises(ValueError, match=r"Argument cycle_size should be positive and larger than 1"): LinearCyclicalScheduler(optimizer, "lr", 1, 0, cycle_size=1) scheduler = LinearCyclicalScheduler(optimizer, "lr", 1, 0, 10) state_dict = scheduler.state_dict() def save_lr(engine): lrs.append(optimizer.param_groups[0]["lr"]) trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) for _ in range(2): lrs = [] trainer.run([0] * 9, max_epochs=2) assert lrs == list( map( pytest.approx, [ # Cycle 1 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8, # Cycle 2 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, # 0.6, 0.8, ], ) ) scheduler.load_state_dict(state_dict) optimizer = torch.optim.SGD([tensor], lr=0) scheduler = LinearCyclicalScheduler(optimizer, "lr", 1, 0, 10, cycle_mult=2) state_dict = scheduler.state_dict() trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) for _ in range(2): lrs = [] trainer.run([0] * 10, max_epochs=3) assert lrs == list( map( pytest.approx, [ # Cycle 1 1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8, # Cycle 2 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, ], ) ) scheduler.load_state_dict(state_dict) # With float cycle_size optimizer = torch.optim.SGD([tensor], lr=0) scheduler = LinearCyclicalScheduler( optimizer, "lr", start_value=1.2, end_value=0.2, cycle_size=10.00000012, cycle_mult=1.0 ) state_dict = scheduler.state_dict() trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) for _ in range(2): lrs = [] trainer.run([0] * 9, max_epochs=2) assert lrs == list( map( pytest.approx, [ # Cycle 1 1.2, 1.0, 0.8, 0.6, 0.4, 0.2, 0.4, 0.6, 0.8, 1.0, # Cycle 2 1.2, 1.0, 0.8, 0.6, 0.4, 0.2, 0.4, 0.6, # 0.8, 1.0, ], ) ) scheduler.load_state_dict(state_dict)
def attach(self, engine: Engine) -> None: if self._name is None: self.logger = engine.logger if not engine.has_event_handler(self, Events.ITERATION_COMPLETED): engine.add_event_handler(Events.ITERATION_COMPLETED, self)
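# Hedged sketch of the idempotent attach() above: calling attach() twice registers the
# handler only once thanks to the has_event_handler check (the accumulator class is an
# assumption for illustration).
from ignite.engine import Engine, Events

class OutputAccumulator:
    def __init__(self):
        self._name = None
        self.logger = None
        self.outputs = []

    def __call__(self, engine: Engine) -> None:
        self.outputs.append(engine.state.output)

    def attach(self, engine: Engine) -> None:
        if self._name is None:
            self.logger = engine.logger
        if not engine.has_event_handler(self, Events.ITERATION_COMPLETED):
            engine.add_event_handler(Events.ITERATION_COMPLETED, self)

trainer = Engine(lambda engine, batch: batch)
handler = OutputAccumulator()
handler.attach(trainer)
handler.attach(trainer)  # no-op: the handler is already registered
trainer.run([1, 2, 3], max_epochs=1)
print(len(handler.outputs))  # 3, not 6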