def test_weights_hist_handler_wrong_setup():
    with pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"):
        WeightsHistHandler(None)

    model = MagicMock(spec=torch.nn.Module)
    wrapper = WeightsHistHandler(model)
    mock_logger = MagicMock()
    mock_engine = MagicMock()
    with pytest.raises(RuntimeError, match="Handler 'WeightsHistHandler' works only with TensorboardLogger"):
        wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED)
def add_tensorboard(engine_train, optimizer, model, log_dir):
    """Creates an ignite logger object and adds training elements such as weight and gradient histograms.

    Args:
        engine_train (:obj:`ignite.engine.Engine`): the train engine to attach to the logger
        optimizer (:obj:`torch.optim.Optimizer`): the model's optimizer
        model (:obj:`torch.nn.Module`): the model being trained
        log_dir (str): path to where tensorboard data should be saved
    """
    # Create a logger
    tb_logger = TensorboardLogger(log_dir=log_dir)

    # Attach the logger to the trainer to log training loss at each iteration
    tb_logger.attach(engine_train,
                     log_handler=OutputHandler(tag="training",
                                               output_transform=lambda loss: {"loss": loss}),
                     event_name=Events.ITERATION_COMPLETED)

    # Attach the logger to the trainer to log optimizer's parameters, e.g. learning rate, after each epoch
    tb_logger.attach(engine_train,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.EPOCH_COMPLETED)

    # Attach the logger to the trainer to log model's weights as a histogram after each epoch
    tb_logger.attach(engine_train,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    # Attach the logger to the trainer to log model's gradients as a histogram after each epoch
    tb_logger.attach(engine_train,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.close()
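# Minimal usage sketch for add_tensorboard above (illustrative only, not from the original source):
# the toy model, optimizer, and criterion below are hypothetical stand-ins, assuming a standard
# ignite supervised training setup.
def _demo_add_tensorboard(log_dir="./tb_logs"):
    import torch.nn as nn
    from torch.optim import SGD
    from ignite.engine import create_supervised_trainer

    model = nn.Linear(10, 2)                      # hypothetical toy model
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion)

    # Attach loss, optimizer-parameter, weight-histogram and gradient-histogram logging
    add_tensorboard(trainer, optimizer, model, log_dir)
    return trainer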
def test_weights_hist_handler_frozen_layers(dummy_model_factory):
    model = dummy_model_factory(with_grads=True, with_frozen_layer=True)

    wrapper = WeightsHistHandler(model)
    mock_logger = MagicMock(spec=TensorboardLogger)
    mock_logger.writer = MagicMock()

    mock_engine = MagicMock()
    mock_engine.state = State()
    mock_engine.state.epoch = 5

    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)

    mock_logger.writer.add_histogram.assert_has_calls(
        [
            call(tag="weights/fc2/weight", values=ANY, global_step=5),
            call(tag="weights/fc2/bias", values=ANY, global_step=5),
        ],
        any_order=True,
    )

    with pytest.raises(AssertionError):
        mock_logger.writer.add_histogram.assert_has_calls(
            [
                call(tag="weights/fc1/weight", values=ANY, global_step=5),
                call(tag="weights/fc1/bias", values=ANY, global_step=5),
            ],
            any_order=True,
        )
    assert mock_logger.writer.add_histogram.call_count == 2
def _test(tag=None):
    wrapper = WeightsHistHandler(model, tag=tag)
    mock_logger = MagicMock(spec=TensorboardLogger)
    mock_logger.writer = MagicMock()

    mock_engine = MagicMock()
    mock_engine.state = State()
    mock_engine.state.epoch = 5

    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)

    tag_prefix = f"{tag}/" if tag else ""

    assert mock_logger.writer.add_histogram.call_count == 4
    mock_logger.writer.add_histogram.assert_has_calls(
        [
            call(tag=tag_prefix + "weights/fc1/weight", values=ANY, global_step=5),
            call(tag=tag_prefix + "weights/fc1/bias", values=ANY, global_step=5),
            call(tag=tag_prefix + "weights/fc2/weight", values=ANY, global_step=5),
            call(tag=tag_prefix + "weights/fc2/bias", values=ANY, global_step=5),
        ],
        any_order=True,
    )
def custom_setup(self):
    if self.tensorboard_logs:
        tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)
        tb_logger.attach(self.trainer,
                         log_handler=OutputHandler(tag="training",
                                                   output_transform=lambda loss: {'loss': loss}),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(self.evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=["LossMetric"],
                                                   another_engine=self.trainer),
                         event_name=Events.EPOCH_COMPLETED)
        if self.optional_tensorboard_features:
            tb_logger.attach(self.trainer,
                             log_handler=OptimizerParamsHandler(self.optimizer),
                             event_name=Events.ITERATION_STARTED)
            tb_logger.attach(self.trainer,
                             log_handler=WeightsScalarHandler(self.model),
                             event_name=Events.ITERATION_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=WeightsHistHandler(self.model),
                             event_name=Events.EPOCH_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=GradsScalarHandler(self.model),
                             event_name=Events.ITERATION_COMPLETED)

        # This is important to close the tensorboard file logger
        @self.trainer.on(Events.COMPLETED)
        def end_tensorboard(trainer):
            logger.info("Training completed")
            tb_logger.close()

    if self.embeddings_name:
        @self.trainer.on(Events.COMPLETED)
        def log_embeddings(trainer):
            if hasattr(self.model, self.embeddings_name) and hasattr(self.dataset_splits, "vectorizer") and TENSORBOARD:
                logger.info(f"Logging embeddings ({self.embeddings_name}) to Tensorboard!")
                embeddings = getattr(self.model, self.embeddings_name).weight.data
                metadata = [
                    str(self.dataset_splits.vectorizer.data_vocab._id2token[token_index]).encode('utf-8')
                    for token_index in range(embeddings.shape[0])
                ]
                self.writer.add_embedding(mat=embeddings, metadata=metadata, global_step=self.trainer.state.epoch)
def test_weights_hist_handler_whitelist(dummy_model_factory):
    model = dummy_model_factory()

    wrapper = WeightsHistHandler(model, whitelist=["fc2.weight"])
    mock_logger = MagicMock(spec=TensorboardLogger)
    mock_logger.writer = MagicMock()

    mock_engine = MagicMock()
    mock_engine.state = State()
    mock_engine.state.epoch = 5

    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)
    mock_logger.writer.add_histogram.assert_called_once_with(tag="weights/fc2/weight", values=ANY, global_step=5)
    mock_logger.writer.reset_mock()

    wrapper = WeightsHistHandler(model, tag="model", whitelist=["fc1"])
    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)
    mock_logger.writer.add_histogram.assert_has_calls(
        [
            call(tag="model/weights/fc1/weight", values=ANY, global_step=5),
            call(tag="model/weights/fc1/bias", values=ANY, global_step=5),
        ],
        any_order=True,
    )
    assert mock_logger.writer.add_histogram.call_count == 2
    mock_logger.writer.reset_mock()

    def weight_selector(n, _):
        return "bias" in n

    wrapper = WeightsHistHandler(model, tag="model", whitelist=weight_selector)
    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)
    mock_logger.writer.add_histogram.assert_has_calls(
        [
            call(tag="model/weights/fc1/bias", values=ANY, global_step=5),
            call(tag="model/weights/fc2/bias", values=ANY, global_step=5),
        ],
        any_order=True,
    )
    assert mock_logger.writer.add_histogram.call_count == 2
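# Illustrative (non-test) use of the whitelist option exercised above, assuming `trainer`, `model`,
# and `tb_logger` names as in the other examples in this collection: log histograms only for
# parameters whose names match "fc1", after each epoch.
#
#     tb_logger.attach(trainer,
#                      log_handler=WeightsHistHandler(model, whitelist=["fc1"]),
#                      event_name=Events.EPOCH_COMPLETED)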
def add_tensorboard_logging(self, logging_dir=None):
    # Add TensorBoard logging
    if logging_dir is None:
        logging_dir = os.path.join(self.config.DIRS.WORKING_DIR, 'tb_logs')
    else:
        logging_dir = os.path.join(logging_dir, 'tb_logs')
    print('Tensorboard logging saving to: {} ...'.format(logging_dir), end='')

    self.tb_logger = TensorboardLogger(log_dir=logging_dir)

    # Logging iteration loss
    self.tb_logger.attach_output_handler(
        engine=self.train_engine,
        event_name=Events.ITERATION_COMPLETED,
        tag='training',
        output_transform=lambda loss: {"batch loss": loss},
    )

    # Logging epoch training metrics
    self.tb_logger.attach_output_handler(
        engine=self.train_evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="training",
        metric_names=["loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"],
        global_step_transform=global_step_from_engine(self.train_engine),
    )

    # Logging epoch validation metrics
    self.tb_logger.attach_output_handler(
        engine=self.evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="validation",
        metric_names=["loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"],
        global_step_transform=global_step_from_engine(self.train_engine),
    )

    # Attach the logger to the trainer to log model's weights as a histogram after each epoch
    self.tb_logger.attach(self.train_engine,
                          event_name=Events.EPOCH_COMPLETED,
                          log_handler=WeightsHistHandler(self.model))

    # Attach the logger to the trainer to log model's gradients as a histogram after each epoch
    self.tb_logger.attach(self.train_engine,
                          event_name=Events.EPOCH_COMPLETED,
                          log_handler=GradsHistHandler(self.model))

    print('Tensorboard Logging...', end='')
    print('done')
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    if sys.version_info > (3,):
        from ignite.contrib.metrics.gpu_info import GpuInfo

        try:
            GpuInfo().attach(trainer)
        except RuntimeError:
            print(
                "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
                "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
                "install it : `pip install pynvml`"
            )

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)

    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
        metric_names="all",
    )

    for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    tb_logger.attach_opt_params_handler(trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer)

    tb_logger.attach(trainer, log_handler=WeightsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer, log_handler=WeightsHistHandler(model), event_name=Events.EPOCH_COMPLETED(every=100))

    tb_logger.attach(trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer, log_handler=GradsHistHandler(model), event_name=Events.EPOCH_COMPLETED(every=100))

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    tb_logger.close()
    output_transform=lambda output: {'loss': output['loss']},
    metric_names=[f"gpu:{args.gpu} mem(%)"])

# FIRE
tb_logger = TensorboardLogger(log_dir=TENSORBOARD_RUN_LOG_DIR_PATH)
tb_logger.attach(
    trainer,
    log_handler=OutputHandler(tag='training',
                              output_transform=lambda output: {'loss': output['loss']}),
    event_name=Events.ITERATION_COMPLETED(every=LOG_TRAINING_PROGRESS_EVERY_N))
tb_logger.attach(
    evaluator,
    log_handler=OutputHandler(tag='validation',
                              metric_names='all',
                              global_step_transform=global_step_from_engine(trainer)),
    event_name=Events.EPOCH_COMPLETED)
tb_logger.attach(trainer,
                 log_handler=OptimizerParamsHandler(opt),
                 event_name=Events.ITERATION_STARTED)
tb_logger.attach(trainer,
                 log_handler=WeightsHistHandler(mude),
                 event_name=Events.EPOCH_COMPLETED)

trainer.run(train_ld, max_epochs=EPOCHS)
tb_logger.close()

torch.save(mude.state_dict(), CHECKPOINTS_RUN_DIR_PATH.joinpath(f"{RUN_NAME}-last.pth"))
def setup(self, training_metrics):
    def metric_name(n) -> str:
        if n.endswith('Accuracy'):
            n = 'acc'
        else:
            n = n[:-6] if n.endswith('Metric') else n
        return n

    def print_metrics(metrics) -> str:
        rv = ''
        metric_keys = sorted(k for k in metrics)
        for k in metric_keys:
            if k == 'Accuracy':
                rv += f'{metric_name(k)}: {metrics[k]:.3}'
            else:
                rv += f'{metric_name(k)}: {metrics[k]:.6}'
        return rv

    if self.seed:
        set_seed_everywhere(self.seed, self.cuda)

    pbar = ProgressBar()
    names = []
    for k, v in training_metrics.items():
        name = f'r{k}'
        names.append(name)
        RunningAverage(v).attach(self.trainer, name)

    RunningAverage(None, output_transform=lambda x: x[-1] * self.loss_accumulation_steps).attach(self.trainer, 'rloss')
    names.append('rloss')
    pbar.attach(self.trainer, names)

    pbar = ProgressBar()
    pbar.attach(self.evaluator)

    # A few event handlers. To add / modify the event handlers, you need to extend the __init__ method of RunnerABC.
    # Ignite provides the necessary abstractions and a furnished repository of useful tools

    @self.trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        self.evaluator.run(self.dataset_splits.val_data_loader())
        metrics = self.evaluator.state.metrics
        logger.info(f"Validation Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}")
        if self.scheduler:
            self.scheduler.step(metrics[self.loss_metric.__class__.__name__])

    @self.trainer.on(Events.COMPLETED)
    def log_test_results(trainer):
        self.evaluator.run(self.dataset_splits.test_data_loader())
        metrics = self.evaluator.state.metrics
        logger.info(f"Test Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}")

    if self.tensorboard_logs:
        tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)
        tb_logger.attach(self.trainer,
                         log_handler=OutputHandler(tag="training",
                                                   output_transform=lambda loss: {'loss': loss}),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(self.evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=["LossMetric"],
                                                   another_engine=self.trainer),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=OptimizerParamsHandler(self.optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(self.trainer,
                         log_handler=WeightsScalarHandler(self.model),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=WeightsHistHandler(self.model),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=GradsScalarHandler(self.model),
                         event_name=Events.ITERATION_COMPLETED)

        # This is important to close the tensorboard file logger
        @self.trainer.on(Events.COMPLETED)
        def end_tensorboard(trainer):
            logger.info("Training completed")
            tb_logger.close()

    if self.embeddings_name:
        @self.trainer.on(Events.COMPLETED)
        def log_embeddings(trainer):
            if hasattr(self.model, self.embeddings_name) and hasattr(self.dataset_splits, "vectorizer"):
                logger.info(f"Logging embeddings ({self.embeddings_name}) to Tensorboard!")
                embeddings = getattr(self.model, self.embeddings_name).weight.data
                metadata = [
                    str(self.dataset_splits.vectorizer.data_vocab._id2token[token_index]).encode('utf-8')
                    for token_index in range(embeddings.shape[0])
                ]
                self.writer.add_embedding(mat=embeddings, metadata=metadata, global_step=self.trainer.state.epoch)
def run(warmup_iterations=5000, batch_size=4, test_size=2000, epochs=10, log_interval=100, debug_images_interval=50,
        train_dataset_ann_file='~/bigdata/coco/annotations/instances_train2017.json',
        val_dataset_ann_file='~/bigdata/coco/annotations/instances_val2017.json',
        input_checkpoint='', load_optimizer=False, load_params=False,
        output_dir="/tmp/checkpoints", log_dir="/tmp/tensorboard_logs",
        lr=0.005, momentum=0.9, weight_decay=0.0005, use_mask=True, use_toy_testing_data=False,
        backbone_name='resnet101', num_workers=6, trainable_layers=3, train_set_size=None,
        early_stopping=False, patience=3, step_size=3, gamma=0.1, record_histograms=True):
    # Set the training device to GPU if available - if not set it to CPU
    device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')
    torch.backends.cudnn.benchmark = True if torch.cuda.is_available() else False  # optimization for fixed input size

    # Write hyperparams
    hparam_dict = {
        'warmup_iterations': warmup_iterations,
        'training_batch_size': batch_size,
        'test_size': test_size,
        'epochs': epochs,
        'trainable_layers': trainable_layers,
        'lr': lr,
        'momentum': momentum,
        'weight_decay': weight_decay,
        'train_set_size': train_set_size,
        'step_size': step_size,
        'gamma': gamma,
        'early_stopping': early_stopping,
        'patience': patience,
        'total_iterations': 0,
        'total_epochs': 0,
        'timeout': True,
    }

    # Load checkpoint if available
    if input_checkpoint:
        hparam_path = Path(input_checkpoint).parent / 'hparams.pickle'
        logger.info('Loading model checkpoint from {}'.format(input_checkpoint))
        input_checkpoint = torch.load(input_checkpoint, map_location=torch.device(device))  # FIXME Bad overload
        with open(hparam_path, 'rb') as f:
            hparam_dict = pickle.load(f)

        # Load the training parameters from the saved hparam dictionary
        if load_params:
            warmup_iterations, batch_size, test_size, epochs, trainable_layers, lr, momentum, \
                weight_decay, train_set_size, step_size, gamma, early_stopping, patience = itemgetter(
                    'warmup_iterations', 'training_batch_size', 'test_size', 'epochs', 'trainable_layers', 'lr',
                    'momentum', 'weight_decay', 'train_set_size', 'step_size', 'gamma', 'early_stopping',
                    'patience')(hparam_dict)
            try:
                train_set_size -= 1
            except TypeError:
                pass

    print('Hparams: ', hparam_dict)

    # Define train and test datasets
    train_loader, val_loader, labels_enum = get_data_loaders(train_dataset_ann_file,
                                                             val_dataset_ann_file,
                                                             batch_size,
                                                             test_size,
                                                             configuration_data.get('image_size'),
                                                             use_mask=use_mask,
                                                             _use_toy_testing_set=use_toy_testing_data,
                                                             num_workers=num_workers,
                                                             train_set_size=train_set_size)

    # Hparams
    hparam_dict['training_set_size'] = len(train_loader) * batch_size
    hparam_dict['validation_set_size'] = len(val_loader) * batch_size
    with open(os.path.join(output_dir, 'hparams.pickle'), 'wb') as f:
        pickle.dump(hparam_dict, f)

    val_dataset = list(chain.from_iterable(
        zip(*copy.deepcopy(batch)) for batch in iter(val_loader)))  # TODO Figure out what this does and use deepcopy.
    coco_api_val_dataset = convert_to_coco_api(val_dataset)
    num_classes = max(labels_enum.keys()) + 1  # number of classes plus one for background class
    configuration_data['num_classes'] = num_classes
    logger.info('Training with {} classes...'.format(num_classes))

    if use_mask:
        logger.debug('Loading MaskRCNN Model...')
        model = get_model_instance_segmentation(num_classes, configuration_data.get('mask_predictor_hidden_layer'))
    else:
        logger.debug('Loading FasterRCNN Model...')
        model = get_model_instance_detection(num_classes, backbone_name=backbone_name,
                                             trainable_layers=trainable_layers)
    iou_types = get_iou_types(model)

    # if there is more than one GPU, parallelize the model
    if torch.cuda.device_count() > 1:
        logger.debug("{} GPUs were detected - we will use all of them".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # copy the model to each device
    model.to(device)

    if input_checkpoint:
        model.load_state_dict(input_checkpoint['model'])

    logger.debug('Initializing SummaryWriter...')
    if use_mask:
        comment = 'mask'
    else:
        comment = 'box-{}'.format(backbone_name)

    logger.debug('Creating Trainer...')
    # define Ignite's train and evaluation engine
    trainer = create_trainer(model, device)
    logger.debug('Creating Evaluator...')
    evaluator = create_evaluator(model, device)

    logger.debug('Initializing Tensorboard Logger...')
    tb_logger = TensorboardLogger(log_dir=log_dir, comment=comment)
    if record_histograms:
        tb_logger.attach(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=500),
            log_handler=WeightsHistHandler(model)
        )
    writer = tb_logger.writer

    logger.debug('Setting up profiler...')
    profiler = BasicTimeProfiler()
    profiler.attach(trainer)

    coco_ap = CocoAP(coco_api_val_dataset, iou_types)
    coco_ap_05 = CocoAP5(coco_api_val_dataset, iou_types)
    coco_ap_075 = CocoAP75(coco_api_val_dataset, iou_types)
    coco_ap.attach(evaluator, "AP")
    coco_ap_05.attach(evaluator, "AP0.5")
    coco_ap_075.attach(evaluator, "AP0.75")

    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag='evaluation',
            metric_names=['AP', 'AP0.5', 'AP0.75'],
            global_step_transform=global_step_from_engine(trainer)
        ),
        event_name=Events.EPOCH_COMPLETED
    )

    ## Early stopping
    def score_function(engine):
        ap_score = engine.state.metrics['AP']
        return ap_score

    if early_stopping:
        handler = EarlyStopping(patience=patience, score_function=score_function, trainer=trainer)
        # Note: the handler is attached to an *Evaluator* (runs one epoch on validation dataset).
        evaluator.add_event_handler(Events.COMPLETED, handler)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_intermediate_results():
        logger.debug('Epoch Complete...')
        profiler.print_results(profiler.get_results())

    @trainer.on(Events.STARTED)
    def on_training_started(engine):
        # construct an optimizer
        logger.info('Started Training...')
        params = [p for p in model.parameters() if p.requires_grad]
        engine.state.optimizer = torch.optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay)
        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(engine.state.optimizer),
            event_name=Events.ITERATION_STARTED
        )
        engine.state.scheduler = torch.optim.lr_scheduler.StepLR(engine.state.optimizer,
                                                                 step_size=step_size, gamma=gamma)
        if input_checkpoint:
            # Load trainer states
            trainer.state.epoch = input_checkpoint['epoch']
            if 'iteration' in input_checkpoint:
                trainer.state.iteration = input_checkpoint['iteration']
            else:
                trainer.state.iteration = int(hparam_dict['training_set_size'] / batch_size * input_checkpoint['epoch'])
            if load_optimizer:
                print('loading optimizer')
                logger.info('Loading optimizer and scheduler...')
                engine.state.optimizer.load_state_dict(input_checkpoint['optimizer'])
                engine.state.scheduler.load_state_dict(input_checkpoint['lr_scheduler'])
                engine.state.scheduler.last_epoch = trainer.state.epoch
            else:
                print('not loading optimizer')

    @trainer.on(Events.EPOCH_STARTED)
    def on_epoch_started(engine):
        logger.debug('Started Epoch...')
        model.train()
        engine.state.warmup_scheduler = None
        # TODO Print optimizer values
        if engine.state.epoch == 1:
            warmup_iters = min(warmup_iterations, len(train_loader) - 1)
            print('Warm up period was set to {} iterations'.format(warmup_iters))
            warmup_factor = 1. / warmup_iters
            engine.state.warmup_scheduler = utils.warmup_lr_scheduler(engine.state.optimizer, warmup_iters,
                                                                      warmup_factor)

    @trainer.on(Events.ITERATION_COMPLETED)
    def on_iteration_completed(engine):
        images, targets, loss_dict_reduced = engine.state.output
        if engine.state.iteration % log_interval == 0:
            loss = sum(loss for loss in loss_dict_reduced.values()).item()
            print("Epoch: {}, Iteration: {}, Loss: {}".format(engine.state.epoch, engine.state.iteration, loss))
            for k, v in loss_dict_reduced.items():
                writer.add_scalar("loss/{}".format(k), v.item(), engine.state.iteration)
            writer.add_scalar("loss/total_loss",
                              sum(loss for loss in loss_dict_reduced.values()).item(), engine.state.iteration)
            # writer.add_scalar("learning_rate/lr", engine.state.optimizer.param_groups[0]['lr'], engine.state.iteration)
        if engine.state.iteration % debug_images_interval == 0:
            for n, debug_image in enumerate(draw_debug_images(images, targets)):
                writer.add_image("training/image_{}".format(n), debug_image, engine.state.iteration,
                                 dataformats='HWC')
                if 'masks' in targets[n]:
                    writer.add_image("training/image_{}_mask".format(n), draw_mask(targets[n]),
                                     engine.state.iteration, dataformats='HW')
        images = targets = loss_dict_reduced = engine.state.output = None

    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch_completed(engine):
        logger.debug('Finished Epoch...')
        update_hparams(engine)
        engine.state.scheduler.step()
        evaluator.run(val_loader)
        # for res_type in evaluator.state.coco_evaluator.iou_types:
        #     average_precision_05 = evaluator.state.coco_evaluator.coco_eval[res_type].stats[1]
        #     writer.add_scalar("validation-{}/average precision 0_5".format(res_type), average_precision_05,
        #                       engine.state.iteration)
        checkpoint_path = os.path.join(output_dir, 'model_epoch_{}.pth'.format(engine.state.epoch))
        print('Saving model checkpoint')
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': engine.state.optimizer.state_dict(),
            'lr_scheduler': engine.state.scheduler.state_dict(),
            'epoch': engine.state.epoch,
            'iteration': engine.state.iteration,
            'configuration': configuration_data,
            'labels_enumeration': labels_enum}
        utils.save_on_master(checkpoint, checkpoint_path)
        print('Model checkpoint from epoch {} was saved at {}'.format(engine.state.epoch, checkpoint_path))
        checkpoint = None
        evaluator.state = State()

    @trainer.on(Events.COMPLETED)
    def on_training_completed(engine):
        logger.debug('Finished Training...')
        update_hparams(engine, finished=True)
        writer.add_hparams(hparam_dict=hparam_dict, metric_dict={
            'hparams/AP': coco_ap.ap,
            'hparams/AP.5': coco_ap_05.ap5,
            'hparams/AP.75': coco_ap_075.ap75
        })
        logger.debug('Wrote hparams...')

    def update_hparams(engine, finished=False):
        hparam_dict['total_iterations'] = global_step_from_engine(engine)(engine, Events.ITERATION_COMPLETED)
        hparam_dict['total_epochs'] = global_step_from_engine(engine)(engine, Events.EPOCH_COMPLETED)
        hparam_dict['timeout'] = not finished
        if hparam_dict['train_set_size'] is None:
            hparam_dict['train_set_size'] = hparam_dict['training_set_size']
        try:
            shutil.copyfile(os.path.join(output_dir, 'hparams.pickle'),
                            os.path.join(output_dir, 'hparams.pickle.backup'))
            with open(os.path.join(output_dir, 'hparams.pickle'), 'wb') as f:
                pickle.dump(hparam_dict, f)
        except AttributeError as e:
            print('Could not pickle one of the total vars.', e)
            os.replace(os.path.join(output_dir, 'hparams.pickle.backup'), os.path.join(output_dir, 'hparams.pickle'))

    @evaluator.on(Events.STARTED)
    def on_evaluation_started(engine):
        logger.debug('Started Evaluation...')
        model.eval()
        # engine.state.coco_evaluator = CocoEvaluator(coco_api_val_dataset, iou_types)

    @evaluator.on(Events.ITERATION_COMPLETED)
    def on_eval_iteration_completed(engine):
        images, targets, results = engine.state.output
        if engine.state.iteration % log_interval == 0:
            print("Evaluation: Iteration: {}".format(engine.state.iteration))
        if engine.state.iteration % debug_images_interval == 0:
            for n, debug_image in enumerate(draw_debug_images(images, targets, results)):
                print('Drawing debug image "validation/image_{}_{}"'.format(engine.state.iteration, n))
                writer.add_image("evaluation/image_{}_{}".format(engine.state.iteration, n), debug_image,
                                 trainer.state.iteration, dataformats='HWC')
                if 'masks' in targets[n]:
                    writer.add_image("validation/image_{}_{}_mask".format(engine.state.iteration, n),
                                     draw_mask(targets[n]), trainer.state.iteration, dataformats='HW')
                    curr_image_id = int(targets[n]['image_id'])
                    writer.add_image("validation/image_{}_{}_predicted_mask".format(engine.state.iteration, n),
                                     draw_mask(results[curr_image_id]).squeeze(), trainer.state.iteration,
                                     dataformats='HW')
        images = targets = results = engine.state.output = None

    @evaluator.on(Events.COMPLETED)
    def on_evaluation_completed(engine):
        logger.debug('Finished Evaluation...')
        # gather the stats from all processes
        # engine.state.coco_evaluator.synchronize_between_processes()
        #
        # # accumulate predictions from all images
        # engine.state.coco_evaluator.accumulate()
        # engine.state.coco_evaluator.summarize()
        #
        # pr_50, pr_75 = get_pr_levels(engine.state.coco_evaluator.coco_eval['bbox'])
        # TODO Bring this back
        # writer.add_hparams(hparam_dict, {
        #     'hparams/AP.5': np.mean(pr_50),
        #     'hparams/AP.75': np.mean(pr_75)
        # })

    logger.debug('Running Trainer...')
    trainer.run(train_loader, max_epochs=epochs)
    writer.close()
    profiler.write_results('{}/time_profiling.csv'.format(output_dir))
def train(epochs=500, batch_size=32, bptt_len=70, lr=0.00025, log_steps=200, clip_grad=0.25, log_dir="experiments"):
    ###################################################################
    # Dataset
    ###################################################################
    wt = wikitext103(batch_size=batch_size, bptt_len=bptt_len)
    # wt = wikitext2(batch_size=batch_size, bptt_len=bptt_len)

    ###################################################################
    # Configs
    ###################################################################
    embedding_config = DropEmbedding.Hyperparams(len(wt.text_field.vocab) + 3, ninp=512)
    encoder_config = TransformerEncoder.Hyperparams(att_num_units=[512, 512, 512, 512, 512, 512], max_ext=384)

    ###################################################################
    # Models
    ###################################################################
    base_embedding = DropEmbedding(embedding_config)
    embedding = TransformerEmbedding(embedding=base_embedding,
                                     max_length=bptt_len,
                                     embedding_size=embedding_config.ninp,
                                     use_positional_embedding=False)
    encoder = TransformerEncoder(encoder_config)
    model = TransformerLanguageModel(embedding, encoder)
    model.init_weight()

    ###################################################################
    # Loss
    ###################################################################
    criterion = lm_criterion(in_features=encoder_config.att_num_units[-1], vocab_size=len(wt.text_field.vocab))

    ###################################################################
    # Parameters + Train ops
    ###################################################################
    parameters = (list(model.parameters()) + list(criterion.parameters()))
    tot_params = 0
    for p in parameters:
        tot_params += reduce(lambda x, y: x * y, p.size())
    print("Total Parameters: ", tot_params)
    opt = optim.Adam(parameters, lr=lr)
    model.to(DEVICE)
    criterion.to(DEVICE)

    ###################################################################
    # Train + Evaluation
    ###################################################################
    def train_step(engine, batch):
        model.train()
        opt.zero_grad()

        text = batch.text.to(DEVICE).t().contiguous()
        target = batch.target.to(DEVICE).t().contiguous()

        out, out_past = model(text, engine.state.train_past)
        engine.state.train_past = out_past
        raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
        loss = raw_loss[1]

        loss.backward()
        nn.utils.clip_grad_norm_(parameters, clip_grad)
        opt.step()

        return {"train_loss": loss.item(), "train_ppl": loss.exp().item()}

    def eval_step(engine, batch):
        model.eval()

        if not hasattr(engine.state, "eval_past"):
            engine.state.eval_past = None

        with torch.no_grad():
            text = batch.text.to(DEVICE).t().contiguous()
            target = batch.target.to(DEVICE).t().contiguous()

            out, out_past = model(text, engine.state.eval_past)
            engine.state.eval_past = out_past
            raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
            loss = raw_loss[1]

            return {"val_loss": loss.item()}

    train_engine = Engine(train_step)
    eval_engine = Engine(eval_step)

    def reset_state(engine):
        engine.state.train_past = None

    def run_eval(_):
        print("start running eval")
        eval_engine.run(wt.valid_iter)
        metrics = eval_engine.state.metrics
        print("Validation loss: ", metrics["val_loss"], ", ppl: ", np.exp(metrics["val_loss"]))

    train_engine.add_event_handler(Events.EPOCH_STARTED, reset_state)
    train_engine.add_event_handler(Events.EPOCH_COMPLETED, run_eval)

    ###################################################################
    # LR Scheduler
    ###################################################################
    cosine_scheduler = CosineAnnealingScheduler(opt.param_groups[0], "lr", 0.0, 2.5e-4,
                                                cycle_size=len(wt.train_iter))
    warmup_scheduler = create_lr_scheduler_with_warmup(cosine_scheduler, 0.0, 2.5e-4, 200)
    train_engine.add_event_handler(Events.ITERATION_STARTED, warmup_scheduler)

    ###################################################################
    # Metrics
    ###################################################################
    RunningAverage(output_transform=lambda x: x["train_ppl"]).attach(train_engine, "train_ppl")
    RunningAverage(output_transform=lambda x: x["train_loss"]).attach(train_engine, "train_loss")
    RunningAverage(output_transform=lambda x: x["val_loss"]).attach(eval_engine, "val_loss")
    progress_bar = ProgressBar(persist=True)
    progress_bar.attach(train_engine, ["train_ppl", "train_loss"])
    progress_bar_val = ProgressBar(persist=True)
    progress_bar_val.attach(eval_engine, ["val_loss"])

    ###################################################################
    # Tensorboard
    ###################################################################
    tb_logger = TensorboardLogger(log_dir=log_dir)

    def stepn_logger(num_steps, handler):
        def logger_runner(engine, log_handler, event_name):
            if engine.state.iteration % num_steps == 0:
                handler(engine, log_handler, event_name)
        return logger_runner

    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps, OutputHandler(tag="training",
                                                                       output_transform=lambda loss: loss)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(eval_engine,
                     log_handler=OutputHandler(tag="validation",
                                               output_transform=lambda loss: loss,
                                               another_engine=train_engine),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps, OptimizerParamsHandler(opt)),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps, WeightsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps, GradsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, WeightsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, GradsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)

    try:
        train_engine.run(wt.train_iter, max_epochs=epochs)
    except Exception:
        pass
    finally:
        tb_logger.close()
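# Note (illustrative alternative, not from the original source): on ignite versions that support
# event filtering, the stepn_logger wrapper above can be replaced by a filtered event, as used
# elsewhere in these examples, e.g.:
#
#     tb_logger.attach(train_engine,
#                      log_handler=WeightsHistHandler(model),
#                      event_name=Events.ITERATION_COMPLETED(every=500))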
metric_names=["loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"], global_step_transform=global_step_from_engine(trainer), ) # Logging epoch validation metrics tb_logger.attach_output_handler( engine=evaluator, event_name=Events.EPOCH_COMPLETED, tag="validation", metric_names=["loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"], global_step_transform=global_step_from_engine(trainer), ) # Attach the logger to the trainer to log model's weights as a histogram after each epoch tb_logger.attach( trainer, event_name=Events.EPOCH_COMPLETED, log_handler=WeightsHistHandler(model) ) # Attach the logger to the trainer to log model's gradients as a histogram after each epoch tb_logger.attach( trainer, event_name=Events.EPOCH_COMPLETED, log_handler=GradsHistHandler(model) ) print('Tensorboard Logging...', end='') print('done') ## SETUP CALLBACKS print('[INFO] Creating callback functions for training loop...', end='') # Early Stopping - stops training if the validation loss does not decrease after 5 epochs handler = EarlyStopping(patience=early_stopping_patience, score_function=score_function_loss, trainer=trainer) evaluator.add_event_handler(Events.COMPLETED, handler)
def run(args):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)

    num_classes = 21
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = GoogLeNetFCN(num_classes)
    model.init_from_googlenet()

    device_count = torch.cuda.device_count()
    if device_count > 1:
        print("Using %d GPU(s)" % device_count)
        model = nn.DataParallel(model)
        args.batch_size = device_count * args.batch_size
        args.val_batch_size = device_count * args.val_batch_size

    model = model.to(device)

    train_loader, val_loader = get_data_loaders(args.dataset_dir, args.batch_size, args.val_batch_size,
                                                args.num_workers, args.download, args.augmentations)

    criterion = nn.CrossEntropyLoss(ignore_index=255, reduction='sum')

    optimizer = optim.SGD([{'params': [param for name, param in model.named_parameters()
                                       if name.endswith('weight')]},
                           {'params': [param for name, param in model.named_parameters()
                                       if name.endswith('bias')],
                            'lr': args.lr * 2, 'weight_decay': 0}],
                          lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['bestIoU']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("Loaded checkpoint '{}' (Epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))
            sys.exit()

    if args.freeze_bn:
        print("Freezing batch norm")
        model = freeze_batchnorm(model)

    trainer = create_supervised_trainer(model, optimizer, criterion, device, non_blocking=True)
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])

    cm = ConfusionMatrix(num_classes)
    evaluator = create_supervised_evaluator(model, metrics={'loss': Loss(criterion), 'IoU': IoU(cm)},
                                            device=device, non_blocking=True)

    pbar2 = ProgressBar(persist=True, desc='Eval Epoch')
    pbar2.attach(evaluator)

    def _global_step_transform(engine, event_name):
        return trainer.state.iteration

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag='training', metric_names=['loss']),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag='validation',
                                               metric_names=['loss', 'IoU'],
                                               global_step_transform=_global_step_transform),
                     event_name=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        iou = engine.state.metrics['IoU'] * 100.0
        mean_iou = iou.mean()

        is_best = mean_iou.item() > trainer.state.best_iou
        trainer.state.best_iou = max(mean_iou.item(), trainer.state.best_iou)

        name = 'epoch{}_mIoU={:.1f}.pth'.format(trainer.state.epoch, mean_iou)
        file = {'model': model.state_dict(), 'epoch': trainer.state.epoch,
                'iteration': engine.state.iteration, 'optimizer': optimizer.state_dict(),
                'args': args, 'bestIoU': trainer.state.best_iou}
        save(file, args.output_dir, 'checkpoint_{}'.format(name))

        if is_best:
            save(model.state_dict(), args.output_dir, 'model_{}'.format(name))

    @trainer.on(Events.STARTED)
    def initialize(engine):
        if args.resume:
            engine.state.epoch = args.start_epoch
            engine.state.iteration = args.start_epoch * len(engine.state.dataloader)
            engine.state.best_iou = best_iou
        else:
            engine.state.best_iou = 0.0

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        pbar.log_message("Start Validation - Epoch: [{}/{}]".format(engine.state.epoch, engine.state.max_epochs))
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        iou = metrics['IoU']
        mean_iou = iou.mean()

        pbar.log_message("Validation results - Epoch: [{}/{}]: Loss: {:.2e}, mIoU: {:.1f}"
                         .format(engine.state.epoch, engine.state.max_epochs, loss, mean_iou * 100.0))

    print("Start training")
    trainer.run(train_loader, max_epochs=args.epochs)
    tb_logger.close()