def test_weights_hist_handler_wrong_setup():

    with pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"):
        WeightsHistHandler(None)

    model = MagicMock(spec=torch.nn.Module)
    wrapper = WeightsHistHandler(model)
    mock_logger = MagicMock()
    mock_engine = MagicMock()
    with pytest.raises(RuntimeError, match="Handler 'WeightsHistHandler' works only with TensorboardLogger"):
        wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED)
Example #2
def add_tensorboard(engine_train, optimizer, model, log_dir):
    """Creates an ignite logger object and adds training elements such as weight and gradient histograms

    Args:
        engine_train (:obj:`ignite.engine`): the train engine to attach to the logger
        optimizer (:obj:`torch.optim`): the model's optimizer
        model (:obj:`torch.nn.Module`): the model being trained
        log_dir (string): path to where tensorboard data should be saved
    """
    # Create a logger
    tb_logger = TensorboardLogger(log_dir=log_dir)

    # Attach the logger to the trainer to log training loss at each iteration
    tb_logger.attach(engine_train,
                     log_handler=OutputHandler(
                         tag="training",
                         output_transform=lambda loss: {"loss": loss}),
                     event_name=Events.ITERATION_COMPLETED)

    # Attach the logger to the trainer to log optimizer's parameters, e.g. learning rate, after each epoch
    tb_logger.attach(engine_train,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.EPOCH_COMPLETED)

    # Attach the logger to the trainer to log model's weights as a histogram after each epoch
    tb_logger.attach(engine_train,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    # Attach the logger to the trainer to log model's gradients as a histogram after each epoch
    tb_logger.attach(engine_train,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    # Return the logger so the caller can close it once training completes;
    # closing it here would end logging before any events fire.
    return tb_logger
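
A minimal caller-side sketch for the helper above (trainer, optimizer, model, and train_loader are assumed to come from the surrounding setup):

# Hypothetical wiring; close the logger only after the run finishes.
tb_logger = add_tensorboard(trainer, optimizer, model, log_dir="tb_logs")
trainer.run(train_loader, max_epochs=10)
tb_logger.close()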
Example #3
def test_weights_hist_handler_frozen_layers(dummy_model_factory):

    model = dummy_model_factory(with_grads=True, with_frozen_layer=True)

    wrapper = WeightsHistHandler(model)
    mock_logger = MagicMock(spec=TensorboardLogger)
    mock_logger.writer = MagicMock()

    mock_engine = MagicMock()
    mock_engine.state = State()
    mock_engine.state.epoch = 5

    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)

    mock_logger.writer.add_histogram.assert_has_calls(
        [
            call(tag="weights/fc2/weight", values=ANY, global_step=5),
            call(tag="weights/fc2/bias", values=ANY, global_step=5),
        ],
        any_order=True,
    )

    with pytest.raises(AssertionError):
        mock_logger.writer.add_histogram.assert_has_calls(
            [
                call(tag="weights/fc1/weight", values=ANY, global_step=5),
                call(tag="weights/fc1/bias", values=ANY, global_step=5),
            ],
            any_order=True,
        )
    assert mock_logger.writer.add_histogram.call_count == 2
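
For reference, a minimal sketch of what the dummy_model_factory fixture could look like. This is an assumption inferred from the assertions above, not the actual fixture: two linear layers fc1 and fc2, where with_frozen_layer=True freezes fc1, which is why only fc2's histograms are logged.

import pytest
import torch


class _DummyModel(torch.nn.Module):
    # Layer names fc1/fc2 match the histogram tags asserted in these tests.
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(10, 10)
        self.fc2 = torch.nn.Linear(10, 1)


@pytest.fixture
def dummy_model_factory():
    def factory(with_grads=True, with_frozen_layer=False):
        model = _DummyModel()
        if with_grads:
            for p in model.parameters():
                p.grad = torch.zeros_like(p)
        if with_frozen_layer:
            # WeightsHistHandler skips parameters with requires_grad=False,
            # so freezing fc1 leaves only fc2 in the logs.
            for p in model.fc1.parameters():
                p.requires_grad_(False)
        return model

    return factory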
Example #4
    def _test(tag=None):
        wrapper = WeightsHistHandler(model, tag=tag)
        mock_logger = MagicMock(spec=TensorboardLogger)
        mock_logger.writer = MagicMock()

        mock_engine = MagicMock()
        mock_engine.state = State()
        mock_engine.state.epoch = 5

        wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)

        tag_prefix = f"{tag}/" if tag else ""

        assert mock_logger.writer.add_histogram.call_count == 4
        mock_logger.writer.add_histogram.assert_has_calls(
            [
                call(tag=tag_prefix + "weights/fc1/weight",
                     values=ANY,
                     global_step=5),
                call(tag=tag_prefix + "weights/fc1/bias",
                     values=ANY,
                     global_step=5),
                call(tag=tag_prefix + "weights/fc2/weight",
                     values=ANY,
                     global_step=5),
                call(tag=tag_prefix + "weights/fc2/bias",
                     values=ANY,
                     global_step=5),
            ],
            any_order=True,
        )
Example #5
    def custom_setup(self):

        if self.tensorboard_logs:
            tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)
            tb_logger.attach(self.trainer,
                             log_handler=OutputHandler(
                                 tag="training",
                                 output_transform=lambda loss: {'loss': loss}),
                             event_name=Events.ITERATION_COMPLETED)
            tb_logger.attach(self.evaluator,
                             log_handler=OutputHandler(
                                 tag="validation",
                                 metric_names=["LossMetric"],
                                 another_engine=self.trainer),
                             event_name=Events.EPOCH_COMPLETED)

            if self.optional_tensorboard_features:
                tb_logger.attach(self.trainer,
                                 log_handler=OptimizerParamsHandler(
                                     self.optimizer),
                                 event_name=Events.ITERATION_STARTED)
                tb_logger.attach(self.trainer,
                                 log_handler=WeightsScalarHandler(self.model),
                                 event_name=Events.ITERATION_COMPLETED)
                tb_logger.attach(self.trainer,
                                 log_handler=WeightsHistHandler(self.model),
                                 event_name=Events.EPOCH_COMPLETED)
                tb_logger.attach(self.trainer,
                                 log_handler=GradsScalarHandler(self.model),
                                 event_name=Events.ITERATION_COMPLETED)

            # This is important to close the tensorboard file logger
            @self.trainer.on(Events.COMPLETED)
            def end_tensorboard(trainer):
                logger.info("Training completed")
                tb_logger.close()

        if self.embeddings_name:

            @self.trainer.on(Events.COMPLETED)
            def log_embeddings(trainer):
                if hasattr(self.model, self.embeddings_name) and hasattr(
                        self.dataset_splits, "vectorizer") and TENSORBOARD:
                    logger.info(
                        f"Logging embeddings ({self.embeddings_name}) to Tensorboard!"
                    )
                    embeddings = getattr(self.model, self.embeddings_name).weight.data
                    vocab = self.dataset_splits.vectorizer.data_vocab
                    metadata = [
                        str(vocab._id2token[token_index]).encode('utf-8')
                        for token_index in range(embeddings.shape[0])
                    ]
                    self.writer.add_embedding(
                        mat=embeddings,
                        metadata=metadata,
                        global_step=self.trainer.state.epoch)
Example #6
def test_weights_hist_handler_whitelist(dummy_model_factory):
    model = dummy_model_factory()

    wrapper = WeightsHistHandler(model, whitelist=["fc2.weight"])
    mock_logger = MagicMock(spec=TensorboardLogger)
    mock_logger.writer = MagicMock()

    mock_engine = MagicMock()
    mock_engine.state = State()
    mock_engine.state.epoch = 5

    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)
    mock_logger.writer.add_histogram.assert_called_once_with(tag="weights/fc2/weight", values=ANY, global_step=5)
    mock_logger.writer.reset_mock()

    wrapper = WeightsHistHandler(model, tag="model", whitelist=["fc1"])
    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)

    mock_logger.writer.add_histogram.assert_has_calls(
        [
            call(tag="model/weights/fc1/weight", values=ANY, global_step=5),
            call(tag="model/weights/fc1/bias", values=ANY, global_step=5),
        ],
        any_order=True,
    )
    assert mock_logger.writer.add_histogram.call_count == 2
    mock_logger.writer.reset_mock()

    def weight_selector(n, _):
        return "bias" in n

    wrapper = WeightsHistHandler(model, tag="model", whitelist=weight_selector)
    wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED)

    mock_logger.writer.add_histogram.assert_has_calls(
        [
            call(tag="model/weights/fc1/bias", values=ANY, global_step=5),
            call(tag="model/weights/fc2/bias", values=ANY, global_step=5),
        ],
        any_order=True,
    )
    assert mock_logger.writer.add_histogram.call_count == 2
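
The same whitelist forms work outside of tests; a minimal sketch, assuming tb_logger, trainer, and model from the surrounding setup:

# Log histograms only for fc1's parameters, grouped under the "model" tag.
tb_logger.attach(trainer,
                 log_handler=WeightsHistHandler(model, tag="model", whitelist=["fc1"]),
                 event_name=Events.EPOCH_COMPLETED)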
Example #7
    def add_tensorboard_logging(self, logging_dir=None):

        # Add TensorBoard logging
        if logging_dir is None:
            logging_dir = os.path.join(self.config.DIRS.WORKING_DIR, 'tb_logs')
        else:
            logging_dir = os.path.join(logging_dir, 'tb_logs')
        print('Tensorboard logging saving to: {} ...'.format(logging_dir),
              end='')

        self.tb_logger = TensorboardLogger(log_dir=logging_dir)
        # Logging iteration loss
        self.tb_logger.attach_output_handler(
            engine=self.train_engine,
            event_name=Events.ITERATION_COMPLETED,
            tag='training',
            output_transform=lambda loss: {"batch loss": loss})
        # Logging epoch training metrics
        self.tb_logger.attach_output_handler(
            engine=self.train_evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag="training",
            metric_names=[
                "loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"
            ],
            global_step_transform=global_step_from_engine(self.train_engine),
        )
        # Logging epoch validation metrics
        self.tb_logger.attach_output_handler(
            engine=self.evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag="validation",
            metric_names=[
                "loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"
            ],
            global_step_transform=global_step_from_engine(self.train_engine),
        )
        # Attach the logger to the trainer to log model's weights as a histogram after each epoch
        self.tb_logger.attach(self.train_engine,
                              event_name=Events.EPOCH_COMPLETED,
                              log_handler=WeightsHistHandler(self.model))
        # Attach the logger to the trainer to log model's gradients as a histogram after each epoch
        self.tb_logger.attach(self.train_engine,
                              event_name=Events.EPOCH_COMPLETED,
                              log_handler=GradsHistHandler(self.model))
        print('Tensorboard Logging...', end='')
        print('done')
Example #8
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    if sys.version_info > (3, ):
        from ignite.contrib.metrics.gpu_info import GpuInfo

        try:
            GpuInfo().attach(trainer)
        except RuntimeError:
            print(
                "INFO: This example can log GPU information (used memory, utilization), "
                "but the pynvml package is not installed, so GPU information won't be logged. "
                "To enable it, install pynvml: `pip install pynvml`")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)

    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
        metric_names="all",
    )

    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    tb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    tb_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    tb_logger.close()
Example #9
                output_transform=lambda output: {'loss': output['loss']},
                metric_names=[f"gpu:{args.gpu} mem(%)"])

    # FIRE
    tb_logger = TensorboardLogger(log_dir=TENSORBOARD_RUN_LOG_DIR_PATH)
    tb_logger.attach(
        trainer,
        log_handler=OutputHandler(
            tag='training',
            output_transform=lambda output: {'loss': output['loss']}),
        event_name=Events.ITERATION_COMPLETED(
            every=LOG_TRAINING_PROGRESS_EVERY_N))
    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag='validation',
            metric_names='all',
            global_step_transform=global_step_from_engine(trainer)),
        event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(opt),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(mude),
                     event_name=Events.EPOCH_COMPLETED)

    trainer.run(train_ld, max_epochs=EPOCHS)
    tb_logger.close()
    torch.save(mude.state_dict(),
               CHECKPOINTS_RUN_DIR_PATH.joinpath(f"{RUN_NAME}-last.pth"))
Example #10
    def setup(self, training_metrics):
        def metric_name(n) -> str:
            if n.endswith('Accuracy'):
                n = 'acc'
            else:
                n = n[:-6] if n.endswith('Metric') else n
            return n

        def print_metrics(metrics) -> str:
            # Separate consecutive metrics with a space so they don't run together.
            rv = ''
            metric_keys = sorted(k for k in metrics)
            for k in metric_keys:
                if k == 'Accuracy':
                    rv += f'{metric_name(k)}: {metrics[k]:.3} '
                else:
                    rv += f'{metric_name(k)}: {metrics[k]:.6} '
            return rv.strip()

        if self.seed:
            set_seed_everywhere(self.seed, self.cuda)

        pbar = ProgressBar()

        names = []
        for k, v in training_metrics.items():
            name = f'r{k}'
            names.append(name)
            RunningAverage(v).attach(self.trainer, name)
        RunningAverage(
            None,
            output_transform=lambda x: x[-1] * self.loss_accumulation_steps
        ).attach(self.trainer, 'rloss')
        names.append('rloss')
        pbar.attach(self.trainer, names)

        pbar = ProgressBar()
        pbar.attach(self.evaluator)

        # A few event handlers. To add or modify event handlers, extend the __init__ method of RunnerABC.
        # Ignite provides the necessary abstractions and a rich collection of useful tools.

        @self.trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(trainer):
            self.evaluator.run(self.dataset_splits.val_data_loader())
            metrics = self.evaluator.state.metrics
            logger.info(
                f"Validation Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
            )

            if self.scheduler:
                self.scheduler.step(
                    metrics[self.loss_metric.__class__.__name__])

        @self.trainer.on(Events.COMPLETED)
        def log_test_results(trainer):
            self.evaluator.run(self.dataset_splits.test_data_loader())
            metrics = self.evaluator.state.metrics
            logger.info(
                f"Test Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}"
            )

        if self.tensorboard_logs:
            tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)
            tb_logger.attach(self.trainer,
                             log_handler=OutputHandler(
                                 tag="training",
                                 output_transform=lambda loss: {'loss': loss}),
                             event_name=Events.ITERATION_COMPLETED)
            tb_logger.attach(self.evaluator,
                             log_handler=OutputHandler(
                                 tag="validation",
                                 metric_names=["LossMetric"],
                                 another_engine=self.trainer),
                             event_name=Events.EPOCH_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=OptimizerParamsHandler(
                                 self.optimizer),
                             event_name=Events.ITERATION_STARTED)
            tb_logger.attach(self.trainer,
                             log_handler=WeightsScalarHandler(self.model),
                             event_name=Events.ITERATION_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=WeightsHistHandler(self.model),
                             event_name=Events.EPOCH_COMPLETED)
            tb_logger.attach(self.trainer,
                             log_handler=GradsScalarHandler(self.model),
                             event_name=Events.ITERATION_COMPLETED)

            # This is important to close the tensorboard file logger
            @self.trainer.on(Events.COMPLETED)
            def end_tensorboard(trainer):
                logger.info("Training completed")
                tb_logger.close()

        if self.embeddings_name:

            @self.trainer.on(Events.COMPLETED)
            def log_embeddings(trainer):
                if hasattr(self.model, self.embeddings_name) and hasattr(
                        self.dataset_splits, "vectorizer"):
                    logger.info(
                        f"Logging embeddings ({self.embeddings_name}) to Tensorboard!"
                    )
                    embeddings = getattr(self.model, self.embeddings_name).weight.data
                    vocab = self.dataset_splits.vectorizer.data_vocab
                    metadata = [
                        str(vocab._id2token[token_index]).encode('utf-8')
                        for token_index in range(embeddings.shape[0])
                    ]
                    self.writer.add_embedding(
                        mat=embeddings,
                        metadata=metadata,
                        global_step=self.trainer.state.epoch)
Example #11
def run(warmup_iterations=5000, batch_size=4, test_size=2000, epochs=10, log_interval=100, debug_images_interval=50,
        train_dataset_ann_file='~/bigdata/coco/annotations/instances_train2017.json',
        val_dataset_ann_file='~/bigdata/coco/annotations/instances_val2017.json', input_checkpoint='',
        load_optimizer=False, load_params=False, output_dir="/tmp/checkpoints", log_dir="/tmp/tensorboard_logs",
        lr=0.005, momentum=0.9,
        weight_decay=0.0005, use_mask=True, use_toy_testing_data=False, backbone_name='resnet101', num_workers=6,
        trainable_layers=3, train_set_size=None, early_stopping=False, patience=3, step_size=3, gamma=0.1,
        record_histograms=True):
    # Set the training device to GPU if available - if not set it to CPU
    device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')
    torch.backends.cudnn.benchmark = True if torch.cuda.is_available() else False  # optimization for fixed input size

    # Write hyperparams
    hparam_dict = {
        'warmup_iterations': warmup_iterations,
        'training_batch_size': batch_size,
        'test_size': test_size,
        'epochs': epochs,
        'trainable_layers': trainable_layers,
        'lr': lr,
        'momentum': momentum,
        'weight_decay': weight_decay,
        'train_set_size': train_set_size,
        'step_size': step_size,
        'gamma': gamma,
        'early_stopping': early_stopping,
        'patience': patience,
        'total_iterations': 0,
        'total_epochs': 0,
        'timeout': True,
    }

    # Load checkpoint if available
    if input_checkpoint:
        hparam_path = Path(input_checkpoint).parent / 'hparams.pickle'

        logger.info('Loading model checkpoint from {}'.format(input_checkpoint))
        input_checkpoint = torch.load(input_checkpoint, map_location=torch.device(device))  # FIXME Bad overload

        with open(hparam_path, 'rb') as f:
            hparam_dict = pickle.load(f)

        # Load the training parameters from the saved hparam dictionary
        if load_params:
            warmup_iterations, batch_size, test_size, epochs, trainable_layers, lr, momentum,\
            weight_decay, train_set_size, step_size, gamma, early_stopping, patience = itemgetter(
                'warmup_iterations', 'training_batch_size', 'test_size', 'epochs', 'trainable_layers',
                'lr', 'momentum', 'weight_decay', 'train_set_size', 'step_size', 'gamma', 'early_stopping',
                'patience')(hparam_dict)
            try:
                train_set_size -= 1
            except TypeError:
                pass  # train_set_size may be None, meaning the full training set

    print('Hparams: ', hparam_dict)

    # Define train and test datasets
    train_loader, val_loader, labels_enum = get_data_loaders(train_dataset_ann_file,
                                                             val_dataset_ann_file,
                                                             batch_size,
                                                             test_size,
                                                             configuration_data.get('image_size'),
                                                             use_mask=use_mask,
                                                             _use_toy_testing_set=use_toy_testing_data,
                                                             num_workers=num_workers,
                                                             train_set_size=train_set_size)

    # Hparams
    hparam_dict['training_set_size'] = len(train_loader) * batch_size
    hparam_dict['validation_set_size'] = len(val_loader) * batch_size

    with open(os.path.join(output_dir, 'hparams.pickle'), 'wb') as f:
        pickle.dump(hparam_dict, f)

    val_dataset = list(chain.from_iterable(
        zip(*copy.deepcopy(batch)) for batch in iter(val_loader)))  # TODO Figure out what this does and use deepcopy.
    coco_api_val_dataset = convert_to_coco_api(val_dataset)
    num_classes = max(labels_enum.keys()) + 1  # number of classes plus one for background class
    configuration_data['num_classes'] = num_classes

    logger.info('Training with {} classes...'.format(num_classes))

    if use_mask:
        logger.debug('Loading MaskRCNN Model...')
        model = get_model_instance_segmentation(num_classes, configuration_data.get('mask_predictor_hidden_layer'))
    else:
        logger.debug('Loading FasterRCNN Model...')
        model = get_model_instance_detection(num_classes, backbone_name=backbone_name,
                                             trainable_layers=trainable_layers)
    iou_types = get_iou_types(model)

    # if there is more than one GPU, parallelize the model
    if torch.cuda.device_count() > 1:
        logger.debug("{} GPUs were detected - we will use all of them".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # copy the model to each device
    model.to(device)

    if input_checkpoint:
        model.load_state_dict(input_checkpoint['model'])

    logger.debug('Initializing SummaryWriter...')
    if use_mask:
        comment = 'mask'
    else:
        comment = 'box-{}'.format(backbone_name)

    logger.debug('Creating Trainer...')
    # define Ignite's train and evaluation engine
    trainer = create_trainer(model, device)
    logger.debug('Creating Evaluator...')
    evaluator = create_evaluator(model, device)

    logger.debug('Initializing Tensorboard Logger...')
    tb_logger = TensorboardLogger(log_dir=log_dir, comment=comment)
    if record_histograms:
        tb_logger.attach(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=500),
            log_handler=WeightsHistHandler(model)
        )
    writer = tb_logger.writer

    logger.debug('Setting up profiler...')
    profiler = BasicTimeProfiler()
    profiler.attach(trainer)

    coco_ap = CocoAP(coco_api_val_dataset, iou_types)
    coco_ap_05 = CocoAP5(coco_api_val_dataset, iou_types)
    coco_ap_075 = CocoAP75(coco_api_val_dataset, iou_types)
    coco_ap.attach(evaluator, "AP")
    coco_ap_05.attach(evaluator, "AP0.5")
    coco_ap_075.attach(evaluator, "AP0.75")

    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag='evaluation',
            metric_names=['AP', 'AP0.5', 'AP0.75'],
            global_step_transform=global_step_from_engine(trainer)
        ),
        event_name=Events.EPOCH_COMPLETED
    )

    ## Early stopping
    def score_function(engine):
        ap_score = engine.state.metrics['AP']
        return ap_score

    if early_stopping:
        handler = EarlyStopping(patience=patience, score_function=score_function, trainer=trainer)
        # Note: the handler is attached to an *Evaluator* (runs one epoch on validation dataset).
        evaluator.add_event_handler(Events.COMPLETED, handler)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_intermediate_results():
        logger.debug('Epoch Complete...')
        profiler.print_results(profiler.get_results())

    @trainer.on(Events.STARTED)
    def on_training_started(engine):
        # construct an optimizer
        logger.info('Started Training...')
        params = [p for p in model.parameters() if p.requires_grad]
        engine.state.optimizer = torch.optim.SGD(params,
                                                 lr=lr,
                                                 momentum=momentum,
                                                 weight_decay=weight_decay)

        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(engine.state.optimizer),
            event_name=Events.ITERATION_STARTED
        )

        engine.state.scheduler = torch.optim.lr_scheduler.StepLR(engine.state.optimizer, step_size=step_size,
                                                                 gamma=gamma)
        if input_checkpoint:
            # Load trainer state
            trainer.state.epoch = input_checkpoint['epoch']
            if 'iteration' in input_checkpoint:
                trainer.state.iteration = input_checkpoint['iteration']
            else:
                trainer.state.iteration = int(hparam_dict['training_set_size'] / batch_size * input_checkpoint['epoch'])

            if load_optimizer:
                print('loading optimizer')
                logger.info('Loading optimizer and scheduler...')
                engine.state.optimizer.load_state_dict(input_checkpoint['optimizer'])
                engine.state.scheduler.load_state_dict(input_checkpoint['lr_scheduler'])
                engine.state.scheduler.last_epoch = trainer.state.epoch
            else:
                print('not loading optimizer')


    @trainer.on(Events.EPOCH_STARTED)
    def on_epoch_started(engine):
        logger.debug('Started Epoch...')
        model.train()
        engine.state.warmup_scheduler = None

        #TODO Print optimizer values

        if engine.state.epoch == 1:
            warmup_iters = min(warmup_iterations, len(train_loader) - 1)
            print('Warm up period was set to {} iterations'.format(warmup_iters))
            warmup_factor = 1. / warmup_iters
            engine.state.warmup_scheduler = utils.warmup_lr_scheduler(engine.state.optimizer, warmup_iters,
                                                                      warmup_factor)

    @trainer.on(Events.ITERATION_COMPLETED)
    def on_iteration_completed(engine):
        images, targets, loss_dict_reduced = engine.state.output
        if engine.state.iteration % log_interval == 0:
            loss = sum(loss for loss in loss_dict_reduced.values()).item()
            print("Epoch: {}, Iteration: {}, Loss: {}".format(engine.state.epoch, engine.state.iteration, loss))
            for k, v in loss_dict_reduced.items():
                writer.add_scalar("loss/{}".format(k), v.item(), engine.state.iteration)
            writer.add_scalar("loss/total_loss", sum(loss for loss in loss_dict_reduced.values()).item(),
                              engine.state.iteration)
            # writer.add_scalar("learning_rate/lr", engine.state.optimizer.param_groups[0]['lr'], engine.state.iteration)

        if engine.state.iteration % debug_images_interval == 0:
            for n, debug_image in enumerate(draw_debug_images(images, targets)):
                writer.add_image("training/image_{}".format(n), debug_image, engine.state.iteration, dataformats='HWC')
                if 'masks' in targets[n]:
                    writer.add_image("training/image_{}_mask".format(n),
                                     draw_mask(targets[n]), engine.state.iteration, dataformats='HW')
        images = targets = loss_dict_reduced = engine.state.output = None

    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch_completed(engine):
        logger.debug('Finished Epoch...')
        update_hparams(engine)

        engine.state.scheduler.step()
        evaluator.run(val_loader)
        # for res_type in evaluator.state.coco_evaluator.iou_types:
        #     average_precision_05 = evaluator.state.coco_evaluator.coco_eval[res_type].stats[1]
        #     writer.add_scalar("validation-{}/average precision 0_5".format(res_type), average_precision_05,
        #                       engine.state.iteration)
        checkpoint_path = os.path.join(output_dir, 'model_epoch_{}.pth'.format(engine.state.epoch))
        print('Saving model checkpoint')
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': engine.state.optimizer.state_dict(),
            'lr_scheduler': engine.state.scheduler.state_dict(),
            'epoch': engine.state.epoch,
            'iteration': engine.state.iteration,
            'configuration': configuration_data,
            'labels_enumeration': labels_enum}
        utils.save_on_master(checkpoint, checkpoint_path)
        print('Model checkpoint from epoch {} was saved at {}'.format(engine.state.epoch, checkpoint_path))
        checkpoint = None
        evaluator.state = State()

    @trainer.on(Events.COMPLETED)
    def on_training_completed(engine):
        logger.debug('Finished Training...')
        update_hparams(engine, finished=True)

        writer.add_hparams(hparam_dict=hparam_dict, metric_dict={
            'hparams/AP': coco_ap.ap,
            'hparams/AP.5': coco_ap_05.ap5,
            'hparams/AP.75': coco_ap_075.ap75
        })
        logger.debug('Wrote hparams...')

    def update_hparams(engine, finished=False):
        hparam_dict['total_iterations'] = global_step_from_engine(engine)(engine, Events.ITERATION_COMPLETED)
        hparam_dict['total_epochs'] = global_step_from_engine(engine)(engine, Events.EPOCH_COMPLETED)
        hparam_dict['timeout'] = not finished

        if hparam_dict['train_set_size'] is None:
            hparam_dict['train_set_size'] = hparam_dict['training_set_size']

        try:
            shutil.copyfile(os.path.join(output_dir, 'hparams.pickle'),
                            os.path.join(output_dir, 'hparams.pickle.backup'))
            with open(os.path.join(output_dir, 'hparams.pickle'), 'wb') as f:
                pickle.dump(hparam_dict, f)
        except AttributeError as e:
            print('Could not pickle one of the total vars.', e)
            os.replace(os.path.join(output_dir, 'hparams.pickle.backup'), os.path.join(output_dir, 'hparams.pickle'))

    @evaluator.on(Events.STARTED)
    def on_evaluation_started(engine):
        logger.debug('Started Evaluation...')
        model.eval()
        # engine.state.coco_evaluator = CocoEvaluator(coco_api_val_dataset, iou_types)

    @evaluator.on(Events.ITERATION_COMPLETED)
    def on_eval_iteration_completed(engine):
        images, targets, results = engine.state.output
        if engine.state.iteration % log_interval == 0:
            print("Evaluation: Iteration: {}".format(engine.state.iteration))

        if engine.state.iteration % debug_images_interval == 0:
            for n, debug_image in enumerate(draw_debug_images(images, targets, results)):
                print('Drawing debug image "validation/image_{}_{}"'.format(engine.state.iteration, n))
                writer.add_image("evaluation/image_{}_{}".format(engine.state.iteration, n),
                                 debug_image, trainer.state.iteration, dataformats='HWC')
                if 'masks' in targets[n]:
                    writer.add_image("validation/image_{}_{}_mask".format(engine.state.iteration, n),
                                     draw_mask(targets[n]), trainer.state.iteration, dataformats='HW')
                    curr_image_id = int(targets[n]['image_id'])
                    writer.add_image("validation/image_{}_{}_predicted_mask".format(engine.state.iteration, n),
                                     draw_mask(results[curr_image_id]).squeeze(), trainer.state.iteration,
                                     dataformats='HW')
        images = targets = results = engine.state.output = None

    @evaluator.on(Events.COMPLETED)
    def on_evaluation_completed(engine):
        logger.debug('Finished Evaluation...')
        # gather the stats from all processes
        # engine.state.coco_evaluator.synchronize_between_processes()
        #
        # # accumulate predictions from all images
        # engine.state.coco_evaluator.accumulate()
        # engine.state.coco_evaluator.summarize()
        #
        # pr_50, pr_75 = get_pr_levels(engine.state.coco_evaluator.coco_eval['bbox'])
        # TODO Bring this back
        # writer.add_hparams(hparam_dict, {
        #     'hparams/AP.5': np.mean(pr_50),
        #     'hparams/AP.75': np.mean(pr_75)
        # })

    logger.debug('Running Trainer...')
    trainer.run(train_loader, max_epochs=epochs)
    writer.close()

    profiler.write_results('{}/time_profiling.csv'.format(output_dir))
Example #12
def train(epochs=500,
          batch_size=32,
          bptt_len=70,
          lr=0.00025,
          log_steps=200,
          clip_grad=0.25,
          log_dir="experiments"):
    ###################################################################
    # Dataset
    ###################################################################
    wt = wikitext103(batch_size=batch_size, bptt_len=bptt_len)
    # wt = wikitext2(batch_size=batch_size, bptt_len=bptt_len)

    ###################################################################
    # Configs
    ###################################################################
    embedding_config = DropEmbedding.Hyperparams(len(wt.text_field.vocab) + 3,
                                                 ninp=512)
    encoder_config = TransformerEncoder.Hyperparams(
        att_num_units=[512, 512, 512, 512, 512, 512], max_ext=384)

    ###################################################################
    # Models
    ###################################################################
    base_embedding = DropEmbedding(embedding_config)
    embedding = TransformerEmbedding(embedding=base_embedding,
                                     max_length=bptt_len,
                                     embedding_size=embedding_config.ninp,
                                     use_positional_embedding=False)
    encoder = TransformerEncoder(encoder_config)
    model = TransformerLanguageModel(embedding, encoder)
    model.init_weight()

    ###################################################################
    # Loss
    ###################################################################
    criterion = lm_criterion(in_features=encoder_config.att_num_units[-1],
                             vocab_size=len(wt.text_field.vocab))

    ###################################################################
    # Parameters + Train ops
    ###################################################################
    parameters = (list(model.parameters()) + list(criterion.parameters()))
    tot_params = 0
    for p in parameters:
        tot_params += reduce(lambda x, y: x * y, p.size())
    print("Total Parameters: ", tot_params)
    opt = optim.Adam(parameters, lr=lr)
    model.to(DEVICE)
    criterion.to(DEVICE)

    ###################################################################
    # Train + Evaluation
    ###################################################################
    def train_step(engine, batch):
        model.train()
        opt.zero_grad()

        text = batch.text.to(DEVICE).t().contiguous()
        target = batch.target.to(DEVICE).t().contiguous()

        out, out_past = model(text, engine.state.train_past)
        engine.state.train_past = out_past
        raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
        loss = raw_loss[1]

        loss.backward()
        nn.utils.clip_grad_norm_(parameters, clip_grad)
        opt.step()

        return {"train_loss": loss.item(), "train_ppl": loss.exp().item()}

    def eval_step(engine, batch):
        model.eval()

        if not hasattr(engine.state, "eval_past"):
            engine.state.eval_past = None

        with torch.no_grad():
            text = batch.text.to(DEVICE).t().contiguous()
            target = batch.target.to(DEVICE).t().contiguous()

            out, out_past = model(text, engine.state.eval_past)
            engine.state.eval_past = out_past
            raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
            loss = raw_loss[1]

            return {"val_loss": loss.item()}

    train_engine = Engine(train_step)
    eval_engine = Engine(eval_step)

    def reset_state(engine):
        engine.state.train_past = None

    def run_eval(_):
        print("start running eval")
        eval_engine.run(wt.valid_iter)
        metrics = eval_engine.state.metrics
        print("Validation loss: ", metrics["val_loss"], ", ppl: ",
              np.exp(metrics["val_loss"]))

    train_engine.add_event_handler(Events.EPOCH_STARTED, reset_state)
    train_engine.add_event_handler(Events.EPOCH_COMPLETED, run_eval)

    ###################################################################
    # LR Scheduler
    ###################################################################
    cosine_scheduler = CosineAnnealingScheduler(opt.param_groups[0],
                                                "lr",
                                                0.0,
                                                2.5e-4,
                                                cycle_size=len(wt.train_iter))
    warmup_scheduler = create_lr_scheduler_with_warmup(cosine_scheduler, 0.0,
                                                       2.5e-4, 200)
    train_engine.add_event_handler(Events.ITERATION_STARTED, warmup_scheduler)

    ###################################################################
    # Metrics
    ###################################################################
    RunningAverage(output_transform=lambda x: x["train_ppl"]).attach(
        train_engine, "train_ppl")
    RunningAverage(output_transform=lambda x: x["train_loss"]).attach(
        train_engine, "train_loss")
    RunningAverage(output_transform=lambda x: x["val_loss"]).attach(
        eval_engine, "val_loss")
    progress_bar = ProgressBar(persist=True)
    progress_bar.attach(train_engine, ["train_ppl", "train_loss"])
    progress_bar_val = ProgressBar(persist=True)
    progress_bar_val.attach(eval_engine, ["val_loss"])

    ###################################################################
    # Tensorboard
    ###################################################################
    tb_logger = TensorboardLogger(log_dir=log_dir)

    def stepn_logger(num_steps, handler):
        def logger_runner(engine, log_handler, event_name):
            if engine.state.iteration % num_steps == 0:
                handler(engine, log_handler, event_name)

        return logger_runner

    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(
                         log_steps,
                         OutputHandler(tag="training",
                                       output_transform=lambda loss: loss)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(eval_engine,
                     log_handler=OutputHandler(
                         tag="validation",
                         output_transform=lambda loss: loss,
                         another_engine=train_engine),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              OptimizerParamsHandler(opt)),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              WeightsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              GradsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, WeightsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, GradsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)

    try:
        train_engine.run(wt.train_iter, max_epochs=epochs)
    finally:
        # Always close the logger, even if training raises.
        tb_logger.close()
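
Note that the stepn_logger wrapper above predates ignite's built-in event filtering; on recent ignite versions the same every-N behaviour can be written directly, as Example #8 does:

# Equivalent attachment using the every= event filter instead of stepn_logger:
tb_logger.attach(train_engine,
                 log_handler=WeightsHistHandler(model),
                 event_name=Events.ITERATION_COMPLETED(every=500))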
Example #13
    metric_names=["loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"],
    global_step_transform=global_step_from_engine(trainer),
)
# Logging epoch validation metrics
tb_logger.attach_output_handler(
    engine=evaluator,
    event_name=Events.EPOCH_COMPLETED,
    tag="validation",
    metric_names=["loss", "accuracy", "precision", "recall", "f1", "topKCatAcc"],
    global_step_transform=global_step_from_engine(trainer),
)
# Attach the logger to the trainer to log model's weights as a histogram after each epoch
tb_logger.attach(
    trainer,
    event_name=Events.EPOCH_COMPLETED,
    log_handler=WeightsHistHandler(model)
)
# Attach the logger to the trainer to log model's gradients as a histogram after each epoch
tb_logger.attach(
    trainer,
    event_name=Events.EPOCH_COMPLETED,
    log_handler=GradsHistHandler(model)
)
print('Tensorboard Logging...', end='')
print('done')

## SETUP CALLBACKS
print('[INFO] Creating callback functions for training loop...', end='')
# Early Stopping - stops training if the validation loss does not decrease for early_stopping_patience epochs
handler = EarlyStopping(patience=early_stopping_patience, score_function=score_function_loss, trainer=trainer)
evaluator.add_event_handler(Events.COMPLETED, handler)
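
score_function_loss is not defined in this snippet; a minimal sketch consistent with its use here (EarlyStopping treats higher scores as better, so the validation loss is negated):

def score_function_loss(engine):
    # Stop training when the negated validation loss stops improving.
    return -engine.state.metrics['loss']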
Example #14
def run(args):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)

    num_classes = 21
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = GoogLeNetFCN(num_classes)
    model.init_from_googlenet()

    device_count = torch.cuda.device_count()
    if device_count > 1:
        print("Using %d GPU(s)" % device_count)
        model = nn.DataParallel(model)
        args.batch_size = device_count * args.batch_size
        args.val_batch_size = device_count * args.val_batch_size

    model = model.to(device)

    train_loader, val_loader = get_data_loaders(
        args.dataset_dir, args.batch_size, args.val_batch_size,
        args.num_workers, args.download, args.augmentations)

    criterion = nn.CrossEntropyLoss(ignore_index=255, reduction='sum')

    # Biases get twice the learning rate and no weight decay.
    optimizer = optim.SGD(
        [{
            'params': [param for name, param in model.named_parameters()
                       if name.endswith('weight')]
        }, {
            'params': [param for name, param in model.named_parameters()
                       if name.endswith('bias')],
            'lr': args.lr * 2,
            'weight_decay': 0
        }],
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['bestIoU']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("Loaded checkpoint '{}' (Epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))
            sys.exit()

    if args.freeze_bn:
        print("Freezing batch norm")
        model = freeze_batchnorm(model)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device,
                                        non_blocking=True)

    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])

    cm = ConfusionMatrix(num_classes)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'loss': Loss(criterion),
                                                'IoU': IoU(cm)
                                            },
                                            device=device,
                                            non_blocking=True)

    pbar2 = ProgressBar(persist=True, desc='Eval Epoch')
    pbar2.attach(evaluator)

    def _global_step_transform(engine, event_name):
        return trainer.state.iteration

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag='training',
                                               metric_names=['loss']),
                     event_name=Events.ITERATION_COMPLETED)

    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)

    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(
                         tag='validation',
                         metric_names=['loss', 'IoU'],
                         global_step_transform=_global_step_transform),
                     event_name=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        iou = engine.state.metrics['IoU'] * 100.0
        mean_iou = iou.mean()

        is_best = mean_iou.item() > trainer.state.best_iou
        trainer.state.best_iou = max(mean_iou.item(), trainer.state.best_iou)

        name = 'epoch{}_mIoU={:.1f}.pth'.format(trainer.state.epoch, mean_iou)
        file = {
            'model': model.state_dict(),
            'epoch': trainer.state.epoch,
            'iteration': engine.state.iteration,
            'optimizer': optimizer.state_dict(),
            'args': args,
            'bestIoU': trainer.state.best_iou
        }

        save(file, args.output_dir, 'checkpoint_{}'.format(name))
        if is_best:
            save(model.state_dict(), args.output_dir, 'model_{}'.format(name))

    @trainer.on(Events.STARTED)
    def initialize(engine):
        if args.resume:
            engine.state.epoch = args.start_epoch
            engine.state.iteration = args.start_epoch * len(
                engine.state.dataloader)
            engine.state.best_iou = best_iou
        else:
            engine.state.best_iou = 0.0

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        pbar.log_message("Start Validation - Epoch: [{}/{}]".format(
            engine.state.epoch, engine.state.max_epochs))
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        iou = metrics['IoU']
        mean_iou = iou.mean()

        pbar.log_message(
            "Validation results - Epoch: [{}/{}]: Loss: {:.2e}, mIoU: {:.1f}".
            format(engine.state.epoch, engine.state.max_epochs, loss,
                   mean_iou * 100.0))

    print("Start training")
    trainer.run(train_loader, max_epochs=args.epochs)
    tb_logger.close()