Beispiel #1
0
def run(train_loader, val_loader, epochs, lr, momentum, weight_decay, lr_step,
        k1, k2, es_patience, log_dir):
    """Train a ``Vgg16`` model with SGD and validate it after every epoch.

    Builds an ignite training engine and a validation engine, attaches
    TensorBoard logging, progress bars, best-model checkpointing and early
    stopping, then runs training.

    Args:
        train_loader: Iterable of training batches handed to ``trainer.run``.
        val_loader: Iterable of validation batches handed to the evaluator.
        epochs: Maximum number of training epochs.
        lr: Initial SGD learning rate.
        momentum: SGD momentum.
        weight_decay: SGD weight decay.
        lr_step: The exponential LR decay (gamma=0.975) is applied once every
            ``lr_step`` training iterations.
        k1: Unused here; only referenced by the commented-out ``VAELoss``.
        k2: Unused here; see ``k1``.
        es_patience: Patience passed to ``EarlyStopping``.
        log_dir: Root directory; TensorBoard logs and checkpoints are written
            to ``<log_dir>/vgg_vae/<timestamp>``.
    """
    model = Vgg16()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
                          momentum=momentum,
                          weight_decay=weight_decay)

    lr_scheduler = ExponentialLR(optimizer, gamma=0.975)

    # criterion = VAELoss(k1=k1, k2=k2).to(device)

    def update_fn(engine, batch):
        """Run a single optimisation step and return the batch loss."""
        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        optimizer.zero_grad()
        output = model(x)

        # NOTE(review): nll_loss expects log-probabilities; presumably Vgg16
        # ends in a log-softmax layer - confirm against the model definition.
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()

        return {
            "batchloss": loss.item(),
        }

    trainer = Engine(update_fn)

    try:
        GpuInfo().attach(trainer)
    except RuntimeError:
        print(
            "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
            "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
            "install it : `pip install pynvml`")

    # Decay the learning rate every `lr_step` iterations rather than per epoch.
    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=lr_step),
                              lambda engine: lr_scheduler.step())

    metric_names = [
        'batchloss',
    ]

    def output_transform(x, name):
        """Pick the entry called ``name`` out of the update_fn output dict."""
        return x[name]

    for n in metric_names:
        # Running average of the batch loss; epoch_bound=False keeps the
        # average going across epoch boundaries.
        RunningAverage(output_transform=partial(output_transform, name=n),
                       epoch_bound=False,
                       device=device).attach(trainer, n)

    exp_name = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_path = log_dir + "/vgg_vae/{}".format(exp_name)

    tb_logger = TensorboardLogger(log_dir=log_path)

    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=metric_names),
                     event_name=Events.ITERATION_COMPLETED)

    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer, "lr"),
                     event_name=Events.ITERATION_STARTED)

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)
    ProgressBar(persist=False, bar_format="").attach(trainer,
                                                     metric_names=metric_names)

    # Validation engine. The evaluator already returns (output, y), so the
    # metric output transforms are identities.
    def loss_output_transform(output):
        return output

    def acc_output_transform(output):
        return output

    customed_loss = Loss(loss_fn=F.nll_loss,
                         output_transform=loss_output_transform,
                         device=device)
    customed_accuracy = Accuracy(output_transform=acc_output_transform,
                                 device=device)

    metrics = {'Loss': customed_loss, 'Accuracy': customed_accuracy}

    def val_update_fn(engine, batch):
        """Forward one validation batch without tracking gradients."""
        model.eval()
        with torch.no_grad():
            x, y = _prepare_batch(batch, device=device, non_blocking=True)
            output = model(x)
            return output, y

    val_evaluator = Engine(val_update_fn)

    for name, metric in metrics.items():
        metric.attach(val_evaluator, name)

    def run_evaluation(engine):
        val_evaluator.run(val_loader)

    # Evaluate after every epoch and once more when training finishes.
    trainer.add_event_handler(Events.EPOCH_COMPLETED, run_evaluation)
    trainer.add_event_handler(Events.COMPLETED, run_evaluation)

    ProgressBar(persist=False, desc="Train evaluation").attach(val_evaluator)

    # Log validation metrics against the trainer's progress.
    tb_logger.attach(val_evaluator,
                     log_handler=OutputHandler(tag="val",
                                               metric_names=list(
                                                   metrics.keys()),
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    def default_score_fn(engine):
        """Score used for checkpointing and early stopping: val accuracy."""
        return engine.state.metrics['Accuracy']

    # Keep the three best checkpoints, ranked by validation accuracy.
    best_model_handler = ModelCheckpoint(dirname=log_path,
                                         filename_prefix="best",
                                         n_saved=3,
                                         score_name="val_acc",
                                         score_function=default_score_fn)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {
        'model': model,
    })

    # Early stopping is driven by the evaluator but terminates the trainer.
    es_handler = EarlyStopping(patience=es_patience,
                               score_function=default_score_fn,
                               trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, es_handler)

    setup_logger(es_handler._logger)
    setup_logger(logging.getLogger("ignite.engine.engine.Engine"))

    def empty_cuda_cache(engine):
        """Release cached CUDA memory and run the Python garbage collector."""
        torch.cuda.empty_cache()
        import gc
        gc.collect()

    trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache)
    val_evaluator.add_event_handler(Events.COMPLETED, empty_cuda_cache)

    try:
        trainer.run(train_loader, max_epochs=epochs)
    finally:
        # Close the TensorBoard writer even if training raises, so buffered
        # events are flushed and the file handle is released.
        tb_logger.close()
Beispiel #2
0
def run(output_path, config):
    """Train a classifier on CUDA, optionally with DistributedDataParallel.

    Uses SGD with Nesterov momentum and a piecewise-linear learning-rate
    schedule (linear warm-up to ``config['learning_rate']``, then linear decay
    to zero). Rank 0 additionally handles checkpoints, progress bars and
    TensorBoard logging.

    Args:
        output_path: Directory for checkpoints and TensorBoard logs.
        config: Indexable configuration. Keys read here: ``local_rank``,
            ``batch_size``, ``num_workers``, ``data_path``, ``model``,
            ``learning_rate``, ``momentum``, ``weight_decay``,
            ``num_warmup_epochs``, ``num_epochs`` and ``display_iters``.
            An optional ``score_function`` *attribute* overrides the default
            best-model score.

    Note:
        ``backend`` is not defined in this function; it is read from the
        enclosing scope and decides whether the run is distributed.
    """
    device = "cuda"

    local_rank = config['local_rank']
    distributed = backend is not None
    if distributed:
        torch.cuda.set_device(local_rank)
    rank = dist.get_rank() if distributed else 0

    # Split the global batch size across processes and the worker count
    # across the GPUs of this node (rounded up).
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config['batch_size'] // ngpus
    num_workers = int(
        (config['num_workers'] + ngpus_per_node - 1) / ngpus_per_node)

    train_labelled_loader, test_loader = \
        get_train_test_loaders(path=config['data_path'],
                               batch_size=batch_size,
                               distributed=distributed,
                               num_workers=num_workers)

    model = get_model(config['model'])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[
                local_rank,
            ], output_device=local_rank)

    optimizer = optim.SGD(model.parameters(),
                          lr=config['learning_rate'],
                          momentum=config['momentum'],
                          weight_decay=config['weight_decay'],
                          nesterov=True)

    criterion = nn.CrossEntropyLoss().to(device)

    # Milestones are expressed in iterations: `le` iterations per epoch.
    le = len(train_labelled_loader)
    milestones_values = [(0, 0.0),
                         (le * config['num_warmup_epochs'],
                          config['learning_rate']),
                         (le * config['num_epochs'], 0.0)]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        """Move an ``(x, y)`` batch to ``device``."""
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, labelled_batch):
        """Run one supervised training step and return the batch loss."""
        x, y = _prepare_batch(labelled_batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            'batch loss': loss.item(),
        }

    trainer = Engine(process_function)

    # A scheduler without `.step` is attached directly as an event handler;
    # one with `.step` is stepped after each iteration.
    if not hasattr(lr_scheduler, "step"):
        trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
    else:
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  lambda engine: lr_scheduler.step())

    metric_names = [
        'batch loss',
    ]

    def output_transform(x, name):
        """Pick the entry called ``name`` out of the process_function output."""
        return x[name]

    for n in metric_names:
        # We compute running average values on the output (batch loss) across all devices
        RunningAverage(output_transform=partial(output_transform, name=n),
                       epoch_bound=False,
                       device=device).attach(trainer, n)

    if rank == 0:
        checkpoint_handler = ModelCheckpoint(dirname=output_path,
                                             filename_prefix="checkpoint")
        trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000),
                                  checkpoint_handler, {
                                      'model': model,
                                      'optimizer': optimizer
                                  })

        ProgressBar(persist=True,
                    bar_format="").attach(trainer,
                                          event_name=Events.EPOCH_STARTED,
                                          closing_event_name=Events.COMPLETED)
        if config['display_iters']:
            ProgressBar(persist=False,
                        bar_format="").attach(trainer,
                                              metric_names=metric_names)

        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(trainer,
                         log_handler=tbOutputHandler(
                             tag="train", metric_names=metric_names),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=tbOptimizerParamsHandler(optimizer,
                                                              param_name="lr"),
                         event_name=Events.ITERATION_STARTED)

    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None)
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        """Evaluate on the training set, then on the test set."""
        torch.cuda.synchronize()
        train_evaluator.run(train_labelled_loader)
        evaluator.run(test_loader)

    # Validate at the start of every third epoch and once more at the end.
    trainer.add_event_handler(Events.EPOCH_STARTED(every=3), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        if config['display_iters']:
            ProgressBar(persist=False,
                        desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False,
                        desc="Test evaluation").attach(evaluator)

        tb_logger.attach(train_evaluator,
                         log_handler=tbOutputHandler(tag="train",
                                                     metric_names=list(
                                                         metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)

        tb_logger.attach(evaluator,
                         log_handler=tbOutputHandler(tag="test",
                                                     metric_names=list(
                                                         metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)

        def default_score_fn(engine):
            """Default best-model score: accuracy of the evaluated engine."""
            return engine.state.metrics['accuracy']

        # NOTE(review): this is an attribute lookup, so a plain dict config
        # with a "score_function" *key* still gets the default - confirm
        # whether key lookup was intended.
        score_function = getattr(config, "score_function", default_score_fn)

        # Keep the three best models, ranked by test-set score.
        best_model_handler = ModelCheckpoint(
            dirname=output_path,
            filename_prefix="best",
            n_saved=3,
            global_step_transform=global_step_from_engine(trainer),
            score_name="val_accuracy",
            score_function=score_function)
        evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {
            'model': model,
        })

    try:
        trainer.run(train_labelled_loader, max_epochs=config['num_epochs'])
    finally:
        # Only rank 0 owns a TensorBoard logger; close it even when training
        # raises so buffered events are flushed.
        if rank == 0:
            tb_logger.close()
Beispiel #3
0
def run(output_dir, config):
    """Train an MNIST model, optionally with agreement-masked gradients.

    Seeds torch and numpy, builds the data loaders, model, optimizer and
    LR schedule, then trains with ignite while evaluating on the train, test
    and (optionally) mislabeled-train loaders and logging to TensorBoard.

    Args:
        output_dir: Directory for training-handler output and TensorBoard
            logs.
        config: Indexable configuration. Keys read here: ``seed``,
            ``batch_size``, ``num_workers``, ``data_path``,
            ``random_labels_fraction``, ``init_lr``, ``weight_decay_order``,
            ``weight_decay``, ``epochs``, ``agreement_threshold`` and
            ``scale_grad_inverse_sparsity``.

    Note:
        ``args`` is not defined in this function; ``args.use_cuda`` and
        ``args.method`` are read from the enclosing scope.
    """
    device = torch.device("cuda" if args.use_cuda else "cpu")

    torch.manual_seed(config['seed'])
    np.random.seed(config['seed'])

    # Rescale batch_size and num_workers (single GPU, so this is a no-op
    # apart from the int conversion).
    ngpus_per_node = 1
    batch_size = config['batch_size']
    num_workers = int(
        (config['num_workers'] + ngpus_per_node - 1) / ngpus_per_node)

    (train_loader, test_loader,
     mislabeled_train_loader) = get_train_test_loaders(
         path=config['data_path'],
         batch_size=batch_size,
         num_workers=num_workers,
         random_seed=config['seed'],
         random_labels_fraction=config['random_labels_fraction'],
     )

    model = get_mnist_model(args, device)

    optimizer = AdamFlexibleWeightDecay(
        model.parameters(),
        lr=config['init_lr'],
        weight_decay_order=config['weight_decay_order'],
        weight_decay=config['weight_decay'])

    criterion = nn.CrossEntropyLoss().to(device)

    # The milestone sits at 3/4 of the total iteration count; presumably the
    # scheduler is stepped per iteration by the common handlers - verify.
    le = len(train_loader)
    lr_scheduler = MultiStepLR(optimizer,
                               milestones=[le * config['epochs'] * 3 // 4],
                               gamma=0.1)

    def _prepare_batch(batch, device, non_blocking):
        """Move an ``(x, y)`` batch to ``device``."""
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(unused_engine, batch):
        """Run one training step; returns an empty output dict."""
        x, y = _prepare_batch(batch, device=device, non_blocking=True)
        model.train()
        optimizer.zero_grad()
        y_pred = model(x)

        if config['agreement_threshold'] > 0.0:
            # The "batch_size" in this function refers to the batch size per env
            # Since we treat every example as one env, we should set the parameter
            # n_agreement_envs equal to batch size
            # NOTE(review): no backward() is called on this branch, so
            # get_grads presumably fills the parameter gradients itself -
            # confirm in and_mask_utils.
            mean_loss, masks = and_mask_utils.get_grads(
                agreement_threshold=config['agreement_threshold'],
                batch_size=1,
                loss_fn=criterion,
                n_agreement_envs=config['batch_size'],
                params=optimizer.param_groups[0]['params'],
                output=y_pred,
                target=y,
                method=args.method,
                scale_grad_inverse_sparsity=config[
                    'scale_grad_inverse_sparsity'],
            )
        else:
            mean_loss = criterion(y_pred, y)
            mean_loss.backward()

        optimizer.step()

        return {}

    trainer = Engine(process_function)
    metric_names = []
    common.setup_common_training_handlers(trainer,
                                          output_path=output_dir,
                                          lr_scheduler=lr_scheduler,
                                          output_names=metric_names,
                                          with_pbar_on_iters=True,
                                          log_every_iters=10)

    tb_logger = TensorboardLogger(log_dir=output_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="train",
                                               metric_names=metric_names),
                     event_name=Events.ITERATION_COMPLETED)

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    test_evaluator = create_supervised_evaluator(model,
                                                 metrics=metrics,
                                                 device=device,
                                                 non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)
    mislabeled_train_evaluator = create_supervised_evaluator(model,
                                                             metrics=metrics,
                                                             device=device,
                                                             non_blocking=True)

    def run_validation(engine):
        """Evaluate on train, optionally mislabeled-train, then test data."""
        if args.use_cuda:
            torch.cuda.synchronize()
        train_evaluator.run(train_loader)
        if config['random_labels_fraction'] > 0.0:
            mislabeled_train_evaluator.run(mislabeled_train_loader)
        test_evaluator.run(test_loader)

    def flush_metrics(engine):
        """Flush the TensorBoard writer so events reach disk each epoch."""
        tb_logger.writer.flush()

    # Validation runs at the start of every epoch (before that epoch's
    # updates) and once more when training completes.
    trainer.add_event_handler(Events.EPOCH_STARTED(every=1), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, flush_metrics)

    ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
    ProgressBar(persist=False, desc="Test evaluation").attach(test_evaluator)
    ProgressBar(persist=False,
                desc="Train (mislabeled portion) evaluation").attach(
                    mislabeled_train_evaluator)

    # Log each evaluator's metrics against the trainer's global step.
    tb_logger.attach(
        train_evaluator,
        log_handler=OutputHandler(
            tag="train",
            metric_names=list(metrics.keys()),
            global_step_transform=global_step_from_engine(trainer)),
        event_name=Events.COMPLETED)
    tb_logger.attach(
        test_evaluator,
        log_handler=OutputHandler(
            tag="test",
            metric_names=list(metrics.keys()),
            global_step_transform=global_step_from_engine(trainer)),
        event_name=Events.COMPLETED)
    tb_logger.attach(
        mislabeled_train_evaluator,
        log_handler=OutputHandler(
            tag="train_wrong",
            metric_names=list(metrics.keys()),
            global_step_transform=global_step_from_engine(trainer)),
        event_name=Events.COMPLETED)

    # The engine seed comes from a fresh RandomState, i.e. it is not derived
    # from config['seed'].
    trainer_rng = np.random.RandomState()
    try:
        trainer.run(train_loader,
                    max_epochs=config['epochs'],
                    seed=trainer_rng.randint(2**32))
    finally:
        # Close the TensorBoard writer even if training raises.
        tb_logger.close()
Beispiel #4
0
def run(epochs, lr, momentum, log_interval, params, trainloader, testloader,
        model):
    """Train ``Net`` with SGD, log to TensorBoard and save the final weights.

    Args:
        epochs: Maximum number of training epochs.
        lr: SGD learning rate.
        momentum: SGD momentum.
        log_interval: The tqdm progress bar is updated every ``log_interval``
            training iterations.
        params: Passed to ``Net``; ``params.get("loss_report")`` sets how
            often (in iterations) the training loss is written to
            TensorBoard.
        trainloader: Training data loader.
        testloader: Validation data loader.
        model: Object whose ``update_weights(weights_filename=...)`` is called
            with the saved weights file once training is done.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = Net(params).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(net,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
        "recall": Recall()
    }
    evaluator = create_supervised_evaluator(net,
                                            metrics=val_metrics,
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    # Plot the trainer's loss every params["loss_report"] iterations.
    tb_logger = TensorboardLogger(log_dir="cifar-output")
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=params.get("loss_report")),
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Dump each engine's metrics at every completed epoch. The loop variable
    # is deliberately not called `evaluator`, so the outer evaluator binding
    # used by the handlers below is never rebound.
    for tag, engine in [("training", trainer), ("validation", evaluator)]:
        tb_logger.attach_output_handler(
            engine,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=global_step_from_engine(trainer),
        )

    # Build debug images once, on the evaluator's first completed epoch.
    tb_logger.attach(
        evaluator,
        log_handler=predictions_gt_images_handler,
        event_name=Events.EPOCH_COMPLETED(once=1),
    )

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(trainloader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        """Show the latest training loss on the progress bar."""
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        """Evaluate on the training set and print accuracy and loss."""
        pbar.refresh()
        evaluator.run(trainloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        """Evaluate on the test set, print results and reset the bar."""
        evaluator.run(testloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        # Rewind the progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time():
        """Print how long the event that just fired took."""
        tqdm.write("{} took {} seconds".format(
            trainer.last_event_name.name,
            trainer.state.times[trainer.last_event_name.name],
        ))

    try:
        trainer.run(trainloader, max_epochs=epochs)
    finally:
        # Release the progress bar and flush/close the TensorBoard writer
        # whether or not training succeeded.
        pbar.close()
        tb_logger.close()

    PATH = "./cifar_net.pth"

    # CONDITION depicts a custom condition for when to save the model. The model is saved and then updated in ClearML
    CONDITION = True

    if CONDITION:
        torch.save(net.state_dict(), PATH)
        model.update_weights(weights_filename=PATH)
    print("Finished Training")
    print("Task ID number is: {}".format(Task.current_task().id))
Beispiel #5
0
def run(warmup_iterations=5000, batch_size=4, test_size=2000, epochs=10, log_interval=100, debug_images_interval=50,
        train_dataset_ann_file='~/bigdata/coco/annotations/instances_train2017.json',
        val_dataset_ann_file='~/bigdata/coco/annotations/instances_val2017.json', input_checkpoint='',
        load_optimizer=False, load_params=False, output_dir="/tmp/checkpoints", log_dir="/tmp/tensorboard_logs",
        lr=0.005, momentum=0.9,
        weight_decay=0.0005, use_mask=True, use_toy_testing_data=False, backbone_name='resnet101', num_workers=6,
        trainable_layers=3, train_set_size=None, early_stopping=False, patience=3, step_size=3, gamma=0.1,
        record_histograms=True):
    # Set the training device to GPU if available - if not set it to CPU
    device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')
    torch.backends.cudnn.benchmark = True if torch.cuda.is_available() else False  # optimization for fixed input size

    # Write hyperparams
    hparam_dict = {
        'warmup_iterations': warmup_iterations,
        'training_batch_size': batch_size,
        'test_size': test_size,
        'epochs': epochs,
        'trainable_layers': trainable_layers,
        'lr': lr,
        'momentum': momentum,
        'weight_decay': weight_decay,
        'train_set_size': train_set_size,
        'step_size': step_size,
        'gamma': gamma,
        'early_stopping': early_stopping,
        'patience': patience,
        'total_iterations': 0,
        'total_epochs': 0,
        'timeout': True,
    }

    # Load checkpoint if available
    if input_checkpoint:
        hparam_path = Path(input_checkpoint).parent / 'hparams.pickle'

        logger.info('Loading model checkpoint from '.format(input_checkpoint))
        input_checkpoint = torch.load(input_checkpoint, map_location=torch.device(device))  # FIXME Bad overload

        with open(hparam_path, 'rb') as f:
            hparam_dict = pickle.load(f)

        # Load the training parameters from the saved hparam dictionary
        if load_params:
            warmup_iterations, batch_size, test_size, epochs, trainable_layers, lr, momentum,\
            weight_decay, train_set_size, step_size, gamma, early_stopping, patience = itemgetter(
                'warmup_iterations', 'training_batch_size', 'test_size', 'epochs', 'trainable_layers',
                'lr', 'momentum', 'weight_decay', 'train_set_size', 'step_size', 'gamma', 'early_stopping',
                'patience')(hparam_dict)
            try:
                train_set_size -= 1
            except TypeError as e:
                pass

    print('Hparams: ', hparam_dict)

    # Define train and test datasets
    train_loader, val_loader, labels_enum = get_data_loaders(train_dataset_ann_file,
                                                             val_dataset_ann_file,
                                                             batch_size,
                                                             test_size,
                                                             configuration_data.get('image_size'),
                                                             use_mask=use_mask,
                                                             _use_toy_testing_set=use_toy_testing_data,
                                                             num_workers=num_workers,
                                                             train_set_size=train_set_size)

    # Hparams
    hparam_dict['training_set_size'] = len(train_loader) * batch_size
    hparam_dict['validation_set_size'] = len(val_loader) * batch_size

    with open(os.path.join(output_dir, 'hparams.pickle'), 'wb') as f:
        pickle.dump(hparam_dict, f)

    val_dataset = list(chain.from_iterable(
        zip(*copy.deepcopy(batch)) for batch in iter(val_loader)))  # TODO Figure out what this does and use deepcopy.
    coco_api_val_dataset = convert_to_coco_api(val_dataset)
    num_classes = max(labels_enum.keys()) + 1  # number of classes plus one for background class
    configuration_data['num_classes'] = num_classes

    logger.info('Training with {} classes...'.format(num_classes))

    if use_mask:
        logger.debug('Loading MaskRCNN Model...')
        model = get_model_instance_segmentation(num_classes, configuration_data.get('mask_predictor_hidden_layer'))
    else:
        logger.debug('Loading FasterRCNN Model...')
        model = get_model_instance_detection(num_classes, backbone_name=backbone_name,
                                             trainable_layers=trainable_layers)
    iou_types = get_iou_types(model)

    # if there is more than one GPU, parallelize the model
    if torch.cuda.device_count() > 1:
        logger.debug("{} GPUs were detected - we will use all of them".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # copy the model to each device
    model.to(device)

    if input_checkpoint:
        model.load_state_dict(input_checkpoint['model'])

    logger.debug('Initializing SummaryWriter...')
    if use_mask:
        comment = 'mask'
    else:
        comment = 'box-{}'.format(backbone_name)

    logger.debug('Creating Trainer...')
    # define Ignite's train and evaluation engine
    trainer = create_trainer(model, device)
    logger.debug('Creating Evaluator...')
    evaluator = create_evaluator(model, device)

    logger.debug('Initializing Tensorboard Logger...')
    tb_logger = TensorboardLogger(log_dir=log_dir, comment=comment)
    if record_histograms:
        tb_logger.attach(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=500),
            log_handler=WeightsHistHandler(model)
        )
    writer = tb_logger.writer

    logger.debug('Setting up profiler...')
    profiler = BasicTimeProfiler()
    profiler.attach(trainer)

    coco_ap = CocoAP(coco_api_val_dataset, iou_types)
    coco_ap_05 = CocoAP5(coco_api_val_dataset, iou_types)
    coco_ap_075 = CocoAP75(coco_api_val_dataset, iou_types)
    coco_ap.attach(evaluator, "AP")
    coco_ap_05.attach(evaluator, "AP0.5")
    coco_ap_075.attach(evaluator, "AP0.75")

    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag='evaluation',
            metric_names=['AP', 'AP0.5', 'AP0.75'],
            global_step_transform=global_step_from_engine(trainer)
        ),
        event_name=Events.EPOCH_COMPLETED
    )

    ## Early stopping
    def score_function(engine):
        ap_score = engine.state.metrics['AP']
        return ap_score

    if early_stopping:
        handler = EarlyStopping(patience=patience, score_function=score_function, trainer=trainer)
        # Note: the handler is attached to an *Evaluator* (runs one epoch on validation dataset).
        evaluator.add_event_handler(Events.COMPLETED, handler)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_intermediate_results():
        logger.debug('Epoch Complete...')
        profiler.print_results(profiler.get_results())

    @trainer.on(Events.STARTED)
    def on_training_started(engine):
        # construct an optimizer
        logger.info('Started Training...')
        params = [p for p in model.parameters() if p.requires_grad]
        engine.state.optimizer = torch.optim.SGD(params,
                                                 lr=lr,
                                                 momentum=momentum,
                                                 weight_decay=weight_decay)

        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(engine.state.optimizer),
            event_name=Events.ITERATION_STARTED
        )

        engine.state.scheduler = torch.optim.lr_scheduler.StepLR(engine.state.optimizer, step_size=step_size,
                                                                 gamma=gamma)
        if input_checkpoint:
            # Load traininer states
            trainer.state.epoch = input_checkpoint['epoch']
            if 'iteration' in input_checkpoint:
                trainer.state.iteration = input_checkpoint['iteration']
            else:
                trainer.state.iteration = int(hparam_dict['training_set_size'] / batch_size * input_checkpoint['epoch'])

            if load_optimizer:
                print('loading optimizer')
                logger.info('Loading optimizer and scheduler...')
                engine.state.optimizer.load_state_dict(input_checkpoint['optimizer'])
                engine.state.scheduler.load_state_dict(input_checkpoint['lr_scheduler'])
                engine.state.scheduler.last_epoch = trainer.state.epoch
            else:
                print('not loading optimizer')


    @trainer.on(Events.EPOCH_STARTED)
    def on_epoch_started(engine):
        logger.debug('Started Epoch...')
        model.train()
        engine.state.warmup_scheduler = None

        #TODO Print optimizer values

        if engine.state.epoch == 1:
            warmup_iters = min(warmup_iterations, len(train_loader) - 1)
            print('Warm up period was set to {} iterations'.format(warmup_iters))
            warmup_factor = 1. / warmup_iters
            engine.state.warmup_scheduler = utils.warmup_lr_scheduler(engine.state.optimizer, warmup_iters,
                                                                      warmup_factor)

    @trainer.on(Events.ITERATION_COMPLETED)
    def on_iteration_completed(engine):
        """Log losses and debug images after a training iteration.

        Every ``log_interval`` iterations the per-component losses and their
        sum are printed / written to ``writer``; every
        ``debug_images_interval`` iterations the debug renderings (and masks,
        when a target carries a ``'masks'`` entry) are written as images.

        Args:
            engine: The trainer engine; its ``state.output`` is unpacked as
                ``(images, targets, loss_dict_reduced)``.
        """
        images, targets, loss_dict_reduced = engine.state.output
        step = engine.state.iteration

        if step % log_interval == 0:
            total_loss = sum(loss_dict_reduced.values()).item()
            print("Epoch: {}, Iteration: {}, Loss: {}".format(engine.state.epoch, step, total_loss))
            for loss_name, loss_value in loss_dict_reduced.items():
                writer.add_scalar("loss/{}".format(loss_name), loss_value.item(), step)
            writer.add_scalar("loss/total_loss", total_loss, step)
            # writer.add_scalar("learning_rate/lr", engine.state.optimizer.param_groups[0]['lr'], engine.state.iteration)

        if step % debug_images_interval == 0:
            for idx, debug_image in enumerate(draw_debug_images(images, targets)):
                writer.add_image("training/image_{}".format(idx), debug_image, step, dataformats='HWC')
                if 'masks' not in targets[idx]:
                    continue
                writer.add_image("training/image_{}_mask".format(idx),
                                 draw_mask(targets[idx]), step, dataformats='HW')

        # Drop every reference to the batch so it is not kept alive on the engine state.
        images = targets = loss_dict_reduced = engine.state.output = None

    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch_completed(engine):
        """Finish an epoch: persist hparams, step the LR scheduler, validate, checkpoint.

        Args:
            engine: The trainer engine whose state holds the optimizer and
                scheduler being checkpointed.
        """
        logger.debug('Finished Epoch...')
        update_hparams(engine)

        engine.state.scheduler.step()
        evaluator.run(val_loader)
        # for res_type in evaluator.state.coco_evaluator.iou_types:
        #     average_precision_05 = evaluator.state.coco_evaluator.coco_eval[res_type].stats[1]
        #     writer.add_scalar("validation-{}/average precision 0_5".format(res_type), average_precision_05,
        #                       engine.state.iteration)

        finished_epoch = engine.state.epoch
        checkpoint_name = 'model_epoch_{}.pth'.format(finished_epoch)
        checkpoint_path = os.path.join(output_dir, checkpoint_name)
        print('Saving model checkpoint')
        # The dict is passed as a temporary so nothing keeps the state dicts
        # alive once the save returns.
        utils.save_on_master(
            {
                'model': model.state_dict(),
                'optimizer': engine.state.optimizer.state_dict(),
                'lr_scheduler': engine.state.scheduler.state_dict(),
                'epoch': engine.state.epoch,
                'iteration': engine.state.iteration,
                'configuration': configuration_data,
                'labels_enumeration': labels_enum,
            },
            checkpoint_path,
        )
        print('Model checkpoint from epoch {} was saved at {}'.format(finished_epoch, checkpoint_path))

        # Replace the evaluator state with a fresh one before the next validation run.
        evaluator.state = State()

    @trainer.on(Events.COMPLETED)
    def on_training_completed(engine):
        """Persist the final hparams and write them, with the AP metrics, to ``writer``.

        Args:
            engine: The trainer engine that has just completed its run.
        """
        logger.debug('Finished Training...')
        update_hparams(engine, finished=True)

        final_metrics = {
            'hparams/AP': coco_ap.ap,
            'hparams/AP.5': coco_ap_05.ap5,
            'hparams/AP.75': coco_ap_075.ap75,
        }
        writer.add_hparams(hparam_dict=hparam_dict, metric_dict=final_metrics)
        logger.debug('Wrote hparams...')

    def update_hparams(engine, finished=False):
        """Refresh the progress fields of ``hparam_dict`` and pickle it to disk.

        The dict is written to ``<output_dir>/hparams.pickle``. When a previous
        file exists it is first copied to ``hparams.pickle.backup`` so that it
        can be restored if pickling the new contents fails.

        Args:
            engine: Engine whose iteration / epoch counters are recorded.
            finished: ``True`` when training ran to completion; stored inverted
                under the ``'timeout'`` key.
        """
        hparam_dict['total_iterations'] = global_step_from_engine(engine)(engine, Events.ITERATION_COMPLETED)
        hparam_dict['total_epochs'] = global_step_from_engine(engine)(engine, Events.EPOCH_COMPLETED)
        hparam_dict['timeout'] = not finished

        if hparam_dict['train_set_size'] is None:
            hparam_dict['train_set_size'] = hparam_dict['training_set_size']

        hparams_path = os.path.join(output_dir, 'hparams.pickle')
        backup_path = os.path.join(output_dir, 'hparams.pickle.backup')

        # Only back up when there is something to back up; on the very first
        # call the file may not exist yet and copyfile would raise.
        has_backup = os.path.exists(hparams_path)
        if has_backup:
            shutil.copyfile(hparams_path, backup_path)

        try:
            with open(hparams_path, 'wb') as f:
                pickle.dump(hparam_dict, f)
        except (AttributeError, TypeError, pickle.PicklingError) as e:
            # Unpicklable values surface as any of these three exception types;
            # in every case the file on disk is now truncated / corrupt.
            print('Could not pickle one of the total vars.', e)
            if has_backup:
                os.replace(backup_path, hparams_path)
            elif os.path.exists(hparams_path):
                # No previous version to restore: do not leave a partial pickle behind.
                os.remove(hparams_path)

    @evaluator.on(Events.STARTED)
    def on_evaluation_started(engine):
        """Put the model into evaluation mode when an evaluator run begins.

        Args:
            engine: The evaluator engine that fired the event (unused).
        """
        logger.debug('Started Evaluation...')
        model.eval()
        # Disabled: per-run COCO evaluator construction.
        # engine.state.coco_evaluator = CocoEvaluator(coco_api_val_dataset, iou_types)

    @evaluator.on(Events.ITERATION_COMPLETED)
    def on_eval_iteration_completed(engine):
        """Report progress and write debug images after an evaluation iteration.

        Images are tagged with the evaluator's own iteration but plotted against
        the trainer's iteration, so successive validation runs line up on the
        training time axis.

        Args:
            engine: The evaluator engine; its ``state.output`` is unpacked as
                ``(images, targets, results)``.
        """
        images, targets, results = engine.state.output
        eval_step = engine.state.iteration

        if eval_step % log_interval == 0:
            print("Evaluation: Iteration: {}".format(eval_step))

        if eval_step % debug_images_interval == 0:
            for idx, debug_image in enumerate(draw_debug_images(images, targets, results)):
                global_step = trainer.state.iteration
                print('Drawing debug image "validation/image_{}_{}"'.format(eval_step, idx))
                # NOTE(review): this tag uses the "evaluation/" prefix while the
                # message above and the mask tags below use "validation/" — confirm
                # which one is intended before unifying.
                writer.add_image("evaluation/image_{}_{}".format(eval_step, idx),
                                 debug_image, global_step, dataformats='HWC')

                target = targets[idx]
                if 'masks' not in target:
                    continue

                writer.add_image("validation/image_{}_{}_mask".format(eval_step, idx),
                                 draw_mask(target), global_step, dataformats='HW')
                # Predictions are looked up by the target's image id.
                curr_image_id = int(target['image_id'])
                writer.add_image("validation/image_{}_{}_predicted_mask".format(eval_step, idx),
                                 draw_mask(results[curr_image_id]).squeeze(), global_step,
                                 dataformats='HW')

        # Drop every reference to the batch so it is not kept alive on the engine state.
        images = targets = results = engine.state.output = None

    @evaluator.on(Events.COMPLETED)
    def on_evaluation_completed(engine):
        """Log that an evaluator run has finished.

        All metric aggregation below is currently disabled, so this handler
        only emits a debug message.

        Args:
            engine: The evaluator engine that fired the event (unused).
        """
        logger.debug('Finished Evaluation...')
        # Disabled COCO aggregation, kept for reference:
        # gather the stats from all processes
        # engine.state.coco_evaluator.synchronize_between_processes()
        #
        # # accumulate predictions from all images
        # engine.state.coco_evaluator.accumulate()
        # engine.state.coco_evaluator.summarize()
        #
        # pr_50, pr_75 = get_pr_levels(engine.state.coco_evaluator.coco_eval['bbox'])
        # TODO Bring this back
        # writer.add_hparams(hparam_dict, {
        #     'hparams/AP.5': np.mean(pr_50),
        #     'hparams/AP.75': np.mean(pr_75)
        # })

    # Kick off training; every handler registered above fires from inside this call.
    logger.debug('Running Trainer...')
    trainer.run(train_loader, max_epochs=epochs)
    # NOTE(review): the writer is not closed if trainer.run raises — consider try/finally.
    writer.close()

    # Dump the collected timing results next to the other run artefacts.
    profiler.write_results('{}/time_profiling.csv'.format(output_dir))
    # Attach the shared training handlers (checkpointing of `to_save`, LR
    # scheduling, GPU stats, progress bars, running averages of `metric_names`).
    common.setup_common_training_handlers(trainer,
                                          train_sampler=train_sampler,
                                          to_save=to_save,
                                          save_every_iters=hp['save_every_iters'],
                                          output_path=str(output_path),
                                          lr_scheduler=scheduler,
                                          with_gpu_stats=True,
                                          output_names=metric_names,
                                          with_pbars=True,
                                          with_pbar_on_iters=True,
                                          log_every_iters=hp['log_progress_every_iters'],
                                          device=backend_conf.device)

    # Only the rank-0 process creates a TensorBoard logger.
    if backend_conf.rank == 0:
        tb_logger = TensorboardLogger(log_dir=str(output_path))
        tb_logger.attach(trainer, log_handler=OutputHandler(tag='train', metric_names=metric_names), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer, param_name='lr'), event_name=Events.ITERATION_STARTED)
        # TODO: make sure hp params logging works here + use test eval metrics instead of training's
        # NOTE(review): "HyperparamsOutoutHandler" looks like a misspelling of
        # "Output"; it is left as-is because the name is defined elsewhere.
        tb_logger.attach(trainer, log_handler=HyperparamsOutoutHandler(hp, metric_names=metric_names), event_name=Events.COMPLETED)

    # Build one dict of `<prefix>_<name>` entries from `metrics` and `losses`;
    # on a name clash the entry from `losses` wins because it is merged last.
    def _metrics(prefix): return {**{f'{prefix}_{n}': m for n, m in metrics.items()},
                                  **{f'{prefix}_{n}': loss for n, loss in losses.items()}}

    # Separate evaluators so validation and training-set metrics get distinct prefixes.
    valid_evaluator = create_supervised_evaluator(model, metrics=_metrics('valid'), device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=_metrics('train'), device=device, non_blocking=True)

    @trainer.on(Events.EPOCH_STARTED(every=hp['validate_every_epochs']))
    @trainer.on(Events.COMPLETED)
    def _run_validation(engine: Engine):
        """Handler fired every ``hp['validate_every_epochs']`` epochs and at completion.

        NOTE(review): as shown, the body only synchronizes CUDA and never runs
        an evaluator — the remainder of this handler appears to be missing.
        """
        # Wait for outstanding GPU work before validating (skipped on CPU backends).
        if torch.cuda.is_available() and not backend_conf.is_cpu:
            torch.cuda.synchronize()
Beispiel #7
0
def run(output_path, config):
    """Train a classifier with ignite, logging to TensorBoard and MLflow.

    Builds the data loaders, the model, an SGD-style optimizer (``LaycaSGD``
    when ``config['with_layca']`` is truthy) and a piecewise-linear learning
    rate schedule, then runs the training loop with periodic evaluation on
    both the training and the test loader.

    Args:
        output_path: Directory handed to ``TensorboardLogger`` as ``log_dir``.
        config: Mapping read for the keys ``batch_size``, ``dataset``,
            ``data_path``, ``num_workers``, ``model``, ``with_layca``,
            ``momentum``, ``weight_decay``, ``lr_milestones_values`` and
            ``num_epochs``.
    """

    device = "cuda"
    batch_size = config['batch_size']

    train_loader, test_loader = get_train_test_loaders(
        dataset_name=config['dataset'],
        path=config['data_path'],
        batch_size=batch_size,
        num_workers=config['num_workers'])

    model = get_model(config['model'])
    model = model.to(device)

    optim_fn = optim.SGD
    if config['with_layca']:
        optim_fn = LaycaSGD

    # lr starts at 0.0 because the scheduler below sets it before every iteration.
    optimizer = optim_fn(model.parameters(),
                         lr=0.0,
                         momentum=config['momentum'],
                         weight_decay=config['weight_decay'],
                         nesterov=True)
    criterion = nn.CrossEntropyLoss()

    # Each configured milestone is scaled by the number of batches per epoch,
    # turning it into an iteration index for the scheduler.
    le = len(train_loader)
    milestones_values = [(le * m, v)
                         for m, v in config['lr_milestones_values']]
    scheduler = PiecewiseLinear(optimizer,
                                "lr",
                                milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        """Move an ``(x, y)`` batch onto ``device``."""
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, batch):
        """Run one optimization step and return the scalar batch loss."""

        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss.item()

    trainer = Engine(process_function)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    RunningAverage(output_transform=lambda x: x,
                   epoch_bound=False).attach(trainer, 'batchloss')

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    tb_logger = TensorboardLogger(log_dir=output_path)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all'),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=Events.ITERATION_STARTED)

    tb_logger.attach(trainer,
                     log_handler=LayerRotationStatsHandler(model),
                     event_name=Events.EPOCH_STARTED)

    metrics = {
        "accuracy": Accuracy(),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine, val_interval):
        """Evaluate on both loaders when ``(epoch - 1)`` is a multiple of ``val_interval``."""
        if (engine.state.epoch - 1) % val_interval == 0:
            train_evaluator.run(train_loader)
            evaluator.run(test_loader)

    # Validate after every second epoch, and unconditionally once training ends.
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              run_validation,
                              val_interval=2)
    trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    def mlflow_batch_metrics_logging(engine, tag):
        """Log every metric on ``engine`` to MLflow, keyed by trainer iteration."""
        step = trainer.state.iteration
        for name, value in engine.state.metrics.items():
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    def mlflow_val_metrics_logging(engine, tag):
        """Log the evaluation metrics on ``engine`` to MLflow, keyed by trainer epoch."""
        step = trainer.state.epoch
        for name in metrics.keys():
            value = engine.state.metrics[name]
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train")
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train")
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test")

    try:
        trainer.run(train_loader, max_epochs=config['num_epochs'])
    finally:
        # Close the TensorBoard writer even when training raises, so buffered
        # events are flushed and the file handle is not leaked.
        tb_logger.close()
def train(epochs=500,
          batch_size=32,
          bptt_len=70,
          lr=0.00025,
          log_steps=200,
          clip_grad=0.25,
          log_dir="experiments"):
    """Train a Transformer language model on WikiText-103 with ignite.

    Args:
        epochs: Maximum number of training epochs.
        batch_size: Batch size passed to the dataset helper.
        bptt_len: Sequence length passed to the dataset helper and used as the
            embedding's ``max_length``.
        lr: Learning rate used to construct the Adam optimizer.
            NOTE(review): the schedulers below hard-code ``2.5e-4`` rather than
            reading this value, so a non-default ``lr`` appears to be
            overwritten once the scheduler runs — confirm whether intended.
        log_steps: Interval, in iterations, for the scalar TensorBoard handlers.
        clip_grad: Maximum gradient norm used for clipping.
        log_dir: Directory handed to ``TensorboardLogger``.
    """
    ###################################################################
    # Dataset
    ###################################################################
    wt = wikitext103(batch_size=batch_size, bptt_len=bptt_len)
    # wt = wikitext2(batch_size=batch_size, bptt_len=bptt_len)

    ###################################################################
    # Configs
    ###################################################################
    embedding_config = DropEmbedding.Hyperparams(len(wt.text_field.vocab) + 3,
                                                 ninp=512)
    encoder_config = TransformerEncoder.Hyperparams(
        att_num_units=[512, 512, 512, 512, 512, 512], max_ext=384)

    ###################################################################
    # Models
    ###################################################################
    base_embedding = DropEmbedding(embedding_config)
    embedding = TransformerEmbedding(embedding=base_embedding,
                                     max_length=bptt_len,
                                     embedding_size=embedding_config.ninp,
                                     use_positional_embedding=False)
    encoder = TransformerEncoder(encoder_config)
    model = TransformerLanguageModel(embedding, encoder)
    model.init_weight()

    ###################################################################
    # Loss
    ###################################################################
    criterion = lm_criterion(in_features=encoder_config.att_num_units[-1],
                             vocab_size=len(wt.text_field.vocab))

    ###################################################################
    # Parameters + Train ops
    ###################################################################
    parameters = (list(model.parameters()) + list(criterion.parameters()))
    # numel() also handles 0-dim parameters, where reducing over an empty
    # size tuple would raise TypeError.
    tot_params = sum(p.numel() for p in parameters)
    print("Total Parameters: ", tot_params)
    opt = optim.Adam(parameters, lr=lr)
    model.to(DEVICE)
    criterion.to(DEVICE)

    ###################################################################
    # Train + Evaluation
    ###################################################################
    def train_step(engine, batch):
        """Run one optimization step, carrying the model's past state on the engine."""
        model.train()
        opt.zero_grad()

        text = batch.text.to(DEVICE).t().contiguous()
        target = batch.target.to(DEVICE).t().contiguous()

        out, out_past = model(text, engine.state.train_past)
        engine.state.train_past = out_past
        raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
        # The criterion returns an indexable result; element 1 is used as the loss.
        loss = raw_loss[1]

        loss.backward()
        nn.utils.clip_grad_norm_(parameters, clip_grad)
        opt.step()

        return {"train_loss": loss.item(), "train_ppl": loss.exp().item()}

    def eval_step(engine, batch):
        """Compute the validation loss for one batch without tracking gradients."""
        model.eval()

        if not hasattr(engine.state, "eval_past"):
            engine.state.eval_past = None

        with torch.no_grad():
            text = batch.text.to(DEVICE).t().contiguous()
            target = batch.target.to(DEVICE).t().contiguous()

            out, out_past = model(text, engine.state.eval_past)
            engine.state.eval_past = out_past
            raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1))
            loss = raw_loss[1]

            return {"val_loss": loss.item()}

    train_engine = Engine(train_step)
    eval_engine = Engine(eval_step)

    def reset_state(engine):
        """Clear the carried-over training state at the start of each epoch."""
        engine.state.train_past = None

    def run_eval(_):
        """Run the evaluation engine on the validation iterator and print loss / perplexity."""
        print("start running eval")
        eval_engine.run(wt.valid_iter)
        metrics = eval_engine.state.metrics
        print("Validation loss: ", metrics["val_loss"], ", ppl: ",
              np.exp(metrics["val_loss"]))

    train_engine.add_event_handler(Events.EPOCH_STARTED, reset_state)
    train_engine.add_event_handler(Events.EPOCH_COMPLETED, run_eval)

    ###################################################################
    # LR Scheduler
    ###################################################################
    cosine_scheduler = CosineAnnealingScheduler(opt.param_groups[0],
                                                "lr",
                                                0.0,
                                                2.5e-4,
                                                cycle_size=len(wt.train_iter))
    warmup_scheduler = create_lr_scheduler_with_warmup(cosine_scheduler, 0.0,
                                                       2.5e-4, 200)
    train_engine.add_event_handler(Events.ITERATION_STARTED, warmup_scheduler)

    ###################################################################
    # Metrics
    ###################################################################
    RunningAverage(output_transform=lambda x: x["train_ppl"]).attach(
        train_engine, "train_ppl")
    RunningAverage(output_transform=lambda x: x["train_loss"]).attach(
        train_engine, "train_loss")
    RunningAverage(output_transform=lambda x: x["val_loss"]).attach(
        eval_engine, "val_loss")
    progress_bar = ProgressBar(persist=True)
    progress_bar.attach(train_engine, ["train_ppl", "train_loss"])
    progress_bar_val = ProgressBar(persist=True)
    progress_bar_val.attach(eval_engine, ["val_loss"])

    ###################################################################
    # Tensorboard
    ###################################################################
    tb_logger = TensorboardLogger(log_dir=log_dir)

    def stepn_logger(num_steps, handler):
        """Wrap ``handler`` so it only fires every ``num_steps`` iterations."""
        def logger_runner(engine, log_handler, event_name):
            if engine.state.iteration % num_steps == 0:
                handler(engine, log_handler, event_name)

        return logger_runner

    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(
                         log_steps,
                         OutputHandler(tag="training",
                                       output_transform=lambda loss: loss)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(eval_engine,
                     log_handler=OutputHandler(
                         tag="validation",
                         output_transform=lambda loss: loss,
                         another_engine=train_engine),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              OptimizerParamsHandler(opt)),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              WeightsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(log_steps,
                                              GradsScalarHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, WeightsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(train_engine,
                     log_handler=stepn_logger(500, GradsHistHandler(model)),
                     event_name=Events.ITERATION_COMPLETED)

    try:
        train_engine.run(wt.train_iter, max_epochs=epochs)
    except Exception:
        # Still best-effort (the caller never sees the exception), but report
        # the failure instead of discarding it silently.
        import traceback

        print(traceback.format_exc())
    finally:
        tb_logger.close()
Beispiel #9
0
def run(output_path, config):
    """Train and validate a classifier with ignite, optionally in distributed mode.

    Args:
        output_path: Directory used for checkpoints and the TensorBoard log.
        config: Mapping read for the keys ``seed``, ``num_warmup_epochs``,
            ``learning_rate``, ``num_epochs``, ``deterministic``,
            ``checkpoint_every``, ``display_iters``, ``validate_every``,
            ``log_model_grads_every``, ``crash_iteration`` and ``resume_from``
            (it is also forwarded to the ``utils`` dataflow / model helpers).
    """

    distributed = dist.is_available() and dist.is_initialized()
    rank = dist.get_rank() if distributed else 0

    # Offset the seed by the rank so every process gets a different seed.
    manual_seed(config["seed"] + rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = utils.get_dataflow(config, distributed)
    model, optimizer = utils.get_model_optimizer(config, distributed)
    criterion = nn.CrossEntropyLoss().to(utils.device)

    # Linear warm-up from 0 to the configured learning rate, then linear decay
    # back to 0 at the last iteration; milestones are expressed in iterations.
    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    # Setup Ignite trainer:
    # - let's define training step
    # - add other common handlers:
    #    - TerminateOnNan,
    #    - handler to setup learning rate scheduling,
    #    - ModelCheckpoint
    #    - `RunningAverage` on `train_step` output
    #    - Two progress bars on epochs and optionally on iterations

    def train_step(engine, batch):
        """Run one supervised optimization step and return the batch loss."""

        x = convert_tensor(batch[0], device=utils.device, non_blocking=True)
        y = convert_tensor(batch[1], device=utils.device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    if config["deterministic"] and rank == 0:
        print("Setup deterministic trainer")
    trainer = Engine(train_step) if not config["deterministic"] else DeterministicEngine(train_step)
    train_sampler = train_loader.sampler if distributed else None
    # Objects that are checkpointed together and restored together on resume.
    to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        # Setup Tensorboard logger - wrapper on SummaryWriter
        tb_logger = TensorboardLogger(log_dir=output_path)
        # Attach logger to the trainer and log trainer's metrics (stored in trainer.state.metrics) every iteration
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        # log optimizer's parameters: "lr" every iteration
        tb_logger.attach(
            trainer, log_handler=OptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED
        )

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(device=utils.device if distributed else None),
        "loss": Loss(criterion, device=utils.device if distributed else None),
    }

    # We define two evaluators as they won't have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)

    def run_validation(engine):
        """Evaluate the model on the training loader, then on the test loader."""
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    # Validate at the start of every `validate_every`-th epoch and once more when training completes.
    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup progress bar on evaluation engines
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        # Let's log metrics of `train_evaluator` stored in `train_evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Let's log metrics of `evaluator` stored in `evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Store 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
        )

        # Optionally log model gradients
        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    # In order to check training resuming we can emulate a crash
    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    # Optionally restore every object in `to_save` from a checkpoint file before running.
    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    # Any exception raised during training (including the emulated crash above)
    # is printed rather than propagated, so the logger below is still closed.
    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
Beispiel #10
0
def run(output_path, config):
    """Train a classifier on labelled data, optionally adding a UDA consistency loss.

    Builds the data loaders, model, SGD optimizer (optionally wrapped in
    ``torchcontrib.optim.SWA``), a cosine-annealing LR schedule with optional
    warm-up, an ignite ``Engine`` driven by ``uda_process_function``, two
    evaluators, and the TensorBoard / MLflow logging handlers. Training then
    runs for ``config['num_epochs']`` epochs over the labelled loader.

    Args:
        output_path: Directory passed to ``TensorboardLogger`` as ``log_dir``.
        config: Mapping of settings. Keys read here: ``batch_size``,
            ``dataset``, ``num_labelled_samples``, ``data_path``,
            ``unlabelled_batch_size`` (optional), ``num_workers``, ``model``,
            ``learning_rate``, ``momentum``, ``weight_decay``, ``with_SWA``,
            ``consistency_criterion`` (``"MSE"`` or ``"KL"``), ``num_epochs``,
            ``min_lr_ratio``, ``num_warmup_steps``, ``consistency_lambda``,
            ``TSA_proba_min``, ``TSA_proba_max``, ``with_TSA``, ``no_UDA``.

    Raises:
        RuntimeError: If ``config['consistency_criterion']`` is neither
            ``"MSE"`` nor ``"KL"``.
    """

    # The device is hard-coded: this script always moves model and data to CUDA.
    device = "cuda"
    batch_size = config['batch_size']

    train_labelled_loader, train_unlabelled_loader, test_loader = \
        get_train_test_loaders(dataset_name=config['dataset'],
                               num_labelled_samples=config['num_labelled_samples'],
                               path=config['data_path'],
                               batch_size=batch_size,
                               unlabelled_batch_size=config.get('unlabelled_batch_size', None),
                               num_workers=config['num_workers'])

    model = get_model(config['model'])
    model = model.to(device)

    optimizer = optim.SGD(model.parameters(),
                          lr=config['learning_rate'],
                          momentum=config['momentum'],
                          weight_decay=config['weight_decay'],
                          nesterov=True)

    with_SWA = config['with_SWA']
    if with_SWA:
        optimizer = torchcontrib.optim.SWA(optimizer)

    # Supervised criterion plus the criterion comparing predictions on the
    # two unlabelled inputs.
    criterion = nn.CrossEntropyLoss().to(device)
    if config['consistency_criterion'] == "MSE":
        consistency_criterion = nn.MSELoss()
    elif config['consistency_criterion'] == "KL":
        consistency_criterion = nn.KLDivLoss(reduction='batchmean')
    else:
        raise RuntimeError("Unknown consistency criterion {}".format(
            config['consistency_criterion']))

    consistency_criterion = consistency_criterion.to(device)

    # One training step per labelled batch.
    le = len(train_labelled_loader)
    num_train_steps = le * config['num_epochs']
    mlflow.log_param("num train steps", num_train_steps)

    lr = config['learning_rate']
    eta_min = lr * config['min_lr_ratio']
    num_warmup_steps = config['num_warmup_steps']

    # Cosine annealing covers the steps that remain after warm-up.
    lr_scheduler = CosineAnnealingLR(optimizer,
                                     eta_min=eta_min,
                                     T_max=num_train_steps - num_warmup_steps)

    if num_warmup_steps > 0:
        lr_scheduler = create_lr_scheduler_with_warmup(
            lr_scheduler,
            warmup_start_value=0.0,
            warmup_end_value=lr * (1.0 + 1.0 / num_warmup_steps),
            warmup_duration=num_warmup_steps)

    def _prepare_batch(batch, device, non_blocking):
        """Move an ``(x, y)`` batch to ``device``."""
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def cycle(iterable):
        """Yield items from ``iterable`` forever, restarting it when exhausted."""
        while True:
            for i in iterable:
                yield i

    # The unlabelled loader is consumed independently of the labelled epochs.
    train_unlabelled_loader_iter = cycle(train_unlabelled_loader)

    lam = config['consistency_lambda']

    tsa = TrainingSignalAnnealing(num_steps=num_train_steps,
                                  min_threshold=config['TSA_proba_min'],
                                  max_threshold=config['TSA_proba_max'])

    with_tsa = config['with_TSA']
    with_UDA = not config['no_UDA']

    def uda_process_function(engine, labelled_batch):
        """Run one optimisation step and return the batch losses as floats."""

        x, y = _prepare_batch(labelled_batch, device=device, non_blocking=True)

        if with_UDA:
            # Each unlabelled batch is a pair of inputs; both go to the device.
            unsup_x, unsup_aug_x = next(train_unlabelled_loader_iter)
            unsup_x = convert_tensor(unsup_x, device=device, non_blocking=True)
            unsup_aug_x = convert_tensor(unsup_aug_x,
                                         device=device,
                                         non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        supervised_loss = loss
        step = engine.state.iteration - 1
        if with_tsa and with_UDA:
            # TSA returns a (possibly reduced) set of predictions/targets on
            # which the supervised loss is recomputed.
            new_y_pred, new_y = tsa(y_pred, y, step=step)
            new_loss = criterion(new_y_pred, new_y)

            # Stashed on the engine state so `log_tsa` can report it.
            engine.state.tsa_log = {
                "new_y_pred": new_y_pred,
                "loss": loss.item(),
                "tsa_loss": new_loss.item()
            }
            supervised_loss = new_loss

        # Unsupervised part
        if with_UDA:
            # Predictions on the first unlabelled input are detached, so
            # gradients flow only through the second forward pass.
            unsup_orig_y_pred = model(unsup_x).detach()
            unsup_orig_y_probas = torch.softmax(unsup_orig_y_pred, dim=-1)

            unsup_aug_y_pred = model(unsup_aug_x)
            unsup_aug_y_probas = torch.log_softmax(unsup_aug_y_pred, dim=-1)

            consistency_loss = consistency_criterion(unsup_aug_y_probas,
                                                     unsup_orig_y_probas)

        # Build the total out-of-place. `final_loss = supervised_loss` followed
        # by `final_loss += ...` would update the shared tensor in place and
        # make the reported 'supervised batch loss' equal to the final loss.
        if with_UDA:
            final_loss = supervised_loss + lam * consistency_loss
        else:
            final_loss = supervised_loss

        optimizer.zero_grad()
        final_loss.backward()
        optimizer.step()

        return {
            'supervised batch loss': supervised_loss.item(),
            'consistency batch loss':
            consistency_loss.item() if with_UDA else 0.0,
            'final batch loss': final_loss.item(),
        }

    trainer = Engine(uda_process_function)

    if with_UDA and with_tsa:

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_tsa(engine):
            """Log TSA statistics to MLflow every 50 steps."""
            step = engine.state.iteration - 1
            if step % 50 == 0:
                mlflow.log_metric("TSA threshold",
                                  tsa.thresholds[step].item(),
                                  step=step)
                mlflow.log_metric("TSA selection",
                                  engine.state.tsa_log['new_y_pred'].shape[0],
                                  step=step)
                mlflow.log_metric("Original X Loss",
                                  engine.state.tsa_log['loss'],
                                  step=step)
                mlflow.log_metric("TSA X Loss",
                                  engine.state.tsa_log['tsa_loss'],
                                  step=step)

    # A scheduler without a `step` method is attached directly as a handler;
    # otherwise its `step()` is called at the start of every iteration.
    if not hasattr(lr_scheduler, "step"):
        trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
    else:
        trainer.add_event_handler(Events.ITERATION_STARTED,
                                  lambda engine: lr_scheduler.step())

    @trainer.on(Events.ITERATION_STARTED)
    def log_learning_rate(engine):
        """Log the first param group's learning rate to MLflow every 50 steps."""
        step = engine.state.iteration - 1
        if step % 50 == 0:
            lr = optimizer.param_groups[0]['lr']
            mlflow.log_metric("learning rate", lr, step=step)

    if with_SWA:

        @trainer.on(Events.COMPLETED)
        def swap_swa_sgd(engine):
            """At the end of training, swap in the SWA weights and refresh BN stats."""
            optimizer.swap_swa_sgd()
            optimizer.bn_update(train_labelled_loader, model)

        @trainer.on(Events.EPOCH_COMPLETED)
        def update_swa(engine):
            """Update the SWA average once past 75% of the configured epochs."""
            # Read the epoch count from `config`: no local `num_epochs` exists
            # in this function, so the bare name raised NameError.
            if engine.state.epoch - 1 > int(config['num_epochs'] * 0.75):
                optimizer.update_swa()

    metric_names = [
        'supervised batch loss', 'consistency batch loss', 'final batch loss'
    ]

    def output_transform(x, name):
        """Pick one named entry from the step's output dict."""
        return x[name]

    for n in metric_names:
        RunningAverage(output_transform=partial(output_transform, name=n),
                       epoch_bound=False).attach(trainer, n)

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    tb_logger = TensorboardLogger(log_dir=output_path)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=[
                                                     'final batch loss',
                                                     'consistency batch loss',
                                                     'supervised batch loss'
                                                 ]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=Events.ITERATION_STARTED)

    metrics = {
        "accuracy": Accuracy(),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine, val_interval):
        """Evaluate on the labelled-train and test loaders every ``val_interval`` epochs."""
        if (engine.state.epoch - 1) % val_interval == 0:
            train_evaluator.run(train_labelled_loader)
            evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED,
                              run_validation,
                              val_interval=2)
    # With val_interval=1 the modulo check always passes, so a final
    # validation runs when training completes.
    trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=list(
                                                     metrics.keys()),
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names=list(
                                                     metrics.keys()),
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    def mlflow_batch_metrics_logging(engine, tag):
        """Log every metric in ``engine.state.metrics``, stepped by trainer iteration."""
        step = trainer.state.iteration
        for name, value in engine.state.metrics.items():
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    def mlflow_val_metrics_logging(engine, tag):
        """Log the evaluation metrics, stepped by trainer epoch."""
        step = trainer.state.epoch
        for name in metrics.keys():
            value = engine.state.metrics[name]
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train")
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train")
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test")

    trainer.run(train_labelled_loader, max_epochs=config['num_epochs'])
def run(args):
    """Train a ``GoogLeNetFCN`` segmentation model with ignite and log to TensorBoard.

    Sets up the model, data loaders, a summed cross-entropy criterion, an SGD
    optimizer with separate weight/bias parameter groups, a trainer and an
    evaluator, then runs training for ``args.epochs`` epochs. Validation runs
    after every training epoch, and a checkpoint is saved after every
    validation run.

    Args:
        args: Namespace-like object. Attributes read here: ``seed``,
            ``batch_size``, ``val_batch_size``, ``dataset_dir``,
            ``num_workers``, ``download``, ``augmentations``, ``lr``,
            ``momentum``, ``weight_decay``, ``resume``, ``freeze_bn``,
            ``log_dir``, ``output_dir``, ``epochs``. ``batch_size`` and
            ``val_batch_size`` are rescaled in place when several GPUs are
            visible, and ``start_epoch`` is set when resuming.
    """
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)

    num_classes = 21
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = GoogLeNetFCN(num_classes)
    model.init_from_googlenet()

    # With more than one GPU, wrap the model in DataParallel and multiply the
    # batch sizes by the device count. This happens before the loaders are built.
    device_count = torch.cuda.device_count()
    if device_count > 1:
        print("Using %d GPU(s)" % device_count)
        model = nn.DataParallel(model)
        args.batch_size = device_count * args.batch_size
        args.val_batch_size = device_count * args.val_batch_size

    model = model.to(device)

    train_loader, val_loader = get_data_loaders(
        args.dataset_dir, args.batch_size, args.val_batch_size,
        args.num_workers, args.download, args.augmentations)

    # Targets equal to 255 are ignored; the loss is summed rather than averaged.
    criterion = nn.CrossEntropyLoss(ignore_index=255, reduction='sum')

    # Two parameter groups: names ending in 'weight' use the default lr and
    # weight decay; names ending in 'bias' get twice the lr and no weight decay.
    # NOTE(review): parameters whose names end in neither suffix are not passed
    # to the optimizer at all - confirm the model has none.
    optimizer = optim.SGD([{
        'params': [
            param for name, param in model.named_parameters()
            if name.endswith('weight')
        ]
    }, {
        'params': [
            param for name, param in model.named_parameters()
            if name.endswith('bias')
        ],
        'lr':
        args.lr * 2,
        'weight_decay':
        0
    }],
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    # Optionally restore model/optimizer state plus the epoch counter and the
    # best IoU so far. `best_iou` is only bound on this path; `initialize`
    # below reads it only when `args.resume` is truthy.
    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['bestIoU']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("Loaded checkpoint '{}' (Epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))
            # NOTE(review): exits with the default (success) status even though
            # the requested checkpoint is missing.
            sys.exit()

    if args.freeze_bn:
        print("Freezing batch norm")
        model = freeze_batchnorm(model)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device,
                                        non_blocking=True)

    # Running average of the trainer output, exposed as the 'loss' metric.
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])

    cm = ConfusionMatrix(num_classes)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'loss': Loss(criterion),
                                                'IoU': IoU(cm)
                                            },
                                            device=device,
                                            non_blocking=True)

    pbar2 = ProgressBar(persist=True, desc='Eval Epoch')
    pbar2.attach(evaluator)

    def _global_step_transform(engine, event_name):
        """Use the trainer's iteration as the global step for evaluator logs."""
        return trainer.state.iteration

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag='training',
                                               metric_names=['loss']),
                     event_name=Events.ITERATION_COMPLETED)

    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)

    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(
                         tag='validation',
                         metric_names=['loss', 'IoU'],
                         global_step_transform=_global_step_transform),
                     event_name=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        """Save a full checkpoint after each validation run, plus the weights when mIoU improves."""
        iou = engine.state.metrics['IoU'] * 100.0
        mean_iou = iou.mean()

        # Compare against the best value before updating it, so `is_best`
        # reflects a strict improvement.
        is_best = mean_iou.item() > trainer.state.best_iou
        trainer.state.best_iou = max(mean_iou.item(), trainer.state.best_iou)

        name = 'epoch{}_mIoU={:.1f}.pth'.format(trainer.state.epoch, mean_iou)
        file = {
            'model': model.state_dict(),
            'epoch': trainer.state.epoch,
            'iteration': engine.state.iteration,
            'optimizer': optimizer.state_dict(),
            'args': args,
            'bestIoU': trainer.state.best_iou
        }

        # `save` is defined outside this function; presumably it writes the
        # object under `args.output_dir` with the given file name - confirm.
        save(file, args.output_dir, 'checkpoint_{}'.format(name))
        if is_best:
            save(model.state_dict(), args.output_dir, 'model_{}'.format(name))

    @trainer.on(Events.STARTED)
    def initialize(engine):
        """Seed the trainer state: restored counters when resuming, else best_iou = 0."""
        if args.resume:
            engine.state.epoch = args.start_epoch
            engine.state.iteration = args.start_epoch * len(
                engine.state.dataloader)
            engine.state.best_iou = best_iou
        else:
            engine.state.best_iou = 0.0

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        """Run the evaluator on the validation loader and print loss and mean IoU."""
        pbar.log_message("Start Validation - Epoch: [{}/{}]".format(
            engine.state.epoch, engine.state.max_epochs))
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        iou = metrics['IoU']
        mean_iou = iou.mean()

        pbar.log_message(
            "Validation results - Epoch: [{}/{}]: Loss: {:.2e}, mIoU: {:.1f}".
            format(engine.state.epoch, engine.state.max_epochs, loss,
                   mean_iou * 100.0))

    print("Start training")
    trainer.run(train_loader, max_epochs=args.epochs)
    tb_logger.close()
Beispiel #12
0
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    pbar = ProgressBar(persist=False, bar_format=None)
    pbar.attach(trainer, ["loss"])

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, lambda e: evaluator.run(valid_data_iter)
    )

    log_dir = config.pop("results_path")
    if Path(log_dir).exists():
        rmtree(log_dir)
    tb_logger = TensorboardLogger(log_dir=log_dir)

    tb_logger.attach(
        trainer,
        log_handler=OutputHandler(tag="training", metric_names=["loss"]),
        event_name=Events.EPOCH_COMPLETED,
    )

    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag="validation",
            metric_names=["loss", "ppl", "bleu", "lr"],
            another_engine=trainer,
        ),
        event_name=Events.EPOCH_COMPLETED,
    )

    trainer.run(train_data_iter, max_epochs=100)
Beispiel #13
0
def run(output_path, config):
    """Train a classifier with ignite, optionally under ``DistributedDataParallel``.

    Builds loaders, model, SGD optimizer, a piecewise-linear LR schedule, a
    trainer with the common checkpoint/progress handlers, and two evaluators.
    On rank 0 it also sets up TensorBoard logging and best-model saving.
    A checkpoint is optionally loaded before training runs for
    ``config["num_epochs"]`` epochs.

    Args:
        output_path: Directory used for TensorBoard logs, checkpoints and
            best-model files.
        config: Mapping of settings. Keys read here: ``local_rank``, ``seed``,
            ``batch_size``, ``num_workers``, ``data_path``, ``model``,
            ``learning_rate``, ``momentum``, ``weight_decay``,
            ``num_warmup_epochs``, ``num_epochs``, ``checkpoint_every``,
            ``display_iters``, ``validate_every``, ``log_model_grads_every``,
            ``crash_iteration``, ``resume_from``.
    """
    device = "cuda"

    local_rank = config["local_rank"]
    # NOTE(review): `backend` is not defined inside this function; presumably
    # it is a module-level global set by the launcher code - confirm.
    distributed = backend is not None
    if distributed:
        torch.cuda.set_device(local_rank)
        device = "cuda"
    rank = dist.get_rank() if distributed else 0

    # Offset the seed by rank so each process seeds differently.
    torch.manual_seed(config["seed"] + rank)

    # Rescale batch_size and num_workers
    # batch_size is divided by the world size; num_workers is divided by the
    # number of GPUs on this node, rounded up.
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config["batch_size"] // ngpus
    num_workers = int(
        (config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node)

    train_loader, test_loader = get_train_test_loaders(
        path=config["data_path"],
        batch_size=batch_size,
        distributed=distributed,
        num_workers=num_workers,
    )

    model = get_model(config["model"])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[
                local_rank,
            ],
            output_device=local_rank,
        )

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )

    criterion = nn.CrossEntropyLoss().to(device)

    # (iteration, lr) milestones: 0.0 at the start, the configured learning
    # rate at the end of warm-up, and 0.0 again at the last training iteration.
    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        """Move an ``(x, y)`` batch to ``device``."""
        x, y = batch
        return (
            convert_tensor(x, device=device, non_blocking=non_blocking),
            convert_tensor(y, device=device, non_blocking=non_blocking),
        )

    def process_function(engine, batch):
        """Run one supervised optimisation step and return the batch loss as a float."""

        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    trainer = Engine(process_function)
    train_sampler = train_loader.sampler if distributed else None
    # Objects that are checkpointed, and restored by `Checkpoint.load_objects`
    # when resuming below.
    to_save = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
    }
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    # TensorBoard logging is set up on rank 0 only; `tb_logger` does not exist
    # on other ranks.
    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(optimizer, param_name="lr"),
            event_name=Events.ITERATION_STARTED,
        )

    # The metrics receive a device only in the distributed case.
    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        """Evaluate the model on the train and test loaders."""
        torch.cuda.synchronize()
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    # Validate at the start of every `validate_every`-th epoch and once more
    # when training completes.
    trainer.add_event_handler(
        Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        if config["display_iters"]:
            ProgressBar(persist=False,
                        desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False,
                        desc="Test evaluation").attach(evaluator)

        # Evaluation metrics are logged against the trainer's global step.
        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )

        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )

        # Store the best model by validation accuracy:
        common.save_best_model_by_val_score(
            output_path,
            evaluator,
            model=model,
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )

        # Optionally log gradient histograms every N iterations.
        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model,
                                             tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(
                    every=config["log_model_grads_every"]),
            )

    # Optionally raise once at a configured iteration to emulate a crash.
    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(
                engine.state.iteration))

    # Optionally restore every object in `to_save` from a checkpoint file.
    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(
            checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    # Any exception raised during training is printed rather than re-raised,
    # so the logger below is still closed.
    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
Beispiel #14
0
def run(args):
    """Train a ``GoogLeNetFCN`` segmentation model on Cityscapes with ignite.

    Builds the data loaders, model, criterion and an SGD optimizer with
    separate bias / non-bias parameter groups, optionally restores a
    checkpoint, then runs training for ``args.epochs`` epochs. Validation runs
    after every training epoch, and a checkpoint plus the model weights are
    saved after every validation run. A ``KeyboardInterrupt`` after the first
    iteration saves an emergency checkpoint and stops training gracefully.

    Args:
        args: Namespace-like object. Attributes read here: ``dataset_dir``,
            ``batch_size``, ``val_batch_size``, ``num_workers``, ``seed``,
            ``lr``, ``momentum``, ``resume``, ``log_dir``, ``output_dir``,
            ``epochs``. ``start_epoch`` is set when a checkpoint is loaded.
    """
    train_loader, val_loader = get_data_loaders(args.dataset_dir, args.batch_size, args.val_batch_size,
                                                args.num_workers)

    if args.seed is not None:
        torch.manual_seed(args.seed)

    num_classes = CityscapesDataset.num_classes()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = GoogLeNetFCN(num_classes)
    model.init_from_googlenet()

    device_count = torch.cuda.device_count()
    if device_count > 1:
        print("Using %d GPU(s)" % device_count)
        model = nn.DataParallel(model)
        # NOTE(review): the loaders were already built above, so these rescaled
        # batch sizes do not affect them - confirm whether that is intended.
        args.batch_size = device_count * args.batch_size
        args.val_batch_size = device_count * args.val_batch_size

    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=255)

    # `named_parameters()` yields (name, parameter) pairs. The previous code
    # unpacked them as (p, name), so it filtered on the tensor and put the
    # name strings into the param groups.
    # Non-bias parameters use the base lr with weight decay; biases use twice
    # the lr and no explicit weight decay.
    named_params = list(model.named_parameters())
    optimizer = optim.SGD([{'params': [p for name, p in named_params if not name.endswith('bias')],
                            'lr': args.lr, 'weight_decay': 5e-4},
                           {'params': [p for name, p in named_params if name.endswith('bias')],
                            'lr': args.lr * 2}], momentum=args.momentum, lr=args.lr)

    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("Loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            # NOTE(review): training continues without a checkpoint here, and
            # `initialize` below still reads `args.start_epoch`.
            print("No checkpoint found at '{}'".format(args.resume))

    trainer = create_supervised_trainer(model, optimizer, criterion, device, non_blocking=True)
    # Running average of the trainer output, exposed as the 'loss' metric.
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])

    cm = ConfusionMatrix(num_classes)
    evaluator = create_supervised_evaluator(model, metrics={'loss': Loss(criterion),
                                                            'IoU': IoU(cm, ignore_index=0)},
                                            device=device, non_blocking=True)

    pbar2 = ProgressBar(persist=True, desc='Eval Epoch')
    pbar2.attach(evaluator)

    def _global_step_transform(engine, event_name):
        """Use the trainer's iteration as the global step for evaluator logs."""
        return trainer.state.iteration

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag='training',
                                               metric_names=['loss']),
                     event_name=Events.ITERATION_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag='validation',
                                               metric_names=['loss', 'IoU'],
                                               global_step_transform=_global_step_transform),
                     event_name=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        """Save a full checkpoint and the bare model weights after each validation run."""
        iou = engine.state.metrics['IoU'] * 100.0
        mean_iou = iou.mean()

        name = 'epoch{}_mIoU={:.1f}.pth'.format(trainer.state.epoch, mean_iou)
        file = {'model': model.state_dict(), 'epoch': trainer.state.epoch,
                'optimizer': optimizer.state_dict(), 'args': args}

        torch.save(file, os.path.join(args.output_dir, 'checkpoint_{}'.format(name)))
        torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_{}'.format(name)))

    @trainer.on(Events.STARTED)
    def initialize(engine):
        """Restore the epoch counter when resuming."""
        if args.resume:
            engine.state.epoch = args.start_epoch

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        """Run the evaluator on the validation loader and print loss and mean IoU."""
        pbar.log_message('Start Validation - Epoch: [{}/{}]'.format(engine.state.epoch, engine.state.max_epochs))
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        iou = metrics['IoU']
        mean_iou = iou.mean()

        pbar.log_message('Validation results - Epoch: [{}/{}]: Loss: {:.2e}, mIoU: {:.1f}'
                         .format(engine.state.epoch, engine.state.max_epochs, loss, mean_iou * 100.0))

    @trainer.on(Events.EXCEPTION_RAISED)
    def handle_exception(engine, e):
        """On Ctrl-C after the first iteration, save state and stop; otherwise re-raise."""
        engine.state.exception_raised = True
        if isinstance(e, KeyboardInterrupt) and (engine.state.iteration > 1):
            engine.terminate()
            warnings.warn("KeyboardInterrupt caught. Exiting gracefully.")

            name = 'epoch{}_exception.pth'.format(trainer.state.epoch)
            file = {'model': model.state_dict(), 'epoch': trainer.state.epoch,
                    'optimizer': optimizer.state_dict()}

            torch.save(file, os.path.join(args.output_dir, 'checkpoint_{}'.format(name)))
            torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_{}'.format(name)))
        else:
            raise e

    print("Start training")
    trainer.run(train_loader, max_epochs=args.epochs)
    tb_logger.close()
Beispiel #15
0
def run(batch_size=1,
        log_interval=50,
        debug_images_interval=10,
        val_dataset_ann_file='~/bigdata/coco/annotations/instances_val2017.json',
        input_checkpoint='',
        log_dir="/tmp/tensorboard_logs",
        use_mask=True,
        backbone_name='resnet101'):
    """Evaluate a saved detection / segmentation model and log to TensorBoard.

    Builds the validation loader, restores the model weights from
    ``input_checkpoint``, runs an Ignite evaluator once over the loader with
    the ``AP``, ``AP0.5`` and ``AP0.75`` metrics attached, and writes the
    metrics, periodic debug images and the hyper-parameters to TensorBoard.

    Side effects: sets ``configuration_data['num_classes']`` and
    ``torch.backends.cudnn.benchmark``. The TensorBoard writer is closed when
    the function exits, including when the evaluation raises.

    Args:
        batch_size: Batch size passed to ``get_eval_data_loader``; also
            recorded as a hyper-parameter.
        log_interval: Print a progress line every this many iterations.
        debug_images_interval: Write debug images every this many iterations.
        val_dataset_ann_file: Annotation file passed to
            ``get_eval_data_loader``.
        input_checkpoint: Path of the checkpoint given to ``torch.load``; its
            ``'model'`` entry is loaded into the model. A ``hparams.pickle``
            file in the same directory is merged into the logged
            hyper-parameters when it exists.
        log_dir: Directory handed to ``TensorboardLogger``.
        use_mask: If true, build the model with
            ``get_model_instance_segmentation``; otherwise with
            ``get_model_instance_detection``.
        backbone_name: Backbone passed to ``get_model_instance_detection``
            (only used when ``use_mask`` is false).
    """
    hparam_dict = {
        'batch_size': batch_size,
    }

    # Merge in the hyper-parameters stored next to the checkpoint, if any.
    hparam_file = Path(input_checkpoint).parent / 'hparams.pickle'
    try:
        print('Opening hparams file from {}'.format(hparam_file.absolute()))
        with open(hparam_file, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only point
            # ``input_checkpoint`` at directories whose contents you trust.
            data = pickle.load(f)
            print('Updating hparams with {}'.format(data))
            hparam_dict.update(data)
    except FileNotFoundError:
        print('HParam file not found at {}'.format(hparam_file.absolute()))

    print('Params: {}'.format(hparam_dict))

    # Build the validation loader and flatten all of its batches into one
    # list of per-sample tuples for the COCO API conversion. This iterates
    # the whole loader once before the evaluation starts.
    val_loader, labels_enum = get_eval_data_loader(
        val_dataset_ann_file,
        batch_size,
        configuration_data.get('image_size'),
        use_mask=use_mask)
    val_dataset = list(
        chain.from_iterable(zip(*batch) for batch in val_loader))
    coco_api_val_dataset = convert_to_coco_api(val_dataset)
    # number of classes plus one for background class
    num_classes = max(labels_enum.keys()) + 1
    configuration_data['num_classes'] = num_classes
    print('Testing with {} classes...'.format(num_classes))

    # Use the current CUDA device if available, otherwise the CPU.
    cuda_available = torch.cuda.is_available()
    device = (torch.cuda.current_device()
              if cuda_available else torch.device('cpu'))
    # optimization for fixed input size
    torch.backends.cudnn.benchmark = cuda_available

    if use_mask:
        print('Loading MaskRCNN Model...')
        model = get_model_instance_segmentation(
            num_classes, configuration_data.get('mask_predictor_hidden_layer'))
    else:
        print('Loading FasterRCNN Model...')
        model = get_model_instance_detection(num_classes,
                                             backbone_name=backbone_name)
    iou_types = get_iou_types(model)

    # if there is more than one GPU, parallelize the model
    if torch.cuda.device_count() > 1:
        print("{} GPUs were detected - we will use all of them".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    model.to(device)

    # NOTE(review): the state dict is loaded after the optional DataParallel
    # wrap, so on multi-GPU hosts the checkpoint keys presumably need the
    # 'module.' prefix -- confirm against how the checkpoint was saved.
    print('Loading model checkpoint from {}'.format(input_checkpoint))
    checkpoint = torch.load(input_checkpoint,
                            map_location=torch.device(device))
    model.load_state_dict(checkpoint['model'])

    comment = 'mask' if use_mask else 'box-{}'.format(backbone_name)
    tb_logger = TensorboardLogger(log_dir=log_dir, comment=comment)
    writer = tb_logger.writer

    # define Ignite's evaluation engine and attach the COCO AP metrics
    evaluator = create_evaluator(model, device)

    coco_ap = CocoAP(coco_api_val_dataset, iou_types)
    coco_ap_05 = CocoAP5(coco_api_val_dataset, iou_types)
    coco_ap_075 = CocoAP75(coco_api_val_dataset, iou_types)
    coco_ap.attach(evaluator, "AP")
    coco_ap_05.attach(evaluator, "AP0.5")
    coco_ap_075.attach(evaluator, "AP0.75")

    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag='evaluation',
            metric_names=['AP', 'AP0.5', 'AP0.75'],
            global_step_transform=global_step_from_engine(evaluator)),
        event_name=Events.COMPLETED)

    @evaluator.on(Events.STARTED)
    def on_evaluation_started(engine):
        """Put the model into eval mode before the first iteration."""
        model.eval()

    @evaluator.on(Events.ITERATION_COMPLETED)
    def on_eval_iteration_completed(engine):
        """Print progress and periodically write debug images."""
        images, targets, results = engine.state.output
        # This handler is only attached to ``evaluator``, so ``engine`` is
        # the evaluator and one iteration counter serves tags and steps.
        step = engine.state.iteration

        if step % log_interval == 0:
            print("Evaluation: Iteration: {}".format(step))

        if step % debug_images_interval == 0:
            print('Saving debug image...')
            for n, debug_image in enumerate(
                    draw_debug_images(images, targets, results)):
                writer.add_image("evaluation/image_{}_{}".format(step, n),
                                 debug_image,
                                 step,
                                 dataformats='HWC')
                if 'masks' in targets[n]:
                    writer.add_image(
                        "evaluation/image_{}_{}_mask".format(step, n),
                        draw_mask(targets[n]),
                        step,
                        dataformats='HW')
                    # results is indexed by the target's image id
                    curr_image_id = int(targets[n]['image_id'])
                    writer.add_image(
                        "evaluation/image_{}_{}_predicted_mask".format(
                            step, n),
                        draw_mask(results[curr_image_id]).squeeze(),
                        step,
                        dataformats='HW')

        # Drop the engine's reference to this batch's output so it is not
        # kept on the engine state after this handler returns.
        engine.state.output = None

    @evaluator.on(Events.COMPLETED)
    def on_evaluation_completed(engine):
        """Write the hyper-parameters with the final AP values and PR curve."""
        print('Writing hparams: {}'.format(hparam_dict))

        writer.add_hparams(hparam_dict=hparam_dict,
                           metric_dict={
                               'hparams/AP': coco_ap.ap,
                               'hparams/AP.5': coco_ap_05.ap5,
                               'hparams/AP.75': coco_ap_075.ap75
                           })

        coco_ap.write_tensorboard_pr_curve(writer)

    # Close the writer even if the evaluation fails part-way through, so
    # that already-written events are not left in an unclosed writer.
    try:
        evaluator.run(val_loader)
    finally:
        writer.close()