Example #1
def train():
    torch.manual_seed(config_module.get('random_seed', 0))
    np.random.seed(config_module.get('random_seed', 0))

    base_config = config_module.get('base_config', None)
    audio_config = config_module.get('audio_config', None)
    decoder_config = config_module.get('decoder_config', None)

    # =============== setup hyper parameters ===============

    ##### setup for continue_learning #####
    if continue_learning:
        base_config['load_model'] = get_last_checkpoints(
            base_config['log_dir'])
        base_config['load_optim'] = True
        base_config['load_only_lr'] = False

    ##### hyper parameters #####
    batch_size = base_config['batch_size']
    max_epochs = base_config.get('max_epochs', 200)
    ctc_loss_weight = base_config.get('ctc_loss_weight', 1)
    # one weight per context head; the list lengths set the head counts below
    ct_loss_left_weight = base_config.get('ct_loss_left_weight', [])
    ct_loss_right_weight = base_config.get('ct_loss_right_weight', [])
    n_left_context_heads = len(ct_loss_left_weight)
    n_right_context_heads = len(ct_loss_right_weight)
    eval_ct_steps = base_config.get('eval_steps', 0) * base_config.get(
        'eval_ct_steps', 0)
    num_data_loader_workers = base_config.get('num_data_loader_workers', 0)
    device = base_config.get('device', 'cuda')

    print("data size", len(train_set))
    print("batch_size", batch_size)
    print("max_epochs", max_epochs)
    print('ctc_loss_weight', ctc_loss_weight)
    print('cctc_loss_weight', ct_loss_left_weight, ct_loss_right_weight)
    print("feature:", audio_config['features_type'])
    print("num_audio_features:", audio_config['num_audio_features'])
    print("alphabet_count", alphabet.size())
    print("num_data_loader_workers", num_data_loader_workers)
    print("mixed_precision", base_config.get('mixed_precision', False))
    print("log_dir", base_config['log_dir'])
    print('num_pool_workers', base_config.get('num_pool_workers', 8))
    print("train_version", train_version)
    print()

    # ============ Initialize model, loss, optimizer ============
    model = base_config['model'](audio_config['num_audio_features'],
                                 alphabet.size(),
                                 n_left_context_heads=n_left_context_heads,
                                 n_right_context_heads=n_right_context_heads,
                                 blank_index=blank_index)

    model.to(device)
    epoch = 0
    step = 0

    # loss criteria for the (omitted) training loop:
    # ctc_criterion = nn.CTCLoss(blank=blank_index, reduction='mean', zero_infinity=True)
    # ct_criterion = CTLoss(blank_index=blank_index, version='numpy')
    # total_steps = len(train_loader) * max_epochs

    optimizer = base_config['optimizer'](model.parameters(), **base_config.get(
        'optimizer_params', {}))
    optimizer_wrapper = base_config.get('optimizer_wrapper', None)
    if optimizer_wrapper is not None:
        optimizer = optimizer_wrapper(
            optimizer, **base_config.get('optimizer_wrapper_params', {}))
    model.configure_optimizers(optimizer)  # needs the optimizer, so call it here
    scheduler = base_config.get('scheduler', None)

    if scheduler is not None:
        #if('max_decay_steps' not in base_config['scheduler_params']):
        #    max_decay_steps = total_steps - step
        #    base_config['scheduler_params']['max_decay_steps'] = max_decay_steps
        lr_scheduler = scheduler(optimizer,
                                 **base_config.get('scheduler_params', {}))
        warmup_params = base_config.get('warmup_params', None)
        if warmup_params is not None:
            scheduler = GradualWarmupScheduler(optimizer,
                                               **warmup_params,
                                               after_scheduler=lr_scheduler)
        else:
            scheduler = lr_scheduler

    if base_config.get('load_model'):  # covers both "" and None
        print('load model:', base_config.get('load_model'), '\n')
        checkpoint = torch.load(base_config.get('load_model'))

        # for a train_version 5 transfer (not a resume), drop the output heads
        # so they are freshly initialized
        if train_version == 5 and not continue_learning:
            for lat_name in list(checkpoint['model_state_dict'].keys()):
                if 'output' in lat_name:
                    del checkpoint['model_state_dict'][lat_name]

        pretrained_dict = checkpoint['model_state_dict']
        model_dict = model.state_dict()
        # 1. keep the model's fresh initialization for keys that are missing
        #    from the checkpoint or have a different shape
        unmet_params = {
            k: v
            for k, v in model_dict.items() if k not in pretrained_dict
            or pretrained_dict[k].shape != model_dict[k].shape
        }
        pretrained_dict.update(unmet_params)

        # 2. keep only keys that exist in the model
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }

        # 3. load the new state dict
        model.load_state_dict(pretrained_dict)
        if unmet_params:
            print("Newly initialized parameters:")
            for k in unmet_params:
                print(k)

        if (base_config.get("load_optim", True)):
            opt_pre_state = checkpoint['optimizer_state_dict']
            current_opt_state = optimizer.state_dict()
            if (base_config.get("load_only_lr", False)):
                current_opt_state['param_groups'][0].update(
                    {'lr': opt_pre_state['param_groups'][0]['lr']})
                optimizer.load_state_dict(current_opt_state)
            else:
                try:
                    optimizer.load_state_dict(opt_pre_state)
                except ValueError:
                    # the param groups don't match; fall back to copying the LR
                    current_opt_state['param_groups'][0].update(
                        {'lr': opt_pre_state['param_groups'][0]['lr']})
                    optimizer.load_state_dict(current_opt_state)

            if (checkpoint.get('scheduler_state_dict') is not None
                    and base_config.get("load_scheduler", True)
                    and scheduler is not None):
                scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

        step = checkpoint['step']
        epoch = checkpoint['epoch'] + 1
        loss = checkpoint['loss']
    else:
        print('training from scratch')
    print()
    print('optimizer', optimizer)
    print('scheduler', scheduler)

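Example #1 calls a `get_last_checkpoints` helper that is not shown. A minimal sketch of what it could look like, assuming checkpoints are written as `.pt` files into the log directory; the glob pattern is an assumption, and the empty-string fallback matches the `load_model` guard above:

import glob
import os

def get_last_checkpoints(log_dir):
    # Hypothetical helper: return the most recently modified checkpoint in
    # log_dir, or "" so the caller falls back to training from scratch.
    paths = glob.glob(os.path.join(log_dir, '*.pt'))
    return max(paths, key=os.path.getmtime) if paths else ""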
Example #2
def train(name, df, VAL_FOLD=0, resume=False):
    dt_string = datetime.now().strftime("%d|%m_%H|%M|%S")
    print("Starting -->", dt_string)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs('checkpoint', exist_ok=True)
    run = f"{name}_[{dt_string}]"

    wandb.init(project="imanip", config=config_defaults, name=run)
    config = wandb.config

    # model = SRM_Classifer(num_classes=1, encoder_checkpoint='weights/pretrain_[31|03_12|16|32].h5')
    model = SMP_SRM_UPP(classifier_only=True)

    # for name_, param in model.named_parameters():
    #     if 'classifier' in name_:
    #         continue
    #     else:
    #         param.requires_grad = False

    print("Parameters : ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    wandb.save('segmentation/smp_srm.py')
    wandb.save('dataset.py')

    train_imgaug, train_geo_aug = get_train_transforms()
    transforms_normalize = get_transforms_normalize()

    #region ########################-- CREATE DATASET and DATALOADER --########################
    train_dataset = DATASET(dataframe=df,
                            mode="train",
                            val_fold=VAL_FOLD,
                            test_fold=TEST_FOLD,
                            transforms_normalize=transforms_normalize,
                            imgaug_augment=train_imgaug,
                            geo_augment=train_geo_aug)
    train_loader = DataLoader(train_dataset,
                              batch_size=config.train_batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False)

    valid_dataset = DATASET(
        dataframe=df,
        mode="val",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.valid_batch_size,
                              shuffle=False,  # no need to shuffle validation data
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False)

    test_dataset = DATASET(
        dataframe=df,
        mode="test",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    test_loader = DataLoader(test_dataset,
                             batch_size=config.valid_batch_size,
                             shuffle=False,  # no need to shuffle test data
                             num_workers=4,
                             pin_memory=True,
                             drop_last=False)
    #endregion ######################################################################################

    optimizer = get_optimizer(model, config.optimizer, config.learning_rate,
                              config.weight_decay)
    # after_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #     optimizer,
    #     patience=config.schedule_patience,
    #     mode="min",
    #     factor=config.schedule_factor,
    # )
    # T_0/T_mult are warm-restart parameters and belong to
    # CosineAnnealingWarmRestarts, not CosineAnnealingLR
    after_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=35, T_mult=2)
    scheduler = GradualWarmupScheduler(optimizer=optimizer,
                                       multiplier=1,
                                       total_epoch=config.warmup + 1,
                                       after_scheduler=after_scheduler)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    # optimizer.zero_grad()
    # optimizer.step()

    criterion = nn.BCEWithLogitsLoss()
    es = EarlyStopping(patience=200, mode="min")

    model = nn.DataParallel(model).to(device)

    # wandb.watch(model, log_freq=50, log='all')

    start_epoch = 0
    if resume:
        checkpoint = torch.load(
            'checkpoint/(using pretrain)COMBO_ALL_FULL_[09|04_12|46|35].pt')
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print("-----------> Resuming <------------")

    for epoch in range(start_epoch, config.epochs):
        print(f"Epoch = {epoch}/{config.epochs-1}")
        print("------------------")

        train_metrics = train_epoch(model, train_loader, optimizer, scheduler,
                                    criterion, epoch)
        valid_metrics = valid_epoch(model, valid_loader, criterion, epoch)

        # the cosine after_scheduler takes no metric argument; pass valid_loss
        # only when using the ReduceLROnPlateau variant above
        scheduler.step()

        print(
            f"TRAIN_ACC = {train_metrics['train_acc_05']}, TRAIN_LOSS = {train_metrics['train_loss']}"
        )
        print(
            f"VALID_ACC = {valid_metrics['valid_acc_05']}, VALID_LOSS = {valid_metrics['valid_loss']}"
        )
        print("Optimizer LR", optimizer.param_groups[0]['lr'])
        print("Scheduler LR", scheduler.get_lr()[0])
        wandb.log({
            'optim_lr': optimizer.param_groups[0]['lr'],
            'schedule_lr': scheduler.get_last_lr()[0]
        })

        es(
            valid_metrics["valid_loss"],
            model,
            model_path=os.path.join(OUTPUT_DIR, f"{run}.h5"),
        )
        if es.early_stop:
            print("Early stopping")
            break

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(checkpoint, os.path.join('checkpoint', f"{run}.pt"))

    best_path = os.path.join(OUTPUT_DIR, f"{run}.h5")
    if os.path.exists(best_path):
        model.load_state_dict(torch.load(best_path))
        print("LOADED FOR TEST")

    test_metrics = test(model, test_loader, criterion)
    wandb.save(best_path)

    return test_metrics
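Example #2 depends on an `EarlyStopping` utility that is not defined in the snippet. A minimal sketch consistent with the calls above (`es(valid_loss, model, model_path=...)` and `es.early_stop`); the `delta` tolerance and saving `model.state_dict()` on improvement are assumptions:

import torch

class EarlyStopping:
    # Hypothetical implementation matching the usage in Example #2.
    def __init__(self, patience=7, mode="min", delta=0.0):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.counter = 0
        self.best = None
        self.early_stop = False

    def __call__(self, metric, model, model_path):
        # Negate so that "bigger is better" holds for both modes.
        score = -metric if self.mode == "min" else metric
        if self.best is None or score > self.best + self.delta:
            # Improvement: save the weights and reset the patience counter.
            self.best = score
            torch.save(model.state_dict(), model_path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True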
Example #3
class TrainingLoop:
    def __init__(self,
                 model_kwargs,
                 train_positive_paths,
                 train_negative_paths,
                 train_unlabeled_paths,
                 val_positive_paths,
                 val_negative_paths,
                 val_unlabeled_paths,
                 data_cache_dir: str,
                 notify_callback: Callable[[Dict[str, Any]],
                                           None] = lambda x: None):
        '''The training loop for background splitting models.'''
        self.data_cache_dir = data_cache_dir
        self.notify_callback = notify_callback

        self._setup_model_kwargs(model_kwargs)

        # Setup dataset
        self._setup_dataset(train_positive_paths, train_negative_paths,
                            train_unlabeled_paths, val_positive_paths,
                            val_negative_paths, val_unlabeled_paths)

        # Setup model, losses, and optimizer (all in _setup_model)
        self._setup_model()

        # Resume if requested
        resume_from = model_kwargs.get('resume_from', None)
        if resume_from:
            resume_training = model_kwargs.get('resume_training', False)
            self.load_checkpoint(resume_from, resume_training=resume_training)

        self.writer = SummaryWriter(log_dir=model_kwargs['log_dir'])

        # Variables for estimating run-time
        self.train_batch_time = EMA(0)
        self.val_batch_time = EMA(0)
        self.train_batches_per_epoch = len(self.train_dataloader)
        self.val_batches_per_epoch = len(self.val_dataloader)
        self.train_batch_idx = 0
        self.val_batch_idx = 0
        self.train_epoch_loss = 0
        self.train_epoch_main_loss = 0
        self.train_epoch_aux_loss = 0

    def _setup_model_kwargs(self, model_kwargs):
        self.model_kwargs = copy.deepcopy(model_kwargs)
        self.num_workers = NUM_WORKERS
        self.val_frequency = model_kwargs.get('val_frequency', 1)
        self.checkpoint_frequency = model_kwargs.get('checkpoint_frequency', 1)
        self.use_cuda = bool(model_kwargs.get('use_cuda', True))
        assert 'model_dir' in model_kwargs
        self.model_dir = model_kwargs['model_dir']
        assert 'aux_labels' in model_kwargs
        self.aux_weight = float(model_kwargs.get('aux_weight', 0.1))
        assert 'log_dir' in model_kwargs

    def _setup_dataset(self, train_positive_paths, train_negative_paths,
                       train_unlabeled_paths, val_positive_paths,
                       val_negative_paths, val_unlabeled_paths):
        assert self.model_kwargs
        aux_labels = self.model_kwargs['aux_labels']
        image_input_size = self.model_kwargs.get('input_size', 224)
        batch_size = int(self.model_kwargs.get('batch_size', 64))
        num_workers = self.num_workers
        restrict_aux_labels = bool(
            self.model_kwargs.get('restrict_aux_labels', True))
        cache_images_on_disk = self.model_kwargs.get('cache_images_on_disk',
                                                     False)

        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ConvertImageDtype(torch.float32),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        resize_size = int(image_input_size * 1.15)
        resize_size += resize_size % 2  # round up to an even size
        val_transform = transforms.Compose([
            transforms.Resize(resize_size),
            transforms.CenterCrop(image_input_size),
            transforms.ConvertImageDtype(torch.float32),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        self.train_dataloader = DataLoader(AuxiliaryDataset(
            positive_paths=train_positive_paths,
            negative_paths=train_negative_paths,
            unlabeled_paths=train_unlabeled_paths,
            auxiliary_labels=aux_labels,
            restrict_aux_labels=restrict_aux_labels,
            cache_images_on_disk=cache_images_on_disk,
            data_cache_dir=self.data_cache_dir,
            transform=train_transform),
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=num_workers)
        self.val_dataloader = DataLoader(AuxiliaryDataset(
            positive_paths=val_positive_paths,
            negative_paths=val_negative_paths,
            unlabeled_paths=val_unlabeled_paths,
            auxiliary_labels=aux_labels,
            restrict_aux_labels=restrict_aux_labels,
            cache_images_on_disk=cache_images_on_disk,
            data_cache_dir=self.data_cache_dir,
            transform=val_transform),
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers)

    def _setup_model(self):
        num_classes = 2
        num_aux_classes = self.train_dataloader.dataset.num_auxiliary_classes
        freeze_backbone = self.model_kwargs.get('freeze_backbone', False)
        self.model_kwargs['num_aux_classes'] = num_aux_classes
        self.model = Model(num_main_classes=num_classes,
                           num_aux_classes=num_aux_classes,
                           freeze_backbone=freeze_backbone)
        if self.model_kwargs.get('aux_labels_type', None) == "imagenet":
            # Initialize auxiliary head to imagenet fc
            self.model.auxiliary_head.weight = self.model.backbone.fc.weight
            self.model.auxiliary_head.bias = self.model.backbone.fc.bias
        if self.use_cuda:
            self.model = self.model.cuda()
        self.model = nn.DataParallel(self.model)
        self.main_loss = nn.CrossEntropyLoss()
        self.auxiliary_loss = nn.CrossEntropyLoss()
        self.start_epoch = 0
        self.end_epoch = self.model_kwargs.get('epochs_to_run', 1)
        self.current_epoch = 0
        self.global_train_batch_idx = 0
        self.global_val_batch_idx = 0

        lr = float(self.model_kwargs.get('initial_lr', 0.01))
        endlr = float(self.model_kwargs.get('endlr', 0.0))
        optim_params = dict(
            lr=lr,
            momentum=float(self.model_kwargs.get('momentum', 0.9)),
            weight_decay=float(self.model_kwargs.get('weight_decay', 0.0001)),
        )
        self.optimizer = optim.SGD(self.model.parameters(), **optim_params)
        max_epochs = int(self.model_kwargs.get('max_epochs', 90))
        warmup_epochs = int(self.model_kwargs.get('warmup_epochs', 0))
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                               max_epochs -
                                                               warmup_epochs,
                                                               eta_min=endlr)
        self.optimizer_scheduler = GradualWarmupScheduler(
            optimizer=self.optimizer,
            multiplier=1.0,
            warmup_epochs=warmup_epochs,
            after_scheduler=scheduler)

    def _notify(self):
        epochs_left = self.end_epoch - self.current_epoch - 1
        num_train_batches_left = (
            epochs_left * self.train_batches_per_epoch +
            max(0, self.train_batches_per_epoch - self.train_batch_idx - 1))
        num_val_batches_left = (
            (1 + round(epochs_left / self.val_frequency)) *
            self.val_batches_per_epoch +
            max(0, self.val_batches_per_epoch - self.val_batch_idx - 1))
        time_left = (num_train_batches_left * self.train_batch_time.value +
                     num_val_batches_left * self.val_batch_time.value)
        self.notify_callback(**{"training_time_left": time_left})

    def setup_resume(self, train_positive_paths, train_negative_paths,
                     train_unlabeled_paths, val_positive_paths,
                     val_negative_paths, val_unlabeled_paths):
        self._setup_dataset(train_positive_paths, train_negative_paths,
                            train_unlabeled_paths, val_positive_paths,
                            val_negative_paths, val_unlabeled_paths)
        self.start_epoch = self.end_epoch
        self.current_epoch = self.start_epoch
        self.end_epoch = self.start_epoch + self.model_kwargs.get(
            'epochs_to_run', 1)

    def load_checkpoint(self, path: str, resume_training: bool = False):
        checkpoint_state = torch.load(path)
        self.model.load_state_dict(checkpoint_state['state_dict'])
        if resume_training:
            self.global_train_batch_idx = checkpoint_state[
                'global_train_batch_idx']
            self.global_val_batch_idx = checkpoint_state[
                'global_val_batch_idx']
            self.start_epoch = checkpoint_state['epoch'] + 1
            self.current_epoch = self.start_epoch
            self.end_epoch = (self.start_epoch +
                              self.model_kwargs.get('epochs_to_run', 1))
            self.optimizer.load_state_dict(checkpoint_state['optimizer'])
            self.optimizer_scheduler.load_state_dict(
                checkpoint_state['optimizer_scheduler'])
            # Copy tensorboard state
            prev_log_dir = checkpoint_state['model_kwargs']['log_dir']
            curr_log_dir = self.model_kwargs['log_dir']
            shutil.copytree(prev_log_dir, curr_log_dir, dirs_exist_ok=True)

    def save_checkpoint(self, epoch, checkpoint_path: str):
        kwargs = dict(self.model_kwargs)
        del kwargs['aux_labels']
        state = dict(
            global_train_batch_idx=self.global_train_batch_idx,
            global_val_batch_idx=self.global_val_batch_idx,
            model_kwargs=kwargs,
            epoch=epoch,
            state_dict=self.model.state_dict(),
            optimizer=self.optimizer.state_dict(),
            optimizer_scheduler=self.optimizer_scheduler.state_dict(),
        )
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        torch.save(state, checkpoint_path)

    def _validate(self, dataloader):
        self.model.eval()
        loss_value = 0
        main_gts = []
        aux_gts = []
        main_preds = []
        aux_preds = []
        for batch_idx, (images, main_labels,
                        aux_labels) in enumerate(dataloader):
            batch_start = time.perf_counter()
            self.val_batch_idx = batch_idx
            if self.use_cuda:
                images = images.cuda()
                main_labels = main_labels.cuda()
                aux_labels = aux_labels.cuda()
            main_logits, aux_logits = self.model(images)
            valid_main_labels = main_labels != -1
            valid_aux_labels = aux_labels != -1
            main_loss_value = self.main_loss(main_logits[valid_main_labels],
                                             main_labels[valid_main_labels])
            aux_loss_value = self.aux_weight * self.auxiliary_loss(
                aux_logits[valid_aux_labels], aux_labels[valid_aux_labels])
            # accumulate the epoch loss over batches
            batch_loss = torch.zeros_like(main_loss_value)
            if valid_main_labels.sum() > 0:
                batch_loss += main_loss_value
            if valid_aux_labels.sum() > 0:
                batch_loss += aux_loss_value
            loss_value += batch_loss.item()

            if valid_main_labels.sum() > 0:
                # the logits are already masked, so take argmax directly
                main_pred = F.softmax(main_logits[valid_main_labels], dim=1)
                main_preds += list(main_pred.argmax(dim=1).cpu().numpy())
                main_gts += list(main_labels[valid_main_labels].cpu().numpy())
            if valid_aux_labels.sum() > 0:
                aux_pred = F.softmax(aux_logits[valid_aux_labels], dim=1)
                aux_preds += list(aux_pred.argmax(dim=1).cpu().numpy())
                aux_gts += list(aux_labels[valid_aux_labels].cpu().numpy())
            batch_end = time.perf_counter()
            self.val_batch_time += (batch_end - batch_start)
            self.global_val_batch_idx += 1
        # Compute F1 score
        if len(dataloader) > 0:
            loss_value /= len(dataloader)
            main_prec, main_recall, main_f1, _ = \
                sklearn.metrics.precision_recall_fscore_support(
                    main_gts, main_preds, average='binary')
            aux_prec, aux_recall, aux_f1, _ = \
                sklearn.metrics.precision_recall_fscore_support(
                    aux_gts, aux_preds, average='micro')
        else:
            loss_value = 0
            main_prec = -1
            main_recall = -1
            main_f1 = -1
            aux_prec = -1
            aux_recall = -1
            aux_f1 = -1

        summary_data = [
            ('loss', loss_value),
            ('f1/main_head', main_f1),
            ('prec/main_head', main_prec),
            ('recall/main_head', main_recall),
            ('f1/aux_head', aux_f1),
            ('prec/aux_head', aux_prec),
            ('recall/aux_head', aux_recall),
        ]
        for tag, v in summary_data:
            self.writer.add_scalar('val/epoch/' + tag, v, self.current_epoch)

    def validate(self):
        self._validate(self.val_dataloader)

    def train(self):
        self.model.train()
        logger.info('Starting train epoch')
        load_start = time.perf_counter()
        self.train_epoch_loss = 0
        self.train_epoch_main_loss = 0
        self.train_epoch_aux_loss = 0
        main_gts = []
        aux_gts = []
        main_logits_all = []
        main_preds = []
        aux_preds = []
        for batch_idx, (images, main_labels,
                        aux_labels) in enumerate(self.train_dataloader):
            load_end = time.perf_counter()
            batch_start = time.perf_counter()
            self.train_batch_idx = batch_idx
            logger.debug('Train batch')
            if self.use_cuda:
                images = images.cuda()
                main_labels = main_labels.cuda()
                aux_labels = aux_labels.cuda()

            main_logits, aux_logits = self.model(images)
            # Compute loss
            valid_main_labels = main_labels != -1
            valid_aux_labels = aux_labels != -1

            main_loss_value = self.main_loss(main_logits[valid_main_labels],
                                             main_labels[valid_main_labels])
            aux_loss_value = self.aux_weight * self.auxiliary_loss(
                aux_logits[valid_aux_labels], aux_labels[valid_aux_labels])

            loss_value = torch.zeros_like(main_loss_value)
            if valid_main_labels.sum() > 0:
                loss_value += main_loss_value
            if valid_aux_labels.sum() > 0:
                loss_value += aux_loss_value

            self.train_epoch_loss += loss_value.item()
            if torch.sum(valid_main_labels) > 0:
                self.train_epoch_main_loss += main_loss_value.item()
            if torch.sum(valid_aux_labels) > 0:
                self.train_epoch_aux_loss += aux_loss_value.item()
            # Update gradients
            self.optimizer.zero_grad()
            loss_value.backward()
            self.optimizer.step()

            if valid_main_labels.sum() > 0:
                # main_pred is already masked, so take argmax directly
                main_pred = F.softmax(main_logits[valid_main_labels], dim=1)
                main_logits_all += list(
                    main_logits[valid_main_labels].detach().cpu().numpy())
                main_preds += list(main_pred.argmax(dim=1).cpu().numpy())
                main_gts += list(main_labels[valid_main_labels].cpu().numpy())
            if valid_aux_labels.sum() > 0:
                aux_pred = F.softmax(aux_logits[valid_aux_labels], dim=1)
                aux_preds += list(aux_pred.argmax(dim=1).cpu().numpy())
                aux_gts += list(aux_labels[valid_aux_labels].cpu().numpy())

            batch_end = time.perf_counter()
            total_batch_time = (batch_end - batch_start)
            total_load_time = (load_end - load_start)
            self.train_batch_time += total_batch_time + total_load_time
            logger.debug(f'Train batch time: {self.train_batch_time.value}, '
                         f'this batch time: {total_batch_time}, '
                         f'this load time: {total_load_time}, '
                         f'batch epoch loss: {loss_value.item()}, '
                         f'main loss: {main_loss_value.item()}, '
                         f'aux loss: {aux_loss_value.item()}')
            summary_data = [
                ('loss', loss_value.item()),
                ('loss/main_head', main_loss_value.item()),
                ('loss/aux_head', aux_loss_value.item()),
            ]
            for tag, v in summary_data:
                self.writer.add_scalar('train/batch/' + tag, v,
                                       self.global_train_batch_idx)

            self._notify()
            self.global_train_batch_idx += 1
            load_start = time.perf_counter()

        model_lr = self.optimizer.param_groups[-1]['lr']
        self.optimizer_scheduler.step()
        logger.debug(f'Train epoch loss: {self.train_epoch_loss}, '
                     f'main loss: {self.train_epoch_main_loss}, '
                     f'aux loss: {self.train_epoch_aux_loss}')
        main_prec, main_recall, main_f1, _ = \
            sklearn.metrics.precision_recall_fscore_support(
                main_gts, main_preds, average='binary')
        aux_prec, aux_recall, aux_f1, _ = \
            sklearn.metrics.precision_recall_fscore_support(
                aux_gts, aux_preds, average='micro')
        logger.debug(
            f'Train epoch main: {main_prec}, {main_recall}, {main_f1}, '
            f'aux: {aux_prec}, {aux_recall}, {aux_f1}, '
            f'main loss: {self.train_epoch_main_loss}, '
            f'aux loss: {self.train_epoch_aux_loss}')
        summary_data = [('lr', model_lr), ('loss', self.train_epoch_loss),
                        ('loss/main_head', self.train_epoch_main_loss),
                        ('loss/aux_head', self.train_epoch_aux_loss),
                        ('f1/main_head', main_f1),
                        ('prec/main_head', main_prec),
                        ('recall/main_head', main_recall),
                        ('f1/aux_head', aux_f1), ('prec/aux_head', aux_prec),
                        ('recall/aux_head', aux_recall)]
        for tag, v in summary_data:
            self.writer.add_scalar('train/epoch/' + tag, v, self.current_epoch)

        if len(main_logits_all):
            self.writer.add_histogram(
                'train/epoch/softmax/main_head',
                scipy.special.softmax(main_logits_all, axis=1)[:, 1],
                self.current_epoch)

    def run(self):
        self.last_checkpoint_path = None
        for i in range(self.start_epoch, self.end_epoch):
            logger.info(f'Train: Epoch {i}')
            self.current_epoch = i
            self.train()
            if i % self.val_frequency == 0 or i == self.end_epoch - 1:
                logger.info(f'Validate: Epoch {i}')
                self.validate()
            if i % self.checkpoint_frequency == 0 or i == self.end_epoch - 1:
                logger.info(f'Checkpoint: Epoch {i}')
                self.last_checkpoint_path = os.path.join(
                    self.model_dir, f'checkpoint_{i:03}.pth')
                self.save_checkpoint(i, self.last_checkpoint_path)
        return self.last_checkpoint_path
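Example #3 tracks batch times with an `EMA` helper (`EMA(0)`, `+=`, `.value`) that is not shown. A minimal sketch under those assumptions; the smoothing factor `alpha` is a guess:

class EMA:
    # Hypothetical exponential-moving-average helper matching the usage above:
    # `ema += sample` folds a new measurement into `ema.value`.
    def __init__(self, value, alpha=0.9):
        self.value = value
        self.alpha = alpha
        self._seeded = False

    def __iadd__(self, sample):
        if not self._seeded:
            self.value = sample  # the first sample seeds the average
            self._seeded = True
        else:
            self.value = self.alpha * self.value + (1 - self.alpha) * sample
        return self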