Example 1
def train_fold(fold, lr):
    train_indices, eval_indices = indices_for_fold(fold, len(train_data))

    train_dataset = TrainEvalDataset(train_data.iloc[train_indices],
                                     transform=train_transform)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        drop_last=True,
        shuffle=True,
        num_workers=args.workers)
    if config.mixup is not None:
        train_data_loader = MixupDataLoader(train_data_loader, config.mixup)

    eval_dataset = TrainEvalDataset(train_data.iloc[eval_indices],
                                    transform=eval_transform)
    eval_data_loader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=config.batch_size, num_workers=args.workers)

    model = Model(config.model, NUM_CLASSES)
    model = model.to(DEVICE)
    optimizer = build_optimizer(config.opt.type,
                                model.parameters(),
                                lr,
                                config.opt.beta,
                                weight_decay=config.opt.weight_decay)

    if config.sched.type == 'onecycle':
        scheduler = lr_scheduler_wrapper.StepWrapper(
            OneCycleScheduler(optimizer,
                              lr=(lr / 20, lr),
                              beta_range=config.sched.onecycle.beta,
                              max_steps=len(train_data_loader) * config.epochs,
                              annealing=config.sched.onecycle.anneal))
    elif config.sched.type == 'cyclic':
        # TODO: add cyclic min/max momentum to config

        scheduler = lr_scheduler_wrapper.StepWrapper(
            CyclicLR(optimizer,
                     0.,
                     lr,
                     step_size_up=len(train_data_loader),
                     step_size_down=len(train_data_loader),
                     mode='triangular2',
                     cycle_momentum=True,
                     base_momentum=config.sched.cyclic.beta[1],
                     max_momentum=config.sched.cyclic.beta[0]))
    elif config.sched.type == 'cawr':
        scheduler = lr_scheduler_wrapper.StepWrapper(
            torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                optimizer, T_0=len(train_data_loader), T_mult=2))
    elif config.sched.type == 'plateau':
        scheduler = lr_scheduler_wrapper.ScoreWrapper(
            torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='max',
                                                       factor=0.5,
                                                       patience=0,
                                                       verbose=True))
    else:
        raise AssertionError('invalid sched {}'.format(config.sched.type))

    best_score = 0
    for epoch in range(config.epochs):
        train_epoch(model=model,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    data_loader=train_data_loader,
                    fold=fold,
                    epoch=epoch)
        gc.collect()
        score = eval_epoch(model=model,
                           data_loader=eval_data_loader,
                           fold=fold,
                           epoch=epoch)
        gc.collect()

        scheduler.step_epoch()
        scheduler.step_score(score)

        if score > best_score:
            best_score = score
            torch.save(
                model.state_dict(),
                os.path.join(args.experiment_path,
                             'model_{}.pth'.format(fold)))
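
OneCycleScheduler and the lr_scheduler_wrapper.StepWrapper / ScoreWrapper classes above are project-specific. For the 'cyclic' branch, a minimal sketch using only the built-in torch.optim.lr_scheduler.CyclicLR would look like the following; the model, optimizer and all numeric values are illustrative placeholders, not taken from the example.

import torch

model = torch.nn.Linear(10, 2)                       # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

steps_per_epoch = 100                                # assumed len(train_data_loader)
scheduler = torch.optim.lr_scheduler.CyclicLR(
    optimizer,
    base_lr=0.,
    max_lr=1e-2,
    step_size_up=steps_per_epoch,
    step_size_down=steps_per_epoch,
    mode='triangular2',
    cycle_momentum=True,
    base_momentum=0.85,
    max_momentum=0.95)

for epoch in range(5):
    for _ in range(steps_per_epoch):
        # forward / backward would happen here in real training code
        optimizer.step()
        scheduler.step()                             # stepped once per batch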
Example 2
# If weights are tied between encoder and decoder, we can only optimize 
# parameters in one of those two layers
if not lstm_tie_weights:
    param_list.extend([
        {'params': lm.encoder.parameters(), 'lr': 1e-3},
        {'params': lm.decoder.parameters(), 'lr': 1e-3},
    ])
else:
    param_list.extend([
        {'params': lm.decoder.parameters(), 'lr': 1e-3},
    ])
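# Why only the decoder group is registered when weights are tied: a typical
# tying setup (an assumption about how lm is constructed, it is not shown in
# this snippet) shares a single tensor between the embedding and the output
# projection, e.g.
#
#     lm.decoder.weight = lm.encoder.weight
#
# so adding both modules' parameters to the optimizer would update the same
# tensor twice per step.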

optimizer = torch.optim.Adam(param_list, lr=0.01)

scheduler = CyclicLR(optimizer,  max_lrs=[0.1, 0.1, 0.1, 0.1, 0.1], 
                     mode='ulmfit', ratio=1.5, cut_frac=0.4, 
                     n_epochs=num_epochs, batchsize=50000/1171, 
                     verbose=False, epoch_length=50000)

history = training_loop(batch_size=batch_size, 
                        num_epochs=num_epochs,
                        display_freq=1, 
                        model=lm, 
                        criterion=loss,
                        optim=optimizer,
                        scheduler=scheduler,
                        device=device,
                        training_set=train_dl,
                        validation_set=valid_dl,
                        best_model_path=model_file_lm,
                        history=None)
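
Note that the CyclicLR used here (max_lrs, mode='ulmfit', ratio, cut_frac, epoch_length) is a third-party class, not torch.optim.lr_scheduler.CyclicLR. As a rough sketch, the slanted triangular schedule that mode='ulmfit' refers to (from the ULMFiT paper) can be reproduced with the built-in LambdaLR; total_steps, cut_frac and ratio below follow the paper's definitions, not that library's API, and the model is a placeholder.

import torch

def slanted_triangular(total_steps, cut_frac=0.4, ratio=1.5):
    # Returns a multiplier on the optimizer's base lr; the base lr acts as the peak.
    cut = max(int(total_steps * cut_frac), 1)
    def lr_lambda(step):
        if step < cut:
            p = step / cut
        else:
            p = max(1 - (step - cut) / (cut * (1 / cut_frac - 1)), 0.)
        return (1 + p * (ratio - 1)) / ratio
    return lr_lambda

model = torch.nn.Linear(10, 2)                        # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=slanted_triangular(total_steps=1000))

for _ in range(1000):
    optimizer.step()
    scheduler.step()                                  # stepped once per batch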
Example 3
    def processing(self):
        log_file = os.path.join(self.data_folder, 'train.log')

        logger = Logger('train', log_file)

        iters = len(self.train_loader)
        step_size = iters * 2
        self.scheduler = CyclicLR(optimizer=self.optimizer,
                                  step_size=step_size,
                                  base_lr=self.lr)

        if self.evaluate:
            self.validate(logger)
            return

        iter_per_epoch = len(self.train_loader)
        logger.info('Iterations per epoch: {0}'.format(iter_per_epoch))
        print('Iterations per epoch: {0}'.format(iter_per_epoch))

        start_time = time.time()

        for epoch in range(self.start_epoch, self.epochs):
            # self.adjust_learning_rate(epoch)

            # train for one epoch
            train_losses, train_acc = self.train(logger, epoch)

            # evaluate on validation set
            with torch.no_grad():
                val_losses, val_acc = self.validate(logger)

            # self.scheduler.step(val_losses.avg)
            # log visualize
            info_acc = {'train_acc': train_acc.avg, 'val_acc': val_acc.avg}
            info_loss = {
                'train_loss': train_losses.avg,
                'val_loss': val_losses.avg
            }
            self.visualizer.write_summary(info_acc, info_loss, epoch + 1)

            self.visualizer.write_histogram(model=self.model, step=epoch + 1)

            # remember best Accuracy and save checkpoint
            is_best = val_acc.avg > self.best_prec1
            self.best_prec1 = max(val_acc.avg, self.best_prec1)
            self.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'best_prec1': self.best_prec1,
                    'optimizer': self.optimizer.state_dict(),
                }, is_best)

            self.check_early_stop(val_acc.avg, logger, start_time)

        end_time = time.time()
        print("--- Total training time %s seconds ---" %
              (end_time - start_time))
        logger.info("--- Total training time %s seconds ---" %
                    (end_time - start_time))
        self.visualizer.writer_close()
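
In cyclical learning rate implementations, step_size typically counts the batches in half a cycle, so step_size = iters * 2 here means the learning rate climbs for two epochs and descends for the next two. A quick check of that arithmetic (the iteration count is an assumed placeholder):

iters_per_epoch = 250                     # assumed len(self.train_loader)
step_size = iters_per_epoch * 2           # half-cycle, as constructed above
full_cycle = step_size * 2                # up phase plus down phase
print(full_cycle / iters_per_epoch)       # -> 4.0 epochs per full cycle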
Example 4
class Training(object):
    def __init__(self, name_list, num_classes=400, modality='RGB', **kwargs):
        self.__dict__.update(kwargs)
        self.num_classes = num_classes
        self.modality = modality
        self.name_list = name_list
        # init early stop counter = 0
        self.count_early_stop = 0
        # Set best precision = 0
        self.best_prec1 = 0
        # init start epoch = 0
        self.start_epoch = 0

        if self.log_visualize != '':
            self.visualizer = Visualizer(logdir=self.log_visualize)

        self.checkDataFolder()

        self.loading_model()

        self.train_loader, self.val_loader = self.loading_data()

        # run
        self.processing()
        if self.random:
            print('random pick images')

    def check_early_stop(self, accuracy, logger, start_time):
        if self.best_prec1 <= accuracy:
            self.count_early_stop = 0
        else:
            self.count_early_stop += 1

        if self.count_early_stop > self.early_stop:
            print('Early stop')
            end_time = time.time()
            print("--- Total training time %s seconds ---" %
                  (end_time - start_time))
            logger.info("--- Total training time %s seconds ---" %
                        (end_time - start_time))
            exit()

    def checkDataFolder(self):
        self.data_folder = './' + self.model_type + '_' + self.data_set
        if not os.path.isdir(self.data_folder):
            os.mkdir(self.data_folder)

    # Loading P3D model
    def loading_model(self):

        print('Loading %s model' % (self.model_type))

        if self.model_type == 'C3D':
            self.model = C3D()
            if self.pretrained:
                self.model.load_state_dict(torch.load('c3d.pickle'))
        elif self.model_type == 'I3D':
            if self.pretrained:
                self.model = I3D(num_classes=400, modality='rgb')
                self.model.load_state_dict(
                    torch.load('kinetics_i3d_model_rgb.pth'))
            else:
                self.model = I3D(num_classes=self.num_classes, modality='rgb')
        else:
            if self.pretrained:
                print("=> using pre-trained model")
                self.model = P3D199(pretrained=True,
                                    num_classes=400,
                                    dropout=self.dropout)

            else:
                print("=> creating model P3D")
                self.model = P3D199(pretrained=False,
                                    num_classes=400,
                                    dropout=self.dropout)
        # Transfer classes
        self.model = transfer_model(model=self.model,
                                    model_type=self.model_type,
                                    num_classes=self.num_classes)

        # Check gpu and run parallel
        if check_gpu() > 0:
            self.model = torch.nn.DataParallel(self.model).cuda()

        # define loss function (criterion) and optimizer
        self.criterion = nn.CrossEntropyLoss()

        if check_gpu() > 0:
            self.criterion = nn.CrossEntropyLoss().cuda()

        params = self.model.parameters()
        if self.model_type == 'P3D':
            params = get_optim_policies(model=self.model,
                                        modality=self.modality,
                                        enable_pbn=True)

        self.optimizer = optim.SGD(params=params,
                                   lr=self.lr,
                                   momentum=self.momentum,
                                   weight_decay=self.weight_decay)

        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=self.optimizer, mode='min', patience=10, verbose=True)

        # optionally resume from a checkpoint
        if self.resume:
            if os.path.isfile(self.resume):
                print("=> loading checkpoint '{}'".format(self.resume))
                checkpoint = torch.load(self.resume)
                self.start_epoch = checkpoint['epoch']
                self.best_prec1 = checkpoint['best_prec1']
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    self.evaluate, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(self.resume))

        if self.evaluate:
            file_model_best = os.path.join(self.data_folder,
                                           'model_best.pth.tar')
            if os.path.isfile(file_model_best):
                print(
                    "=> loading checkpoint '{}'".format('model_best.pth.tar'))
                checkpoint = torch.load(file_model_best)
                self.start_epoch = checkpoint['epoch']
                self.best_prec1 = checkpoint['best_prec1']
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    self.evaluate, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(self.resume))

        cudnn.benchmark = True

    # Loading data
    def loading_data(self):
        random = bool(self.random)
        size = 160
        if self.model_type == 'C3D':
            size = 112
        if self.model_type == 'I3D':
            size = 224

        normalize = Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        train_transformations = Compose([
            RandomSizedCrop(size),
            RandomHorizontalFlip(),
            # Resize((size, size)),
            # ColorJitter(
            #     brightness=0.4,
            #     contrast=0.4,
            #     saturation=0.4,
            # ),
            ToTensor(),
            normalize
        ])

        val_transformations = Compose([
            # Resize((182, 242)),
            Resize(256),
            CenterCrop(size),
            ToTensor(),
            normalize
        ])

        train_dataset = MyDataset(self.data,
                                  data_folder="train",
                                  name_list=self.name_list,
                                  version="1",
                                  transform=train_transformations,
                                  num_frames=self.num_frames,
                                  random=random)

        val_dataset = MyDataset(self.data,
                                data_folder="validation",
                                name_list=self.name_list,
                                version="1",
                                transform=val_transformations,
                                num_frames=self.num_frames,
                                random=random)

        train_loader = data.DataLoader(train_dataset,
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       num_workers=self.workers,
                                       pin_memory=True)

        val_loader = data.DataLoader(val_dataset,
                                     batch_size=self.batch_size,
                                     shuffle=False,
                                     num_workers=self.workers,
                                     pin_memory=False)

        return (train_loader, val_loader)

    def processing(self):
        log_file = os.path.join(self.data_folder, 'train.log')

        logger = Logger('train', log_file)

        iters = len(self.train_loader)
        step_size = iters * 2
        self.scheduler = CyclicLR(optimizer=self.optimizer,
                                  step_size=step_size,
                                  base_lr=self.lr)

        if self.evaluate:
            self.validate(logger)
            return

        iter_per_epoch = len(self.train_loader)
        logger.info('Iterations per epoch: {0}'.format(iter_per_epoch))
        print('Iterations per epoch: {0}'.format(iter_per_epoch))

        start_time = time.time()

        for epoch in range(self.start_epoch, self.epochs):
            # self.adjust_learning_rate(epoch)

            # train for one epoch
            train_losses, train_acc = self.train(logger, epoch)

            # evaluate on validation set
            with torch.no_grad():
                val_losses, val_acc = self.validate(logger)

            # self.scheduler.step(val_losses.avg)
            # log visualize
            info_acc = {'train_acc': train_acc.avg, 'val_acc': val_acc.avg}
            info_loss = {
                'train_loss': train_losses.avg,
                'val_loss': val_losses.avg
            }
            self.visualizer.write_summary(info_acc, info_loss, epoch + 1)

            self.visualizer.write_histogram(model=self.model, step=epoch + 1)

            # remember best Accuracy and save checkpoint
            is_best = val_acc.avg > self.best_prec1
            self.best_prec1 = max(val_acc.avg, self.best_prec1)
            self.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'best_prec1': self.best_prec1,
                    'optimizer': self.optimizer.state_dict(),
                }, is_best)

            self.check_early_stop(val_acc.avg, logger, start_time)

        end_time = time.time()
        print("--- Total training time %s seconds ---" %
              (end_time - start_time))
        logger.info("--- Total training time %s seconds ---" %
                    (end_time - start_time))
        self.visualizer.writer_close()

    # Training
    def train(self, logger, epoch):
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        acc = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        rate = get_learning_rate(self.optimizer)[0]
        # switch to train mode
        self.model.train()

        end = time.time()
        for i, (images, target) in enumerate(self.train_loader):
            # adjust learning rate scheduler step
            self.scheduler.batch_step()

            # measure data loading time
            data_time.update(time.time() - end)
            if check_gpu() > 0:
                images = images.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)
            image_var = torch.autograd.Variable(images)
            label_var = torch.autograd.Variable(target)

            self.optimizer.zero_grad()

            # compute y_pred
            y_pred = self.model(image_var)
            if self.model_type == 'I3D':
                y_pred = y_pred[0]

            loss = self.criterion(y_pred, label_var)
            # measure accuracy and record loss
            prec1, prec5 = accuracy(y_pred.data, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            acc.update(prec1.item(), images.size(0))
            top1.update(prec1.item(), images.size(0))
            top5.update(prec5.item(), images.size(0))
            # compute gradient and do SGD step

            loss.backward()
            self.optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.print_freq == 0:
                print('Epoch: [{0}/{1}][{2}/{3}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Lr {rate:.5f}\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch,
                          self.epochs,
                          i,
                          len(self.train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          rate=rate,
                          loss=losses,
                          top1=top1,
                          top5=top5))

        logger.info('Epoch: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Lr {rate:.5f}\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        epoch,
                        self.epochs,
                        batch_time=batch_time,
                        data_time=data_time,
                        rate=rate,
                        loss=losses,
                        top1=top1,
                        top5=top5))
        return losses, acc

    # Validation
    def validate(self, logger):
        batch_time = AverageMeter()
        losses = AverageMeter()
        acc = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        # switch to evaluate mode
        self.model.eval()

        end = time.time()
        for i, (images, labels) in enumerate(self.val_loader):
            if check_gpu() > 0:
                images = images.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            image_var = torch.autograd.Variable(images)
            label_var = torch.autograd.Variable(labels)

            # compute y_pred
            y_pred = self.model(image_var)
            if self.model_type == 'I3D':
                y_pred = y_pred[0]

            loss = self.criterion(y_pred, label_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(y_pred.data, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            acc.update(prec1.item(), images.size(0))
            top1.update(prec1.item(), images.size(0))
            top5.update(prec5.item(), images.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.print_freq == 0:
                print('TrainVal: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          i,
                          len(self.val_loader),
                          batch_time=batch_time,
                          loss=losses,
                          top1=top1,
                          top5=top5))

        print(' * Accuracy {acc.avg:.3f}  Loss {loss.avg:.3f}'.format(
            acc=acc, loss=losses))
        logger.info(' * Accuracy {acc.avg:.3f}  Loss {loss.avg:.3f}'.format(
            acc=acc, loss=losses))

        return losses, acc

    # save checkpoint to file
    def save_checkpoint(self, state, is_best):
        checkpoint = os.path.join(self.data_folder, 'checkpoint.pth.tar')
        torch.save(state, checkpoint)
        model_best = os.path.join(self.data_folder, 'model_best.pth.tar')
        if is_best:
            shutil.copyfile(checkpoint, model_best)

    # adjust learning rate for each epoch
    def adjust_learning_rate(self, epoch):
        """Sets the learning rate to the initial LR decayed by 10 every 3K iterations"""
        iters = len(self.train_loader)
        num_epochs = 3000 // iters
        decay = 0.1**(epoch // num_epochs)
        lr = self.lr * decay
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr * param_group['lr_mult']
            param_group['weight_decay'] = (self.weight_decay *
                                           param_group['decay_mult'])
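
adjust_learning_rate is left commented out in processing(), but it implements a plain step decay: divide the learning rate by 10 roughly every 3000 iterations, scaled per parameter group via lr_mult / decay_mult. Ignoring the per-group multipliers, a sketch of the same global schedule with the built-in StepLR (stepped once per epoch) could look like this; model, optimizer and iteration counts are placeholders.

import torch

model = torch.nn.Linear(10, 2)                        # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

iters_per_epoch = 250                                 # assumed len(train_loader)
epochs_per_decay = max(3000 // iters_per_epoch, 1)    # "every 3K iterations" in epochs
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=epochs_per_decay, gamma=0.1)

for epoch in range(30):
    # ... train one epoch ...
    optimizer.step()
    scheduler.step()                                  # stepped once per epoch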
Example 5
    def fit_language_model(self,
                           pretrained_itos=None,
                           pretrained_weight_file=None,
                           lm_hidden_dim=1150,
                           lm_embedding_dim=400,
                           lm_lstm_layers=3,
                           num_epochs=100,
                           display_epoch_freq=1,
                           scheduler='ulmfit',
                           max_lrs=[1e-3, 1e-3, 1e-3, 1e-3, 1e-3]):

        ## Model Architecture
        self.hidden_dim = lm_hidden_dim
        self.embedding_dim = lm_embedding_dim
        self.dropout = 0.3
        self.lstm_layers = lm_lstm_layers
        self.lstm_bidirection = False
        self.lstm_tie_weights = True
        self.num_epochs = num_epochs
        self.display_epoch_freq = display_epoch_freq

        if self.use_gpu: torch.cuda.manual_seed(303)
        else: torch.manual_seed(303)

        # set up Files to save stuff in
        runtime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        if pretrained_weight_file is not None and pretrained_itos is not None:
            print("Starting Loading pretrained Wikitext model")
            enc = torch.load(pretrained_weight_file,
                             map_location=lambda storage, loc: storage)
            self.embedding_dim = enc['0.encoder.weight'].shape[1]
            self.hidden_dim = int(
                enc['0.rnns.0.module.weight_hh_l0_raw'].shape[0] / 4)
            self.lstm_layers = 3

            new_enc = {}
            for k, v in enc.items():
                layer_detail = k.split('.')
                layer_name = layer_detail[-1].replace('_raw', '')
                if len(layer_detail) == self.lstm_layers:
                    new_enc[f'{layer_detail[1]}.{layer_name}'] = v
                else:
                    new_enc[
                        f'{layer_detail[1]}.{layer_detail[2]}.{layer_name}'] = v

            del new_enc['encoder_with_dropout.embed.weight']

            pretrained_idx2word = pickle.load(open(pretrained_itos, 'rb'))
            pretrained_word2idx =\
                {k: i for i,k in enumerate(pretrained_idx2word)}

            new_model_vectorizer = self.vectorizer
            pretrained_encoder_weights = enc['0.encoder.weight']

            row_m = pretrained_encoder_weights.mean(dim=0)
            row_m = [x.item() for x in row_m]

            new_vocab_size = len(new_model_vectorizer.word2idx)
            new_encoder_weights = torch.tensor(
                [row_m for i in range(new_vocab_size)])

            new_idx2weights = {}
            for word, i in new_model_vectorizer.word2idx.items():
                if word in pretrained_word2idx:
                    word_idx = pretrained_word2idx[word]
                    new_encoder_weights[i] =\
                        pretrained_encoder_weights[word_idx]

            new_enc['encoder.weight'] = new_encoder_weights
            new_enc['decoder.weight'] = copy.copy(new_encoder_weights)
            new_enc['decoder.bias'] =\
                torch.zeros(new_enc['decoder.weight'].shape[0])

            self.lm = RNNLM(device=self.device,
                            vocab_size=new_vocab_size,
                            embedding_size=self.embedding_dim,
                            hidden_size=self.hidden_dim,
                            batch_size=50,
                            num_layers=3,
                            tie_weights=True,
                            word2idx=new_model_vectorizer.word2idx)
            print("Initialised loading with pretrained Wikitext model")

        else:
            # Build and initialize the model
            print("No Pretrained Model specified")
            self.lm = RNNLM(self.device,
                            self.vectorizer.vocabulary_size,
                            self.embedding_dim,
                            self.hidden_dim,
                            self.batch_size,
                            dropout=self.dropout,
                            tie_weights=self.lstm_tie_weights,
                            num_layers=self.lstm_layers,
                            bidirectional=self.lstm_bidirection,
                            word2idx=self.vectorizer.word2idx,
                            log_softmax=False)

        if self.use_gpu:
            self.lm = self.lm.to(self.device)

        # Loss and Optimizer
        self.loss = nn.CrossEntropyLoss()

        # Extract pointers to the parameters of the lstms

        param_list = [{
            'params': rnn.parameters(),
            'lr': 1e-3
        } for rnn in self.lm.rnns]

        # If weights are tied between encoder and decoder, we can only optimize
        # parameters in one of those two layers
        if not self.lstm_tie_weights:
            param_list.extend([
                {
                    'params': self.lm.encoder.parameters(),
                    'lr': 1e-3
                },
                {
                    'params': self.lm.decoder.parameters(),
                    'lr': 1e-3
                },
            ])
        else:
            param_list.extend([
                {
                    'params': self.lm.decoder.parameters(),
                    'lr': 1e-3
                },
            ])

        self.optimizer = torch.optim.Adam(param_list)
        if scheduler == 'ulmfit':
            self.scheduler = CyclicLR(self.optimizer,
                                      max_lrs=max_lrs,
                                      mode='ulmfit',
                                      ratio=1.5,
                                      cut_frac=0.4,
                                      train_data_loader=self.train_dl,
                                      verbose=False)
        else:
            # No recognised scheduler name: train with a constant learning rate.
            self.scheduler = None
        print("Beginning LM Fine Tuning")
        self.freezeTo(3)
        history = training_loop(batch_size=self.batch_size,
                                num_epochs=1,
                                display_freq=self.display_epoch_freq,
                                model=self.lm,
                                criterion=self.loss,
                                optim=self.optimizer,
                                scheduler=self.scheduler,
                                device=self.device,
                                training_set=self.train_dl,
                                validation_set=self.valid_dl,
                                best_model_path=self.model_file_lm,
                                history=None)

        self.freezeTo(2)
        history = training_loop(batch_size=self.batch_size,
                                num_epochs=1,
                                display_freq=self.display_epoch_freq,
                                model=self.lm,
                                criterion=self.loss,
                                optim=self.optimizer,
                                scheduler=self.scheduler,
                                device=self.device,
                                training_set=self.train_dl,
                                validation_set=self.valid_dl,
                                best_model_path=self.model_file_lm,
                                history=history)

        self.freezeTo(0)
        history = training_loop(batch_size=self.batch_size,
                                num_epochs=self.num_epochs - 2,
                                display_freq=self.display_epoch_freq,
                                model=self.lm,
                                criterion=self.loss,
                                optim=self.optimizer,
                                scheduler=self.scheduler,
                                device=self.device,
                                training_set=self.train_dl,
                                validation_set=self.valid_dl,
                                best_model_path=self.model_file_lm,
                                history=history)
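
freezeTo is project-specific, but the sequence above (3, then 2, then 0, with one epoch at each of the first two stages) follows ULMFiT-style gradual unfreezing: train only the topmost layer group first, then progressively unfreeze more of the network. A minimal sketch of that idea, assuming freezeTo(n) follows the fastai-style convention of freezing all layer groups before index n; the layer groups below are hypothetical.

import torch.nn as nn

def freeze_to(layer_groups, n):
    # Freeze every group before index n; make the rest trainable (n=0 unfreezes all).
    for i, group in enumerate(layer_groups):
        for p in group.parameters():
            p.requires_grad_(i >= n)

# Hypothetical layer groups, ordered from input to output.
groups = [nn.Embedding(100, 16), nn.LSTM(16, 32), nn.LSTM(32, 32), nn.Linear(32, 100)]

freeze_to(groups, 3)    # stage 1: only the decoder head trains
freeze_to(groups, 2)    # stage 2: the last LSTM joins
freeze_to(groups, 0)    # stage 3: the whole model is fine-tuned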