Example #1
def run(self):
    self.build_model()
    for epoch in range(self.nEpochs):
        self.epoch = epoch
        adjust_learning_rate(self.optimizer, epoch, self.config)
        print("\n===> Epoch {} starts:".format(epoch))
        self.train()
        state = {'epoch': epoch, 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict()}
        saveCheckpoint(epoch, state, self.config.comment)
        validLoss = self.test()
        if validLoss < self.baseline:
            print('Current loss %.4f < Baseline %.4f' % (validLoss, self.baseline))
            self.baseline = validLoss
            saveCheckpoint(epoch, state, self.config.comment, True)
        else:
            print('Current loss %.4f >= Baseline %.4f' % (validLoss, self.baseline))
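The adjust_learning_rate and saveCheckpoint helpers are not shown in this example. Below is a minimal sketch of what a saveCheckpoint helper along these lines might look like, assuming torch.save and a checkpoints/<comment> directory layout; the folder and file naming are assumptions, not taken from the source.

import os
import torch

def saveCheckpoint(epoch, state, comment, is_best=False):
    # Hypothetical sketch of the helper used above; the real project may
    # lay out its checkpoints differently.
    folder = os.path.join('checkpoints', comment)  # assumed directory layout
    os.makedirs(folder, exist_ok=True)
    torch.save(state, os.path.join(folder, 'epoch_{}.pth'.format(epoch)))
    if is_best:
        # keep an extra copy of the best model so far
        torch.save(state, os.path.join(folder, 'best.pth'))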
Example #2
def main(args):
    train_loader = get_loader(args)
    n_data = len(train_loader.dataset)
    if args.local_rank == 0:
        print(f"length of training dataset: {n_data}")

    model, model_ema = build_model(args)
    contrast = MemoryMoCo(128, args.nce_k, args.nce_t).cuda()
    criterion = NCESoftmaxLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    model = DistributedDataParallel(model,
                                    device_ids=[args.local_rank],
                                    broadcast_buffers=False)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume)
        load_checkpoint(args, model, model_ema, contrast, optimizer)

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        train_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(epoch, args, optimizer)

        tic = time.time()
        loss, prob = train_moco(epoch, train_loader, model, model_ema,
                                contrast, criterion, optimizer, args)

        if args.local_rank == 0:
            print('epoch {}, total time {:.2f}'.format(epoch,
                                                       time.time() - tic))

            # tensorboard logger
            logger.log_value('ins_loss', loss, epoch)
            logger.log_value('ins_prob', prob, epoch)
            logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                             epoch)

            # save model
            save_checkpoint(args, epoch, model, model_ema, contrast, optimizer)
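The load_checkpoint and save_checkpoint helpers are defined elsewhere in the project. A rough sketch of what such a pair could look like, assuming the checkpoint bundles the model, model_ema, contrast, and optimizer states and that args.model_folder holds the output directory; the field names and path are assumptions.

import os
import torch

def save_checkpoint(args, epoch, model, model_ema, contrast, optimizer):
    # Sketch only: the actual project may store additional fields.
    state = {
        'opt': args,
        'epoch': epoch,
        'model': model.state_dict(),
        'model_ema': model_ema.state_dict(),
        'contrast': contrast.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    # args.model_folder is an assumed output directory
    torch.save(state, os.path.join(args.model_folder, 'ckpt_epoch_{}.pth'.format(epoch)))

def load_checkpoint(args, model, model_ema, contrast, optimizer):
    # Sketch only: mirrors save_checkpoint above and resumes the epoch counter.
    ckpt = torch.load(args.resume, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    model_ema.load_state_dict(ckpt['model_ema'])
    contrast.load_state_dict(ckpt['contrast'])
    optimizer.load_state_dict(ckpt['optimizer'])
    args.start_epoch = ckpt['epoch'] + 1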
def main(args):
    global best_acc1
    best_acc1 = 0

    train_loader, val_loader = get_loader(args)
    if args.local_rank == 0:
        print(f"length of training dataset: {len(train_loader.dataset)}")

    # create model and optimizer
    #model = resnet50(width=args.model_width).cuda()
    model = ResNet50().cuda()
    for p in model.parameters():
        p.requires_grad = False
    classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg',
                                        args.model_width).cuda()
    classifier = DistributedDataParallel(classifier,
                                         device_ids=[args.local_rank],
                                         broadcast_buffers=False)

    ckpt = torch.load(args.model_path, map_location='cpu')
    state_dict = {
        k.replace("module.", ""): v
        for k, v in ckpt['model'].items()
    }
    model.load_state_dict(state_dict)
    if args.local_rank == 0:
        print("==> loaded checkpoint '{}' (epoch {})".format(
            args.model_path, ckpt['epoch']))

    criterion = torch.nn.CrossEntropyLoss().cuda()

    if not args.adam:
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.learning_rate,
                                     betas=(args.beta1, args.beta2),
                                     weight_decay=args.weight_decay,
                                     eps=1e-8)

    model.eval()

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            if args.local_rank == 0:
                print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            if args.local_rank == 0:
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            if 'opt' in checkpoint.keys():
                # resume optimization hyper-parameters
                if args.local_rank == 0:
                    print('=> resume hyper parameters')
                    if 'bn' in vars(checkpoint['opt']):
                        print('using bn: ', checkpoint['opt'].bn)
                    if 'adam' in vars(checkpoint['opt']):
                        print('using adam: ', checkpoint['opt'].adam)
                    if 'cosine' in vars(checkpoint['opt']):
                        print('using cosine: ', checkpoint['opt'].cosine)
                args.learning_rate = checkpoint['opt'].learning_rate
                args.lr_decay_rate = checkpoint['opt'].lr_decay_rate
                args.momentum = checkpoint['opt'].momentum
                args.weight_decay = checkpoint['opt'].weight_decay
                args.beta1 = checkpoint['opt'].beta1
                args.beta2 = checkpoint['opt'].beta2
            del checkpoint
            torch.cuda.empty_cache()
        else:
            if args.local_rank == 0:
                print("=> no checkpoint found at '{}'".format(args.resume))

    # set cosine annealing scheduler
    if args.cosine:
        assert not args.resume, "cosine lr scheduler does not support resume yet."
        eta_min = args.learning_rate * (args.lr_decay_rate**3) * 0.1
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs, eta_min, -1)

    if args.eval:
        if args.local_rank == 0:
            print("==> testing...")
            validate(val_loader, model, classifier.module, criterion, args)
        return

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        torch.distributed.barrier()
        train_loader.sampler.set_epoch(epoch)

        if not args.cosine:
            adjust_learning_rate(epoch, args, optimizer)

        if args.local_rank == 0:
            print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)

        if args.cosine:
            scheduler.step()

        if args.local_rank == 0:
            print('train epoch {}, total time {:.2f}'.format(
                epoch,
                time.time() - time1))
            logger.log_value('train_acc', train_acc, epoch)
            logger.log_value('train_acc5', train_acc5, epoch)
            logger.log_value('train_loss', train_loss, epoch)
            logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                             epoch)

            print("==> testing...")
            test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                      classifier.module,
                                                      criterion, args)

            logger.log_value('test_acc', test_acc, epoch)
            logger.log_value('test_acc5', test_acc5, epoch)
            logger.log_value('test_loss', test_loss, epoch)

            # save model
            if epoch % args.save_freq == 0:
                print('==> Saving...')
                state = {
                    'opt': args,
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': test_acc,
                    'optimizer': optimizer.state_dict(),
                }
                torch.save(
                    state,
                    os.path.join(args.model_folder, f'ckpt_epoch_{epoch}.pth'))
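The train and validate routines called above return top-1/top-5 accuracy but are not shown. They presumably rely on a top-k accuracy helper along the lines of the standard PyTorch ImageNet example; the version below is a sketch and may differ from the project's own implementation.

import torch

def accuracy(output, target, topk=(1,)):
    # Compute top-k accuracy (in percent) for the given logits and labels.
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res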
def start_epoch_hook(epoch, model, optimizer):
    print('| epoch %d, training:' % epoch)
    adjust_learning_rate(args.lr, epoch, optimizer, args.lr_decay,
                         args.lr_decay_stepsize)
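This variant of adjust_learning_rate receives the base learning rate and decay settings explicitly. A minimal step-decay implementation consistent with that signature; the exact decay rule is an assumption.

def adjust_learning_rate(lr, epoch, optimizer, lr_decay, lr_decay_stepsize):
    # Multiply the base lr by lr_decay once every lr_decay_stepsize epochs
    # and write the result into every parameter group.
    new_lr = lr * (lr_decay ** (epoch // lr_decay_stepsize))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr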