def run(self):
    self.build_model()
    for epoch in range(self.nEpochs):
        self.epoch = epoch
        adjust_learning_rate(self.optimizer, epoch, self.config)
        print("\n===> Epoch {} starts:".format(epoch))
        self.train()
        state = {'epoch': epoch,
                 'state_dict': self.model.state_dict(),
                 'optimizer': self.optimizer.state_dict()}
        saveCheckpoint(epoch, state, self.config.comment)
        validLoss = self.test()
        if validLoss < self.baseline:
            print('Current loss %.4f < Baseline %.4f' % (validLoss, self.baseline))
            self.baseline = validLoss
            # validation loss improved: also save this checkpoint as the best so far
            saveCheckpoint(epoch, state, self.config.comment, True)
        else:
            print('Current loss %.4f >= Baseline %.4f' % (validLoss, self.baseline))
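# --- Hedged sketch (not from the original source) ---
# `run()` relies on `adjust_learning_rate` and `saveCheckpoint` helpers defined elsewhere
# in the repo.  The definitions below are a minimal, hypothetical illustration of the
# signatures used above; the decay policy and the checkpoint layout (and the config fields
# `lr`, `lr_decay`, `step`) are assumptions, not the project's actual implementation.
import os

import torch


def adjust_learning_rate(optimizer, epoch, config):
    """Step decay: scale the base lr by config.lr_decay every config.step epochs (assumed fields)."""
    lr = config.lr * (config.lr_decay ** (epoch // config.step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def saveCheckpoint(epoch, state, comment, is_best=False):
    """Save `state` under a comment-named folder; `model_best.pth` marks the best epoch so far."""
    folder = os.path.join('checkpoints', comment)
    os.makedirs(folder, exist_ok=True)
    torch.save(state, os.path.join(folder, f'ckpt_epoch_{epoch}.pth'))
    if is_best:
        torch.save(state, os.path.join(folder, 'model_best.pth'))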
def main(args):
    train_loader = get_loader(args)
    n_data = len(train_loader.dataset)
    if args.local_rank == 0:
        print(f"length of training dataset: {n_data}")

    model, model_ema = build_model(args)
    contrast = MemoryMoCo(128, args.nce_k, args.nce_t).cuda()
    criterion = NCESoftmaxLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    model = DistributedDataParallel(model, device_ids=[args.local_rank], broadcast_buffers=False)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume)
        load_checkpoint(args, model, model_ema, contrast, optimizer)

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        train_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(epoch, args, optimizer)

        tic = time.time()
        loss, prob = train_moco(epoch, train_loader, model, model_ema, contrast,
                                criterion, optimizer, args)
        if args.local_rank == 0:
            print('epoch {}, total time {:.2f}'.format(epoch, time.time() - tic))

            # tensorboard logger
            logger.log_value('ins_loss', loss, epoch)
            logger.log_value('ins_prob', prob, epoch)
            logger.log_value('learning_rate', optimizer.param_groups[0]['lr'], epoch)

            # save model
            save_checkpoint(args, epoch, model, model_ema, contrast, optimizer)
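# --- Hedged sketch (not from the original source) ---
# `train_moco` is expected to keep `model_ema` as a momentum (EMA) copy of `model`,
# following the MoCo key-encoder update.  The helper below is a minimal illustration of
# that update; its name, the momentum value `alpha`, and the exact call site are
# assumptions rather than the repo's actual code.
import torch


def moment_update(model, model_ema, alpha=0.999):
    """In-place EMA update: model_ema = alpha * model_ema + (1 - alpha) * model."""
    with torch.no_grad():
        for p, p_ema in zip(model.parameters(), model_ema.parameters()):
            p_ema.data.mul_(alpha).add_(p.data, alpha=1.0 - alpha)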
def main(args):
    global best_acc1
    best_acc1 = 0

    train_loader, val_loader = get_loader(args)
    if args.local_rank == 0:
        print(f"length of training dataset: {len(train_loader.dataset)}")

    # create model and optimizer
    # model = resnet50(width=args.model_width).cuda()
    model = ResNet50().cuda()
    for p in model.parameters():
        p.requires_grad = False
    classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', args.model_width).cuda()
    classifier = DistributedDataParallel(classifier, device_ids=[args.local_rank],
                                         broadcast_buffers=False)

    ckpt = torch.load(args.model_path, map_location='cpu')
    state_dict = {k.replace("module.", ""): v for k, v in ckpt['model'].items()}
    model.load_state_dict(state_dict)
    if args.local_rank == 0:
        print("==> loaded checkpoint '{}' (epoch {})".format(
            args.model_path, ckpt['epoch']))

    criterion = torch.nn.CrossEntropyLoss().cuda()

    if not args.adam:
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.learning_rate,
                                     betas=(args.beta1, args.beta2),
                                     weight_decay=args.weight_decay,
                                     eps=1e-8)

    model.eval()

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            if args.local_rank == 0:
                print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            if args.local_rank == 0:
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            if 'opt' in checkpoint.keys():
                # resume optimization hyper-parameters
                if args.local_rank == 0:
                    print('=> resume hyper parameters')
                    if 'bn' in vars(checkpoint['opt']):
                        print('using bn: ', checkpoint['opt'].bn)
                    if 'adam' in vars(checkpoint['opt']):
                        print('using adam: ', checkpoint['opt'].adam)
                    if 'cosine' in vars(checkpoint['opt']):
                        print('using cosine: ', checkpoint['opt'].cosine)
                args.learning_rate = checkpoint['opt'].learning_rate
                args.lr_decay_rate = checkpoint['opt'].lr_decay_rate
                args.momentum = checkpoint['opt'].momentum
                args.weight_decay = checkpoint['opt'].weight_decay
                args.beta1 = checkpoint['opt'].beta1
                args.beta2 = checkpoint['opt'].beta2
            del checkpoint
            torch.cuda.empty_cache()
        else:
            if args.local_rank == 0:
                print("=> no checkpoint found at '{}'".format(args.resume))

    # set cosine annealing scheduler
    if args.cosine:
        assert not args.resume, "cosine lr scheduler not support resume now."
        eta_min = args.learning_rate * (args.lr_decay_rate ** 3) * 0.1
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs, eta_min, -1)

    if args.eval:
        if args.local_rank == 0:
            print("==> testing...")
        validate(val_loader, model, classifier.module, criterion, args)
        return

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        torch.distributed.barrier()

        train_loader.sampler.set_epoch(epoch)
        if not args.cosine:
            adjust_learning_rate(epoch, args, optimizer)
        if args.local_rank == 0:
            print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model, classifier,
                                                  criterion, optimizer, args)
        if args.cosine:
            scheduler.step()

        if args.local_rank == 0:
            print('train epoch {}, total time {:.2f}'.format(epoch, time.time() - time1))

            logger.log_value('train_acc', train_acc, epoch)
            logger.log_value('train_acc5', train_acc5, epoch)
            logger.log_value('train_loss', train_loss, epoch)
            logger.log_value('learning_rate', optimizer.param_groups[0]['lr'], epoch)

            print("==> testing...")
            test_acc, test_acc5, test_loss = validate(val_loader, model, classifier.module,
                                                      criterion, args)

            logger.log_value('test_acc', test_acc, epoch)
            logger.log_value('test_acc5', test_acc5, epoch)
            logger.log_value('test_loss', test_loss, epoch)

            # save model
            if epoch % args.save_freq == 0:
                print('==> Saving...')
                state = {
                    'opt': args,
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': test_acc,
                    'optimizer': optimizer.state_dict(),
                }
                torch.save(
                    state,
                    os.path.join(args.model_folder, f'ckpt_epoch_{epoch}.pth'))
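# --- Hedged sketch (not from the original source) ---
# When `args.cosine` is off, the loop above falls back to `adjust_learning_rate(epoch, args,
# optimizer)`.  A common step-decay version, consistent with the `learning_rate` and
# `lr_decay_rate` fields used elsewhere, is sketched below; `args.lr_decay_epochs` (a list of
# milestone epochs) is an assumed field, not one shown in this code.
def adjust_learning_rate(epoch, args, optimizer):
    """Decay the lr by args.lr_decay_rate at every milestone epoch passed so far."""
    steps = sum(1 for milestone in args.lr_decay_epochs if epoch > milestone)
    if steps > 0:
        lr = args.learning_rate * (args.lr_decay_rate ** steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr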
def start_epoch_hook(epoch, model, optimizer):
    print('| epoch %d, training:' % epoch)
    adjust_learning_rate(args.lr, epoch, optimizer, args.lr_decay, args.lr_decay_stepsize)
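# --- Hedged sketch (not from the original source) ---
# The hook above expects an `adjust_learning_rate` with a (base_lr, epoch, optimizer,
# decay_factor, stepsize) signature.  A plausible step-decay version is sketched below;
# the exact policy in the original repo may differ.
def adjust_learning_rate(base_lr, epoch, optimizer, lr_decay, lr_decay_stepsize):
    """Multiply base_lr by lr_decay once every lr_decay_stepsize epochs."""
    lr = base_lr * (lr_decay ** (epoch // lr_decay_stepsize))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr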