shuffle=True) loss_function = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) train_scheduler = optim.lr_scheduler.MultiStepLR( optimizer, milestones=settings.MILESTONES, gamma=0.2) #learning rate decay iter_per_epoch = len(cifar100_training_loader) warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm) if args.resume: recent_folder = most_recent_folder(os.path.join( settings.CHECKPOINT_PATH, args.net), fmt=settings.DATE_FORMAT) if not recent_folder: raise Exception('no recent folder were found') checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder) else: checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW) #use tensorboard if not os.path.exists(settings.LOG_DIR): os.mkdir(settings.LOG_DIR)
def train_variant(conv, fcl, args):
    """Build one VGG variant and run the full CIFAR-100 training loop on it.

    Constructs the network via ``construct_vgg_variant``, sets up SGD with
    step LR decay plus per-iteration warmup, optionally resumes from the most
    recent checkpoint folder, logs to TensorBoard, and saves 'best' /
    'regular' checkpoints as training progresses.

    Args:
        conv: conv-stack variant spec forwarded to ``construct_vgg_variant``.
        fcl:  fully-connected-layer variant spec forwarded likewise.
        args: parsed CLI namespace; reads ``gpu``, ``lr``, ``warm``,
              ``resume`` and has ``net`` overwritten with the generated
              architecture name (side effect visible to the caller).

    Raises:
        Exception: when ``args.resume`` is set but no recent checkpoint
            folder / weights file can be found.
    """
    net, arch_name = construct_vgg_variant(conv_variant=conv, fcl_variant=fcl, batch_norm=True, progress=True, pretrained=False)
    # Side effect: later checkpoint/log paths are keyed on this generated name.
    args.net = arch_name
    if args.gpu:  # use_gpu
        net = net.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    # NOTE(review): newer PyTorch deprecates passing `epoch` to step() (used
    # in the loop below) — confirm against the torch version pinned here.
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES, gamma=0.2)  # learning rate decay
    iter_per_epoch = len(cifar100_training_loader)
    # Warmup runs for `args.warm` epochs' worth of iterations.
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    if args.resume:
        # Resume from the newest timestamped folder for this architecture.
        recent_folder = most_recent_folder(os.path.join(
            settings.CHECKPOINT_PATH, args.net), fmt=settings.DATE_FORMAT)
        if not recent_folder:
            raise Exception('no recent folder were found')
        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder)
    else:
        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)

    # use tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)

    # since tensorboard can't overwrite old values
    # so the only way is to create a new tensorboard log
    writer = SummaryWriter(
        log_dir=os.path.join(settings.LOG_DIR, args.net, settings.TIME_NOW))
    # Dummy CIFAR-sized input (1, 3, 32, 32) used only to trace the graph.
    if args.gpu:
        input_tensor = torch.Tensor(1, 3, 32, 32).cuda()
    else:
        input_tensor = torch.Tensor(1, 3, 32, 32)
    writer.add_graph(net, input_tensor)

    # create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    # Rebinds checkpoint_path from a directory to a filename TEMPLATE;
    # every save below fills in {net}/{epoch}/{type}.
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

    best_acc = 0.0
    if args.resume:
        # Seed best_acc from the best checkpoint so resumed training does not
        # overwrite a better earlier model with a worse "best" save.
        best_weights = best_acc_weights(
            os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
        if best_weights:
            weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, best_weights)
            print('found best acc weights file:{}'.format(weights_path))
            print('load best training file to test acc...')
            net.load_state_dict(torch.load(weights_path))
            # NOTE(review): this call passes only tb=False, while the loop
            # below calls eval_training(epoch=epoch, **train_params) — verify
            # eval_training's signature tolerates both call shapes.
            best_acc = eval_training(tb=False)
            print('best acc is {:0.2f}'.format(best_acc))

        recent_weights_file = most_recent_weights(
            os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
        if not recent_weights_file:
            raise Exception('no recent weights file were found')
        weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, recent_weights_file)
        print('loading weights file {} to resume training.....'.format(
            weights_path))
        net.load_state_dict(torch.load(weights_path))
        # Epochs up to and including resume_epoch are skipped in the loop.
        resume_epoch = last_epoch(
            os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))

    # Shared kwargs for the per-epoch train/eval helpers.
    train_params = {
        'net': net,
        'warmup_scheduler': warmup_scheduler,
        'loss_function': loss_function,
        'optimizer': optimizer,
        'writer': writer
    }
    # NOTE(review): range(1, settings.EPOCH) runs EPOCH-1 epochs; confirm
    # whether EPOCH is meant inclusive (upstream variants use EPOCH + 1).
    for epoch in range(1, settings.EPOCH):
        # for epoch in [1]:# range(1, 2):
        # Warmup (inside train()) owns the LR for the first `warm` epochs;
        # only afterwards does the milestone scheduler step.
        if epoch > args.warm:
            train_scheduler.step(epoch)
        if args.resume:
            if epoch <= resume_epoch:
                continue
        train(epoch=epoch, **train_params)
        acc = eval_training(epoch=epoch, **train_params)

        #start to save best performance model after learning rate decay to 0.01
        if epoch > settings.MILESTONES[1] and best_acc < acc:
            torch.save(
                net.state_dict(),
                checkpoint_path.format(net=args.net, epoch=epoch, type='best'))
            best_acc = acc
            # Skip the 'regular' save for an epoch already saved as 'best'.
            continue
        if not epoch % settings.SAVE_EPOCH:
            torch.save(
                net.state_dict(),
                checkpoint_path.format(net=args.net, epoch=epoch, type='regular'))

    writer.close()