def main():
    args = parse_args(**Cfg)
    log_init(logging, args)
    assert torch.cuda.is_available(), 'no cuda'
    # fix bug for `ERROR: all tensors must be on devices[0]`
    torch.cuda.set_device(args.devices_id[0])

    # step 1: load model, criterion, optimizer
    model, criterion, optimizer = load_model(args, logging)
    teacher_model = load_teacher_model(args, logging)

    # step 2: load data
    train_loader, val_loader = load_data(args)

    # step 3: run
    cudnn.benchmark = True
    if args.mode == 'train':
        for epoch in range(args.start_epoch, args.epochs + 1):
            # adjust learning rate
            lr = adjust_learning_rate(args, optimizer, epoch, args.milestones)
            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, lr, args,
                  logging, teacher_model=teacher_model)
            if epoch % args.save_epochs == 0 or epoch == args.epochs:
                filename = f'{args.snapshot}_checkpoint_epoch_{epoch}.pth.tar'
                save_checkpoint(model.state_dict(), logging, filename)
            if val_loader is not None:
                validate(val_loader, model, criterion, epoch, logging, args)
    elif args.mode == 'test' and val_loader is not None and Path(
            args.resume).is_file():
        logging.info('Testing model acc......')
        validate(val_loader, model, criterion, args.start_epoch, logging,
                 args, show_img_nums=25)
        # compute the total number of model parameters
        params = sum(param.nelement() for param in model.parameters())
        logging.info("Model Number of Parameters: %.2fM" % (params / 1e6))
    elif args.mode == 'prune' and args.prune_percent > 0 and Path(
            args.resume).is_file():
        logging.info('Pruning model......')
        prune(model, logging, args)
    else:
        raise Exception("Please check your config file!")
def main():
    global opt
    # train data loader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               num_workers=int(opt.workers))

    # create model (string comparison must use ==, not `is`)
    if opt.model == 'VAMetric':
        model = models.VAMetric()
    elif opt.model == 'VAMetric2':
        model = models.VAMetric2()
    elif opt.model == 'VAMetric3':
        model = models.VAMetric3()
    else:
        model = models.VA_Linear()
        opt.model = 'VA_Linear'

    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # Contrastive Loss
    criterion = models.ContrastiveLoss()
    # criterion = nn.BCELoss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    # optimizer
    # optimizer = optim.SGD(model.parameters(), opt.lr,
    #                       momentum=opt.momentum,
    #                       weight_decay=opt.weight_decay)
    optimizer = optim.Adam(model.parameters())

    # adjust learning rate every lr_decay_epoch
    # lambda_lr = lambda epoch: opt.lr_decay ** ((epoch + 1) // opt.lr_decay_epoch)  # poly policy

    for epoch in range(opt.max_epochs):
        #################################
        # train for one epoch
        #################################
        train(train_loader, model, criterion, optimizer, epoch, opt)
        # LR_Policy(optimizer, opt.lr, lambda_lr(epoch))  # adjust learning rate through poly policy

        ##################################
        # save checkpoints
        ##################################
        # save model every `epoch_save` epochs
        if ((epoch + 1) % opt.epoch_save) == 0:
            path_checkpoint = '{0}/{1}_state_epoch{2}.pth'.format(
                opt.checkpoint_folder, opt.model, epoch + 1)
            utils.save_checkpoint(model, path_checkpoint)
def main():
    global opt
    best_prec1 = 0    # only used when we resume training from some checkpoint model
    resume_epoch = 0

    # train data loader
    # for the loader, drop_last defaults to False
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               num_workers=int(opt.workers))

    # create model
    model = models.VAMetric()

    if not opt.train and opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # Contrastive Loss
    criterion = models.ContrastiveLoss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    # optimizer
    optimizer = optim.SGD(model.parameters(),
                          opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # adjust learning rate every lr_decay_epoch
    lambda_lr = lambda epoch: opt.lr_decay**((epoch + 1) // opt.lr_decay_epoch)  # poly policy
    scheduler = LR_Policy(optimizer, lambda_lr)

    for epoch in range(resume_epoch, opt.max_epochs):
        #################################
        # train for one epoch
        #################################
        train(train_loader, model, criterion, optimizer, epoch, opt)
        scheduler.step()

        ##################################
        # save checkpoints
        ##################################
        # save model every `epoch_save` epochs
        if ((epoch + 1) % opt.epoch_save) == 0:
            path_checkpoint = '{0}/{1}_state_epoch{2}.pth'.format(
                opt.checkpoint_folder, opt.prefix, epoch + 1)
            utils.save_checkpoint(model.state_dict(), path_checkpoint)
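# `LR_Policy` is used throughout these snippets but never defined here. A
# minimal sketch of the two-argument, scheduler-style variant used above,
# assuming it is a thin wrapper around torch.optim.lr_scheduler.LambdaLR;
# the name and behavior are assumptions, not this repo's actual code. Note
# that some snippets below instead call a three-argument form
# LR_Policy(optimizer, base_lr, factor) once per epoch, which would set the
# learning rate directly rather than return a scheduler.
from torch.optim.lr_scheduler import LambdaLR


def LR_Policy(optimizer, lr_lambda):
    """Scale each param group's base LR by lr_lambda(epoch) on every step()."""
    return LambdaLR(optimizer, lr_lambda=lr_lambda)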
def main():
    # rospy.init_node('pcl2_pub_example', anonymous=True)
    global args, best_recall

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_recall = checkpoint['best_recall']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        validate()
        return

    for epoch in range(args.start_epoch, args.epochs):
        # pdb.set_trace()
        # adjust_learning_rate(optimizer, epoch)
        loss_t = train(epoch)
        # evaluate on validation set
        loss_v, recall, AOS = validate()
        with open("convergence.txt", "a") as myfile:
            myfile.write("{},{},{},{},{}".format(epoch, loss_t, loss_v,
                                                 recall, AOS) + "\n")

        # note: this only updates args.lr; the optimizer's param groups are not
        # touched here, so train() must read args.lr for this to take effect
        if epoch > 40:
            args.lr = 0.001
        if epoch > 80:
            args.lr = 0.0001

        if args.save_checkpoints:
            # remember best recall and save checkpoint
            is_best = recall > best_recall
            best_recall = max(recall, best_recall)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_recall': best_recall,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
def main():
    global opt
    # model = models.Test2()
    # model = model.double()
    # train_dataset = dset(opt.data_dir, flist=opt.flist, pca=10)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               num_workers=int(opt.workers))

    # create model
    model = models.Test4()

    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # Contrastive Loss
    criterion = models.ContrastiveLoss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    # optimizer = optim.SGD(model.parameters(), opt.lr,
    #                       momentum=opt.momentum,
    #                       weight_decay=opt.weight_decay)
    optimizer = optim.Adam(model.parameters(),
                           opt.lr,
                           weight_decay=opt.weight_decay)

    # adjust learning rate every lr_decay_epoch
    lambda_lr = lambda epoch: opt.lr_decay**((epoch + 1) // opt.lr_decay_epoch)  # poly policy

    for epoch in range(opt.max_epochs):
        #################################
        # train for one epoch
        #################################
        train(train_loader, model, criterion, optimizer, epoch, opt)
        LR_Policy(optimizer, opt.lr, lambda_lr(epoch))  # adjust learning rate through poly policy

        ##################################
        # save checkpoint every `epoch_save` epochs
        ##################################
        if ((epoch + 1) % opt.epoch_save) == 0:
            # model instances have no __name__; use the class name instead
            path_checkpoint = '{0}/{1}_state.pth'.format(
                opt.checkpoint_folder, model.__class__.__name__)
            utils.save_checkpoint(model.state_dict(), path_checkpoint)
def main(opt):
    train_loader = load_train_dataset(opt)
    test_loader = load_test_dataset(opt)

    # create model
    print('shift model and criterion to GPU .. ')
    model = models.ResidualRNNConv().cuda()
    print(model)
    criterion = nn.BCELoss().cuda()

    # log directory
    logger = SummaryWriter(comment=f'_{model.__class__.__name__}')

    if opt.init_model:
        print(f'loading pretrained model from {opt.init_model}')
        model.load_state_dict(
            torch.load(opt.init_model,
                       map_location=lambda storage, loc: storage.cuda()))

    # optimizer
    optimizer = optim.Adam(model.parameters(),
                           opt.lr,
                           weight_decay=opt.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer,
                                  verbose=True,
                                  patience=opt.scheduler_patience)

    accuracy = []
    for epoch in range(opt.max_epochs):
        loss = train(train_loader, model, criterion, optimizer, epoch + 1,
                     opt, logger)
        scheduler.step(loss)

        right, simmat = evaluate.test(test_loader, model, opt)
        accuracy.append(right)  # test accuracy
        logger.add_scalar('accuracy', accuracy[-1], epoch + 1)
        logger.add_histogram('simmat', simmat, epoch + 1, 'auto')

        if right > 0.8:
            path_checkpoint = os.path.join(
                opt.checkpoint_folder,
                f'{model.__class__.__name__}_{epoch + 1}_{right:.2%}.pth')
            utils.save_checkpoint(model.state_dict(), path_checkpoint)

    print(f'Max test accuracy: {np.max(accuracy):.2%} '
          f'at epoch {np.argmax(accuracy) + 1}')
def main():
    global opt
    # train data loader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               num_workers=int(opt.workers))

    # create model
    model = models.DeepCNN()

    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # cross-entropy loss
    criterion = models.MyCrossEntropyLoss()
    # criterion = torch.nn.CrossEntropyLoss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    # optimizer (Adam takes no momentum argument, hence the commented line)
    optimizer = optim.Adam(model.parameters(),
                           opt.lr,
                           # momentum=opt.momentum,
                           weight_decay=opt.weight_decay)

    # adjust learning rate every lr_decay_epoch
    # lambda_lr = lambda epoch: opt.lr_decay ** ((epoch + 1) // opt.lr_decay_epoch)  # poly policy
    # scheduler = LR_Policy(optimizer, lambda_lr)

    for epoch in range(opt.max_epochs):
        #################################
        # train for one epoch
        #################################
        train(train_loader, model, criterion, optimizer, epoch, opt)
        # scheduler.step()

        ##################################
        # save checkpoint every `epoch_save` epochs
        ##################################
        if ((epoch + 1) % opt.epoch_save) == 0:
            path_checkpoint = '{0}/{1}_state_epoch{2}.pth'.format(
                opt.checkpoint_folder, opt.prefix, epoch + 1)
            utils.save_checkpoint(model.state_dict(), path_checkpoint)
def main():
    global args, best_prec1

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            # args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.epochs):
        # adjust_learning_rate(optimizer, epoch)
        train_stn(epoch)
        # evaluate on validation set
        prec1 = validate_stn()
        # if prec1 < best_prec1:
        #     adjust_learning_rate2(optimizer)

        if args.save_checkpoints:
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
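# The `save_checkpoint(state, is_best)` calls above follow the classic PyTorch
# ImageNet-example convention. A minimal sketch, assuming that convention; the
# filenames are illustrative defaults, not taken from this repo:
import shutil

import torch


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)  # always save the latest state
    if is_best:
        # keep a separate copy of the best-performing checkpoint
        shutil.copyfile(filename, 'model_best.pth.tar')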
def main(config):
    os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU

    if not config.EVAL_MODE:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_test.txt'))
    print("==========\nConfig:{}\n==========".format(config))
    print("Currently using GPU {}".format(config.GPU))

    # Set random seed
    set_seed(config.SEED)

    # Build dataloader
    trainloader, queryloader, galleryloader, num_classes = build_dataloader(config)
    # Build model
    model, classifier = build_model(config, num_classes)
    # Build classification and pairwise loss
    criterion_cla, criterion_pair = build_losses(config)

    # Build optimizer
    parameters = list(model.parameters()) + list(classifier.parameters())
    if config.TRAIN.OPTIMIZER.NAME == 'adam':
        optimizer = optim.Adam(parameters,
                               lr=config.TRAIN.OPTIMIZER.LR,
                               weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'adamw':
        optimizer = optim.AdamW(parameters,
                                lr=config.TRAIN.OPTIMIZER.LR,
                                weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=config.TRAIN.OPTIMIZER.LR,
                              momentum=0.9,
                              weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY,
                              nesterov=True)
    else:
        raise KeyError("Unknown optimizer: {}".format(config.TRAIN.OPTIMIZER.NAME))

    # Build lr_scheduler
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=config.TRAIN.LR_SCHEDULER.STEPSIZE,
        gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE)

    start_epoch = config.TRAIN.START_EPOCH
    if config.MODEL.RESUME:
        print("Loading checkpoint from '{}'".format(config.MODEL.RESUME))
        checkpoint = torch.load(config.MODEL.RESUME)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']

    model = nn.DataParallel(model).cuda()
    classifier = nn.DataParallel(classifier).cuda()

    if config.EVAL_MODE:
        print("Evaluate only")
        test(model, queryloader, galleryloader)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")
    for epoch in range(start_epoch, config.TRAIN.MAX_EPOCH):
        start_train_time = time.time()
        train(epoch, model, classifier, criterion_cla, criterion_pair,
              optimizer, trainloader)
        train_time += round(time.time() - start_train_time)

        # evaluate every EVAL_STEP epochs after START_EVAL, and at the last epoch
        if ((epoch + 1) > config.TEST.START_EVAL and config.TEST.EVAL_STEP > 0
                and (epoch + 1) % config.TEST.EVAL_STEP == 0) \
                or (epoch + 1) == config.TRAIN.MAX_EPOCH:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            state_dict = model.module.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(config.OUTPUT,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

        scheduler.step()

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. "
          "Training time (h:m:s): {}.".format(elapsed, train_time))
def main(): logger.info("Logger is set - training start") torch.cuda.set_device(config.gpus[0]) # seed setting np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.benchmark = True # get data with meta infomation input_size, input_channels, n_classes, train_data = utils.get_data( config.dataset, config.data_path, cutout_length=0, validation=False) # set model net_crit = nn.CrossEntropyLoss().to(device) model = SearchCNNController(input_channels, config.init_channels, n_classes, config.layers, net_crit, n_nodes=config.nodes, device_ids=config.gpus) model = model.to(device) # weight optim w_optim = torch.optim.SGD(model.weights(), config.w_lr, momentum=config.w_momentum, weight_decay=config.alpha_weight_decay) # alpha optim alpha_optim = torch.optim.Adam(model.alphas(), config.alpha_lr, betas=(0.5, 0.999), weight_decay=config.alpha_weight_decay) # split data (train,validation) n_train = len(train_data) split = n_train // 2 indices = list(range(n_train)) train_sampler = torch.utils.data.sampler.SubsetRandomSampler( indices[:split]) valid_sampler = torch.utils.data.sampler.SubsetRandomSampler( indices[split:]) train_loader = torch.utils.data.DataLoader(train_data, batch_size=config.batch_size, sampler=train_sampler, num_workers=config.workers, pin_memory=True) valid_loader = torch.utils.data.DataLoader(train_data, batch_size=config.batch_size, sampler=valid_sampler, num_workers=config.workers, pin_memory=True) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( w_optim, config.epochs, eta_min=config.w_lr_min) arch = Architect(model, config.w_momentum, config.w_weight_decay) # training loop----------------------------------------------------------------------------- best_top1 = 0. for epoch in range(config.epochs): lr_scheduler.step() lr = lr_scheduler.get_lr()[0] model.print_alphas(logger) #training train(train_loader, valid_loader, model, arch, w_optim, alpha_optim, lr, epoch) #validation cur_step = (epoch + 1) * len(train_loader) top1 = validate(valid_loader, model, epoch, cur_step) #log #genotype genotype = model.genotype() logger.info("genotype = {}".format(genotype)) # genotype as a image plot_path = os.path.join(config.plot_path, "EP{:02d}".format(epoch + 1)) caption = "Epoch {}".format(epoch + 1) plot(genotype.normal, plot_path + "-normal", caption) plot(genotype.reduce, plot_path + "-reduce", caption) # output alpha per epochs to tensorboard data for i, tensor in enumerate(model.alpha_normal): for j, lsn in enumerate(F.softmax(tensor, dim=-1)): tb_writer.add_scalars( 'epoch_alpha_normal/%d ~~ %d' % ((j - 2), i), { 'max_pl3': lsn[0], 'avg_pl3': lsn[1], 'skip_cn': lsn[2], 'sep_conv3': lsn[3], 'sep_conv5': lsn[4], 'dil_conv3': lsn[5], 'dil_conv5': lsn[6], 'none': lsn[7] }, epoch) for i, tensor in enumerate(model.alpha_reduce): for j, lsr in enumerate(F.softmax(tensor, dim=-1)): tb_writer.add_scalars( 'epoch_alpha_reduce/%d ~~ %d' % ((j - 2), i), { 'max_pl3': lsr[0], 'avg_pl3': lsr[1], 'skip_cn': lsr[2], 'sep_conv3': lsr[3], 'sep_conv5': lsr[4], 'dil_conv3': lsr[5], 'dil_conv5': lsr[6], 'none': lsr[7] }, epoch) #save if best_top1 < top1: best_top1 = top1 best_genotype = genotype is_best = True else: is_best = False utils.save_checkpoint(model, config.path, is_best) print("") logger.info("Final best Prec@1 = {:.4%}".format(best_top1)) logger.info("Best Genotype is = {}".format(best_genotype))
def train(args, net):
    # Get DataLoader
    data_loader = make_dataloader(args)
    # Get Optimizer
    optimizer = make_optimizer(args, net)
    # Get Criterion
    criterion = Loss(args=args)
    # Get Timer
    timer = Chronometer()
    # Get Logger
    logger = Logger(args=args)
    logger.print_net(net)

    # Check for Multi GPU Support
    if torch.cuda.device_count() > 1 and args.mGPU:
        net = torch.nn.DataParallel(net)

    # Create a directory for training files
    if not os.path.exists(args.ckpt):
        os.mkdir(args.ckpt)

    start_epoch = args.start_epoch
    if args.resume:
        checkpoint = torch.load(args.resumed_ckpt)
        start_epoch = checkpoint['epoch']

    best_accuracy = 0.0
    timer.set()
    for epoch in range(start_epoch, args.epochs):
        logger('Epoch: {}'.format(epoch + 1), prt=False)
        epoch_train_loss, is_best = 0.0, False
        with tqdm(total=len(data_loader), ncols=0, file=sys.stdout,
                  desc='Epoch: {}'.format(epoch + 1)) as pbar:
            for i, in_batch in enumerate(data_loader):
                optimizer.zero_grad()
                in_data, target = in_batch
                # Load to GPU
                if torch.cuda.is_available():
                    in_data, target = in_data.cuda(), target.cuda()
                # Forward Pass
                predicted = net(in_data)
                # Backward Pass
                loss = criterion(predicted, target)
                epoch_train_loss += loss.item()
                loss.backward()
                optimizer.step()
                # Update Progressbar
                if i % 50 == 49:
                    logger('[Train loss/batch: {0:.4f}]'.format(loss.item()),
                           prt=False)
                pbar.set_postfix(Loss=loss.item())
                pbar.update()

        epoch_train_loss /= len(data_loader)
        message = 'Average Training Loss : {0:.4f}'.format(epoch_train_loss)
        logger(message)

        # Check performance of the trained model on the test set
        if epoch % args.evaluate_every_n_epoch == args.evaluate_every_n_epoch - 1:
            print('Network Evaluation...')
            net.eval()
            output = evaluate.evaluate(args, net)
            net.train()
            logger(output['message'])
            if output['accuracy'] > best_accuracy:
                best_accuracy = output['accuracy']
                is_best = True

        # save the checkpoint as best checkpoint so far
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'net_state_dict': net.module.state_dict() if args.mGPU
                                  else net.state_dict()
            },
            is_best,
            filename=os.path.join(args.ckpt, 'checkpoint.pth.tar'),
            best_filename=os.path.join(args.ckpt, 'best_checkpoint.pth.tar'))

    timer.stop()
    message = ('Finished Training Session in {0} hours & {1} minutes, '
               'Best Accuracy Achieved: {2:.2f}\n'.format(
                   int(timer.elapsed / 3600),
                   int((timer.elapsed % 3600) / 60), best_accuracy))
    logger(message)
    logger.end()
def worker(gpu, ngpus_per_node, config_in):
    # init
    config = copy.deepcopy(config_in)
    jobid = os.environ["SLURM_JOBID"]
    procid = int(os.environ["SLURM_PROCID"])
    config.gpu = gpu

    if config.gpu is not None:
        writer_name = "tb.{}-{:d}-{:d}".format(jobid, procid, gpu)
        logger_name = "{}.{}-{:d}-{:d}.aug.log".format(config.name, jobid, procid, gpu)
        ploter_name = "{}-{:d}-{:d}".format(jobid, procid, gpu)
        ck_name = "{}-{:d}-{:d}".format(jobid, procid, gpu)
    else:
        writer_name = "tb.{}-{:d}-all".format(jobid, procid)
        logger_name = "{}.{}-{:d}-all.aug.log".format(config.name, jobid, procid)
        ploter_name = "{}-{:d}-all".format(jobid, procid)
        ck_name = "{}-{:d}-all".format(jobid, procid)

    writer = SummaryWriter(log_dir=os.path.join(config.path, writer_name))
    writer.add_text('config', config.as_markdown(), 0)
    logger = utils.get_logger(os.path.join(config.path, logger_name))
    config.print_params(logger.info)

    # get cuda device
    device = torch.device('cuda', gpu)

    # begin
    logger.info("Logger is set - training start")

    if config.dist_url == "env://" and config.rank == -1:
        config.rank = int(os.environ["RANK"])
    if config.mp_dist:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        config.rank = config.rank * ngpus_per_node + gpu
    # print('back:{}, dist_url:{}, world_size:{}, rank:{}'.format(config.dist_backend, config.dist_url, config.world_size, config.rank))
    dist.init_process_group(backend=config.dist_backend,
                            init_method=config.dist_url,
                            world_size=config.world_size,
                            rank=config.rank)

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        config.dataset, config.data_path, config.cutout_length, validation=True)

    # build model
    criterion = nn.CrossEntropyLoss().to(device)
    use_aux = config.aux_weight > 0.
    model = AugmentCNN(input_size, input_channels, config.init_channels,
                       n_classes, config.layers, use_aux, config.genotype)

    if config.gpu is not None:
        torch.cuda.set_device(config.gpu)
        # model = model.to(device)
        model.cuda(config.gpu)
        # When using a single GPU per process and per DistributedDataParallel,
        # we need to divide the batch size ourselves based on the total number
        # of GPUs we have
        config.batch_size = int(config.batch_size / ngpus_per_node)
        config.workers = int((config.workers + ngpus_per_node - 1) / ngpus_per_node)
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.rank])
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu])
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=None, output_device=None)
    else:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                                config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    # load data
    train_sampler = data.distributed.DistributedSampler(
        train_data, num_replicas=config.world_size, rank=config.rank)
    valid_sampler = data.distributed.DistributedSampler(
        valid_data, num_replicas=config.world_size, rank=config.rank)
    train_loader = data.DataLoader(train_data,
                                   batch_size=config.batch_size,
                                   sampler=train_sampler,
                                   shuffle=False,
                                   num_workers=config.workers,
                                   pin_memory=True)
    valid_loader = data.DataLoader(valid_data,
                                   batch_size=config.batch_size,
                                   sampler=valid_sampler,
                                   shuffle=False,
                                   num_workers=config.workers,
                                   pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)

    # setting the privacy protecting procedure
    if config.dist_privacy:
        logger.info("PRIVACY ENGINE OFF")

    best_top1 = 0.
    # training loop
    for epoch in range(config.epochs):
        # lr_scheduler.step()
        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.module.drop_path_prob(drop_prob)

        # training
        train(logger, writer, device, config, train_loader, model, optimizer,
              criterion, epoch)
        lr_scheduler.step()

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(logger, writer, device, config, valid_loader, model,
                        criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, ck_name, 'aug', is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
def train_and_evaluate(model, train_data, val_data, optimizer, scheduler,
                       params, metric_labels, model_dir, restore_file=None):
    """Train the model and evaluate every epoch."""
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(args.model_dir,
                                    args.restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_f1 = 0.0
    patience_counter = 0

    for epoch in range(1, params.epoch_num + 1):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch, params.epoch_num))

        # Compute number of batches in one epoch
        train_steps_num = params.train_size // params.batch_size
        val_steps_num = params.val_size // params.batch_size

        # data iterator for training (shuffle must be a bool, not the string 'True')
        train_data_iterator = data_loader.data_iterator(train_data,
                                                        params.batch_size,
                                                        shuffle=True)
        # Train for one epoch on the training set
        train_loss = train(model, train_data_iterator, optimizer, scheduler,
                           params, train_steps_num)

        # data iterators for evaluation on training and validation sets
        train_data_iterator = data_loader.data_iterator(train_data,
                                                        params.batch_size)
        val_data_iterator = data_loader.data_iterator(val_data,
                                                      params.batch_size)

        # Evaluate for one epoch on training set and validation set
        train_metrics = evaluate(model, train_data_iterator, train_steps_num,
                                 metric_labels)
        train_metrics['loss'] = train_loss
        train_metrics_str = "; ".join("{}: {:05.2f}".format(k, v)
                                      for k, v in train_metrics.items())
        logging.info("- Train metrics: " + train_metrics_str)

        val_metrics = evaluate(model, val_data_iterator, val_steps_num,
                               metric_labels)
        val_metrics_str = "; ".join("{}: {:05.2f}".format(k, v)
                                    for k, v in val_metrics.items())
        logging.info("- Eval metrics: " + val_metrics_str)

        val_f1 = val_metrics['f1']
        improve_f1 = val_f1 - best_val_f1

        # Save weights of the network
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=improve_f1 > 0,
            checkpoint=model_dir)
        if improve_f1 > 0:
            logging.info("- Found new best F1")
            best_val_f1 = val_f1
            if improve_f1 < params.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping and logging best f1
        if (patience_counter >= params.patience_num
                and epoch > params.min_epoch_num) or epoch == params.epoch_num:
            logging.info("best val f1: {:05.2f}".format(best_val_f1))
            break
def main(): logger.info("Logger is set - training start") # set default gpu device id torch.cuda.set_device(config.gpus[0]) # set seed np.random.seed(config.seed) torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.benchmark = True # get data with meta info input_size, input_channels, n_classes, train_data, valid_data = utils.get_data( config.dataset, config.data_path, config.cutout_length, validation=True) criterion = nn.CrossEntropyLoss().to(device) use_aux = config.aux_weight > 0. model = AugmentCNN(input_size, input_channels, config.init_channels, n_classes, config.layers, use_aux, config.genotype) model = nn.DataParallel(model, device_ids=config.gpus).to(device) # model size mb_params = utils.param_size(model) logger.info('Model size = {:.3f} MB'.format(mb_params)) # weight optimizer optimizer = torch.optim.SGD(model.parameters(), config.lr, momentum=config.momentum, weight_decay=config.weight_decay) train_loader = torch.utils.data.DataLoader(train_data, batch_size=config.batch_size, shuffle=True, num_workers=config.workers, pin_memory=True) valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=config.batch_size, shuffle=True, num_workers=config.workers, pin_memory=True) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, config.epochs) best_top1 = 0. # train loop for epoch in range(config.epochs): lr_scheduler.step() drop_prob = config.drop_path_prob * epoch / config.epochs model.module.drop_path_prob(drop_prob) # training train(train_loader, model, optimizer, criterion, epoch) # validation cur_step = (epoch + 1) * len(train_loader) top1 = validate(valid_loader, model, criterion, epoch, cur_step) # save if best_top1 < top1: best_top1 = top1 is_best = True else: is_best = False utils.save_checkpoint(model, config.path, is_best) print("") logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
def main():
    global opt
    # train data loader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               num_workers=int(opt.workers))

    # create model
    model = models.VAMetric_conv()

    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # loss
    # criterion = models.StableBCELoss()
    criterion = nn.CrossEntropyLoss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    # optimizer
    # optimizer = optim.SGD(model.parameters(), lr=opt.lr,
    #                       momentum=opt.momentum,
    #                       weight_decay=opt.weight_decay)
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    # optimizer = optim.SGD(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
    # optimizer = optim.Adadelta(params=model.parameters(), lr=opt.lr)

    # adjust learning rate every lr_decay_epoch
    lambda_lr = lambda epoch: opt.lr_decay**((epoch + 1) // opt.lr_decay_epoch)  # poly policy
    scheduler = LR_Policy(optimizer, lambda_lr)

    resume_epoch = 0

    global dis1_rec
    global dis2_rec
    global loss_rec
    loss_rec = []
    dis1_rec = []
    dis2_rec = []

    for epoch in range(resume_epoch, opt.max_epochs):
        #################################
        # train for one epoch
        #################################
        train(train_loader, model, criterion, optimizer, epoch, opt)
        scheduler.step()

        ##################################
        # save checkpoints
        ##################################
        # save model every `epoch_save` epochs
        if ((epoch + 1) % opt.epoch_save) == 0:
            path_checkpoint = '{0}/{1}_state_epoch{2}.pth'.format(
                opt.checkpoint_folder, opt.prefix, epoch + 1)
            utils.save_checkpoint(model.state_dict(), path_checkpoint)

    plt.figure(1)
    plt.subplot(1, 2, 1)
    plt.plot(loss_rec)
    plt.legend(['loss'])  # legend expects a sequence of labels
    plt.subplot(1, 2, 2)
    # plt.plot(dis1_rec)
    # plt.plot(dis2_rec)
    plt.legend(('distance between positives', 'distance between negatives'))
    plt.show()
    plt.savefig("./figures/conv.jpg")
def main():
    global opt
    # train data loader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.batchSize,
                                               shuffle=True,
                                               num_workers=int(opt.workers))

    # create model
    model = models.VAMetric_conv()

    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # loss
    criterion = models.conv_loss_dqy()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    # optimizer
    # optimizer = optim.SGD(model.parameters(), lr=opt.lr,
    #                       momentum=opt.momentum,
    #                       weight_decay=opt.weight_decay)
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    # optimizer = optim.SGD(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
    # optimizer = optim.Adadelta(params=model.parameters(), lr=opt.lr)

    # adjust learning rate every lr_decay_epoch
    lambda_lr = lambda epoch: opt.lr_decay**((epoch + 1) // opt.lr_decay_epoch)  # poly policy
    scheduler = LR_Policy(optimizer, lambda_lr)

    resume_epoch = 0

    global dis1_rec
    global dis2_rec
    global loss_rec
    loss_rec = []
    dis1_rec = []
    dis2_rec = []

    ######### test loaders used to evaluate after each epoch #########
    parser = OptionParser()
    parser.add_option('--config',
                      type=str,
                      help="evaluation configuration",
                      default="./configs/test_config.yaml")
    (opts_test, args) = parser.parse_args()
    opts_test = Config(opts_test.config)
    test_video_dataset = VideoFeatDataset(opts_test.data_dir,
                                          opts_test.video_flist,
                                          which_feat='vfeat')
    test_audio_dataset = VideoFeatDataset(opts_test.data_dir,
                                          opts_test.audio_flist,
                                          which_feat='afeat')
    test_video_loader = torch.utils.data.DataLoader(
        test_video_dataset,
        batch_size=opts_test.batchSize,
        shuffle=False,
        num_workers=int(opts_test.workers))
    test_audio_loader = torch.utils.data.DataLoader(
        test_audio_dataset,
        batch_size=opts_test.batchSize,
        shuffle=False,
        num_workers=int(opts_test.workers))
    ###################################################################

    for epoch in range(resume_epoch, opt.max_epochs):
        #################################
        # train for one epoch
        #################################
        train(train_loader, model, criterion, optimizer, epoch, opt,
              test_video_loader, test_audio_loader, opts_test)
        scheduler.step()

        ##################################
        # save checkpoints
        ##################################
        # save model every `epoch_save` epochs
        if ((epoch + 1) % opt.epoch_save) == 0:
            path_checkpoint = '{0}/{1}_state_epoch{2}.pth'.format(
                opt.checkpoint_folder, opt.prefix, epoch + 1)
            utils.save_checkpoint(model.state_dict(), path_checkpoint)

    plt.figure(1)
    plt.subplot(1, 2, 1)
    plt.plot(loss_rec)
    plt.legend(['loss'])  # legend expects a sequence of labels
    plt.subplot(1, 2, 2)
    plt.plot(dis1_rec)
    plt.plot(dis2_rec)
    plt.legend(('distance between positives', 'distance between negatives'))
    plt.show()
    plt.savefig("./figures/conv.jpg")
def main():
    global opt
    # train data loaders, one per model
    tl_ls = []
    for tds in tds_ls:
        tl_ls.append(
            torch.utils.data.DataLoader(tds,
                                        batch_size=opt.batchSize,
                                        shuffle=True,
                                        num_workers=int(opt.workers)))

    # create models
    model_ls = []
    for i in range(opt.model_number):
        m = models.VA_lstm()
        # m = models.VAMetric_conv()
        model_ls.append(m)

    if opt.init_model_epoch != '':
        for i in range(opt.model_number):
            path = '{0}/{1}_state_epoch{2}_model{3}.pth'.format(
                opt.checkpoint_folder, opt.prefix, opt.init_model_epoch, i + 1)
            print('loading pretrained model from {0}'.format(path))
            model_ls[i].load_state_dict(torch.load(path))

    # loss
    # criterion = models.conv_loss_dqy()
    # criterion = models.N_pair_loss()
    # criterion = models.Topk_loss()
    criterion = models.lstm_loss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        for i in range(opt.model_number):
            model_ls[i] = model_ls[i].cuda()
        criterion = criterion.cuda()

    # optimizers, one per model
    # optimizer = optim.SGD(model.parameters(), lr=opt.lr,
    #                       momentum=opt.momentum,
    #                       weight_decay=opt.weight_decay)
    opt_ls = []
    for m in model_ls:
        op = optim.Adam(m.parameters(), lr=opt.lr)
        # op = optim.SGD(m.parameters(), lr=opt.lr,
        #                momentum=opt.momentum,
        #                weight_decay=opt.weight_decay)
        opt_ls.append(op)
    # optimizer = optim.SGD(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
    # optimizer = optim.Adadelta(params=model.parameters(), lr=opt.lr)

    # adjust learning rate every lr_decay_epoch
    lambda_lr = lambda epoch: opt.lr_decay**((epoch + 1) // opt.lr_decay_epoch)  # poly policy
    scheduler_ls = []
    for op in opt_ls:
        scheduler_ls.append(LR_Policy(op, lambda_lr))

    resume_epoch = 0

    global positive_rec
    global negative_rec
    global loss_rec
    loss_rec = []
    positive_rec = []
    negative_rec = []

    ######### test loaders used to evaluate after each epoch #########
    parser = OptionParser()
    parser.add_option('--config',
                      type=str,
                      help="evaluation configuration",
                      default="./configs/test_config.yaml")
    (opts_test, args) = parser.parse_args()
    opts_test = Config(opts_test.config)
    test_video_dataset = VideoFeatDataset(root=opts_test.data_dir,
                                          flist=opts_test.video_flist,
                                          which_feat='vfeat',
                                          creat_test=0)
    test_audio_dataset = VideoFeatDataset(root=opts_test.data_dir,
                                          flist=opts_test.audio_flist,
                                          which_feat='afeat',
                                          creat_test=0)
    test_video_loader = torch.utils.data.DataLoader(
        test_video_dataset,
        batch_size=opts_test.batchSize,
        shuffle=False,
        num_workers=int(opts_test.workers))
    test_audio_loader = torch.utils.data.DataLoader(
        test_audio_dataset,
        batch_size=opts_test.batchSize,
        shuffle=False,
        num_workers=int(opts_test.workers))
    ###################################################################

    for epoch in range(resume_epoch, opt.max_epochs):
        #################################
        # train each model for one epoch
        #################################
        for i in range(opt.model_number):
            train(train_loader=tl_ls[i],
                  model=model_ls[i],
                  criterion=criterion,
                  optimizer=opt_ls[i],
                  epoch=epoch + 1,
                  opt=opt,
                  num=i + 1)
            scheduler_ls[i].step()

        ##################################
        # save checkpoints
        ##################################
        # save models every `epoch_save` epochs
        if ((epoch + 1) % opt.epoch_save) == 0:
            for i in range(opt.model_number):
                path_checkpoint = '{0}/{1}_state_epoch{2}_model{3}.pth'.format(
                    opt.checkpoint_folder, opt.prefix, epoch + 1, i + 1)
                utils.save_checkpoint(model_ls[i].state_dict(), path_checkpoint)

        if ((epoch + 1) % opt.epoch_plot) == 0:
            plt.figure(1)
            plt.subplot(1, 2, 1)
            plt.plot(loss_rec)
            plt.legend(['loss'])  # legend expects a sequence of labels
            plt.subplot(1, 2, 2)
            plt.plot(positive_rec)
            plt.plot(negative_rec)
            plt.legend(('similarity of positives', 'similarity of negatives'))
            plt.show()
            plt.savefig('./figures/result{0}.jpg'.format(epoch + 1))
            plt.close()

        if ((epoch + 1) % opt.epoch_test) == 0:
            evaluate.test(test_video_loader, test_audio_loader, model_ls,
                          opts_test)
# (fragment: this snippet opens mid-call, with the trailing arguments of a
#  train(...) invocation; the enclosing loop and setup are not included)
              trainloader, use_gpu, optimizer_center, criterion_center_loss,
              criterion_osm_caa)

        if args.stepsize > 0:
            scheduler.step()

        if epoch in args.epochs_eval:
            re_rank_mAP = test_rerank(model, queryloader, galleryloader,
                                      args.pool, use_gpu)
            if re_rank_mAP > prev_best:
                prev_best = re_rank_mAP
                if use_gpu:
                    state_dict = model.module.state_dict()
                else:
                    state_dict = model.state_dict()
                # note: `is_best` is not defined within this fragment
                save_checkpoint(
                    {
                        'state_dict': state_dict,
                    }, is_best,
                    osp.join(args.save_dir,
                             args.arch + '_checkpoint_ep' + str(epoch + 1) + '.pth.tar'))
    else:
        for epoch in args.epochs_eval:
            checkpoint = torch.load(
                osp.join(args.save_dir,
                         args.arch + '_checkpoint_ep' + str(epoch + 1) + '.pth.tar'))
            state_dict = {}
            for key in checkpoint['state_dict']:
                state_dict["module." + key] = checkpoint['state_dict'][key]
            model.load_state_dict(state_dict, strict=True)
            rank1 = test_rerank(model, queryloader, galleryloader, args.pool,
def main():
    global opt
    # train data loaders, one per model
    tl_ls = []
    for tds in tds_ls:
        tl_ls.append(
            torch.utils.data.DataLoader(tds,
                                        batch_size=opt.batchSize,
                                        shuffle=True,
                                        num_workers=int(opt.workers)))

    # create models (encoder/decoder pairs)
    model_ls = []
    for i in range(opt.model_number):
        encoder = models.Encoder()
        decoder = models.AttnDecoder()
        model_ls.append([encoder, decoder])

    # if opt.init_model_epoch != '':
    #     for i in range(opt.model_number):
    #         path = '{0}/{1}_state_epoch{2}_model{3}.pth'.format(
    #             opt.checkpoint_folder, opt.prefix, opt.init_model_epoch, i + 1)
    #         print('loading pretrained model from {0}'.format(path))
    #         model_ls[i].load_state_dict(torch.load(path))

    criterion = models.pairwise_loss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        for i in range(opt.model_number):
            cp = model_ls[i]
            cp[0] = cp[0].cuda()
            cp[1] = cp[1].cuda()
        criterion = criterion.cuda()

    # optimizers, one pair per model
    opt_ls = []
    for m in model_ls:
        encoder = m[0]
        decoder = m[1]
        encoder_optim = optim.Adam(encoder.parameters(), lr=opt.lr)
        decoder_optim = optim.Adam(decoder.parameters(), lr=opt.lr)
        # encoder_optim = optim.SGD(encoder.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
        # decoder_optim = optim.SGD(decoder.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
        op = [encoder_optim, decoder_optim]
        opt_ls.append(op)

    # adjust learning rate every lr_decay_epoch
    lambda_lr = lambda epoch: opt.lr_decay**((epoch + 1) // opt.lr_decay_epoch)  # poly policy
    scheduler_ls = []
    for op in opt_ls:
        en = LR_Policy(op[0], lambda_lr)
        de = LR_Policy(op[1], lambda_lr)
        scheduler_ls.append([en, de])

    resume_epoch = 0

    global positive_rec
    global negative_rec
    global loss_rec
    loss_rec = []
    positive_rec = []
    negative_rec = []

    ######### test loaders used to evaluate after each epoch #########
    parser = OptionParser()
    parser.add_option('--config',
                      type=str,
                      help="evaluation configuration",
                      default="./configs/test_config.yaml")
    (opts_test, args) = parser.parse_args()
    opts_test = Config(opts_test.config)
    test_video_dataset = VideoFeatDataset(root=opts_test.data_dir,
                                          flist=opts_test.video_flist,
                                          which_feat='vfeat',
                                          creat_test=0)
    test_audio_dataset = VideoFeatDataset(root=opts_test.data_dir,
                                          flist=opts_test.audio_flist,
                                          which_feat='afeat',
                                          creat_test=0)
    test_video_loader = torch.utils.data.DataLoader(
        test_video_dataset,
        batch_size=opts_test.batchSize,
        shuffle=False,
        num_workers=int(opts_test.workers))
    test_audio_loader = torch.utils.data.DataLoader(
        test_audio_dataset,
        batch_size=opts_test.batchSize,
        shuffle=False,
        num_workers=int(opts_test.workers))
    ###################################################################

    for epoch in range(resume_epoch, opt.max_epochs):
        #################################
        # train each model for one epoch
        #################################
        for i in range(opt.model_number):
            m = model_ls[i]
            op = opt_ls[i]
            train(train_loader=tl_ls[i],
                  encoder=m[0],
                  decoder=m[1],
                  criterion=criterion,
                  encoder_optim=op[0],
                  decoder_optim=op[1],
                  epoch=epoch + 1,
                  opt=opt,
                  num=i + 1)
            s = scheduler_ls[i]
            s[0].step()
            s[1].step()

        ##################################
        # save checkpoints
        ##################################
        if ((epoch + 1) % opt.epoch_save) == 0:
            for i in range(opt.model_number):
                m = model_ls[i]
                encoder_path_checkpoint = '{0}/{1}_state_epoch{2}_encoder_model_{3}.pth'.format(
                    opt.checkpoint_folder, opt.prefix, epoch + 1, i + 1)
                utils.save_checkpoint(m[0].state_dict(), encoder_path_checkpoint)
                decoder_path_checkpoint = '{0}/{1}_state_epoch{2}_decoder_model_{3}.pth'.format(
                    opt.checkpoint_folder, opt.prefix, epoch + 1, i + 1)
                utils.save_checkpoint(m[1].state_dict(), decoder_path_checkpoint)
                print('Save encoder model to {0}'.format(encoder_path_checkpoint))
                print('Save decoder model to {0}'.format(decoder_path_checkpoint))

        if ((epoch + 1) % opt.epoch_plot) == 0:
            plt.figure(1)
            # plt.subplot(1, 2, 1)
            plt.plot(loss_rec)
            plt.legend(['loss'])  # legend expects a sequence of labels
            # plt.subplot(1, 2, 2)
            # plt.plot(positive_rec)
            # plt.plot(negative_rec)
            # plt.legend(('similarity of positives', 'similarity of negatives'))
            plt.show()
            plt.savefig('./figures/lstm_result{0}.jpg'.format(epoch + 1))
            plt.close()

        if ((epoch + 1) % opt.epoch_test) == 0:
            evaluate.test(test_video_loader, test_audio_loader, model_ls,
                          opts_test)
def worker(gpu, ngpus_per_node, config_in):
    # init
    config = copy.deepcopy(config_in)
    jobid = os.environ["SLURM_JOBID"]
    procid = int(os.environ["SLURM_PROCID"])
    config.gpu = gpu

    if config.gpu is not None:
        writer_name = "tb.{}-{:d}-{:d}".format(jobid, procid, gpu)
        logger_name = "{}.{}-{:d}-{:d}.search.log".format(config.name, jobid, procid, gpu)
        ploter_name = "{}-{:d}-{:d}".format(jobid, procid, gpu)
        ck_name = "{}-{:d}-{:d}".format(jobid, procid, gpu)
    else:
        writer_name = "tb.{}-{:d}-all".format(jobid, procid)
        logger_name = "{}.{}-{:d}-all.search.log".format(config.name, jobid, procid)
        ploter_name = "{}-{:d}-all".format(jobid, procid)
        ck_name = "{}-{:d}-all".format(jobid, procid)

    writer = SummaryWriter(log_dir=os.path.join(config.path, writer_name))
    writer.add_text('config', config.as_markdown(), 0)
    logger = utils.get_logger(os.path.join(config.path, logger_name))
    config.print_params(logger.info)

    # get cuda device
    device = torch.device('cuda', gpu)

    # begin
    logger.info("Logger is set - training start")

    if config.dist_url == "env://" and config.rank == -1:
        config.rank = int(os.environ["RANK"])
    if config.mp_dist:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        config.rank = config.rank * ngpus_per_node + gpu
    # print('back:{}, dist_url:{}, world_size:{}, rank:{}'.format(config.dist_backend, config.dist_url, config.world_size, config.rank))
    dist.init_process_group(backend=config.dist_backend,
                            init_method=config.dist_url,
                            world_size=config.world_size,
                            rank=config.rank)

    # get data with meta info
    input_size, input_channels, n_classes, train_data = utils.get_data(
        config.dataset, config.data_path, cutout_length=0, validation=False)

    # build model
    net_crit = nn.CrossEntropyLoss().to(device)
    model = SearchCNNController(input_channels, config.init_channels,
                                n_classes, config.layers, net_crit)

    if config.gpu is not None:
        torch.cuda.set_device(config.gpu)
        # model = model.to(device)
        model.cuda(config.gpu)
        # When using a single GPU per process and per DistributedDataParallel,
        # we need to divide the batch size ourselves based on the total number
        # of GPUs we have
        config.batch_size = int(config.batch_size / ngpus_per_node)
        config.workers = int((config.workers + ngpus_per_node - 1) / ngpus_per_node)
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.rank])
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu])
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=None, output_device=None)
    else:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)

    # weights optimizer
    w_optim = torch.optim.SGD(model.module.weights(),
                              config.w_lr,
                              momentum=config.w_momentum,
                              weight_decay=config.w_weight_decay)
    # alphas optimizer
    alpha_optim = torch.optim.Adam(model.module.alphas(),
                                   config.alpha_lr,
                                   betas=(0.5, 0.999),
                                   weight_decay=config.alpha_weight_decay)

    # split data to train/validation
    n_train = len(train_data)
    split = n_train // 2
    indices = list(range(n_train))
    train_data_ = data.Subset(train_data, indices[:split])
    valid_data_ = data.Subset(train_data, indices[split:])
    train_sampler = data.distributed.DistributedSampler(
        train_data_, num_replicas=config.world_size, rank=config.rank)
    valid_sampler = data.distributed.DistributedSampler(
        valid_data_, num_replicas=config.world_size, rank=config.rank)
    train_loader = data.DataLoader(train_data_,
                                   batch_size=config.batch_size,
                                   sampler=train_sampler,
                                   shuffle=False,
                                   num_workers=config.workers,
                                   pin_memory=True)
    valid_loader = data.DataLoader(valid_data_,
                                   batch_size=config.batch_size,
                                   sampler=valid_sampler,
                                   shuffle=False,
                                   num_workers=config.workers,
                                   pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_lr_min)
    architect = Architect(model, config.w_momentum, config.w_weight_decay)

    # setting the privacy protecting procedure
    if config.dist_privacy:
        logger.info("PRIVACY ENGINE OFF")

    # training loop
    best_top1 = 0.0
    for epoch in range(config.epochs):
        # lr_scheduler.step()
        # lr = lr_scheduler.get_lr()[0]
        lr = lr_scheduler.get_last_lr()[0]

        model.module.print_alphas(logger)

        # training
        train(logger, writer, device, config, train_loader, valid_loader,
              model, architect, w_optim, alpha_optim, lr, epoch)
        lr_scheduler.step()  # moved to after optimizer.step()

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(logger, writer, device, config, valid_loader, model,
                        epoch, cur_step)

        # log genotype
        genotype = model.module.genotype()
        logger.info("genotype = {}".format(genotype))

        # genotype as an image
        plot_path = os.path.join(config.plot_path,
                                 "JOB" + ploter_name + "-EP{:02d}".format(epoch + 1))
        caption = "Epoch {}".format(epoch + 1)
        plot(genotype.normal, plot_path + "-normal", caption)
        plot(genotype.reduce, plot_path + "-reduce", caption)

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, ck_name, 'search', is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop()

    pin_memory = True if use_gpu else False

    if args.dataset != 'mars':
        trainloader = DataLoader(
            VideoDataset(dataset.train_dense,
                         spatial_transform=spatial_transform_train,
                         temporal_transform=temporal_transform_train),
            sampler=RandomIdentitySampler(dataset.train_dense,
                                          num_instances=args.num_instances),
            batch_size=args.train_batch,
            num_workers=args.workers,
            pin_memory=pin_memory,
            drop_last=True)
    else:
        trainloader = DataLoader(
            VideoDataset(dataset.train,
                         spatial_transform=spatial_transform_train,
                         temporal_transform=temporal_transform_train),
            sampler=RandomIdentitySampler(dataset.train,
                                          num_instances=args.num_instances),
            batch_size=args.train_batch,
            num_workers=args.workers,
            pin_memory=pin_memory,
            drop_last=True)

    queryloader = DataLoader(
        VideoDataset(dataset.query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=0,
        pin_memory=pin_memory,
        drop_last=False)

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=0,
        pin_memory=pin_memory,
        drop_last=False)

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids)
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion_xent = nn.CrossEntropyLoss()
    criterion_htri = TripletLoss(margin=args.margin, distance=args.distance)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)
    start_epoch = args.start_epoch

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']

    if use_gpu:
        model = nn.DataParallel(model).cuda()
        # model = model.cuda()

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):
        scheduler.step()

        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, optimizer,
              trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        # evaluate every eval_step epochs after start_eval, and at the last epoch
        if ((epoch + 1) >= args.start_eval and args.eval_step > 0
                and (epoch + 1) % args.eval_step == 0) \
                or (epoch + 1) == args.max_epoch:
            print("==> Test")
            with torch.no_grad():
                # test using 4 frames
                rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. "
          "Training time (h:m:s): {}.".format(elapsed, train_time))
def test(epoch):
    model.eval()  # set the module to evaluation mode; only affects Dropout and BatchNorm layers
    test_loss = 0
    correct = 0
    # torch.no_grad() replaces the deprecated Variable(..., volatile=True)
    with torch.no_grad():
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # .item() replaces the deprecated .data[0]
            test_loss += F.nll_loss(output, target).item()
            # get the index of the max log-probability
            pred = output.data.max(1)[1]
            correct += pred.eq(target.data).cpu().sum().item()

    # the loss function already averages over batch size,
    # so divide by the number of batches
    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, args.epochs + 1):
    train(epoch)
    test(epoch)
    if ((epoch + 1) % args.save_interval) == 0:
        path_checkpoint = '{0}/{1}_state_epoch{2}.pth'.format(
            'checkpoints', 'mnist', epoch + 1)
        utils.save_checkpoint(model.state_dict(), path_checkpoint)
def main():
    global opt
    loss_rec = np.zeros((opt.folds, 100))
    acc_rec = np.zeros((opt.folds, 100))
    # loss_rec = np.load('acc_train.npy')
    # acc_rec = np.load('acc.npy')

    for iteration in range(opt.folds):
        train_dataset = mnist_Dataset(num_of_cross=iteration)
        print('number of train samples is: {0}'.format(len(train_dataset)))
        print('finished loading data')

        if opt.manualSeed is None:
            opt.manualSeed = random.randint(1, 10000)

        if torch.cuda.is_available() and not opt.cuda:
            print('WARNING: You have a CUDA device, '
                  'so you should probably run with "cuda: True"')
            torch.manual_seed(opt.manualSeed)
        else:
            if int(opt.ngpu) == 1:
                print('using 1 GPU for training')
                print('setting gpu on gpuid {0}'.format(opt.gpu_id))
            if opt.cuda:
                os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
                torch.cuda.manual_seed(opt.manualSeed)
        cudnn.benchmark = True
        print('Random Seed: {0}'.format(opt.manualSeed))

        # train data loader
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=opt.batchSize,
                                                   shuffle=True,
                                                   num_workers=int(opt.workers))

        # create model
        model = mnist_model.cat_and_dog_resnet()

        if opt.init_model != '':
            print('loading pretrained model from {0}'.format(opt.init_model))
            model.load_state_dict(torch.load(opt.init_model))

        # loss
        # criterion = mnist_model.StableBCELoss()
        criterion = nn.CrossEntropyLoss()

        if opt.cuda:
            print('shift model and criterion to GPU .. ')
            model = model.cuda()
            criterion = criterion.cuda()

        # optimizer
        # optimizer = optim.SGD(model.parameters(), lr=opt.lr,
        #                       momentum=opt.momentum,
        #                       weight_decay=opt.weight_decay)
        optimizer = optim.Adam(model.parameters(), lr=opt.lr)
        # optimizer = optim.SGD(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay, momentum=opt.momentum)
        # optimizer = optim.Adadelta(params=model.parameters(), lr=opt.lr)

        # adjust learning rate every lr_decay_epoch
        lambda_lr = lambda epoch: opt.lr_decay**((epoch + 1) // opt.lr_decay_epoch)  # poly policy
        scheduler = LR_Policy(optimizer, lambda_lr)

        resume_epoch = 0

        acc = test(model, opt, iteration)
        acc_rec[iteration][0] = acc
        acc = test(model, opt, iteration, Training=True)
        loss_rec[iteration][0] = acc

        for epoch in range(resume_epoch, opt.max_epochs):
            #################################
            # train for one epoch
            #################################
            # accuracy = test(model, opt, epoch)
            train(train_loader, model, criterion, optimizer, iteration, opt, epoch)
            scheduler.step()

            ##################################
            # save checkpoints
            ##################################
            accuracy = test(model, opt, iteration)
            acc_rec[iteration][epoch + 1] = accuracy
            np.save('acc.npy', acc_rec)
            accuracy = test(model, opt, iteration, Training=True)
            loss_rec[iteration][epoch + 1] = accuracy
            np.save('acc_train.npy', loss_rec)

            # save model every `epoch_save` epochs
            if ((epoch + 1) % opt.epoch_save) == 0:
                path_checkpoint = '{0}/{1}_{3}_epoch{2}.pth'.format(
                    opt.checkpoint_folder, opt.prefix, epoch + 1, iteration)
                utils.save_checkpoint(model.state_dict(), path_checkpoint)
def trainer(train_loader, valid_loader, model, criterion, optimizer_t,
            optimizer_s=None, lr_scheduler=None, stage=None):
    logger.log("start training..." + stage)
    best_top1 = 0.0
    epochs = args.baseline_epochs
    start_time = time.time()
    epoch_time = utils.AverageMeter()

    for epoch in range(args.start_epoch, epochs):
        ############################ adjust learning rate ############################
        if args.lr_sch == "cosine":
            if optimizer_t is not None:
                adjust_learning_rateD(optimizer_t, epoch, epochs,
                                      lr_max=args.learning_rate,
                                      lr_min=args.learning_rate * 1e-3)
            if optimizer_s is not None:
                adjust_learning_rateD(optimizer_s, epoch, epochs,
                                      lr_max=args.learning_rate,
                                      lr_min=args.learning_rate * 1e-3)
        elif args.lr_sch == "imagenet":
            if optimizer_t is not None:
                adjust_learning_rateA(optimizer_t, epoch, args)
            if optimizer_s is not None:
                adjust_learning_rateA(optimizer_s, epoch, args)
        elif args.lr_sch == "step":
            if optimizer_t is not None:
                adjust_learning_rateS(optimizer_t, epoch, args)
            if optimizer_s is not None:
                adjust_learning_rateS(optimizer_s, epoch, args)
        else:
            raise NameError("lrsch name error")
        ###############################################################################

        lr = (optimizer_t.param_groups[0]["lr"]
              if optimizer_t else optimizer_s.param_groups[0]["lr"])
        need_hour, need_mins, need_secs = convert_secs2time(
            epoch_time.val * (epochs - epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(
            need_hour, need_mins, need_secs)
        logger.log(' [{:s}] :: {:3d}/{:3d} ----- [{:s}] {:s} LR={:}'.format(
            args.smodel_name, epoch, epochs, time_string(), need_time, lr))

        train(train_loader, model, criterion, optimizer_t, optimizer_s, epoch,
              stage, logger, args)
        global_step = (epoch + 1) * len(train_loader) - 1
        valid_top1 = valid(valid_loader, model, criterion, epoch, global_step,
                           stage=stage, logger=logger, args=args)

        if epoch == 0 or best_top1 < valid_top1:
            best_top1 = valid_top1
            is_best = True
        else:
            is_best = False
        if epoch >= 89:
            utils.save_checkpoint(model,
                                  logger.path('info'),
                                  is_best=is_best,
                                  pre=args.aim + "_" + "epoch_" + str(epoch) + "_" + stage)

        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("Final best valid Prec@1: {:.4%}".format(best_top1))
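# `adjust_learning_rateD` (the "cosine" branch above) is not defined in this
# file. A minimal sketch under the usual half-cosine annealing assumption,
# decaying from lr_max at epoch 0 toward lr_min at the final epoch; treat it
# as illustrative, not this repo's actual implementation:
import math


def adjust_learning_rateD(optimizer, epoch, epochs, lr_max, lr_min):
    lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * epoch / epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr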
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    set_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        set_seed(args.local_rank)
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    if args.local_rank == 0:
        logging.info("args = {}".format(args))
        logging.info("unparsed_args = {}".format(unparsed))
        logging.info("distributed = {}".format(args.distributed))
        logging.info("opt_level = {}".format(args.opt_level))
        logging.info("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32))
        logging.info("loss_scale = {}".format(args.loss_scale))
        logging.info("CUDNN VERSION: {}".format(torch.backends.cudnn.version()))

    # create model
    if args.local_rank == 0:
        logging.info('parsing the architecture')
    if args.model_path and os.path.isfile(args.model_path):
        op_weights, depth_weights = get_op_and_depth_weights(args.model_path)
        parsed_arch = parse_architecture(op_weights, depth_weights)
        mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, parsed_arch, mc_num_dddict, None,
                        args.dropout_rate, args.drop_connect_rate)
    elif args.config_path and os.path.isfile(args.config_path):
        with open(args.config_path, 'r') as f:
            model_config = json.load(f)
        model = NetworkCfg(args.num_classes, model_config, None,
                           args.dropout_rate, args.drop_connect_rate)
    else:
        raise Exception('invalid --model_path and --config_path')

    if args.sync_bn:
        if args.local_rank == 0:
            logging.info("using apex synced BN")
        model = parallel.convert_syncbn_model(model)
    model = (model.cuda().to(memory_format=memory_format)
             if memory_format is not None else model.cuda())
    config = model.config
    if args.local_rank == 0:
        with open(os.path.join(args.save, 'model.config'), 'w') as f:
            json.dump(config, f, indent=4)
        # logging.info(config)
        logging.info("param size = %fMB", count_parameters_in_MB(model))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(args.num_classes, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # initialize Amp
    if args.opt_level is not None:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level,
                                          keep_batchnorm_fp32=args.keep_batchnorm_fp32,
                                          loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication
        # with computation in the backward pass.
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)
    else:
        model = nn.DataParallel(model)

    # define transforms and initialize dataloaders
    batch_size = args.batch_size // args.world_size
    workers = args.workers // args.world_size
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    train_dataset = ImageList(root=args.train_root,
                              list_path=args.train_list,
                              transform=train_transform)
    val_dataset = ImageList(root=args.val_root,
                            list_path=args.val_list,
                            transform=val_transform)
    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               shuffle=(train_sampler is None))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             num_workers=workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             shuffle=False)

    # define learning rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           float(args.epochs))
    best_acc_top1 = 0
    best_acc_top5 = 0
    start_epoch = 0

    # restart from snapshot
    if args.snapshot and os.path.isfile(args.snapshot):
        if args.local_rank == 0:
            logging.info('loading snapshot from {}'.format(args.snapshot))
        checkpoint = torch.load(
            args.snapshot,
            map_location=lambda storage, loc: storage.cuda(args.gpu))
        start_epoch = checkpoint['epoch']
        best_acc_top1 = checkpoint['best_acc_top1']
        best_acc_top5 = checkpoint['best_acc_top5']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.opt_level is not None:
            amp.load_state_dict(checkpoint['amp'])
        # rebuild the scheduler and replay the LR schedule (including the
        # warm-up set/reset) up to start_epoch so it resumes in the right state
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs), last_epoch=0)
        for epoch in range(start_epoch):
            current_lr = scheduler.get_lr()[0]
            if args.local_rank == 0:
                logging.info('Epoch: %d lr %e', epoch, current_lr)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr * (epoch + 1) / 5.0
                if args.local_rank == 0:
                    logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                                 current_lr * (epoch + 1) / 5.0)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
            scheduler.step()

    # the main loop
    for epoch in range(start_epoch, args.epochs):
        current_lr = scheduler.get_lr()[0]
        if args.local_rank == 0:
            logging.info('Epoch: %d lr %e', epoch, current_lr)
        # linear warm-up over the first 5 epochs for large batch sizes
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            if args.local_rank == 0:
                logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                             current_lr * (epoch + 1) / 5.0)

        if args.distributed:
            train_sampler.set_epoch(epoch)

        epoch_start = time.time()
        train_acc, train_obj = train(train_loader, model, criterion_smooth,
                                     optimizer)
        if args.local_rank == 0:
            logging.info('Train_acc: %f', train_acc)

        val_acc_top1, val_acc_top5, val_obj = validate(val_loader, model,
                                                       criterion)
        if args.local_rank == 0:
            logging.info('Val_acc_top1: %f', val_acc_top1)
            logging.info('Val_acc_top5: %f', val_acc_top5)
            logging.info('Epoch time: %ds.', time.time() - epoch_start)

        if args.local_rank == 0:
            is_best = False
            if val_acc_top1 > best_acc_top1:
                best_acc_top1 = val_acc_top1
                best_acc_top5 = val_acc_top5
                is_best = True
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'best_acc_top5': best_acc_top5,
                    'optimizer': optimizer.state_dict(),
                    'amp': amp.state_dict() if args.opt_level is not None else None,
                }, is_best, args.save)

        # restore the scheduler-chosen LR before stepping the scheduler
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr
        scheduler.step()
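
# NOTE (hypothetical refactor, not from the source): the warm-up set/reset
# pattern above is repeated in several places. One way to factor it out is a
# helper that scales the scheduler's LR linearly over the first warmup_epochs
# epochs and is a no-op afterwards:
def apply_linear_warmup(optimizer, scheduler_lr, epoch, warmup_epochs=5):
    # epoch 0 -> scheduler_lr / warmup_epochs; epoch warmup_epochs - 1 -> scheduler_lr
    scale = min(1.0, (epoch + 1) / float(warmup_epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = scheduler_lr * scale
    return scheduler_lr * scale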
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    set_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True
    logging.info("args = %s", args)
    logging.info("unparsed_args = %s", unparsed)

    # create model
    logging.info('parsing the architecture')
    if args.model_path and os.path.isfile(args.model_path):
        op_weights, depth_weights = get_op_and_depth_weights(args.model_path)
        parsed_arch = parse_architecture(op_weights, depth_weights)
        mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, parsed_arch, mc_num_dddict, None,
                        args.dropout_rate, args.drop_connect_rate)
    elif args.config_path and os.path.isfile(args.config_path):
        with open(args.config_path, 'r') as f:
            model_config = json.load(f)
        model = NetworkCfg(args.num_classes, model_config, None,
                           args.dropout_rate, args.drop_connect_rate)
    else:
        raise Exception('invalid --model_path and --config_path')
    model = nn.DataParallel(model).cuda()
    config = model.module.config
    with open(os.path.join(args.save, 'model.config'), 'w') as f:
        json.dump(config, f, indent=4)
    # logging.info(config)
    logging.info("param size = %fMB", count_parameters_in_MB(model))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(args.num_classes, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # define transforms and initialize dataloaders
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    train_queue = torch.utils.data.DataLoader(ImageList(
        root=args.train_root,
        list_path=args.train_list,
        transform=train_transform,
    ),
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=args.workers)
    val_queue = torch.utils.data.DataLoader(ImageList(
        root=args.val_root,
        list_path=args.val_list,
        transform=val_transform,
    ),
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            pin_memory=True,
                                            num_workers=args.workers)

    # define learning rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           float(args.epochs))
    best_acc_top1 = 0
    best_acc_top5 = 0
    start_epoch = 0

    # restart from snapshot
    if args.snapshot:
        logging.info('loading snapshot from {}'.format(args.snapshot))
        checkpoint = torch.load(args.snapshot)
        start_epoch = checkpoint['epoch']
        best_acc_top1 = checkpoint['best_acc_top1']
        best_acc_top5 = checkpoint['best_acc_top5']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # rebuild the scheduler and replay the LR schedule (including the
        # warm-up set/reset) up to start_epoch so it resumes in the right state
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs), last_epoch=0)
        for epoch in range(start_epoch):
            current_lr = scheduler.get_lr()[0]
            logging.info('Epoch: %d lr %e', epoch, current_lr)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr * (epoch + 1) / 5.0
                logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                             current_lr * (epoch + 1) / 5.0)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
            scheduler.step()

    # the main loop
    for epoch in range(start_epoch, args.epochs):
        current_lr = scheduler.get_lr()[0]
        logging.info('Epoch: %d lr %e', epoch, current_lr)
        # linear warm-up over the first 5 epochs for large batch sizes
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                         current_lr * (epoch + 1) / 5.0)

        epoch_start = time.time()
        train_acc, train_obj = train(train_queue, model, criterion_smooth,
                                     optimizer)
        logging.info('Train_acc: %f', train_acc)

        val_acc_top1, val_acc_top5, val_obj = validate(val_queue, model,
                                                       criterion)
        logging.info('Val_acc_top1: %f', val_acc_top1)
        logging.info('Val_acc_top5: %f', val_acc_top5)
        logging.info('Epoch time: %ds.', time.time() - epoch_start)

        is_best = False
        if val_acc_top1 > best_acc_top1:
            best_acc_top1 = val_acc_top1
            best_acc_top5 = val_acc_top5
            is_best = True
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc_top1': best_acc_top1,
                'best_acc_top5': best_acc_top5,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.save)

        # restore the scheduler-chosen LR before stepping the scheduler
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr
        scheduler.step()
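
# NOTE (hypothetical sketch): CrossEntropyLabelSmooth is imported from elsewhere
# in this repo. A minimal implementation of the standard label-smoothing cross
# entropy it is assumed to compute (epsilon mass spread uniformly over classes):
import torch
import torch.nn as nn


class CrossEntropyLabelSmooth(nn.Module):
    def __init__(self, num_classes, epsilon):
        super(CrossEntropyLabelSmooth, self).__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        # inputs: (N, C) logits; targets: (N,) class indices
        log_probs = self.logsoftmax(inputs)
        one_hot = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        smoothed = (1 - self.epsilon) * one_hot + self.epsilon / self.num_classes
        return (-smoothed * log_probs).mean(0).sum()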