def test(self, task, seed, iteration_num, render, load_iter=None, debug=False):
    set_seed(seed)
    iteration_num = int(iteration_num)
    gt.reset()
    gt.set_def_unique(False)

    # Restore saved meta-parameters and resume from the stored iteration.
    start_iter = 0
    params = self.logger.load_params(load_iter)
    start_iter = self._set_params(params)
    self.theta.train()
    extra_data = self.logger.load_extra_data()
    self._set_extra_data(extra_data)

    rollout = []
    state = task.reset()
    self.controller.set_task(task)
    for i in gt.timed_for(range(start_iter, iteration_num), save_itrs=True):
        t = 0
        done = False
        reward_sum = 0
        state = task.reset()
        while not done:
            # Adapt theta on the M most recent transitions before planning.
            past_traj = [r[-self.M:] for r in rollout]
            if past_traj:
                for _ in range(self.adaptation_update_num):
                    loss = self._compute_adaptation_loss(self.theta, past_traj)
                    zero_grad(self.theta.parameters())
                    self._meta_update(loss)
            action = self.controller.plan(self.theta, state, None, debug)
            next_state, reward, done, _ = task.step(action)
            reward_sum += reward
            if render:
                task.render()
            # Wrap scalar actions so the rollout buffer always stores sequences.
            if action.shape == ():
                action = [action]
            rollout = _aggregate_rollout(rollout, state, action, next_state)
            state = next_state
            t += 1
            if done:
                rollout = []
                state = task.reset()
        print('Iteration:', i, 'Reward:', reward_sum, 'Traj len:', t)
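# set_seed() is called throughout these scripts but its definition is not
# part of this excerpt. Below is a minimal sketch of what such a helper
# typically looks like; the actual implementation here may differ.
import random

import numpy as np
import torch


def set_seed(seed):
    # Seed every RNG that can influence a run so results are reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)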
from models.base_block_depth import FeatClassifier, BaseClassifier
from models.resnet18_inception_depth_4 import resnet18_inception_depth_4
from models.resnet18_self_attention_depth_34 import resnet18_self_attention_depth_34
from models.resnet18_self_attention_depth_34_version2 import resnet18_self_attention_depth_34_version2
from models.resnet18_inception_depth_4_wrap import resnet18_inception_depth_4_wrap
from models.ours import ours
from models.resnet_depth import resnet_depth
from models.resnet_attention import resnet_attention
from models.resnet18_self_mutual_attention import resnet18_self_mutual_attention
'''
from batch_engine import valid_trainer, batch_trainer
from models.base_block import FeatClassifier, BaseClassifier
from models.resnet18_depth import resnet18_depth
from dataset.AttrDataset_depth import AttrDataset, get_transform
'''

set_seed(605)


def main(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device
    print('load the model from: ' + args.save_path)
    exp_dir = os.path.join(args.save_path, args.dataset, args.dataset,
                           'img_model/ckpt_max.pth')

    train_tsfm, valid_tsfm = get_transform(args)
    # pdb.set_trace()
    valid_set = AttrDataset(args=args, split=args.valid_split,
                            transform=valid_tsfm)
    valid_loader = DataLoader(
        dataset=valid_set,
        batch_size=args.batchsize,
        shuffle=False,
        num_workers=4,
    )
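# get_transform() is imported from the dataset module (see the commented-out
# import above) and is not shown here. A plausible sketch of its shape is
# below; args.height and args.width are assumed attributes, and the real
# transform pipeline in dataset.AttrDataset_depth may differ.
import torchvision.transforms as T


def get_transform(args):
    # Standard ImageNet normalization statistics.
    normalize = T.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    train_tsfm = T.Compose([
        T.Resize((args.height, args.width)),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalize,
    ])
    valid_tsfm = T.Compose([
        T.Resize((args.height, args.width)),
        T.ToTensor(),
        normalize,
    ])
    return train_tsfm, valid_tsfm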
def main():
    assert torch.cuda.is_available(), 'need gpu to train network!'
    torch.cuda.empty_cache()

    args = parse_args()
    sys.path.append(args.work_dir)
    from train_config import config
    log_dir = os.path.join(args.work_dir, 'log')
    checkpoint_dir = os.path.join(args.work_dir, 'checkpoints')
    resume_model = os.path.join(checkpoint_dir, 'latest.pth')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    global logger
    logger = get_logger('train', log_dir)

    set_seed(config.seed)

    local_rank = args.local_rank
    # start init process
    if config.distributed:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        torch.cuda.set_device(local_rank)

    init_fn = functools.partial(worker_seed_init_fn,
                                num_workers=config.num_workers,
                                local_rank=local_rank,
                                seed=config.seed)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        config.train_dataset, shuffle=True) if config.distributed else None
    collater = SegmentationCollater()
    train_loader = DataLoader(config.train_dataset,
                              batch_size=config.batch_size,
                              shuffle=(train_sampler is None),
                              pin_memory=True,
                              num_workers=config.num_workers,
                              collate_fn=collater.next,
                              sampler=train_sampler,
                              worker_init_fn=init_fn)
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        config.val_dataset, shuffle=False) if config.distributed else None
    val_loader = DataLoader(config.val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers,
                            collate_fn=collater.next,
                            sampler=val_sampler)

    for key, value in config.__dict__.items():
        if not key.startswith('__'):
            if key not in [
                    'model', 'criterion', 'decoder', 'train_dataset',
                    'val_dataset'
            ]:
                log_info = f'{key}: {value}'
                if (config.distributed and local_rank == 0) or not config.distributed:
                    logger.info(log_info)

    gpus_type, gpus_num = torch.cuda.get_device_name(), torch.cuda.device_count()
    log_info = f'gpus_type: {gpus_type}, gpus_num: {gpus_num}'
    if (config.distributed and local_rank == 0) or not config.distributed:
        logger.info(log_info)

    model = config.model.cuda()
    criterion = config.criterion.cuda()
    decoder = config.decoder.cuda()

    # parameters need to be updated by the optimizer;
    # buffers don't need to be updated by the optimizer
    for name, param in model.named_parameters():
        log_info = f'name: {name}, grad: {param.requires_grad}'
        if (config.distributed and local_rank == 0) or not config.distributed:
            logger.info(log_info)
    for name, buffer in model.named_buffers():
        log_info = f'name: {name}, grad: {buffer.requires_grad}'
        if (config.distributed and local_rank == 0) or not config.distributed:
            logger.info(log_info)

    optimizer = build_optimizer(config, model)
    scheduler = build_scheduler(config, optimizer)
    model = build_training_mode(config, model, optimizer)

    start_epoch = 1
    # automatically resume training if a checkpoint model exists
    if os.path.exists(resume_model):
        checkpoint = torch.load(resume_model, map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        saved_epoch = checkpoint['epoch']
        start_epoch += saved_epoch
        test_loss, lr = checkpoint['test_loss'], checkpoint['lr']
        log_info = (f'resuming model from {resume_model}. '
                    f'resume_epoch: {saved_epoch}, test_loss: {test_loss:.4f}, '
                    f'lr: {lr:.6f}')
        if (config.distributed and local_rank == 0) or not config.distributed:
            logger.info(log_info)

    # calculate training time
    start_time = time.time()
    best_test_loss = 100000000.

    for epoch in range(start_epoch, config.epochs + 1):
        torch.cuda.empty_cache()
        if config.distributed:
            train_sampler.set_epoch(epoch)
        loss = train_segmentation(train_loader, model, criterion, optimizer,
                                  scheduler, epoch, logger, config)
        log_info = f'train: epoch {epoch:0>3d}, total_loss: {loss:.4f}'
        if (config.distributed and local_rank == 0) or not config.distributed:
            logger.info(log_info)

        test_loss = None
        if epoch in config.eval_epoch or epoch == config.epochs:
            test_loss = compute_segmentation_test_loss(val_loader, model,
                                                       criterion)
            log_info = f'eval: epoch: {epoch:0>3d}, test_loss: {test_loss:.4f}'
            if (config.distributed and local_rank == 0) or not config.distributed:
                logger.info(log_info)

        if (config.distributed and local_rank == 0) or not config.distributed:
            # save best test loss model and each epoch checkpoint
            if test_loss and test_loss < best_test_loss:
                torch.save(model.module.state_dict(),
                           os.path.join(checkpoint_dir, 'best.pth'))
                best_test_loss = test_loss
            torch.save(
                {
                    'epoch': epoch,
                    'test_loss': best_test_loss,
                    'lr': scheduler.get_lr()[0],
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                }, os.path.join(checkpoint_dir, 'latest.pth'))

    if (config.distributed and local_rank == 0) or not config.distributed:
        if os.path.exists(os.path.join(checkpoint_dir, 'best.pth')):
            os.rename(
                os.path.join(checkpoint_dir, 'best.pth'),
                os.path.join(
                    checkpoint_dir,
                    f'{config.network}-epoch{epoch}-best_test_loss{best_test_loss:.3f}.pth'
                ))

    training_time = (time.time() - start_time) / 3600
    flops, params = compute_flops_and_params(config, model)
    log_info = (f'train done. model: {config.network}, flops: {flops}, '
                f'params: {params}, training time: {training_time:.3f} hours, '
                f'best_test_loss: {best_test_loss:.3f}')
    if (config.distributed and local_rank == 0) or not config.distributed:
        logger.info(log_info)
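# worker_seed_init_fn is bound with functools.partial above and then invoked
# by each DataLoader worker with its worker_id. A plausible sketch consistent
# with that call signature; the real helper may derive seeds differently.
import random

import numpy as np


def worker_seed_init_fn(worker_id, num_workers, local_rank, seed):
    # Give every worker on every rank a distinct, deterministic seed.
    worker_seed = seed + local_rank * num_workers + worker_id
    random.seed(worker_seed)
    np.random.seed(worker_seed)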
def main(config):
    os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU
    if not config.EVAL_MODE:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_test.txt'))
    print("==========\nConfig:{}\n==========".format(config))
    print("Currently using GPU {}".format(config.GPU))
    # Set random seed
    set_seed(config.SEED)

    # Build dataloader
    trainloader, queryloader, galleryloader, num_classes = build_dataloader(
        config)
    # Build model
    model, classifier = build_model(config, num_classes)
    # Build classification and pairwise loss
    criterion_cla, criterion_pair = build_losses(config)

    # Build optimizer
    parameters = list(model.parameters()) + list(classifier.parameters())
    if config.TRAIN.OPTIMIZER.NAME == 'adam':
        optimizer = optim.Adam(
            parameters,
            lr=config.TRAIN.OPTIMIZER.LR,
            weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'adamw':
        optimizer = optim.AdamW(
            parameters,
            lr=config.TRAIN.OPTIMIZER.LR,
            weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=config.TRAIN.OPTIMIZER.LR,
                              momentum=0.9,
                              weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY,
                              nesterov=True)
    else:
        raise KeyError("Unknown optimizer: {}".format(
            config.TRAIN.OPTIMIZER.NAME))
    # Build lr_scheduler
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=config.TRAIN.LR_SCHEDULER.STEPSIZE,
        gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE)

    start_epoch = config.TRAIN.START_EPOCH
    if config.MODEL.RESUME:
        print("Loading checkpoint from '{}'".format(config.MODEL.RESUME))
        checkpoint = torch.load(config.MODEL.RESUME)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']

    model = nn.DataParallel(model).cuda()
    classifier = nn.DataParallel(classifier).cuda()

    if config.EVAL_MODE:
        print("Evaluate only")
        test(model, queryloader, galleryloader)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")
    for epoch in range(start_epoch, config.TRAIN.MAX_EPOCH):
        start_train_time = time.time()
        train(epoch, model, classifier, criterion_cla, criterion_pair,
              optimizer, trainloader)
        train_time += round(time.time() - start_train_time)

        if (epoch+1) > config.TEST.START_EVAL and config.TEST.EVAL_STEP > 0 and \
           (epoch+1) % config.TEST.EVAL_STEP == 0 or (epoch+1) == config.TRAIN.MAX_EPOCH:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            state_dict = model.module.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(config.OUTPUT,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

        scheduler.step()

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
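# save_checkpoint() is called above with (state, is_best, fpath) but is not
# defined in this excerpt. A common implementation writes the per-epoch
# checkpoint and copies it to a stable "best" file when is_best is set; the
# 'best_model.pth.tar' name below is an assumption.
import os
import shutil

import torch


def save_checkpoint(state, is_best, fpath):
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    torch.save(state, fpath)
    if is_best:
        # Keep a fixed alias pointing at the best-performing checkpoint.
        shutil.copy(fpath,
                    os.path.join(os.path.dirname(fpath), 'best_model.pth.tar'))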
def main():
    assert torch.cuda.is_available(), 'need gpu to train network!'
    torch.cuda.empty_cache()

    args = parse_args()
    sys.path.append(args.work_dir)
    from test_config import config
    log_dir = os.path.join(args.work_dir, 'log')

    set_seed(config.seed)

    collater = SegmentationCollater()
    val_loader = DataLoader(config.val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers,
                            collate_fn=collater.next)

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logger = get_logger('test', log_dir)

    for key, value in config.__dict__.items():
        if not key.startswith('__'):
            if key not in [
                    'model', 'criterion', 'decoder', 'train_dataset',
                    'val_dataset'
            ]:
                log_info = f'{key}: {value}'
                logger.info(log_info)

    gpus_type, gpus_num = torch.cuda.get_device_name(), torch.cuda.device_count()
    log_info = f'gpus_type: {gpus_type}, gpus_num: {gpus_num}'
    logger.info(log_info)

    model = config.model
    decoder = config.decoder

    if config.trained_model_path:
        saved_model = torch.load(os.path.join(BASE_DIR,
                                              config.trained_model_path),
                                 map_location=torch.device('cpu'))
        model.load_state_dict(saved_model)

    flops, params = compute_flops_and_params(config, model)
    log_info = f'model: {config.network}, flops: {flops}, params: {params}'
    logger.info(log_info)

    model = model.cuda()
    decoder = decoder.cuda()
    model = nn.DataParallel(model)

    result_dict = validate_segmentation(config.val_dataset, val_loader, model,
                                        decoder, config)
    log_info = 'eval_result: '
    if result_dict:
        for key, value in result_dict.items():
            log_info += f'{key}: {value}, '
    else:
        log_info += 'no target detected in testset images!'
    logger.info(log_info)

    return
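# compute_flops_and_params() is not defined in this excerpt. One common way
# to implement it is with the thop package, profiling a single dummy input;
# config.input_image_size is an assumed attribute, and thop reports MACs,
# which many repos label as FLOPs.
import torch
from thop import clever_format, profile


def compute_flops_and_params(config, model):
    device = next(model.parameters()).device
    dummy = torch.randn(1, 3, config.input_image_size,
                        config.input_image_size).to(device)
    flops, params = profile(model, inputs=(dummy, ), verbose=False)
    # Render raw counts as human-readable strings, e.g. '4.134G', '25.557M'.
    flops, params = clever_format([flops, params], '%.3f')
    return flops, params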
def main():
    assert torch.cuda.is_available(), 'need gpu to train network!'
    torch.cuda.empty_cache()

    args = parse_args()
    sys.path.append(args.work_dir)
    from test_config import config
    log_dir = os.path.join(args.work_dir, 'log')

    set_seed(config.seed)

    local_rank = args.local_rank
    # start init process
    if config.distributed:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        torch.cuda.set_device(local_rank)

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        config.val_dataset, shuffle=False) if config.distributed else None
    val_loader = DataLoader(config.val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=False,
                            num_workers=config.num_workers,
                            sampler=val_sampler)

    if (config.distributed and local_rank == 0) or not config.distributed:
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        global logger
        logger = get_logger('test', log_dir)

    for key, value in config.__dict__.items():
        if not key.startswith('__'):
            if key not in ['model', 'criterion']:
                log_info = f'{key}: {value}'
                if (config.distributed and local_rank == 0) or not config.distributed:
                    logger.info(log_info)

    gpus_type, gpus_num = torch.cuda.get_device_name(), torch.cuda.device_count()
    log_info = f'gpus_type: {gpus_type}, gpus_num: {gpus_num}'
    if (config.distributed and local_rank == 0) or not config.distributed:
        logger.info(log_info)

    model = config.model
    criterion = config.criterion

    if config.trained_model_path:
        saved_model = torch.load(os.path.join(BASE_DIR,
                                              config.trained_model_path),
                                 map_location=torch.device('cpu'))
        model.load_state_dict(saved_model)

    flops, params = compute_flops_and_params(config, model)
    log_info = f'model: {config.network}, flops: {flops}, params: {params}'
    if (config.distributed and local_rank == 0) or not config.distributed:
        logger.info(log_info)

    model = model.cuda()
    criterion = criterion.cuda()

    if config.distributed:
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank],
                                                    output_device=local_rank)
    else:
        model = nn.DataParallel(model)

    top1, top5, loss, per_image_load_time, per_image_inference_time = validate_classification(
        val_loader, model, criterion, config)
    log_info = (f'top1: {top1:.3f}%, top5: {top5:.3f}%, loss: {loss:.4f}, '
                f'per_image_load_time: {per_image_load_time:.3f}ms, '
                f'per_image_inference_time: {per_image_inference_time:.3f}ms')
    if (config.distributed and local_rank == 0) or not config.distributed:
        logger.info(log_info)

    return
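# The guard "(config.distributed and local_rank == 0) or not config.distributed"
# is repeated before nearly every logger.info call in these scripts. Factoring
# it into a small helper keeps call sites readable; log_on_master is a name
# introduced here, not one from this codebase.
def log_on_master(logger, config, local_rank, msg):
    # Only rank 0 logs in distributed mode; a single process always logs.
    if (config.distributed and local_rank == 0) or not config.distributed:
        logger.info(msg)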
if args.tb_path == '':
    args.tb_path = args.save
writer = SummaryWriter(args.tb_path)
utils.set_logging(args.save)

if not torch.cuda.is_available():
    logging.info('no gpu device available')
    sys.exit(1)

cudnn.benchmark = True
cudnn.enabled = True
if config.train_params.use_seed:
    utils.set_seed(config.train_params.seed)
logging.info("args = %s", args)
logging.info('Training with config:')
logging.info(pprint.pformat(config))

config.net_config, net_type = utils.load_net_config(
    os.path.join(args.load_path, 'net_config'))
derivedNetwork = getattr(model_derived, '%s_Net' % net_type.upper())
model = derivedNetwork(config.net_config, config=config)
model.eval()

if hasattr(model, 'net_config'):
    logging.info("Network Structure: \n" +
                 '|\n'.join(map(str, model.net_config)))
params = utils.count_parameters_in_MB(model)
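# utils.count_parameters_in_MB() follows a convention popularized by the
# DARTS codebase: count parameter elements (skipping any auxiliary-head
# weights) and divide by 1e6. A sketch of that conventional form; the local
# utils module may implement it differently.
def count_parameters_in_MB(model):
    return sum(v.numel() for name, v in model.named_parameters()
               if 'auxiliary' not in name) / 1e6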