def main(args): assert torch.cuda.is_available(), 'CUDA is not available' torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # if resume is True, resume configs and checkpoint from the existing files. if args.search_resume: # args.resume_file path to ... .../EXP-time # resume experiment in a new File, rather than the same file. # configs resume assert os.path.exists( args.resume_file ), 'cannot find the resume file {:}, please re-check'.format( args.resume_file) config_file_path = os.path.join(args.resume_file, 'search.config') assert os.path.exists( config_file_path ), "the path to configs file path {:} is not exists".format( config_file_path) f = open(config_file_path, 'r') config_dict = json.load(f) f.close() configs_resume(args, config_dict, 'search') # new EXP file initialize resume_EXP_time = config_dict['path'].split('/')[-1] resume_exp_name = config_dict['path'].split('/')[-2] EXP_time = time_for_file() args.path = os.path.join( args.path, args.exp_name, EXP_time + '-resume-{:}'.format(resume_exp_name + '-' + resume_EXP_time)) os.makedirs(args.path, exist_ok=True) create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab') #save_configs(args.__dict__, args.path, 'search') #logger = prepare_logger(args) #logger.log("=> loading configs from the file '{:}' start.".format(args.resume_file), mode='info') torch.set_num_threads(args.workers) set_manual_seed(args.random_seed) else: # training initialization torch.set_num_threads(args.workers) set_manual_seed(args.random_seed) EXP_time = time_for_file() args.path = os.path.join(args.path, args.exp_name, EXP_time) os.makedirs(args.path, exist_ok=True) create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab') # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion if args.weight_optimizer_type == 'SGD': weight_optimizer_params = { 'momentum': args.momentum, 'nesterov': args.nesterov, 'weight_decay': args.weight_decay, } elif args.weight_optimizer_type == 'RMSprop': weight_optimizer_params = { 'momentum': args.momentum, 'weight_decay': args.weight_decay, } else: weight_optimizer_params = None if args.scheduler == 'cosine': scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min} elif args.scheduler == 'multistep': scheduler_params = { 'milestones': args.milestones, 'gammas': args.gammas } elif args.scheduler == 'exponential': scheduler_params = {'gamma': args.gamma} elif args.scheduler == 'linear': scheduler_params = {'min_lr': args.min_lr} else: scheduler_params = None if args.criterion == 'SmoothSoftmax': criterion_params = {'label_smooth': args.label_smoothing} else: criterion_params = None # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion. args.optimizer_config = { 'optimizer_type': args.weight_optimizer_type, 'optimizer_params': weight_optimizer_params, 'scheduler': args.scheduler, 'scheduler_params': scheduler_params, 'criterion': args.criterion, 'criterion_params': criterion_params, 'init_lr': args.init_lr, 'warmup_epoch': args.warmup_epochs, 'epochs': args.epochs, 'class_num': args.nb_classes, } # arch_optimizer_config if args.arch_optimizer_type == 'adam': args.arch_optimizer_params = { 'betas': (args.arch_adam_beta1, args.arch_adam_beta2), 'eps': args.arch_adam_eps } else: args.arch_optimizer_params = None # related to entropy constraint loss # TODO: pay attention, use separate lambda for cell_entropy and network_entropy. 
if args.reg_loss_type == 'add#linear': args.reg_loss_params = { 'lambda1': args.reg_loss_lambda1, 'lambda2': args.reg_loss_lambda2, } elif args.reg_loss_type == 'add#linear#linearschedule': args.reg_loss_params = { 'lambda1': args.reg_loss_lambda1, 'lambda2': args.reg_loss_lambda2, } elif args.reg_loss_type == 'mul#log': args.reg_loss_params = { 'alpha': args.reg_loss_alpha, 'beta': args.reg_loss_beta } else: args.reg_loss_params = None # perform config save, for run_configs and arch_search_configs save_configs(args.__dict__, args.path, 'search') logger = prepare_logger(args) logger.log("=> loading configs from the file '{:}' start.".format( args.resume_file) if args.search_resume else '=> train-search phase initialization done', mode='info') #print(args.optimizer_config) run_config = RunConfig(**args.__dict__) arch_search_config = ArchSearchConfig(**args.__dict__) # args.bn_momentum and args.bn_eps are not used super_network = GumbelAutoDeepLab(args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, args.nb_layers, args.bn_momentum, args.bn_eps, args.search_space, logger, affine=False) # calculate init entropy _, network_index = super_network.get_network_arch_hardwts( ) # set self.hardwts again _, aspp_index = super_network.get_aspp_hardwts_index() single_path = super_network.sample_single_path(args.nb_layers, aspp_index, network_index) cell_arch_entropy, network_arch_entropy, entropy = super_network.calculate_entropy( single_path) logger.log('=> entropy : {:}'.format(entropy), mode='info') vis_init_params = { 'cell_entropy': cell_arch_entropy, 'network_entropy': network_arch_entropy, 'entropy': entropy, } #vis_elements = args.elements #vis_elements.extend(['cell_entropy', 'network_entropy', 'entropy']) #args.elements = vis_elements args.vis_init_params = vis_init_params if args.open_vis: vis = visdomer(args.port, args.server, args.exp_name, args.compare_phase, args.elements, init_params=args.vis_init_params) else: vis = None ''' from exp.autodeeplab.auto_deeplab import AutoDeeplab super_network = AutoDeeplab(args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, args.nb_layers, args.search_space, logger, affine=False) ''' ''' from exp.fixed_network_level.supernetwork import FixedNetwork super_network = FixedNetwork(args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, args.nb_layers, args.search_space, logger, affine=False) ''' arch_search_run_manager = ArchSearchRunManager(args.path, super_network, run_config, arch_search_config, logger, vis) display_all_families_information(args, 'search', arch_search_run_manager, logger) ''' # get_model_infos, perform inference # TODO: modify the way of forward into gdas_forward flop, param = get_model_infos(super_network, [1, 3, 512, 512]) print('||||||| FLOPS & PARAMS |||||||') print('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param)) ''' # 1. resume warmup phase # 2. resume search phase # 3. add last_info log × not last_info, every time, the saved_file name is not consistent, should given resume_file # 1. given EXP file time completed :: resume_file :: ->EXP-time # 2. get configs, and load config completed # 3. resume checkpoint completed # TODO: have issue in resume semantics. After resume, it will allocate more GPU memory than the normal one, which will raise OOM in search phase. 
if args.search_resume: if os.path.exists(args.resume_file): # resume_file :: path to EXP-time logger.log("=> loading checkpoint of the file '{:}' start".format( args.resume_file), mode='info') warm_up_checkpoint = os.path.join( args.resume_file, 'checkpoints', 'seed-{:}-warm.pth'.format(args.random_seed)) search_checkpoint = os.path.join( args.resume_file, 'checkpoints', 'seed-{:}-search.pth'.format(args.random_seed)) if args.resume_from_warmup == False: # resume checkpoint in search phase checkpoint = torch.load(search_checkpoint) super_network.load_state_dict(checkpoint['state_dict']) arch_search_run_manager.run_manager.optimizer.load_state_dict( checkpoint['weight_optimizer']) arch_search_run_manager.run_manager.scheduler.load_state_dict( checkpoint['weight_scheduler']) arch_search_run_manager.arch_optimizer.load_state_dict( checkpoint['arch_optimizer']) arch_search_run_manager.run_manager.monitor_metric = checkpoint[ 'best_monitor'][0] arch_search_run_manager.run_manager.best_monitor = checkpoint[ 'best_monitor'][1] arch_search_run_manager.warmup = checkpoint['warmup'] arch_search_run_manager.start_epoch = checkpoint[ 'start_epochs'] # pay attention:: start_epochs and warmup_epoch in nas_manager logger.log( "=> loading checkpoint of the file '{:}' start with {:}-th epochs in search phase" .format(search_checkpoint, checkpoint['start_epochs']), mode='info') else: # resume checkpoint in warmup phase checkpoint = torch.load(warm_up_checkpoint) super_network.load_state_dict(checkpoint['state_dict']) arch_search_run_manager.run_manager.optimizer.load_state_dict( checkpoint['weight_optimizer']) arch_search_run_manager.run_manager.scheduler.load_state_dict( checkpoint['weight_scheduler']) arch_search_run_manager.warmup = checkpoint['warmup'] arch_search_run_manager.warmup_epoch = checkpoint[ 'warmup_epoch'] logger.log( "=> loading checkpoint of the file '{:}' start with {:}-th epochs in warmup phase" .format(warm_up_checkpoint, checkpoint['warmup_epoch']), mode='info') else: logger.log( "=> can not find the file: {:} please re-confirm it\n" "=> start warm-up and search from scratch... ...".format( args.resume_file), mode='info') else: logger.log("=> start warm-up and search from scratch... ...", mode='info') # torch.autograd.set_detect_anomaly(True) # warm up phase if arch_search_run_manager.warmup: arch_search_run_manager.warm_up(warmup_epochs=args.warmup_epochs) # train search phase arch_search_run_manager.train() logger.close()
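# --------------------------------------------------------------------------
# Illustrative sketch (not the repo's RunConfig implementation): the
# `args.optimizer_config` dict assembled in main() above is only a bag of
# plain values.  A factory like the one below is one plausible way such a
# config could be turned into concrete torch objects; every name here is an
# assumption for illustration only.
import torch
import torch.nn as nn


def build_from_optimizer_config(params, cfg):
    # weight optimizer
    if cfg['optimizer_type'] == 'SGD':
        optimizer = torch.optim.SGD(params, lr=cfg['init_lr'], **cfg['optimizer_params'])
    elif cfg['optimizer_type'] == 'RMSprop':
        optimizer = torch.optim.RMSprop(params, lr=cfg['init_lr'], **cfg['optimizer_params'])
    else:
        raise ValueError('unsupported optimizer {:}'.format(cfg['optimizer_type']))
    # learning-rate scheduler
    if cfg['scheduler'] == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=cfg['scheduler_params']['T_max'],
            eta_min=cfg['scheduler_params']['eta_min'])
    elif cfg['scheduler'] == 'exponential':
        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer, gamma=cfg['scheduler_params']['gamma'])
    else:
        scheduler = None  # 'multistep' / 'linear' are handled by the real RunConfig
    # criterion ('SmoothSoftmax' would need the project's label-smoothing loss)
    criterion = nn.CrossEntropyLoss(ignore_index=255)
    return optimizer, scheduler, criterion
# --------------------------------------------------------------------------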
def main():
    global args, best_prec1
    PID = os.getpid()
    args = parser.parse_args()
    prepare_seed(args.rand_seed)
    if args.timestamp == 'none':
        args.timestamp = "{:}".format(time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time())))
    # Log outputs
    if args.evaluate:
        args.save_dir = args.save_dir + "/Visda17-Res101-evaluate" + \
            "%s/%s" % ('/' + args.resume if args.resume != 'none' else '', args.timestamp)
    else:
        args.save_dir = args.save_dir + \
            "/Visda17-Res101-%s-train.%s-LR%.2E-epoch%d-batch%d-seed%d" % (
                "LWF%.2f" % args.lwf if args.lwf > 0 else "XE", args.train_blocks,
                args.lr, args.epochs, args.batch_size, args.rand_seed) + \
            "%s/%s" % ('/' + args.resume if args.resume != 'none' else '', args.timestamp)
    logger = prepare_logger(args)
    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]),
    }
    kwargs = {'num_workers': 20, 'pin_memory': True}
    trainset = VisDA17(txt_file=os.path.join(args.data, "train/image_list.txt"),
                       root_dir=os.path.join(args.data, "train"),
                       transform=data_transforms['train'])
    valset = VisDA17(txt_file=os.path.join(args.data, "validation/image_list.txt"),
                     root_dir=os.path.join(args.data, "validation"),
                     transform=data_transforms['val'], label_one_hot=True)
    train_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, **kwargs)
    val_loader = DataLoader(valset, batch_size=args.batch_size, shuffle=False, **kwargs)

    model = resnet101(pretrained=True)
    num_ftrs = model.fc.in_features
    fc_layers = nn.Sequential(
        nn.Linear(num_ftrs, 512),
        nn.ReLU(inplace=True),
        nn.Linear(512, args.num_class),
    )
    model.fc_new = fc_layers
    train_blocks = args.train_blocks.split('.')
    # by default, freeze fc and train fc_new instead
    for param in model.fc.parameters():
        param.requires_grad = False
    ##### Freeze several bottom layers (Optional) #####
    non_train_blocks = ['conv1', 'bn1', 'layer1', 'layer2', 'layer3', 'layer4', 'fc']
    for name in train_blocks:
        try:
            non_train_blocks.remove(name)
        except Exception:
            print("cannot find block name %s\nAvailable blocks are: conv1, bn1, layer1, layer2, layer3, layer4, fc" % name)
    for name in non_train_blocks:
        for param in getattr(model, name).parameters():
            param.requires_grad = False

    # Setup optimizer
    factor = 0.1
    sgd_in = []
    for name in train_blocks:
        if name != 'fc':
            sgd_in.append({'params': get_params(model, [name]), 'lr': factor * args.lr})
        else:
            sgd_in.append({'params': get_params(model, ["fc_new"]), 'lr': args.lr})
    base_lrs = [group['lr'] for group in sgd_in]
    optimizer = SGD(sgd_in, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint
    if args.resume != 'none':
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    model = model.cuda()
    model_old = None
    if args.lwf > 0:
        # create a fixed model copy for life-long learning (LwF)
        model_old = resnet101(pretrained=True)
        for param in model_old.parameters():
            param.requires_grad = False
        model_old.eval()
        model_old.cuda()

    if args.evaluate:
        prec1 = validate(val_loader, model)
        print(prec1)
        exit(0)

    # Main training loop
    iter_max = args.epochs * len(train_loader)
    iter_stat = IterNums(iter_max)
    for epoch in range(args.start_epoch, args.epochs):
        print("<< ============== JOB (PID = %d) %s ============== >>" % (PID, args.save_dir))
        logger.log("Epoch: %d" % (epoch + 1))
        # train for one epoch
        train(train_loader, model, optimizer, base_lrs, iter_stat, epoch, logger.writer,
              model_old=model_old, adjust_lr=True)
        # evaluate on validation set
        prec1 = validate(val_loader, model)
        logger.writer.add_scalar("prec", prec1, epoch)
        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(args.save_dir, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best)
    logging.info('Best accuracy: {prec1:.3f}'.format(prec1=best_prec1))
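# --------------------------------------------------------------------------
# Minimal sketch of the block-freezing / per-block learning-rate pattern used
# in main() above, written against torchvision's resnet101 only.  The helper
# name `build_param_groups` and its defaults are illustrative, not part of
# this repo.
import torch.nn as nn
from torchvision.models import resnet101


def build_param_groups(base_lr, train_blocks=('layer4', 'fc'), backbone_factor=0.1):
    model = resnet101(pretrained=True)
    model.fc_new = nn.Sequential(
        nn.Linear(model.fc.in_features, 512),
        nn.ReLU(inplace=True),
        nn.Linear(512, 12),  # 12 = number of VisDA-17 classes
    )
    # freeze everything, then re-enable only the requested blocks
    for param in model.parameters():
        param.requires_grad = False
    groups = []
    for name in train_blocks:
        # 'fc' is trained via the new head fc_new, mirroring main() above
        module = model.fc_new if name == 'fc' else getattr(model, name)
        for param in module.parameters():
            param.requires_grad = True
        lr = base_lr if name == 'fc' else backbone_factor * base_lr
        groups.append({'params': module.parameters(), 'lr': lr})
    return model, groups
# --------------------------------------------------------------------------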
def main(): args = get_args() PID = os.getpid() print("<< ============== JOB (PID = %d) %s ============== >>" % (PID, args.save_dir)) prepare_seed(args.rand_seed) if args.timestamp == 'none': args.timestamp = "{:}".format( time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time()))) torch.set_num_threads(1) # Log outputs args.save_dir = args.save_dir + \ "/Visda17-L2O.train.Res101-%s-train.%s-LR%.2E-epoch%d-batch%d-seed%d"%( "LWF" if args.lwf > 0 else "XE", args.train_blocks, args.lr, args.epochs, args.batch_size, args.rand_seed) + \ "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp) logger = prepare_logger(args) best_prec1 = 0 #### preparation ########################################### data_transforms = { 'train': transforms.Compose([ transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]), 'val': transforms.Compose([ transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]), } kwargs = {'num_workers': 20, 'pin_memory': True} trainset = VisDA17(txt_file=os.path.join(args.data, "train/image_list.txt"), root_dir=os.path.join(args.data, "train"), transform=data_transforms['train']) valset = VisDA17(txt_file=os.path.join(args.data, "validation/image_list.txt"), root_dir=os.path.join(args.data, "validation"), transform=data_transforms['val'], label_one_hot=True) train_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, **kwargs) val_loader = DataLoader(valset, batch_size=args.batch_size, shuffle=False, **kwargs) train_loader_iter = iter(train_loader) current_optimizee_step, prev_optimizee_step = 0, 0 model_old = None if args.lwf > 0: # create a fixed model copy for Life-long learning model_old = resnet101(pretrained=True) for param in model_old.parameters(): param.requires_grad = False model_old.eval() model_old.cuda() ############################################################ ### Agent Settings ######################################## RANDOM = False # False | True | 'init' action_space = np.arange(0, 1.1, 0.1) obs_avg = True _window_size = 1 window_size = 1 if obs_avg else _window_size window_shrink_size = 20 # larger: controller will be updated more frequently sgd_in_names = [ "conv1", "bn1", "layer1", "layer2", "layer3", "layer4", "fc_new" ] coord_size = len(sgd_in_names) ob_name_lstm = ["loss", "loss_kl", "step", "fc_mean", "fc_std"] ob_name_scalar = [] obs_shape = (len(ob_name_lstm) * window_size + len(ob_name_scalar) + coord_size, ) _hidden_size = 20 hidden_size = _hidden_size * len(ob_name_lstm) actor_critic = Policy(coord_size, input_size=(len(ob_name_lstm), len(ob_name_scalar)), action_space=len(action_space), hidden_size=_hidden_size, window_size=window_size) actor_critic.cuda() actor_critic.eval() partial = torch.load(args.agent_load_dir, map_location=lambda storage, loc: storage) state = actor_critic.state_dict() pretrained_dict = {k: v for k, v in partial.items()} state.update(pretrained_dict) actor_critic.load_state_dict(state) ################################################################ _min_iter = 10 # reset optmizee model, optimizer, current_optimizee_step, prev_optimizee_step = prepare_optimizee( args, sgd_in_names, obs_shape, hidden_size, actor_critic, current_optimizee_step, prev_optimizee_step) epoch_size = len(train_loader) total_steps = epoch_size * args.epochs bar_format = '{desc}[{elapsed}<{remaining},{rate_fmt}]' pbar = 
tqdm(range(int(epoch_size * args.epochs)), file=sys.stdout, bar_format=bar_format, ncols=100) _window_size = max( _min_iter, current_optimizee_step + prev_optimizee_step // window_shrink_size) train_loader_iter, obs, loss, loss_kl, fc_mean, fc_std = train_step( args, _window_size, train_loader_iter, train_loader, model, optimizer, obs_avg, args.lr, pbar, current_optimizee_step + prev_optimizee_step, total_steps, model_old=model_old) logger.writer.add_scalar("loss/ce", loss, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar("loss/kl", loss_kl, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar("loss/total", loss + loss_kl, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar("fc/mean", fc_mean, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar("fc/std", fc_std, current_optimizee_step + prev_optimizee_step) current_optimizee_step += _window_size pbar.update(_window_size) prev_obs = obs.unsqueeze(0) prev_hidden = torch.zeros(actor_critic.net.num_recurrent_layers, 1, hidden_size).cuda() for epoch in range(args.epochs): print("\n===== Epoch %d / %d =====" % (epoch + 1, args.epochs)) print("<< ============== JOB (PID = %d) %s ============== >>" % (PID, args.save_dir)) while current_optimizee_step < epoch_size: # Sample actions with torch.no_grad(): if not RANDOM: value, action, action_log_prob, recurrent_hidden_states, distribution = actor_critic.act( prev_obs, prev_hidden, deterministic=False) action = action.squeeze() action_log_prob = action_log_prob.squeeze() value = value.squeeze() for idx in range(len(action)): logger.writer.add_scalar( "action/%s" % sgd_in_names[idx], action[idx], current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar( "entropy/%s" % sgd_in_names[idx], distribution.distributions[idx].entropy(), current_optimizee_step + prev_optimizee_step) optimizer.param_groups[idx]['lr'] = float( action_space[action[idx]]) * args.lr logger.writer.add_scalar( "LR/%s" % sgd_in_names[idx], optimizer.param_groups[idx]['lr'], current_optimizee_step + prev_optimizee_step) else: if RANDOM is True or RANDOM == 'init': for idx in range(coord_size): optimizer.param_groups[idx]['lr'] = float( choice(action_space)) * args.lr if RANDOM == 'init': RANDOM = 'done' for idx in range(coord_size): logger.writer.add_scalar( "LR/%s" % sgd_in_names[idx], optimizer.param_groups[idx]['lr'], current_optimizee_step + prev_optimizee_step) # Obser reward and next obs _window_size = max( _min_iter, current_optimizee_step + prev_optimizee_step // window_shrink_size) _window_size = min(_window_size, epoch_size - current_optimizee_step) train_loader_iter, obs, loss, loss_kl, fc_mean, fc_std = train_step( args, _window_size, train_loader_iter, train_loader, model, optimizer, obs_avg, args.lr, pbar, current_optimizee_step + prev_optimizee_step, total_steps, model_old=model_old) logger.writer.add_scalar( "loss/ce", loss, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar( "loss/kl", loss_kl, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar( "loss/total", loss + loss_kl, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar( "fc/mean", fc_mean, current_optimizee_step + prev_optimizee_step) logger.writer.add_scalar( "fc/std", fc_std, current_optimizee_step + prev_optimizee_step) current_optimizee_step += _window_size pbar.update(_window_size) prev_obs = obs.unsqueeze(0) if not RANDOM: prev_hidden = recurrent_hidden_states prev_optimizee_step += current_optimizee_step 
        current_optimizee_step = 0

        # evaluate on validation set
        prec1 = validate(val_loader, model, args)
        logger.writer.add_scalar("prec", prec1, epoch)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(args.save_dir, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best)

    logging.info('Best accuracy: {prec1:.3f}'.format(prec1=best_prec1))
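# --------------------------------------------------------------------------
# Sketch of how the learned policy's discrete actions are mapped to
# per-parameter-group learning rates in the loop above: each coordinate picks
# an index into `action_space` (scaling factors 0.0 ... 1.0) and the chosen
# factor multiplies the base LR of the matching optimizer group.  A random
# dummy "policy" stands in for the real actor_critic here.
import numpy as np
import torch
import torch.nn as nn


def apply_lr_actions(optimizer, actions, action_space, base_lr):
    # one action index per parameter group / coordinate
    for idx, act in enumerate(actions):
        optimizer.param_groups[idx]['lr'] = float(action_space[act]) * base_lr


if __name__ == '__main__':
    action_space = np.arange(0, 1.1, 0.1)            # 11 scaling factors
    layers = [nn.Linear(8, 8) for _ in range(3)]     # 3 "coordinates"
    opt = torch.optim.SGD([{'params': m.parameters(), 'lr': 0.1} for m in layers], lr=0.1)
    dummy_actions = np.random.randint(0, len(action_space), size=len(layers))
    apply_lr_actions(opt, dummy_actions, action_space, base_lr=0.1)
    print([g['lr'] for g in opt.param_groups])
# --------------------------------------------------------------------------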
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume and not args.evaluation:
        # resume from the last retrain
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the last retrain phase'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')  # config resume from the last retrain
        # get EXP_time of the last retrain, used as a flag
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        Exp_name_last_retrain = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # seed from the last retrain
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    elif not args.retrain_resume and args.evaluation:
        config_file_path = os.path.join(args.evaluation_ckpt, 'retrain.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the best checkpoint'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')
        EXP_time_best_checkpoint = config_dict['path'].split('/')[-1]
        EXP_name_best_checkpoint = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    elif not args.retrain_resume and not args.evaluation:
        # resume from the search phase
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the search phase'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        args.random_seed = config_dict['random_seed']  # get random_seed
        # get EXP_time of the search phase, used as a flag
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_name_search = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # seed from the last retrain phase or the search phase
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    else:
        raise NotImplementedError(
            'invalid mode: retrain_resume {:} evaluation {:}'.format(
                args.retrain_resume, args.evaluation))

    # optimizer params
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    # scheduler params
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {'milestones': args.milestones, 'gammas': args.gammas}
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    # criterion params
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }

    if args.search_space == 'autodeeplab':
        conv_candidates = autodeeplab
    elif args.search_space == 'proxyless':
        conv_candidates = proxyless
    elif args.search_space == 'counter':
        conv_candidates = counter
    elif args.search_space == 'my_search_space':
        conv_candidates = my_search_space
    else:
        raise ValueError('search_space : {:} is not supported'.format(args.search_space))

    # related to entropy constraint loss
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'add#linear#linearschedule':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    logger = prepare_logger(args)
    if args.retrain_resume and not args.evaluation:
        logger.log('=> loading configs {:} from the last retrain phase.'.format(config_file_path), 'info')
    elif not args.retrain_resume and args.evaluation:
        logger.log('=> loading configs {:} from the best retrain checkpoint.'.format(config_file_path), 'info')
    elif not args.retrain_resume and not args.evaluation:
        logger.log('=> loading configs {:} from the search phase.'.format(config_file_path), 'info')
    # save new config, and create logger.
save_configs(args.__dict__, args.path, 'retrain') # create run_config run_config = RunConfig(**args.__dict__) # only open_vis in retrain phrase if args.open_vis: assert args.evaluation == False, 'invalid mode open_vis {:} and open_test {:}'.format( args.open_vis, args.evaluation) vis = visdomer(args.port, args.server, args.exp_name, args.compare_phase, args.elements, init_params=None) else: vis = None #print(args.evaluation) if args.evaluation: assert os.path.exists(args.evaluation_ckpt ), 'cannot find the best checkpoint {:}'.format( args.evaluation_ckpt) checkpoint_path = os.path.join( args.evaluation_ckpt, 'checkpoints', 'seed-{:}-retrain-best.pth'.format(args.random_seed)) checkpoint = torch.load(checkpoint_path) actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[ 'cell_genotypes'] #print(actual_path) #print(cell_genotypes) ''' my_search_space = [ '3x3_SepFacConv1', '5x5_SepFacConv1', '3x3_SepFacConv2', '5x5_SepFacConv2', '3x3_SepFacConv4', '5x5_SepFacConv4',] ''' # 0:4 1:4 2:5 3:5 4:4 5:2 actual_path = [0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2] cell_genotypes = [(0, [[('2<-1', 0), ('2<-0', 3)]]), (2, [[('2<-1', 4), ('2<-0', 1)]]), (7, [[('2<-1', 3), ('2<-0', 0)]]), (15, [[('2<-1', 1), ('2<-0', 2)]]), (27, [[('2<-1', 4), ('2<-0', 3)]]), (38, [[('2<-1', 4), ('2<-0', 0)]]), (48, [[('2<-1', 2), ('2<-0', 5)]]), (60, [[('2<-1', 0), ('2<-0', 1)]]), (73, [[('2<-0', 3), ('2<-1', 3)]]), (84, [[('2<-1', 2), ('2<-0', 1)]]), (94, [[('2<-1', 4), ('2<-0', 2)]]), (102, [[('2<-1', 2), ('2<-0', 5)]])] ''' actual_path = [0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2, 1] cell_genotypes = [(0, [[('2<-1', 4), ('2<-0', 5)]]), (2, [[('2<-1', 3), ('2<-0', 1)]]), (7, [[('2<-1', 2), ('2<-0', 5)]]), (17, [[('2<-0', 1), ('2<-1', 1)]]), (28, [[('2<-1', 4), ('2<-0', 1)]]), (38, [[('2<-1', 4), ('2<-0', 2)]]), (50, [[('2<-1', 5), ('2<-0', 1)]]), (63, [[('2<-1', 4), ('2<-0', 2)]]), (74, [[('2<-1', 1), ('2<-0', 0)]]), (84, [[('2<-1', 3), ('2<-0', 1)]]), (92, [[('2<-1', 4), ('2<-0', 5)]]), (99, [[('2<-1', 0), ('2<-0', 3)]])] ''' normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True) # save new config, and create logger. #save_configs(args.__dict__, args.path, 'retrain') # create run_config #run_config = RunConfig(**args.__dict__) evaluation_run_manager = RunManager(args.path, normal_network, logger, run_config, vis, out_log=True) normal_network.load_state_dict(checkpoint['state_dict']) display_all_families_information(args, 'retrain', evaluation_run_manager, logger) logger.log( '=> loaded the best checkpoint from {:}, start evaluation'.format( checkpoint_path), 'info') evaluation_run_manager.validate(is_test=True, use_train_mode=False) else: # resume from the last retrain if args.retrain_resume: logger.log( '=> Loading checkpoint from {:} of the last retrain phase'. format(args.resume_file), mode='info') # checkpoint_file from the last retrain phase. 
checkpoint_path = os.path.join( args.resume_file, 'checkpoints', 'seed-{:}-retrain.pth'.format(args.random_seed)) assert os.path.exists( checkpoint_path ), 'cannot find retrain checkpoint file {:}'.format( checkpoint_path) checkpoint = torch.load(checkpoint_path) actual_path, cell_genotypes = checkpoint[ 'actual_path'], checkpoint['cell_genotypes'] args.actual_path = actual_path args.cell_genotypes = cell_genotypes normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True) flop, param = get_model_infos(normal_network, [1, 3, 512, 512]) logger.log( '|#################### Network Info ####################|\n' 'FLOPs:{:.2f} M, Params:{:.2f} MB'.format(flop, param), mode='info') # save new config, and create logger. #save_configs(args.__dict__, args.path, 'retrain') # create run_config #run_config = RunConfig(**args.__dict__) retrain_run_manager = RunManager(args.path, normal_network, logger, run_config, vis, out_log=True) normal_network.load_state_dict(checkpoint['state_dict']) display_all_families_information(args, 'retrain', retrain_run_manager, logger) retrain_run_manager.optimizer.load_state_dict( checkpoint['weight_optimizer']) retrain_run_manager.scheduler.load_state_dict( checkpoint['scheduler']) retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0] retrain_run_manager.best_monitor = checkpoint['best_monitor'][1] retrain_run_manager.start_epoch = checkpoint[ 'start_epoch'] # has +1 logger.log( '=> loaded checkpoint file {:} from the last retrain phase, starts with {:}-th epoch' .format(checkpoint_path, checkpoint['start_epoch']), mode='info') else: # from search phrase, load the optimal architecture and perform retrain. arch_checkpoint_path = os.path.join( args.checkpoint_file, 'checkpoints', 'seed-{:}-arch-best.pth'.format(args.random_seed)) # TODO, the best epoch has gotten in advance. #checkpoint_path = os.path.join(args.checkpoint_file, 'checkpoints', 'seed-{:}-search-best.pth'.format(args.random_seed)) #tmp_checkpoint = torch.load(checkpoint_path) #best_epoch = tmp_checkpoint['start_epochs'] - 1 #logger.log('=> best epochs: {:}'.format(best_epoch), mode='info') # get the best_epoch assert os.path.exists( arch_checkpoint_path ), 'cannot find arch_checkpoint file {:} from search phase'.format( arch_checkpoint_path) checkpoint = torch.load(arch_checkpoint_path) actual_path, cell_genotypes = checkpoint[ 'actual_path'], checkpoint['cell_genotypes'] new_genotypes = [] for _index, genotype in cell_genotypes: xlist = [] for edge_genotype in genotype: for (node_str, select_index) in edge_genotype: xlist.append((node_str, conv_candidates[select_index])) new_genotypes.append((_index, xlist)) log_str = 'Obtained actual_path and cell_genotypes:\n' \ 'Actual_path: {:}\n' \ 'Genotype:\n'.format(actual_path) for _index, genotype in new_genotypes: log_str += 'index: {:} arch: {:}\n'.format(_index, genotype) logger.log(log_str, mode='info') args.actual_path = actual_path args.cell_genotypes = cell_genotypes normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True) flop, param = get_model_infos(normal_network, [1, 3, 512, 512]) logger.log( '|#################### Network Info ####################|\n' 'FLOPs:{:.2f} M, Params:{:.2f} MB'.format(flop, param), mode='info') # save new config, and create logger. 
            #save_configs(args.__dict__, args.path, 'retrain')
            # create run_config
            #run_config = RunConfig(**args.__dict__)
            retrain_run_manager = RunManager(args.path, normal_network, logger,
                                             run_config, vis, out_log=True)
            #normal_network.load_state_dict(checkpoint['state_dict'])
            display_all_families_information(args, 'retrain', retrain_run_manager, logger)
            logger.log(
                '=> Construct NewGumbelAutoDeeplab according to the last-arch obtained from search phase',
                mode='info')
        # perform train and validation in train() method
        retrain_run_manager.train()
    logger.close()
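# --------------------------------------------------------------------------
# Sketch of the retrain checkpoint round trip implied by the resume branch
# above.  The key names ('state_dict', 'weight_optimizer', 'scheduler',
# 'start_epoch', 'best_monitor', 'actual_path', 'cell_genotypes') mirror what
# main() reads back; the save/load helpers themselves are illustrative, not
# the repo's RunManager code.
import os
import torch


def save_retrain_checkpoint(path, seed, network, optimizer, scheduler,
                            epoch, best_monitor, actual_path, cell_genotypes):
    ckpt = {
        'state_dict': network.state_dict(),
        'weight_optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'start_epoch': epoch + 1,          # resume from the next epoch
        'best_monitor': best_monitor,      # (metric_name, best_value)
        'actual_path': actual_path,
        'cell_genotypes': cell_genotypes,
    }
    os.makedirs(os.path.join(path, 'checkpoints'), exist_ok=True)
    torch.save(ckpt, os.path.join(path, 'checkpoints', 'seed-{:}-retrain.pth'.format(seed)))


def load_retrain_checkpoint(path, seed, network, optimizer, scheduler):
    ckpt = torch.load(os.path.join(path, 'checkpoints', 'seed-{:}-retrain.pth'.format(seed)))
    network.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['weight_optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])
    return ckpt['start_epoch'], ckpt['best_monitor'], ckpt['actual_path'], ckpt['cell_genotypes']
# --------------------------------------------------------------------------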
def main(args): assert torch.cuda.is_available(), 'CUDA is not available' torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True torch.backends.cudnn.deterministic = True if args.retrain_resume: config_file_path = os.path.join(args.resume_file, 'retrain.config') assert os.path.exists( config_file_path ), 'cannot find config_file {:} from the last retrain phase'.format( config_file_path) f = open(config_file_path, 'r') config_dict = json.load(f) f.close() configs_resume(args, config_dict, 'retrain') # get EXP_time in last_retrain for flag EXP_time_last_retrain = config_dict['path'].split('/')[-1] EXP_time = time_for_file() args.path = os.path.join( args.path, args.exp_name, EXP_time + '-resume-{:}'.format(EXP_time_last_retrain)) torch.set_num_threads(args.workers) set_manual_seed( args.random_seed) # from the last retrain phase or search phase. os.makedirs(args.path, exist_ok=True) create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py')) save_configs(args.__dict__, args.path, 'retrain') logger = prepare_logger(args) logger.log( '=> loading configs {:} from the last retrain phase.'.format( config_file_path), mode='info') if args.search_space == 'autodeeplab': conv_candidates = autodeeplab elif args.search_space == 'proxyless': conv_candidates = proxyless elif args.search_space == 'my_search_space': conv_candidates = my_search_space else: raise ValueError('search space {:} is not supported'.format( args.search_space)) else: # resume partial configs setting and arch_checkpoint from the search phase by default. config_file_path = os.path.join(args.checkpoint_file, 'search.config') assert os.path.exists( config_file_path ), 'cannot find config_file {:} from the search phase'.format( config_file_path) f = open(config_file_path, 'r') config_dict = json.load(f) f.close() args.random_seed = config_dict['random_seed'] # get EXP_time in search phase, for flag EXP_time_search = config_dict['path'].split('/')[-1] EXP_time = time_for_file() args.path = os.path.join( args.path, args.exp_name, EXP_time + '-resume-{:}'.format(EXP_time_search)) torch.set_num_threads(args.workers) set_manual_seed( args.random_seed) # from the last retrain phase or search phase. 
os.makedirs(args.path, exist_ok=True) create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py')) save_configs(args.__dict__, args.path, 'retrain') logger = prepare_logger(args) logger.log( '=> starting retrain from the search phase config {:}.'.format( config_file_path), mode='info') # optimizer params if args.weight_optimizer_type == 'SGD': weight_optimizer_params = { 'momentum': args.momentum, 'nesterov': args.nesterov, 'weight_decay': args.weight_decay, } elif args.weight_optimizer_type == 'RMSprop': weight_optimizer_params = { 'momentum': args.momentum, 'weight_decay': args.weight_decay, } else: weight_optimizer_params = None # scheduler params if args.scheduler == 'cosine': scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min} elif args.scheduler == 'multistep': scheduler_params = { 'milestones': args.milestones, 'gammas': args.gammas } elif args.scheduler == 'exponential': scheduler_params = {'gamma': args.gamma} elif args.scheduler == 'linear': scheduler_params = {'min_lr': args.min_lr} else: scheduler_params = None # criterion params if args.criterion == 'SmoothSoftmax': criterion_params = {'label_smooth': args.label_smoothing} else: criterion_params = None args.optimizer_config = { 'optimizer_type': args.weight_optimizer_type, 'optimizer_params': weight_optimizer_params, 'scheduler': args.scheduler, 'scheduler_params': scheduler_params, 'criterion': args.criterion, 'criterion_params': criterion_params, 'init_lr': args.init_lr, 'epochs': args.epochs, 'class_num': args.nb_classes, } if args.search_space == 'autodeeplab': conv_candidates = autodeeplab elif args.search_space == 'proxyless': conv_candidates = proxyless elif args.search_space == 'counter': conv_candidates = counter elif args.search_space == 'my_search_space': conv_candidates = my_search_space else: raise ValueError('search_space : {:} is not supported'.format( args.search_space)) # related to entropy constraint loss if args.reg_loss_type == 'add#linear': args.reg_loss_params = {'lambda': args.reg_loss_lambda} elif args.reg_loss_type == 'mul#log': args.reg_loss_params = { 'alpha': args.reg_loss_alpha, 'beta': args.reg_loss_beta } else: args.reg_loss_params = None # create run_config run_config = RunConfig(**args.__dict__) #if args.open_test == False: # retrain and validate if args.open_vis: # only open_vis in re-train phase, rather than both re-train and test. vis = visdomer(args.port, args.server, args.exp_name, args.compare_phase, args.elements, init_params=None) else: vis = None if args.retrain_resume: logger.log( '=> Loading checkpoint from {:} of the last retrain phase'.format( args.resume_file), mode='info') # checkpoint_file from the last retrain phase. 
checkpoint_path = os.path.join( args.resume_file, 'checkpoints', 'seed-{:}-retrain.pth'.format(args.random_seed)) assert os.path.exists( checkpoint_path), 'cannot find retrain checkpoint file {:}'.format( checkpoint_path) checkpoint = torch.load(checkpoint_path) actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[ 'cell_genotypes'] args.actual_path = actual_path args.cell_genotypes = cell_genotypes normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True) retrain_run_manager = RunManager(args.path, normal_network, logger, run_config, vis, out_log=True) normal_network.load_state_dict(checkpoint['state_dict']) display_all_families_information(args, 'retrain', retrain_run_manager, logger) retrain_run_manager.optimizer.load_state_dict( checkpoint['weight_optimizer']) retrain_run_manager.scheduler.load_state_dict(checkpoint['scheduler']) retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0] retrain_run_manager.best_monitor = checkpoint['best_monitor'][1] retrain_run_manager.start_epoch = checkpoint['start_epoch'] logger.log( '=> loaded checkpoint file {:} from the last retrain phase, starts with {:}-th epoch' .format(checkpoint_path, checkpoint['start_epoch']), mode='info') else: # todo from the search phase, read the last arch_checkpoint, rather than the best one. arch_checkpoint_path = os.path.join( args.checkpoint_file, 'checkpoints', 'seed-{:}-arch.pth'.format(args.random_seed)) assert os.path.exists( arch_checkpoint_path ), 'cannot find arch_checkpoint file {:} from search phase'.format( arch_checkpoint_path) checkpoint = torch.load(arch_checkpoint_path) actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[ 'cell_genotypes'] new_genotypes = [] for _index, genotype in cell_genotypes: xlist = [] for edge_genotype in genotype: for (node_str, select_index) in edge_genotype: xlist.append((node_str, conv_candidates[select_index])) new_genotypes.append((_index, xlist)) log_str = 'Obtained actual_path and cell_genotypes:\n' \ 'Actual_path: {:}\n' \ 'Genotype:\n'.format(actual_path) for _index, genotype in new_genotypes: log_str += 'index: {:} arch: {:}\n'.format(_index, genotype) logger.log(log_str, mode='info') args.actual_path = actual_path args.cell_genotypes = cell_genotypes normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True) retrain_run_manager = RunManager(args.path, normal_network, logger, run_config, vis, out_log=True) #normal_network.load_state_dict(checkpoint['state_dict']) display_all_families_information(args, 'retrain', retrain_run_manager, logger) logger.log( '=> Construct NewGumbelAutoDeeplab according to the last-arch obtained from search phase', mode='info') # perform train and validation in train() method retrain_run_manager.train() ''' else: # test phase checkpoint_path = os.path.join(args.resume_file, 'checkpoints', 'seed-{:}-retrain-best.pth'.format(args.random_seed)) assert os.path.exists(checkpoint_path), 'cannot find best checkpoint {:} from the retrain phase'.format(checkpoint_path) checkpoint = torch.load(checkpoint_path) actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes'] normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes, actual_path, cell_genotypes, 
                                            args.search_space, affine=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        test_manager = RunManager(args.path, normal_network, logger, run_config,
                                  vis=None, out_log=True)
        display_all_families_information(args, 'retrain', test_manager, logger)
        # save testing configs
        save_configs(args.__dict__, args.path, 'test')
        test_manager.validate(epoch=None, is_test=True, use_train_mode=False)
    '''
    logger.close()
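# --------------------------------------------------------------------------
# Sketch of the genotype decoding done in the non-resume branch above: each
# cell genotype stores (node_str, op_index) pairs, and the op_index is looked
# up in the active `conv_candidates` list to get a readable op name.  The
# candidate list below is copied from the commented-out `my_search_space`
# shown in the evaluation branch; treat it as an example only.
MY_SEARCH_SPACE_EXAMPLE = [
    '3x3_SepFacConv1', '5x5_SepFacConv1',
    '3x3_SepFacConv2', '5x5_SepFacConv2',
    '3x3_SepFacConv4', '5x5_SepFacConv4',
]


def decode_cell_genotypes(cell_genotypes, conv_candidates):
    decoded = []
    for cell_index, genotype in cell_genotypes:
        ops = []
        for edge_genotype in genotype:
            for node_str, select_index in edge_genotype:
                ops.append((node_str, conv_candidates[select_index]))
        decoded.append((cell_index, ops))
    return decoded


# usage on a genotype fragment of the kind shown earlier in this file
example = [(0, [[('2<-1', 0), ('2<-0', 3)]]), (2, [[('2<-1', 4), ('2<-0', 1)]])]
print(decode_cell_genotypes(example, MY_SEARCH_SPACE_EXAMPLE))
# -> [(0, [('2<-1', '3x3_SepFacConv1'), ('2<-0', '5x5_SepFacConv2')]), ...]
# --------------------------------------------------------------------------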
def main(): global args, best_mIoU PID = os.getpid() args = parser.parse_args() prepare_seed(args.rand_seed) device = torch.device("cuda:" + str(args.gpus)) if args.timestamp == 'none': args.timestamp = "{:}".format( time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time()))) switch_model = args.switch_model assert switch_model in ["deeplab50", "deeplab101"] # Log outputs if args.evaluate: args.save_dir = args.save_dir + "/GTA5-%s-evaluate"%switch_model + \ "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp) else: args.save_dir = args.save_dir + \ "/GTA5_512x512-{model}-LWF.stg{csg_stages}.w{csg_weight}-APool.{apool}-Aug.{augment}-chunk{chunks}-mlp{mlp}.K{csg_k}-LR{lr}.bone{factor}-epoch{epochs}-batch{batch_size}-seed{seed}".format( model=switch_model, csg_stages=args.csg_stages, mlp=args.mlp, csg_weight=args.csg, apool=args.apool, augment=args.augment, chunks=args.chunks, csg_k=args.csg_k, lr="%.2E"%args.lr, factor="%.1f"%args.factor, epochs=args.epochs, batch_size=args.batch_size, seed=args.rand_seed ) + \ "%s/%s"%('/'+args.resume if args.resume != 'none' else '', args.timestamp) logger = prepare_logger(args) from config_seg import config as data_setting data_setting.batch_size = args.batch_size train_loader = get_train_loader(data_setting, GTA5, test=False, augment=args.augment) args.stages = [int(stage) for stage in args.csg_stages.split('.') ] if len(args.csg_stages) > 0 else [] chunks = [int(chunk) for chunk in args.chunks.split('.') ] if len(args.chunks) > 0 else [] assert len(chunks) == 1 or len(chunks) == len(args.stages) if len(chunks) < len(args.stages): chunks = [chunks[0]] * len(args.stages) if switch_model == 'deeplab50': layers = [3, 4, 6, 3] elif switch_model == 'deeplab101': layers = [3, 4, 23, 3] model = csg_builder.CSG(deeplab, get_head=None, K=args.csg_k, stages=args.stages, chunks=chunks, task='new-seg', apool=args.apool, mlp=args.mlp, base_encoder_kwargs={ 'num_seg_classes': args.num_classes, 'layers': layers }) threds = 3 evaluator = SegEvaluator( Cityscapes(data_setting, 'val', None), args.num_classes, np.array([0.485, 0.456, 0.406]), np.array([0.229, 0.224, 0.225]), model.encoder_q, [ 1, ], False, devices=args.gpus, config=data_setting, threds=threds, verbose=False, save_path=None, show_image=False ) # just calculate mIoU, no prediction file is generated # verbose=False, save_path="./prediction_files", show_image=True, show_prediction=True) # generate prediction files # Setup optimizer factor = args.factor sgd_in = [ { 'params': get_params(model.encoder_q, ["conv1"]), 'lr': factor * args.lr }, { 'params': get_params(model.encoder_q, ["bn1"]), 'lr': factor * args.lr }, { 'params': get_params(model.encoder_q, ["layer1"]), 'lr': factor * args.lr }, { 'params': get_params(model.encoder_q, ["layer2"]), 'lr': factor * args.lr }, { 'params': get_params(model.encoder_q, ["layer3"]), 'lr': factor * args.lr }, { 'params': get_params(model.encoder_q, ["layer4"]), 'lr': factor * args.lr }, { 'params': get_params(model.encoder_q, ["fc_new"]), 'lr': args.lr }, ] base_lrs = [group['lr'] for group in sgd_in] optimizer = SGD(sgd_in, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Optionally resume from a checkpoint if args.resume != 'none': if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage) args.start_epoch = checkpoint['epoch'] best_mIoU = checkpoint['best_mIoU'] msg = model.load_state_dict(checkpoint['state_dict']) 
print("resume weights: ", msg) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=ImageClassdata> no checkpoint found at '{}'".format( args.resume)) model = model.to(device) if args.evaluate: mIoU = validate(evaluator, model, -1) print(mIoU) exit(0) # Main training loop iter_max = args.epochs * len(train_loader) iter_stat = IterNums(iter_max) for epoch in range(args.start_epoch, args.epochs): print("<< ============== JOB (PID = %d) %s ============== >>" % (PID, args.save_dir)) logger.log("Epoch: %d" % (epoch + 1)) # train for one epoch train(args, train_loader, model, optimizer, base_lrs, iter_stat, epoch, logger, device, adjust_lr=epoch < args.epochs) # evaluate on validation set torch.cuda.empty_cache() mIoU = validate(evaluator, model, epoch) logger.writer.add_scalar("mIoU", mIoU, epoch + 1) logger.log("mIoU: %f" % mIoU) # remember best mIoU and save checkpoint is_best = mIoU > best_mIoU best_mIoU = max(mIoU, best_mIoU) save_checkpoint( args.save_dir, { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_mIoU': best_mIoU, }, is_best) logging.info('Best accuracy: {mIoU:.3f}'.format(mIoU=best_mIoU))
def main(): global args, best_prec1 PID = os.getpid() args = parser.parse_args() prepare_seed(args.rand_seed) if args.timestamp == 'none': args.timestamp = "{:}".format( time.strftime('%h-%d-%C_%H-%M-%s', time.gmtime(time.time()))) # Log outputs if args.evaluate: args.save_dir = args.save_dir + "/Visda17-Res101-evaluate" + \ "%s/%s"%('/'+args.resume.replace('/', '+') if args.resume != 'none' else '', args.timestamp) else: args.save_dir = args.save_dir + \ "/VisDA-Res101-CSG.stg{csg_stages}.w{csg_weight}-APool.{apool}-Aug.{augment}-chunk{chunks}-mlp{mlp}.K{csg_k}-LR{lr}.bone{factor}-epoch{epochs}-batch{batch_size}-seed{seed}".format( csg_stages=args.csg_stages, mlp=args.mlp, csg_weight=args.csg, apool=args.apool, augment=args.augment, chunks=args.chunks, csg_k=args.csg_k, lr="%.2E"%args.lr, factor="%.1f"%args.factor, epochs=args.epochs, batch_size=args.batch_size, seed=args.rand_seed ) + \ "%s/%s"%('/'+args.resume.replace('/', '+') if args.resume != 'none' else '', args.timestamp) logger = prepare_logger(args) data_transforms = { 'val': transforms.Compose([ transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]), } if args.augment: data_transforms['train'] = transforms.Compose([ RandAugment(1, 6., augment_list), transforms.Resize(224), transforms.RandomCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) else: data_transforms['train'] = transforms.Compose([ transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) kwargs = {'num_workers': 20, 'pin_memory': True} if args.augment: # two source trainset = VisDA17(txt_file=os.path.join(args.data, "train/image_list.txt"), root_dir=os.path.join(args.data, "train"), transform=TwoCropsTransform( data_transforms['train'], data_transforms['train'])) else: # one source trainset = VisDA17(txt_file=os.path.join(args.data, "train/image_list.txt"), root_dir=os.path.join(args.data, "train"), transform=data_transforms['train']) train_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, drop_last=True, **kwargs) valset = VisDA17(txt_file=os.path.join(args.data, "validation/image_list.txt"), root_dir=os.path.join(args.data, "validation"), transform=data_transforms['val'], label_one_hot=True) val_loader = DataLoader(valset, batch_size=args.batch_size, shuffle=False, **kwargs) args.stages = [int(stage) for stage in args.csg_stages.split('.') ] if len(args.csg_stages) > 0 else [] chunks = [int(chunk) for chunk in args.chunks.split('.') ] if len(args.chunks) > 0 else [] assert len(chunks) == 1 or len(chunks) == len(args.stages) if len(chunks) < len(args.stages): chunks = [chunks[0]] * len(args.stages) def get_head(num_ftrs, num_classes): _dim = 512 return nn.Sequential( nn.Linear(num_ftrs, _dim), nn.ReLU(inplace=False), nn.Linear(_dim, num_classes), ) model = csg_builder.CSG( resnet101, get_head=get_head, K=args.csg_k, stages=args.stages, chunks=chunks, apool=args.apool, mlp=args.mlp, ) train_blocks = "conv1.bn1.layer1.layer2.layer3.layer4.fc" train_blocks = train_blocks.split('.') # Setup optimizer factor = args.factor sgd_in = [] for name in train_blocks: if name != 'fc': sgd_in.append({ 'params': get_params(model.encoder_q, [name]), 'lr': factor * args.lr }) else: # no update to fc but to fc_new sgd_in.append({ 'params': get_params(model.encoder_q, ["fc_new"]), 'lr': args.lr }) if model.mlp: 
        sgd_in.append({
            'params': get_params(model.encoder_q, ["fc_csg"]),
            'lr': args.lr
        })
    base_lrs = [group['lr'] for group in sgd_in]
    optimizer = SGD(sgd_in, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint
    if args.resume != 'none':
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            msg = model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("resume weights: ", msg)
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    model = model.cuda()

    if args.evaluate:
        prec1 = validate(val_loader, model, args, 0)
        print(prec1)
        exit(0)

    # Main training loop
    iter_max = args.epochs * len(train_loader)
    iter_stat = IterNums(iter_max)
    for epoch in range(args.start_epoch, args.epochs):
        print("<< ============== JOB (PID = %d) %s ============== >>" % (PID, args.save_dir))
        logger.log("Epoch: %d" % (epoch + 1))
        train(train_loader, model, optimizer, base_lrs, iter_stat, epoch, logger, args,
              adjust_lr=epoch < args.epochs)
        prec1 = validate(val_loader, model, args, epoch)
        logger.writer.add_scalar("prec", prec1, epoch + 1)
        logger.log("prec: %f" % prec1)
        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(args.save_dir, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best, keep_last=1)
    logging.info('Best accuracy: {prec1:.3f}'.format(prec1=best_prec1))
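# --------------------------------------------------------------------------
# Sketch of what TwoCropsTransform is assumed to do above (the real class is
# imported from elsewhere in this repo): given two pipelines, it returns two
# independently augmented views of the same image, which is what the CSG
# branch consumes when args.augment is set.
class TwoCropsTransformSketch:
    def __init__(self, transform_q, transform_k):
        self.transform_q = transform_q
        self.transform_k = transform_k

    def __call__(self, img):
        return [self.transform_q(img), self.transform_k(img)]
# --------------------------------------------------------------------------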
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    #print_experiment_environment()
    EXP_time = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, EXP_time)
    os.makedirs(args.path, exist_ok=True)
    create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {'milestones': args.milestones, 'gammas': args.gammas}
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
    # TODO: needs modification, not needed in counter_network
    args.conv_candidates = [
        '3x3_MBConv3', '3x3_MBConv6',
        '5x5_MBConv3', '5x5_MBConv6',
        '7x7_MBConv3', '7x7_MBConv6',
        'Zero',
        #'Identity'
    ]
    run_config = RunConfig(**args.__dict__)
    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None
    # related to hardware constraint
    # TODO: get rid of this
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None
    arch_search_config = ArchSearchConfig(**args.__dict__)
    # perform config save, for run_configs and arch_search_configs
    save_configs(run_config.config, arch_search_config.config, args.path, 'search')
    logger = prepare_logger(args)
    if args.open_vis:
        vis = visdomer(args.port, args.server, args.exp_name, args.compare_phase,
                       args.elements, init_params=None)
    else:
        vis = None
    '''
    super_network = GumbelAutoDeepLab(
        args.filter_multiplier, args.block_multiplier, args.steps,
        args.nb_classes, args.nb_layers, args.bn_momentum, args.bn_eps,
        args.conv_candidates, logger
    )
    '''
    super_network = CounterMBConvNet(2, search_space=args.search_space)
    train_manager = RunManager(args.path, super_network, logger, run_config, vis=vis, out_log=True)
    # train search phase
    train_manager.train()
    logger.close()
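# --------------------------------------------------------------------------
# Sketch of a label-smoothing cross entropy of the kind the 'SmoothSoftmax'
# criterion option (with its 'label_smooth' parameter) refers to.  The repo's
# own criterion lives in run_manager; this stand-alone version only
# illustrates the formula under that assumption.
import torch
import torch.nn as nn
import torch.nn.functional as F


class LabelSmoothCrossEntropy(nn.Module):
    def __init__(self, label_smooth=0.1):
        super().__init__()
        self.label_smooth = label_smooth

    def forward(self, logits, target):
        # logits: (N, C), target: (N,) of class indices
        num_classes = logits.size(1)
        log_probs = F.log_softmax(logits, dim=1)
        with torch.no_grad():
            # spread label_smooth mass uniformly over the non-target classes
            smooth_target = torch.full_like(log_probs, self.label_smooth / (num_classes - 1))
            smooth_target.scatter_(1, target.unsqueeze(1), 1.0 - self.label_smooth)
        return torch.mean(torch.sum(-smooth_target * log_probs, dim=1))
# --------------------------------------------------------------------------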