def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # If search_resume is True, resume the configs and checkpoint from the existing files.
    if args.search_resume:
        # args.resume_file is the path to .../EXP-time.
        # Resume the experiment in a new directory, rather than the original one.
        assert os.path.exists(args.resume_file), \
            'cannot find the resume file {:}, please re-check'.format(args.resume_file)
        config_file_path = os.path.join(args.resume_file, 'search.config')
        assert os.path.exists(config_file_path), \
            'the config file {:} does not exist'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'search')
        # Initialize a new experiment directory, flagged with the resumed experiment.
        resume_EXP_time = config_dict['path'].split('/')[-1]
        resume_exp_name = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(resume_exp_name + '-' + resume_EXP_time))
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
    else:
        # Training initialization.
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # Weight-optimizer config, related to the network weight optimizer, scheduler, and criterion.
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None

    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {'milestones': args.milestones, 'gammas': args.gammas}
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None

    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    # optimizer_config is consumed by run_manager to build the weight optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }

    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None

    # Related to the entropy-constraint loss.
    # TODO: use separate lambdas for cell_entropy and network_entropy.
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'add#linear#linearschedule':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    # Save the configs, for run_configs and arch_search_configs.
    save_configs(args.__dict__, args.path, 'search')
    logger = prepare_logger(args)
    logger.log(
        "=> loaded configs from the file '{:}'".format(args.resume_file)
        if args.search_resume else '=> train-search phase initialization done',
        mode='info')

    run_config = RunConfig(**args.__dict__)
    arch_search_config = ArchSearchConfig(**args.__dict__)

    # Note: args.bn_momentum and args.bn_eps are not used.
    super_network = GumbelAutoDeepLab(
        args.filter_multiplier, args.block_multiplier, args.steps,
        args.nb_classes, args.nb_layers, args.bn_momentum, args.bn_eps,
        args.search_space, logger, affine=False)

    # Calculate the initial entropy.
    _, network_index = super_network.get_network_arch_hardwts()  # sets self.hardwts again
    _, aspp_index = super_network.get_aspp_hardwts_index()
    single_path = super_network.sample_single_path(args.nb_layers, aspp_index, network_index)
    cell_arch_entropy, network_arch_entropy, entropy = super_network.calculate_entropy(single_path)
    logger.log('=> entropy : {:}'.format(entropy), mode='info')

    vis_init_params = {
        'cell_entropy': cell_arch_entropy,
        'network_entropy': network_arch_entropy,
        'entropy': entropy,
    }
    args.vis_init_params = vis_init_params
    if args.open_vis:
        vis = visdomer(args.port, args.server, args.exp_name,
                       args.compare_phase, args.elements,
                       init_params=args.vis_init_params)
    else:
        vis = None

    # Alternative supernetworks, kept for reference:
    # from exp.autodeeplab.auto_deeplab import AutoDeeplab
    # super_network = AutoDeeplab(args.filter_multiplier, args.block_multiplier, args.steps,
    #                             args.nb_classes, args.nb_layers, args.search_space,
    #                             logger, affine=False)
    # from exp.fixed_network_level.supernetwork import FixedNetwork
    # super_network = FixedNetwork(args.filter_multiplier, args.block_multiplier, args.steps,
    #                              args.nb_classes, args.nb_layers, args.search_space,
    #                              logger, affine=False)

    arch_search_run_manager = ArchSearchRunManager(args.path, super_network,
                                                   run_config, arch_search_config,
                                                   logger, vis)
    display_all_families_information(args, 'search', arch_search_run_manager, logger)

    # get_model_infos performs an inference pass; forward should first be switched
    # to gdas_forward (kept for reference):
    # flop, param = get_model_infos(super_network, [1, 3, 512, 512])
    # print('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))

    # Resume bookkeeping:
    # 1. the EXP-time directory is given via resume_file  -- completed
    # 2. load the saved configs                           -- completed
    # 3. resume the checkpoint                            -- completed
    # last_info is not used: the saved file names are not consistent, so resume_file must be given.
    # TODO: resume still has an issue -- after resuming, more GPU memory is allocated
    # than in a fresh run, which can raise OOM in the search phase.
    if args.search_resume:
        if os.path.exists(args.resume_file):  # resume_file: path to EXP-time
            logger.log("=> loading checkpoint of the file '{:}' start".format(args.resume_file),
                       mode='info')
            warm_up_checkpoint = os.path.join(args.resume_file, 'checkpoints',
                                              'seed-{:}-warm.pth'.format(args.random_seed))
            search_checkpoint = os.path.join(args.resume_file, 'checkpoints',
                                             'seed-{:}-search.pth'.format(args.random_seed))
            if not args.resume_from_warmup:
                # Resume the checkpoint saved in the search phase.
                checkpoint = torch.load(search_checkpoint)
                super_network.load_state_dict(checkpoint['state_dict'])
                arch_search_run_manager.run_manager.optimizer.load_state_dict(checkpoint['weight_optimizer'])
                arch_search_run_manager.run_manager.scheduler.load_state_dict(checkpoint['weight_scheduler'])
                arch_search_run_manager.arch_optimizer.load_state_dict(checkpoint['arch_optimizer'])
                arch_search_run_manager.run_manager.monitor_metric = checkpoint['best_monitor'][0]
                arch_search_run_manager.run_manager.best_monitor = checkpoint['best_monitor'][1]
                arch_search_run_manager.warmup = checkpoint['warmup']
                # Note: the key is 'start_epochs' here, but 'warmup_epoch' in nas_manager.
                arch_search_run_manager.start_epoch = checkpoint['start_epochs']
                logger.log("=> loaded checkpoint '{:}'; search phase resumes from epoch {:}"
                           .format(search_checkpoint, checkpoint['start_epochs']), mode='info')
            else:
                # Resume the checkpoint saved in the warm-up phase.
                checkpoint = torch.load(warm_up_checkpoint)
                super_network.load_state_dict(checkpoint['state_dict'])
                arch_search_run_manager.run_manager.optimizer.load_state_dict(checkpoint['weight_optimizer'])
                arch_search_run_manager.run_manager.scheduler.load_state_dict(checkpoint['weight_scheduler'])
                arch_search_run_manager.warmup = checkpoint['warmup']
                arch_search_run_manager.warmup_epoch = checkpoint['warmup_epoch']
                logger.log("=> loaded checkpoint '{:}'; warm-up phase resumes from epoch {:}"
                           .format(warm_up_checkpoint, checkpoint['warmup_epoch']), mode='info')
        else:
            logger.log("=> cannot find the file {:}, please re-confirm it\n"
                       "=> starting warm-up and search from scratch ...".format(args.resume_file),
                       mode='info')
    else:
        logger.log("=> starting warm-up and search from scratch ...", mode='info')

    # torch.autograd.set_detect_anomaly(True)
    # Warm-up phase.
    if arch_search_run_manager.warmup:
        arch_search_run_manager.warm_up(warmup_epochs=args.warmup_epochs)
    # Train-search phase.
    arch_search_run_manager.train()
    logger.close()
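
# A hedged entry-point sketch for this script, kept as a comment so it does not
# interfere with the real guard. Assumption: obtain_train_search_args() is the
# repo's argument parser for this phase (it appears in the legacy driver code
# later in this file); adjust to the actual helper if it differs.
#
# if __name__ == '__main__':
#     args = obtain_train_search_args()
#     main(args)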
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume and not args.evaluation:
        # Resume from the last retrain.
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the last retrain phase'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')  # config resume from the last retrain
        # Keep the EXP_time of the last retrain as a flag.
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        EXP_name_last_retrain = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # seed from the last retrain
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    elif not args.retrain_resume and args.evaluation:
        config_file_path = os.path.join(args.evaluation_ckpt, 'retrain.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the best checkpoint'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')
        EXP_time_best_checkpoint = config_dict['path'].split('/')[-1]
        EXP_name_best_checkpoint = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    elif not args.retrain_resume and not args.evaluation:
        # Start retrain from the search phase.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the search phase'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        args.random_seed = config_dict['random_seed']  # reuse the search-phase random_seed
        # Keep the EXP_time of the search phase as a flag.
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_name_search = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # seed from the last retrain phase or the search phase
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    else:
        raise NotImplementedError('invalid mode: retrain_resume {:} evaluation {:}'.format(
            args.retrain_resume, args.evaluation))

    # Optimizer params.
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    # Scheduler params.
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {'milestones': args.milestones, 'gammas': args.gammas}
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    # Criterion params.
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }

    if args.search_space == 'autodeeplab':
        conv_candidates = autodeeplab
    elif args.search_space == 'proxyless':
        conv_candidates = proxyless
    elif args.search_space == 'counter':
        conv_candidates = counter
    elif args.search_space == 'my_search_space':
        conv_candidates = my_search_space
    else:
        raise ValueError('search_space {:} is not supported'.format(args.search_space))

    # Related to the entropy-constraint loss.
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'add#linear#linearschedule':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    logger = prepare_logger(args)
    if args.retrain_resume and not args.evaluation:
        logger.log('=> loading configs {:} from the last retrain phase.'.format(config_file_path), 'info')
    elif not args.retrain_resume and args.evaluation:
        logger.log('=> loading configs {:} from the best retrain checkpoint.'.format(config_file_path), 'info')
    elif not args.retrain_resume and not args.evaluation:
        logger.log('=> loading configs {:} from the search phase.'.format(config_file_path), 'info')

    # Save the new config and create the run_config.
    save_configs(args.__dict__, args.path, 'retrain')
    run_config = RunConfig(**args.__dict__)

    # open_vis is only valid in the retrain phase.
    if args.open_vis:
        assert not args.evaluation, 'invalid mode: open_vis {:} with evaluation {:}'.format(
            args.open_vis, args.evaluation)
        vis = visdomer(args.port, args.server, args.exp_name,
                       args.compare_phase, args.elements, init_params=None)
    else:
        vis = None

    if args.evaluation:
        assert os.path.exists(args.evaluation_ckpt), \
            'cannot find the best checkpoint {:}'.format(args.evaluation_ckpt)
        checkpoint_path = os.path.join(args.evaluation_ckpt, 'checkpoints',
                                       'seed-{:}-retrain-best.pth'.format(args.random_seed))
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        # my_search_space = ['3x3_SepFacConv1', '5x5_SepFacConv1',
        #                    '3x3_SepFacConv2', '5x5_SepFacConv2',
        #                    '3x3_SepFacConv4', '5x5_SepFacConv4']
        # Hard-coded architecture (operation counts 0:4 1:4 2:5 3:5 4:4 5:2),
        # overriding the values loaded from the checkpoint above:
        actual_path = [0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2]
        cell_genotypes = [(0, [[('2<-1', 0), ('2<-0', 3)]]), (2, [[('2<-1', 4), ('2<-0', 1)]]),
                          (7, [[('2<-1', 3), ('2<-0', 0)]]), (15, [[('2<-1', 1), ('2<-0', 2)]]),
                          (27, [[('2<-1', 4), ('2<-0', 3)]]), (38, [[('2<-1', 4), ('2<-0', 0)]]),
                          (48, [[('2<-1', 2), ('2<-0', 5)]]), (60, [[('2<-1', 0), ('2<-0', 1)]]),
                          (73, [[('2<-0', 3), ('2<-1', 3)]]), (84, [[('2<-1', 2), ('2<-0', 1)]]),
                          (94, [[('2<-1', 4), ('2<-0', 2)]]), (102, [[('2<-1', 2), ('2<-0', 5)]])]
        # An alternative hard-coded architecture, kept for reference:
        # actual_path = [0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2, 1]
        # cell_genotypes = [(0, [[('2<-1', 4), ('2<-0', 5)]]), (2, [[('2<-1', 3), ('2<-0', 1)]]),
        #                   (7, [[('2<-1', 2), ('2<-0', 5)]]), (17, [[('2<-0', 1), ('2<-1', 1)]]),
        #                   (28, [[('2<-1', 4), ('2<-0', 1)]]), (38, [[('2<-1', 4), ('2<-0', 2)]]),
        #                   (50, [[('2<-1', 5), ('2<-0', 1)]]), (63, [[('2<-1', 4), ('2<-0', 2)]]),
        #                   (74, [[('2<-1', 1), ('2<-0', 0)]]), (84, [[('2<-1', 3), ('2<-0', 1)]]),
        #                   (92, [[('2<-1', 4), ('2<-0', 5)]]), (99, [[('2<-1', 0), ('2<-0', 3)]])]
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier,
                                              args.block_multiplier, args.steps, args.nb_classes,
                                              actual_path, cell_genotypes, args.search_space,
                                              affine=True)
        evaluation_run_manager = RunManager(args.path, normal_network, logger,
                                            run_config, vis, out_log=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', evaluation_run_manager, logger)
        logger.log('=> loaded the best checkpoint from {:}, start evaluation'.format(checkpoint_path),
                   'info')
        evaluation_run_manager.validate(is_test=True, use_train_mode=False)
    else:
        # Resume from the last retrain.
        if args.retrain_resume:
            logger.log('=> loading checkpoint from {:} of the last retrain phase'.format(args.resume_file),
                       mode='info')
            # checkpoint_file from the last retrain phase.
            checkpoint_path = os.path.join(args.resume_file, 'checkpoints',
                                           'seed-{:}-retrain.pth'.format(args.random_seed))
            assert os.path.exists(checkpoint_path), \
                'cannot find retrain checkpoint file {:}'.format(checkpoint_path)
            checkpoint = torch.load(checkpoint_path)
            actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
            args.actual_path = actual_path
            args.cell_genotypes = cell_genotypes
            normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier,
                                                  args.block_multiplier, args.steps, args.nb_classes,
                                                  actual_path, cell_genotypes, args.search_space,
                                                  affine=True)
            flop, param = get_model_infos(normal_network, [1, 3, 512, 512])
            logger.log('|#################### Network Info ####################|\n'
                       'FLOPs: {:.2f} M, Params: {:.2f} MB'.format(flop, param), mode='info')
            retrain_run_manager = RunManager(args.path, normal_network, logger,
                                             run_config, vis, out_log=True)
            normal_network.load_state_dict(checkpoint['state_dict'])
            display_all_families_information(args, 'retrain', retrain_run_manager, logger)
            retrain_run_manager.optimizer.load_state_dict(checkpoint['weight_optimizer'])
            retrain_run_manager.scheduler.load_state_dict(checkpoint['scheduler'])
            retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
            retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
            retrain_run_manager.start_epoch = checkpoint['start_epoch']  # already incremented by 1
            logger.log('=> loaded checkpoint file {:} from the last retrain phase, starting at epoch {:}'
                       .format(checkpoint_path, checkpoint['start_epoch']), mode='info')
        else:
            # From the search phase: load the optimal architecture and retrain it.
            arch_checkpoint_path = os.path.join(args.checkpoint_file, 'checkpoints',
                                                'seed-{:}-arch-best.pth'.format(args.random_seed))
            # TODO: the best epoch has been obtained in advance; kept for reference:
            # checkpoint_path = os.path.join(args.checkpoint_file, 'checkpoints',
            #                                'seed-{:}-search-best.pth'.format(args.random_seed))
            # tmp_checkpoint = torch.load(checkpoint_path)
            # best_epoch = tmp_checkpoint['start_epochs'] - 1
            # logger.log('=> best epoch: {:}'.format(best_epoch), mode='info')
            assert os.path.exists(arch_checkpoint_path), \
                'cannot find arch_checkpoint file {:} from the search phase'.format(arch_checkpoint_path)
            checkpoint = torch.load(arch_checkpoint_path)
            actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
            new_genotypes = []
            for _index, genotype in cell_genotypes:
                xlist = []
                for edge_genotype in genotype:
                    for (node_str, select_index) in edge_genotype:
                        xlist.append((node_str, conv_candidates[select_index]))
                new_genotypes.append((_index, xlist))
            log_str = 'Obtained actual_path and cell_genotypes:\n' \
                      'Actual_path: {:}\n' \
                      'Genotype:\n'.format(actual_path)
            for _index, genotype in new_genotypes:
                log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
            logger.log(log_str, mode='info')
            args.actual_path = actual_path
            args.cell_genotypes = cell_genotypes
            normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier,
                                                  args.block_multiplier, args.steps, args.nb_classes,
                                                  actual_path, cell_genotypes, args.search_space,
                                                  affine=True)
            flop, param = get_model_infos(normal_network, [1, 3, 512, 512])
            logger.log('|#################### Network Info ####################|\n'
                       'FLOPs: {:.2f} M, Params: {:.2f} MB'.format(flop, param), mode='info')
            retrain_run_manager = RunManager(args.path, normal_network, logger,
                                             run_config, vis, out_log=True)
            # normal_network.load_state_dict(checkpoint['state_dict'])
            display_all_families_information(args, 'retrain', retrain_run_manager, logger)
            logger.log('=> constructed NewGumbelAutoDeeplab from the architecture obtained in the search phase',
                       mode='info')
        # train() performs both training and validation.
        retrain_run_manager.train()
    logger.close()
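
# Hedged sketch of what configs_resume(args, config_dict, phase) is assumed to do:
# copy saved values back onto the argparse namespace while preserving the keys
# that identify the new run. The real helper lives elsewhere in this repo; the
# name _configs_resume_sketch and the skip list below are illustrative only.
def _configs_resume_sketch(args, config_dict, phase):
    skip_keys = ('path', 'resume_file', 'search_resume', 'retrain_resume', 'evaluation')
    for key, value in config_dict.items():
        if key not in skip_keys:
            setattr(args, key, value)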
# Legacy driver code from an earlier version, kept commented out for reference
# (truncated in the original source):
# args = obtain_train_search_args()
# set_manual_seed(args.random_seed)
# torch.backends.cudnn.benchmark = True
# torch.backends.cudnn.deterministic = True
# print_experiment_environment()
# os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_ids)
# os.makedirs(args.path, exist_ok=True)
# EXP_time = time_for_file()  # /home/jingweipeng/ljb/Jingbo.TTB/proxy_auto_deeplab/exp_time
# args.path = os.path.join(args.path, args.exp_name, EXP_time)
# # save experiment scripts
# create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
# # build run configs
# args.lr_scheduler_param = None
# args.optim_params = {'momentum': args.momentum, 'nesterov': args.nesterov}
# args.conv_candidates = [
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume:
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the last retrain phase'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')
        # Keep the EXP_time of the last retrain as a flag.
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name,
                                 EXP_time + '-resume-{:}'.format(EXP_time_last_retrain))
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # seed from the last retrain phase or the search phase
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log('=> loading configs {:} from the last retrain phase.'.format(config_file_path),
                   mode='info')
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search space {:} is not supported'.format(args.search_space))
    else:
        # By default, resume partial config settings and the arch checkpoint from the search phase.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(config_file_path), \
            'cannot find config_file {:} from the search phase'.format(config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        args.random_seed = config_dict['random_seed']
        # Keep the EXP_time of the search phase as a flag.
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name,
                                 EXP_time + '-resume-{:}'.format(EXP_time_search))
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # seed from the last retrain phase or the search phase
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log('=> starting retrain from the search phase config {:}.'.format(config_file_path),
                   mode='info')

    # Optimizer params.
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    # Scheduler params.
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {'milestones': args.milestones, 'gammas': args.gammas}
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    # Criterion params.
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }

    if args.search_space == 'autodeeplab':
        conv_candidates = autodeeplab
    elif args.search_space == 'proxyless':
        conv_candidates = proxyless
    elif args.search_space == 'counter':
        conv_candidates = counter
    elif args.search_space == 'my_search_space':
        conv_candidates = my_search_space
    else:
        raise ValueError('search_space {:} is not supported'.format(args.search_space))

    # Related to the entropy-constraint loss.
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {'alpha': args.reg_loss_alpha, 'beta': args.reg_loss_beta}
    else:
        args.reg_loss_params = None

    # Create the run_config.
    run_config = RunConfig(**args.__dict__)

    # if args.open_test == False:  # retrain and validate
    if args.open_vis:
        # open_vis is only valid in the retrain phase, not in test.
        vis = visdomer(args.port, args.server, args.exp_name,
                       args.compare_phase, args.elements, init_params=None)
    else:
        vis = None

    if args.retrain_resume:
        logger.log('=> loading checkpoint from {:} of the last retrain phase'.format(args.resume_file),
                   mode='info')
        # checkpoint_file from the last retrain phase.
        checkpoint_path = os.path.join(args.resume_file, 'checkpoints',
                                       'seed-{:}-retrain.pth'.format(args.random_seed))
        assert os.path.exists(checkpoint_path), \
            'cannot find retrain checkpoint file {:}'.format(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier,
                                              args.block_multiplier, args.steps, args.nb_classes,
                                              actual_path, cell_genotypes, args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path, normal_network, logger,
                                         run_config, vis, out_log=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager, logger)
        retrain_run_manager.optimizer.load_state_dict(checkpoint['weight_optimizer'])
        retrain_run_manager.scheduler.load_state_dict(checkpoint['scheduler'])
        retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
        retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
        retrain_run_manager.start_epoch = checkpoint['start_epoch']
        logger.log('=> loaded checkpoint file {:} from the last retrain phase, starting at epoch {:}'
                   .format(checkpoint_path, checkpoint['start_epoch']), mode='info')
    else:
        # TODO: from the search phase, read the last arch checkpoint rather than the best one.
        arch_checkpoint_path = os.path.join(args.checkpoint_file, 'checkpoints',
                                            'seed-{:}-arch.pth'.format(args.random_seed))
        assert os.path.exists(arch_checkpoint_path), \
            'cannot find arch_checkpoint file {:} from the search phase'.format(arch_checkpoint_path)
        checkpoint = torch.load(arch_checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        new_genotypes = []
        for _index, genotype in cell_genotypes:
            xlist = []
            for edge_genotype in genotype:
                for (node_str, select_index) in edge_genotype:
                    xlist.append((node_str, conv_candidates[select_index]))
            new_genotypes.append((_index, xlist))
        log_str = 'Obtained actual_path and cell_genotypes:\n' \
                  'Actual_path: {:}\n' \
                  'Genotype:\n'.format(actual_path)
        for _index, genotype in new_genotypes:
            log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
        logger.log(log_str, mode='info')
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier,
                                              args.block_multiplier, args.steps, args.nb_classes,
                                              actual_path, cell_genotypes, args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path, normal_network, logger,
                                         run_config, vis, out_log=True)
        # normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager, logger)
        logger.log('=> constructed NewGumbelAutoDeeplab from the last architecture obtained in the search phase',
                   mode='info')
    # train() performs both training and validation.
    retrain_run_manager.train()
    # Test phase, kept commented out for reference:
    '''
    else:  # test phase
        checkpoint_path = os.path.join(args.resume_file, 'checkpoints',
                                       'seed-{:}-retrain-best.pth'.format(args.random_seed))
        assert os.path.exists(checkpoint_path), \
            'cannot find best checkpoint {:} from the retrain phase'.format(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier,
                                              args.block_multiplier, args.steps, args.nb_classes,
                                              actual_path, cell_genotypes,
                                              args.search_space, affine=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        test_manager = RunManager(args.path, normal_network, logger, run_config,
                                  vis=None, out_log=True)
        display_all_families_information(args, 'retrain', test_manager, logger)
        # save testing configs
        save_configs(args.__dict__, args.path, 'test')
        test_manager.validate(epoch=None, is_test=True, use_train_mode=False)
    '''
    logger.close()
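
# For reference, the 'seed-{seed}-retrain.pth' checkpoint loaded above is assumed
# to carry at least the following keys (inferred from the loads in these scripts;
# illustrative, not an authoritative schema):
#   'state_dict'        -> network weights
#   'weight_optimizer'  -> optimizer state
#   'scheduler'         -> LR-scheduler state
#   'best_monitor'      -> (metric_name, best_value)
#   'start_epoch'       -> epoch to resume from (already incremented by 1)
#   'actual_path'       -> network-level path through the supernetwork
#   'cell_genotypes'    -> per-cell operation choices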
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    print_experiment_environment()
    os.makedirs(args.path, exist_ok=True)
    EXP_time = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, EXP_time)
    create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))

    # Weight-optimizer config, related to the network weight optimizer, scheduler, and criterion.
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None

    if args.scheduler == 'cosine':
        # TODO: add the additional params to args
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {'milestones': args.milestones, 'gammas': args.gammas}
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None

    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    # optimizer_config is consumed by run_manager to build the weight optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.total_epochs,
        'class_num': args.nb_classes,
    }

    # TODO: needs modification
    args.conv_candidates = [
        '3x3_MBConv3', '3x3_MBConv6', '5x5_MBConv3', '5x5_MBConv6',
        '7x7_MBConv3', '7x7_MBConv6', 'Zero',  # 'Identity'
    ]
    run_config = RunConfig(**args.__dict__)

    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None

    # Related to the hardware constraint.
    # TODO: get rid of this
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {'alpha': args.reg_loss_alpha, 'beta': args.reg_loss_beta}
    else:
        args.reg_loss_params = None

    arch_search_config = ArchSearchConfig(**args.__dict__)
    # Save the configs, for run_configs and arch_search_configs.
    save_configs(run_config.config, arch_search_config.config, args.path)
    print('Run Configs:')
    for k, v in run_config.config.items():
        print('\t{}: {}'.format(k, v))
    print('Architecture Search Configs:')
    for k, v in arch_search_config.config.items():
        print('\t{}: {}'.format(k, v))

    # TODO: configs saving
    super_network = GumbelAutoDeepLab(args.filter_multiplier, args.block_multiplier,
                                      args.steps, args.nb_classes, args.nb_layers,
                                      args.conv_candidates)
    arch_search_run_manager = ArchSearchRunManager(args.path, super_network,
                                                   run_config, arch_search_config)
    # TODO: perform resume
    # Warm-up phase.
    if arch_search_run_manager.warmup:
        arch_search_run_manager.warm_up(warmup_epochs=args.warmup_epochs)
    # Train-search phase.
    arch_search_run_manager.train()
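
# The optimizer/scheduler/criterion dispatch above is duplicated verbatim across
# every entry point in this file. A shared helper like this sketch (hypothetical,
# not part of the repo) would remove the duplication; shown for the scheduler case:
def build_scheduler_params(args):
    if args.scheduler == 'cosine':
        return {'T_max': args.T_max, 'eta_min': args.eta_min}
    if args.scheduler == 'multistep':
        return {'milestones': args.milestones, 'gammas': args.gammas}
    if args.scheduler == 'exponential':
        return {'gamma': args.gamma}
    if args.scheduler == 'linear':
        return {'min_lr': args.min_lr}
    return None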
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    # print_experiment_environment()
    EXP_time = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, EXP_time)
    os.makedirs(args.path, exist_ok=True)
    create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # Weight-optimizer config, related to the network weight optimizer, scheduler, and criterion.
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None

    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {'milestones': args.milestones, 'gammas': args.gammas}
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None

    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    # optimizer_config is consumed by run_manager to build the weight optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }

    # TODO: needs modification; not needed in counter_network
    args.conv_candidates = [
        '3x3_MBConv3', '3x3_MBConv6', '5x5_MBConv3', '5x5_MBConv6',
        '7x7_MBConv3', '7x7_MBConv6', 'Zero',  # 'Identity'
    ]
    run_config = RunConfig(**args.__dict__)

    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None

    # Related to the hardware constraint.
    # TODO: get rid of this
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {'alpha': args.reg_loss_alpha, 'beta': args.reg_loss_beta}
    else:
        args.reg_loss_params = None

    arch_search_config = ArchSearchConfig(**args.__dict__)
    # Save the configs, for run_configs and arch_search_configs.
    save_configs(run_config.config, arch_search_config.config, args.path, 'search')
    logger = prepare_logger(args)
    if args.open_vis:
        vis = visdomer(args.port, args.server, args.exp_name,
                       args.compare_phase, args.elements, init_params=None)
    else:
        vis = None

    # The Gumbel supernetwork, kept for reference:
    # super_network = GumbelAutoDeepLab(args.filter_multiplier, args.block_multiplier,
    #                                   args.steps, args.nb_classes, args.nb_layers,
    #                                   args.bn_momentum, args.bn_eps, args.conv_candidates, logger)
    super_network = CounterMBConvNet(2, search_space=args.search_space)
    train_manager = RunManager(args.path, super_network, logger, run_config, vis=vis, out_log=True)
    # Train phase.
    train_manager.train()
    logger.close()
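
# Note: the cudnn flags vary across these entry points, and the retrain scripts
# combine benchmark=True with deterministic=True, which is contradictory in
# PyTorch -- benchmark autotuning may select nondeterministic algorithms. A
# reproducible setup is usually closer to this sketch (an assumption about the
# intended behavior, not a repo API):
def set_reproducible(seed):
    import random
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True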