Example #1
    def load_model(self, checkpoint_file=None):
        # only used in nas_manager
        assert checkpoint_file is not None and os.path.exists(checkpoint_file), \
            'checkpoint_file cannot be found'

        if self.run_manager.out_log:
            print('=' * 30 +
                  '=>\tLoading Checkpoint {}'.format(checkpoint_file))
        if torch.cuda.is_available():
            checkpoint = torch.load(checkpoint_file)
        else:
            checkpoint = torch.load(checkpoint_file, map_location='cpu')

        model_dict = self.net.state_dict()
        model_dict.update(checkpoint['state_dict'])
        self.net.load_state_dict(model_dict)

        # TODO: why is a new manual seed set here?
        new_manual_seed = int(time.time())
        set_manual_seed(new_manual_seed)

        self.start_epoch = checkpoint['start_epochs']
        self.monitor_metric, self.best_monitor = checkpoint['best_monitor']
        self.run_manager.optimizer.load_state_dict(
            checkpoint['weight_optimizer'])
        scheduler_dict = self.run_manager.scheduler.state_dict()
        scheduler_dict.update(checkpoint['weight_scheduler'])
        self.run_manager.scheduler.load_state_dict(scheduler_dict)
        self.arch_optimizer.load_state_dict(checkpoint['arch_optimizer'])
        self.warm_up = checkpoint['warmup']
        if self.run_manager.out_log:
            print('=' * 30 +
                  '=>\tLoaded Checkpoint {}'.format(checkpoint_file))
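
The loader above reads a fixed set of keys from the checkpoint dictionary. For orientation, a matching save routine might look like the sketch below; it is a hypothetical reconstruction inferred only from the keys that load_model consumes, not the project's actual save_model.

    def save_model(self, checkpoint_file, epoch, warm_up):
        # Sketch: produce exactly the keys that load_model reads above.
        checkpoint = {
            'state_dict': self.net.state_dict(),
            'weight_optimizer': self.run_manager.optimizer.state_dict(),
            'weight_scheduler': self.run_manager.scheduler.state_dict(),
            'arch_optimizer': self.arch_optimizer.state_dict(),
            'best_monitor': (self.monitor_metric, self.best_monitor),
            'warmup': warm_up,
            'start_epochs': epoch + 1,  # assumption: resume from the next epoch
        }
        torch.save(checkpoint, checkpoint_file)
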
Example #2
    def load_model(self, checkpoint_file):
        # TODO: get rid of load_model; loading resume_file is already completed in the main function of each phase.
        # only used in run_manager
        assert checkpoint_file is not None and os.path.exists(checkpoint_file), \
            'checkpoint_file cannot be found'
        print('=' * 30 + '=>\tLoading Checkpoint {}'.format(checkpoint_file))
        if torch.cuda.is_available():
            checkpoint = torch.load(checkpoint_file)
        else:
            checkpoint = torch.load(checkpoint_file, map_location='cpu')
        model_dict = self.model.state_dict()
        model_dict.update(checkpoint['state_dict'])
        self.model.load_state_dict(model_dict)

        # TODO: why is a new manual seed set here?
        new_manual_seed = int(time.time())
        set_manual_seed(new_manual_seed)

        self.start_epoch = checkpoint['start_epochs']
        self.monitor_metric, self.best_monitor = checkpoint['best_monitor']
        self.optimizer.load_state_dict(checkpoint['weight_optimizer'])
        scheduler_dict = self.scheduler.state_dict()
        scheduler_dict.update(checkpoint['weight_scheduler'])
        self.scheduler.load_state_dict(scheduler_dict)

        # TODO: something should be loaded in nas_manager, related to warm_up, train-search, and arch_optimizer info

        print('=' * 30 + '=>\tLoaded Checkpoint {}'.format(checkpoint_file))
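
Both loaders use the update-then-load pattern (model_dict.update followed by load_state_dict), which tolerates checkpoints covering only part of the model's state but still raises if the checkpoint contains keys the model lacks. Recent PyTorch versions offer strict=False, which tolerates both directions; a minimal equivalent sketch:

    # Equivalent partial load (sketch): strict=False skips keys missing from the
    # checkpoint and ignores keys the model does not have.
    result = self.model.load_state_dict(checkpoint['state_dict'], strict=False)
    # result.missing_keys / result.unexpected_keys can be logged if needed.
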
Example #3
def main(args):

    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # if resume is True, resume configs and checkpoint from the existing files.
    if args.search_resume:
        # args.resume_file is the path to .../EXP-time
        # resume the experiment in a new directory, rather than the same one.
        # resume configs
        assert os.path.exists(
            args.resume_file
        ), 'cannot find the resume file {:}, please re-check'.format(
            args.resume_file)
        config_file_path = os.path.join(args.resume_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), 'the config file path {:} does not exist'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'search')
        # new EXP file initialize
        resume_EXP_time = config_dict['path'].split('/')[-1]
        resume_exp_name = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name, EXP_time +
            '-resume-{:}'.format(resume_exp_name + '-' + resume_EXP_time))
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
        #save_configs(args.__dict__, args.path, 'search')
        #logger = prepare_logger(args)
        #logger.log("=> loading configs from the file '{:}' start.".format(args.resume_file), mode='info')
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
    else:
        # training initialization
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
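    # An aside: this scheduler dispatch recurs almost verbatim in the later
    # examples and could be collapsed into a table-driven lookup. A sketch,
    # assuming the same args attributes as the chain above:
    #
    #     scheduler_param_builders = {
    #         'cosine': lambda a: {'T_max': a.T_max, 'eta_min': a.eta_min},
    #         'multistep': lambda a: {'milestones': a.milestones, 'gammas': a.gammas},
    #         'exponential': lambda a: {'gamma': a.gamma},
    #         'linear': lambda a: {'min_lr': a.min_lr},
    #     }
    #     builder = scheduler_param_builders.get(args.scheduler)
    #     scheduler_params = builder(args) if builder is not None else None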
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None
    # related to entropy constraint loss
    # TODO: pay attention, use separate lambda for cell_entropy and network_entropy.
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'add#linear#linearschedule':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None
    # perform config save, for run_configs and arch_search_configs
    save_configs(args.__dict__, args.path, 'search')
    logger = prepare_logger(args)
    logger.log("=> loading configs from the file '{:}' start.".format(
        args.resume_file) if args.search_resume else
               '=> train-search phase initialization done',
               mode='info')

    #print(args.optimizer_config)
    run_config = RunConfig(**args.__dict__)
    arch_search_config = ArchSearchConfig(**args.__dict__)

    # args.bn_momentum and args.bn_eps are not used

    super_network = GumbelAutoDeepLab(args.filter_multiplier,
                                      args.block_multiplier,
                                      args.steps,
                                      args.nb_classes,
                                      args.nb_layers,
                                      args.bn_momentum,
                                      args.bn_eps,
                                      args.search_space,
                                      logger,
                                      affine=False)

    # calculate init entropy
    _, network_index = super_network.get_network_arch_hardwts()  # set self.hardwts again
    _, aspp_index = super_network.get_aspp_hardwts_index()
    single_path = super_network.sample_single_path(args.nb_layers, aspp_index,
                                                   network_index)
    cell_arch_entropy, network_arch_entropy, entropy = super_network.calculate_entropy(
        single_path)

    logger.log('=> entropy : {:}'.format(entropy), mode='info')

    vis_init_params = {
        'cell_entropy': cell_arch_entropy,
        'network_entropy': network_arch_entropy,
        'entropy': entropy,
    }
    #vis_elements = args.elements
    #vis_elements.extend(['cell_entropy', 'network_entropy', 'entropy'])
    #args.elements = vis_elements
    args.vis_init_params = vis_init_params
    if args.open_vis:
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=args.vis_init_params)
    else:
        vis = None
    '''
    from exp.autodeeplab.auto_deeplab import AutoDeeplab
    super_network = AutoDeeplab(args.filter_multiplier, args.block_multiplier, args.steps,
                                args.nb_classes, args.nb_layers, args.search_space, logger, affine=False)
    '''
    '''
    from exp.fixed_network_level.supernetwork import FixedNetwork
    super_network = FixedNetwork(args.filter_multiplier, args.block_multiplier, args.steps, args.nb_classes,
                                 args.nb_layers, args.search_space, logger, affine=False)
    '''
    arch_search_run_manager = ArchSearchRunManager(args.path, super_network,
                                                   run_config,
                                                   arch_search_config, logger,
                                                   vis)
    display_all_families_information(args, 'search', arch_search_run_manager,
                                     logger)
    '''
    # get_model_infos, perform inference
    # TODO: modify the way of forward into gdas_forward
    flop, param = get_model_infos(super_network, [1, 3, 512, 512])
    print('||||||| FLOPS & PARAMS |||||||')
    print('FLOP = {:.2f} M, Params = {:.2f} MB'.format(flop, param))
    '''
    # 1. resume warmup phase
    # 2. resume search phase
    # 3. add last_info log × not last_info; the saved file name differs between runs, so resume_file should be given explicitly

    # 1. given EXP file time           completed :: resume_file :: ->EXP-time
    # 2. get configs, and load config  completed
    # 3. resume checkpoint             completed

    # TODO: there is an issue with resume semantics: after resume, more GPU memory is allocated than in a normal run, which can raise OOM in the search phase.

    if args.search_resume:
        if os.path.exists(args.resume_file):  # resume_file :: path to EXP-time
            logger.log("=> loading checkpoint of the file '{:}' start".format(
                args.resume_file),
                       mode='info')
            warm_up_checkpoint = os.path.join(
                args.resume_file, 'checkpoints',
                'seed-{:}-warm.pth'.format(args.random_seed))
            search_checkpoint = os.path.join(
                args.resume_file, 'checkpoints',
                'seed-{:}-search.pth'.format(args.random_seed))
            if not args.resume_from_warmup:  # resume checkpoint in the search phase
                checkpoint = torch.load(search_checkpoint)
                super_network.load_state_dict(checkpoint['state_dict'])
                arch_search_run_manager.run_manager.optimizer.load_state_dict(
                    checkpoint['weight_optimizer'])
                arch_search_run_manager.run_manager.scheduler.load_state_dict(
                    checkpoint['weight_scheduler'])
                arch_search_run_manager.arch_optimizer.load_state_dict(
                    checkpoint['arch_optimizer'])
                arch_search_run_manager.run_manager.monitor_metric = checkpoint[
                    'best_monitor'][0]
                arch_search_run_manager.run_manager.best_monitor = checkpoint[
                    'best_monitor'][1]
                arch_search_run_manager.warmup = checkpoint['warmup']
                arch_search_run_manager.start_epoch = checkpoint[
                    'start_epochs']  # note: start_epochs and warmup_epoch live in nas_manager
                logger.log(
                    "=> loaded checkpoint file '{:}', resuming at epoch {:} in the search phase"
                    .format(search_checkpoint, checkpoint['start_epochs']),
                    mode='info')
            else:  # resume checkpoint in warmup phase
                checkpoint = torch.load(warm_up_checkpoint)
                super_network.load_state_dict(checkpoint['state_dict'])
                arch_search_run_manager.run_manager.optimizer.load_state_dict(
                    checkpoint['weight_optimizer'])
                arch_search_run_manager.run_manager.scheduler.load_state_dict(
                    checkpoint['weight_scheduler'])
                arch_search_run_manager.warmup = checkpoint['warmup']
                arch_search_run_manager.warmup_epoch = checkpoint[
                    'warmup_epoch']
                logger.log(
                    "=> loaded checkpoint file '{:}', resuming at epoch {:} in the warm-up phase"
                    .format(warm_up_checkpoint, checkpoint['warmup_epoch']),
                    mode='info')
        else:
            logger.log(
                "=> cannot find the file {:}, please re-confirm it\n"
                "=> starting warm-up and search from scratch ...".format(
                    args.resume_file),
                mode='info')
    else:
        logger.log("=> starting warm-up and search from scratch ...",
                   mode='info')

    # torch.autograd.set_detect_anomaly(True)
    # warm up phase
    if arch_search_run_manager.warmup:
        arch_search_run_manager.warm_up(warmup_epochs=args.warmup_epochs)
    # train search phase
    arch_search_run_manager.train()

    logger.close()
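
The resume branch above reads two checkpoint files whose layout can be inferred from the keys it accesses. Summarized as a sketch (the writing side is not part of this listing):

    # Inferred checkpoint layout (derived only from the keys read above):
    # <resume_file>/checkpoints/seed-<seed>-warm.pth:
    #     state_dict, weight_optimizer, weight_scheduler, warmup, warmup_epoch
    # <resume_file>/checkpoints/seed-<seed>-search.pth:
    #     state_dict, weight_optimizer, weight_scheduler, arch_optimizer,
    #     best_monitor (metric name, best value), warmup, start_epochs
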
Example #4
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume and not args.evaluation:  # resume from the last retrain
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the last retrain phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict,
                       'retrain')  # config resume from the last retrain
        # get EXP_time in last_retrain for flag
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        Exp_name_last_retrain = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)  # from the last retrain.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    elif not args.retrain_resume and args.evaluation:
        config_file_path = os.path.join(args.evaluation_ckpt, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the best checkpoint'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')
        EXP_time_best_checkpoint = config_dict['path'].split('/')[-1]
        EXP_name_best_checkpoint = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(args.random_seed)
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    elif not args.retrain_resume and not args.evaluation:
        # resume from the search phase.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the search phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        args.random_seed = config_dict['random_seed']  # get random_seed
        # get EXP_time in search phase, for flag
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_name_search = config_dict['path'].split('/')[-2]
        EXP_time = time_for_file()
        args.path = os.path.join(args.path, args.exp_name, EXP_time)
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')
    else:
        raise NotImplementedError(
            'invalid mode: retrain_resume {:} and evaluation {:}'.format(
                args.retrain_resume, args.evaluation))
    # optimizer params
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    # scheduler params
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    # criterion params
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None

    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
    if args.search_space == 'autodeeplab':
        conv_candidates = autodeeplab
    elif args.search_space == 'proxyless':
        conv_candidates = proxyless
    elif args.search_space == 'counter':
        conv_candidates = counter
    elif args.search_space == 'my_search_space':
        conv_candidates = my_search_space
    else:
        raise ValueError('search_space : {:} is not supported'.format(
            args.search_space))

    # related to entropy constraint loss
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2
        }
    elif args.reg_loss_type == 'add#linear#linearschedule':
        args.reg_loss_params = {
            'lambda1': args.reg_loss_lambda1,
            'lambda2': args.reg_loss_lambda2,
        }
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    logger = prepare_logger(args)
    if args.retrain_resume and not args.evaluation:
        logger.log(
            '=> loading configs {:} from the last retrain phase.'.format(
                config_file_path), 'info')
    elif not args.retrain_resume and args.evaluation:
        logger.log(
            '=> loading configs {:} from the best retrain phase.'.format(
                config_file_path), 'info')
    elif not args.retrain_resume and not args.evaluation:
        logger.log(
            '=> loading configs {:} from the search phase.'.format(
                config_file_path), 'info')

    # save new config, and create logger.
    save_configs(args.__dict__, args.path, 'retrain')
    # create run_config
    run_config = RunConfig(**args.__dict__)

    # only open_vis in the retrain phase
    if args.open_vis:
        assert not args.evaluation, 'invalid mode: open_vis {:} and evaluation {:}'.format(
            args.open_vis, args.evaluation)
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None

    #print(args.evaluation)

    if args.evaluation:
        assert os.path.exists(args.evaluation_ckpt
                              ), 'cannot find the best checkpoint {:}'.format(
                                  args.evaluation_ckpt)
        checkpoint_path = os.path.join(
            args.evaluation_ckpt, 'checkpoints',
            'seed-{:}-retrain-best.pth'.format(args.random_seed))
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        #print(actual_path)
        #print(cell_genotypes)
        '''
        my_search_space = [
                '3x3_SepFacConv1', '5x5_SepFacConv1',
                '3x3_SepFacConv2', '5x5_SepFacConv2',
                '3x3_SepFacConv4', '5x5_SepFacConv4',]
        '''

        # 0:4 1:4 2:5 3:5 4:4 5:2
        actual_path = [0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2]
        cell_genotypes = [(0, [[('2<-1', 0), ('2<-0', 3)]]),
                          (2, [[('2<-1', 4), ('2<-0', 1)]]),
                          (7, [[('2<-1', 3), ('2<-0', 0)]]),
                          (15, [[('2<-1', 1), ('2<-0', 2)]]),
                          (27, [[('2<-1', 4), ('2<-0', 3)]]),
                          (38, [[('2<-1', 4), ('2<-0', 0)]]),
                          (48, [[('2<-1', 2), ('2<-0', 5)]]),
                          (60, [[('2<-1', 0), ('2<-0', 1)]]),
                          (73, [[('2<-0', 3), ('2<-1', 3)]]),
                          (84, [[('2<-1', 2), ('2<-0', 1)]]),
                          (94, [[('2<-1', 4), ('2<-0', 2)]]),
                          (102, [[('2<-1', 2), ('2<-0', 5)]])]
        '''
        actual_path = [0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 2, 1]
        cell_genotypes = [(0, [[('2<-1', 4), ('2<-0', 5)]]), (2, [[('2<-1', 3), ('2<-0', 1)]]),
                          (7, [[('2<-1', 2), ('2<-0', 5)]]), (17, [[('2<-0', 1), ('2<-1', 1)]]),
                          (28, [[('2<-1', 4), ('2<-0', 1)]]), (38, [[('2<-1', 4), ('2<-0', 2)]]),
                          (50, [[('2<-1', 5), ('2<-0', 1)]]), (63, [[('2<-1', 4), ('2<-0', 2)]]),
                          (74, [[('2<-1', 1), ('2<-0', 0)]]), (84, [[('2<-1', 3), ('2<-0', 1)]]),
                          (92, [[('2<-1', 4), ('2<-0', 5)]]), (99, [[('2<-1', 0), ('2<-0', 3)]])]
        '''

        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)

        # save new config, and create logger.
        #save_configs(args.__dict__, args.path, 'retrain')
        # create run_config
        #run_config = RunConfig(**args.__dict__)

        evaluation_run_manager = RunManager(args.path,
                                            normal_network,
                                            logger,
                                            run_config,
                                            vis,
                                            out_log=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain',
                                         evaluation_run_manager, logger)
        logger.log(
            '=> loaded the best checkpoint from {:}, start evaluation'.format(
                checkpoint_path), 'info')

        evaluation_run_manager.validate(is_test=True, use_train_mode=False)

    else:
        # resume from the last retrain
        if args.retrain_resume:
            logger.log(
                '=> Loading checkpoint from {:} of the last retrain phase'.
                format(args.resume_file),
                mode='info')
            # checkpoint_file from the last retrain phase.
            checkpoint_path = os.path.join(
                args.resume_file, 'checkpoints',
                'seed-{:}-retrain.pth'.format(args.random_seed))
            assert os.path.exists(
                checkpoint_path
            ), 'cannot find retrain checkpoint file {:}'.format(
                checkpoint_path)
            checkpoint = torch.load(checkpoint_path)
            actual_path, cell_genotypes = checkpoint[
                'actual_path'], checkpoint['cell_genotypes']
            args.actual_path = actual_path
            args.cell_genotypes = cell_genotypes
            normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                                  args.filter_multiplier,
                                                  args.block_multiplier,
                                                  args.steps,
                                                  args.nb_classes,
                                                  actual_path,
                                                  cell_genotypes,
                                                  args.search_space,
                                                  affine=True)
            flop, param = get_model_infos(normal_network, [1, 3, 512, 512])
            logger.log(
                '|#################### Network Info ####################|\n'
                'FLOPs:{:.2f} M,     Params:{:.2f} MB'.format(flop, param),
                mode='info')

            # save new config, and create logger.
            #save_configs(args.__dict__, args.path, 'retrain')
            # create run_config
            #run_config = RunConfig(**args.__dict__)

            retrain_run_manager = RunManager(args.path,
                                             normal_network,
                                             logger,
                                             run_config,
                                             vis,
                                             out_log=True)
            normal_network.load_state_dict(checkpoint['state_dict'])
            display_all_families_information(args, 'retrain',
                                             retrain_run_manager, logger)
            retrain_run_manager.optimizer.load_state_dict(
                checkpoint['weight_optimizer'])
            retrain_run_manager.scheduler.load_state_dict(
                checkpoint['scheduler'])
            retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
            retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
            retrain_run_manager.start_epoch = checkpoint[
                'start_epoch']  # already incremented by 1 when saved
            logger.log(
                '=> loaded checkpoint file {:} from the last retrain phase, resuming at epoch {:}'
                .format(checkpoint_path, checkpoint['start_epoch']),
                mode='info')
        else:
            # from the search phase, load the optimal architecture and perform retrain.
            arch_checkpoint_path = os.path.join(
                args.checkpoint_file, 'checkpoints',
                'seed-{:}-arch-best.pth'.format(args.random_seed))

            # TODO: the best epoch has been obtained in advance.
            #checkpoint_path = os.path.join(args.checkpoint_file, 'checkpoints', 'seed-{:}-search-best.pth'.format(args.random_seed))
            #tmp_checkpoint = torch.load(checkpoint_path)
            #best_epoch = tmp_checkpoint['start_epochs'] - 1
            #logger.log('=> best epochs: {:}'.format(best_epoch), mode='info') # get the best_epoch

            assert os.path.exists(
                arch_checkpoint_path
            ), 'cannot find arch_checkpoint file {:} from search phase'.format(
                arch_checkpoint_path)
            checkpoint = torch.load(arch_checkpoint_path)
            actual_path, cell_genotypes = checkpoint[
                'actual_path'], checkpoint['cell_genotypes']
            new_genotypes = []
            for _index, genotype in cell_genotypes:
                xlist = []
                for edge_genotype in genotype:
                    for (node_str, select_index) in edge_genotype:
                        xlist.append((node_str, conv_candidates[select_index]))
                new_genotypes.append((_index, xlist))
            log_str = 'Obtained actual_path and cell_genotypes:\n' \
                      'Actual_path: {:}\n' \
                      'Genotype:\n'.format(actual_path)
            for _index, genotype in new_genotypes:
                log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
            logger.log(log_str, mode='info')
            args.actual_path = actual_path
            args.cell_genotypes = cell_genotypes
            normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                                  args.filter_multiplier,
                                                  args.block_multiplier,
                                                  args.steps,
                                                  args.nb_classes,
                                                  actual_path,
                                                  cell_genotypes,
                                                  args.search_space,
                                                  affine=True)

            flop, param = get_model_infos(normal_network, [1, 3, 512, 512])
            logger.log(
                '|#################### Network Info ####################|\n'
                'FLOPs:{:.2f} M,     Params:{:.2f} MB'.format(flop, param),
                mode='info')

            # save new config, and create logger.
            #save_configs(args.__dict__, args.path, 'retrain')
            # create run_config
            #run_config = RunConfig(**args.__dict__)

            retrain_run_manager = RunManager(args.path,
                                             normal_network,
                                             logger,
                                             run_config,
                                             vis,
                                             out_log=True)
            #normal_network.load_state_dict(checkpoint['state_dict'])
            display_all_families_information(args, 'retrain',
                                             retrain_run_manager, logger)
            logger.log(
                '=> Construct NewGumbelAutoDeeplab according to the last-arch obtained from search phase',
                mode='info')

        # perform train and validation in train() method
        retrain_run_manager.train()

    logger.close()
    # args.path -->| save_path: saved ckpts: latest.txt, checkpoint-epoch.pth.tar, checkpoint-best.pth.tar,
    #              |                         checkpoint-warmup.pth.tar
    #              | log_path: saved logs: net_info.txt, net.config, run.config,
    #              |                       gradient_search.txt, arch_txt, train_console.txt, valid_test_console.txt
    #              | prediction: saved predictions
    #              | learned_net: normal network configs: net.config, run.config, init
    #              | <dataset>_classes_weights.npy
Example #5
    # Note: get rid of logging print.
    args = obtain_train_search_args()



    set_manual_seed(args.random_seed)

    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    print_experiment_environment()

    #os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_ids)
    os.makedirs(args.path, exist_ok=True)

    EXP_time = time_for_file()
    # /home/jingweipeng/ljb/Jingbo.TTB/proxy_auto_deeplab/exp_time
    args.path = os.path.join(args.path, args.exp_name, EXP_time)

    # save experiment scripts
Example #6
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    if args.retrain_resume:
        config_file_path = os.path.join(args.resume_file, 'retrain.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the last retrain phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        configs_resume(args, config_dict, 'retrain')
        # get EXP_time in last_retrain for flag
        EXP_time_last_retrain = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_last_retrain))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> loading configs {:} from the last retrain phase.'.format(
                config_file_path),
            mode='info')
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search space {:} is not supported'.format(
                args.search_space))
    else:
        # resume partial configs setting and arch_checkpoint from the search phase by default.
        config_file_path = os.path.join(args.checkpoint_file, 'search.config')
        assert os.path.exists(
            config_file_path
        ), 'cannot find config_file {:} from the search phase'.format(
            config_file_path)
        with open(config_file_path, 'r') as f:
            config_dict = json.load(f)
        args.random_seed = config_dict['random_seed']
        # get EXP_time in search phase, for flag
        EXP_time_search = config_dict['path'].split('/')[-1]
        EXP_time = time_for_file()
        args.path = os.path.join(
            args.path, args.exp_name,
            EXP_time + '-resume-{:}'.format(EXP_time_search))
        torch.set_num_threads(args.workers)
        set_manual_seed(
            args.random_seed)  # from the last retrain phase or search phase.
        os.makedirs(args.path, exist_ok=True)
        create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))
        save_configs(args.__dict__, args.path, 'retrain')
        logger = prepare_logger(args)
        logger.log(
            '=> starting retrain from the search phase config {:}.'.format(
                config_file_path),
            mode='info')

        # optimizer params
        if args.weight_optimizer_type == 'SGD':
            weight_optimizer_params = {
                'momentum': args.momentum,
                'nesterov': args.nesterov,
                'weight_decay': args.weight_decay,
            }
        elif args.weight_optimizer_type == 'RMSprop':
            weight_optimizer_params = {
                'momentum': args.momentum,
                'weight_decay': args.weight_decay,
            }
        else:
            weight_optimizer_params = None
        # scheduler params
        if args.scheduler == 'cosine':
            scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
        elif args.scheduler == 'multistep':
            scheduler_params = {
                'milestones': args.milestones,
                'gammas': args.gammas
            }
        elif args.scheduler == 'exponential':
            scheduler_params = {'gamma': args.gamma}
        elif args.scheduler == 'linear':
            scheduler_params = {'min_lr': args.min_lr}
        else:
            scheduler_params = None
        # criterion params
        if args.criterion == 'SmoothSoftmax':
            criterion_params = {'label_smooth': args.label_smoothing}
        else:
            criterion_params = None

        args.optimizer_config = {
            'optimizer_type': args.weight_optimizer_type,
            'optimizer_params': weight_optimizer_params,
            'scheduler': args.scheduler,
            'scheduler_params': scheduler_params,
            'criterion': args.criterion,
            'criterion_params': criterion_params,
            'init_lr': args.init_lr,
            'epochs': args.epochs,
            'class_num': args.nb_classes,
        }
        if args.search_space == 'autodeeplab':
            conv_candidates = autodeeplab
        elif args.search_space == 'proxyless':
            conv_candidates = proxyless
        elif args.search_space == 'counter':
            conv_candidates = counter
        elif args.search_space == 'my_search_space':
            conv_candidates = my_search_space
        else:
            raise ValueError('search_space : {:} is not supported'.format(
                args.search_space))

        # related to entropy constraint loss
        if args.reg_loss_type == 'add#linear':
            args.reg_loss_params = {'lambda': args.reg_loss_lambda}
        elif args.reg_loss_type == 'mul#log':
            args.reg_loss_params = {
                'alpha': args.reg_loss_alpha,
                'beta': args.reg_loss_beta
            }
        else:
            args.reg_loss_params = None

    # create run_config
    run_config = RunConfig(**args.__dict__)

    #if args.open_test == False: # retrain and validate
    if args.open_vis:  # only open_vis in re-train phase, rather than both re-train and test.
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None
    if args.retrain_resume:
        logger.log(
            '=> Loading checkpoint from {:} of the last retrain phase'.format(
                args.resume_file),
            mode='info')
        # checkpoint_file from the last retrain phase.
        checkpoint_path = os.path.join(
            args.resume_file, 'checkpoints',
            'seed-{:}-retrain.pth'.format(args.random_seed))
        assert os.path.exists(
            checkpoint_path), 'cannot find retrain checkpoint file {:}'.format(
                checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        retrain_run_manager.optimizer.load_state_dict(
            checkpoint['weight_optimizer'])
        retrain_run_manager.scheduler.load_state_dict(checkpoint['scheduler'])
        retrain_run_manager.monitor_metric = checkpoint['best_monitor'][0]
        retrain_run_manager.best_monitor = checkpoint['best_monitor'][1]
        retrain_run_manager.start_epoch = checkpoint['start_epoch']
        logger.log(
            '=> loaded checkpoint file {:} from the last retrain phase, resuming at epoch {:}'
            .format(checkpoint_path, checkpoint['start_epoch']),
            mode='info')
    else:
        # TODO: from the search phase, read the last arch_checkpoint rather than the best one.
        arch_checkpoint_path = os.path.join(
            args.checkpoint_file, 'checkpoints',
            'seed-{:}-arch.pth'.format(args.random_seed))
        assert os.path.exists(
            arch_checkpoint_path
        ), 'cannot find arch_checkpoint file {:} from search phase'.format(
            arch_checkpoint_path)
        checkpoint = torch.load(arch_checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint[
            'cell_genotypes']
        new_genotypes = []
        for _index, genotype in cell_genotypes:
            xlist = []
            for edge_genotype in genotype:
                for (node_str, select_index) in edge_genotype:
                    xlist.append((node_str, conv_candidates[select_index]))
            new_genotypes.append((_index, xlist))
        log_str = 'Obtained actual_path and cell_genotypes:\n' \
                  'Actual_path: {:}\n' \
                  'Genotype:\n'.format(actual_path)
        for _index, genotype in new_genotypes:
            log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
        logger.log(log_str, mode='info')
        args.actual_path = actual_path
        args.cell_genotypes = cell_genotypes
        normal_network = NewGumbelAutoDeeplab(args.nb_layers,
                                              args.filter_multiplier,
                                              args.block_multiplier,
                                              args.steps,
                                              args.nb_classes,
                                              actual_path,
                                              cell_genotypes,
                                              args.search_space,
                                              affine=True)
        retrain_run_manager = RunManager(args.path,
                                         normal_network,
                                         logger,
                                         run_config,
                                         vis,
                                         out_log=True)
        #normal_network.load_state_dict(checkpoint['state_dict'])
        display_all_families_information(args, 'retrain', retrain_run_manager,
                                         logger)
        logger.log(
            '=> Construct NewGumbelAutoDeeplab according to the last-arch obtained from search phase',
            mode='info')
    # perform train and validation in train() method
    retrain_run_manager.train()
    '''
    else: # test phase
        checkpoint_path = os.path.join(args.resume_file, 'checkpoints', 'seed-{:}-retrain-best.pth'.format(args.random_seed))
        assert os.path.exists(checkpoint_path), 'cannot find best checkpoint {:} from the retrain phase'.format(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        actual_path, cell_genotypes = checkpoint['actual_path'], checkpoint['cell_genotypes']
        normal_network = NewGumbelAutoDeeplab(args.nb_layers, args.filter_multiplier, args.block_multiplier,
                                              args.steps, args.nb_classes, actual_path, cell_genotypes, args.search_space, affine=True)
        normal_network.load_state_dict(checkpoint['state_dict'])
        test_manager = RunManager(args.path, normal_network, logger, run_config, vis=None, out_log=True)
        display_all_families_information(args, 'retrain', test_manager, logger)

        # save testing configs
        save_configs(args.__dict__, args.path, 'test')
        test_manager.validate(epoch=None, is_test=True, use_train_mode=False)
    '''
    logger.close()
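
The genotype-decoding loop in the else branch above maps each selected operation index through conv_candidates. Extracted as a stand-alone helper, the same logic would read as follows (a sketch; the project keeps it inline):

    def decode_cell_genotypes(cell_genotypes, conv_candidates):
        # Translate (node_str, op_index) pairs into (node_str, op_name) pairs.
        decoded = []
        for _index, genotype in cell_genotypes:
            xlist = [(node_str, conv_candidates[select_index])
                     for edge_genotype in genotype
                     for (node_str, select_index) in edge_genotype]
            decoded.append((_index, xlist))
        return decoded
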
Example #7
def main(args):

    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    print_experiment_environment()
    os.makedirs(args.path, exist_ok=True)
    EXP_time = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, EXP_time)
    create_exp_dir(args.path, scripts_to_save=glob.glob('./*/*.py'))

    # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    if args.scheduler == 'cosine':
        # TODO: add additional params in args
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.total_epochs,
        'class_num': args.nb_classes,
    }
    # TODO: needs modification
    args.conv_candidates = [
        '3x3_MBConv3',
        '3x3_MBConv6',
        '5x5_MBConv3',
        '5x5_MBConv6',
        '7x7_MBConv3',
        '7x7_MBConv6',
        'Zero',  #'Identity'
    ]
    run_config = RunConfig(**args.__dict__)
    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None
    # related to hardware constraint
    # TODO: get rid of this
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None
    arch_search_config = ArchSearchConfig(**args.__dict__)
    # perform config save, for run_configs and arch_search_configs
    save_configs(run_config.config, arch_search_config.config, args.path)

    print('Run Configs:')
    for k, v in run_config.config.items():
        print('\t{}: {}'.format(k, v))
    print('Architecture Search Configs:')
    for k, v in arch_search_config.config.items():
        print('\t{}: {}'.format(k, v))
    # TODO: configs saving
    super_network = GumbelAutoDeepLab(args.filter_multiplier,
                                      args.block_multiplier, args.steps,
                                      args.nb_classes, args.nb_layers,
                                      args.conv_candidates)

    arch_search_run_manager = ArchSearchRunManager(args.path, super_network,
                                                   run_config,
                                                   arch_search_config)

    # TODO: perform resume

    # warm up phase
    if arch_search_run_manager.warmup:
        arch_search_run_manager.warm_up(warmup_epochs=args.warmup_epochs)
    # train search phase
    arch_search_run_manager.train()
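
args.optimizer_config bundles everything run_manager needs to construct the weight optimizer, scheduler, and criterion. A plausible consumer is sketched below; build_weight_optimizer is hypothetical and only illustrates how the params dicts could feed torch.optim (the real RunConfig/RunManager code is not part of this listing):

    import torch.optim as optim

    def build_weight_optimizer(net, config):
        # Hypothetical consumer of args.optimizer_config.
        params = config['optimizer_params'] or {}
        if config['optimizer_type'] == 'SGD':
            return optim.SGD(net.parameters(), lr=config['init_lr'], **params)
        elif config['optimizer_type'] == 'RMSprop':
            return optim.RMSprop(net.parameters(), lr=config['init_lr'], **params)
        raise ValueError('unsupported optimizer: {:}'.format(config['optimizer_type']))
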
Example #8
def main(args):

    assert torch.cuda.is_available(), 'CUDA is not available'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(args.workers)
    set_manual_seed(args.random_seed)
    #print_experiment_environment()
    EXP_time = time_for_file()
    args.path = os.path.join(args.path, args.exp_name, EXP_time)
    os.makedirs(args.path, exist_ok=True)
    create_exp_dir(args.path, scripts_to_save='../Efficient_AutoDeeplab')

    # weight optimizer config, related to network_weight_optimizer, scheduler, and criterion
    if args.weight_optimizer_type == 'SGD':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'nesterov': args.nesterov,
            'weight_decay': args.weight_decay,
        }
    elif args.weight_optimizer_type == 'RMSprop':
        weight_optimizer_params = {
            'momentum': args.momentum,
            'weight_decay': args.weight_decay,
        }
    else:
        weight_optimizer_params = None
    if args.scheduler == 'cosine':
        scheduler_params = {'T_max': args.T_max, 'eta_min': args.eta_min}
    elif args.scheduler == 'multistep':
        scheduler_params = {
            'milestones': args.milestones,
            'gammas': args.gammas
        }
    elif args.scheduler == 'exponential':
        scheduler_params = {'gamma': args.gamma}
    elif args.scheduler == 'linear':
        scheduler_params = {'min_lr': args.min_lr}
    else:
        scheduler_params = None
    if args.criterion == 'SmoothSoftmax':
        criterion_params = {'label_smooth': args.label_smoothing}
    else:
        criterion_params = None
    # weight_optimizer_config, used in run_manager to get weight_optimizer, scheduler, and criterion.
    args.optimizer_config = {
        'optimizer_type': args.weight_optimizer_type,
        'optimizer_params': weight_optimizer_params,
        'scheduler': args.scheduler,
        'scheduler_params': scheduler_params,
        'criterion': args.criterion,
        'criterion_params': criterion_params,
        'init_lr': args.init_lr,
        'warmup_epoch': args.warmup_epochs,
        'epochs': args.epochs,
        'class_num': args.nb_classes,
    }
    # TODO: needs modification; not needed in counter_network
    args.conv_candidates = [
        '3x3_MBConv3',
        '3x3_MBConv6',
        '5x5_MBConv3',
        '5x5_MBConv6',
        '7x7_MBConv3',
        '7x7_MBConv6',
        'Zero',  #'Identity'
    ]
    run_config = RunConfig(**args.__dict__)
    # arch_optimizer_config
    if args.arch_optimizer_type == 'adam':
        args.arch_optimizer_params = {
            'betas': (args.arch_adam_beta1, args.arch_adam_beta2),
            'eps': args.arch_adam_eps
        }
    else:
        args.arch_optimizer_params = None

    # related to hardware constraint
    # TODO: get rid of this
    if args.reg_loss_type == 'add#linear':
        args.reg_loss_params = {'lambda': args.reg_loss_lambda}
    elif args.reg_loss_type == 'mul#log':
        args.reg_loss_params = {
            'alpha': args.reg_loss_alpha,
            'beta': args.reg_loss_beta
        }
    else:
        args.reg_loss_params = None

    arch_search_config = ArchSearchConfig(**args.__dict__)
    # perform config save, for run_configs and arch_search_configs
    save_configs(run_config.config, arch_search_config.config, args.path,
                 'search')
    logger = prepare_logger(args)
    if args.open_vis:
        vis = visdomer(args.port,
                       args.server,
                       args.exp_name,
                       args.compare_phase,
                       args.elements,
                       init_params=None)
    else:
        vis = None
    '''
    super_network = GumbelAutoDeepLab(
        args.filter_multiplier, args.block_multiplier, args.steps,
        args.nb_classes, args.nb_layers, args.bn_momentum, args.bn_eps, args.conv_candidates, logger
    )
    '''
    super_network = CounterMBConvNet(2, search_space=args.search_space)
    train_manager = RunManager(args.path,
                               super_network,
                               logger,
                               run_config,
                               vis=vis,
                               out_log=True)
    # train search phase
    train_manager.train()
    logger.close()
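
Each of these main functions is presumably driven by an argument-parsing entry point, as the fragment in Example #5 suggests with obtain_train_search_args. A minimal sketch of such an entry point (the parser itself lives elsewhere in the repository):

    if __name__ == '__main__':
        args = obtain_train_search_args()  # assumed to return an argparse-style namespace
        main(args)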