Example #1
def main(args):
    file_name = 'log_%s_%d' % ('gpus', args.gpu)
    logger = setup_logger(file_name,
                          args.save_dir,
                          args.gpu,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    logger.info(args)
    if args.search_space == 'darts':
        with open(args.darts_file_path, 'rb') as f:
            if args.darts_training_nums:
                all_data = pickle.load(f)[:args.darts_training_nums]
            else:
                all_data = pickle.load(f)
    else:
        nasbench_datas = data.build_datasets(args)
        all_data = data.dataset_all(args, nasbench_datas)
    for predictor in args.predictor_list:
        logger.info(
            f'==================  predictor type: {predictor}  ======================'
        )
        predictor_unsupervised(args,
                               predictor,
                               all_data,
                               train_epochs=args.epochs,
                               logger=logger)
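
The helper setup_logger is not shown in these snippets; the sketch below only illustrates the assumed interface (name, save directory, rank/gpu id, log level, file name) and is not the repository's actual implementation.

import logging
import os


def setup_logger_sketch(name, save_dir, rank, log_level='DEBUG', filename=None):
    # Build a logger that writes to stdout and, when a file name is given,
    # also to save_dir/filename.
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, log_level))
    formatter = logging.Formatter(
        '%(asctime)s %(name)s %(levelname)s: %(message)s')
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)
    if filename is not None:
        os.makedirs(save_dir, exist_ok=True)
        file_handler = logging.FileHandler(os.path.join(save_dir, filename))
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
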
def data_consumers(args, q, save_dir, i, search_space):
    # Seed each consumer differently: take the integer seconds of the current
    # time, reverse the digits and keep the first nine of them.
    set_random_seed(int(str(time.time()).split('.')[0][::-1][:9]))
    file_name = 'log_%s_%d' % ('gpus', i)
    logger = setup_logger(file_name,
                          save_dir,
                          i,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    while True:
        msg = q.get()
        if msg == 'done':
            logger.info('thread %d ended' % i)
            break
        iterations = msg['iterate']
        run_experiments_bananas_paradigm(args, save_dir, i, iterations, logger,
                                         search_space)
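
data_consumers is written as a queue consumer that runs until it receives the 'done' sentinel. The producer side could look roughly like the sketch below; the payload key 'iterate' comes from the consumer code above, everything else (names, process count) is illustrative.

import multiprocessing


def run_consumers_sketch(args, num_workers, num_iterations):
    q = multiprocessing.Queue()
    workers = [
        multiprocessing.Process(target=data_consumers,
                                args=(args, q, args.save_dir, i,
                                      args.search_space))
        for i in range(num_workers)
    ]
    for w in workers:
        w.start()
    for it in range(num_iterations):
        q.put({'iterate': it})  # one message per search iteration
    for _ in workers:
        q.put('done')  # one sentinel per consumer shuts it down
    for w in workers:
        w.join()
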
def model_consumer(q, gpu, save_dir, total_data_dict, model_data, dataset):
    file_name = 'log_%s_%d' % ('gpus', gpu)
    logger = setup_logger(file_name,
                          save_dir,
                          gpu,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    while True:
        msg = q.get()
        if msg == 'done':
            logger.info('thread %d ended' % gpu)
            break
        model_idx = msg['idx']
        model = model_data[model_idx]
        if dataset == 'cifar10':
            val_acc, test_acc, hash_key = model_trainer_cifar10(
                model, gpu, logger, save_dir)
            total_data_dict[model_idx] = [val_acc, test_acc, hash_key]
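
model_consumer writes its results into total_data_dict, which therefore has to be shared between processes. A minimal launch sketch, assuming a multiprocessing.Manager dict and one consumer per GPU (the names below are illustrative only):

import multiprocessing


def train_models_sketch(model_data, gpus, save_dir, dataset='cifar10'):
    manager = multiprocessing.Manager()
    total_data_dict = manager.dict()  # shared across consumer processes
    q = multiprocessing.Queue()
    workers = [
        multiprocessing.Process(target=model_consumer,
                                args=(q, gpu, save_dir, total_data_dict,
                                      model_data, dataset))
        for gpu in gpus
    ]
    for w in workers:
        w.start()
    for idx in range(len(model_data)):
        q.put({'idx': idx})  # one message per candidate architecture
    for _ in workers:
        q.put('done')
    for w in workers:
        w.join()
    return dict(total_data_dict)
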
Example #4
def main_worker(gpu, ngpus_per_node, args, distributed=True):
    args.gpu = gpu + args.gpu_base
    if args.multiprocessing_distributed:
        if args.gpu == args.gpu_base:
            file_name = 'log_%s_%d' % ('gpus', args.gpu)
            logger = setup_logger(file_name,
                                  args.save_dir,
                                  args.gpu,
                                  log_level='DEBUG',
                                  filename='%s.txt' % file_name)
        else:
            logger = DummyLogger()
    else:
        file_name = 'log_%s_%d' % ('gpus', args.gpu)
        logger = setup_logger(file_name,
                              args.save_dir,
                              args.gpu,
                              log_level='DEBUG',
                              filename='%s.txt' % file_name)
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        logger.info("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    logger.info("=> creating model '{}'".format(args.arch))
    model = CCLNas(build_model(args.arch, args.with_g_func),
                   args.input_dim,
                   args.moco_dim_fc,
                   args.moco_dim,
                   distributed=distributed,
                   train_samples=args.train_samples,
                   t=args.moco_t,
                   min_negative_size=args.min_negative_size,
                   margin=args.margin)
    logger.info(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr,
                                 betas=(0.0, 0.9),
                                 weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logger.info("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            logger.info("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.search_space == 'nasbench_101':
        train_dataset = NASBenche101Dataset(model_type='SS_CCL')
    elif args.search_space == 'nasbench_201':
        train_dataset = NASBenche201Dataset(model_type='SS_CCL')
    elif args.search_space == 'darts':
        train_dataset = DartsDataset(model_type='SS_CCL',
                                     arch_path=args.darts_arch_path)
    else:
        raise NotImplementedError(
            'This kind of nasbench has not been implemented.')

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None
    collator = BatchCollator()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               collate_fn=collator)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        center_vec = train_nested(train_loader, model, criterion, optimizer,
                                  epoch, args, logger)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_path = os.path.join(args.save_dir,
                                     'checkpoint_{:04d}.pth.tar'.format(epoch))
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'centers': center_vec
                },
                is_best=False,
                filename=save_path)
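
main_worker(gpu, ngpus_per_node, args) follows the usual one-process-per-GPU pattern, so it is presumably started through torch.multiprocessing.spawn. A launch sketch under that assumption; the world-size arithmetic mirrors the standard per-node spawn driver and is not taken from this snippet.

import torch
import torch.multiprocessing as mp


def launch_sketch(args):
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # One process per GPU; the global world size counts every process
        # on every node.
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        main_worker(args.gpu, ngpus_per_node, args)
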
def main(args):
    file_name = 'log_%s_%d' % ('gpus', args.gpu)
    logger = setup_logger(file_name,
                          args.save_dir,
                          args.gpu,
                          log_level='DEBUG',
                          filename='%s.txt' % file_name)
    logger.info(args)
    if args.search_space == 'nasbench_101':
        with open(nas_bench_101_all_data, 'rb') as fpkl:
            all_data = pickle.load(fpkl)
    else:
        raise NotImplementedError(
            f'The search space {args.search_space} is not supported yet!')

    for k in range(args.trails):
        seed = random_id_int(4)
        set_random_seed(seed)
        s_results_dict = defaultdict(list)
        k_results_dict = defaultdict(list)
        logger.info(
            f'======================  Trial {k} begins, setting seed to {seed} ==========================='
        )
        for budget in args.search_budget:
            train_data, test_data = data.dataset_split_idx(all_data, budget)
            print(
                f'budget: {budget}, train data size: {len(train_data)}, test data size: {len(test_data)}'
            )
            for epochs in args.train_iterations:
                if args.compare_supervised == 'T':
                    logger.info(
                        f'====  predictor type: SUPERVISED, load pretrain model False, '
                        f'search budget is {budget}. Training epoch is {epochs} ===='
                    )
                    spearman_corr, kendalltau_corr, duration = predictor_retrain_compare(
                        args,
                        'SS_RL',
                        train_data,
                        test_data,
                        flag=False,
                        train_epochs=epochs,
                        logger=logger)
                    if math.isnan(spearman_corr):
                        spearman_corr = 0
                    if math.isnan(kendalltau_corr):
                        kendalltau_corr = 0
                    s_results_dict[f'supervised#{budget}#{epochs}'].append(
                        spearman_corr)
                    k_results_dict[f'supervised#{budget}#{epochs}'].append(
                        kendalltau_corr)
                for predictor_type, model_dir in zip(args.predictor_list,
                                                     args.load_dir):
                    logger.info(
                        f'====  predictor type: {predictor_type}, load pretrain model True. '
                        f'Search budget is {budget}. Training epoch is {epochs}. '
                        f'The model save dir is {model_dir.split("/")[-1][:-3]}  ===='
                    )
                    spearman_corr, kendalltau_corr, duration = predictor_retrain_compare(
                        args,
                        predictor_type,
                        train_data,
                        test_data,
                        flag=True,
                        load_dir=model_dir,
                        train_epochs=epochs,
                        logger=logger)
                    if math.isnan(spearman_corr):
                        spearman_corr = 0
                    if math.isnan(kendalltau_corr):
                        kendalltau_corr = 0
                    s_results_dict[f'{predictor_type}#{budget}#{epochs}'].append(
                        spearman_corr)
                    k_results_dict[f'{predictor_type}#{budget}#{epochs}'].append(
                        kendalltau_corr)
        file_id = random_id(6)
        save_path = os.path.join(
            args.save_dir,
            f'{file_id}_{args.predictor_list[0]}_{args.search_space.split("_")[-1]}_{args.gpu}_{k}.pkl'
        )
        with open(save_path, 'wb') as fp:
            pickle.dump(s_results_dict, fp)
            pickle.dump(k_results_dict, fp)
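
Because the two result dictionaries are pickled back to back into the same file, they have to be read back in the same order. A small loading sketch:

import pickle


def load_results(path):
    with open(path, 'rb') as fp:
        s_results = pickle.load(fp)  # Spearman correlations per key
        k_results = pickle.load(fp)  # Kendall tau correlations per key
    return s_results, k_results
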
                        help='name of save directory')
    args = parser.parse_args()

    # make the save directory and its sub-directories if they do not exist yet
    save_dir = args.save_dir
    os.makedirs(save_dir, exist_ok=True)
    for sub_dir in ('model_pkl', 'results', 'pre_train_models'):
        os.makedirs(os.path.join(save_dir, sub_dir), exist_ok=True)
    # 2. build architecture training dataset
    arch_dataset = build_open_search_space_dataset(args.search_space)
    logger = setup_logger("nasbench_open_%s_cifar10" % args.search_space,
                          args.save_dir,
                          0,
                          log_level=args.log_level)
    algo_info = algo_params_open_domain(args.algorithm)
    algo_info['total_queries'] = args.budget
    starttime = time.time()
    multiprocessing.set_start_method('spawn')
    temp_k = 10
    file_name = os.path.join(
        save_dir, 'results',
        '%s_%d.pkl' % (algo_info['algo_name'], algo_info['total_queries']))

    data = build_open_algos(algo_info['algo_name'])(search_space=arch_dataset,
                                                    algo_info=algo_info,
                                                    logger=logger,
                                                    gpus=args.gpus,
                                                    save_dir=save_dir,
                                                    seed=args.seed)
    if 'random' in algo_info['algo_name']:
        results, result_keys = compute_best_test_losses(
            data, temp_k, total_queries=algo_info['total_queries'])