Example #1
        def load_state_dict(model: Network, state_dict, strict=True):
            """Copies parameters and buffers from :attr:`state_dict` into
            this module and its descendants. If :attr:`strict` is ``True`` then
            the keys of :attr:`state_dict` must exactly match the keys returned
            by this module's :func:`state_dict()` function.

            Arguments:
                state_dict (dict): A dict containing parameters and
                    persistent buffers.
                strict (bool): Strictly enforce that the keys in :attr:`state_dict`
                    match the keys returned by this module's `:func:`state_dict()`
                    function.
                    :param strict:
                    :param state_dict:
                    :param model:
            """
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name in own_state:
                    if isinstance(param, torch.nn.Parameter):
                        # backwards compatibility for serialized parameters
                        param = param.detach()
                    try:
                        own_state[name].copy_(param)
                    except Exception:
                        raise RuntimeError(
                            'While copying the parameter named {}, '
                            'whose dimensions in the model are {} and '
                            'whose dimensions in the checkpoint are {}.'.
                            format(name, own_state[name].size(), param.size()))
                elif strict:
                    raise KeyError(
                        'unexpected key "{}" in state_dict'.format(name))
            if strict:
                missing = set(own_state.keys()) - set(state_dict.keys())
                if len(missing) > 0:
                    raise KeyError(
                        'missing keys in state_dict: "{}"'.format(missing))
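        # A minimal usage sketch (hypothetical checkpoint file and keys):
        #   checkpoint = torch.load('search.pt', map_location='cpu')
        #   load_state_dict(model, checkpoint['state_dict'], strict=False)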
Example #2
def main():
    args.exp_path /= f'{args.gpu}_{time.strftime("%Y%m%d-%H%M%S")}'
    utils.create_exp_dir(Path(args.exp_path),
                         scripts_to_save=glob.glob('*.py'))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(args.exp_path / 'log.txt')
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    if args.seed is None:
        raise Exception('a random seed must be specified.')
    np.random.seed(args.seed)
    cudnn.benchmark = True
    cudnn.enabled = True
    torch.manual_seed(args.seed)

    # ================================================
    # total, used = os.popen(
    #     'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
    # ).read().split('\n')[args.gpu].split(',')
    # total = int(total)
    # used = int(used)

    # print('Total GPU mem:', total, 'used:', used)

    # try:
    #     block_mem = 0.85 * (total - used)
    #     print(block_mem)
    #     x = torch.empty((256, 1024, int(block_mem))).cuda()
    #     del x
    # except RuntimeError as err:
    #     print(err)
    #     block_mem = 0.8 * (total - used)
    #     print(block_mem)
    #     x = torch.empty((256, 1024, int(block_mem))).cuda()
    #     del x
    #
    #
    # print('reuse mem now ...')
    # ================================================

    logging.info(f'GPU device = {args.gpu}')
    logging.info(f'args = {args}')

    criterion = nn.CrossEntropyLoss().to(device)

    setting = args.location

    model = Network(args.init_ch, 10, args.layers, criterion, setting)
    checkpoint = None
    previous_epochs = 0
    if args.checkpoint_path:
        checkpoint = torch.load(args.checkpoint_path)
        utils.load(model, checkpoint['state_dict'], False)
        previous_epochs = checkpoint['epoch']
        args.epochs -= previous_epochs
        if args.epochs <= 0:
            raise Exception('args.epochs is too small.')

    if use_DataParallel:
        print('use Data Parallel')
        model = nn.parallel.DataParallel(model)
        model = model.cuda()
        module = model.module
        torch.cuda.manual_seed_all(args.seed)
    else:
        model = model.to(device)
        module = model

    param_size = utils.count_parameters_in_MB(model)
    logging.info(f'param size = {param_size}MB')

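    # Split the parameters into two groups: architecture/attention parameters
    # (presumably updated by the Arch helper's own optimizer) and ordinary
    # network weights (updated by the SGD optimizer below).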
    arch_and_attn_params = list(
        map(
            id,
            module.arch_and_attn_parameters()
            if use_DataParallel else model.arch_and_attn_parameters()))
    weight_params = filter(
        lambda p: id(p) not in arch_and_attn_params,
        module.parameters() if use_DataParallel else model.parameters())

    optimizer = optim.SGD(weight_params,
                          args.lr,
                          momentum=args.momentum,
                          weight_decay=args.wd)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data,
                              train=True,
                              download=True,
                              transform=train_transform)

    num_train = len(train_data)  # 50000
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))  # 25000

    train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batchsz,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=8)  # from 2

    valid_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batchsz,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]),
        pin_memory=True,
        num_workers=8)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     args.epochs,
                                                     eta_min=args.lr_min)
    if checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler'])

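    # Arch presumably wraps the architecture parameters and their optimizer
    # (DARTS-style bilevel search step); its optimizer state is restored below
    # when resuming from a checkpoint.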
    arch = Arch(model, criterion, args)
    if checkpoint:
        arch.optimizer.load_state_dict(checkpoint['arch_optimizer'])

    for epoch in tqdm(range(args.epochs), desc='Total Progress'):
        scheduler.step()
        lr = scheduler.get_lr()[0]

        logging.info(f'\nEpoch: {epoch} lr: {lr}')
        gen = module.genotype()
        logging.info(f'Genotype: {gen}')

        print(F.softmax(module.alphas_normal, dim=-1))
        print(F.softmax(module.alphas_reduce, dim=-1))
        if module.betas_normal is not None:
            print(F.softmax(module.betas_normal, dim=-1))
            print(F.softmax(module.betas_reduce, dim=-1))
        if module.gammas_normal is not None:
            print(F.softmax(module.gammas_normal, dim=-1))
            print(F.softmax(module.gammas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model, arch,
                                     criterion, optimizer, lr, epoch + 1)
        logging.info(f'train acc: {train_acc}')

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion, epoch + 1)
        logging.info(f'valid acc: {valid_acc}')

        utils.save(model, args.exp_path / 'search.pt')
        utils.save_checkpoint(
            {
                'epoch': epoch + 1 + previous_epochs,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'arch_optimizer': arch.optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }, False, args.exp_path)

        gen = module.genotype()
        gen_path = args.exp_path / 'genotype.json'
        utils.save_genotype(gen, gen_path)

        logging.info(f'Result genotype: {gen}')
Example #3
arch_optimizer = torch.optim.Adam(model.arch_parameters(),
                                  lr=args.arch_learning_rate,
                                  betas=(0.5, 0.999),
                                  weight_decay=args.arch_weight_decay)

# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast weights to other workers.
if resume_from_epoch > 0 and hvd.rank() == 0:
    filepath = args.checkpoint_format.format(exp=args.save,
                                             epoch=resume_from_epoch)
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, float(args.epochs), eta_min=args.learning_rate_min)

architect = Architect(model, args)

# model_path = "./search-EXP-final/weights.pt"
# model.load_state_dict(torch.load(model_path))

start_time = time.time()
if hvd.rank() == 0:
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
for epoch in range(resume_from_epoch, args.epochs):
Example #4
def main():
    start = time.time()
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    CLASSES = 1000
    channels = SEARCH_SPACE['channel_size']
    strides = SEARCH_SPACE['strides']

    # Model
    model = Network(channels, strides, CLASSES)
    model = model.to(device)
    model = nn.DataParallel(model, device_ids=config.gpus)
    logger.info("param size = %fMB", utils.count_parameters_in_MB(model))
    config.world_size = 0

    if config.target_hardware is None:
        config.ref_value = None
    else:
        config.ref_value = ref_values[config.target_hardware][
            '%.2f' % config.width_mult]

    # Loss
    criterion = LatencyLoss(config, channels, strides).cuda(config.gpus)
    normal_criterion = nn.CrossEntropyLoss()

    alpha_weight = model.module.arch_parameters()
    # weight = [param for param in model.parameters() if not utils.check_tensor_in_list(param, alpha_weight)]
    weight = model.module.weight_parameters()
    # Optimizer
    w_optimizer = torch.optim.SGD(weight,
                                  config.w_lr,
                                  momentum=config.w_momentum,
                                  weight_decay=config.w_weight_decay)

    alpha_optimizer = torch.optim.Adam(alpha_weight,
                                       lr=config.alpha_lr,
                                       betas=(config.arch_adam_beta1,
                                              config.arch_adam_beta2),
                                       eps=config.arch_adam_eps,
                                       weight_decay=config.alpha_weight_decay)

    train_data = get_imagenet_torch(
        type='train',
        # image_dir="/googol/atlas/public/cv/ILSVRC/Data/"
        # use soft link `mkdir ./data/imagenet && ln -s /googol/atlas/public/cv/ILSVRC/Data/CLS-LOC/* ./data/imagenet/`
        image_dir=config.data_path + "/" + config.dataset.lower(),
        batch_size=config.batch_size,
        num_threads=config.workers,
        world_size=config.world_size,
        crop=224,
        device_id=0,
        num_gpus=len(config.gpus),
        portion=config.train_portion)

    valid_data = get_imagenet_torch(
        type='val',
        # image_dir="/googol/atlas/public/cv/ILSVRC/Data/"
        # use soft link `mkdir ./data/imagenet && ln -s /googol/atlas/public/cv/ILSVRC/Data/CLS-LOC/* ./data/imagenet/`
        image_dir=config.data_path + "/" + config.dataset.lower(),
        batch_size=config.batch_size,
        num_threads=config.workers,
        world_size=config.world_size,
        crop=224,
        device_id=0,
        num_gpus=len(config.gpus),
        portion=config.val_portion)

    best_top1 = 0.
    best_genotype = list()
    lr = 0

    config.start_epoch = -1
    config.warmup_epoch = 0
    config.warmup = True
    ### Resume from warmup model or trained model ###
    if config.resume:
        try:
            model_path = config.path + '/checkpoint.pth.tar'
            model, w_optimizer, alpha_optimizer = load_model(
                model,
                model_fname=model_path,
                optimizer=w_optimizer,
                arch_optimizer=alpha_optimizer)
        except Exception:
            warmup_path = config.path + '/warmup.pth.tar'
            if os.path.exists(warmup_path):
                print('load warmup weights')
                model, w_optimizer, alpha_optimizer = load_model(
                    model,
                    model_fname=warmup_path,
                    optimizer=w_optimizer,
                    arch_optimizer=alpha_optimizer)
            else:
                print('failed to load model weights')

    w_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optimizer, float(config.epochs), eta_min=config.w_lr_min)

    if config.start_epoch < 0 and config.warmup:
        for epoch in range(config.warmup_epoch, config.warmup_epochs):
            # warmup
            train_top1, train_loss = warm_up(train_data, valid_data, model,
                                             normal_criterion, criterion,
                                             w_optimizer, epoch, writer)
            config.start_epoch = epoch

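    # get_update_schedule_grad presumably decides at which iterations within
    # each epoch the architecture parameters are updated.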
    update_schedule = utils.get_update_schedule_grad(len(train_data), config)
    for epoch in range(config.start_epoch + 1, config.epochs):
        if epoch > config.warmup_epochs:
            w_scheduler.step()
            lr = w_scheduler.get_lr()[0]
            logger.info('epoch %d lr %e', epoch, lr)
        # training
        train_top1, train_loss = train(train_data, valid_data, model,
                                       normal_criterion, criterion,
                                       w_optimizer, alpha_optimizer, lr, epoch,
                                       writer, update_schedule)
        logger.info('Train top1 %f', train_top1)

        # validation
        top1 = train_top1
        if epoch % 10 == 0:
            top1, loss = infer(valid_data, model, epoch, criterion,
                               normal_criterion, writer)
            logger.info('valid top1 %f', top1)

        genotype = model.module.genotype()
        logger.info("genotype = {}".format(genotype))

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        save_model(model, {
            'warmup': False,
            'epoch': epoch,
            'w_optimizer': w_optimizer.state_dict(),
            'alpha_optimizer': alpha_optimizer.state_dict(),
            'state_dict': model.state_dict()
        },
                   is_best=is_best)

    utils.time(time.time() - start)
    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))
Example #5
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
    model = model.cuda()

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
    train_transform, valid_transform = utils._data_transforms_cifar10(args)

    train_data = dset.CIFAR10(root=args.data,
                              train=True,
                              download=True,
                              transform=train_transform)

    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=2)

    valid_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True,
        num_workers=2)
    test_data = dset.CIFAR10(root=args.data,
                             train=False,
                             download=True,
                             transform=valid_transform)
    test_queue = torch.utils.data.DataLoader(test_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=2)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)
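    # BinOp (from bin_utils_search) presumably handles weight binarization
    # during the search; it is passed to train/infer below.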
    bin_op = bin_utils_search.BinOp(model, args)
    best_acc = 0.
    best_genotypes = []
    for epoch in range(args.epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        genotype = model.genotype()
        genotype_img = model.genotype(args.gamma)
        logging.info('genotype = %s', genotype)
        logging.info(F.softmax(model.alphas_normal, dim=-1))
        logging.info(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model,
                                     architect, criterion, optimizer, lr,
                                     bin_op, epoch)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion, bin_op)
        logging.info('valid_acc %f', valid_acc)
        if best_acc < valid_acc:
            best_acc = valid_acc
            if len(best_genotypes) > 0:
                best_genotypes[0] = genotype
                best_genotypes[1] = genotype_img
            else:
                best_genotypes.append(genotype)
                best_genotypes.append(genotype_img)
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'arch_param': model.arch_parameters(),
                'val_acc': valid_acc,
                'optimizer': optimizer.state_dict(),
            }, False, args.save)

    with open('./genotypes.py', 'a') as f:
        f.write(args.geno_name + ' = ' + str(best_genotypes[0]) + '\n')
        f.write(args.geno_name + '_img' + ' = ' + str(best_genotypes[1]) +
                '\n')
Example #6
#scheduler.step()
lr = scheduler.get_lr()[0]

# STAGE 1
start = time.time()
for epoch in range(args.epochs):
    ## Training the whole population
    logging.info("[INFO] Generation {} training with learning rate {}".format(
        epoch + 1,
        scheduler.get_lr()[0]))
    start_time = time.time()

    train(model, train_queue, criterion, optimizer, epoch + 1)
    logging.info("[INFO] Training finished in {} minutes".format(
        (time.time() - start_time) / 60))
    torch.save(model.state_dict(), "model.pt")
    #lr = scheduler.get_lr()[0]
    scheduler.step()

    logging.info("[INFO] Evaluating Generation {} ".format(epoch + 1))
    validation(model, valid_queue, criterion, epoch + 1)
    population.pop_sort()

    for i, p in enumerate(population.get_population()):
        writer.add_scalar("pop_top1_{}".format(i + 1), p.get_fitness(),
                          epoch + 1)
        writer.add_scalar("pop_top5_{}".format(i + 1), p.top5.avg, epoch + 1)
        writer.add_scalar("pop_obj_valid_{}".format(i + 1), p.objs.avg,
                          epoch + 1)

    with open(os.path.join(DIR, "population_{}.pickle".format(epoch + 1)),
Example #7
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    start_epoch = 1

    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
    # utils.print_model_param_nums(model)
    model = model.cuda()
    model = nn.DataParallel(model)

    logger = Logger('./logs')
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    if args.set == 'cifar100':
        train_data = dset.CIFAR100(root=args.data,
                                   train=True,
                                   download=True,
                                   transform=train_transform)
        valid_data = dset.CIFAR100(root=args.data,
                                   train=False,
                                   download=True,
                                   transform=valid_transform)
    else:
        train_data = dset.CIFAR10(root=args.data,
                                  train=True,
                                  download=True,
                                  transform=train_transform)
        valid_data = dset.CIFAR10(root=args.data,
                                  train=False,
                                  download=True,
                                  transform=valid_transform)

    # num_train = len(train_data)
    # indices = list(range(num_train))
    # split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=2)

    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,0.9)

    # architect = Architect(model, args)

    for epoch in range(args.epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        # genotype = model.genotype()
        # logging.info('genotype = %s', genotype)

        #print(F.softmax(model.alphas_normal, dim=-1))
        #print(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     logger)
        logging.info('train_acc %f', train_acc)

        # validation
        if args.epochs - epoch <= 1:
            valid_acc, valid_obj = infer(valid_queue, model, criterion, epoch,
                                         logger)
            logging.info('valid_acc %f', valid_acc)

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(checkpoint, PATH)

        utils.save(model, os.path.join(args.save, 'weights.pt'))
Example #8
def train():

    use_gpu = cfg.MODEL.DEVICE == "cuda"
    # 1. make dataloader
    train_loader, val_loader, test_loader, num_query, num_class = darts_make_data_loader(
        cfg)
    # print(num_query, num_class)

    # 2. make model
    model = Network(num_class, cfg)
    # tensor = torch.randn(2, 3, 256, 128)
    # res = model(tensor)
    # print(res[0].size()) [2, 751]

    # 3. make optimizer
    optimizer = make_optimizer(cfg, model)
    arch_optimizer = torch.optim.Adam(
        model._arch_parameters(),
        lr=cfg.SOLVER.ARCH_LR,
        betas=(0.5, 0.999),
        weight_decay=cfg.SOLVER.ARCH_WEIGHT_DECAY)

    # 4. make lr scheduler
    lr_scheduler = make_lr_scheduler(cfg, optimizer)
    # make lr scheduler
    arch_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        arch_optimizer, [80, 160], 0.1)

    # 5. make loss
    loss_fn = darts_make_loss(cfg)

    # model._set_loss(loss_fn, compute_loss_acc)

    # 6. make architect
    # architect = Architect(model, cfg)

    # get parameters
    device = cfg.MODEL.DEVICE
    use_gpu = device == "cuda"
    pretrained = cfg.MODEL.PRETRAINED != ""

    log_period = cfg.OUTPUT.LOG_PERIOD
    ckpt_period = cfg.OUTPUT.CKPT_PERIOD
    eval_period = cfg.OUTPUT.EVAL_PERIOD
    output_dir = cfg.OUTPUT.DIRS
    ckpt_save_path = output_dir + cfg.OUTPUT.CKPT_DIRS

    epochs = cfg.SOLVER.MAX_EPOCHS
    batch_size = cfg.SOLVER.BATCH_SIZE
    grad_clip = cfg.SOLVER.GRAD_CLIP

    batch_num = len(train_loader)
    log_iters = batch_num // log_period

    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)

    # create *_result.xlsx
    # save the result for analyze
    name = (cfg.OUTPUT.LOG_NAME).split(".")[0] + ".xlsx"
    result_path = cfg.OUTPUT.DIRS + name

    wb = xl.Workbook()
    sheet = wb.worksheets[0]
    titles = [
        'size/M', 'speed/ms', 'final_planes', 'acc', 'mAP', 'r1', 'r5', 'r10',
        'loss', 'acc', 'mAP', 'r1', 'r5', 'r10', 'loss', 'acc', 'mAP', 'r1',
        'r5', 'r10', 'loss'
    ]
    sheet.append(titles)
    check_epochs = [40, 80, 120, 160, 200, 240, 280, 320, 360, epochs]
    values = []

    logger = logging.getLogger("CSNet_Search.train")
    size = count_parameters(model)
    values.append(format(size, '.2f'))
    values.append(model.final_planes)

    logger.info("the param number of the model is {:.2f} M".format(size))

    logger.info("Starting Search CDNetwork")

    best_mAP, best_r1 = 0., 0.
    is_best = False
    avg_loss, avg_acc = RunningAverageMeter(), RunningAverageMeter()
    avg_time, global_avg_time = AverageMeter(), AverageMeter()

    if use_gpu:
        model = model.to(device)

    if pretrained:
        logger.info("load self pretrained chekpoint to init")
        model.load_pretrained_model(cfg.MODEL.PRETRAINED)
    else:
        logger.info("use kaiming init to init the model")
        model.kaiming_init_()
    # exit(1)
    for epoch in range(epochs):
        model.set_tau(cfg.MODEL.TAU_MAX -
                      (cfg.MODEL.TAU_MAX - cfg.MODEL.TAU_MIN) * epoch /
                      (epochs - 1))
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]
        # architect lr.step
        arch_lr_scheduler.step()

        # if epoch k was saved, resume training from epoch k+1
        if pretrained and epoch < model.start_epoch:
            continue

        # print(epoch)
        # exit(1)
        model.train()
        avg_loss.reset()
        avg_acc.reset()
        avg_time.reset()

        for i, batch in enumerate(train_loader):

            t0 = time.time()
            imgs, labels = batch
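            # Draw one validation batch per training step for the architecture
            # update; next(iter(...)) builds a fresh iterator each time.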
            val_imgs, val_labels = next(iter(val_loader))

            if use_gpu:
                imgs = imgs.to(device)
                labels = labels.to(device)
                val_imgs = val_imgs.to(device)
                val_labels = val_labels.to(device)

            # 1、 update the weights
            optimizer.zero_grad()
            res = model(imgs)

            # loss = loss_fn(scores, feats, labels)
            loss, acc = compute_loss_acc(res, labels, loss_fn)
            loss.backward()

            if grad_clip != 0:
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            optimizer.step()

            # 2、update the alpha
            arch_optimizer.zero_grad()
            res = model(val_imgs)

            val_loss, val_acc = compute_loss_acc(res, val_labels, loss_fn)
            val_loss.backward()
            arch_optimizer.step()

            # compute the acc
            # acc = (scores.max(1)[1] == labels).float().mean()

            t1 = time.time()
            avg_time.update((t1 - t0) / batch_size)
            avg_loss.update(loss)
            avg_acc.update(acc)

            # log info
            if (i + 1) % log_iters == 0:
                logger.info(
                    "epoch {}: {}/{} with loss is {:.5f} and acc is {:.3f}".
                    format(epoch + 1, i + 1, batch_num, avg_loss.avg,
                           avg_acc.avg))

        logger.info(
            "end epochs {}/{} with lr: {:.5f} and avg_time is: {:.3f} ms".
            format(epoch + 1, epochs, lr, avg_time.avg * 1000))
        global_avg_time.update(avg_time.avg)

        # test the model
        if (epoch + 1) % eval_period == 0 or (epoch + 1) in check_epochs:

            model.eval()
            metrics = R1_mAP(num_query, use_gpu=use_gpu)

            with torch.no_grad():
                for vi, batch in enumerate(test_loader):
                    # break
                    # print(len(batch))
                    imgs, labels, camids = batch
                    if use_gpu:
                        imgs = imgs.to(device)

                    feats = model(imgs)
                    metrics.update((feats, labels, camids))

                # compute CMC and mAP
                cmc, mAP = metrics.compute()
                logger.info("validation results at epoch {}".format(epoch + 1))
                logger.info("mAP:{:2%}".format(mAP))
                for r in [1, 5, 10]:
                    logger.info("CMC curve, Rank-{:<3}:{:.2%}".format(
                        r, cmc[r - 1]))

                # determine whether current model is the best
                if mAP > best_mAP:
                    is_best = True
                    best_mAP = mAP
                    logger.info("Get a new best mAP")
                if cmc[0] > best_r1:
                    is_best = True
                    best_r1 = cmc[0]
                    logger.info("Get a new best r1")

                # add the result to sheet
                if (epoch + 1) in check_epochs:
                    val = [avg_acc.avg, mAP, cmc[0], cmc[4], cmc[9]]
                    change = [format(v * 100, '.2f') for v in val]
                    change.append(format(avg_loss.avg, '.3f'))
                    values.extend(change)

        # whether to save the model
        if (epoch + 1) % ckpt_period == 0 or is_best:
            torch.save(model.state_dict(),
                       ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            model._parse_genotype(file=ckpt_save_path +
                                  "genotype_{}.json".format(epoch + 1))
            logger.info("checkpoint {} was saved".format(epoch + 1))

            if is_best:
                torch.save(model.state_dict(),
                           ckpt_save_path + "best_ckpt.pth")
                model._parse_genotype(file=ckpt_save_path +
                                      "best_genotype.json")
                logger.info("best_checkpoint was saved")
                is_best = False
        # exit(1)

    values.insert(1, format(global_avg_time.avg * 1000, '.2f'))
    sheet.append(values)
    wb.save(result_path)

    logger.info("Ending Search GDAS_Search")
Example #9
#### MAIN function
val_acc_top5 = []
val_acc_top1 = []
for epoch in range(opt.epochs):
    np.random.seed(2)
    torch.cuda.manual_seed(2)
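    # Note: the snippet re-seeds NumPy and CUDA with a fixed value at every epoch.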
    # training
    train_acc_top1, train_acc_top5, train_valoss, train_poloss = train(
        train_queue, valid_queue, model, criterion, optimizer_arch,
        optimizer_model, opt.arch_learning_rate, opt.lr_model)

    # validation
    valid_acc_top1, valid_acc_top5, valid_valoss = infer(valid_queue, model, criterion)

    f.write("%5.5f  "% train_acc_top1)
    f.write("%5.5f  "% train_acc_top5)
    f.write("%5.5f  "% train_valoss)
    f.write("%5.5f  "% train_poloss ) 
    f.write("%5.5f  "% valid_acc_top1 ) 
    f.write("%5.5f  "% valid_acc_top5 ) 
    f.write("%5.5f  "% valid_valoss ) 
    f.write("\n")


    print("epoch : " , epoch , "Train_Acc_Top1 : " , train_acc_top1 , "Train_value_loss : ",train_valoss,"Train_policy : " , train_poloss )
    print("epoch : " , epoch, "Val_Acc_Top1 : " , valid_acc_top1 , "Val_Acc_Top5 : " , valid_acc_top5,"Loss : " , valid_valoss)
    torch.save(model.state_dict(),'weights.pt')
f.close()