Example #1
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    set_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        set_seed(args.local_rank)
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    if args.local_rank == 0:
        logging.info("args = {}".format(args))
        logging.info("unparsed_args = {}".format(unparsed))
        logging.info("distributed = {}".format(args.distributed))
        logging.info("opt_level = {}".format(args.opt_level))
        logging.info("keep_batchnorm_fp32 = {}".format(
            args.keep_batchnorm_fp32))
        logging.info("loss_scale = {}".format(args.loss_scale))
        logging.info("CUDNN VERSION: {}".format(
            torch.backends.cudnn.version()))

    # create model
    if args.local_rank == 0:
        logging.info('parsing the architecture')
    if args.model_path and os.path.isfile(args.model_path):
        op_weights, depth_weights = get_op_and_depth_weights(args.model_path)
        parsed_arch = parse_architecture(op_weights, depth_weights)
        mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, parsed_arch, mc_num_dddict, None,
                        args.dropout_rate, args.drop_connect_rate)
    elif args.config_path and os.path.isfile(args.config_path):
        with open(args.config_path, 'r') as f:
            model_config = json.load(f)
        model = NetworkCfg(args.num_classes, model_config, None,
                           args.dropout_rate, args.drop_connect_rate)
    else:
        raise Exception('neither --model_path nor --config_path points to a valid file')
    if args.sync_bn:
        if args.local_rank == 0:
            logging.info("using apex synced BN")
        model = parallel.convert_syncbn_model(model)
    model = (model.cuda().to(memory_format=memory_format)
             if memory_format is not None else model.cuda())
    config = model.config
    if args.local_rank == 0:
        with open(os.path.join(args.save, 'model.config'), 'w') as f:
            json.dump(config, f, indent=4)
        # logging.info(config)
        logging.info("param size = %fMB", count_parameters_in_MB(model))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(args.num_classes,
                                               args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # Initialize Amp
    if args.opt_level is not None:
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=args.opt_level,
            keep_batchnorm_fp32=args.keep_batchnorm_fp32,
            loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)
    else:
        model = nn.DataParallel(model)

    # define transform and initialize dataloader
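    # each process takes an equal share of the global batch size and workers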
    batch_size = args.batch_size // args.world_size
    workers = args.workers // args.world_size
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(
            brightness=0.4,
            contrast=0.4,
            saturation=0.4,
            hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    train_dataset = ImageList(root=args.train_root,
                              list_path=args.train_list,
                              transform=train_transform)
    val_dataset = ImageList(root=args.val_root,
                            list_path=args.val_list,
                            transform=val_transform)
    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               shuffle=(train_sampler is None))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             num_workers=workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             shuffle=False)

    # define learning rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))
    best_acc_top1 = 0
    best_acc_top5 = 0
    start_epoch = 0

    # restart from snapshot
    if args.snapshot and os.path.isfile(args.snapshot):
        if args.local_rank == 0:
            logging.info('loading snapshot from {}'.format(args.snapshot))
        checkpoint = torch.load(
            args.snapshot,
            map_location=lambda storage, loc: storage.cuda(args.gpu))
        start_epoch = checkpoint['epoch']
        best_acc_top1 = checkpoint['best_acc_top1']
        best_acc_top5 = checkpoint['best_acc_top5']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.opt_level is not None:
            amp.load_state_dict(checkpoint['amp'])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs), last_epoch=0)
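        # replay the LR schedule up to the snapshot epoch so training resumes
        # with the correct learning rate (warmup included)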
        for epoch in range(start_epoch):
            current_lr = scheduler.get_lr()[0]
            if args.local_rank == 0:
                logging.info('Epoch: %d lr %e', epoch, current_lr)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr * (epoch + 1) / 5.0
                if args.local_rank == 0:
                    logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                                 current_lr * (epoch + 1) / 5.0)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
            scheduler.step()

    # the main loop
    for epoch in range(start_epoch, args.epochs):
        current_lr = scheduler.get_lr()[0]
        if args.local_rank == 0:
            logging.info('Epoch: %d lr %e', epoch, current_lr)
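        # linear LR warmup over the first 5 epochs for large-batch runs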
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            if args.local_rank == 0:
                logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                             current_lr * (epoch + 1) / 5.0)

        if args.distributed:
            train_sampler.set_epoch(epoch)

        epoch_start = time.time()
        train_acc, train_obj = train(train_loader, model, criterion_smooth,
                                     optimizer)
        if args.local_rank == 0:
            logging.info('Train_acc: %f', train_acc)

        val_acc_top1, val_acc_top5, val_obj = validate(val_loader, model,
                                                       criterion)
        if args.local_rank == 0:
            logging.info('Val_acc_top1: %f', val_acc_top1)
            logging.info('Val_acc_top5: %f', val_acc_top5)
            logging.info('Epoch time: %ds.', time.time() - epoch_start)

        if args.local_rank == 0:
            is_best = False
            if val_acc_top1 > best_acc_top1:
                best_acc_top1 = val_acc_top1
                best_acc_top5 = val_acc_top5
                is_best = True
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'best_acc_top5': best_acc_top5,
                    'optimizer': optimizer.state_dict(),
                    'amp': amp.state_dict() if args.opt_level is not None else None,
                }, is_best, args.save)

        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr

        scheduler.step()
Example #2
    logging.getLogger().addHandler(fh)

    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    cudnn.benchmark = True
    cudnn.enabled = True

    SearchSpace = importlib.import_module('models.search_space_' +
                                          config.net_type).Network
    super_model = SearchSpace(config.optim.init_dim, config.data.dataset,
                              config)

    super_model.eval()
    logging.info("Params = %fMB" % utils.count_parameters_in_MB(super_model))

    if args.device == 'gpu':
        super_model = super_model.cuda()

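    # profile every candidate op in the supernet; with cost_type='latency'
    # the returned costs are in ms (meas_times measurement runs, averaging assumed)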
    latency_list, total_latency = super_model.get_cost_list(
        args.input_size,
        cost_type='latency',
        use_gpu=(args.device == 'gpu'),
        meas_times=args.meas_times)

    logging.info('latency_list:\n' + str(latency_list))
    logging.info('total latency: ' + str(total_latency) + 'ms')

    with open(os.path.join(args.save, args.list_name), 'w') as f:
        f.write(str(latency_list))
Example #3
    model.eval()
    if hasattr(model, 'net_config'):
        logging.info("Network Structure: \n" +
                     '|\n'.join(map(str, model.net_config)))
    if args.meas_lat:
        # batch 1, 2000 measurement runs (argument meaning inferred from the log line)
        latency_cpu = utils.latency_measure(model, (3, 224, 224), 1, 2000, mode='cpu')
        logging.info('latency_cpu (batch 1): %.2fms' % latency_cpu)
        # batch 32, 5000 measurement runs
        latency_gpu = utils.latency_measure(model, (3, 224, 224), 32, 5000, mode='gpu')
        logging.info('latency_gpu (batch 32): %.2fms' % latency_gpu)
    params = utils.count_parameters_in_MB(model)
    logging.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model, input_size=config.data.input_size)
    logging.info("Mult-Adds = %.2fMB" % mult_adds)

    model = nn.DataParallel(model)

    # whether to resume from a checkpoint
    if config.optim.if_resume:
        utils.load_model(model, config.optim.resume.load_path)
        start_epoch = config.optim.resume.load_epoch + 1
    else:
        start_epoch = 0

    model = model.cuda()
Example #4
    parsed_arch = parse_architecture(op_weights, depth_weights)
    with open(args.lookup_path, 'rb') as f:
        lat_lookup = pickle.load(f)
    mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
    mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
    model = Network(1000, parsed_arch, mc_num_dddict, lat_lookup, 0.0, 0.0)
    model = model.cuda()

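    # dummy ImageNet-sized input for the lookup-table latency query below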
    x = torch.randn((1, 3, 224, 224))
    x = x.cuda()

    config = model.config
    with open(args.save_path, 'w') as f:
        json.dump(config, f, indent=4)

    params = count_parameters_in_MB(model)
    print('Params:  \t{:.4f}MB'.format(params))

    flops = calculate_FLOPs_in_M(model, (1, 3, 224, 224))
    print('FLOPs:  \t{:.4f}M'.format(flops))

    if args.print_lat:
        # latency in lookup table
        lat_lut = model.get_lookup_latency(x)
        print('Lat_LUT:\t{:.4f}ms'.format(lat_lut))

        lat_gpu = measure_latency_in_ms(model, (32, 3, 224, 224), is_cuda=True)
        print('Lat_GPU bs=32:\t{:.4f}ms'.format(lat_gpu))

        lat_gpu = measure_latency_in_ms(model, (1, 3, 224, 224), is_cuda=True)
        print('Lat_GPU bs=1:\t{:.4f}ms'.format(lat_gpu))
Example #5
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    set_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True
    logging.info("args = %s", args)
    logging.info("unparsed_args = %s", unparsed)

    # create model
    logging.info('parsing the architecture')
    if args.model_path and os.path.isfile(args.model_path):
        op_weights, depth_weights = get_op_and_depth_weights(args.model_path)
        parsed_arch = parse_architecture(op_weights, depth_weights)
        mc_mask_dddict = torch.load(args.model_path)['mc_mask_dddict']
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, parsed_arch, mc_num_dddict, None,
                        args.dropout_rate, args.drop_connect_rate)
    elif args.config_path and os.path.isfile(args.config_path):
        with open(args.config_path, 'r') as f:
            model_config = json.load(f)
        model = NetworkCfg(args.num_classes, model_config, None,
                           args.dropout_rate, args.drop_connect_rate)
    else:
        raise Exception('neither --model_path nor --config_path points to a valid file')
    model = nn.DataParallel(model).cuda()
    config = model.module.config
    with open(os.path.join(args.save, 'model.config'), 'w') as f:
        json.dump(config, f, indent=4)
    # logging.info(config)
    logging.info("param size = %fMB", count_parameters_in_MB(model))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(args.num_classes,
                                               args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # define transform and initialize dataloader
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(
            brightness=0.4,
            contrast=0.4,
            saturation=0.4,
            hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    train_queue = torch.utils.data.DataLoader(ImageList(
        root=args.train_root,
        list_path=args.train_list,
        transform=train_transform,
    ),
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=args.workers)
    val_queue = torch.utils.data.DataLoader(ImageList(
        root=args.val_root,
        list_path=args.val_list,
        transform=val_transform,
    ),
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            pin_memory=True,
                                            num_workers=args.workers)

    # define learning rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))
    best_acc_top1 = 0
    best_acc_top5 = 0
    start_epoch = 0

    # restart from snapshot
    if args.snapshot and os.path.isfile(args.snapshot):
        logging.info('loading snapshot from {}'.format(args.snapshot))
        checkpoint = torch.load(args.snapshot)
        start_epoch = checkpoint['epoch']
        best_acc_top1 = checkpoint['best_acc_top1']
        best_acc_top5 = checkpoint['best_acc_top5']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs), last_epoch=0)
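        # fast-forward the LR schedule to the snapshot epoch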
        for epoch in range(start_epoch):
            current_lr = scheduler.get_lr()[0]
            logging.info('Epoch: %d lr %e', epoch, current_lr)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr * (epoch + 1) / 5.0
                logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                             current_lr * (epoch + 1) / 5.0)
            if epoch < 5 and args.batch_size > 256:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
            scheduler.step()

    # the main loop
    for epoch in range(start_epoch, args.epochs):
        current_lr = scheduler.get_lr()[0]
        logging.info('Epoch: %d lr %e', epoch, current_lr)
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                         current_lr * (epoch + 1) / 5.0)

        epoch_start = time.time()
        train_acc, train_obj = train(train_queue, model, criterion_smooth,
                                     optimizer)
        logging.info('Train_acc: %f', train_acc)

        val_acc_top1, val_acc_top5, val_obj = validate(val_queue, model,
                                                       criterion)
        logging.info('Val_acc_top1: %f', val_acc_top1)
        logging.info('Val_acc_top5: %f', val_acc_top5)
        logging.info('Epoch time: %ds.', time.time() - epoch_start)

        is_best = False
        if val_acc_top1 > best_acc_top1:
            best_acc_top1 = val_acc_top1
            best_acc_top5 = val_acc_top5
            is_best = True
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc_top1': best_acc_top1,
                'best_acc_top5': best_acc_top5,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.save)

        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr

        scheduler.step()
Example #6
    cudnn.enabled = True
    utils.set_seed(config.data.seed)

    logging.info("args = %s", args)
    logging.info('Training with config:')
    logging.info(pprint.pformat(config))

    config.net_config, net_type = utils.load_net_config(
        os.path.join(args.load_path, 'net_config'))

    derivedNetwork = getattr(model_derived, '%s_Net' % net_type.upper())
    model = derivedNetwork(config.net_config, config=config, num_classes=1000)

    logging.info("Network Structure: \n" +
                 '\n'.join(map(str, model.net_config)))
    logging.info("Params = %.2fMB" % utils.count_parameters_in_MB(model))
    logging.info("Mult-Adds = %.2fMB" %
                 comp_multadds(model, input_size=config.data.input_size))

    model = model.cuda()
    model = nn.DataParallel(model)
    checkpoint = torch.load(os.path.join(args.load_path, 'weight.pt'),
                            map_location="cpu")  # weight checkpoint
    model.load_state_dict(checkpoint['state_dict'], strict=False)

    imagenet = imagenet_data.ImageNet12(
        trainFolder=os.path.join(args.data_path, 'train'),
        testFolder=os.path.join(args.data_path, 'val'),
        num_workers=config.data.num_workers,
        data_config=config.data)
    valid_queue = imagenet.getTestLoader(config.data.batch_size)
Example #7
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True
    logging.info("args = %s", args)

    with open(args.lookup_path, 'rb') as f:
        lat_lookup = pickle.load(f)

    mc_maxnum_dddict = get_mc_num_dddict(mc_mask_dddict, is_max=True)
    model = Network(args.num_classes, mc_maxnum_dddict, lat_lookup)
    model = torch.nn.DataParallel(model).cuda()
    logging.info("param size = %fMB", count_parameters_in_MB(model))

    # save initial model
    model_path = os.path.join(args.save, 'searched_model_00.pth.tar')
    torch.save(
        {
            'state_dict': model.state_dict(),
            'mc_mask_dddict': mc_mask_dddict,
        }, model_path)

    # precompute the cosine LR for every epoch with a throwaway optimizer and
    # scheduler, since the supernet is rebuilt from scratch each epoch
    lr_list = []
    optimizer_w = torch.optim.SGD(model.module.weight_parameters(),
                                  lr=args.w_lr,
                                  momentum=args.w_mom,
                                  weight_decay=args.w_wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer_w, float(args.epochs))
    for _ in range(args.epochs):
        lr = scheduler.get_lr()[0]
        lr_list.append(lr)
        scheduler.step()
    del model
    del optimizer_w
    del scheduler

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

    train_queue = torch.utils.data.DataLoader(ImageList(
        root=args.img_root,
        list_path=args.train_list,
        transform=train_transform),
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=args.workers)

    val_queue = torch.utils.data.DataLoader(ImageList(root=args.img_root,
                                                      list_path=args.val_list,
                                                      transform=val_transform),
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            pin_memory=True,
                                            num_workers=args.workers)

    for epoch in range(args.epochs):
        mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
        model = Network(args.num_classes, mc_num_dddict, lat_lookup)
        model = torch.nn.DataParallel(model).cuda()
        model.module.set_temperature(args.T)

        # load model
        model_path = os.path.join(args.save,
                                  'searched_model_{:02}.pth.tar'.format(epoch))
        state_dict = torch.load(model_path)['state_dict']
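        # load the saved weights into the rebuilt model: plain tensors are copied
        # directly; each op's conv weights are sliced channel-wise by its mc_mask
        # (the attribute paths are built as strings, hence the exec calls below)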
        for key in state_dict:
            if 'm_ops' not in key:
                exec('model.{}.data = state_dict[key].data'.format(key))
        for stage in mc_mask_dddict:
            for block in mc_mask_dddict[stage]:
                for op_idx in mc_mask_dddict[stage][block]:
                    index = torch.nonzero(
                        mc_mask_dddict[stage][block][op_idx]).view(-1)
                    index = index.cuda()
                    iw = 'model.module.{}.{}.m_ops[{}].inverted_bottleneck.conv.weight.data'.format(
                        stage, block, op_idx)
                    iw_key = 'module.{}.{}.m_ops.{}.inverted_bottleneck.conv.weight'.format(
                        stage, block, op_idx)
                    exec(
                        iw +
                        ' = torch.index_select(state_dict[iw_key], 0, index).data'
                    )
                    dw = 'model.module.{}.{}.m_ops[{}].depth_conv.conv.weight.data'.format(
                        stage, block, op_idx)
                    dw_key = 'module.{}.{}.m_ops.{}.depth_conv.conv.weight'.format(
                        stage, block, op_idx)
                    exec(
                        dw +
                        ' = torch.index_select(state_dict[dw_key], 0, index).data'
                    )
                    pw = 'model.module.{}.{}.m_ops[{}].point_linear.conv.weight.data'.format(
                        stage, block, op_idx)
                    pw_key = 'module.{}.{}.m_ops.{}.point_linear.conv.weight'.format(
                        stage, block, op_idx)
                    exec(
                        pw +
                        ' = torch.index_select(state_dict[pw_key], 1, index).data'
                    )
                    if op_idx >= 4:
                        se_cr_w = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_reduce.weight.data'.format(
                            stage, block, op_idx)
                        se_cr_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.weight'.format(
                            stage, block, op_idx)
                        exec(
                            se_cr_w +
                            ' = torch.index_select(state_dict[se_cr_w_key], 1, index).data'
                        )
                        se_cr_b = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_reduce.bias.data'.format(
                            stage, block, op_idx)
                        se_cr_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.bias'.format(
                            stage, block, op_idx)
                        exec(se_cr_b + ' = state_dict[se_cr_b_key].data')
                        se_ce_w = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_expand.weight.data'.format(
                            stage, block, op_idx)
                        se_ce_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.weight'.format(
                            stage, block, op_idx)
                        exec(
                            se_ce_w +
                            ' = torch.index_select(state_dict[se_ce_w_key], 0, index).data'
                        )
                        se_ce_b = 'model.module.{}.{}.m_ops[{}].squeeze_excite.conv_expand.bias.data'.format(
                            stage, block, op_idx)
                        se_ce_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.bias'.format(
                            stage, block, op_idx)
                        exec(
                            se_ce_b +
                            ' = torch.index_select(state_dict[se_ce_b_key], 0, index).data'
                        )
        del index

        lr = lr_list[epoch]
        optimizer_w = torch.optim.SGD(model.module.weight_parameters(),
                                      lr=lr,
                                      momentum=args.w_mom,
                                      weight_decay=args.w_wd)
        optimizer_a = torch.optim.Adam(model.module.arch_parameters(),
                                       lr=args.a_lr,
                                       betas=(args.a_beta1, args.a_beta2),
                                       weight_decay=args.a_wd)
        logging.info('Epoch: %d lr: %e T: %e', epoch, lr, args.T)

        # training
        epoch_start = time.time()
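        # the first 10 epochs train the weights only; afterwards weight and
        # architecture updates alternate while the sampling temperature T decays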
        if epoch < 10:
            train_acc = train_wo_arch(train_queue, model, criterion,
                                      optimizer_w)
        else:
            train_acc = train_w_arch(train_queue, val_queue, model, criterion,
                                     optimizer_w, optimizer_a)
            args.T *= args.T_decay
        # logging arch parameters
        logging.info('The current arch parameters are:')
        for param in model.module.log_alphas_parameters():
            param = np.exp(param.detach().cpu().numpy())
            logging.info(' '.join(['{:.6f}'.format(p) for p in param]))
        for param in model.module.betas_parameters():
            param = F.softmax(param.detach().cpu(), dim=-1)
            param = param.numpy()
            logging.info(' '.join(['{:.6f}'.format(p) for p in param]))
        logging.info('Train_acc %f', train_acc)
        epoch_duration = time.time() - epoch_start
        logging.info('Epoch time: %ds', epoch_duration)

        # validation for last 5 epochs
        if args.epochs - epoch < 5:
            val_acc = validate(val_queue, model, criterion)
            logging.info('Val_acc %f', val_acc)

        # update state_dict
        state_dict_from_model = model.state_dict()
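        # write the trained weights back into the full-width state_dict,
        # scattering each op's channels into the positions given by its mask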
        for key in state_dict:
            if 'm_ops' not in key:
                state_dict[key].data = state_dict_from_model[key].data
        for stage in mc_mask_dddict:
            for block in mc_mask_dddict[stage]:
                for op_idx in mc_mask_dddict[stage][block]:
                    index = torch.nonzero(
                        mc_mask_dddict[stage][block][op_idx]).view(-1)
                    index = index.cuda()
                    iw_key = 'module.{}.{}.m_ops.{}.inverted_bottleneck.conv.weight'.format(
                        stage, block, op_idx)
                    state_dict[iw_key].data[index, :, :, :] = state_dict_from_model[iw_key]
                    dw_key = 'module.{}.{}.m_ops.{}.depth_conv.conv.weight'.format(
                        stage, block, op_idx)
                    state_dict[dw_key].data[index, :, :, :] = state_dict_from_model[dw_key]
                    pw_key = 'module.{}.{}.m_ops.{}.point_linear.conv.weight'.format(
                        stage, block, op_idx)
                    state_dict[pw_key].data[:, index, :, :] = state_dict_from_model[pw_key]
                    if op_idx >= 4:
                        se_cr_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.weight'.format(
                            stage, block, op_idx)
                        state_dict[se_cr_w_key].data[:, index, :, :] = state_dict_from_model[se_cr_w_key]
                        se_cr_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_reduce.bias'.format(
                            stage, block, op_idx)
                        state_dict[se_cr_b_key].data[:] = state_dict_from_model[se_cr_b_key]
                        se_ce_w_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.weight'.format(
                            stage, block, op_idx)
                        state_dict[se_ce_w_key].data[index, :, :, :] = state_dict_from_model[se_ce_w_key]
                        se_ce_b_key = 'module.{}.{}.m_ops.{}.squeeze_excite.conv_expand.bias'.format(
                            stage, block, op_idx)
                        state_dict[se_ce_b_key].data[index] = state_dict_from_model[se_ce_b_key]
        del state_dict_from_model, index

        # shrink and expand
        if epoch >= 10:
            logging.info('Now shrinking or expanding the arch')
            op_weights, depth_weights = get_op_and_depth_weights(model)
            parsed_arch = parse_architecture(op_weights, depth_weights)
            mc_num_dddict = get_mc_num_dddict(mc_mask_dddict)
            before_lat = get_lookup_latency(parsed_arch, mc_num_dddict,
                                            lat_lookup_key_dddict, lat_lookup)
            logging.info(
                'Before, the current lat: {:.4f}, the target lat: {:.4f}'.
                format(before_lat, args.target_lat))

            if before_lat > args.target_lat:
                logging.info('Shrinking......')
                stages = ['stage{}'.format(x) for x in range(1, 7)]
                mc_num_dddict, after_lat = fit_mc_num_by_latency(
                    parsed_arch,
                    mc_num_dddict,
                    mc_maxnum_dddict,
                    lat_lookup_key_dddict,
                    lat_lookup,
                    args.target_lat,
                    stages,
                    sign=-1)
                for start in range(2, 7):
                    stages = ['stage{}'.format(x) for x in range(start, 7)]
                    mc_num_dddict, after_lat = fit_mc_num_by_latency(
                        parsed_arch,
                        mc_num_dddict,
                        mc_maxnum_dddict,
                        lat_lookup_key_dddict,
                        lat_lookup,
                        args.target_lat,
                        stages,
                        sign=1)
            elif before_lat < args.target_lat:
                logging.info('Expanding......')
                stages = ['stage{}'.format(x) for x in range(1, 7)]
                mc_num_dddict, after_lat = fit_mc_num_by_latency(
                    parsed_arch,
                    mc_num_dddict,
                    mc_maxnum_dddict,
                    lat_lookup_key_dddict,
                    lat_lookup,
                    args.target_lat,
                    stages,
                    sign=1)
                for start in range(2, 7):
                    stages = ['stage{}'.format(x) for x in range(start, 7)]
                    mc_num_dddict, after_lat = fit_mc_num_by_latency(
                        parsed_arch,
                        mc_num_dddict,
                        mc_maxnum_dddict,
                        lat_lookup_key_dddict,
                        lat_lookup,
                        args.target_lat,
                        stages,
                        sign=1)
            else:
                logging.info('No operation')
                after_lat = before_lat

            # change mc_mask_dddict based on mc_num_dddict
            for stage in parsed_arch:
                for block in parsed_arch[stage]:
                    op_idx = parsed_arch[stage][block]
                    if mc_num_dddict[stage][block][op_idx] != int(
                            sum(mc_mask_dddict[stage][block][op_idx]).item()):
                        mc_num = mc_num_dddict[stage][block][op_idx]
                        mc_mask_dddict[stage][block][op_idx].data.zero_()
                        key = 'module.{}.{}.m_ops.{}.depth_conv.conv.weight'.format(
                            stage, block, op_idx)
                        weight_copy = state_dict[key].clone().abs().cpu().numpy()
                        # keep the mc_num channels with the largest L1 norm
                        weight_l1_norm = np.sum(weight_copy, axis=(1, 2, 3))
                        weight_l1_order = np.argsort(weight_l1_norm)
                        weight_l1_order_rev = weight_l1_order[::-1][:mc_num]
                        mc_mask_dddict[stage][block][op_idx].data[
                            weight_l1_order_rev.tolist()] = 1.0

            logging.info(
                'After, the current lat: {:.4f}, the target lat: {:.4f}'.
                format(after_lat, args.target_lat))

        # save model
        model_path = os.path.join(
            args.save, 'searched_model_{:02}.pth.tar'.format(epoch + 1))
        torch.save(
            {
                'state_dict': state_dict,
                'mc_mask_dddict': mc_mask_dddict,
            }, model_path)
Example #8
        logging.info("Super Network flops (M) list: \n")
        logging.info(str(flops_list))
        logging.info("Total flops: " + str(total_flops))
    elif config.optim.sub_obj.type == 'latency':
        with open(
                os.path.join('latency_list',
                             config.optim.sub_obj.latency_list_path),
                'r') as f:
            latency_list = eval(f.readline())
        super_model.module.sub_obj_list = latency_list
        logging.info("Super Network latency (ms) list: \n")
        logging.info(str(latency_list))
    else:
        raise NotImplementedError
    logging.info("Num Params = %.2fMB",
                 utils.count_parameters_in_MB(super_model))

    if config.data.dataset == 'imagenet':
        imagenet = imagenet_data.ImageNet12(
            trainFolder=os.path.join(args.data_path, 'train'),
            testFolder=os.path.join(args.data_path, 'val'),
            num_workers=config.data.num_workers,
            type_of_data_augmentation=config.data.type_of_data_aug,
            data_config=config.data)
        train_queue, valid_queue = imagenet.getTrainTestLoader(
            config.data.batch_size, train_shuffle=True, val_shuffle=True)
    else:
        raise NotImplementedError

    search_optim = Optimizer(super_model, criterion, config)
Example #9
    train_loader = ClusterLoader(cluster_data,
                                 batch_size=150,
                                 shuffle=True,
                                 num_workers=12)
    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=12)

    # select the architecture (genotype) here
    genotype = eval("genotypes.%s" % args.arch)  # eval() evaluates the string expression and returns its value
    model = Network(args.init_channels,
                    args.classes,
                    args.num_cells,
                    genotype,
                    in_channels=args.in_channels)
    model = model.to(DEVICE)

    print("param size = {:.6f}MB".format(utils.count_parameters_in_MB(model)))

    criterion = nn.BCEWithLogitsLoss().to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(),lr=args.learning_rate,momentum=args.momentum)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))

    main()
Example #10
def main():
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)

    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, NUM_CLASSES, args.layers,
                    config.optim.auxiliary, genotype)

    start_epoch = 0
    model.eval()
    model.drop_path_prob = args.drop_path_prob * 0
    # compute the params as well as the multi-adds
    params = count_parameters_in_MB(model)
    logging.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model, input_size=config.data.input_size)
    logging.info("Mult-Adds = %.2fMB" % mult_adds)

    model.train()
    if len(args.gpus) > 1:
        model = nn.DataParallel(model)
    model = model.cuda()
    if config.optim.label_smooth:
        criterion = CrossEntropyLabelSmooth(NUM_CLASSES,
                                            config.optim.smooth_alpha)
    else:
        criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                config.optim.init_lr,
                                momentum=config.optim.momentum,
                                weight_decay=config.optim.weight_decay)

    imagenet = imagenet_data.ImageNet12(
        trainFolder=os.path.join(args.data_path, 'train'),
        testFolder=os.path.join(args.data_path, 'val'),
        num_workers=config.data.num_workers,
        type_of_data_augmentation=config.data.type_of_data_aug,
        data_config=config.data,
        size_images=config.data.input_size[1],
        scaled_size=config.data.scaled_size[1])
    train_queue, valid_queue = imagenet.getTrainTestLoader(
        config.data.batch_size)

    if config.optim.lr_schedule == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(config.train_params.epochs))

    trainer = Trainer(train_queue, valid_queue, criterion, config,
                      args.report_freq)
    best_epoch = [0, 0, 0]  # [epoch, acc_top1, acc_top5]
    lr = config.optim.init_lr
    for epoch in range(start_epoch, config.train_params.epochs):
        if config.optim.lr_schedule == 'cosine':
            scheduler.step()
            current_lr = scheduler.get_lr()[0]
        elif config.optim.lr_schedule == 'linear':  # with warmup initial
            optimizer, current_lr = adjust_lr(optimizer,
                                              config.train_params.epochs, lr,
                                              epoch)
        else:
            print('Wrong lr type, exit')
            sys.exit(1)
        if epoch < 5:  # linear LR warmup over the first 5 epochs
            current_lr = lr * (epoch + 1) / 5.0
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr
            logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                         lr * (epoch + 1) / 5.0)

        logging.info('Epoch: %d lr %e', epoch, current_lr)
        if len(args.gpus) > 1:
            model.module.drop_path_prob = args.drop_path_prob * epoch / config.train_params.epochs
        else:
            model.drop_path_prob = args.drop_path_prob * epoch / config.train_params.epochs
        train_acc_top1, train_acc_top5, train_obj, batch_time, data_time = trainer.train(
            model, optimizer, epoch)
        with torch.no_grad():
            val_acc_top1, val_acc_top5, batch_time, data_time = trainer.infer(
                model, epoch)
        if val_acc_top1 > best_epoch[1]:
            best_epoch = [epoch, val_acc_top1, val_acc_top5]
            if epoch >= 0:  # always true (previously gated, presumably at epoch 120)
                utils.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.module.state_dict()
                        if len(args.gpus) > 1 else model.state_dict(),
                        'best_acc_top1': val_acc_top1,
                        'optimizer': optimizer.state_dict(),
                    },
                    save_path=args.save,
                    epoch=epoch,
                    is_best=True)
                if len(args.gpus) > 1:
                    utils.save(
                        model.module.state_dict(),
                        os.path.join(
                            args.save,
                            'weights_{}_{}.pt'.format(epoch, val_acc_top1)))
                else:
                    utils.save(
                        model.state_dict(),
                        os.path.join(
                            args.save,
                            'weights_{}_{}.pt'.format(epoch, val_acc_top1)))

        logging.info('BEST EPOCH %d  val_top1 %.2f val_top5 %.2f',
                     best_epoch[0], best_epoch[1], best_epoch[2])
        logging.info(
            'epoch: {} \t train_acc_top1: {:.4f} \t train_loss: {:.4f} \t val_acc_top1: {:.4f}'
            .format(epoch, train_acc_top1, train_obj, val_acc_top1))

    logging.info("Params = %.2fMB" % params)
    logging.info("Mult-Adds = %.2fMB" % mult_adds)
Example #11
def main():
    if not torch.cuda.is_available():
        print('no gpu device available')
        sys.exit(1)
    if args.random_seed:
        args.seed = np.random.randint(0, 1000)  # scalar seed (an array would break torch.manual_seed)
    # reproducible: rerunning the code yields the same initialization.
    # You should ensure that all other libraries your code relies on, and
    # which use random numbers, also use a fixed seed.
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = torch.nn.BCEWithLogitsLoss().cuda()
    # in_channels is the feature dimension
    model = Network(args.init_channels,
                    args.classes,
                    args.num_cells,
                    criterion,
                    args.n_steps,
                    in_channels=args.in_channels).cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    num_edges = model._steps * 2
    post_train = 5
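    # one edge is finalized every decision_freq epochs after the warmup phase,
    # followed by a short post-training stage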
    args.epochs = args.warmup_dec_epoch + args.decision_freq * (
        num_edges - 1) + post_train + 1
    logging.info("total epochs: %d", args.epochs)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)

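    # -1 marks an edge whose operation is not yet decided; candidate_flags
    # tracks which edges are still eligible for selection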
    normal_selected_idxs = torch.tensor(len(model.alphas_normal) * [-1],
                                        requires_grad=False,
                                        dtype=torch.int).cuda()
    normal_candidate_flags = torch.tensor(len(model.alphas_normal) * [True],
                                          requires_grad=False,
                                          dtype=torch.bool).cuda()
    logging.info('normal_selected_idxs: {}'.format(normal_selected_idxs))
    logging.info('normal_candidate_flags: {}'.format(normal_candidate_flags))
    model.normal_selected_idxs = normal_selected_idxs
    model.normal_candidate_flags = normal_candidate_flags

    print(F.softmax(torch.stack(model.alphas_normal, dim=0), dim=-1).detach())

    normal_probs_history = []
    train_losses, valid_losses = utils.AverageMeter(), utils.AverageMeter()
    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        # training
        train_acc, train_loss = train(model, architect, criterion, optimizer,
                                      lr)
        print("!!!!!!!!!!!!!!!!train_loss:", train_loss)
        valid_acc, valid_losses = infer(model, criterion, valid_losses)
        logging.info('train_acc %f\tvalid_acc %f', train_acc, valid_acc)

        # make edge decisions
        saved_memory_normal, model.normal_selected_idxs, \
        model.normal_candidate_flags = edge_decision('normal',
                                                     model.alphas_normal,
                                                     model.normal_selected_idxs,
                                                     model.normal_candidate_flags,
                                                     normal_probs_history,
                                                     epoch,
                                                     model,
                                                     args)

        writer.add_scalar('stats/train_acc', train_acc, epoch)
        writer.add_scalar('stats/valid_acc', valid_acc, epoch)
        utils.save(model, os.path.join(args.save, 'search_weights.pt'))
        scheduler.step()

    logging.info("#" * 30 + " Done " + "#" * 30)
    logging.info('genotype = %s', model.get_genotype())