def profile(self, config=None, verbose=True):
     """Measure the MACs and parameter count of the generator ``netG_A``.

     The generator is always profiled with an explicit random style code of
     shape ``(1, opt.style_dim, 1, 1)``. Unless ``opt.no_style_encoder`` is
     set, it is profiled a second time with the image as its only input
     (presumably the generator then derives the style itself -- confirm
     against the generator's ``forward``).

     Args:
         config: optional configuration assigned to ``netG.configs`` before
             profiling.
         verbose: when true, print the measured numbers.

     Returns:
         ``(macs, params)``. ``macs`` is the count measured without an
         explicit style input when a style encoder is in use, otherwise the
         count measured with the explicit style code.
     """
     netG = self.netG_A
     if isinstance(netG, nn.DataParallel):
         # profile the wrapped module, not the parallel wrapper
         netG = netG.module
     if config is not None:
         netG.configs = config
     no_style_encoder = self.opt.no_style_encoder
     style = torch.randn(1, self.opt.style_dim, 1, 1, device=self.device)
     with torch.no_grad():
         sample = self.real_A[:1]
         macs_no_style_encoder = profile_macs(netG, (sample, style))
         macs_with_style_encoder = None
         if not no_style_encoder:
             # one-element tuple: the generator receives the image only
             macs_with_style_encoder = profile_macs(netG, (sample,))
     params = sum(p.numel() for p in netG.parameters())
     if verbose:
         if no_style_encoder:
             print('MACs (no style encoder): %.3fG\tParams: %.3fM' %
                   (macs_no_style_encoder / 1e9, params / 1e6),
                   flush=True)
         else:
             print(
                 'MACs (no style encoder): %.3fG\tMACs (with style encoder): %.3fG\tParams: %.3fM'
                 % (macs_no_style_encoder / 1e9,
                    macs_with_style_encoder / 1e9, params / 1e6),
                 flush=True)
     if no_style_encoder:
         # Previously this path crashed because the "with style encoder"
         # figure was never computed; report the only measurement we have.
         return macs_no_style_encoder, params
     return macs_with_style_encoder, params
Beispiel #2
0
	def get_architecture(self, args):
		"""Return the first stored candidate architecture that fits the budget.

		Iterates over the first ``self.num_gen_arch`` entries of ``self.lines``.
		Each line is parsed as a dict literal (single quotes are converted to
		double quotes so that ``json`` accepts it).

		* ``args.bound is None``: the first candidate is returned and the first
		  whitespace-separated token of its line is recorded in the returned
		  list.
		* otherwise: each candidate is sampled, built and profiled with a
		  ``1 x 3 x img_size x img_size`` input; the first one whose MACs
		  (in millions) do not exceed ``args.bound`` is returned. Nothing is
		  appended to the returned list on this path.

		Returns:
			``(searched_g, pred_acc_lst)``.

		Raises:
			ValueError: if no candidate qualifies.
		"""
		pred_acc_lst = []
		searched_g = None

		with torch.no_grad():
			for n in range(self.num_gen_arch):
				tokens = self.lines[n].split()
				file_acc = tokens[0]
				# re-joining the tokens normalises whitespace before parsing
				g = json.loads(' '.join(tokens).replace("'", "\""))

				if args.bound is None:
					# no budget: accept the very first candidate
					searched_g = g
					pred_acc_lst.append(file_acc)
					break

				subnet, _ = self.sample(g)
				net = NSGANetV2.build_from_config(
					subnet.config, drop_connect_rate=args.drop_path)
				inputs = torch.randn(1, 3, args.img_size, args.img_size)
				# profile a copy so the freshly built network is left untouched
				flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
				if flops <= args.bound:
					searched_g = g
					break

		if searched_g is None:
			raise ValueError(
				'no architecture among the {} candidates satisfies bound={}'.format(
					self.num_gen_arch, args.bound))
		return searched_g, pred_acc_lst
Beispiel #3
0
def set_architecture(n_cls):
	"""Build the network to train and return it with a descriptive name.

	The architecture description is read from a ``net.subnet`` JSON file when
	``args.model_config`` starts with ``'flops@'``; otherwise it is obtained
	from ``evaluator.get_architecture(args)``. The sampled subnet's weights
	are copied into a freshly built network, its classifier is reset for
	``n_cls`` classes, and its config is saved under ``args.save_path``.

	Args:
		n_cls: number of output classes for the new classifier.

	Returns:
		``(net, net_name)`` where ``net`` has been moved to ``args.device``
		(wrapped in ``nn.DataParallel`` when ``args.n_gpus > 1``) and
		``net_name`` is ``"net_flops@<MACs in millions>"``.
	"""
	if args.model_config.startswith('flops@'):
		names = {'cifar10': 'CIFAR-10', 'cifar100': 'CIFAR-100',
		         'aircraft100': 'Aircraft', 'pets': 'Pets'}
		p = './searched-architectures/{}/net-{}/net.subnet'.format(
			names[args.data_name], args.model_config)
		# close the file deterministically instead of leaking the handle
		with open(p) as f:
			g = json.load(f)
	else:
		g, _ = evaluator.get_architecture(args)

	subnet, _ = evaluator.sample(g)
	net = NSGANetV2.build_from_config(subnet.config, drop_connect_rate=args.drop_path)
	net.load_state_dict(subnet.state_dict())

	NSGANetV2.reset_classifier(
		net, last_channel=net.classifier.in_features,
		n_classes=n_cls, dropout_rate=args.drop)
	# calculate #Parameters and #FLOPS (profiled on a copy of the network)
	inputs = torch.randn(1, 3, args.img_size, args.img_size)
	flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
	params = sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6
	net_name = "net_flops@{:.0f}".format(flops)
	logging.info('#params {:.2f}M, #flops {:.0f}M'.format(params, flops))
	OFAEvaluator.save_net_config(args.save_path, net, net_name + '.config')
	if args.n_gpus > 1:
		net = nn.DataParallel(net)  # data parallel in case more than 1 gpu available
	net = net.to(args.device)

	return net, net_name
Beispiel #4
0
def get_net_info(net,
                 input_shape=(3, 224, 224),
                 measure_latency=None,
                 print_info=True,
                 clean=False,
                 lut=None):
    """
    Collect parameter count, MACs and (optionally) latencies of ``net``.

    ``measure_latency`` is a ``'#'``-separated list of latency types. A type
    that has an entry in ``lut`` is estimated through the lookup table; any
    other type is measured with ``measure_net_latency``. The result is a dict
    with keys ``'params'``, ``'flops'`` and one ``'<type> latency'`` entry
    (``{'val': ..., 'hist': ...}``) per requested type.

    Modified from https://github.com/mit-han-lab/once-for-all/blob/
    35ddcb9ca30905829480770a6a282d49685aa282/ofa/imagenet_codebase/utils/pytorch_utils.py#L139
    """
    from ofa.imagenet_codebase.utils.pytorch_utils import count_parameters, measure_net_latency

    # dummy 3-channel image with the spatial size taken from input_shape
    dummy = torch.randn(1, 3, input_shape[-2], input_shape[-1])

    # run on the first GPU when one is present
    if torch.cuda.is_available():
        gpu = torch.device('cuda:0')
        net = net.to(gpu)
        cudnn.benchmark = True
        dummy = dummy.to(gpu)

    # statistics are taken on the bare module, not on the parallel wrapper
    if isinstance(net, nn.DataParallel):
        net = net.module

    net_info = {
        'params': count_parameters(net),
        # profile a copy so the original network is left untouched
        'flops': int(profile_macs(copy.deepcopy(net), dummy)),
    }

    latency_types = measure_latency.split('#') if measure_latency is not None else []

    for l_type in latency_types:
        if lut is not None and l_type in lut:
            # table-based estimate: no measurement history available
            estimator = LatencyEstimator(lut[l_type])
            latency = look_up_latency(net, estimator, input_shape[2])
            history = None
        else:
            latency, history = measure_net_latency(
                net, l_type, fast=False, input_shape=input_shape, clean=clean)
        net_info['%s latency' % l_type] = {'val': latency, 'hist': history}

    if print_info:
        print('Total training params: %.2fM' % (net_info['params'] / 1e6))
        print('Total FLOPs: %.2fM' % (net_info['flops'] / 1e6))
        for l_type in latency_types:
            entry = net_info['%s latency' % l_type]
            print('Estimated %s latency: %.3fms' % (l_type, entry['val']))

    return net_info
Beispiel #5
0
    def net_flops(self):
        """Return the MACs of ``self.net`` for one all-zero sample.

        The sample has batch size 1 and the per-sample shape given by
        ``self.run_config.data_provider.data_shape``; it is placed on the
        current CUDA device.
        """
        sample_shape = [1, *self.run_config.data_provider.data_shape]
        model = self.net
        dummy = torch.zeros(sample_shape).cuda()
        with torch.no_grad():
            return profile_macs(model, dummy)
Beispiel #6
0
 def profile(self, input_semantics):
     """Return ``(macs, params)`` of the generator for ``input_semantics``.

     A ``DataParallel`` wrapper is removed first, and ``self.config`` (when
     not ``None``) is assigned to the generator's ``config`` attribute
     before profiling.
     """
     generator = self.netG
     if isinstance(generator, nn.DataParallel):
         generator = generator.module
     if self.config is not None:
         generator.config = self.config
     with torch.no_grad():
         macs = profile_macs(generator, (input_semantics, ))
     # total number of elements over all parameter tensors
     params = sum(p.numel() for p in generator.parameters())
     return macs, params
Beispiel #7
0
 def profile_macs(self, *inputs, batch_size=2) -> np.int64:
     """
     Count the MACs (multiply-accumulate operations) of a forward pass,
     normalised per sample.

     Without positional inputs, a random tensor of ``batch_size`` samples is
     drawn from ``self.get_shape_in()`` and moved to ``self.get_device()``.
     A single positional input is passed through as-is rather than as a
     1-tuple. The total is floor-divided by ``batch_size`` in every case, so
     callers supplying their own inputs should pass the matching value.
     """
     with torch.no_grad():
         if not inputs:
             shape_in = self.get_shape_in()
             inputs = shape_in.random_tensor(batch_size=batch_size).to(self.get_device())
         unwrap = isinstance(inputs, (tuple, list)) and len(inputs) == 1
         profiler_args = inputs[0] if unwrap else inputs
         total = torchprofile.profile_macs(self, args=profiler_args)
         return total // batch_size
Beispiel #8
0
 def profile(self, config=None, verbose=True):
     """Return ``(macs, params)`` of the generator on the first sample of ``real_A``.

     ``config`` (when given) is assigned to the generator's ``configs``
     attribute before profiling; ``verbose`` controls whether the numbers
     are printed.
     """
     generator = self.netG
     if isinstance(generator, nn.DataParallel):
         # measure the wrapped module itself
         generator = generator.module
     if config is not None:
         generator.configs = config
     with torch.no_grad():
         sample = (self.real_A[:1],)
         macs = profile_macs(generator, sample)
     params = sum(p.numel() for p in generator.parameters())
     if verbose:
         print('MACs: %.3fG\tParams: %.3fM' % (macs / 1e9, params / 1e6), flush=True)
     return macs, params
Beispiel #9
0
def inference_macs(
    network: nn.Module,
    args: Tuple = (),
    data_shape: Optional[Tuple] = None,
    unit: float = 1e6,
) -> float:
    """Return the MACs of ``network`` in eval mode, divided by ``unit``.

    Exactly one of ``args`` and ``data_shape`` should describe the input:
    ``args`` is forwarded to ``profile_macs`` as-is, while ``data_shape``
    makes the function build an all-zero tensor of that shape on the
    network's device.

    Args:
        network: module to profile; a parallel wrapper is unwrapped first.
        args: positional inputs for the forward pass.
        data_shape: shape of a synthetic zero input (including batch dim).
        unit: divisor applied to the raw count (default: millions).

    Raises:
        ValueError: if both ``args`` and ``data_shape`` are supplied.
    """
    if is_parallel(network):
        network = network.module
    if data_shape is not None:
        if len(args) > 0:
            raise ValueError("Please provide either data_shape or args tuple.")
        args = (torch.zeros(data_shape, device=get_module_device(network)), )
    is_training = network.training
    network.eval()
    try:
        return profile_macs(network, args=args) / unit
    finally:
        # restore the caller's train/eval mode even when profiling fails
        network.train(is_training)
Beispiel #10
0
def profile(c):
    """Print MACs-derived FLOPs and the parameter count of the "max" model, then exit.

    Only the first batch of the test split is used; the process terminates
    with status 0 right after printing.

    NOTE(review): the body mixes the parameter ``c`` with a module-level
    ``config`` (the model is loaded via ``config.var(device="cpu")`` while
    the batch is moved to ``c.device``) -- confirm this is intentional.
    """
    c.setdefault(hebbian=False, distributed=False)
    net, _ = config.var(device="cpu").load_model("max")
    print('profiling')
    net.eval()

    test_batches = SequentialIterator(config, config.eval_batch, split="test")
    with torch.no_grad():
        for batch in test_batches:
            tokens = to_torch(batch, c.device).t()

            # the sequence shifted by one position forms the (input, label) pair
            model_args = (tokens[:-1], tokens[1:])

            macs = profile_macs(net, model_args)
            print("==> FLOPS: ", macs / config.eval_chunk * 2)

            print("==> Models size: ", count_params(net))

            exit(0)
Beispiel #11
0
    def add_gene(self, gene, macs=None, score=None, method=0, parents=None):
        """Register ``gene`` and add it to the population if it fits the MACs window.

        An unseen gene is applied to the model as its length configuration,
        profiled (unless a truthy ``macs`` is supplied) and evaluated (unless
        a truthy ``score`` is supplied), then cached in ``self.store``. A gene
        below ``self.lower_constraint`` is rejected before evaluation and is
        not cached.

        Returns:
            ``True`` only when the gene was appended to ``self.population``.
        """
        if gene not in self.store:
            self.model.eval()
            if self.model.config.model_type == "distilbert":
                encoder = self.model.distilbert
            else:
                assert hasattr(self.model, "bert")
                encoder = self.model.bert
            encoder.set_length_config(gene)
            if not macs:
                macs = torchprofile.profile_macs(self.model, args=self.dummy_inputs)
            if macs < self.lower_constraint:
                return False
            if not score:
                score = self.evaluate(self.args, self.model, self.tokenizer)[0]['f1']
            self.store[gene] = (macs, score, method, parents)
            logger.info(store2str(gene, macs, score, method, parents))

        # always decide on the cached MACs value, not on the argument
        cached_macs = self.store[gene][0]
        fits_lower = cached_macs >= self.lower_constraint
        if not fits_lower:
            return False
        fits_upper = self.upper_constraint is None or cached_macs <= self.upper_constraint
        if not fits_upper or gene in self.population:
            return False
        self.population.append(gene)
        return True
Beispiel #12
0
    outC = random.choice(outChannelList)
    network.append(Conv2d(inC, outC, 1, 1, 0))
    inDim = convolution(inDim, inC, outC, 1, 1, 0, netEmbedding)

    inC = outC
    outC = 1000
    network.append(Conv2d(inC, outC, 1, 1, 0))
    inDim = convolution(inDim, inC, outC, 1, 1, 0, netEmbedding)

    net = nn.Sequential(*network)
    # print(net)

    x = torch.rand([1, 3, 224, 224])
    with torch.autograd.profiler.profile() as prof:
        y = net(x)
    macs = profile_macs(net, x)
    if 40000000 < macs < 400000000:
        i += 1
    else:
        continue
    params = sum([p.numel() for p in net.parameters()])
    flopsList.append(macs / 1e6)
    modelSizeList.append((params * 4.0) / (1024**2))
    inferenceTimeList.append(prof.self_cpu_time_total / 1000.0)
    print(
        'Model ' + str(i) +
        ' NumBottle:%d Skips: %d MACS: %f M   ModelSize: %f MB  Inference: %f ms'
        % (numMB, numSkip, macs / 1e6,
           (params * 4.0) / (1024**2), prof.self_cpu_time_total / 1000.0))
    net.eval()
Beispiel #13
0
def main():
    """Fine-tune a network on CIFAR-10/100, starting from an ImageNet checkpoint.

    Steps: seed the RNGs, build the train/validation loaders for the
    module-level ``dataset``, create the model with a 1000-class head, load
    the checkpoint given by ``args.imagenet``, reset the classifier to
    ``NUM_CLASSES``, log parameter/MACs statistics, then train with SGD and
    cosine annealing for ``args.epochs`` epochs. When ``args.save`` is set,
    the weights are written to ``weights.pt`` whenever validation accuracy
    improves.

    Exits with status 1 when no GPU is available.

    Raises:
        KeyError: if ``dataset`` is neither ``'cifar10'`` nor ``'cifar100'``.
    """
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    logging.info("args = %s", args)

    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    best_acc = 0  # best validation accuracy seen so far

    # Data
    train_transform, valid_transform = _data_transforms(args)
    if dataset == 'cifar100':
        dataset_cls = torchvision.datasets.CIFAR100
    elif dataset == 'cifar10':
        dataset_cls = torchvision.datasets.CIFAR10
    else:
        raise KeyError(dataset)

    train_data = dataset_cls(root=args.data,
                             train=True,
                             download=True,
                             transform=train_transform)
    valid_data = dataset_cls(root=args.data,
                             train=False,
                             download=True,
                             transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=args.num_workers)

    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=200,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=args.num_workers)

    net = factory(args.model, pretrained=False,
                  num_classes=1000)  # assuming transfer from ImageNet
    load_checkpoint(net, args.imagenet, use_ema=True)

    net.reset_classifier(num_classes=NUM_CLASSES)
    net.drop_rate = args.drop

    # calculate #Parameters and #FLOPS
    params = sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6
    try:
        inputs = torch.randn(1, 3, 224, 224)
        # profile a copy so the network being trained is left untouched
        flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
    except Exception as exc:
        # Profiling is best-effort: report why it failed and keep going,
        # but no longer swallow KeyboardInterrupt / SystemExit.
        logging.warning('MACs profiling failed: %s', exc)
        logging.info('#params {:.2f}M'.format(params))
    else:
        logging.info('#params {:.2f}M, #flops {:.0f}M'.format(params, flops))

    if args.n_gpus > 1:
        net = nn.DataParallel(
            net)  # data parallel in case more than 1 gpu available

    net = net.to(device)

    n_epochs = args.epochs

    parameters = filter(lambda p: p.requires_grad, net.parameters())

    criterion = nn.CrossEntropyLoss().to(device)

    optimizer = optim.SGD(parameters,
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    for epoch in range(n_epochs):

        # Read the rate actually in use from the optimizer; calling
        # scheduler.get_lr() outside of step() is deprecated and can
        # report a value that differs from the one applied.
        logging.info('epoch %d lr %e', epoch, optimizer.param_groups[0]['lr'])

        train(train_queue, net, criterion, optimizer)
        _, valid_acc = infer(valid_queue, net, criterion)

        # checkpoint saving: keep only the best-performing weights
        if args.save:
            if valid_acc > best_acc:
                torch.save(net.state_dict(),
                           os.path.join(args.save, 'weights.pt'))
                best_acc = valid_acc

        scheduler.step()
Beispiel #14
0
def main():
    """Entry point of the training script.

    In order: parse arguments and set up (optionally distributed) device
    state, create the model and optionally load pretrain weights, print the
    model's MACs, move the model to GPU and wrap it (DataParallel, AMP, EMA,
    SyncBN, DDP), build the scheduler, data loaders and loss functions, then
    either run a single validation pass (``args.evaluate``) or the
    train/validate loop with checkpointing on the rank-0 process.
    """
    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            _logger.warning(
                'Using more than one GPU per process in distributed mode is not allowed.Setting num_gpu to 1.'
            )
            args.num_gpu = 1

    # single-process defaults; overridden below when running distributed
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.rank = int(os.environ['RANK'])
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=args.init_method,
                                             rank=args.rank,
                                             world_size=args.world_size)
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        _logger.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        _logger.info('Training with a single process on %d GPUs.' %
                     args.num_gpu)

    # offset the seed by rank so each process gets a different random stream
    torch.manual_seed(args.seed + args.rank)

    model = create_model(
        args.model,
        pretrained=args.pretrained,
        num_classes=args.num_classes,
        drop_rate=args.drop,
        drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
        global_pool=args.gp,
        bn_tf=args.bn_tf,
        bn_momentum=args.bn_momentum,
        bn_eps=args.bn_eps,
        checkpoint_path=args.initial_checkpoint)

    ################## pretrain ############
    if args.pretrain_path is not None:
        print('Loading:', args.pretrain_path)
        state_dict = torch.load(args.pretrain_path)
        # strict=False: missing or unexpected keys are tolerated
        model.load_state_dict(state_dict, strict=False)
        print('Pretrain weights loaded.')
    ################### flops #################
    print(model)
    if hasattr(model, 'default_cfg'):
        default_cfg = model.default_cfg
        input_size = [1] + list(default_cfg['input_size'])
    else:
        input_size = [1, 3, 224, 224]
    # the dummy input is created on CPU; the model is only moved to GPU further below
    input = torch.randn(input_size)  #.cuda()

    from torchprofile import profile_macs
    # NOTE(review): the value is a MAC count although the message below says "flops"
    macs = profile_macs(model, input)
    print('model flops:', macs, 'input_size:', input_size)
    ##########################################

    if args.local_rank == 0:
        _logger.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel()
                                       for m in model.parameters()])))

    data_config = resolve_data_config(vars(args),
                                      model=model,
                                      verbose=args.local_rank == 0)

    num_aug_splits = 0
    if args.aug_splits > 0:
        assert args.aug_splits > 1, 'A split of 1 makes no sense'
        num_aug_splits = args.aug_splits

    if args.split_bn:
        assert num_aug_splits > 1 or args.resplit
        model = convert_splitbn_model(model, max(num_aug_splits, 2))

    # resolve which mixed-precision backend (if any) will be used
    use_amp = None
    if args.amp:
        # for backwards compat, `--amp` arg tries apex before native amp
        if has_apex:
            args.apex_amp = True
        elif has_native_amp:
            args.native_amp = True
    if args.apex_amp and has_apex:
        use_amp = 'apex'
    elif args.native_amp and has_native_amp:
        use_amp = 'native'
    elif args.apex_amp or args.native_amp:
        _logger.warning(
            "Neither APEX or native Torch AMP is available, using float32. "
            "Install NVIDA apex or upgrade to PyTorch 1.6")

    if args.num_gpu > 1:
        if use_amp == 'apex':
            _logger.warning(
                'Apex AMP does not work well with nn.DataParallel, disabling. Use DDP or Torch AMP.'
            )
            use_amp = None
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
        assert not args.channels_last, "Channels last not supported with DP, use DDP."
    else:
        model.cuda()
        if args.channels_last:
            model = model.to(memory_format=torch.channels_last)

    optimizer = create_optimizer(args, model)

    amp_autocast = suppress  # do nothing
    loss_scaler = None
    if use_amp == 'apex':
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        loss_scaler = ApexScaler()
        if args.local_rank == 0:
            _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.')
    elif use_amp == 'native':
        amp_autocast = torch.cuda.amp.autocast
        loss_scaler = NativeScaler()
        if args.local_rank == 0:
            _logger.info(
                'Using native Torch AMP. Training in mixed precision.')
    else:
        if args.local_rank == 0:
            _logger.info('AMP not enabled. Training in float32.')

    # optionally resume from a checkpoint
    resume_epoch = None
    if args.resume:
        resume_epoch = resume_checkpoint(
            model,
            args.resume,
            optimizer=None if args.no_resume_opt else optimizer,
            loss_scaler=None if args.no_resume_opt else loss_scaler,
            log_info=args.local_rank == 0)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume=args.resume)

    if args.distributed:
        if args.sync_bn:
            assert not args.split_bn
            try:
                if has_apex and use_amp != 'native':
                    # Apex SyncBN preferred unless native amp is activated
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                        model)
                if args.local_rank == 0:
                    _logger.info(
                        'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                        'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.'
                    )
            except Exception as e:
                # best-effort: a failed conversion is logged and training continues without SyncBN
                _logger.error(
                    'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1'
                )
        if has_apex and use_amp != 'native':
            # Apex DDP preferred unless native amp is activated
            if args.local_rank == 0:
                _logger.info("Using NVIDIA APEX DistributedDataParallel.")
            model = ApexDDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                _logger.info("Using native Torch DistributedDataParallel.")
            model = NativeDDP(model, device_ids=[
                args.local_rank
            ])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        # bring the scheduler in line with the epoch training starts from
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        _logger.info('Scheduled epochs: {}'.format(num_epochs))

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        _logger.error(
            'Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_args = dict(mixup_alpha=args.mixup,
                          cutmix_alpha=args.cutmix,
                          cutmix_minmax=args.cutmix_minmax,
                          prob=args.mixup_prob,
                          switch_prob=args.mixup_switch_prob,
                          mode=args.mixup_mode,
                          label_smoothing=args.smoothing,
                          num_classes=args.num_classes)
        # with the prefetcher, mixup is applied in the collate function; otherwise per batch via mixup_fn
        if args.prefetcher:
            assert not num_aug_splits  # collate conflict (need to support deinterleaving in collate mixup)
            collate_fn = FastCollateMixup(**mixup_args)
        else:
            mixup_fn = Mixup(**mixup_args)

    if num_aug_splits > 1:
        dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)

    train_interpolation = args.train_interpolation
    if args.no_aug or not train_interpolation:
        train_interpolation = data_config['interpolation']
    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        no_aug=args.no_aug,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        re_split=args.resplit,
        scale=args.scale,
        ratio=args.ratio,
        hflip=args.hflip,
        vflip=args.vflip,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_aug_splits=num_aug_splits,
        interpolation=train_interpolation,
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        pin_memory=args.pin_mem,
        use_multi_epochs_loader=args.use_multi_epochs_loader,
        repeated_aug=args.repeated_aug)

    # the evaluation split may live in either 'val' or 'validation'
    eval_dir = os.path.join(args.data, 'val')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            _logger.error(
                'Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
        pin_memory=args.pin_mem,
    )

    # choose the training loss; validation always uses plain cross-entropy
    if args.jsd:
        assert num_aug_splits > 1  # JSD only valid with aug splits set
        train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits,
                                        smoothing=args.smoothing).cuda()
    elif mixup_active:
        # smoothing is handled with mixup target transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=args.smoothing).cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
    validate_loss_fn = nn.CrossEntropyLoss().cuda()

    # evaluation-only mode: run one validation pass, print it and stop
    if args.evaluate:
        eval_metrics = validate(model,
                                loader_eval,
                                validate_loss_fn,
                                args,
                                amp_autocast=amp_autocast)
        print(eval_metrics)
        return

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    # only the process with local_rank 0 creates the output dir and the checkpoint saver
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"), args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(model=model,
                                optimizer=optimizer,
                                args=args,
                                model_ema=model_ema,
                                amp_scaler=loss_scaler,
                                checkpoint_dir=output_dir,
                                recovery_dir=output_dir,
                                decreasing=decreasing)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        train_loss_fn,
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        amp_autocast=amp_autocast,
                                        loss_scaler=loss_scaler,
                                        model_ema=model_ema,
                                        mixup_fn=mixup_fn)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    _logger.info(
                        "Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            eval_metrics = validate(model,
                                    loader_eval,
                                    validate_loss_fn,
                                    args,
                                    amp_autocast=amp_autocast)

            if model_ema is not None and not args.model_ema_force_cpu:
                if args.distributed and args.dist_bn in ('broadcast',
                                                         'reduce'):
                    distribute_bn(model_ema, args.world_size,
                                  args.dist_bn == 'reduce')
                ema_eval_metrics = validate(model_ema.ema,
                                            loader_eval,
                                            validate_loss_fn,
                                            args,
                                            amp_autocast=amp_autocast,
                                            log_suffix=' (EMA)')
                # from here on the EMA metrics drive the scheduler, summary and checkpointing
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    epoch, metric=save_metric)

    except KeyboardInterrupt:
        # Ctrl-C ends training early; the best metric so far is still reported below
        pass
    if best_metric is not None:
        _logger.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
import torch
from torchprofile import profile_macs
from torchvision import models

if __name__ == '__main__':
    # Print the MACs (in billions) of every lowercase, non-dunder callable
    # exposed by torchvision.models, each instantiated with no arguments.
    for name, builder in models.__dict__.items():
        is_candidate = (name.islower() and not name.startswith('__')
                        and callable(builder))
        if not is_candidate:
            continue

        # names containing 'inception' are profiled at 299x299, the rest at 224x224
        side = 299 if 'inception' in name else 224

        net = builder().eval()
        sample = torch.randn(1, 3, side, side)

        total_macs = profile_macs(net, sample)
        print('{}: {:.4g} G'.format(name, total_macs / 1e9))
Beispiel #16
0
def get_net_macs(model, sample_input):
    """Return the MACs that ``profile_macs`` reports for ``model`` on ``sample_input``."""
    return profile_macs(model, sample_input)
Beispiel #17
0
def train(hyp):
    """Train a Darknet model configured through the module-level ``opt``.

    The function builds the model from ``opt.cfg``, reports its FLOPs,
    sets up the optimizer / scheduler, optionally resumes from
    ``opt.weights``, then runs the epoch loop with burn-in, optional
    multi-scale resizing, gradient accumulation and an EMA copy of the
    model. After each epoch it appends to the results file, logs to
    tensorboard (when ``tb_writer`` is set) and writes checkpoints.

    Args:
        hyp: mapping of hyperparameters. The keys read here are ``'cls'``,
            ``'lr0'``, ``'momentum'`` and ``'weight_decay'``. ``hyp['cls']``
            is rescaled in place by ``nc / 80``.

    Returns:
        The ``results`` tuple from the most recent ``test.test`` call
        (initialised to seven zeros). It is returned early when a
        non-finite loss is encountered.
    """
    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = max(round(64 / batch_size), 1)  # accumulate n times before optimizer update (bs 64)
    weights = opt.weights  # initial training weights
    imgsz_min, imgsz_max, imgsz_test = opt.img_size  # img sizes (min, max, test)

    # initialize
    wdir = opt.wdir + os.sep  # weights dir
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = wdir + 'results_'+opt.name+".txt"

    # Image Sizes
    gs = 32  # (pixels) grid size
    assert math.fmod(imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs)
    opt.multi_scale |= imgsz_min != imgsz_max  # multi if different (min, max)
    if opt.multi_scale:
        if imgsz_min == imgsz_max:
            imgsz_min //= 1.5
            imgsz_max //= 0.667
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
    img_size = imgsz_max  # initialize with max size

    # Configure run
    init_seeds()
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = 1 if opt.single_cls else int(data_dict['classes'])  # number of classes
    hyp['cls'] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg).to(device)

    # Calculate flops. The dummy input must live on the same device as the
    # model, so use `device` rather than an unconditional .cuda().
    inputs = torch.randn(1, 3, imgsz_test, imgsz_test).to(device)
    macs = profile_macs(copy.deepcopy(model), inputs)
    print('Model FLOPs: {}GFlops'.format(round(macs*2/1e9, 2)))

    # Optimizer
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2 += [v]  # biases
        elif 'Conv2d.weight' in k:
            pg1 += [v]  # apply weight_decay
        else:
            pg0 += [v]  # all else

    if opt.adam:
        # hyp['lr0'] *= 0.1  # reduce lr (i.e. SGD=5E-3, Adam=5E-4)
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    start_epoch = 0
    best_fitness = 0.0
    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        ckpt = torch.load(weights, map_location=device)

        # load model (only tensors whose element count matches the current model)
        try:
            ckpt['model'] = {k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel()}
            model.load_state_dict(ckpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                  (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15',  'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)

    if opt.freeze_layers:
        # Keep trainable only the YOLO layers and the module directly before each of them.
        output_layer_indices = [idx - 1 for idx, module in enumerate(model.module_list) if isinstance(module, YOLOLayer)]
        freeze_layer_indices = [x for x in range(len(model.module_list)) if
                                (x not in output_layer_indices) and
                                (x - 1 not in output_layer_indices)]
        for idx in freeze_layer_indices:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch - 1  # see link below
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, '.-', label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Initialize distributed training
    distributed = False  # remembers whether the process group was created, for teardown
    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
        dist.init_process_group(backend='nccl',  # 'distributed backend'
                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
                                world_size=1,  # number of nodes for distributed training
                                rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level
        distributed = True

    # Dataset
    dataset = LoadImagesAndLabels(train_path, img_size, batch_size,
                                  augment=True,
                                  hyp=hyp,  # augmentation hyperparameters
                                  rect=opt.rect,  # rectangular training
                                  cache_images=opt.cache_images,
                                  single_cls=opt.single_cls)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    #nw = 32
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Testloader
    testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(test_path, imgsz_test, batch_size,
                                                                hyp=hyp,
                                                                rect=True,
                                                                cache_images=opt.cache_images,
                                                                single_cls=opt.single_cls),
                                                                batch_size=batch_size,
                                                                num_workers=nw,
                                                                pin_memory=True,
                                                                collate_fn=dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights

    # Model EMA
    ema = torch_utils.ModelEMA(model)

    # Start training
    nb = len(dataloader)  # number of batches
    n_burn = max(3 * nb, 500)  # burn-in iterations, max(3 epochs, 500 iterations)
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test))
    print('Using %g dataloader workers' % nw)
    print('Starting training for %g epochs...' % epochs)
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
            dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Burn-in
            if ni <= n_burn:
                xi = [0, n_burn]  # x interp
                model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, 64 / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    x['weight_decay'] = np.interp(ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-Scale
            if opt.multi_scale:
                if ni / accumulate % 1 == 0:  #  adjust img_size (67% - 150%) every 1 batch
                    img_size = random.randrange(grid_min, grid_max + 1) * gs
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            loss *= batch_size / 64  # scale loss
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                ema.update(model)

            # Print
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size)
            pbar.set_description(s)

            # Plot
            if ni < 1:
                f = 'train_batch%g.jpg' % i  # filename
                res = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                if tb_writer:
                    tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch)
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        ema.update_attr(model)
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80
            results, maps = test.test(cfg,
                                      data,
                                      batch_size=batch_size,
                                      imgsz=imgsz_test,
                                      model=ema.ema,
                                      save_json=final_epoch and is_coco,
                                      single_cls=opt.single_cls,
                                      dataloader=testloader,
                                      multi_label=ni > n_burn)

        # Write
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name))

        # Tensorboard
        if tb_writer:
            tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1',
                    'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
            for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                tb_writer.add_scalar(tag, x, epoch)

        # Update best mAP
        fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        if fi > best_fitness:
            best_fitness = fi

        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            with open(results_file, 'r') as f:  # create checkpoint
                ckpt = {'epoch': epoch,
                         'best_fitness': best_fitness,
                         'training_results': f.read(),
                         'model': ema.ema.module.state_dict() if hasattr(model, 'module') else ema.ema.state_dict(),
                         'optimizer': None if final_epoch else optimizer.state_dict()}

            # Save last, best and delete
            torch.save(ckpt, last)
            if (best_fitness == fi) and not final_epoch:
                torch.save(ckpt, best)
            del ckpt

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    n = opt.name
    if len(n):
        n = '_' + n if not n.isnumeric() else n
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                if ispt:
                    strip_optimizer(f2)  # strip optimizer
                if opt.bucket and ispt:
                    os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket))  # upload

    if not opt.evolve:
        plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    # Only tear down the process group if this call actually created it.
    if distributed:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
    return results
Beispiel #18
0
    def __init__(self, task):
        """Set up options, data, model and logger for a 'train' or 'distill' run.

        The task name selects which options class and model factory are
        imported. The data shape is read from the first batch of the
        dataloader and stored on ``opt``. For the 'distill' task the MACs
        and parameter counts of the teacher and student generators are
        logged as well.

        Raises:
            NotImplementedError: for an unknown ``task`` or an unsupported
                ``opt.dataset_mode``.
        """
        if task == 'distill':
            from options.distill_options import DistillOptions as Options
            from distillers import create_distiller as create_model
        elif task == 'train':
            from options.train_options import TrainOptions as Options
            from models import create_model as create_model
        else:
            raise NotImplementedError('Unknown task [%s]!!!' % task)

        opt = Options().parse()
        # Fall back to the log directory when no tensorboard directory was given.
        if opt.tensorboard_dir is None:
            opt.tensorboard_dir = opt.log_dir
        print(' '.join(sys.argv))
        if opt.phase != 'train':
            warnings.warn('You are not using training set for %s!!!' % task)
        # Append the command line to the run's option log.
        opt_log_path = os.path.join(opt.log_dir, 'opt.txt')
        with open(opt_log_path, 'a') as f:
            f.write(' '.join(sys.argv) + '\n')
        set_seed(opt.seed)

        dataloader = create_dataloader(opt)
        dataset_size = len(dataloader.dataset)
        print('The number of training images = %d' % dataset_size)
        opt.iters_per_epoch = len(dataloader)

        # Read the input tensor shape from the first batch.
        if opt.dataset_mode in ['aligned', 'unaligned']:
            first_batch = next(iter(dataloader))
            side = 'A' if opt.direction == 'AtoB' else 'B'
            opt.data_channel, opt.data_height, opt.data_width = first_batch[side].shape[1:]
        elif opt.dataset_mode in ['cityscapes']:
            input_ = next(iter(dataloader))
            opt.data_height, opt.data_width = input_['label'].shape[2:]
            opt.data_channel = opt.input_nc
            if opt.contain_dontcare_label:
                opt.data_channel += 1
            if not opt.no_instance:
                opt.data_channel += input_['instance'].shape[1]
        else:
            raise NotImplementedError
        print(
            f'data shape is: channel={opt.data_channel}, height={opt.data_height}, width={opt.data_width}.'
        )

        model = create_model(opt)
        model.setup(opt)
        logger = Logger(opt)

        wants_pretrained_student = getattr(opt, 'pretrained_student_G_path', '') and task == 'distill'
        if wants_pretrained_student:
            if 'spade' in opt.teacher_netG:
                assert 'spade' in opt.student_netG
                assert 'spade' in opt.pretrained_netG
                load_pretrained_spade_student(model, opt)
            else:
                load_pretrained_student(model, opt)

        self.opt = opt
        self.dataloader = dataloader
        self.model = model
        self.logger = logger
        self.task = task

        modules_on_one_gpu = getattr(model, 'modules_on_one_gpu', model)
        if self.task != 'distill':
            return

        # Cost report for the distillation pair: stored n_macs first, then
        # freshly profiled MACs and parameter counts.
        logger.print_info(
            f'netG teacher FLOPs: {mc.unwrap_model(modules_on_one_gpu.netG_teacher).n_macs}.'
        )
        logger.print_info(
            f'netG student FLOPs: {mc.unwrap_model(modules_on_one_gpu.netG_student).n_macs}.'
        )

        input_shape = [1, opt.data_channel, opt.data_height, opt.data_width]
        data_input = torch.ones(input_shape).to(model.device)

        teacher_net = mc.unwrap_model(modules_on_one_gpu.netG_teacher).to(model.device)
        macs_t = profile_macs(teacher_net, data_input)
        student_net = mc.unwrap_model(modules_on_one_gpu.netG_student).to(model.device)
        macs_s = profile_macs(student_net, data_input)

        params_t = sum(p.numel() for p in modules_on_one_gpu.netG_teacher.parameters())
        params_s = sum(p.numel() for p in modules_on_one_gpu.netG_student.parameters())
        logger.print_info(
            f'netG teacher FLOPs: {macs_t}; Params: {params_t}.')
        logger.print_info(
            f'netG student FLOPs: {macs_s}; Params: {params_s}.')
Beispiel #19
0
        return [block.get_weight() for block in self.get_blocks()]

    def get_block(self, depth):
        """Return the entry at index ``depth`` of ``self.get_blocks()``."""
        blocks = self.get_blocks()
        return blocks[depth]

    def get_weight(self, depth):
        """Return the entry at index ``depth`` of ``self.get_weights()``."""
        weights = self.get_weights()
        return weights[depth]


def nlresnet18(**kwargs):
    """Build a ResNet-18 (``BasicBlock`` with two blocks in each of four stages).

    Architecture from `"Deep Residual Learning for Image Recognition"
    <https://arxiv.org/pdf/1512.03385.pdf>`_. All keyword arguments are
    forwarded to ``ResNet``.
    """
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage, **kwargs)


if __name__ == '__main__':
    from torchprofile import profile_macs

    model = resnet18(num_classes=10, non_learnable=True, binary=False, sparsity=0.99, depthwise=True)
    print(model)

    # Collect (size, trainable) once, then derive both parameter counts from it.
    numels = [(p.numel(), p.requires_grad) for p in model.parameters()]
    param_count = sum(size for size, trainable in numels if trainable)
    param_count_full = sum(size for size, _ in numels)

    data = torch.rand(1, 3, 64, 64)
    out = model(data)  # plain forward pass before profiling
    flops = profile_macs(model, data) / 1e6  # MACs, in millions

    # Report: trainable params, all params, then MACs.
    for value in (param_count, param_count_full, flops):
        print(value)
Beispiel #20
0
def main():
    """Fine-tune a network built from ``args.model_config`` and save checkpoints.

    Steps: check that CUDA is available, seed the RNGs, build the train /
    validation loaders for the module-level ``dataset`` name, build the
    network from its JSON config, load the initial checkpoint, reset the
    classifier head, log parameter and MAC counts, then train for
    ``args.epochs`` epochs with SGD and cosine annealing. When
    ``args.save`` is set, the ``args.topk`` best checkpoints by validation
    accuracy are kept, plus a ``.best`` checkpoint.

    Raises:
        KeyError: if ``dataset`` is not one of 'cifar100', 'cifar10',
            'cinic10'.
    """
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    logging.info("args = %s", args)

    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    best_acc = 0  # best validation accuracy seen so far
    top_checkpoints = []  # (checkpoint path, validation accuracy) pairs currently kept on disk

    # Data
    train_transform, valid_transform = _data_transforms(args)
    if dataset == 'cifar100':
        train_data = torchvision.datasets.CIFAR100(
            root=args.data, train=True, download=True, transform=train_transform)
        valid_data = torchvision.datasets.CIFAR100(
            root=args.data, train=False, download=True, transform=valid_transform)
    elif dataset == 'cifar10':
        train_data = torchvision.datasets.CIFAR10(
            root=args.data, train=True, download=True, transform=train_transform)
        valid_data = torchvision.datasets.CIFAR10(
            root=args.data, train=False, download=True, transform=valid_transform)
    elif dataset == 'cinic10':
        train_data = torchvision.datasets.ImageFolder(
            args.data + 'train_and_valid', transform=train_transform)
        valid_data = torchvision.datasets.ImageFolder(
            args.data + 'test', transform=valid_transform)
    else:
        # Same exception type as before, but now it names the offending value.
        raise KeyError('unknown dataset: {}'.format(dataset))

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=args.num_workers)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=200, shuffle=False, pin_memory=True, num_workers=args.num_workers)

    # Read the architecture config; the context manager closes the file handle.
    with open(args.model_config) as config_file:
        net_config = json.load(config_file)
    net = NSGANetV2.build_from_config(net_config, drop_connect_rate=args.drop_path)
    init = torch.load(args.initial_checkpoint, map_location='cpu')['state_dict']
    net.load_state_dict(init)

    NSGANetV2.reset_classifier(
        net, last_channel=net.classifier.in_features,
        n_classes=NUM_CLASSES, dropout_rate=args.drop)

    # calculate #Parameters and #FLOPS (profiled on a deep copy of the network)
    inputs = torch.randn(1, 3, args.img_size, args.img_size)
    flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
    params = sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6
    net_name = "net_flops@{:.0f}".format(flops)
    logging.info('#params {:.2f}M, #flops {:.0f}M'.format(params, flops))

    if args.n_gpus > 1:
        net = nn.DataParallel(net)  # data parallel in case more than 1 gpu available

    net = net.to(device)

    n_epochs = args.epochs

    parameters = filter(lambda p: p.requires_grad, net.parameters())

    criterion = nn.CrossEntropyLoss().to(device)

    optimizer = optim.SGD(parameters,
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    if args.evaluate:
        infer(valid_queue, net, criterion)
        sys.exit(0)

    for epoch in range(n_epochs):

        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])

        train(train_queue, net, criterion, optimizer)
        _, valid_acc = infer(valid_queue, net, criterion)

        # checkpoint saving
        if args.save:
            ckpt_name = net_name + '.ckpt{}'.format(epoch)
            ckpt_path = os.path.join(args.save, ckpt_name)
            if len(top_checkpoints) < args.topk:
                OFAEvaluator.save_net(args.save, net, ckpt_name)
                top_checkpoints.append((ckpt_path, valid_acc))
            else:
                # Replace the weakest kept checkpoint if this epoch beats it.
                idx = np.argmin([x[1] for x in top_checkpoints])
                if valid_acc > top_checkpoints[idx][1]:
                    OFAEvaluator.save_net(args.save, net, ckpt_name)
                    top_checkpoints.append((ckpt_path, valid_acc))
                    # remove the idx
                    os.remove(top_checkpoints[idx][0])
                    top_checkpoints.pop(idx)
                    print(top_checkpoints)

            if valid_acc > best_acc:
                OFAEvaluator.save_net(args.save, net, net_name + '.best')
                best_acc = valid_acc

        scheduler.step()

    OFAEvaluator.save_net_config(args.save, net, net_name+'.config')
Beispiel #21
0
import torch
import torch.nn as nn
from torchprofile import profile_macs
from torchprofile.utils.trace import trace


class Model(nn.Module):
    """Parameter-free module whose forward pass is a single ``torch.matmul``."""

    def forward(self, a, b):
        """Return ``torch.matmul(a, b)``."""
        product = torch.matmul(a, b)
        return product


if __name__ == '__main__':
    # Zero tensors kept from the original script; the LSTM demo below does not use them.
    a = torch.zeros(10, 20, 1, 20, 20)
    b = torch.zeros(20, 30)

    # Two-layer LSTM with input size 10 and hidden size 20.
    rnn = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
    seq = torch.randn(5, 3, 10)
    h0 = torch.randn(2, 3, 20)
    c0 = torch.randn(2, 3, 20)
    lstm_args = (seq, (h0, c0))

    # Plain forward pass first, then print the traced graph and the MAC count.
    output, (hn, cn) = rnn(*lstm_args)
    print(trace(rnn, lstm_args))
    print(profile_macs(rnn, lstm_args))
def main(args, init_distributed=False):
    """Train a SuperTransformer or a single SubTransformer, or run a profiling mode.

    The function sets up the task, model, criterion and trainer from ``args``
    and then takes one of three paths:

    * ``args.profile_flops``: profile MACs on a dummy sentence with
      ``torchprofile``, print FLOPs (reported as ``2 * MACs``) and exit.
    * ``args.latcpu`` / ``args.latgpu``: call ``utils.measure_latency`` and
      exit.
    * otherwise: restore the latest checkpoint and run the epoch-level
      training loop with periodic validation and checkpointing.

    Args:
        args: Run configuration, read via attribute access. At least one of
            ``args.max_tokens`` / ``args.max_sentences`` must be set, and
            ``args.arch`` must contain ``'iwslt'`` or ``'wmt'``.
        init_distributed: If true, initialise distributed training and store
            the resulting rank in ``args.distributed_rank``.

    Raises:
        AssertionError: If neither ``args.max_tokens`` nor
            ``args.max_sentences`` is set.
        NotImplementedError: If ``args.arch`` contains neither ``'iwslt'`` nor
            ``'wmt'``, so no dummy sentence length is known.
        SystemExit: With code 0 after FLOPs or latency profiling.
    """
    # Function-scope import (same style as the ``torchprofile`` import below);
    # ``sys.exit`` replaces the ``exit`` helper that only exists when the
    # ``site`` module has been loaded.
    import sys

    utils.import_user_module(args)
    utils.handle_save_path(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    # NOTE(review): per-process CUDA device selection is commented out here.
    #if torch.cuda.is_available() and not args.cpu:
    #    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(f"| Configs: {args}")

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(
        f"| Model: {args.arch} \n| Criterion: {criterion.__class__.__name__}")

    # Log architecture
    if args.train_subtransformer:
        print(" \n\n\t\tWARNING!!! Training one single SubTransformer\n\n")
        print(
            f"| SubTransformer Arch: {utils.get_subtransformer_config(args)} \n"
        )
    else:
        print(" \n\n\t\tWARNING!!! Training SuperTransformer\n\n")
        print(f"| SuperTransformer Arch: {model} \n")

    # Log model size
    if args.train_subtransformer:
        print(
            f"| SubTransformer size (without embedding weights): {model.get_sampled_params_numel(utils.get_subtransformer_config(args))}"
        )
        embed_size = args.decoder_embed_dim_subtransformer * len(task.tgt_dict)
        print(f"| Embedding layer size: {embed_size} \n")

    else:
        # Iterate named_parameters rather than state_dict: state_dict would add
        # two extra entries (encoder.version and decoder.version) that should
        # not be counted.
        model_s = sum(param.numel()
                      for name, param in model.named_parameters()
                      if 'embed' not in name)
        print(
            f"| SuperTransofmer model size (without embedding weights): {model_s}"
        )

        print(
            f"| Embedding layer size: {sum(p.numel() for p in model.parameters() if p.requires_grad) - model_s} \n"
        )

    # specify the length of the dummy input for profile
    # for iwslt, the average length is 23, for wmt, that is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError(
            f"No dummy sentence length known for arch {args.arch!r}; "
            f"expected the name to contain one of "
            f"{sorted(dummy_sentence_length_dict)}")

    # Token id 2 opens the dummy source and closes the dummy previous-output
    # sequence; id 7 fills the rest.
    # NOTE(review): presumably 2 is the EOS index and 7 an arbitrary word - confirm.
    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]

    # profile the overall FLOPs number
    if args.profile_flops:
        import torchprofile
        config_subtransformer = utils.get_subtransformer_config(args)
        model.set_sample_config(config_subtransformer)
        model.profile(mode=True)
        # The declared source length now follows the dummy sentence; a fixed 30
        # only agreed with the 'wmt' sentence length.
        macs = torchprofile.profile_macs(
            model,
            args=(torch.tensor([dummy_src_tokens], dtype=torch.long),
                  torch.tensor([dummy_sentence_length]),
                  torch.tensor([dummy_prev], dtype=torch.long)))
        model.profile(mode=False)

        # MACs of the output projection: embed dim x sentence length x target vocab size
        last_layer_macs = config_subtransformer['decoder'][
            'decoder_embed_dim'] * dummy_sentence_length * len(task.tgt_dict)

        # FLOPs are reported as two per MAC
        print(f"| Total FLOPs: {macs * 2}")
        print(f"| Last layer FLOPs: {last_layer_macs * 2}")
        print(
            f"| Total FLOPs without last layer: {(macs - last_layer_macs) * 2} \n"
        )
        sys.exit(0)

    # NOTE(review): anomaly detection is only active while the Trainer is
    # constructed, not during training - confirm this is intended.
    with torch.autograd.set_detect_anomaly(True):
        # Build trainer
        trainer = Trainer(args, task, model, criterion)
    print(f"| Training on {args.distributed_world_size} GPUs")
    # NOTE(review): the max-sentences value is printed as a literal None; the
    # variant reporting args.max_sentences is kept below for reference.
    # print(f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {args.max_sentences} \n")
    print(
        f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {None} \n"
    )

    # Measure model latency, the program will exit after profiling latency
    if args.latcpu or args.latgpu:
        utils.measure_latency(args, model, dummy_src_tokens, dummy_prev)
        sys.exit(0)

    # Load the latest checkpoint if one is available and restore the corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Evaluate the SubTransformer
    if args.validate_subtransformer:
        config = utils.get_subtransformer_config(args)
        trainer.set_sample_config(config)
        valid_loss = validate(args, trainer, task, epoch_itr, ['valid'],
                              'SubTransformer')
        print(f"| SubTransformer validation loss:{valid_loss}")

    # Loop boundaries
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()

    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    represent_configs = utils.get_represent_configs(args)

    # Main training loop
    while (lr > args.stop_min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        # Default for epochs without validation, and for the case where there
        # are no representative configs to validate (previously this left
        # ``valid_losses`` unbound and crashed with a NameError below).
        valid_losses = [None]
        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            # Each representative config overwrites ``valid_losses``; only the
            # losses of the last one reach lr_step / save_checkpoint.
            for arch_name, arch_config in represent_configs.items():
                trainer.set_sample_config(config=arch_config)
                valid_losses = validate(args,
                                        trainer,
                                        task,
                                        epoch_itr,
                                        valid_subsets,
                                        sampled_arch_name=arch_name)

        # update the best loss and get current lr; the real lr scheduling is done in trainer.train_step()
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint epoch level
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

    train_meter.stop()
    print(f'| Done training in {train_meter.sum:.1f} seconds')
Beispiel #23
0
    # set sub-generator
    if args.channel_ratio:
        from models.dynamic_channel import set_uniform_channel_ratio, CHANNEL_CONFIGS

        assert args.channel_ratio in CHANNEL_CONFIGS
        set_uniform_channel_ratio(generator, args.channel_ratio)

    if args.target_res is not None:
        generator.target_res = args.target_res

    # compute the flops of the generator (is possible)
    if hvd.rank() == 0:
        try:
            from torchprofile import profile_macs

            macs = profile_macs(generator, torch.rand(1, 1, 512).to(device))
            params = sum([p.numel() for p in generator.parameters()])
            print(' * MACs: {:.2f}G, Params: {:.2f}M'.format(
                macs / 1e9, params / 1e6))
        except:
            print(' * Profiling failed. Passed.')

    inception = models.get_pretrained('inception').to(device)
    inception.eval()

    inception_features = extract_feature_from_samples()
    # now perform all gather
    inception_features = hvd.allgather(
        inception_features, name='inception_features').numpy()[:args.n_sample]

    if hvd.rank() == 0:
Beispiel #24
0
 def profile(self, module: nn.Module, shape_in: ShapeOrList, mover: AbstractDeviceMover, batch_size: int) -> float:
     """Profile ``module`` on a random batch and return its MACs per sample.

     A random input of ``batch_size`` samples is generated from ``shape_in``,
     passed through ``mover.move`` and fed to ``torchprofile.profile_macs``
     with gradient tracking disabled. The total is floor-divided by
     ``batch_size``.
     """
     # NOTE(review): ``//`` is floor division, so the result is integral when
     # profile_macs returns an int, despite the ``float`` annotation - confirm.
     with torch.no_grad():
         random_batch = shape_in.random_tensor(batch_size=batch_size)
         device_inputs = mover.move(random_batch)
         total_macs = torchprofile.profile_macs(module, args=device_inputs)
     return total_macs // batch_size