def profile(self, config=None, verbose=True):
    netG = self.netG_A
    if isinstance(netG, nn.DataParallel):
        netG = netG.module
    if config is not None:
        netG.configs = config
    style = torch.randn(1, self.opt.style_dim, 1, 1, device=self.device)
    macs_with_style_encoder = None
    with torch.no_grad():
        # passing the style code directly bypasses the style encoder
        macs_no_style_encoder = profile_macs(netG, (self.real_A[:1], style))
        if not self.opt.no_style_encoder:
            # note the trailing comma: profile_macs expects a tuple of inputs
            macs_with_style_encoder = profile_macs(netG, (self.real_A[:1],))
    params = 0
    for p in netG.parameters():
        params += p.numel()
    if verbose:
        if self.opt.no_style_encoder:
            print('MACs (no style encoder): %.3fG\tParams: %.3fM' %
                  (macs_no_style_encoder / 1e9, params / 1e6), flush=True)
        else:
            print('MACs (no style encoder): %.3fG\tMACs (with style encoder): %.3fG\tParams: %.3fM' %
                  (macs_no_style_encoder / 1e9, macs_with_style_encoder / 1e9, params / 1e6),
                  flush=True)
    # fall back to the no-style-encoder count when the style encoder is disabled,
    # so the return value is always defined
    if macs_with_style_encoder is None:
        return macs_no_style_encoder, params
    return macs_with_style_encoder, params
def get_architecture(self, args):
    g_lst, pred_acc_lst, x_lst = [], [], []
    searched_g, max_pred_acc = None, 0
    with torch.no_grad():
        for n in range(self.num_gen_arch):
            # each line is "<predicted acc> <genotype dict>";
            # skip the leading accuracy token before parsing the dict
            file_acc = self.lines[n].split()[0]
            g_dict = ' '.join(self.lines[n].split()[1:])
            g = json.loads(g_dict.replace("'", "\""))
            if args.bound is not None:
                subnet, config = self.sample(g)
                net = NSGANetV2.build_from_config(
                    subnet.config, drop_connect_rate=args.drop_path)
                inputs = torch.randn(1, 3, args.img_size, args.img_size)
                flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
                if flops <= args.bound:
                    searched_g = g
                    pred_acc_lst.append(file_acc)
                    break
            else:
                searched_g = g
                pred_acc_lst.append(file_acc)
                break
    if searched_g is None:
        raise ValueError('No architecture satisfies the FLOPs bound %s.' % args.bound)
    return searched_g, pred_acc_lst
def set_architecture(n_cls):
    if args.model_config.startswith('flops@'):
        names = {'cifar10': 'CIFAR-10', 'cifar100': 'CIFAR-100',
                 'aircraft100': 'Aircraft', 'pets': 'Pets'}
        p = os.path.join('./searched-architectures/{}/net-{}/net.subnet'.format(
            names[args.data_name], args.model_config))
        g = json.load(open(p))
    else:
        g, acc = evaluator.get_architecture(args)

    subnet, config = evaluator.sample(g)
    net = NSGANetV2.build_from_config(subnet.config, drop_connect_rate=args.drop_path)
    net.load_state_dict(subnet.state_dict())
    NSGANetV2.reset_classifier(
        net, last_channel=net.classifier.in_features,
        n_classes=n_cls, dropout_rate=args.drop)

    # calculate #Parameters and #FLOPs
    inputs = torch.randn(1, 3, args.img_size, args.img_size)
    flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
    params = sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6
    net_name = "net_flops@{:.0f}".format(flops)
    logging.info('#params {:.2f}M, #flops {:.0f}M'.format(params, flops))
    OFAEvaluator.save_net_config(args.save_path, net, net_name + '.config')

    if args.n_gpus > 1:
        net = nn.DataParallel(net)  # data parallel in case more than 1 gpu available
    net = net.to(args.device)
    return net, net_name
def get_net_info(net, input_shape=(3, 224, 224), measure_latency=None,
                 print_info=True, clean=False, lut=None):
    """
    Modified from https://github.com/mit-han-lab/once-for-all/blob/
    35ddcb9ca30905829480770a6a282d49685aa282/ofa/imagenet_codebase/utils/pytorch_utils.py#L139
    """
    from ofa.imagenet_codebase.utils.pytorch_utils import count_parameters, measure_net_latency

    # artificial input data
    inputs = torch.randn(1, 3, input_shape[-2], input_shape[-1])

    # move network to GPU if available
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        net = net.to(device)
        cudnn.benchmark = True
        inputs = inputs.to(device)

    net_info = {}
    if isinstance(net, nn.DataParallel):
        net = net.module

    # parameters
    net_info['params'] = count_parameters(net)

    # flops
    net_info['flops'] = int(profile_macs(copy.deepcopy(net), inputs))

    # latencies
    latency_types = [] if measure_latency is None else measure_latency.split('#')
    for l_type in latency_types:
        if lut is not None and l_type in lut:
            latency_estimator = LatencyEstimator(lut[l_type])
            latency = look_up_latency(net, latency_estimator, input_shape[2])
            measured_latency = None
        else:
            latency, measured_latency = measure_net_latency(
                net, l_type, fast=False, input_shape=input_shape, clean=clean)
        net_info['%s latency' % l_type] = {'val': latency, 'hist': measured_latency}

    if print_info:
        print('Total training params: %.2fM' % (net_info['params'] / 1e6))
        print('Total FLOPs: %.2fM' % (net_info['flops'] / 1e6))
        for l_type in latency_types:
            print('Estimated %s latency: %.3fms' %
                  (l_type, net_info['%s latency' % l_type]['val']))

    return net_info
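# A hypothetical call to get_net_info above. The '#'-separated measure_latency
# string follows the split('#') convention parsed inside the function; the
# latency type names here ('gpu8', 'cpu') are illustrative assumptions from the
# OFA codebase, not guaranteed by this snippet.
#
#     info = get_net_info(my_net, input_shape=(3, 224, 224),
#                         measure_latency='gpu8#cpu', print_info=True)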
def net_flops(self):
    data_shape = [1] + list(self.run_config.data_provider.data_shape)
    net = self.net
    input_var = torch.zeros(data_shape).cuda()
    with torch.no_grad():
        flops = profile_macs(net, input_var)
    return flops
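# NOTE: profile_macs returns multiply-accumulate counts; several snippets in
# this section (e.g., the YOLO trainer and the HAT profiler below) report FLOPs
# as 2 * MACs. A minimal sketch of that convention on a toy module (the module
# and input size below are illustrative assumptions, not taken from the snippet
# above):
import torch
import torch.nn as nn
from torchprofile import profile_macs

toy_net = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU())
toy_input = torch.zeros(1, 3, 32, 32)
with torch.no_grad():
    toy_macs = profile_macs(toy_net, toy_input)
print('MACs: %d, FLOPs (2 * MACs): %d' % (toy_macs, 2 * toy_macs))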
def profile(self, input_semantics):
    netG = self.netG
    if isinstance(netG, nn.DataParallel):
        netG = netG.module
    if self.config is not None:
        netG.config = self.config
    with torch.no_grad():
        macs = profile_macs(netG, (input_semantics,))
    params = 0
    for p in netG.parameters():
        params += p.numel()
    return macs, params
def profile_macs(self, *inputs, batch_size=2) -> np.int64:
    """
    measure the MACs (multiply-accumulate operations) of a forward pass,
    normalized per sample; runs under no_grad to prevent randomly changing
    the architecture
    """
    with torch.no_grad():
        if len(inputs) == 0:
            inputs = self.get_shape_in().random_tensor(
                batch_size=batch_size).to(self.get_device())
        if isinstance(inputs, (tuple, list)) and len(inputs) == 1:
            inputs = inputs[0]
        return torchprofile.profile_macs(self, args=inputs) // batch_size
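# Hedged usage sketch for the profile_macs method above; `net` is a hypothetical
# instance of this class, and its get_shape_in()/get_device() helpers are
# assumed from the surrounding codebase:
#
#     macs_per_sample = net.profile_macs()  # auto random input, batch_size=2
#     # with explicit inputs, the batch dimension must equal batch_size,
#     # because the result is divided by batch_size:
#     macs_per_sample = net.profile_macs(torch.zeros(2, 3, 224, 224), batch_size=2)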
def profile(self, config=None, verbose=True):
    netG = self.netG
    if isinstance(netG, nn.DataParallel):
        netG = netG.module
    if config is not None:
        netG.configs = config
    with torch.no_grad():
        macs = profile_macs(netG, (self.real_A[:1],))
    params = 0
    for p in netG.parameters():
        params += p.numel()
    if verbose:
        print('MACs: %.3fG\tParams: %.3fM' % (macs / 1e9, params / 1e6), flush=True)
    return macs, params
def inference_macs(
    network: nn.Module,
    args: Tuple = (),
    data_shape: Optional[Tuple] = None,
    unit: float = 1e6,
) -> float:
    if is_parallel(network):
        network = network.module
    if data_shape is not None:
        if len(args) > 0:
            raise ValueError("Please provide either data_shape or an args tuple, not both.")
        args = (torch.zeros(data_shape, device=get_module_device(network)),)
    is_training = network.training
    network.eval()
    macs = profile_macs(network, args=args) / unit
    network.train(is_training)
    return macs
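# A hypothetical call to inference_macs above; the module and data_shape are
# illustrative assumptions. data_shape includes the batch dimension, and the
# result is reported in units of `unit` (here, millions of MACs):
#
#     example_net = nn.Linear(128, 64)
#     macs_m = inference_macs(example_net, data_shape=(1, 128), unit=1e6)
#     print(f'{macs_m:.4f} MMACs per forward pass')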
def profile(c):
    # `c` is the experiment config object; use it consistently throughout
    c.setdefault(hebbian=False, distributed=False)
    net, step = c.var(device="cpu").load_model("max")
    print('profiling')
    net.eval()
    data_val = SequentialIterator(c, c.eval_batch, split="test")
    with torch.no_grad():
        for batch in data_val:
            x = to_torch(batch, c.device).t()
            inputs, labels = x[:-1], x[1:]
            macs = profile_macs(net, (inputs, labels))
            print("==> FLOPS: ", macs / c.eval_chunk * 2)
            print("==> Model size: ", count_params(net))
            exit(0)
def add_gene(self, gene, macs=None, score=None, method=0, parents=None):
    if gene not in self.store:
        self.model.eval()
        if self.model.config.model_type == "distilbert":
            bert = self.model.distilbert
        else:
            assert hasattr(self.model, "bert")
            bert = self.model.bert
        bert.set_length_config(gene)
        macs = macs or torchprofile.profile_macs(self.model, args=self.dummy_inputs)
        if macs < self.lower_constraint:
            return False
        score = score or self.evaluate(self.args, self.model, self.tokenizer)[0]['f1']
        self.store[gene] = (macs, score, method, parents)
        logger.info(store2str(gene, macs, score, method, parents))

    macs = self.store[gene][0]
    if macs >= self.lower_constraint \
            and (self.upper_constraint is None or macs <= self.upper_constraint) \
            and gene not in self.population:
        self.population.append(gene)
        return True
    return False
    outC = random.choice(outChannelList)
    network.append(Conv2d(inC, outC, 1, 1, 0))
    inDim = convolution(inDim, inC, outC, 1, 1, 0, netEmbedding)
    inC = outC

    outC = 1000
    network.append(Conv2d(inC, outC, 1, 1, 0))
    inDim = convolution(inDim, inC, outC, 1, 1, 0, netEmbedding)
    net = nn.Sequential(*network)
    # print(net)

    x = torch.rand([1, 3, 224, 224])
    with torch.autograd.profiler.profile() as prof:
        y = net(x)
    macs = profile_macs(net, x)
    # keep only models within a 40M-400M MACs budget
    if 40000000 < macs < 400000000:
        i += 1
    else:
        continue
    params = sum([p.numel() for p in net.parameters()])
    flopsList.append(macs / 1e6)
    modelSizeList.append((params * 4.0) / (1024 ** 2))
    inferenceTimeList.append(prof.self_cpu_time_total / 1000.0)
    print('Model ' + str(i) +
          ' NumBottle:%d Skips: %d MACS: %f M ModelSize: %f MB Inference: %f ms' %
          (numMB, numSkip, macs / 1e6, (params * 4.0) / (1024 ** 2),
           prof.self_cpu_time_total / 1000.0))
    net.eval()
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    logging.info("args = %s", args)

    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    best_acc = 0  # initialize an artificial best accuracy so far
    top_checkpoints = []  # initialize a list to keep track of the top checkpoints

    # Data
    train_transform, valid_transform = _data_transforms(args)
    if dataset == 'cifar100':
        train_data = torchvision.datasets.CIFAR100(
            root=args.data, train=True, download=True, transform=train_transform)
        valid_data = torchvision.datasets.CIFAR100(
            root=args.data, train=False, download=True, transform=valid_transform)
    elif dataset == 'cifar10':
        train_data = torchvision.datasets.CIFAR10(
            root=args.data, train=True, download=True, transform=train_transform)
        valid_data = torchvision.datasets.CIFAR10(
            root=args.data, train=False, download=True, transform=valid_transform)
    else:
        raise KeyError

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True,
        pin_memory=True, num_workers=args.num_workers)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=200, shuffle=False,
        pin_memory=True, num_workers=args.num_workers)

    net = factory(args.model, pretrained=False, num_classes=1000)  # assuming transfer from ImageNet
    load_checkpoint(net, args.imagenet, use_ema=True)
    net.reset_classifier(num_classes=NUM_CLASSES)
    net.drop_rate = args.drop

    # calculate #Parameters and #FLOPs
    params = sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6
    try:
        inputs = torch.randn(1, 3, 224, 224)
        flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
        logging.info('#params {:.2f}M, #flops {:.0f}M'.format(params, flops))
    except Exception:
        logging.info('#params {:.2f}M'.format(params))

    if args.n_gpus > 1:
        net = nn.DataParallel(net)  # data parallel in case more than 1 gpu available
    net = net.to(device)

    n_epochs = args.epochs
    parameters = filter(lambda p: p.requires_grad, net.parameters())
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(parameters, lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    for epoch in range(n_epochs):
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        train(train_queue, net, criterion, optimizer)
        _, valid_acc = infer(valid_queue, net, criterion)

        # checkpoint saving
        if args.save:
            if valid_acc > best_acc:
                torch.save(net.state_dict(), os.path.join(args.save, 'weights.pt'))
                best_acc = valid_acc
        scheduler.step()
def main():
    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed and args.num_gpu > 1:
        _logger.warning(
            'Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.')
        args.num_gpu = 1

    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.rank = int(os.environ['RANK'])
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=args.init_method,
                                             rank=args.rank,
                                             world_size=args.world_size)
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        _logger.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        _logger.info('Training with a single process on %d GPUs.' % args.num_gpu)

    torch.manual_seed(args.seed + args.rank)

    model = create_model(
        args.model,
        pretrained=args.pretrained,
        num_classes=args.num_classes,
        drop_rate=args.drop,
        drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
        global_pool=args.gp,
        bn_tf=args.bn_tf,
        bn_momentum=args.bn_momentum,
        bn_eps=args.bn_eps,
        checkpoint_path=args.initial_checkpoint)

    ################## pretrain ##############
    if args.pretrain_path is not None:
        print('Loading:', args.pretrain_path)
        state_dict = torch.load(args.pretrain_path)
        model.load_state_dict(state_dict, strict=False)
        print('Pretrain weights loaded.')

    ################### flops #################
    print(model)
    if hasattr(model, 'default_cfg'):
        default_cfg = model.default_cfg
        input_size = [1] + list(default_cfg['input_size'])
    else:
        input_size = [1, 3, 224, 224]
    input = torch.randn(input_size)  # .cuda()

    from torchprofile import profile_macs
    macs = profile_macs(model, input)
    print('model flops:', macs, 'input_size:', input_size)
    ##########################################

    if args.local_rank == 0:
        _logger.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(vars(args), model=model,
                                      verbose=args.local_rank == 0)

    num_aug_splits = 0
    if args.aug_splits > 0:
        assert args.aug_splits > 1, 'A split of 1 makes no sense'
        num_aug_splits = args.aug_splits

    if args.split_bn:
        assert num_aug_splits > 1 or args.resplit
        model = convert_splitbn_model(model, max(num_aug_splits, 2))

    use_amp = None
    if args.amp:
        # for backwards compat, `--amp` arg tries apex before native amp
        if has_apex:
            args.apex_amp = True
        elif has_native_amp:
            args.native_amp = True
    if args.apex_amp and has_apex:
        use_amp = 'apex'
    elif args.native_amp and has_native_amp:
        use_amp = 'native'
    elif args.apex_amp or args.native_amp:
        _logger.warning(
            "Neither APEX nor native Torch AMP is available, using float32. "
            "Install NVIDIA apex or upgrade to PyTorch 1.6")

    if args.num_gpu > 1:
        if use_amp == 'apex':
            _logger.warning(
                'Apex AMP does not work well with nn.DataParallel, disabling. Use DDP or Torch AMP.')
            use_amp = None
        model = nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
        assert not args.channels_last, "Channels last not supported with DP, use DDP."
    else:
        model.cuda()
        if args.channels_last:
            model = model.to(memory_format=torch.channels_last)

    optimizer = create_optimizer(args, model)

    amp_autocast = suppress  # do nothing
    loss_scaler = None
    if use_amp == 'apex':
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        loss_scaler = ApexScaler()
        if args.local_rank == 0:
            _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.')
    elif use_amp == 'native':
        amp_autocast = torch.cuda.amp.autocast
        loss_scaler = NativeScaler()
        if args.local_rank == 0:
            _logger.info('Using native Torch AMP. Training in mixed precision.')
    else:
        if args.local_rank == 0:
            _logger.info('AMP not enabled. Training in float32.')

    # optionally resume from a checkpoint
    resume_epoch = None
    if args.resume:
        resume_epoch = resume_checkpoint(
            model, args.resume,
            optimizer=None if args.no_resume_opt else optimizer,
            loss_scaler=None if args.no_resume_opt else loss_scaler,
            log_info=args.local_rank == 0)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume=args.resume)

    if args.distributed:
        if args.sync_bn:
            assert not args.split_bn
            try:
                if has_apex and use_amp != 'native':
                    # Apex SyncBN preferred unless native amp is activated
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
                if args.local_rank == 0:
                    _logger.info(
                        'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                        'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.')
            except Exception as e:
                _logger.error(
                    'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
        if has_apex and use_amp != 'native':
            # Apex DDP preferred unless native amp is activated
            if args.local_rank == 0:
                _logger.info("Using NVIDIA APEX DistributedDataParallel.")
            model = ApexDDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                _logger.info("Using native Torch DistributedDataParallel.")
            model = NativeDDP(model, device_ids=[args.local_rank])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        _logger.info('Scheduled epochs: {}'.format(num_epochs))

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        _logger.error('Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    mixup_fn = None
    mixup_active = (args.mixup > 0 or args.cutmix > 0.
                    or args.cutmix_minmax is not None)
    if mixup_active:
        mixup_args = dict(mixup_alpha=args.mixup,
                          cutmix_alpha=args.cutmix,
                          cutmix_minmax=args.cutmix_minmax,
                          prob=args.mixup_prob,
                          switch_prob=args.mixup_switch_prob,
                          mode=args.mixup_mode,
                          label_smoothing=args.smoothing,
                          num_classes=args.num_classes)
        if args.prefetcher:
            assert not num_aug_splits  # collate conflict (need to support deinterleaving in collate mixup)
            collate_fn = FastCollateMixup(**mixup_args)
        else:
            mixup_fn = Mixup(**mixup_args)

    if num_aug_splits > 1:
        dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)

    train_interpolation = args.train_interpolation
    if args.no_aug or not train_interpolation:
        train_interpolation = data_config['interpolation']
    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        no_aug=args.no_aug,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        re_split=args.resplit,
        scale=args.scale,
        ratio=args.ratio,
        hflip=args.hflip,
        vflip=args.vflip,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_aug_splits=num_aug_splits,
        interpolation=train_interpolation,
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        pin_memory=args.pin_mem,
        use_multi_epochs_loader=args.use_multi_epochs_loader,
        repeated_aug=args.repeated_aug)

    eval_dir = os.path.join(args.data, 'val')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            _logger.error('Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
        pin_memory=args.pin_mem,
    )

    if args.jsd:
        assert num_aug_splits > 1  # JSD only valid with aug splits set
        train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits,
                                        smoothing=args.smoothing).cuda()
    elif mixup_active:
        # smoothing is handled with mixup target transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
    validate_loss_fn = nn.CrossEntropyLoss().cuda()

    if args.evaluate:
        eval_metrics = validate(model, loader_eval, validate_loss_fn, args,
                                amp_autocast=amp_autocast)
        print(eval_metrics)
        return

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(model=model,
                                optimizer=optimizer,
                                args=args,
                                model_ema=model_ema,
                                amp_scaler=loss_scaler,
                                checkpoint_dir=output_dir,
                                recovery_dir=output_dir,
                                decreasing=decreasing)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch, model,
                                        loader_train, optimizer, train_loss_fn, args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        amp_autocast=amp_autocast,
                                        loss_scaler=loss_scaler,
                                        model_ema=model_ema,
                                        mixup_fn=mixup_fn)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    _logger.info("Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args,
                                    amp_autocast=amp_autocast)

            if model_ema is not None and not args.model_ema_force_cpu:
                if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                    distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
                ema_eval_metrics = validate(model_ema.ema, loader_eval,
                                            validate_loss_fn, args,
                                            amp_autocast=amp_autocast,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch, train_metrics, eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
import torch
from torchprofile import profile_macs
from torchvision import models

if __name__ == '__main__':
    for name, model in models.__dict__.items():
        if not name.islower() or name.startswith('__') or not callable(model):
            continue
        model = model().eval()
        if 'inception' not in name:
            inputs = torch.randn(1, 3, 224, 224)
        else:
            inputs = torch.randn(1, 3, 299, 299)
        macs = profile_macs(model, inputs)
        print('{}: {:.4g} G'.format(name, macs / 1e9))
def get_net_macs(model, sample_input):
    macs = profile_macs(model, sample_input)
    return macs
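# A minimal usage example for get_net_macs above (the model and input size are
# illustrative assumptions, not part of the original helper):
import torch
from torchvision.models import resnet18

example_model = resnet18().eval()  # eval() avoids batch-norm updates during tracing
example_input = torch.randn(1, 3, 224, 224)
print('MACs: %.1fM' % (get_net_macs(example_model, example_input) / 1e6))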
def train(hyp):
    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = max(round(64 / batch_size), 1)  # accumulate n times before optimizer update (bs 64)
    weights = opt.weights  # initial training weights
    imgsz_min, imgsz_max, imgsz_test = opt.img_size  # img sizes (min, max, test)

    # initialize
    wdir = opt.wdir + os.sep  # weights dir
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = wdir + 'results_' + opt.name + ".txt"

    # Image Sizes
    gs = 32  # (pixels) grid size
    assert math.fmod(imgsz_min, gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs)
    opt.multi_scale |= imgsz_min != imgsz_max  # multi if different (min, max)
    if opt.multi_scale:
        if imgsz_min == imgsz_max:
            imgsz_min //= 1.5
            imgsz_max //= 0.667
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)
    img_size = imgsz_max  # initialize with max size

    # Configure run
    init_seeds()
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = 1 if opt.single_cls else int(data_dict['classes'])  # number of classes
    hyp['cls'] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg).to(device)

    # Calculate flops
    inputs = torch.randn(1, 3, imgsz_test, imgsz_test).cuda()
    macs = profile_macs(copy.deepcopy(model), inputs)
    print('Model FLOPs: {}GFlops'.format(round(macs * 2 / 1e9, 2)))

    # Optimizer
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2 += [v]  # biases
        elif 'Conv2d.weight' in k:
            pg1 += [v]  # apply weight_decay
        else:
            pg0 += [v]  # all else

    if opt.adam:
        # hyp['lr0'] *= 0.1  # reduce lr (i.e. SGD=5E-3, Adam=5E-4)
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g Conv2d.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    start_epoch = 0
    best_fitness = 0.0
    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        ckpt = torch.load(weights, map_location=device)

        # load model
        try:
            ckpt['model'] = {k: v for k, v in ckpt['model'].items()
                             if model.state_dict()[k].numel() == v.numel()}
            model.load_state_dict(ckpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # load results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                  % (opt.weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)

    if opt.freeze_layers:
        output_layer_indices = [idx - 1 for idx, module in enumerate(model.module_list)
                                if isinstance(module, YOLOLayer)]
        freeze_layer_indices = [x for x in range(len(model.module_list))
                                if (x not in output_layer_indices) and
                                (x - 1 not in output_layer_indices)]
        for idx in freeze_layer_indices:
            for parameter in model.module_list[idx].parameters():
                parameter.requires_grad_(False)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.95 + 0.05  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch - 1  # see link below
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, '.-', label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Initialize distributed training
    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
        dist.init_process_group(backend='nccl',  # 'distributed backend'
                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
                                world_size=1,  # number of nodes for distributed training
                                rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(train_path, img_size, batch_size,
                                  augment=True,
                                  hyp=hyp,  # augmentation hyperparameters
                                  rect=opt.rect,  # rectangular training
                                  cache_images=opt.cache_images,
                                  single_cls=opt.single_cls)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    # nw = 32
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Testloader
    testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(test_path, imgsz_test, batch_size,
                                                                 hyp=hyp,
                                                                 rect=True,
                                                                 cache_images=opt.cache_images,
                                                                 single_cls=opt.single_cls),
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights

    # Model EMA
    ema = torch_utils.ModelEMA(model)

    # Start training
    nb = len(dataloader)  # number of batches
    n_burn = max(3 * nb, 500)  # burn-in iterations, max(3 epochs, 500 iterations)
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    print('Image sizes %g - %g train, %g test' % (imgsz_min, imgsz_max, imgsz_test))
    print('Using %g dataloader workers' % nw)
    print('Starting training for %g epochs...' % epochs)
    for epoch in range(start_epoch, epochs):  # epoch -------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
            dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (imgs, targets, paths, _) in pbar:  # batch --------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Burn-in
            if ni <= n_burn:
                xi = [0, n_burn]  # x interp
                model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, 64 / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    x['weight_decay'] = np.interp(ni, xi, [0.0, hyp['weight_decay'] if j == 1 else 0.0])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-Scale
            if opt.multi_scale:
                if ni / accumulate % 1 == 0:  # adjust img_size (67% - 150%) every 1 batch
                    img_size = random.randrange(grid_min, grid_max + 1) * gs
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, targets, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Backward
            loss *= batch_size / 64  # scale loss
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                ema.update(model)

            # Print
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size)
            pbar.set_description(s)

            # Plot
            if ni < 1:
                f = 'train_batch%g.jpg' % i  # filename
                res = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                if tb_writer:
                    tb_writer.add_image(f, res, dataformats='HWC', global_step=epoch)
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard
            # end batch ------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        ema.update_attr(model)
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80
            results, maps = test.test(cfg, data,
                                      batch_size=batch_size,
                                      imgsz=imgsz_test,
                                      model=ema.ema,
                                      save_json=final_epoch and is_coco,
                                      single_cls=opt.single_cls,
                                      dataloader=testloader,
                                      multi_label=ni > n_burn)

        # Write
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name))

        # Tensorboard
        if tb_writer:
            tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/F1',
                    'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
            for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                tb_writer.add_scalar(tag, x, epoch)

        # Update best mAP
        fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        if fi > best_fitness:
            best_fitness = fi

        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            with open(results_file, 'r') as f:
                # create checkpoint
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'training_results': f.read(),
                        'model': ema.ema.module.state_dict() if hasattr(model, 'module')
                                 else ema.ema.state_dict(),
                        'optimizer': None if final_epoch else optimizer.state_dict()}

            # Save last, best and delete
            torch.save(ckpt, last)
            if (best_fitness == fi) and not final_epoch:
                torch.save(ckpt, best)
            del ckpt
        # end epoch ----------------------------------------------------------------------------------------------
    # end training

    n = opt.name
    if len(n):
        n = '_' + n if not n.isnumeric() else n
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload

    if not opt.evolve:
        plot_results()  # save as results.png

    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
def __init__(self, task):
    if task == 'train':
        from options.train_options import TrainOptions as Options
        from models import create_model as create_model
    elif task == 'distill':
        from options.distill_options import DistillOptions as Options
        from distillers import create_distiller as create_model
    else:
        raise NotImplementedError('Unknown task [%s]!!!' % task)
    opt = Options().parse()
    opt.tensorboard_dir = opt.log_dir if opt.tensorboard_dir is None else opt.tensorboard_dir
    print(' '.join(sys.argv))
    if opt.phase != 'train':
        warnings.warn('You are not using the training set for %s!!!' % task)
    with open(os.path.join(opt.log_dir, 'opt.txt'), 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
    set_seed(opt.seed)

    dataloader = create_dataloader(opt)
    dataset_size = len(dataloader.dataset)
    print('The number of training images = %d' % dataset_size)
    opt.iters_per_epoch = len(dataloader)
    if opt.dataset_mode in ['aligned', 'unaligned']:
        opt.data_channel, opt.data_height, opt.data_width = next(
            iter(dataloader))['A' if opt.direction == 'AtoB' else 'B'].shape[1:]
    elif opt.dataset_mode in ['cityscapes']:
        input_ = next(iter(dataloader))
        opt.data_height, opt.data_width = input_['label'].shape[2:]
        opt.data_channel = opt.input_nc
        if opt.contain_dontcare_label:
            opt.data_channel += 1
        if not opt.no_instance:
            opt.data_channel += input_['instance'].shape[1]
    else:
        raise NotImplementedError
    print(f'data shape is: channel={opt.data_channel}, height={opt.data_height}, width={opt.data_width}.')

    model = create_model(opt)
    model.setup(opt)
    logger = Logger(opt)

    if getattr(opt, 'pretrained_student_G_path', '') and task == 'distill':
        if 'spade' in opt.teacher_netG:
            assert 'spade' in opt.student_netG
            assert 'spade' in opt.pretrained_netG
            load_pretrained_spade_student(model, opt)
        else:
            load_pretrained_student(model, opt)

    self.opt = opt
    self.dataloader = dataloader
    self.model = model
    self.logger = logger
    self.task = task

    modules_on_one_gpu = getattr(model, 'modules_on_one_gpu', model)
    if self.task == 'distill':
        logger.print_info(
            f'netG teacher FLOPs: {mc.unwrap_model(modules_on_one_gpu.netG_teacher).n_macs}.')
        logger.print_info(
            f'netG student FLOPs: {mc.unwrap_model(modules_on_one_gpu.netG_student).n_macs}.')
        data_input = torch.ones(
            [1, opt.data_channel, opt.data_height, opt.data_width]).to(model.device)
        macs_t = profile_macs(
            mc.unwrap_model(modules_on_one_gpu.netG_teacher).to(model.device), data_input)
        macs_s = profile_macs(
            mc.unwrap_model(modules_on_one_gpu.netG_student).to(model.device), data_input)
        params_t = 0
        params_s = 0
        for p in modules_on_one_gpu.netG_teacher.parameters():
            params_t += p.numel()
        for p in modules_on_one_gpu.netG_student.parameters():
            params_s += p.numel()
        logger.print_info(f'netG teacher FLOPs: {macs_t}; Params: {params_t}.')
        logger.print_info(f'netG student FLOPs: {macs_s}; Params: {params_s}.')
        return [block.get_weight() for block in self.get_blocks()]

    def get_block(self, depth):
        return self.get_blocks()[depth]

    def get_weight(self, depth):
        return self.get_weights()[depth]


def nlresnet18(**kwargs):
    """ResNet-18 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    """
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)


if __name__ == '__main__':
    from torchprofile import profile_macs

    model = resnet18(num_classes=10, non_learnable=True, binary=False,
                     sparsity=0.99, depthwise=True)
    print(model)
    param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    param_count_full = sum(p.numel() for p in model.parameters())
    data = torch.rand(1, 3, 64, 64)
    out = model(data)
    flops = profile_macs(model, data) / 1e6
    print(param_count)
    print(param_count_full)
    print(flops)
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    logging.info("args = %s", args)

    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    best_acc = 0  # initialize an artificial best accuracy so far
    top_checkpoints = []  # initialize a list to keep track of the top-k checkpoints

    # Data
    train_transform, valid_transform = _data_transforms(args)
    if dataset == 'cifar100':
        train_data = torchvision.datasets.CIFAR100(
            root=args.data, train=True, download=True, transform=train_transform)
        valid_data = torchvision.datasets.CIFAR100(
            root=args.data, train=False, download=True, transform=valid_transform)
    elif dataset == 'cifar10':
        train_data = torchvision.datasets.CIFAR10(
            root=args.data, train=True, download=True, transform=train_transform)
        valid_data = torchvision.datasets.CIFAR10(
            root=args.data, train=False, download=True, transform=valid_transform)
    elif dataset == 'cinic10':
        train_data = torchvision.datasets.ImageFolder(
            args.data + 'train_and_valid', transform=train_transform)
        valid_data = torchvision.datasets.ImageFolder(
            args.data + 'test', transform=valid_transform)
    else:
        raise KeyError

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=True,
        pin_memory=True, num_workers=args.num_workers)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=200, shuffle=False,
        pin_memory=True, num_workers=args.num_workers)

    net_config = json.load(open(args.model_config))
    net = NSGANetV2.build_from_config(net_config, drop_connect_rate=args.drop_path)
    init = torch.load(args.initial_checkpoint, map_location='cpu')['state_dict']
    net.load_state_dict(init)
    NSGANetV2.reset_classifier(
        net, last_channel=net.classifier.in_features,
        n_classes=NUM_CLASSES, dropout_rate=args.drop)

    # calculate #Parameters and #FLOPs
    inputs = torch.randn(1, 3, args.img_size, args.img_size)
    flops = profile_macs(copy.deepcopy(net), inputs) / 1e6
    params = sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6
    net_name = "net_flops@{:.0f}".format(flops)
    logging.info('#params {:.2f}M, #flops {:.0f}M'.format(params, flops))

    if args.n_gpus > 1:
        net = nn.DataParallel(net)  # data parallel in case more than 1 gpu available
    net = net.to(device)

    n_epochs = args.epochs
    parameters = filter(lambda p: p.requires_grad, net.parameters())
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(parameters, lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    if args.evaluate:
        infer(valid_queue, net, criterion)
        sys.exit(0)

    for epoch in range(n_epochs):
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        train(train_queue, net, criterion, optimizer)
        _, valid_acc = infer(valid_queue, net, criterion)

        # checkpoint saving
        if args.save:
            if len(top_checkpoints) < args.topk:
                OFAEvaluator.save_net(args.save, net, net_name + '.ckpt{}'.format(epoch))
                top_checkpoints.append(
                    (os.path.join(args.save, net_name + '.ckpt{}'.format(epoch)), valid_acc))
            else:
                idx = np.argmin([x[1] for x in top_checkpoints])
                if valid_acc > top_checkpoints[idx][1]:
                    OFAEvaluator.save_net(args.save, net, net_name + '.ckpt{}'.format(epoch))
                    top_checkpoints.append(
                        (os.path.join(args.save, net_name + '.ckpt{}'.format(epoch)), valid_acc))
                    # remove the checkpoint at idx
                    os.remove(top_checkpoints[idx][0])
                    top_checkpoints.pop(idx)
                    print(top_checkpoints)
            if valid_acc > best_acc:
                OFAEvaluator.save_net(args.save, net, net_name + '.best')
                best_acc = valid_acc
        scheduler.step()
    OFAEvaluator.save_net_config(args.save, net, net_name + '.config')
import torch
import torch.nn as nn

from torchprofile import profile_macs
from torchprofile.utils.trace import trace


class Model(nn.Module):
    def forward(self, a, b):
        return torch.matmul(a, b)


if __name__ == '__main__':
    a = torch.zeros(10, 20, 1, 20, 20)
    b = torch.zeros(20, 30)

    rnn = nn.LSTM(10, 20, 2)
    input = torch.randn(5, 3, 10)
    h0 = torch.randn(2, 3, 20)
    c0 = torch.randn(2, 3, 20)
    output, (hn, cn) = rnn(input, (h0, c0))
    print(trace(rnn, (input, (h0, c0))))
    print(profile_macs(rnn, (input, (h0, c0))))
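# For multi-input modules such as the LSTM above, profile_macs takes the
# positional arguments as a tuple. The count covers the full (seq_len=5,
# batch=3) input, so dividing by the batch dimension yields a per-sample
# figure, as the wrapper methods elsewhere in this section do (a sketch
# reusing the tensors defined above):
#
#     batch = input.size(1)  # nn.LSTM without batch_first puts batch on dim 1
#     per_sample_macs = profile_macs(rnn, (input, (h0, c0))) // batch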
def main(args, init_distributed=False):
    utils.import_user_module(args)
    utils.handle_save_path(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    # if torch.cuda.is_available() and not args.cpu:
    #     torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(f"| Configs: {args}")

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(f"| Model: {args.arch} \n| Criterion: {criterion.__class__.__name__}")

    # Log architecture
    if args.train_subtransformer:
        print(" \n\n\t\tWARNING!!! Training one single SubTransformer\n\n")
        print(f"| SubTransformer Arch: {utils.get_subtransformer_config(args)} \n")
    else:
        print(" \n\n\t\tWARNING!!! Training SuperTransformer\n\n")
        print(f"| SuperTransformer Arch: {model} \n")

    # Log model size
    if args.train_subtransformer:
        print(f"| SubTransformer size (without embedding weights): "
              f"{model.get_sampled_params_numel(utils.get_subtransformer_config(args))}")
        embed_size = args.decoder_embed_dim_subtransformer * len(task.tgt_dict)
        print(f"| Embedding layer size: {embed_size} \n")
    else:
        model_s = 0
        # model.state_dict() would add two extra entries (encoder.version and
        # decoder.version), which should not be counted, so use named_parameters()
        for name, param in model.named_parameters():
            if 'embed' not in name:
                model_s += param.numel()
        print(f"| SuperTransformer model size (without embedding weights): {model_s}")
        print(f"| Embedding layer size: "
              f"{sum(p.numel() for p in model.parameters() if p.requires_grad) - model_s} \n")

    # specify the length of the dummy input for profiling;
    # for IWSLT the average length is 23, for WMT it is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError

    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]

    # profile the overall FLOPs number
    if args.profile_flops:
        import torchprofile
        config_subtransformer = utils.get_subtransformer_config(args)
        model.set_sample_config(config_subtransformer)
        model.profile(mode=True)
        macs = torchprofile.profile_macs(
            model,
            args=(torch.tensor([dummy_src_tokens], dtype=torch.long),
                  torch.tensor([30]),
                  torch.tensor([dummy_prev], dtype=torch.long)))
        model.profile(mode=False)
        last_layer_macs = config_subtransformer['decoder']['decoder_embed_dim'] \
            * dummy_sentence_length * len(task.tgt_dict)

        print(f"| Total FLOPs: {macs * 2}")
        print(f"| Last layer FLOPs: {last_layer_macs * 2}")
        print(f"| Total FLOPs without last layer: {(macs - last_layer_macs) * 2} \n")
        exit(0)

    with torch.autograd.set_detect_anomaly(True):
        # Build trainer
        trainer = Trainer(args, task, model, criterion)
        print(f"| Training on {args.distributed_world_size} GPUs")
        # print(f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {args.max_sentences} \n")
        print(f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {None} \n")

        # Measure model latency; the program will exit after profiling latency
        if args.latcpu or args.latgpu:
            utils.measure_latency(args, model, dummy_src_tokens, dummy_prev)
            exit(0)

        # Load the latest checkpoint if one is available and restore the corresponding train iterator
        extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

        # Evaluate the SubTransformer
        if args.validate_subtransformer:
            config = utils.get_subtransformer_config(args)
            trainer.set_sample_config(config)
            valid_loss = validate(args, trainer, task, epoch_itr, ['valid'], 'SubTransformer')
            print(f"| SubTransformer validation loss: {valid_loss}")

        # Loop boundaries
        max_epoch = args.max_epoch or math.inf
        max_update = args.max_update or math.inf
        lr = trainer.get_lr()

        train_meter = StopwatchMeter()
        train_meter.start()
        valid_subsets = args.valid_subset.split(',')

        represent_configs = utils.get_represent_configs(args)

        # Main training loop
        while lr > args.stop_min_lr and epoch_itr.epoch < max_epoch \
                and trainer.get_num_updates() < max_update:
            # train for one epoch
            train(args, trainer, task, epoch_itr)

            if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
                for k, v in represent_configs.items():
                    trainer.set_sample_config(config=v)
                    valid_losses = validate(args, trainer, task, epoch_itr,
                                            valid_subsets, sampled_arch_name=k)
            else:
                valid_losses = [None]

            # update the best loss and get current lr; the real lr scheduling is done in trainer.train_step()
            lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

            # save checkpoint at epoch level
            if epoch_itr.epoch % args.save_interval == 0:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        train_meter.stop()
        print('| Done training in {:.1f} seconds'.format(train_meter.sum))
    # set sub-generator
    if args.channel_ratio:
        from models.dynamic_channel import set_uniform_channel_ratio, CHANNEL_CONFIGS
        assert args.channel_ratio in CHANNEL_CONFIGS
        set_uniform_channel_ratio(generator, args.channel_ratio)

    if args.target_res is not None:
        generator.target_res = args.target_res

    # compute the flops of the generator (if possible)
    if hvd.rank() == 0:
        try:
            from torchprofile import profile_macs
            macs = profile_macs(generator, torch.rand(1, 1, 512).to(device))
            params = sum([p.numel() for p in generator.parameters()])
            print(' * MACs: {:.2f}G, Params: {:.2f}M'.format(macs / 1e9, params / 1e6))
        except Exception:
            print(' * Profiling failed. Skipped.')

    inception = models.get_pretrained('inception').to(device)
    inception.eval()

    inception_features = extract_feature_from_samples()
    # now perform an all-gather across workers
    inception_features = hvd.allgather(
        inception_features, name='inception_features').numpy()[:args.n_sample]

    if hvd.rank() == 0:
def profile(self, module: nn.Module, shape_in: ShapeOrList,
            mover: AbstractDeviceMover, batch_size: int) -> float:
    with torch.no_grad():
        inputs_ = mover.move(shape_in.random_tensor(batch_size=batch_size))
        return torchprofile.profile_macs(module, args=inputs_) // batch_size