def main(arch, model_path, output_path, input_shape=(224, 224), batch_size=1):
    repvgg_build_func = get_RepVGG_func_by_name(arch)
    model = repvgg_build_func(deploy=True)
    model.load_state_dict(torch.load(model_path))
    dummy_input = torch.randn(batch_size, 3, input_shape[0], input_shape[1])
    torch.onnx.export(model, dummy_input, output_path,
                      verbose=True,
                      keep_initializers_as_inputs=True,
                      opset_version=12,
                      input_names=['input'],
                      output_names=['output'])
    # onnx_model = onnx.load(output_path)  # load onnx model
    # model_simp, check = simplify(onnx_model)
    # assert check, "Simplified ONNX model could not be validated"
    # onnx.save(model_simp, output_path)
    # print('finished exporting onnx')

    # Rewrite the batch dimension of the input and of every output as dynamic ('?')
    model_d = onnx.load_model(output_path)
    d = model_d.graph.input[0].type.tensor_type.shape.dim
    # d[0].dim_value = input_shape[0]
    d[0].dim_param = '?'
    for output in model_d.graph.output:
        d = output.type.tensor_type.shape.dim
        # d[0].dim_value = input_shape[0]
        d[0].dim_param = '?'
    onnx.save_model(model_d, output_path)
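# A minimal sketch of sanity-checking the exported graph with onnxruntime once
# the batch dimension has been rewritten to '?'. It assumes main() above has
# already been run and produced 'RepVGG-A0.onnx' (a hypothetical path); the
# 'input'/'output' names match the export call above.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('RepVGG-A0.onnx', providers=['CPUExecutionProvider'])
for bs in (1, 4, 16):  # any batch size should be accepted once dim_param is '?'
    x = np.random.randn(bs, 3, 224, 224).astype(np.float32)
    (y,) = sess.run(['output'], {'input': x})
    print(bs, y.shape)  # e.g. (bs, 1000) for an ImageNet classifier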
def test():
    args = parser.parse_args()
    repvgg_build_func = get_RepVGG_func_by_name(args.arch)
    model = repvgg_build_func(deploy=args.mode == 'deploy')

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
        use_gpu = False
    else:
        model = model.cuda()
        use_gpu = True

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss().cuda() if use_gpu else nn.CrossEntropyLoss()

    if os.path.isfile(args.weights):
        print("=> loading checkpoint '{}'".format(args.weights))
        checkpoint = torch.load(args.weights)
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']
        ckpt = {k.replace('module.', ''): v for k, v in checkpoint.items()}  # strip the names
        model.load_state_dict(ckpt)
    else:
        print("=> no checkpoint found at '{}'".format(args.weights))

    cudnn.benchmark = True

    # Data loading code
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    validate(val_loader, model, criterion, use_gpu)
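# validate() is not shown in this excerpt. A typical top-k accuracy helper that
# such a validation loop relies on (following the standard PyTorch ImageNet
# example) might look like the sketch below; the function name and signature
# are assumptions, not part of the original code.
def accuracy(output, target, topk=(1,)):
    """Compute precision@k for the specified values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)  # top-k class indices per sample
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res  # e.g. accuracy(logits, labels, topk=(1, 5)) -> [top1, top5]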
def main(arch, model_path, output_path, input_shape=(224, 224), batch_size=1):
    repvgg_build_func = get_RepVGG_func_by_name(arch)
    model = repvgg_build_func(deploy=True)
    model.load_state_dict(torch.load(model_path))
    dummy_input = torch.randn(batch_size, 3, input_shape[0], input_shape[1])
    torch.onnx.export(model, dummy_input, output_path,
                      verbose=True,
                      keep_initializers_as_inputs=True,
                      opset_version=12)
    onnx_model = onnx.load(output_path)  # load onnx model
    model_simp, check = simplify(onnx_model)
    assert check, "Simplified ONNX model could not be validated"
    onnx.save(model_simp, output_path)
    print('finished exporting onnx')
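# A quick, illustrative way to see what simplify() (onnx-simplifier) changed is
# to compare node counts before and after; the 'RepVGG-A0.onnx' path below is a
# hypothetical example of an already-exported model.
before = onnx.load('RepVGG-A0.onnx')
after, ok = simplify(before)
print('nodes before:', len(before.graph.node),
      'nodes after:', len(after.graph.node),
      'validated:', ok)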
def convert():
    args = parser.parse_args()
    repvgg_build_func = get_RepVGG_func_by_name(args.arch)
    train_model = repvgg_build_func(deploy=False)

    if os.path.isfile(args.load):
        print("=> loading checkpoint '{}'".format(args.load))
        checkpoint = torch.load(args.load)
        if 'state_dict' in checkpoint:
            train_model.load_state_dict(checkpoint['state_dict'])
        else:
            train_model.load_state_dict(checkpoint)
    else:
        print("=> no checkpoint found at '{}'".format(args.load))

    repvgg_model_convert(train_model, build_func=repvgg_build_func, save_path=args.save)
def convert():
    args = parser.parse_args()
    repvgg_build_func = get_RepVGG_func_by_name(args.arch)
    train_model = repvgg_build_func(deploy=False)

    if os.path.isfile(args.load):
        print("=> loading checkpoint '{}'".format(args.load))
        checkpoint = torch.load(args.load)
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']
        ckpt = {k.replace('module.', ''): v for k, v in checkpoint.items()}  # strip the names
        train_model.load_state_dict(ckpt)
    else:
        print("=> no checkpoint found at '{}'".format(args.load))

    repvgg_model_convert(train_model, save_path=args.save)
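# The point of the conversion is that the re-parameterized (deploy) model is
# numerically equivalent to the training-time model. A minimal sketch of
# checking that, assuming convert() has already written a plain deploy-mode
# state_dict to 'RepVGG-A0-deploy.pth' and that 'RepVGG-A0-train.pth' holds the
# training-time weights (both paths and the arch name are hypothetical):
build = get_RepVGG_func_by_name('RepVGG-A0')

train_model = build(deploy=False)
train_model.load_state_dict(torch.load('RepVGG-A0-train.pth'))
train_model.eval()  # BN must be in eval mode for the comparison to hold

deploy_model = build(deploy=True)
deploy_model.load_state_dict(torch.load('RepVGG-A0-deploy.pth'))
deploy_model.eval()

x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    print(torch.allclose(train_model(x), deploy_model(x), atol=1e-4))  # expect True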
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    repvgg_build_func = get_RepVGG_func_by_name(args.arch)
    model = repvgg_build_func(deploy=False)

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = sgd_optimizer(model, args.lr, args.momentum, args.weight_decay)

    # T_max is the total number of training iterations per process, so the
    # scheduler is stepped once per iteration rather than once per epoch
    lr_scheduler = CosineAnnealingLR(
        optimizer=optimizer,
        T_max=args.epochs * IMAGENET_TRAINSET_SIZE // args.batch_size // ngpus_per_node)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))

    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        # adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, lr_scheduler)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
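# train() is not shown in this excerpt. Because T_max above counts iterations
# rather than epochs, the scheduler is presumably stepped once per batch inside
# train(); the loop below is an illustrative sketch of that pattern, not the
# original implementation.
def train_one_epoch_sketch(train_loader, model, criterion, optimizer, lr_scheduler, gpu):
    model.train()
    for images, target in train_loader:
        if gpu is not None:
            images = images.cuda(gpu, non_blocking=True)
            target = target.cuda(gpu, non_blocking=True)
        output = model(images)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # advance the cosine schedule every iteration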
def __init__(self, backbone_name, backbone_file, deploy, bins=(1, 2, 3, 6), dropout=0.1, classes=2,
             zoom_factor=8, use_ppm=True, criterion=nn.CrossEntropyLoss(ignore_index=255),
             BatchNorm=nn.BatchNorm2d, pretrained=True):
    super(PSPNet, self).__init__()
    assert 2048 % len(bins) == 0
    assert classes > 1
    assert zoom_factor in [1, 2, 4, 8]
    self.zoom_factor = zoom_factor
    self.use_ppm = use_ppm
    self.criterion = criterion

    repvgg_fn = get_RepVGG_func_by_name(backbone_name)
    backbone = repvgg_fn(deploy)
    if pretrained:
        checkpoint = torch.load(backbone_file)
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']
        ckpt = {k.replace('module.', ''): v for k, v in checkpoint.items()}  # strip the names
        backbone.load_state_dict(ckpt)

    self.layer0, self.layer1, self.layer2, self.layer3, self.layer4 = \
        backbone.stage0, backbone.stage1, backbone.stage2, backbone.stage3, backbone.stage4

    # The last two stages should have stride=1 for semantic segmentation
    # Note that the stride of the 1x1 should be the same as the 3x3
    # Use dilation following the implementation of PSPNet
    secondlast_channel = 0
    for n, m in self.layer3.named_modules():
        if ('rbr_dense' in n or 'rbr_reparam' in n) and isinstance(m, nn.Conv2d):
            m.dilation, m.padding, m.stride = (2, 2), (2, 2), (1, 1)
            print('change dilation, padding, stride of ', n)
            secondlast_channel = m.out_channels
        elif 'rbr_1x1' in n and isinstance(m, nn.Conv2d):
            m.stride = (1, 1)
            print('change stride of ', n)

    last_channel = 0
    for n, m in self.layer4.named_modules():
        if ('rbr_dense' in n or 'rbr_reparam' in n) and isinstance(m, nn.Conv2d):
            m.dilation, m.padding, m.stride = (4, 4), (4, 4), (1, 1)
            print('change dilation, padding, stride of ', n)
            last_channel = m.out_channels
        elif 'rbr_1x1' in n and isinstance(m, nn.Conv2d):
            m.stride = (1, 1)
            print('change stride of ', n)

    fea_dim = last_channel
    aux_in = secondlast_channel
    if use_ppm:
        self.ppm = PPM(fea_dim, int(fea_dim / len(bins)), bins, BatchNorm)
        fea_dim *= 2
    self.cls = nn.Sequential(
        nn.Conv2d(fea_dim, 512, kernel_size=3, padding=1, bias=False),
        BatchNorm(512),
        nn.ReLU(inplace=True),
        nn.Dropout2d(p=dropout),
        nn.Conv2d(512, classes, kernel_size=1))
    if self.training:
        self.aux = nn.Sequential(
            nn.Conv2d(aux_in, 256, kernel_size=3, padding=1, bias=False),
            BatchNorm(256),
            nn.ReLU(inplace=True),
            nn.Dropout2d(p=dropout),
            nn.Conv2d(256, classes, kernel_size=1))
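# A sketch of why the stride/dilation surgery above matters: with stages 3 and 4
# forced to stride 1, the backbone's output stride drops from 32 to 8, matching
# zoom_factor=8. The 'RepVGG-A0' arch name and classes=21 are assumed examples;
# pretrained=False skips loading backbone_file so shapes can be checked without
# a checkpoint.
net = PSPNet(backbone_name='RepVGG-A0', backbone_file=None, deploy=False,
             classes=21, pretrained=False)
net.eval()
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    feat = x
    for stage in (net.layer0, net.layer1, net.layer2, net.layer3, net.layer4):
        feat = stage(feat)
print(feat.shape)  # expected roughly (1, C, 28, 28), i.e. 224/8 instead of 224/32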