for num in num_types:
    num_wl = getattr(args, "wl_{}".format(num))
    number_dict[num] = BlockFloatingPoint(wl=num_wl, dim=0)
    print("{:10}: {}".format(num, number_dict[num]))

quant_dict = dict()
for num in ["weight", "momentum", "grad"]:
    quant_dict[num] = quantizer(forward_number=number_dict[num],
                                forward_rounding=args.rounding)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.__dict__['vgg'](dataset=args.dataset, depth=args.depth)

# automatically insert quantization modules around conv and linear layers
model = sequential_lower(model,
                         layer_types=["conv", "linear"],
                         forward_number=number_dict["activate"],
                         backward_number=number_dict["error"],
                         forward_rounding=args.rounding,
                         backward_rounding=args.rounding)
# remove the final quantization module so the output logits stay in full precision
model.classifier = model.classifier[0]

if args.model:
    if os.path.isfile(args.model):
        print("=> loading checkpoint '{}'".format(args.model))
        checkpoint = torch.load(args.model)
        # args.start_epoch = checkpoint['epoch']
        # best_prec1 = checkpoint['best_prec1']
        try:
            model.load_state_dict(checkpoint['state_dict'])
        except RuntimeError:
            # The state dict was saved from a DataParallel-wrapped model, so
            # wrap and retry. The original snippet is truncated mid-call here;
            # any DataParallel arguments beyond `model` are not recoverable.
            model = torch.nn.DataParallel(model)
            model.load_state_dict(checkpoint['state_dict'])
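# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script): what a BlockFloatingPoint
# quantizer like the ones built above does to a plain tensor. The word length
# (wl=8) and the tensor contents are illustrative assumptions, not values
# taken from this script.
import torch
from qtorch import BlockFloatingPoint
from qtorch.quant import quantizer

bfp8 = BlockFloatingPoint(wl=8, dim=0)  # 8-bit block floating point, blocks along dim 0
quant_fn = quantizer(forward_number=bfp8, forward_rounding="nearest")
x = torch.randn(4, 4)
print(quant_fn(x))  # x rounded onto the shared-exponent, 8-bit mantissa grid
# ---------------------------------------------------------------------------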
        # (tail of the preceding quantize_method branch)
        param.data = weight_quantizer(param.data)
elif args.quantize_method == 'static_quant_int8':
    print("Using static_quant_int8 model")
    mtcnn = auto_low.sequential_lower(mtcnn,
                                      static_quant.Quantizer,
                                      layer_types=['conv', 'linear'],
                                      device=device)
    mtcnn.load_state_dict(torch.load("model"))
    static_quant.lower(mtcnn)
    static_quant.show(mtcnn)
elif args.quantize_method == 'float_quant':
    print("Using float_quant model")
    forward_num = FloatingPoint(exp=5, man=3)
    mtcnn = sequential_lower(mtcnn,
                             layer_types=['conv', 'linear'],
                             forward_number=forward_num,
                             forward_rounding="nearest")
    weight_quantizer = lambda x: float_quantize(x, exp=5, man=3,
                                                rounding="nearest")
    # quantize the stored weights in place
    for name, param in mtcnn.named_parameters():
        param.data = weight_quantizer(param.data)
else:
    print("Using original model")

print(mtcnn)

# decode the video and convert each BGR frame to an RGB PIL image
video = mmcv.VideoReader('video.mp4')
frames = [
    Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for frame in video
]
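# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script): running the (possibly
# quantized) detector over the decoded frames. This assumes `mtcnn` follows
# the facenet_pytorch MTCNN interface, whose .detect() returns bounding boxes
# and confidence scores per image; the loop below is illustrative only.
for i, frame in enumerate(frames):
    boxes, probs = mtcnn.detect(frame)
    if boxes is not None:
        print("frame {}: {} face(s) detected".format(i, len(boxes)))
# ---------------------------------------------------------------------------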
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model,
                                                              device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    forward_num = FloatingPoint(exp=5, man=3)
    # backward_num = FloatingPoint(exp=5, man=3)
    model = sequential_lower(model,
                             layer_types=['linear'],
                             forward_number=forward_num,
                             forward_rounding="nearest")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    weight_quant = lambda x: float_quantize(x, exp=5, man=3, rounding="nearest")
    optimizer = OptimLP(optimizer, weight_quant=weight_quant)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
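# ---------------------------------------------------------------------------
# Sketch (not part of the original script) of what wrapping SGD in OptimLP
# does above: after every optimizer step the parameters are re-quantized with
# weight_quant, so the stored weights stay on the e5m3 low-precision float
# grid. The toy parameter and loss below are illustrative assumptions.
import torch
from qtorch.quant import float_quantize
from qtorch.optim import OptimLP

w = torch.nn.Parameter(torch.randn(10))
base_opt = torch.optim.SGD([w], lr=0.1)
opt = OptimLP(base_opt,
              weight_quant=lambda x: float_quantize(x, exp=5, man=3,
                                                    rounding="nearest"))
loss = (w ** 2).sum()
loss.backward()
opt.step()  # plain SGD update, then weights snapped back to the e5m3 grid
# ---------------------------------------------------------------------------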