def eval_pnorm(p):
    args.qtype = 'lp_norm'
    args.lp = p
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    all_layers = []
    if args.bit_weights is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
    loss = inf_model.evaluate_calibration()
    point = mq.get_clipping()

    del inf_model
    del mq

    return point, loss
def eval_pnorm(p):
    args.qtype = 'lp_norm'
    args.lp = p
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
    loss = inf_model.evaluate_calibration()
    point = mq.get_clipping()

    # evaluate
    acc = inf_model.validate()

    del inf_model
    del mq

    return point, loss, acc
def main_worker(args, ml_logger):
    global best_acc1

    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))

    if args.log_stats:
        from utils.stats_trucker import StatsTrucker as ST
        ST("W{}A{}".format(args.bit_weights, args.bit_act))

    if 'resnet' in args.arch and args.custom_resnet:
        model = custom_resnet(arch=args.arch, pretrained=args.pretrained, depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    else:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=args.pretrained)

    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True
    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            # best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            checkpoint['state_dict'] = {normalize_module_name(k): v for k, v in checkpoint['state_dict'].items()}
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features, args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)

    default_transform = {'train': get_transform(args.dataset, augment=True),
                         'eval': get_transform(args.dataset, augment=False)}
    val_data = get_dataset(args.dataset, 'val', default_transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size, shuffle=True,
                                             num_workers=args.workers, pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    train_data = get_dataset(args.dataset, 'train', default_transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.workers, pin_memory=True, drop_last=True)

    # TODO: replace this call by initialization on small subset of training data
    # TODO: enable for activations
    # validate(val_loader, model, criterion, args, device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    lr_scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    mq = None
    if args.quantize:
        if args.bn_folding:
            print("Applying batch-norm folding ahead of post-training quantization")
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(model)

        all_convs = [n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)]
        # all_convs = [l for l in all_convs if 'downsample' not in l]
        all_relu = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU)]
        all_relu6 = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)]
        layers = all_relu[1:-1] + all_relu6[1:-1] + all_convs[1:]
        replacement_factory = {nn.ReLU: ActivationModuleWrapper,
                               nn.ReLU6: ActivationModuleWrapper,
                               nn.Conv2d: ParameterModuleWrapper}
        mq = ModelQuantizer(model, args, layers, replacement_factory,
                            OptimizerBridge(optimizer, settings={'algo': 'SGD', 'dataset': args.dataset}))

        if args.resume:
            # Load quantization parameters from state dict
            mq.load_state_dict(checkpoint['state_dict'])

        mq.log_quantizer_state(ml_logger, -1)

        if args.model_freeze:
            mq.freeze()

    if args.evaluate:
        if args.log_stats:
            mean = []
            var = []
            skew = []
            kurt = []
            for n, p in model.named_parameters():
                if n.replace('.weight', '') in all_convs[1:]:
                    mu = p.mean()
                    std = p.std()
                    mean.append((n, mu.item()))
                    var.append((n, (std**2).item()))
                    skew.append((n, torch.mean(((p - mu) / std)**3).item()))
                    kurt.append((n, torch.mean(((p - mu) / std)**4).item()))

            for i in range(len(mean)):
                ml_logger.log_metric(mean[i][0] + '.mean', mean[i][1])
                ml_logger.log_metric(var[i][0] + '.var', var[i][1])
                ml_logger.log_metric(skew[i][0] + '.skewness', skew[i][1])
                ml_logger.log_metric(kurt[i][0] + '.kurtosis', kurt[i][1])
            ml_logger.log_metric('weight_mean', np.mean([s[1] for s in mean]))
            ml_logger.log_metric('weight_var', np.mean([s[1] for s in var]))
            ml_logger.log_metric('weight_skewness', np.mean([s[1] for s in skew]))
            ml_logger.log_metric('weight_kurtosis', np.mean([s[1] for s in kurt]))

        acc = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc)
        if args.log_stats:
            stats = ST().get_stats()
            for s in stats:
                ml_logger.log_metric(s, np.mean(stats[s]))
        return

    # evaluate on validation set
    acc1 = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc1, -1)

    # evaluate with k-means quantization
    # if args.model_freeze:
    #     with mq.disable():
    #         acc1_nq = validate(val_loader, model, criterion, args, device)
    #         ml_logger.log_metric('Val Acc1 fp32', acc1_nq, -1)

    for epoch in range(0, args.epochs):
        # train for one epoch
        print('Timestamp Start epoch: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))
        train(train_loader, model, criterion, optimizer, epoch, args, device, ml_logger, val_loader, mq)
        print('Timestamp End epoch: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

        if not args.lr_freeze:
            lr_scheduler.step()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc1, step='auto')

        # evaluate with k-means quantization
        # if args.model_freeze:
        #     with mq.quantization_method('kmeans'):
        #         acc1_kmeans = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 kmeans', acc1_kmeans, epoch)
        #     with mq.disable():
        #         acc1_nq = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 fp32', acc1_nq, step='auto')

        if args.quantize:
            mq.log_quantizer_state(ml_logger, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint({'epoch': epoch + 1,
                         'arch': args.arch,
                         'state_dict': model.state_dict() if len(args.gpu_ids) == 1 else model.module.state_dict(),
                         'best_acc1': best_acc1,
                         'optimizer': optimizer.state_dict()}, is_best)
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    inf_model = CnnModel(args.arch, args.custom_resnet, args.pretrained, args.dataset, args.gpu_ids,
                         args.datapath, batch_size=args.batch_size, shuffle=True, workers=args.workers,
                         print_freq=args.print_freq, cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size, args=args)

    all_layers = []
    if args.bit_weights is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)

    loss = inf_model.evaluate_calibration()
    print("loss: {:.4f}".format(loss.item()))
    ml_logger.log_metric('loss', loss.item(), step='auto')

    # get clipping values
    p_max = mq.get_clipping()
    # print(init)

    args.qtype = 'l2_norm'
    inf_model = CnnModel(args.arch, args.custom_resnet, args.pretrained, args.dataset, args.gpu_ids,
                         args.datapath, batch_size=args.batch_size, shuffle=True, workers=args.workers,
                         print_freq=args.print_freq, cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size, args=args)
    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
    loss = inf_model.evaluate_calibration()
    print("loss l2: {:.4f}".format(loss.item()))
    p_l2 = mq.get_clipping()

    args.qtype = 'l3_norm'
    inf_model = CnnModel(args.arch, args.custom_resnet, args.pretrained, args.dataset, args.gpu_ids,
                         args.datapath, batch_size=args.batch_size, shuffle=True, workers=args.workers,
                         print_freq=args.print_freq, cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size, args=args)
    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
    loss = inf_model.evaluate_calibration()
    print("loss l3: {:.4f}".format(loss.item()))
    p_l3 = mq.get_clipping()

    # gamma_avg = 0
    # T_avg = 0
    num_iter = args.num_iter
    n = args.num_points

    def status_callback(i, gamma, T, f_max):
        T = T.item()
        gamma = gamma.item()
        f_max = f_max.item()
        print("gamma^2: {}, T: {}, max: {}".format(gamma, T, f_max))
        ml_logger.log_metric('gamma', gamma, step='auto')
        ml_logger.log_metric('T', T, step='auto')
        ml_logger.log_metric('f_max', f_max, step='auto')
        T_norm = T / np.sqrt(i + 1)
        ml_logger.log_metric('T_norm', T_norm, step='auto')
        gamma_norm = gamma / f_max**2
        ml_logger.log_metric('gamma_norm', gamma_norm, step='auto')

    gamma_, T_, f_max = separability_index(lambda x: model_func(x, p_max, inf_model, mq, p_l2, p_l3),
                                           len(p_max), n, num_iter, gpu=True,
                                           status_callback=status_callback)

    gamma_norm = np.mean(np.array(gamma_) / f_max.item()**2)
    T_norm = np.mean(np.array(T_) / np.sqrt(np.arange(1, num_iter + 1)))
    print("gamma^2 norm: {}, T norm: {}".format(gamma_norm, T_norm))
    ml_logger.log_metric('gamma_tot', gamma_norm, step='auto')
    ml_logger.log_metric('T_tot', T_norm, step='auto')
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    loss = inf_model.evaluate_calibration()
    # evaluate
    max_acc = inf_model.validate()
    max_point = mq.get_clipping()

    ml_logger.log_metric('Loss max', loss.item(), step='auto')
    ml_logger.log_metric('Acc max', max_acc, step='auto')

    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': loss.item()}}
    print("max loss: {:.4f}, max_acc: {:.4f}".format(loss.item(), max_acc))

    def eval_pnorm(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)
        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()
        # evaluate
        acc = inf_model.validate()

        del inf_model
        del mq

        return point, loss, acc

    del inf_model
    del mq

    print("Evaluate L2 norm optimization")
    l2_point, l2_loss, l2_acc = eval_pnorm(2.)
    print("loss l2: {:.4f}".format(l2_loss.item()))
    ml_logger.log_metric('Loss l2', l2_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2', l2_acc, step='auto')
    data['l2'] = {'alpha': l2_point.cpu().numpy(), 'loss': l2_loss.item(), 'acc': l2_acc}

    print("Evaluate L2.5 norm optimization")
    l25_point, l25_loss, l25_acc = eval_pnorm(2.5)
    print("loss l2.5: {:.4f}".format(l25_loss.item()))
    ml_logger.log_metric('Loss l2.5', l25_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2.5', l25_acc, step='auto')
    data['l2.5'] = {'alpha': l25_point.cpu().numpy(), 'loss': l25_loss.item(), 'acc': l25_acc}

    print("Evaluate L3 norm optimization")
    l3_point, l3_loss, l3_acc = eval_pnorm(3.)
    print("loss l3: {:.4f}".format(l3_loss.item()))
    ml_logger.log_metric('Loss l3', l3_loss.item(), step='auto')
    ml_logger.log_metric('Acc l3', l3_acc, step='auto')
    data['l3'] = {'alpha': l3_point.cpu().numpy(), 'loss': l3_loss.item(), 'acc': l3_acc}

    # Interpolate optimal p
    xp = np.linspace(1, 5, 50)
    z = np.polyfit([2, 2.5, 3], [l2_acc, l25_acc, l3_acc], 2)
    y = np.poly1d(z)
    p_intr = xp[np.argmax(y(xp))]
    print("p intr: {:.2f}".format(p_intr))
    ml_logger.log_metric('p intr', p_intr, step='auto')

    args.qtype = 'lp_norm'
    args.lp = p_intr
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    if args.bn_folding:
        print("Applying batch-norm folding ahead of post-training quantization")
        # pdb.set_trace()
        from utils.absorb_bn import search_absorbe_bn
        search_absorbe_bn(inf_model.model)

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # Evaluate with optimal p
    lp_loss = inf_model.evaluate_calibration()
    lp_point = mq.get_clipping()
    # evaluate
    lp_acc = inf_model.validate()
    print("loss p intr: {:.4f}".format(lp_loss.item()))
    ml_logger.log_metric('Loss p intr', lp_loss.item(), step='auto')
    ml_logger.log_metric('Acc p intr', lp_acc, step='auto')

    global _eval_count, _min_loss
    _min_loss = lp_loss.item()

    idx = np.argmax([l2_acc, l25_acc, l3_acc, lp_acc])
    init = [l2_point, l25_point, l3_point, lp_point][idx]

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')
        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')

    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
                       init.cpu().numpy(), method=method, options=min_options,
                       callback=local_search_callback)
    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')
    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
    data['powell'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}

    print("Starting coordinate descent")
    args.min_method = "CD"
    min_options['maxiter'] = 1  # Perform only one iteration of coordinate descent to avoid divergence
    _iter = count(0)
    global _eval_count
    _eval_count = count(0)
    _min_loss = lp_loss.item()
    mq.set_clipping(init, inf_model.device)

    # Run coordinate descent for comparison
    method = coord_descent
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
                       init.cpu().numpy(), method=method, options=min_options,
                       callback=local_search_callback)
    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format("CD"), loss.item(), step='auto')
    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format("CD"), acc, step='auto')
    data['cd'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}

    # save scales
    f_name = "scales_{}_W{}A{}.pkl".format(args.arch, args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
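# Note: evaluate_calibration_clipped(scales, inf_model, mq) is the objective minimized by both
# optimizer runs above but is defined elsewhere in the repository. The following is only a minimal,
# hypothetical sketch of such a wrapper, assuming it applies the candidate clipping, returns the
# scalar calibration loss, and maintains the _eval_count / _min_loss globals referenced above.
def evaluate_calibration_clipped(scales, inf_model, mq):
    global _eval_count, _min_loss
    eval_count = next(_eval_count)

    # Apply the candidate clipping values to the quantization wrappers.
    mq.set_clipping(scales, inf_model.device)
    # Re-run the calibration batches and return the loss as a plain float for scipy.optimize.
    loss = inf_model.evaluate_calibration().item()

    if loss < _min_loss:
        _min_loss = loss
    print("eval count: {}, loss: {:.4f}, min loss: {:.4f}".format(eval_count, loss, _min_loss))
    return loss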
def main(args):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    inf_model = CnnModel(args.arch, custom_resnet, args.pretrained, args.dataset, args.gpu_ids,
                         args.datapath, batch_size=args.batch_size, shuffle=True, workers=args.workers,
                         print_freq=args.print_freq, cal_batch_size=args.cal_batch_size,
                         cal_set_size=args.cal_set_size, args=args)

    all_layers = []
    if args.bit_weights is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    id1 = 0
    id2 = 1
    layers = [all_layers[id1], all_layers[id2]]
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    loss = inf_model.evaluate_calibration()
    print("loss: {:.4f}".format(loss.item()))
    max_point = mq.get_clipping()

    n = args.grid_resolution
    x = np.linspace(0.01, max_point[0].item(), n)
    y = np.linspace(0.01, max_point[1].item(), n)
    X, Y = np.meshgrid(x, y)
    Z = np.empty((n, n))
    for i, x_ in enumerate(tqdm(x)):
        for j, y_ in enumerate(y):
            # set clip value to qwrappers
            scales = np.array([X[i, j], Y[i, j]])
            mq.set_clipping(scales, inf_model.device)
            # evaluate with clipping
            loss = inf_model.evaluate_calibration()
            Z[i][j] = loss.item()

    max_point = np.concatenate([max_point.cpu().numpy(), loss.cpu().numpy()])

    def eval_pnorm(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, args.pretrained, args.dataset, args.gpu_ids,
                             args.datapath, batch_size=args.batch_size, shuffle=True, workers=args.workers,
                             print_freq=args.print_freq, cal_batch_size=args.cal_batch_size,
                             cal_set_size=args.cal_set_size, args=args)
        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()
        point = np.concatenate([point.cpu().numpy(), loss.cpu().numpy()])

        del inf_model
        del mq

        return point

    del inf_model
    del mq

    l1_point = eval_pnorm(1.)
    print("loss l1: {:.4f}".format(l1_point[2]))
    l1_5_point = eval_pnorm(1.5)
    print("loss l1.5: {:.4f}".format(l1_5_point[2]))
    l2_point = eval_pnorm(2.)
    print("loss l2: {:.4f}".format(l2_point[2]))
    l2_5_point = eval_pnorm(2.5)
    print("loss l2.5: {:.4f}".format(l2_5_point[2]))
    l3_point = eval_pnorm(3.)
    print("loss l3: {:.4f}".format(l3_point[2]))

    f_name = "{}_l{}l{}_W{}A{}.pkl".format(args.arch, id1, id2, args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    data = {'X': X, 'Y': Y, 'Z': Z, 'max_point': max_point,
            'l1_point': l1_point, 'l1.5_point': l1_5_point, 'l2_point': l2_point,
            'l2.5_point': l2_5_point, 'l3_point': l3_point}
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    enable_bcorr = False
    if args.bcorr_w:
        args.bcorr_w = False
        enable_bcorr = True

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    maxabs_loss = inf_model.evaluate_calibration()
    print("max loss: {:.4f}".format(maxabs_loss.item()))
    max_point = mq.get_clipping()
    ml_logger.log_metric('Loss max', maxabs_loss.item(), step='auto')

    # evaluate
    maxabs_acc = 0  # inf_model.validate()
    ml_logger.log_metric('Acc maxabs', maxabs_acc, step='auto')

    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': maxabs_loss.item(), 'acc': maxabs_acc}}

    del inf_model
    del mq

    def eval_pnorm(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)
        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()
        # evaluate
        acc = inf_model.validate()

        del inf_model
        del mq

        return point, loss, acc

    def eval_pnorm_on_calibration(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)
        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        del inf_model
        del mq

        return point, loss

    # l2_point, l2_loss = eval_pnorm_on_calibration(2)
    # print("loss l2: {:.4f}".format(l2_loss.item()))
    # ml_logger.log_metric('Loss l2', l2_loss.item(), step='auto')
    #
    # l4_point, l4_loss = eval_pnorm_on_calibration(4)
    # print("loss l4: {:.4f}".format(l4_loss.item()))
    # ml_logger.log_metric('Loss l4', l4_loss.item(), step='auto')

    # args.qtype = 'lp_norm'
    # args.lp = p_intr
    # Fix the seed
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    opt_point = np.array([0.42811054, 1.27721779, 0.53149996, 1.51492159, 0.91115569,
                          1.17987683, 1.13352566, 1.5227828, 0.67026185, 0.75535328,
                          0.54173654, 0.70824616, 0.44899457, 1.25257411, 0.68778409])
    start_point = 0.8 * opt_point
    end_point = 1.5 * opt_point

    k = 100
    step = (end_point - start_point) / k
    print("start")
    print(start_point)
    print("end")
    print(end_point)

    losses = []
    points = []
    for i in range(k + 1):
        point = start_point + i * step
        mq.set_clipping(point, inf_model.device)
        loss = inf_model.evaluate_calibration()
        losses.append(loss.item())
        points.append(point)
        print("({}: loss) - {}".format(i, loss.item()))

    data = {'opt': opt_point, 'points': points, 'loss': losses}

    # save scales
    f_name = "quadratic_loss_{}_W{}A{}.pkl".format(args.arch, args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
def main(args):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}

    def eval_pnorm(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        all_layers = []
        if args.bit_weights is not None:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
        if args.bit_act is not None:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
        if args.bit_act is not None and 'mobilenet' in args.arch:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

        mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        del inf_model
        del mq

        return point, loss

    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    all_layers = []
    if args.bit_weights is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)

    p1 = torch.tensor([0.7677084, 1.7640269, 0.80914754, 2.044024, 0.87229156, 1.2659631, 0.78454655,
                       1.3018194, 0.7894693, 0.92967707, 0.5754433, 0.9115604, 0.5689196, 1.2382566,
                       0.601773])
    p2 = torch.tensor([0.8135005, 1.7248632, 0.8009758, 2.005755, 0.83956134, 1.2431265, 0.7720454,
                       1.3013302, 0.76733077, 0.96402454, 0.5914314, 0.9579072, 0.56543064, 1.2535284,
                       0.6261679])

    k = 50
    step = p1 - p2
    losses = []
    points = []
    for i in range(k + 1):
        point = p1 + 0.4 * i * step - 10 * step
        mq.set_clipping(point, inf_model.device)
        loss = inf_model.evaluate_calibration()
        losses.append(loss.item())
        points.append(point.cpu().numpy())
        print("({}: loss) - {}".format(i, loss.item()))

    f_name = "{}_W{}A{}_loss_conjugate_dir.pkl".format(args.arch, args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    data = {'start': p1.cpu().numpy(), 'loss': losses, 'points': points}
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
def main(args):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}

    def eval_pnorm(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        all_layers = []
        if args.bit_weights is not None:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
        if args.bit_act is not None:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
        if args.bit_act is not None and 'mobilenet' in args.arch:
            all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

        mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        del inf_model
        del mq

        return point, loss

    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    all_layers = []
    if args.bit_weights is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        all_layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    mq = ModelQuantizer(inf_model.model, args, all_layers, replacement_factory)

    start_point, start_loss = eval_pnorm(2)
    end_point, end_loss = eval_pnorm(4.5)

    k = 50
    step = (end_point - start_point) / k
    print("start")
    print(start_point)
    print("end")
    print(end_point)

    losses = []
    points = []
    for i in range(k + 1):
        point = start_point + i * step
        mq.set_clipping(point, inf_model.device)
        loss = inf_model.evaluate_calibration()
        losses.append(loss.item())
        points.append(point.cpu().numpy())
        print("({}: loss) - {}".format(i, loss.item()))

    f_name = "{}_W{}A{}_loss_vs_clipping.pkl".format(args.arch, args.bit_weights, args.bit_act)
    f = open(os.path.join(proj_root_dir, 'data', f_name), 'wb')
    data = {'start': start_point.cpu().numpy(), 'end': end_point.cpu().numpy(),
            'loss': losses, 'points': points}
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
def main_worker(args, ml_logger):
    global best_acc1

    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))

    # create model
    if 'resnet' in args.arch and args.custom_resnet:
        model = custom_resnet(arch=args.arch, pretrained=args.pretrained, depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    elif args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True
    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            # mq = ModelQuantizer(model, args)
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features, args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)

    val_data = get_dataset(args.dataset, 'val',
                           get_transform(args.dataset, augment=False,
                                         scale_size=299 if 'inception' in args.arch else None,
                                         input_size=299 if 'inception' in args.arch else None),
                           datasets_path=args.datapath)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size, shuffle=args.shuffle,
                                             num_workers=args.workers, pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    if 'inception' in args.arch and args.custom_inception:
        first = 3
        last = -1
    else:
        first = 1
        last = -1

    if args.quantize:
        all_convs = [n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)]
        all_relu = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU)]
        all_relu6 = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)]
        layers = all_relu[first:last] + all_relu6[first:last] + all_convs[first:last]
        replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                               nn.ReLU6: ActivationModuleWrapperPost,
                               nn.Conv2d: ParameterModuleWrapperPost}
        mq = ModelQuantizer(model, args, layers, replacement_factory)
        mq.log_quantizer_state(ml_logger, -1)

    acc = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc, step='auto')
def main_worker(args, ml_logger):
    global best_acc1

    datatime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    suf_name = "_" + args.experiment

    if args.gpu_ids is not None:
        print("Use GPU: {} for training".format(args.gpu_ids))

    if args.log_stats:
        from utils.stats_trucker import StatsTrucker as ST
        ST("W{}A{}".format(args.bit_weights, args.bit_act))

    if 'resnet' in args.arch and args.custom_resnet:
        # pdb.set_trace()
        model = custom_resnet(arch=args.arch, pretrained=args.pretrained, depth=arch2depth(args.arch),
                              dataset=args.dataset)
    elif 'inception_v3' in args.arch and args.custom_inception:
        model = custom_inception(pretrained=args.pretrained)
    else:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=args.pretrained)

    device = torch.device('cuda:{}'.format(args.gpu_ids[0]))
    cudnn.benchmark = True
    torch.cuda.set_device(args.gpu_ids[0])
    model = model.to(device)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, device)
            args.start_epoch = checkpoint['epoch']
            # best_acc1 = checkpoint['best_acc1']
            # best_acc1 may be from a checkpoint from a different GPU
            # best_acc1 = best_acc1.to(device)
            checkpoint['state_dict'] = {normalize_module_name(k): v for k, v in checkpoint['state_dict'].items()}
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if len(args.gpu_ids) > 1:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features, args.gpu_ids)
        else:
            model = torch.nn.DataParallel(model, args.gpu_ids)

    default_transform = {'train': get_transform(args.dataset, augment=True),
                         'eval': get_transform(args.dataset, augment=False)}
    val_data = get_dataset(args.dataset, 'val', default_transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size, shuffle=True,
                                             num_workers=args.workers, pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    train_data = get_dataset(args.dataset, 'train', default_transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.workers, pin_memory=True, drop_last=True)

    # TODO: replace this call by initialization on small subset of training data
    # TODO: enable for activations
    # validate(val_loader, model, criterion, args, device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    lr_scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    # pdb.set_trace()
    mq = None
    if args.quantize:
        if args.bn_folding:
            print("Applying batch-norm folding ahead of post-training quantization")
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(model)

        all_convs = [n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)]
        # all_convs = [l for l in all_convs if 'downsample' not in l]
        all_relu = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU)]
        all_relu6 = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)]
        layers = all_relu[1:-1] + all_relu6[1:-1] + all_convs[1:]
        replacement_factory = {nn.ReLU: ActivationModuleWrapper,
                               nn.ReLU6: ActivationModuleWrapper,
                               nn.Conv2d: ParameterModuleWrapper}
        mq = ModelQuantizer(model, args, layers, replacement_factory,
                            OptimizerBridge(optimizer, settings={'algo': 'SGD', 'dataset': args.dataset}))

        if args.resume:
            # Load quantization parameters from state dict
            mq.load_state_dict(checkpoint['state_dict'])

        mq.log_quantizer_state(ml_logger, -1)

        if args.model_freeze:
            mq.freeze()

    # pdb.set_trace()
    if args.evaluate:
        acc = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc)
        return

    # evaluate on validation set
    acc1 = validate(val_loader, model, criterion, args, device)
    ml_logger.log_metric('Val Acc1', acc1, -1)

    # evaluate with k-means quantization
    # if args.model_freeze:
    #     with mq.disable():
    #         acc1_nq = validate(val_loader, model, criterion, args, device)
    #         ml_logger.log_metric('Val Acc1 fp32', acc1_nq, -1)

    # pdb.set_trace()
    # Kurtosis regularization on weights tensors
    weight_to_hook = {}
    if args.w_kurtosis:
        if args.weight_name[0] == 'all':
            all_convs = [n.replace(".wrapped_module", "") + '.weight'
                         for n, m in model.named_modules() if isinstance(m, nn.Conv2d)]
            weight_name = all_convs[1:]
            if args.remove_weight_name:
                for rm_name in args.remove_weight_name:
                    weight_name.remove(rm_name)
        else:
            weight_name = args.weight_name
        for name in weight_name:
            # pdb.set_trace()
            curr_param = fine_weight_tensor_by_name(model, name)
            # if not curr_param:
            #     name = 'float_' + name  # QAT name
            #     curr_param = fine_weight_tensor_by_name(self.model, name)
            # if curr_param is not None:
            weight_to_hook[name] = curr_param

    for epoch in range(0, args.epochs):
        # train for one epoch
        print('Timestamp Start epoch: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))
        train(train_loader, model, criterion, optimizer, epoch, args, device, ml_logger, val_loader, mq,
              weight_to_hook)
        print('Timestamp End epoch: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

        if not args.lr_freeze:
            lr_scheduler.step()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, device)
        ml_logger.log_metric('Val Acc1', acc1, step='auto')

        # evaluate with k-means quantization
        # if args.model_freeze:
        #     with mq.quantization_method('kmeans'):
        #         acc1_kmeans = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 kmeans', acc1_kmeans, epoch)
        #     with mq.disable():
        #         acc1_nq = validate(val_loader, model, criterion, args, device)
        #         ml_logger.log_metric('Val Acc1 fp32', acc1_nq, step='auto')

        if args.quantize:
            mq.log_quantizer_state(ml_logger, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint({'epoch': epoch + 1,
                         'arch': args.arch,
                         'state_dict': model.state_dict() if len(args.gpu_ids) == 1 else model.module.state_dict(),
                         'best_acc1': best_acc1,
                         'optimizer': optimizer.state_dict()},
                        is_best, datatime_str=datatime_str, suf_name=suf_name)
def main_ratio(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    curr_best_acc = 0
    curr_best_scale_point = None

    args.qtype = 'max_static'
    # create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    # pdb.set_trace()
    if args.bn_folding:
        print("Applying batch-norm folding ahead of post-training quantization")
        # pdb.set_trace()
        from utils.absorb_bn import search_absorbe_bn
        search_absorbe_bn(inf_model.model)
    # pdb.set_trace()

    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    loss = inf_model.evaluate_calibration()
    # evaluate
    max_acc = inf_model.validate()
    max_point = mq.get_clipping()
    # pdb.set_trace()
    if max_acc > curr_best_acc:
        curr_best_acc = max_acc
        curr_best_scale_point = max_point

    ml_logger.log_metric('Loss max', loss.item(), step='auto')
    ml_logger.log_metric('Acc max', max_acc, step='auto')

    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': loss.item()}}
    print("max loss: {:.4f}, max_acc: {:.4f}".format(loss.item(), max_acc))

    def eval_pnorm(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        if args.bn_folding:
            print("Applying batch-norm folding ahead of post-training quantization")
            # pdb.set_trace()
            from utils.absorb_bn import search_absorbe_bn
            search_absorbe_bn(inf_model.model)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()
        # evaluate
        acc = inf_model.validate()

        del inf_model
        del mq

        return point, loss, acc

    del inf_model
    del mq

    l2_point, l2_loss, l2_acc = eval_pnorm(2.)
    print("loss l2: {:.4f}".format(l2_loss.item()))
    ml_logger.log_metric('Loss l2', l2_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2', l2_acc, step='auto')
    data['l2'] = {'alpha': l2_point.cpu().numpy(), 'loss': l2_loss.item(), 'acc': l2_acc}
    if l2_acc > curr_best_acc:
        curr_best_acc = l2_acc
        curr_best_scale_point = l2_point

    l25_point, l25_loss, l25_acc = eval_pnorm(2.5)
    print("loss l2.5: {:.4f}".format(l25_loss.item()))
    ml_logger.log_metric('Loss l2.5', l25_loss.item(), step='auto')
    ml_logger.log_metric('Acc l2.5', l25_acc, step='auto')
    data['l2.5'] = {'alpha': l25_point.cpu().numpy(), 'loss': l25_loss.item(), 'acc': l25_acc}
    if l25_acc > curr_best_acc:
        curr_best_acc = l25_acc
        curr_best_scale_point = l25_point

    l3_point, l3_loss, l3_acc = eval_pnorm(3.)
    print("loss l3: {:.4f}".format(l3_loss.item()))
    ml_logger.log_metric('Loss l3', l3_loss.item(), step='auto')
    ml_logger.log_metric('Acc l3', l3_acc, step='auto')
    data['l3'] = {'alpha': l3_point.cpu().numpy(), 'loss': l3_loss.item(), 'acc': l3_acc}
    if l3_acc > curr_best_acc:
        curr_best_acc = l3_acc
        curr_best_scale_point = l3_point

    # Interpolate optimal p
    xp = np.linspace(1, 5, 50)
    z = np.polyfit([2, 2.5, 3], [l2_acc, l25_acc, l3_acc], 2)
    y = np.poly1d(z)
    p_intr = xp[np.argmax(y(xp))]
    print("p intr: {:.2f}".format(p_intr))
    ml_logger.log_metric('p intr', p_intr, step='auto')

    args.qtype = 'lp_norm'
    args.lp = p_intr
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    if args.bn_folding:
        print("Applying batch-norm folding ahead of post-training quantization")
        # pdb.set_trace()
        from utils.absorb_bn import search_absorbe_bn
        search_absorbe_bn(inf_model.model)

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # Evaluate with optimal p
    lp_loss = inf_model.evaluate_calibration()
    lp_point = mq.get_clipping()
    # evaluate
    lp_acc = inf_model.validate()
    print("loss p intr: {:.4f}".format(lp_loss.item()))
    ml_logger.log_metric('Loss p intr', lp_loss.item(), step='auto')
    ml_logger.log_metric('Acc p intr', lp_acc, step='auto')
    if lp_acc > curr_best_acc:
        curr_best_acc = lp_acc
        curr_best_scale_point = lp_point

    global _eval_count, _min_loss
    _min_loss = lp_loss.item()

    idx = np.argmax([l2_acc, l25_acc, l3_acc, lp_acc])
    init = [l2_point, l25_point, l3_point, lp_point][idx]

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')
        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')

    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
                       init.cpu().numpy(), method=method, options=min_options,
                       callback=local_search_callback)
    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')
    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
    data['powell'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}
    if acc > curr_best_acc:
        curr_best_acc = acc
        curr_best_scale_point = scales

    print("Starting coordinate descent")
    args.min_method = "CD"
    _iter = count(0)
    global _eval_count
    _eval_count = count(0)
    _min_loss = lp_loss.item()
    mq.set_clipping(init, inf_model.device)

    # Run coordinate descent for comparison
    method = coord_descent
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq),
                       init.cpu().numpy(), method=method, options=min_options,
                       callback=local_search_callback)
    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format("CD"), loss.item(), step='auto')
    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format("CD"), acc, step='auto')
    data['cd'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}
    if acc > curr_best_acc:
        curr_best_acc = acc
        curr_best_scale_point = scales

    # pdb.set_trace()
    if curr_best_scale_point.is_cuda:
        curr_best_scale_point = curr_best_scale_point.cpu()
    best_point = np.concatenate([curr_best_scale_point, torch.tensor([curr_best_acc])])

    print("**** START LOSS GENERATION ****")
    print("best point:" + str(best_point))
    best_point_values = best_point[:-1]
    mq.set_clipping(best_point_values, inf_model.device)
    loss = inf_model.evaluate_calibration()
    # evaluate
    top1 = inf_model.validate()
    print("best point: loss, top1: {:.4f}, {}".format(loss.item(), top1))

    # best_point = curr_best_scale_point
    # best_point = mq.get_clipping()
    # best_point_values = curr_best_scale_point[:-1]
    # pdb.set_trace()

    n = args.grid_resolution
    min_ratio = args.min_ratio  # 0.8
    max_ratio = args.max_ratio  # 1.2
    x = np.linspace(min_ratio, max_ratio, n)
    # y = np.linspace(min_ratio, max_ratio, n)
    loss_best = loss
    # X, Y = np.meshgrid(x, y)
    Z_loss = np.empty(n)
    Z_top1 = np.empty(n)
    for i, x_ in enumerate(tqdm(x)):
        # set clip value to qwrappers
        scales_ratio = x_
        mq.set_clipping((best_point_values * scales_ratio), inf_model.device)
        if scales_ratio == 1.0:
            print(best_point_values * scales_ratio)
        # evaluate with clipping
        loss = inf_model.evaluate_calibration()
        Z_loss[i] = loss.item()
        Z_top1[i] = inf_model.validate()
        str1 = "[x, loss, top1] = [{}, {}, {}]".format(x[i], Z_loss[i], Z_top1[i])
        print(str1)

    # pdb.set_trace()
    # best_point = np.concatenate([1.0, loss_best.cpu().numpy()])
    best_point_ratio = [1.0, loss_best.cpu().numpy()]
    print("best_point_ratio: " + str(best_point_ratio))
    # best_point = [best_point_values, loss_best.cpu().numpy()]
    # print("best point: " + str(best_point))
    print("best point values: " + str(best_point_values))

    f_name = "loss_generation_lapq_{}_W{}A{}.pkl".format(args.arch, 'ALL', None)
    dir_fullname = os.path.join(os.getcwd(), args.experiment)
    if not os.path.exists(dir_fullname):
        os.makedirs(dir_fullname)
    f = open(os.path.join(dir_fullname, f_name), 'wb')
    data = {'X': x, 'Z_loss': Z_loss, 'Z_top1': Z_top1, 'best_point_ratio': best_point_ratio,
            'best_point': best_point_values}
    pickle.dump(data, f)
    f.close()
    print("Data saved to {}".format(f_name))
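# coord_descent is passed to scipy.optimize.minimize as a custom method, which SciPy supports for
# callables that return an OptimizeResult. Its implementation is not part of this section; the
# following is only a rough, hypothetical sketch of such a coordinate-wise minimizer over positive
# clipping scales (the bounded 1-D search range is an arbitrary illustrative choice).
from scipy.optimize import OptimizeResult, minimize_scalar


def coord_descent_sketch(fun, x0, args=(), maxiter=1, callback=None, **options):
    x = np.asarray(x0, dtype=float).copy()
    nfev = 0
    for it in range(maxiter):
        # Optimize one clipping scale at a time, holding the others fixed.
        for i in range(x.size):
            def f1d(alpha, i=i):
                x_try = x.copy()
                x_try[i] = alpha
                return fun(x_try, *args)
            res1d = minimize_scalar(f1d, bounds=(0.2 * x[i], 2.0 * x[i]), method='bounded')
            x[i] = res1d.x
            nfev += res1d.nfev
        if callback is not None:
            callback(np.copy(x))
    return OptimizeResult(x=x, fun=fun(x, *args), nfev=nfev, nit=maxiter, success=True)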
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Create model
    model = NeuMF(2197225, 855776, mf_dim=64, mf_reg=0.,
                  mlp_layer_sizes=[256, 256, 128, 64],
                  mlp_layer_regs=[0. for i in [256, 256, 128, 64]])
    print(model)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        model.device = torch.device('cuda:{}'.format(0))

    if args.load_ckp:
        ckp = torch.load(args.load_ckp)
        model.load_state_dict(ckp)

    all_embeding = [n for n, m in model.named_modules() if isinstance(m, nn.Embedding)]
    all_linear = [n for n, m in model.named_modules() if isinstance(m, nn.Linear)]
    all_relu = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU)]
    all_relu6 = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)]
    layers = all_relu + all_relu6 + all_linear + all_embeding
    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Linear: ParameterModuleWrapperPost,
                           nn.Embedding: ActivationModuleWrapperPost}
    mq = ModelQuantizer(model, args, layers, replacement_factory)
    # mq.log_quantizer_state(ml_logger, -1)

    test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user = data_loader(args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user)
    cal_data = CalibrationSet('ml-20mx16x32/cal_set').cuda()
    cal_data.split(batch_size=10000)

    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    criterion = criterion.cuda()

    print("init_method: {}, qtype {}".format(args.init_method, args.qtype))
    # evaluate to initialize dynamic clipping
    loss = evaluate_calibration(model, cal_data, criterion)
    print("Initial loss: {:.4f}".format(loss))
    # get clipping values
    init = get_clipping(mq)

    # evaluate
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR init', hr, step='auto')

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        it = next(_iter)
        loss = run_inference_on_calibration(x, model, mq, cal_data, criterion)
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss))

    res = opt.minimize(lambda scales: run_inference_on_calibration(scales, model, mq, cal_data, criterion),
                       np.array(init), method=args.min_method, options=min_options,
                       callback=local_search_callback)
    print(res)

    scales = res.x
    set_clipping(mq, scales, model.device)

    # evaluate
    hr, ndcg = validate(model, data)
    ml_logger.log_metric('HR Powell', hr, step='auto')
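# The NCF script above calls module-level get_clipping / set_clipping helpers instead of the
# ModelQuantizer methods used in the CNN scripts; they are not defined in this section. A minimal
# sketch of what such bridges might look like, assuming they only convert between the NumPy arrays
# used by scipy.optimize and the quantizer's tensor-based API:
def get_clipping(mq):
    # Return the current per-layer clipping values as a NumPy vector.
    return mq.get_clipping().cpu().numpy()


def set_clipping(mq, scales, device):
    # Push a NumPy vector of candidate clipping values back into the quantization wrappers.
    mq.set_clipping(np.asarray(scales), device)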
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    enable_bcorr = False
    if args.bcorr_w:
        args.bcorr_w = False
        enable_bcorr = True

    if args.init_method == 'random':
        args.qtype = 'max_static'

    # Create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    init_loss = inf_model.evaluate_calibration()

    if args.init_method == 'random':
        clip = mq.get_clipping()
        for i, c in enumerate(clip.cpu()):
            clip[i] = np.random.uniform(0, c)
        print("Randomize initial clipping")
        print(clip)
        mq.set_clipping(clip, inf_model.device)
        init_loss = inf_model.evaluate_calibration()

    print("init loss: {:.4f}".format(init_loss.item()))
    ml_logger.log_metric('Init loss', init_loss.item(), step='auto')

    acc = inf_model.validate()
    ml_logger.log_metric('Acc init', acc, step='auto')

    init = mq.get_clipping()

    global _eval_count, _min_loss
    _min_loss = init_loss.item()

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')

    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq), init.cpu().numpy(),
                       method=method, options=min_options, callback=local_search_callback)
    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
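
# --- Hedged sketch (assumption): a scipy-compatible coordinate-descent method ---
# The scripts above pass `coord_descent` as a custom `method` to scipy.optimize.minimize;
# its real implementation is not shown in this excerpt. SciPy calls a custom method as
# method(fun, x0, args, **kwargs, **options) and expects an OptimizeResult back. The
# *_sketch function below only illustrates what such a callable could look like.
import numpy as np
from scipy.optimize import OptimizeResult


def coord_descent_sketch(fun, x0, args=(), maxiter=10, callback=None, shrink=0.8, **options):
    x = np.asarray(x0, dtype=float).copy()
    step = 0.1 * np.abs(x) + 1e-3          # per-coordinate initial step size
    fx = fun(x, *args)
    nfev = 1

    for _ in range(maxiter):
        for i in range(x.size):
            for delta in (step[i], -step[i]):
                trial = x.copy()
                trial[i] += delta
                f_trial = fun(trial, *args)
                nfev += 1
                if f_trial < fx:
                    x, fx = trial, f_trial
                    break
            else:
                step[i] *= shrink          # no improvement along this coordinate
        if callback is not None:
            callback(x)

    return OptimizeResult(x=x, fun=fx, nfev=nfev, success=True)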
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if args.tag is not None:
        ml_logger.mlflow.log_param('tag', args.tag)

    args.qtype = 'max_static'
    # Create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    custom_resnet = True
    custom_inception = True
    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    layers = []
    # TODO: make it more generic
    if 'inception' in args.arch and args.custom_inception:
        first = 3
        last = -1
    else:
        first = 1
        last = -1
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][first:last]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][first:last]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][first:last]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    maxabs_loss = inf_model.evaluate_calibration()
    print("max loss: {:.4f}".format(maxabs_loss.item()))
    max_point = mq.get_clipping()
    ml_logger.log_metric('Loss max', maxabs_loss.item(), step='auto')

    # evaluate
    maxabs_acc = 0  # inf_model.validate()
    ml_logger.log_metric('Acc maxabs', maxabs_acc, step='auto')

    data = {'max': {'alpha': max_point.cpu().numpy(), 'loss': maxabs_loss.item(), 'acc': maxabs_acc}}

    del inf_model
    del mq

    def eval_pnorm(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        # evaluate
        acc = inf_model.validate()

        del inf_model
        del mq

        return point, loss, acc

    def eval_pnorm_on_calibration(p):
        args.qtype = 'lp_norm'
        args.lp = p
        # Fix the seed
        random.seed(args.seed)
        if not args.dont_fix_np_seed:
            np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                             args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                             workers=args.workers, print_freq=args.print_freq,
                             cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

        mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)
        loss = inf_model.evaluate_calibration()
        point = mq.get_clipping()

        del inf_model
        del mq

        return point, loss

    ps = np.linspace(2, 4, 10)
    losses = []
    for p in tqdm(ps):
        point, loss = eval_pnorm_on_calibration(p)
        losses.append(loss.item())
        print("(p, loss) - ({}, {})".format(p, loss.item()))

    # Interpolate optimal p
    z = np.polyfit(ps, losses, 2)
    y = np.poly1d(z)
    p_intr = y.deriv().roots[0]
    print("p intr: {:.2f}".format(p_intr))
    ml_logger.log_metric('p intr', p_intr, step='auto')

    lp_point, lp_loss, lp_acc = eval_pnorm(p_intr)
    print("loss p intr: {:.4f}".format(lp_loss.item()))
    print("acc p intr: {:.4f}".format(lp_acc))
    ml_logger.log_metric('Init loss', lp_loss.item(), step='auto')
    ml_logger.log_metric('Acc init', lp_acc, step='auto')

    global _eval_count, _min_loss
    _min_loss = lp_loss.item()

    # (Disabled) fallback: use the best sampled p if it beats the interpolated one
    # loss_best = np.min(losses)
    # if loss_best < lp_loss:
    #     p_intr = ps[np.argmin(losses)]
    #     lp_point, lp_loss, lp_acc = eval_pnorm(p_intr)

    init = lp_point

    args.qtype = 'lp_norm'
    args.lp = p_intr
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    inf_model = CnnModel(args.arch, custom_resnet, custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')

    args.min_method = "Powell"
    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq), init.cpu().numpy(),
                       method=method, options=min_options, callback=local_search_callback)
    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
    data['powell'] = {'alpha': scales, 'loss': loss.item(), 'acc': acc}

    # save scales
    f_name = "scales_{}_W{}A{}.pkl".format(args.arch, args.bit_weights, args.bit_act)
    with open(os.path.join(proj_root_dir, 'data', f_name), 'wb') as f:
        pickle.dump(data, f)
    print("Data saved to {}".format(f_name))
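
# --- Hedged illustration (not part of the original script) ---
# The "Interpolate optimal p" step above fits a parabola a*p^2 + b*p + c to the sampled
# (p, loss) pairs and takes its stationary point p* = -b / (2a), which is exactly the
# single root of the derivative returned by y.deriv().roots. A tiny self-contained
# example with made-up numbers:
import numpy as np

ps_demo = np.array([2.0, 2.5, 3.0, 3.5, 4.0])
losses_demo = np.array([0.40, 0.31, 0.28, 0.30, 0.37])   # synthetic values

a, b, c = np.polyfit(ps_demo, losses_demo, 2)
p_star = -b / (2 * a)                                     # vertex of the parabola
assert np.isclose(p_star, np.poly1d([a, b, c]).deriv().roots[0])
print("interpolated p*: {:.3f}".format(p_star))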
def main():
    args = parse_args()

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Create model
    model = NeuMF(2197225, 855776, mf_dim=64, mf_reg=0.,
                  mlp_layer_sizes=[256, 256, 128, 64],
                  mlp_layer_regs=[0. for i in [256, 256, 128, 64]])
    print(model)

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()

    if args.load_ckp:
        ckp = torch.load(args.load_ckp)
        model.load_state_dict(ckp)

    if args.quantize:
        all_embeding = [n for n, m in model.named_modules() if isinstance(m, nn.Embedding)]
        all_linear = [n for n, m in model.named_modules() if isinstance(m, nn.Linear)]
        all_relu = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU)]
        all_relu6 = [n for n, m in model.named_modules() if isinstance(m, nn.ReLU6)]
        # layers = all_relu + all_relu6 + all_linear
        layers = all_embeding
        replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                               nn.ReLU6: ActivationModuleWrapperPost,
                               nn.Linear: ParameterModuleWrapperPost,
                               nn.Embedding: ActivationModuleWrapperPost}
        mq = ModelQuantizer(model, args, layers, replacement_factory)

    test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user = data_loader(args.data)
    data = NcfData(test_users, test_items, dup_mask, real_indices, K, samples_per_user, num_user)

    hr, ndcg = val(model, data)
    print('')
    print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(K=K, hit_rate=hr, ndcg=ndcg))
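
# --- Hedged illustration (assumption): how HR@K / NDCG@K are typically computed ---
# val(model, data) is not shown in this excerpt. Under the standard leave-one-out NCF
# protocol, each user has one held-out positive ranked against sampled negatives;
# HR@K counts users whose positive lands in the top K, and NDCG@K credits a hit with
# 1 / log2(rank + 2). The helper below illustrates the metric only and is not the
# project's actual evaluation code.
import numpy as np


def hr_ndcg_at_k(rank_of_positive, k):
    # rank_of_positive: 0-based rank of each user's held-out item among its candidates.
    ranks = np.asarray(rank_of_positive)
    hits = ranks < k
    hr = hits.mean()
    ndcg = np.where(hits, 1.0 / np.log2(ranks + 2), 0.0).mean()
    return hr, ndcg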
def main(args, ml_logger):
    # Fix the seed
    random.seed(args.seed)
    if not args.dont_fix_np_seed:
        np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Create model
    # Always enable shuffling to avoid issues where we get bad results due to weak statistics
    inf_model = CnnModel(args.arch, args.custom_resnet, args.custom_inception, args.pretrained, args.dataset,
                         args.gpu_ids, args.datapath, batch_size=args.batch_size, shuffle=True,
                         workers=args.workers, print_freq=args.print_freq,
                         cal_batch_size=args.cal_batch_size, cal_set_size=args.cal_set_size, args=args)

    layers = []
    # TODO: make it more generic
    if args.bit_weights is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.Conv2d)][1:-1]
    if args.bit_act is not None:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU)][1:-1]
    if args.bit_act is not None and 'mobilenet' in args.arch:
        layers += [n for n, m in inf_model.model.named_modules() if isinstance(m, nn.ReLU6)][1:-1]

    replacement_factory = {nn.ReLU: ActivationModuleWrapperPost,
                           nn.ReLU6: ActivationModuleWrapperPost,
                           nn.Conv2d: ParameterModuleWrapperPost}
    mq = ModelQuantizer(inf_model.model, args, layers, replacement_factory)

    print("init_method: {}, qtype {}".format(args.init_method, args.qtype))

    # initialize scales
    if args.init_method == 'dynamic':
        # evaluate to initialize dynamic clipping
        loss = inf_model.evaluate_calibration()
        print("Initial loss: {:.4f}".format(loss.item()))
        # get clipping values
        init = mq.get_clipping()
    else:
        if args.init_method == 'static':
            init = np.array([args.siv] * len(layers))
        elif args.init_method == 'random':
            init = np.random.uniform(0.5, 1., size=len(layers))  # TODO: pass range by argument
        else:
            raise RuntimeError("Invalid argument init_method {}".format(args.init_method))

        # set clip value to qwrappers
        mq.set_clipping(init, inf_model.device)
        print("scales initialization: {}".format(str(init)))

    # evaluate with clipping
    loss = inf_model.evaluate_calibration()
    print("Initial loss: {:.4f}".format(loss.item()))
    ml_logger.log_metric('Loss init', loss.item(), step='auto')

    global _min_loss
    _min_loss = loss.item()

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc init', acc, step='auto')

    # run optimizer
    min_options = {}
    if args.maxiter is not None:
        min_options['maxiter'] = args.maxiter
    if args.maxfev is not None:
        min_options['maxfev'] = args.maxfev

    _iter = count(0)

    def local_search_callback(x):
        it = next(_iter)
        mq.set_clipping(x, inf_model.device)
        loss = inf_model.evaluate_calibration()
        print("\n[{}]: Local search callback".format(it))
        print("loss: {:.4f}\n".format(loss.item()))
        print(x)
        ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

        # evaluate
        acc = inf_model.validate()
        ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')

    # init may be a torch tensor (dynamic) or a numpy array (static/random)
    init_np = init.cpu().numpy() if isinstance(init, torch.Tensor) else np.asarray(init)

    method = coord_descent if args.min_method == 'CD' else args.min_method
    res = opt.minimize(lambda scales: evaluate_calibration_clipped(scales, inf_model, mq), init_np,
                       method=method, options=min_options, callback=local_search_callback)
    print(res)

    scales = res.x
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration()
    ml_logger.log_metric('Loss {}'.format(args.min_method), loss.item(), step='auto')

    # evaluate
    acc = inf_model.validate()
    ml_logger.log_metric('Acc {}'.format(args.min_method), acc, step='auto')
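
# --- Hedged sketch (assumption, not defined in this excerpt) ---
# evaluate_calibration_clipped is the objective handed to opt.minimize above: it applies
# a candidate clipping vector, measures the calibration loss, and tracks the best loss
# seen so far via the _min_loss / _eval_count globals used in these scripts. Its real
# implementation may differ; the *_sketch function below only illustrates the idea.
from itertools import count

_eval_count = count(0)
_min_loss = float('inf')


def evaluate_calibration_clipped_sketch(scales, inf_model, mq):
    global _min_loss
    eval_count = next(_eval_count)

    # Apply the candidate per-layer clipping values and measure calibration loss.
    mq.set_clipping(scales, inf_model.device)
    loss = inf_model.evaluate_calibration().item()

    if loss < _min_loss:
        _min_loss = loss
    if eval_count % 20 == 0:
        print("eval count {}, loss {:.4f}, best {:.4f}".format(eval_count, loss, _min_loss))

    return loss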