# --- Train one epoch on the target domain, then evaluate --------------------
# NOTE(review): this fragment relies on names defined elsewhere in the file
# (net, epoch, target_train_loader, target_test_loader, optimizer, use_cuda,
# progress_bar, validate).
net.train()
loss = 0      # running sum of per-batch losses (drives the progress display)
correct = 0   # running count of correct predictions
total = 0     # running count of samples seen
print('\n[Epoch: %d] \nTraining' % (epoch))
# Fix: the original constructed a fresh nn.CrossEntropyLoss() on every batch;
# the criterion is loop-invariant, so build it once.
criterion = nn.CrossEntropyLoss()
for batch_idx, (inputs, targets) in enumerate(target_train_loader):
    if use_cuda:
        inputs, targets = inputs.cuda(), targets.cuda()
    outputs = net(inputs)
    losses = criterion(outputs, targets)
    optimizer.zero_grad()
    losses.backward()
    optimizer.step()
    loss += losses.item()
    # predicted class = argmax over the class dimension
    _, predicted = torch.max(outputs, dim=1)
    correct += predicted.eq(targets.data).cpu().sum().item()
    total += targets.size(0)
    progress_bar(batch_idx, len(target_train_loader),
                 "Loss: %.3f | Acc: %.3f%%"
                 % (loss / (batch_idx + 1), 100.0 * correct / total))
print('Test')
validate(net, target_test_loader)
# --- Fine-tuning driver ------------------------------------------------------
# Restore the fine-tuned weights, wrap the model for multi-GPU, then train for
# up to 200 epochs, checkpointing whenever source-domain accuracy improves.
net.load_state_dict(torch.load('ft'))
if use_cuda:
    net.cuda()
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    cudnn.benchmark = True
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4)
break_count = 0  # NOTE(review): unused within this fragment; kept as-is
for epoch in range(start_epoch, start_epoch + 200):
    print('\nEpoch: %d' % epoch)
    train(net, source_train_loader, optimizer=optimizer, n_epoch=1)
    acc_s = validate(net, source_test_loader)
    acc_t = validate(net, target_test_loader)
    if acc_s > best_acc:
        print('Saving..')
        # Fix: the original checked/created the checkpoint directory twice
        # (os.path.exists + os.makedirs, then os.path.isdir + os.mkdir);
        # once is enough.
        if not os.path.isdir('./checkpoint'):
            os.makedirs('./checkpoint')
        state = {
            'net': net.module if use_cuda else net,
            'acc': acc_s,
            'epoch': epoch,
        }
        torch.save(state, './checkpoint/%s_%s_ckpt.t7' % (model_name, source_dataset))
        # Bug fix: best_acc was never updated, so every epoch that beat the
        # *initial* best triggered a save; track the best accuracy seen so far.
        best_acc = acc_s
def QuantizedNet(dataset,model_name,batch_size,lr,kbits,require_first_test,quantized_first_and_last):
    """Quantize a pretrained network layer-by-layer with L-DNQ.

    For each quantizable layer: estimate a Hessian from a small loader, run
    ADMM-based quantization on that layer's weights, write the quantized
    weights back into the network, then fine-tune the remaining
    full-precision parameters (``cascade_soft_update``) against the original
    network and validate. The model is re-saved after every layer.

    NOTE(review): most parameters (dataset, model_name, batch_size, kbits)
    are immediately overwritten from the module-level ``args`` below, so the
    corresponding arguments are effectively ignored — confirm intended.
    """
    use_cuda = torch.cuda.is_available()
    # ---------------------------- Configuration --------------------------
    model_name = args.model_name
    dataset = args.dataset
    # NOTE(review): attribute is spelled ``exp_sec`` — looks like a typo for
    # ``exp_spec``; confirm against the argument parser before changing.
    exp_spec = args.exp_sec
    # Initialize some folder for data saving
    folder_init(model_name, ['train_record', 'val_record', 'save_models', \
                             'trainable_names/LDNQ%s' %(exp_spec)])
    pretrain_path = './%s/%s_%s_%s_pretrain.pth' %(model_name, dataset, model_name,1)
    quantized_path = './%s/%s_save_models/LDNQ%s.pth' %(model_name, dataset,exp_spec)
    hessian_root = './%s/hessian' %model_name  # NOTE(review): unused in this function
    kbits = args.kbits
    trainable_names_record_root = './%s/trainable_names/LDNQ%s' %(model_name, exp_spec)
    # Per-run log files (closed at the end of the function).
    train_record = open('./%s/train_record/LDNQ%s.txt' %(model_name, exp_spec), 'w')
    val_record = open('./%s/val_record/LDNQ%s.txt' %(model_name, exp_spec), 'w')
    init_lr = 0.001  # learning rate for the cascaded fine-tuning below
    # --------------------------------------------------------------------
    # Disabled interactive confirmation, kept as dead code:
    """print ('You are going to quantize model %s into %d bits, using dataset %s, with specification name as %s' \
    %(model_name, kbits, dataset, exp_spec))
    input('Press any to continue. 
Ctrl+C to break.')"""
    ################
    # Load Dataset #
    ################
    # Old get_dataloader-based loading, kept disabled:
    """train_loader = get_dataloader(dataset, 'limited', batch_size = batch_size, ratio=0.01)
    print ('Length of train loader: %d' %(len(train_loader)))
    hessian_loader = get_dataloader(dataset, 'limited', batch_size = 2)
    print ('Length of hessian loader: %d' %(len(hessian_loader)))
    test_loader = get_dataloader(dataset, 'test', batch_size = 100)"""
    train_loader = data_loading(DataPath,dataset,'limited',args.batch_size)
    print ('Length of train loader: %d' %(len(train_loader)))
    # tiny batches used only for Hessian estimation
    hessian_loader = data_loading(DataPath,dataset, 'limited', batch_size = 2)
    print ('Length of hessian loader: %d' %(len(hessian_loader)))
    test_loader = data_loading(DataPath,dataset,'test', batch_size = 100)
    print ('Length of test loader: %d' %(len(test_loader)))
    ################
    # Load Models ##
    ################
    # Two copies of the network: ``quantized_net`` is modified in place,
    # ``original_net`` keeps reference weights for the cascaded update.
    if dataset =='MNIST':
        quantized_net = MnistResNet()
        pretrain_param = torch.load(pretrain_path)
        #quantized_net.load_state_dict(pretrain_param)
        original_net = MnistResNet()
        #original_net.load_state_dict(pretrain_param)
    elif dataset =='CIFAR10':
        quantized_net = Resnet20_CIFAR10(1)
        #quantized_net = resnet18() # For quantization of ResNet18 using ImageNet
        pretrain_param = torch.load(pretrain_path)
        quantized_net.load_state_dict(pretrain_param)
        original_net = Resnet20_CIFAR10(1)
        #original_net = resnet18() # For quantization of ResNet18 using ImageNet
        original_net.load_state_dict(pretrain_param)
    # NOTE(review): for any other dataset value both nets stay undefined and
    # the code below raises NameError — confirm only MNIST/CIFAR10 are used.
    if use_cuda:
        print('Dispatch model in %d GPUs' % (len(range(torch.cuda.device_count()))))
        quantized_net.cuda()
        quantized_net = torch.nn.DataParallel(quantized_net, device_ids=range(torch.cuda.device_count()))
        original_net.cuda()
        original_net = torch.nn.DataParallel(original_net, device_ids=range(torch.cuda.device_count()))
        cudnn.benchmark = False
    ####################
    # First Validation #
    ####################
    if require_first_test:
        # NOTE(review): ``dataset_name`` is not defined in this function —
        # presumably a module-level global; verify (it may be intended to be
        # the local ``dataset``).
        acc = validate(quantized_net, test_loader, dataset_name=dataset_name)
        print('Full-precision accuracy: %.3f' %acc)
        val_record.write('Full-precision accuracy: %.3f\n' %acc)
    # Generate layer name list: layers to be quantized
    layer_collection_list = generate_layer_name_collections(quantized_net, model_name=model_name,
                                                            quantized_first_last_layer=quantized_first_and_last)
    ###############
    # Begin L-DNQ #
    ###############
    for layer_idx, layer_name in enumerate(layer_collection_list):
        print ('[%s] Process layer %s' % (datetime.now(), layer_name))
        if train_record is not None:
            train_record.write('Process layer %s\n' % layer_name)
        if val_record is not None:
            val_record.write('Process layer %s\n' % layer_name)
        state_dict = quantized_net.state_dict()
        if 'linear' in layer_name or 'fc' in layer_name:
            # Fully-connected layer: quantize weight and bias together.
            # Generate Hessian
            hessian = generate_hessian(quantized_net, hessian_loader, layer_name, layer_type='F')
            updated_weight = state_dict['%s.weight' % (layer_name)].cpu().numpy() if use_cuda else \
                             state_dict['%s.weight' % (layer_name)].numpy()
            updated_bias = state_dict['%s.bias' % (layer_name)].cpu().numpy() if use_cuda else \
                           state_dict['%s.bias' % (layer_name)].numpy()
            # Perform Quantization
            quantized_weight, quantized_bias = ADMM_quantization(layer_name=layer_name, layer_type='F',
                                                                 kernel=updated_weight, bias=updated_bias,
                                                                 hessian=hessian, kbits=kbits)
            state_dict['%s.weight' % (layer_name)] = torch.FloatTensor(quantized_weight)
            state_dict['%s.bias' % (layer_name)] = torch.FloatTensor(quantized_bias)
        else:
            # Other (convolutional) layer: quantize the kernel only.
            # Generate Hessian
            hessian = generate_hessian(quantized_net, hessian_loader, layer_name, layer_type='R', stride_factor = 1)
            updated_kernel = state_dict['%s.weight' % (layer_name)].cpu().numpy() if use_cuda else \
                             state_dict['%s.weight' % (layer_name)].numpy()
            # Perform Quantization
            quantized_kernel = ADMM_quantization(layer_name=layer_name, layer_type='R', kernel=updated_kernel,
                                                 bias=None, hessian=hessian, kbits=kbits)
            # Step 2: Assignment
            # Assign processed layer with quantized weights
            state_dict['%s.weight' % (layer_name)] = torch.FloatTensor(quantized_kernel)
        ###########################
        # Cascaded Weights Update #
        ###########################
        quantized_net.load_state_dict(state_dict)
        print ('[%s] Finish layer %s' % (datetime.now(), layer_name))
        # Generate the non-quantized / trainable parameters
        trainable_parameters, trainable_names = \
            generate_trainable_parameters(quantized_net.named_parameters(), layer_name + '.weight',
                                          model_name=model_name,
                                          quantized_first_last_layer=quantized_first_and_last)
        print ('Length of trainable parameters: %d' %(len(trainable_names)))
        # Record which parameter names remain trainable after this layer.
        trainable_names_record = open('%s/%s.txt' % (trainable_names_record_root, layer_name), 'w')
        for name in trainable_names:
            trainable_names_record.write(name + '\n')
        trainable_names_record.close()
        optimizer = optim.SGD(trainable_parameters, lr=init_lr, momentum=0.9, weight_decay=5e-4)
        # Fine-tune the remaining full-precision weights toward the original net.
        cascade_soft_update(quantized_net, original_net, train_loader, dataset_name=dataset_name,
                            optimizer=optimizer, train_record=train_record)
        # Record test acc
        acc = validate(quantized_net, test_loader, dataset_name=dataset_name, val_record=val_record)
        # Save after every layer so progress survives interruption.
        torch.save(quantized_net.module.state_dict() if use_cuda else quantized_net.state_dict(), quantized_path)
    train_record.close()
    val_record.close()
def main_worker(gpu, args):
    """Evaluation worker: build a model, load weights from ``args.model_file``,
    and run validation on the ImageNet validation set.

    gpu  -- GPU index to use (None means all GPUs via DataParallel);
            ``args.cpu`` forces CPU execution.
    args -- parsed command-line namespace (arch, dataset, data, batch_size,
            workers, test_transform, nonblacklist, model_file, cpu, ...).
    """
    args.gpu = gpu
    if not args.cpu:
        if args.gpu is not None:
            print("Use GPU: {} for training".format(args.gpu))
    else:
        print("Use CPU")
    # create model
    # NOTE(review): ``model`` (and later ``val_loader``/``comment``) are only
    # defined when dataset == 'imagenet'; any other dataset raises NameError
    # below — confirm only imagenet is expected here.
    if args.dataset == 'imagenet':
        if args.arch in mymodel_names:
            model = mymodels.__dict__[args.arch]()
        else:
            print("=> creating model '{}'".format(args.arch))
            model = models.__dict__[args.arch]()
    # load model
    if os.path.isfile(args.model_file):
        print("=> loading model '{}'".format(args.model_file))
        checkpoint = torch.load(args.model_file)
        d = checkpoint['state_dict']
        # Strip the 'module.' prefix that DataParallel adds to parameter names
        # so the state dict matches the unwrapped model.
        for old_key in list(d.keys()):
            if 'module.' in old_key:
                d[old_key.replace('module.','')] = d.pop(old_key,None)
        model.load_state_dict(d)
        print("=> loaded model '{}'".format(args.model_file))
    else:
        print("=> no model found at '{}'".format(args.model_file))
        return
    if not args.cpu:
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model = model.cuda(args.gpu)
        else:
            # DataParallel will divide and allocate batch_size to all available GPUs
            if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
                model.features = torch.nn.DataParallel(model.features)
                model.cuda()
            else:
                model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    # Data loading code
    if args.dataset == 'imagenet':
        # ImageNet
        valdir = os.path.join(args.data, 'val_dir')
        if args.nonblacklist:
            # Validation folder with blacklisted images excluded.
            val_dataset = mydatasets.ImageNetValFolder(
                valdir, args.test_transform
            )
            comment = 'non-blacklisted validation set'
        else:
            val_dataset = datasets.ImageFolder(
                valdir, args.test_transform
            )
            comment = 'whole validation set'
        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=args.batch_size, shuffle=False,
            num_workers=args.workers, pin_memory=True)
    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss()
    if not args.cpu:
        criterion = criterion.cuda(args.gpu)
    # evaluate on validation set
    validate(val_loader, model, criterion, args)
    # @ Primary worker, show the final results
    print('on {}'.format(comment))
""" This code validate the *.pth file when generated """ import torch from utils.dataset import get_dataloader from utils.train import validate from models_CIFAR9STL9.tucker_CIFARNet_dual import CIFARNet # from models_CIFAR9STL9.tucker_CIFARNet_dual import CIFARNet2 as CIFARNet # Initial model net = CIFARNet() pretrain_param = torch.load('./checkpoint/tucker_CIFARNet9_dual.pth') # pretrain_param = torch.load('./checkpoint/tucker_CIFARNet9_dual_2.pth') net.load_state_dict(pretrain_param) # Load dataset test_loader = get_dataloader('STL9', 'test', 100) net.cuda() validate(net, test_loader)
'6': [48, 48]
}
# The feature-extractor decomposition below is disabled (wrapped in a string
# literal); only the classifier scan and the final validation actually run.
'''
N = len(model.features._modules.keys())
for i, key in enumerate(model.features._modules.keys()):
    # if i >= N - 2:
    #     break
    if isinstance(model.features._modules[key], torch.nn.modules.conv.Conv2d):
        conv_layer = model.features._modules[key]
        if use_cp:
            rank = max(conv_layer.weight.data.numpy().shape)//2
            decomposed = cp_decomposition_conv_layer(conv_layer, rank)
        else:
            decomposed = tucker_decomposition_conv_layer(conv_layer, None)
        model.features._modules[key] = decomposed
'''
# NOTE(review): this loop only binds ``fc_linear`` and never uses or replaces
# it — it currently has no effect; presumably a linear-layer decomposition
# step was planned here. Confirm before deleting.
for i, key in enumerate(model.classifier._modules.keys()):
    if isinstance(model.classifier._modules[key], nn.Linear):
        fc_linear = model.classifier._modules[key]
# torch.save(model, './checkpoint/%s_CIFARNet9.p' %('cp' if use_cp else 'tucker'))
# Evaluate the model on the CIFAR9 test split.
test_loader = get_dataloader('CIFAR9', 'test', 128)
model.cuda()
validate(model, test_loader)
def main_worker(gpu, ngpus_per_node, args):
    """Training worker for one process: set up (optionally distributed) model,
    data loaders, loss and optimizer; optionally resume from a checkpoint;
    then run the train/validate loop, plotting and checkpointing from the
    primary worker.

    gpu            -- GPU index for this process (None = use all via DataParallel)
    ngpus_per_node -- GPUs per node; used to derive the global rank and to
                      split batch_size/workers in single-GPU-per-process mode
    args           -- parsed command-line namespace
    """
    global stats  # running train/val metrics, shared with module level
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    # NOTE(review): ``model`` is only defined for dataset == 'imagenet';
    # other dataset values raise NameError below — confirm intended.
    if args.dataset == 'imagenet':
        if args.arch in mymodel_names:
            model = mymodels.__dict__[args.arch](num_classes=1000)
        elif args.pretrained:
            print("=> using pre-trained model '{}'".format(args.arch))
            model = models.__dict__[args.arch](pretrained=True)
        else:
            print("=> creating model '{}'".format(args.arch))
            model = models.__dict__[args.arch]()
    print(model)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            # only model
            # Phase 1 of resume: restore model weights only (optimizer/stats
            # are restored later, after the optimizer exists).
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            # Strip DataParallel's 'module.' prefix from parameter names.
            for old_key in list(checkpoint['state_dict'].keys()):
                if 'module' in old_key:
                    new_key = old_key.replace('module.','')
                    checkpoint['state_dict'][new_key] = checkpoint['state_dict'].pop(old_key, None)
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {}) for model"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            return
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # Data loading code
    if args.dataset == 'imagenet':
        # ImageNet
        traindir = os.path.join(args.data, 'train')
        valdir = os.path.join(args.data, 'val_dir')
        train_dataset = datasets.ImageFolder(
            traindir, args.train_transform
        )
        val_dataset = datasets.ImageFolder(
            valdir, args.test_transform
        )
    # Data Sampling
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    # define loss function (criterion) and optimizer
    # ``name`` is popped so the remaining dict can be splatted as kwargs,
    # then restored afterwards so args.loss stays intact for checkpointing.
    lossname = args.loss.pop('name')
    if lossname == 'Softmax':
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    elif lossname == 'LargeMarginInSoftmax':
        criterion = LargeMarginInSoftmaxLoss(**args.loss).cuda(args.gpu)
    else:
        raise ValueError("loss function of {} is not supported".format(lossname))
    args.loss['name'] = lossname
    optimizer = torch.optim.SGD(model.parameters(), args.lrs[0],
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        # other state parameters
        # Phase 2 of resume: reuses ``checkpoint`` loaded in phase 1 above.
        if os.path.isfile(args.resume):
            args.start_epoch = checkpoint['epoch']
            stats = checkpoint['stats']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {}) for the others"
                  .format(args.resume, checkpoint['epoch']))
    cudnn.benchmark = True
    # Do Train/Eval
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    # Primary worker: rank 0 on this node (or the only worker when not
    # multiprocessing-distributed); handles plotting and checkpoint saving.
    primary_worker = not args.multiprocessing_distributed or \
        (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0)
    if primary_worker:
        progress = ProgressPlotter(
            titles=('LR', 'Loss', 'Top-1 Error.', 'Top-5 Error.'),
            legends=(('learning rate',), ('train','val'), ('train','val'), ('train','val')),
            ylims=((1e-6,1), (0,10), (0,100), (0,100)),
            yscales=('log','linear','linear','linear'),
            vals=((args.lrs[:args.start_epoch],),
                  (stats['train_loss'], stats['test_loss']),
                  (stats['train_err1'], stats['test_err1']),
                  (stats['train_err5'], stats['test_err5'])
                  )
        )
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # keep shuffling deterministic-but-different across epochs
            train_sampler.set_epoch(epoch)
        lr = adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        trnerr1, trnerr5, trnloss = train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        valerr1, valerr5, valloss = validate(val_loader, model, criterion, args)
        # statistics
        stats['train_err1'].append(trnerr1)
        stats['train_err5'].append(trnerr5)
        stats['train_loss'].append(trnloss)
        stats['test_err1'].append(valerr1)
        stats['test_err5'].append(valerr5)
        stats['test_loss'].append(valloss)
        # remember best err@1
        is_best = valerr1 <= min(stats['test_err1'])
        # @ Primary worker, show and save results
        if primary_worker:
            # progress.plot( ((trnloss,valloss), (trnerr1, valerr1), (trnerr5, valerr5)) )
            progress.plot( ((lr,), (trnloss,valloss), (trnerr1, valerr1), (trnerr5, valerr5)) )
            progress.save(filename=os.path.join(args.out_dir, args.pdf_filename))
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'stats': stats,
                'optimizer' : optimizer.state_dict(),
                'args' : args
            }, is_best, args.save_last_checkpoint,
                filename=os.path.join(args.out_dir, 'checkpoint-epoch{:d}.pth.tar'.format(epoch+1)))
    # @ Primary worker, show the final results
    if primary_worker:
        minind = stats['test_err1'].index(min(stats['test_err1']))
        print(' *BEST* Err@1 {:.3f} Err@5 {:.3f}'.format(stats['test_err1'][minind], stats['test_err5'][minind]))
negative_mining_ratio=10)
        # NOTE(review): the lines above/below are the tail of a training loop
        # whose headers (epoch/batch loops) start before this chunk.
        # calc loss
        l = calc_loss(cls_loss, bbox_loss, cls_preds, cls_labels,
                      bbox_preds, bbox_labels, bbox_masks)
        l.backward()
        trainer.step(batch_size)
        # accumulate running metrics for the epoch summary line
        acc_sum += cls_eval(cls_preds, cls_labels)
        n += cls_labels.size
        mae_sum += bbox_eval(bbox_preds, bbox_labels, bbox_masks)
        m += bbox_labels.size
    print('epoch %2d, class err %.2e, bbox mae %.2e, time %.1f sec' % (
        epoch + 1, 1 - acc_sum / n, mae_sum / m, time.time() - start))
    # Checkpoint
    # Every 5 epochs: export the network and record validation metrics.
    if (epoch + 1) % 5 == 0:
        net.export('FPN')
        _1, _2, _3 = validate(val_iter, net, ctx)
        val_recorder[int(epoch / 5)] = (_1, _2, _3)
print(val_recorder)
# plt.figure()
# plt.plot(val_recorder)
# plt.title("validating curve");
# plt.show()


def predict(X):
    """Run the detector on one input batch and return the detection rows
    kept by MultiBoxDetection (or a single all-zero row when nothing is
    detected). Uses the module-level ``net`` and ``ctx``.
    """
    anchors, cls_preds, bbox_preds = net(X.as_in_context(ctx))
    # reshape class scores to (batch, classes, anchors) for MultiBoxDetection
    cls_probs = cls_preds.softmax().transpose((0, 2, 1))
    output = nd.contrib.MultiBoxDetection(cls_probs, bbox_preds, anchors)
    # keep rows whose class id is not -1 (i.e. not suppressed)
    idx = [i for i, row in enumerate(output[0]) if row[0].asscalar() != -1]
    if idx == []:
        # no detections: return one zero placeholder row
        return nd.array([[0, 0, 0, 0, 0, 0, 0]])
    return output[0, idx]
# NOTE(review): this chunk begins mid-suite — the CUDA lines below are
# presumably inside an ``if use_cuda:`` block that starts above this view.
quantized_net.cuda()
quantized_net = torch.nn.DataParallel(quantized_net, device_ids=range(
    torch.cuda.device_count()))
original_net.cuda()
original_net = torch.nn.DataParallel(original_net, device_ids=range(
    torch.cuda.device_count()))
cudnn.benchmark = True
####################
# First Validation #
####################
if args.require_first_test:
    # measure full-precision accuracy before any quantization is applied
    acc = validate(quantized_net, test_loader, dataset_name=dataset_name)
    print('Full-precision accuracy: %.3f' % acc)
    val_record.write('Full-precision accuracy: %.3f\n' % acc)
# Generate layer name list: layers to be quantized
layer_collection_list = generate_layer_name_collections(
    quantized_net, model_name=model_name,
    quantized_first_last_layer=args.quantized_first_and_last)
###############
# Begin L-DNQ #
###############
# Process layers one at a time; the loop body continues past this chunk.
for layer_idx, layer_name in enumerate(layer_collection_list):
    print('[%s] Process layer %s' % (datetime.now(), layer_name))