def _get_backbone_network(self, backbone_name):
    """Instantiate the backbone CNN selected by name.

    :param backbone_name: 'ResNet18' or 'ResNet18HighRes'
    :return: the constructed backbone module
    :raises NotImplementedError: for any other name
    """
    factories = {
        'ResNet18': lambda: models.ResNet18(last_relu=False),
        'ResNet18HighRes': lambda: models.ResNet18(last_relu=False, high_res=True),
    }
    if backbone_name not in factories:
        raise NotImplementedError()
    return factories[backbone_name]()
def get_model(device):
    """
    :param device: instance of torch.device (or a device string such as "cuda")
    :return: An instance of torch.nn.Module
    """
    # Number of classes is derived from the configured dataset (default: 2).
    num_classes = 2
    if config["dataset"] == "Cifar100":
        num_classes = 100
    elif config["dataset"] == "Cifar10":
        num_classes = 10

    # Lazy constructors so only the configured architecture is instantiated.
    model = {
        "vgg11": lambda: models.VGG("VGG11", num_classes, batch_norm=False),
        "vgg11_bn": lambda: models.VGG("VGG11", num_classes, batch_norm=True),
        "vgg13": lambda: models.VGG("VGG13", num_classes, batch_norm=False),
        "vgg13_bn": lambda: models.VGG("VGG13", num_classes, batch_norm=True),
        "vgg16": lambda: models.VGG("VGG16", num_classes, batch_norm=False),
        "vgg16_bn": lambda: models.VGG("VGG16", num_classes, batch_norm=True),
        "vgg19": lambda: models.VGG("VGG19", num_classes, batch_norm=False),
        "vgg19_bn": lambda: models.VGG("VGG19", num_classes, batch_norm=True),
        "resnet10": lambda: models.ResNet10(num_classes=num_classes),
        "resnet18": lambda: models.ResNet18(num_classes=num_classes),
        "resnet34": lambda: models.ResNet34(num_classes=num_classes),
        "resnet50": lambda: models.ResNet50(num_classes=num_classes),
        "resnet101": lambda: models.ResNet101(num_classes=num_classes),
        "resnet152": lambda: models.ResNet152(num_classes=num_classes),
        "bert": lambda: models.BertImage(config, num_classes=num_classes),
    }[config["model"]]()

    model.to(device)
    # BUG FIX: the original `device == "cuda"` compares a torch.device instance
    # (the documented input) against a string, which is False, so DataParallel
    # and cudnn.benchmark were never enabled. Normalizing through
    # torch.device() handles both device objects and strings like "cuda:0".
    if torch.device(device).type == "cuda":
        model = torch.nn.DataParallel(model)
        torch.backends.cudnn.benchmark = True

    return model
def test(self):
    """Regression-check backward (CROWN) bounds for two small CIFAR models
    under an L-inf perturbation of eps=0.03, then verify via self.check()."""
    nets = [
        models.model_resnet(width=1, mult=2),
        models.ResNet18(in_planes=2),
    ]
    self.result = []
    for net in nets:
        # conv_mode can be set as 'matrix' or 'patches'
        mode = 'patches'
        normalize = torchvision.transforms.Normalize(
            mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010])
        test_data = torchvision.datasets.CIFAR10(
            "./data", train=False, download=True,
            transform=torchvision.transforms.Compose(
                [torchvision.transforms.ToTensor(), normalize]))
        batch = 1
        n_classes = 10  # kept for parity with the original test setup
        # NOTE(review): raw CIFAR data is NHWC; reshape (rather than permute)
        # to NCHW scrambles channel content — presumably acceptable for a
        # pure bound-regression test, but confirm before reusing elsewhere.
        x = torch.Tensor(test_data.data[:batch]).reshape(batch, 3, 32, 32)
        x = x.to(torch.float32) / 255.0
        wrapped = BoundedModule(net, x, bound_opts={"conv_mode": mode})
        ptb = PerturbationLpNorm(norm=np.inf, eps=0.03)
        x = BoundedTensor(x, ptb)
        pred = wrapped(x)
        lb, ub = wrapped.compute_bounds(IBP=False, C=None, method='backward')
        self.result += [lb, ub]
    self.check()
def main():
    """Entry point: self-supervised training of ResNet18 on patched CIFAR10,
    saving the best checkpoint under a timestamped name."""
    global mean, std
    args = parse_args()
    if args.deterministic:
        # Pin every RNG so runs are reproducible.
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_transforms = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean, std)
    ])
    cifar = torchvision.datasets.CIFAR10(args.data_dir, train=True,
                                         transform=input_transforms, download=True)
    # 90/10 split of the (unshuffled) index range into train/val subsets.
    all_indices = list(range(len(cifar)))
    split_at = int(len(all_indices) * 0.9)
    train_set = CIFARLarge(Subset(cifar, all_indices[:split_at]), args.num_patches, train=True)
    val_set = CIFARLarge(Subset(cifar, all_indices[split_at:]), args.num_patches, train=False)
    # NOTE(review): the train loader uses shuffle=False — confirm intentional.
    dataloaders = {
        "train": DataLoader(SSLTrainDataset(train_set, args.num_patches, args.num_angles),
                            shuffle=False, batch_size=args.ssl_train_batch_size,
                            pin_memory=True),
        "val": DataLoader(SSLValDataset(val_set, args.num_patches, args.num_angles),
                          shuffle=False, batch_size=args.ssl_val_batch_size,
                          pin_memory=True)
    }
    model = models.ResNet18(args.num_patches, args.num_angles)
    # model.load_state_dict(torch.load(os.path.join(args.model_dir, f"{args.model_name}")))
    # train.gen_grad_map(device, model, dataloaders, args.num_patches, args.num_angles)
    model, best_val_accuracy = train.ssl_train(device, model, dataloaders, args)
    # Checkpoint name: timestamp tagged with the best validation accuracy.
    model_name = time.ctime().replace(" ", "_").replace(":", "_")
    model_name = f"{model_name}_{best_val_accuracy:.4f}.pt"
    torch.save(model.state_dict(), os.path.join(args.model_dir, model_name))
def config_net(net_name="VGG"):
    """Instantiate a network by architecture-family name.

    :param net_name: architecture family; must appear in ``__all_models__``
    :return: a freshly constructed model instance
    :raises AssertionError: if ``net_name`` is not in ``__all_models__``
    :raises NotImplementedError: if the name is listed but has no builder here
    """
    assert net_name in __all_models__, "Unimplemented architecture"
    builders = {
        "VGG": lambda: models.VGG("VGG19"),
        "ResNet": lambda: models.ResNet18(),
        "ResNeXt": lambda: models.ResNeXt29_2x64d(),
        "MobileNet": lambda: models.MobileNetV2(),
        "DenseNet": lambda: models.DenseNet121(),
        "DPN": lambda: models.DPN92(),
        "EfficientNet": lambda: models.EfficientNetB0(),
    }
    if net_name not in builders:
        # BUG FIX: the original if/elif chain had no final else, so a name that
        # passed the assertion but had no branch silently returned None.
        raise NotImplementedError("No builder for architecture {0}".format(net_name))
    return builders[net_name]()
def get_invert_model(args):
    """Construct the inversion model whose name appears (as a substring) in
    ``args.invert_model``.

    :param args: namespace with ``invert_model`` (architecture name) and
        ``model`` (forwarded to the architecture constructor)
    :return: the instantiated inversion network
    :raises Exception: if no known architecture name matches
    """
    name = args.invert_model
    # BUG FIX: 'VGG19_BN' must be tested BEFORE 'VGG19' — in the original
    # order the plain 'VGG19' substring always matched first, making the
    # batch-norm variant unreachable.
    if 'ResNet18' in name:
        model = models.ResNet18(args.model)
    elif 'ResNet152' in name:
        model = models.ResNet152(args.model)
    elif 'ResNeXt101' in name:
        model = models.ResNeXt101(args.model)
    elif 'VGG19_BN' in name:
        model = models.VGG19_BN(args.model)
    elif 'VGG19' in name:
        model = models.VGG19(args.model)
    elif 'DenseNet201' in name:
        model = models.DenseNet201(args.model)
    else:
        raise Exception(f'{args.invert_model} not found')
    return model
def evaluate_inversion(args, inverted_net_path):
    """Load a saved inverted net, invert a fixed CelebA test image into latent
    noise, and (for BigGAN models) regenerate an image from that noise.

    :param args: run configuration (gpu_ids, model, device, truncation, ...)
    :param inverted_net_path: path to the inverted-net checkpoint file
    """
    # Load saved inverted net
    device = 'cuda:{}'.format(
        args.gpu_ids[0]) if len(args.gpu_ids) > 0 else 'cpu'
    ckpt_dict = torch.load(inverted_net_path, map_location=device)

    # Build model, load parameters
    model_args = ckpt_dict['model_args']
    inverted_net = models.ResNet18(**model_args)
    inverted_net = nn.DataParallel(inverted_net, args.gpu_ids)
    inverted_net.load_state_dict(ckpt_dict['model_state'])
    # BUG FIX: removed a leftover `import pdb; pdb.set_trace()` debugging
    # breakpoint that halted every run here.

    # Get test images (CelebA)
    initial_generated_image_dir = '/deep/group/sharonz/generator/z_test_images/'
    initial_generated_image_name = '058004_crop.jpg'
    initial_generated_image = util.get_image(initial_generated_image_dir, initial_generated_image_name)
    initial_generated_image = initial_generated_image / 255.
    # BUG FIX: the .cuda() result was assigned to a misspelled name
    # (`intiial_generated_image`) and discarded, so the image never moved
    # to the GPU.
    initial_generated_image = initial_generated_image.cuda()

    inverted_noise = inverted_net(initial_generated_image)

    if 'BigGAN' in args.model:
        # NOTE(review): `batch_size` is not defined in this scope — presumably
        # a module-level constant; verify before exercising this branch.
        class_vector = one_hot_from_int(207, batch_size=batch_size)
        class_vector = torch.from_numpy(class_vector)
        num_params = int(''.join(filter(str.isdigit, args.model)))
        generator = BigGAN.from_pretrained(f'biggan-deep-{num_params}')
        generator = generator.to(args.device)
        generated_image = generator.forward(inverted_noise, class_vector, args.truncation)

    # Get difference btw initial and subsequent generated image
    # Save both
    return
def main():
    """Entry point: self-supervised training of ResNet18 on patched MNIST,
    saving the best checkpoint under a timestamped name."""
    args = parse_args()
    if args.deterministic:
        # Seed every RNG so runs are repeatable.
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = models.ResNet18(args.num_patches, args.num_angles)
    mnist_large = MNISTLarge(args.data_dir, args.num_patches)
    # First 90% of indices -> train, remaining 10% -> validation.
    all_indices = list(range(len(mnist_large)))
    split_at = int(len(all_indices) * 0.9)
    train_subset = Subset(mnist_large, all_indices[:split_at])
    val_subset = Subset(mnist_large, all_indices[split_at:])
    dataloaders = {
        "train": DataLoader(SSLTrainDataset(train_subset, args.num_patches, args.num_angles),
                            shuffle=True, batch_size=args.ssl_train_batch_size,
                            pin_memory=True),
        "val": DataLoader(SSLValDataset(val_subset, args.num_patches, args.num_angles),
                          shuffle=False, batch_size=args.ssl_val_batch_size,
                          pin_memory=True),
    }
    model, best_val_accuracy = train.ssl_train(
        device, model, dataloaders, args.ssl_num_epochs, args.num_patches,
        args.num_angles, MNISTLarge.mean, MNISTLarge.std, args.learn_prd,
        args.poisson_rate)
    # Checkpoint name: timestamp tagged with the best validation accuracy.
    model_name = time.ctime().replace(" ", "_").replace(":", "_")
    model_name = f"{model_name}_{best_val_accuracy:.4f}.pt"
    torch.save(model.state_dict(), os.path.join(args.model_dir, model_name))
def setup_and_run(args, criterion, device, train_loader, test_loader, val_loader, logging, results, summary_writer):
    """Train and/or evaluate the binarized network described by ``args``.

    Builds model/optimizer/scheduler, optionally resumes or loads a
    checkpoint, runs the training loop with a beta-annealed discretization
    schedule, then evaluates both the fractional and rounded (discrete) net
    on the set selected by ``args.eval_set``.

    BUG FIX: Python-2 ``print`` statements were rewritten as ``print()``
    calls so this module also parses under Python 3; output is unchanged.
    """
    global BEST_ACC
    print('\n#### Running binarized-net ####')

    # quantized levels
    if (not args.tanh and args.quant_levels != 2) or args.quant_levels > 3:
        print('Quantization levels "{0}" is invalid, exiting ...'.format(args.quant_levels))
        exit()
    # for tanh, Q_l = {-1, 0, 1}, rounding if {-1: ( ,-0.5], 0: (-0.5, 0.5), 1: [0.5, )}
    if args.zeroone and args.tanh:
        print('zeroone cannot be true while tanh is, setting zeroone False ...')
        args.zeroone = False

    # architecture
    if 'VGG' in args.architecture:
        assert(args.architecture == 'VGG11' or args.architecture == 'VGG13'
               or args.architecture == 'VGG16' or args.architecture == 'VGG19')
        model = models.VGG(args.architecture, args.input_channels, args.im_size, args.output_dim).to(device)
    elif args.architecture == 'RESNET18':
        model = models.ResNet18(args.input_channels, args.im_size, args.output_dim).to(device)
    else:
        print('Architecture type "{0}" not recognized, exiting ...'.format(args.architecture))
        exit()

    # optimizer
    if args.optimizer == 'ADAM':
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum,
                              nesterov=args.nesterov, weight_decay=args.weight_decay)
    else:
        print('Optimizer type "{0}" not recognized, exiting ...'.format(args.optimizer))
        exit()

    # lr-scheduler
    if args.lr_decay == 'STEP':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_scale)
    elif args.lr_decay == 'EXP':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_scale)
    elif args.lr_decay == 'MSTEP':
        x = args.lr_interval.split(',')
        lri = [int(v) for v in x]
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=lri, gamma=args.lr_scale)
        args.lr_interval = 1  # lr_interval handled in scheduler!
    else:
        print('LR decay type "{0}" not recognized, exiting ...'.format(args.lr_decay))
        exit()

    init_weights(model, device, xavier=True)
    if not args.eval:
        logging.info(model)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    if not args.eval:
        logging.info("Number of parameters: %d", num_parameters)

    start_epoch = -1
    beta = 1  # discrete forcing scalar, used only for softmax based projection
    iters = 0  # total no of iterations, used to do many things!
    amodel = auxmodel(model)

    # optionally resume from a checkpoint
    if args.eval:
        logging.info('Loading checkpoint file "{0}" for evaluation'.format(args.eval))
        if not os.path.isfile(args.eval):
            print('Checkpoint file "{0}" for evaluation not recognized, exiting ...'.format(args.eval))
            exit()
        checkpoint = torch.load(args.eval)
        model.load_state_dict(checkpoint['state_dict'])
        beta = checkpoint['beta']
        logging.debug('beta: {0}'.format(beta))
    elif args.resume:
        checkpoint_file = args.resume
        logging.info('Loading checkpoint file "{0}" to resume'.format(args.resume))
        if not os.path.isfile(checkpoint_file):
            print('Checkpoint file "{0}" not recognized, exiting ...'.format(checkpoint_file))
            exit()
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch']
        assert(args.architecture == checkpoint['architecture'])
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        BEST_ACC = checkpoint['best_acc1']
        beta = checkpoint['beta']
        iters = checkpoint['iters']
        logging.debug('best_acc1: {0}, beta: {1}, iters: {2}'.format(BEST_ACC, beta, iters))

    batch_per_epoch = len(train_loader)
    if not args.eval:
        logging.info('Training...')
        model.train()
        st = timer()
        for e in range(start_epoch + 1, args.num_epochs):
            for i, (data, target) in enumerate(train_loader):
                l = train_step(args, amodel, model, device, data, target, optimizer, criterion, beta=beta)
                if i % args.log_interval == 0:
                    acc1, acc5 = evaluate(args, amodel, model, device, val_loader, training=True,
                                          beta=beta, summary_writer=summary_writer,
                                          iterations=e*batch_per_epoch+i)
                    logging.info('Epoch: {0},\t Iter: {1},\t Loss: {loss:.5f},\t Val-Acc1: {acc1:.2f} '
                                 '(Best: {best:.2f}),\t Val-Acc5: {acc5:.2f}'.format(
                                     e, i, loss=l, acc1=acc1, best=BEST_ACC, acc5=acc5))
                if iters % args.beta_interval == 0:
                    # beta = beta * args.beta_scale
                    beta = min(beta * args.beta_scale, BETAMAX)
                    optimizer.beta_mda = beta
                    logging.info('beta: {0}'.format(beta))
                if iters % args.lr_interval == 0:
                    lr = args.learning_rate
                    for param_group in optimizer.param_groups:
                        lr = param_group['lr']
                    scheduler.step()
                    for param_group in optimizer.param_groups:
                        if lr != param_group['lr']:
                            logging.info('lr: {0}'.format(param_group['lr']))  # print if changed
                iters += 1
            # save checkpoint
            acc1, acc5 = evaluate(args, amodel, model, device, val_loader, training=True, beta=beta)
            results.add(epoch=e, iteration=i, train_loss=l, val_acc1=acc1, best_val_acc1=BEST_ACC)
            util.save_checkpoint({'epoch': e, 'architecture': args.architecture,
                                  'state_dict': model.state_dict(),
                                  'optimizer': optimizer.state_dict(),
                                  'scheduler': scheduler.state_dict(),
                                  'best_acc1': BEST_ACC, 'iters': iters, 'beta': beta},
                                 is_best=False, path=args.save_dir)
            results.save()
        et = timer()
        logging.info('Elapsed time: {0} seconds'.format(et - st))
        acc1, acc5 = evaluate(args, amodel, model, device, val_loader, training=True, beta=beta)
        logging.info('End of training, Val-Acc: {acc1:.2f} (Best: {best:.2f}), Val-Acc5: {acc5:.2f}'.format(acc1=acc1, best=BEST_ACC, acc5=acc5))
        # load saved model
        saved_model = torch.load(args.save_name)
        model.load_state_dict(saved_model['state_dict'])
        beta = saved_model['beta']
    # end of training

    # eval-set
    if args.tanh:
        dotanh(args, model, beta=beta)
    if args.eval_set != 'TRAIN' and args.eval_set != 'TEST':
        print('Evaluation set "{0}" not recognized ...'.format(args.eval_set))
    logging.info('Evaluating fractional binarized-net on the {0} set...'.format(args.eval_set))
    st = timer()
    if args.eval_set == 'TRAIN':
        acc1, acc5 = evaluate(args, amodel, model, device, train_loader)
    else:
        acc1, acc5 = evaluate(args, amodel, model, device, test_loader)
    et = timer()
    logging.info('Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%'.format(acc1=acc1, acc5=acc5))
    logging.info('Elapsed time: {0} seconds'.format(et - st))

    doround(args, model)
    logging.info('Evaluating discrete binarized-net on the {0} set...'.format(args.eval_set))
    st = timer()
    if args.eval_set == 'TRAIN':
        acc1, acc5 = evaluate(args, amodel, model, device, train_loader)
    else:
        acc1, acc5 = evaluate(args, amodel, model, device, test_loader)
    et = timer()
    logging.info('Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%'.format(acc1=acc1, acc5=acc5))
    logging.info('Elapsed time: {0} seconds'.format(et - st))
def main():
    """Train (optionally) and evaluate a 4-class ResNet18 pileogram classifier.

    Logs misclassified test-image paths to wrong.txt and prints a confusion
    matrix of (true class -> predicted class) counts.
    """
    start_time = time()
    torch.manual_seed(7)
    # np.random.seed(0)
    mode = 'train'
    #############
    # mode = 'test'
    transform = transforms.Compose([
        transforms.Grayscale(),
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ])
    ds = PileogramDataset(REPEATS_TRAIN, CHIMERIC_TRAIN, REGULAR_TRAIN, JUNK_TRAIN, transform=transform)
    # 80/20 train/validation split of the training dataset.
    num_samples = len(ds)
    val_size = round(num_samples * 0.2)
    train_size = num_samples - val_size
    ds_train, ds_val = random_split(ds, [train_size, val_size])
    dl_train = DataLoader(ds_train, batch_size=BATCH, shuffle=True, num_workers=2, pin_memory=True)
    dl_val = DataLoader(ds_val, batch_size=BATCH, shuffle=False, num_workers=2, pin_memory=True)
    ds_test = PileogramDataset(REPEATS_TEST, CHIMERIC_TEST, REGULAR_TEST, JUNK_TEST, transform=transform)
    dl_test = DataLoader(ds_test, batch_size=1, shuffle=False, num_workers=2, pin_memory=True)

    net = models.ResNet18(num_classes=4)
    # if device.type == 'cuda' and torch.cuda.device_count() > 1:
    #     net = nn.DataParallel(net)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # Use cuda if possible
    # device = torch.device('cpu')  # Force using cpu
    print(f"Using device: {device}")
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    # optimizer = optim.Adam(net.parameters(), lr=3e-5, betas=(0.9, 0.999))
    optimizer = optim.RMSprop(net.parameters(), lr=3e-5)

    history_train = []
    history_val = []
    acc_train = []
    acc_valid = []

    if mode == 'train':
        for epoch in range(EPOCHS):
            # ---- training pass ----
            total_loss = 0.0
            iteration = 0
            total = 0
            correct = 0
            net.train()
            for data in dl_train:
                iteration += 1
                inputs = data['image'].to(device, non_blocking=True)
                labels = data['label'].to(device, non_blocking=True)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # running_loss += loss.item()
                total_loss += loss.item()
                total += labels.size(0)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()
            accuracy = 100 * correct / total
            print(f"Epoch {epoch + 1}:\tTrain loss = {total_loss / iteration}\tAccuracy = {round(accuracy, 2)}%")
            history_train.append((epoch + 1, total_loss / iteration))
            acc_train.append((epoch + 1, accuracy))

            # ---- validation pass ----
            total_loss = 0.0
            iteration = 0
            total = 0
            correct = 0
            net.eval()
            with torch.no_grad():
                for data in dl_val:
                    iteration += 1
                    images = data['image'].to(device)
                    labels = data['label'].to(device)
                    outputs = net(images)
                    loss = criterion(outputs, labels)
                    total_loss += loss.item()
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            accuracy = 100 * correct / total
            print(f"Epoch {epoch + 1}:\tVal loss = {total_loss / iteration},\tAccuracy = {round(accuracy, 2)}%")
            history_val.append((epoch + 1, total_loss / iteration))
            acc_valid.append((epoch + 1, accuracy))
            # BUG FIX: the original test `acc_valid[-1] > max(acc_valid[:-1])`
            # compared (epoch, accuracy) TUPLES; tuples compare by their first
            # element (epoch) and the newest entry always has the largest
            # epoch, so the condition was always true and the checkpoint was
            # overwritten every epoch. Compare the accuracy values instead.
            if epoch == 0 or accuracy > max(acc for _, acc in acc_valid[:-1]):
                torch.save(net.state_dict(), PARAM_PATH)
        training_time = time()
        print(f"Finished Training. Training time: {training_time - start_time} s")

    # visualizer.draw_training_curve(history_train, history_val)
    # visualizer.draw_accuracy_curve(acc_train, acc_valid)

    # ---- evaluation on the held-out test set (best saved weights) ----
    correct = 0
    total = 0
    net.load_state_dict(torch.load(PARAM_PATH))
    net.eval()
    guess_repeat = []
    guess_chim = []
    guess_regular = []
    guess_junk = []
    eval_time_start = time()
    with torch.no_grad(), open('wrong.txt', 'w') as f:
        for data in dl_test:
            images = data['image'].to(device, non_blocking=True)
            labels = data['label'].to(device, non_blocking=True)
            paths = data['path'][0]
            # print(paths)
            # print(type(paths))
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # Record the prediction per true class; log misclassified paths.
            if labels == 0:
                guess_repeat.append(predicted.item())
                if predicted.item() != 0:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)
            elif labels == 1:
                guess_chim.append(predicted.item())
                if predicted.item() != 1:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)
            elif labels == 2:
                guess_regular.append(predicted.item())
                if predicted.item() != 2:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)
            else:
                guess_junk.append(predicted.item())
                if predicted.item() != 3:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)
    eval_time_end = time()
    print(f"Accuracy of the network on the test set: {100 * correct / total}%.")
    print(f"Evalutaion time: {eval_time_end - eval_time_start} s.")

    # Confusion-matrix rows: counts of each predicted class per true class.
    conf_repeat = (sum([l == 0 for l in guess_repeat]), sum([l == 1 for l in guess_repeat]),
                   sum([l == 2 for l in guess_repeat]), sum([l == 3 for l in guess_repeat]))
    conf_chim = (sum([l == 0 for l in guess_chim]), sum([l == 1 for l in guess_chim]),
                 sum([l == 2 for l in guess_chim]), sum([l == 3 for l in guess_chim]))
    conf_regular = (sum([l == 0 for l in guess_regular]), sum([l == 1 for l in guess_regular]),
                    sum([l == 2 for l in guess_regular]), sum([l == 3 for l in guess_regular]))
    conf_junk = (sum([l == 0 for l in guess_junk]), sum([l == 1 for l in guess_junk]),
                 sum([l == 2 for l in guess_junk]), sum([l == 3 for l in guess_junk]))
    print_confusion(conf_repeat, conf_chim, conf_regular, conf_junk)
def __init__(self, args):
    """Build data loaders, model, optimizer, LR schedule, checkpoint restore
    and the PGD attacker for adversarial training.

    :param args: parsed CLI namespace (dataset, data_root, batch_size, seed,
        lr, weight_decay, save_path, restore, attack_eps, ...)
    :raises ValueError: if ``args.dataset`` is neither 'MNIST' nor 'CIFAR10'
    """
    self.args = args

    # Creating data loaders
    kwargs = {'num_workers': 4, 'pin_memory': True}
    if args.dataset == 'MNIST':
        # setup data loader
        self.train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('../data', train=True, download=True,
                           transform=T.ToTensor()),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        self.val_loader = torch.utils.data.DataLoader(
            datasets.MNIST('../data', train=False, transform=T.ToTensor()),
            batch_size=args.batch_size, shuffle=False, **kwargs)
        # initialize model (seeded so weight init is reproducible)
        torch.manual_seed(args.seed)
        self.model = models.SmallCNN()
    elif args.dataset == 'CIFAR10':
        transform_train = T.Compose([
            T.Pad(4, padding_mode='reflect'),
            T.RandomCrop(32),
            T.RandomHorizontalFlip(),
            T.ToTensor()
        ])
        transform_test = T.Compose([T.ToTensor()])
        self.train_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(args.data_root, train=True, download=True,
                             transform=transform_train),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        # BUG FIX: the validation loader used shuffle=True; iteration order is
        # irrelevant to evaluation metrics and the MNIST branch uses
        # shuffle=False — made consistent.
        self.val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(args.data_root, train=False,
                             transform=transform_test),
            batch_size=args.batch_size, shuffle=False, **kwargs)
        # initialize model (seeded so weight init is reproducible)
        torch.manual_seed(args.seed)
        self.model = models.ResNet18()
    else:
        # Previously an unknown dataset fell through silently and crashed
        # later with an AttributeError on self.model; fail fast instead.
        raise ValueError('Unsupported dataset: {}'.format(args.dataset))

    self.model = torch.nn.DataParallel(self.model).cuda()
    self.optimizer = optim.SGD(self.model.parameters(), args.lr, momentum=0.9,
                               weight_decay=args.weight_decay)
    self.lr_scheduler = optim.lr_scheduler.MultiStepLR(
        self.optimizer, milestones=[60, 120, 160], gamma=0.2)
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in self.model.parameters()])))

    self.save_path = args.save_path
    self.epoch = 0
    # resume from checkpoint
    ckpt_path = osp.join(self.save_path, 'checkpoint.pth')
    if osp.exists(ckpt_path):
        self._load_from_checkpoint(ckpt_path)
    elif args.restore:
        self._load_from_checkpoint(args.restore)

    cudnn.benchmark = True
    self.attacker = PGDAttacker(args.attack_eps)
extra_loaders = [] if len(args.extra_data) > 0: ex_data = args.extra_data.split(':') ex_batch_size = args.extra_data_bsize.split(':') for i in range(len(ex_data)): if ex_data[i].split('+')[0] == 'wm': tmp = ex_data[i].split('+') _loader = getwmloader(tmp[1], int(ex_batch_size[i]), tmp[2]) else: _loader, _, __ = getdataloader(ex_data[i], args.train_db_path, args.test_db_path, int(ex_batch_size[i]), 4) extra_loaders.append(batch_gen(_loader)) # Loading model. print('==> loading model...') if args.load_path == 'resnet18': net = models.ResNet18(num_classes=n_classes) else: checkpoint = torch.load(args.load_path) net = checkpoint['net'] acc = checkpoint['acc'] start_epoch = 0#checkpoint['epoch'] net = net.to(device) # support cuda if device == 'cuda': print('Using CUDA') print('Parallel training on {0} GPUs.'.format(torch.cuda.device_count())) net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count())) cudnn.benchmark = True if args.wm_afs:
return x if args.resume: print('===> Resuming from checkpoint...') assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' checkpoint = torch.load('./checkpoint/ckpt2.t7') net = checkpoint['net'] best_acc = checkpoint['acc'] start_epoch = checkpoint['epoch'] else: print('===> Building model...') # net = BasicNet() # net = googlenet.GoogLeNet() net = models.ResNet18() if args.distributed: print('===> Distributed Training Mode') dist.init_process_group(backend=args.backend, init_method=args.dist_url, rank=args.rank, world_size=args.world_size) if args.distributed: if args.use_cuda: print('===> DistributedDataParallel') net.to(device) net = torch.nn.parallel.DistributedDataParallel(net) else: print('===> DistributedDataParallelCPU')
def setup_and_run(args, criterion, device, train_loader, test_loader, val_loader, logging, results):
    """Train and/or evaluate the continuous (non-binarized) network.

    Builds model/optimizer/scheduler from ``args``, optionally resumes or
    loads a checkpoint, runs the training loop, then evaluates on the set
    selected by ``args.eval_set``.

    BUG FIX: Python-2 ``print`` statements were rewritten as ``print()``
    calls so this module also parses under Python 3; output is unchanged.
    """
    global BEST_ACC
    print('\n#### Running continuous-net ####')

    # architecture
    if 'VGG' in args.architecture:
        assert (args.architecture == 'VGG11' or args.architecture == 'VGG13'
                or args.architecture == 'VGG16' or args.architecture == 'VGG19')
        model = models.VGG(args.architecture, args.input_channels, args.im_size, args.output_dim).to(device)
    elif args.architecture == 'RESNET18':
        model = models.ResNet18(args.input_channels, args.im_size, args.output_dim).to(device)
    else:
        print('Architecture type "{0}" not recognized, exiting ...'.format(args.architecture))
        exit()

    # optimizer
    if args.optimizer == 'ADAM':
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum,
                              nesterov=args.nesterov, weight_decay=args.weight_decay)
    else:
        print('Optimizer type "{0}" not recognized, exiting ...'.format(args.optimizer))
        exit()

    # lr-scheduler
    if args.lr_decay == 'STEP':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_scale)
    elif args.lr_decay == 'EXP':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_scale)
    elif args.lr_decay == 'MSTEP':
        x = args.lr_interval.split(',')
        lri = [int(v) for v in x]
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=lri, gamma=args.lr_scale)
        args.lr_interval = 1  # lr_interval handled in scheduler!
    else:
        print('LR decay type "{0}" not recognized, exiting ...'.format(args.lr_decay))
        exit()

    init_weights(model, xavier=True)
    logging.info(model)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("Number of parameters: %d", num_parameters)

    start_epoch = -1
    iters = 0  # total no of iterations, used to do many things!

    # optionally resume from a checkpoint
    if args.eval:
        logging.info('Loading checkpoint file "{0}" for evaluation'.format(args.eval))
        if not os.path.isfile(args.eval):
            print('Checkpoint file "{0}" for evaluation not recognized, exiting ...'.format(args.eval))
            exit()
        checkpoint = torch.load(args.eval)
        model.load_state_dict(checkpoint['state_dict'])
    elif args.resume:
        checkpoint_file = args.resume
        logging.info('Loading checkpoint file "{0}" to resume'.format(args.resume))
        if not os.path.isfile(checkpoint_file):
            print('Checkpoint file "{0}" not recognized, exiting ...'.format(checkpoint_file))
            exit()
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch']
        assert (args.architecture == checkpoint['architecture'])
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        BEST_ACC = checkpoint['best_acc1']
        iters = checkpoint['iters']
        logging.debug('best_acc1: {0}, iters: {1}'.format(BEST_ACC, iters))

    if not args.eval:
        logging.info('Training...')
        model.train()
        st = timer()
        for e in range(start_epoch + 1, args.num_epochs):
            for i, (data, target) in enumerate(train_loader):
                l = train_step(model, device, data, target, optimizer, criterion)
                if i % args.log_interval == 0:
                    acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
                    logging.info(
                        'Epoch: {0},\t Iter: {1},\t Loss: {loss:.5f},\t Val-Acc1: {acc1:.2f} '
                        '(Best: {best:.2f}),\t Val-Acc5: {acc5:.2f}'.format(
                            e, i, loss=l, acc1=acc1, best=BEST_ACC, acc5=acc5))
                if iters % args.lr_interval == 0:
                    lr = args.learning_rate
                    for param_group in optimizer.param_groups:
                        lr = param_group['lr']
                    scheduler.step()
                    for param_group in optimizer.param_groups:
                        if lr != param_group['lr']:
                            logging.info('lr: {0}'.format(param_group['lr']))  # print if changed
                iters += 1
            # save checkpoint
            acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
            results.add(epoch=e, iteration=i, train_loss=l, val_acc1=acc1,
                        best_val_acc1=BEST_ACC)
            util.save_checkpoint(
                {
                    'epoch': e,
                    'architecture': args.architecture,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'best_acc1': BEST_ACC,
                    'iters': iters
                },
                is_best=False,
                path=args.save_dir)
            results.save()
        et = timer()
        logging.info('Elapsed time: {0} seconds'.format(et - st))
        acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
        logging.info(
            'End of training, Val-Acc: {acc1:.2f} (Best: {best:.2f}), Val-Acc5: {acc5:.2f}'
            .format(acc1=acc1, best=BEST_ACC, acc5=acc5))
        # load saved model
        saved_model = torch.load(args.save_name)
        model.load_state_dict(saved_model['state_dict'])
    # end of training

    # eval-set
    if args.eval_set != 'TRAIN' and args.eval_set != 'TEST':
        print('Evaluation set "{0}" not recognized ...'.format(args.eval_set))
    logging.info('Evaluating continuous-net on the {0} set...'.format(args.eval_set))
    st = timer()
    if args.eval_set == 'TRAIN':
        acc1, acc5 = evaluate(args, model, device, train_loader)
    else:
        acc1, acc5 = evaluate(args, model, device, test_loader)
    et = timer()
    logging.info('Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%'.format(acc1=acc1, acc5=acc5))
    logging.info('Elapsed time: {0} seconds'.format(et - st))
def cl_streaming(args):
    """Run one continual/streaming-learning experiment on SplitCIFAR.

    Selects a buffer-management method (reservoir / cbrs / coreset for the
    imbalanced streaming setting, or one of ``cl_methods`` otherwise) and
    writes per-task test accuracies as JSON to a results file.

    :param args: namespace with seed, nr_epochs, beta, device, method,
        buffer_size, stream_batch_size and dataset
    :raises ValueError: for an invalid dataset/method combination
    """
    seed = args.seed
    nr_epochs = args.nr_epochs
    beta = args.beta
    device = args.device
    method = args.method
    buffer_size = args.buffer_size
    stream_batch_size = args.stream_batch_size
    dataset = args.dataset
    # Streaming (imbalanced) runs use a single slot; task-based runs use 10.
    if dataset == 'stream_imbalanced_splitcifar':
        nr_slots = 1
    else:
        nr_slots = 10
    generator = SplitCifar(imbalanced=dataset == 'stream_imbalanced_splitcifar')
    train_loaders = []
    test_loaders = []
    train_inds_list = []
    # Materialize per-task train/test loaders up front.
    for i in range(generator.max_iter):
        train_inds, test_inds = generator.next_task()
        train_inds_list.append(train_inds)
        train_loaders.append(get_custom_loader(generator.train_dataset, train_inds))
        test_loaders.append(get_custom_loader(generator.test_dataset, test_inds))
    model = models.ResNet18().to(device)
    training_op = Training(model, device, nr_epochs, beta=beta)
    kernel_fn = get_kernel_fn()
    bc = bilevel_coreset.BilevelCoreset(outer_loss_fn=loss_utils.cross_entropy,
                                        inner_loss_fn=loss_utils.cross_entropy, out_dim=10,
                                        max_outer_it=1, candidate_batch_size=600,
                                        max_inner_it=300, logging_period=1000)

    # Closure used by the coreset-based methods. NOTE: it reads ``inner_reg``
    # late (at call time), so the assignment just below still takes effect.
    def coreset_builder_fn(X, y, m, data_weights):
        return bc.build_with_representer_proxy_batch(X, y, m, kernel_fn,
                                                     data_weights=data_weights,
                                                     cache_kernel=True, start_size=1,
                                                     inner_reg=inner_reg)

    inner_reg = 1e-3
    if dataset == 'stream_imbalanced_splitcifar':
        # Streaming setting: pick one of the three buffer strategies.
        if method == 'reservoir':
            training_op = reservoir_buffer(generator, stream_batch_size, buffer_size, training_op)
        elif method == 'cbrs':
            training_op = cbrs(generator, stream_batch_size, buffer_size, training_op)
        elif method == 'coreset':
            training_op = streaming_coreset(generator, stream_batch_size, buffer_size,
                                            training_op, coreset_builder_fn, nr_slots)
        else:
            raise ValueError("Invalid dataset - method combination")
    else:
        # Task-based continual-learning setting.
        if method not in cl_methods:
            raise ValueError("Invalid dataset - method combination")
        training_op = train_with_buffer(generator, buffer_size, training_op, train_loaders,
                                        train_inds_list, model, method, device,
                                        coreset_builder_fn)
    result = get_test_accuracy(generator, test_loaders, training_op)
    # Results directory depends on the experiment flavour.
    filename = '{}_{}_{}_{}_{}.txt'.format(dataset, method, buffer_size, beta, seed)
    results_path = 'cl_results'
    if dataset == 'stream_imbalanced_splitcifar':
        results_path = 'streaming_results'
    if not os.path.exists(results_path):
        os.makedirs(results_path)
    with open(os.path.join(results_path, filename), 'w') as outfile:
        json.dump({'test_acc': np.mean(result), 'acc_per_task': result}, outfile)
import models

# Registry mapping a model name to a factory taking a kwargs dict.
name_to_model = {
    'LeNet': lambda args: models.LeNet(**args),
    'AlexNet': lambda args: models.AlexNet(**args),
    'MLP': lambda args: models.MLP(**args),
    'ResNet18': lambda args: models.ResNet18(**args),
    'PResNet18': lambda args: models.PResNet18(**args),
    'Permutation': lambda args: models.TensorPermutation(32, 32, **args),
    'ResNet20Original': lambda args: models.resnet20original(),
    'MobileNet': lambda args: models.MobileNet(**args),
    'ShuffleNet': lambda args: models.ShuffleNetG2(),
    'WideResNet28': lambda args: models.WideResNet28(**args),
}


def get_model(model_config):
    """Instantiate the model described by ``model_config``.

    :param model_config: dict with 'name' (a ``name_to_model`` key) and an
        optional 'args' dict of constructor keyword arguments
    :return: the constructed model instance
    :raises KeyError: if 'name' is missing or unknown
    """
    name = model_config['name']
    # BUG FIX: the original passed `model_config.get('args', None)`, so any
    # factory that does `Model(**args)` raised TypeError whenever 'args'
    # was omitted. Default to an empty kwargs dict instead.
    return name_to_model[name](model_config.get('args') or {})
# Fragment of a sequential model-benchmarking script. Pattern per section:
# announce the run, train via utils.train with the shared hyper-parameter
# dict, then delete the model (presumably to free GPU memory — confirm).
print(f'\n\n************** start new model : {project_name} ******************')
utils.train(hyper_param_dict, model, device)
del model

# run GoogleNet with BN
model = models.GoogLeNet_w_bn()
model.to(device)
project_name = 'GoogLeNet_w_bn'
hyper_param_dict['project'] = project_name
hyper_param_dict['lr'] = 0.01  # per-model learning-rate override
print(f'\n\n************** start new model : {project_name} ******************')
utils.train(hyper_param_dict, model, device)
del model

# run ResNet18
model = models.ResNet18()
model.to(device)
project_name = 'ResNet18'
hyper_param_dict['project'] = project_name
hyper_param_dict['lr'] = 0.03
hyper_param_dict['batch'] = 256  # ResNet18 also overrides the batch size
print(f'\n\n************** start new model : {project_name} ******************')
utils.train(hyper_param_dict, model, device)
del model

# run ResNet34 (inherits lr/batch overrides left in hyper_param_dict above)
model = models.ResNet34()
model.to(device)
project_name = 'ResNet34'
hyper_param_dict['project'] = project_name
print(f'\n\n************** start new model : {project_name} ******************')
def main():
    """Entry point: self-supervised pretraining (``--do_ssl``) and/or
    supervised retrieval evaluation (``--do_sl``) of a ResNet18 on STL-10.

    Side effects: creates ``args.model_dir``, downloads STL-10 if requested,
    and saves a timestamped checkpoint after SSL training.
    """
    args = parse_args()
    if args.deterministic:
        # Seed every RNG source for reproducible runs.
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True
    os.makedirs(args.model_dir, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_transforms = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mean, std)])
    # NOTE(review): this models.ResNet18 takes (num_patches, num_angles) — a
    # project-local SSL variant, not torchvision's signature.
    model = models.ResNet18(args.num_patches, args.num_angles)
    model = torch.nn.DataParallel(model)
    if args.do_ssl:
        stl_unlabeled = datasets.STL10(root=args.data_dir, split='unlabeled',
                                       transform=input_transforms, download=args.download)
        # 90/10 contiguous split of the unlabeled set into train/val indices.
        indices = list(range(len(stl_unlabeled)))
        train_indices = indices[:int(len(indices) * 0.9)]
        val_indices = indices[int(len(indices) * 0.9):]
        dataloaders = {
            "train": DataLoader(SSLTrainDataset(Subset(stl_unlabeled, train_indices),
                                                args.num_patches, args.num_angles, args.poisson_rate),
                                shuffle=True, batch_size=args.ssl_train_batch_size, pin_memory=True),
            "val": DataLoader(SSLValDataset(Subset(stl_unlabeled, val_indices),
                                            args.num_patches, args.num_angles),
                              shuffle=False, batch_size=args.ssl_val_batch_size, pin_memory=True)
        }
        # checkpoint = torch.load(os.path.join(args.model_dir, f"{args.model_name}"),
        #                         map_location=lambda storage, loc: storage.cuda(0))
        # model.load_state_dict(checkpoint['state_dict'])
        # model.load_state_dict(torch.load(os.path.join(args.model_dir, f"{args.model_name}")))
        # dataloaders["train"].dataset.set_poisson_rate(args.poisson_rate)
        args.mean, args.std = mean, std
        # train.gen_grad_map(device, model, dataloaders["val"], args)
        model, best_val_accuracy = train.ssl_train(device, model, dataloaders, args)
        # Checkpoint name encodes the wall-clock time and best val accuracy.
        model_name = time.ctime().replace(" ", "_").replace(":", "_")
        model_name = f"{model_name}_{best_val_accuracy:.4f}.pt"
        torch.save(model.state_dict(), os.path.join(args.model_dir, model_name))
    if args.do_sl:
        if args.model_name is None:
            raise ValueError("Model name must be specified")
        stl_train = datasets.STL10(root=args.data_dir, split='train',
                                   transform=input_transforms, download=args.download)
        args.num_classes = len(stl_train.classes)
        fold_indices = sl_train.stl_get_train_folds(
            os.path.join(args.data_dir, "stl10_binary/fold_indices.txt"))
        stl_test = datasets.STL10(root=args.data_dir, split='test',
                                  transform=input_transforms, download=args.download)
        dataloaders = {
            "test": DataLoader(stl_test, shuffle=False, batch_size=args.test_batch_size,
                               pin_memory=True)
        }
        # Load the pretrained SSL weights onto GPU 0.
        checkpoint = torch.load(
            os.path.join(args.model_dir, f"{args.model_name}"),
            map_location=lambda storage, loc: storage.cuda(0))
        model.load_state_dict(checkpoint['state_dict'])
        # model.load_state_dict(torch.load(os.path.join(args.model_dir, f"{args.model_name}")))
        # model.init_classifier(args.num_classes, freeze_params=False)
        args.mean, args.std = mean, std
        # Use the last training image as the retrieval query.
        query_img, _ = stl_train[-1]
        dataloader = DataLoader(stl_train, batch_size=128, shuffle=False, pin_memory=True)
        top_images, top_labels = train.retrieve_topk_images(
            device, model, query_img, dataloader, args)
def training(args, *k, **kw):
    """Run GoSGD-style decentralized training on one edge node.

    Coordinates with peer edges through a Redis helper: each epoch it merges
    parameters pushed by peers (score-weighted average), trains on a
    "critical" subset of the local data, then pushes its own parameters and
    half of its model score to a randomly chosen edge.

    :param args: parsed CLI namespace (gpu, host/port, model, lr, epoch, ...).
    :returns: the trained network (an ``nn.Module``).
    """
    # if use gpus
    device = torch.device("cuda:{}".format(args.gpuindex) if torch.cuda.is_available() and args.gpu else "cpu")
    print("user device: {}".format(device))
    # Redis helper: registers this edge and blocks until all peers joined.
    redis_helper = redishelper.GoSGDHelper(host=args.host, port=args.port)
    redis_helper.signin()
    while redis_helper.cur_edge_num() < args.edgenum:
        time.sleep(1)  # sleep 1 second
    model_score = 1.0 / args.edgenum  # the initial model parameters score
    # log_file and summary path
    log_file = "{0}-{1}-edge-{2}.log".format(time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())), args.model, redis_helper.ID)
    log_dir = "tbruns/{0}-{1}-cifar10-edge-{2}".format(time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())), args.model, redis_helper.ID)
    logger = open(log_file, 'w')
    swriter = SummaryWriter(log_dir)
    # load training data
    trainset = dataset.AGGData(root=args.dataset, train=True, download=False, transform=None)
    testset = dataset.AGGData(root=args.dataset, train=False, download=False, transform=None)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.batchsize, shuffle=False, num_workers=0)
    # construct neural network (falls back to ResNet152 for unknown names)
    net = None
    if args.model == "lenet5":
        net = models.LeNet5()
    elif args.model == "resnet18":
        net = models.ResNet18()
    elif args.model == "alexnet":
        net = models.AlexNet(args.num_classes)
    elif args.model == "alexnetimg8":
        net = models.AlexNetImg8(args.num_classes)
    elif args.model == "squeezenet":
        net = models.SqueezeNet()
    elif args.model == "mobilenetv2":
        net = models.MobileNetV2()
    elif args.model == "resnet34":
        net = models.ResNet34()
    elif args.model == "resnet50":
        net = models.ResNet50()
    elif args.model == "resnet101":
        net = models.ResNet101()
    else:
        net = models.ResNet152()
    net.to(device)
    # define optimizer; per-sample losses (reduction='none') are used by the
    # critical-sample identification step.
    criterion = nn.CrossEntropyLoss()
    criterion_loss = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(args.lrschstep), gamma=0.1)
    # start training
    wallclock = 0.0
    iteration = 0  # global iterations
    for epoch in range(0, args.epoch, 1):
        starteg = time.time()
        # merge parameters of other edge: score-weighted running average of
        # every parameter set pushed to this edge since last epoch.
        if epoch > 0:
            mintime, maxtime, param_list = redis_helper.min2max_time_params()
            print("The min/max time cost of last epoch: {}/{}".format(mintime, maxtime))
            for item in param_list:
                w1 = model_score / (model_score + item[0])
                w2 = item[0] / (model_score + item[0])
                for local, other in zip(net.parameters(), item[1]):
                    local.data = local.data * w1 + other.data.to(device) * w2
                model_score = model_score + item[0]
            while redis_helper.finish_update() is False:
                time.sleep(1.0)
        critical_extra_start = time.time()
        # identify critical training samples
        critrainset = critical_identify(net, trainset, criterion_loss, device, args)
        critrainloader = torch.utils.data.DataLoader(critrainset, batch_size=args.batchsize, shuffle=True, num_workers=0)
        critical_extra_cost = time.time() - critical_extra_start
        training_start = time.time()
        running_loss = 0.0
        record_running_loss = 0.0
        for i, data in enumerate(critrainloader, 0):
            iteration += 1
            # get the inputs
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.squeeze().to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            record_running_loss += loss.item()
            if i % 10 == 9:
                # TensorBoard: 10-batch mean loss on a global-step axis.
                swriter.add_scalar("Training loss", record_running_loss / 10, epoch * len(critrainloader) + i)
                record_running_loss = 0.0
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        training_cost = time.time() - training_start
        # push time and parameters to Redis: give away half the score to a
        # random peer (possibly ourselves), GoSGD-style.
        model_score = model_score / 2
        sel_edge_id = redis_helper.random_edge_id(can_be_self=True)
        paramls = list(map(lambda x: x.cpu(), list(net.parameters())))
        redis_helper.ins_time_params(sel_edge_id, training_cost, model_score, paramls)
        while not redis_helper.finish_push():
            time.sleep(1.0)
        wallclock += time.time() - starteg
        total, kaccuracy = validation(net, testloader, device, topk=(1, 5))
        curtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        _header = "[ {} Epoch {} /Iteration {} Wallclock {}]".format(curtime, epoch + 1, iteration, wallclock)
        print('{} Accuracy of the network on the {} test images: {} %'.format(_header, total, kaccuracy_str(kaccuracy)))
        # NOTE(review): the console uses kaccuracy_str() but the log file uses
        # accuracy_str() — confirm both helpers exist; this looks like an
        # inconsistency (possible NameError if accuracy_str is undefined).
        logger.write('{},{},{},{}\n'.format(epoch + 1, iteration, wallclock, accuracy_str(kaccuracy)))
        logger.flush()  # write to disk
        for item in kaccuracy:
            swriter.add_scalar("Top{}Accuracy".format(item[0]), item[1], epoch)
        # adopt learning rate of optimizer
        if args.lrscheduler:
            lr_scheduler.step()
    print('Finished Training')
    redis_helper.register_out()
    logger.close()  # close log file writer
    return net
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker for (optionally distributed) SSL training on ImageNet.

    Builds a patch/angle ResNet18, optionally resumes from a checkpoint, and
    runs the train/validate loop, checkpointing the best validation accuracy
    and ramping the Poisson rate every ``args.learn_prd`` epochs.

    :param gpu: local GPU index assigned to this process.
    :param ngpus_per_node: number of GPUs on this node (used for global rank).
    :param args: parsed CLI namespace.
    """
    global best_acc
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    model = models.ResNet18(args.num_patches, args.num_angles)
    if args.distributed:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimiser = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # optionally resume from a checkpoint (restores epoch, best accuracy,
    # Poisson rate, and model/optimiser state)
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            args.poisson_rate = checkpoint["poisson_rate"]
            model.load_state_dict(checkpoint['state_dict'])
            optimiser.load_state_dict(checkpoint['optimiser'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    input_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(225),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    train_dir = os.path.join(args.data, 'train')
    val_dir = os.path.join(args.data, 'val')
    imagenet_train = datasets.ImageFolder(root=train_dir, transform=input_transforms)
    train_dataset = SSLTrainDataset(imagenet_train, args.num_patches, args.num_angles)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    imagenet_val = datasets.ImageFolder(root=val_dir, transform=input_transforms)
    val_dataset = SSLValDataset(imagenet_val, args.num_patches, args.num_angles)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size,
                                             shuffle=False, num_workers=args.workers,
                                             pin_memory=True)
    if args.evaluate:
        # Evaluation-only mode: validate once and exit.
        validate(val_loader, model, criterion, args)
        return
    writer = SummaryWriter()
    train_loader.dataset.set_poisson_rate(args.poisson_rate)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle shards deterministically per epoch.
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_loss, train_acc = train(train_loader, model, criterion, optimiser, epoch, args)
        # evaluate on validation set
        val_loss, val_acc = validate(val_loader, model, criterion, args)
        # remember best acc@1 and save checkpoint (only on the rank-0 process
        # of each node when multiprocessing-distributed)
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimiser': optimiser.state_dict(),
                "poisson_rate": args.poisson_rate
            }, is_best)
        # Curriculum: raise the Poisson rate every learn_prd epochs.
        if (epoch + 1) % args.learn_prd == 0:
            args.poisson_rate += 1
            train_loader.dataset.set_poisson_rate(args.poisson_rate)
        writer.add_scalars("Loss", {"train_loss": train_loss, "val_loss": val_loss}, epoch)
        writer.add_scalars("Accuracy", {"train_acc": train_acc, "val_acc": val_acc}, epoch)
        writer.add_scalar("Poisson_Rate", train_loader.dataset.pdist.rate, epoch)
    writer.close()
def main():
    """Apex/AMP entry point for (optionally distributed) SSL training.

    Initializes NCCL process groups from torchrun-style env vars, wraps the
    model with apex AMP (and apex DDP when distributed), then runs the
    train/validate loop with master-rank checkpointing and TensorBoard logs.
    """
    global best_acc, mean, std, scale
    args = parse_args()
    args.mean, args.std, args.scale = mean, std, scale
    # local_rank 0 is the "master" process: it alone prints, saves, and logs.
    args.is_master = args.local_rank == 0
    if args.deterministic:
        cudnn.deterministic = True
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
    # WORLD_SIZE is set by the distributed launcher; >1 means multi-process.
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.is_master:
        print("opt_level = {}".format(args.opt_level))
        print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32), type(args.keep_batchnorm_fp32))
        print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))
        print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))
        print(f"Distributed Training Enabled: {args.distributed}")
    args.gpu = 0
    args.world_size = 1
    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    # Scale learning rate based on global batch size
    # args.lr *= args.batch_size * args.world_size / 256
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    # create model
    model = models.ResNet18(args.num_patches, args.num_angles)
    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)
    model = model.cuda()
    optimiser = Ranger(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss().cuda()
    # Initialize Amp. Amp accepts either values or strings for the optional
    # override arguments, for convenient interoperation with argparse.
    model, optimiser = amp.initialize(
        model, optimiser,
        opt_level=args.opt_level,
        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        loss_scale=args.loss_scale)
    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize. If model = DDP(model) is called
    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
    if args.distributed:
        model = DDP(model, delay_allreduce=True)
    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            global best_acc
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                # Remap storages onto this process's GPU.
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_acc = checkpoint['best_acc']
                args.poisson_rate = checkpoint["poisson_rate"]
                model.load_state_dict(checkpoint['state_dict'])
                optimiser.load_state_dict(checkpoint['optimiser'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))
        resume()
    # Data loading code. ToTensor/Normalize are intentionally absent: batches
    # are assembled by fast_collate and normalized downstream.
    train_dir = os.path.join(args.data, 'train')
    val_dir = os.path.join(args.data, 'val')
    crop_size = 225
    val_size = 256
    imagenet_train = datasets.ImageFolder(
        root=train_dir,
        transform=transforms.Compose([
            transforms.RandomResizedCrop(crop_size),
        ]))
    train_dataset = SSLTrainDataset(imagenet_train, args.num_patches, args.num_angles, args.poisson_rate)
    imagenet_val = datasets.ImageFolder(root=val_dir,
                                        transform=transforms.Compose([
                                            transforms.Resize(val_size),
                                            transforms.CenterCrop(crop_size),
                                        ]))
    val_dataset = SSLValDataset(imagenet_val, args.num_patches, args.num_angles)
    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=(train_sampler is None), num_workers=args.workers,
                              pin_memory=True, sampler=train_sampler, collate_fn=fast_collate)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size,
                            shuffle=False, num_workers=args.workers, pin_memory=True,
                            sampler=val_sampler, collate_fn=fast_collate)
    if args.evaluate:
        # Evaluation-only mode: validate once, report, and exit.
        val_loss, val_acc = apex_validate(val_loader, model, criterion, args)
        utils.logger.info(f"Val Loss = {val_loss}, Val Accuracy = {val_acc}")
        return
    # Create dir to save model and command-line args (master process only).
    if args.is_master:
        model_dir = time.ctime().replace(" ", "_").replace(":", "_")
        model_dir = os.path.join("models", model_dir)
        os.makedirs(model_dir, exist_ok=True)
        with open(os.path.join(model_dir, "args.json"), "w") as f:
            json.dump(args.__dict__, f, indent=2)
    writer = SummaryWriter()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_loss, train_acc = apex_train(train_loader, model, criterion, optimiser, args, epoch)
        # evaluate on validation set
        val_loss, val_acc = apex_validate(val_loader, model, criterion, args)
        if (epoch + 1) % args.learn_prd == 0:
            utils.adj_poisson_rate(train_loader, args)
        # remember best Acc and save checkpoint (master only; model_dir is
        # only defined on the master process)
        if args.is_master:
            is_best = val_acc > best_acc
            best_acc = max(val_acc, best_acc)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc': best_acc,
                    'optimiser': optimiser.state_dict(),
                    "poisson_rate": args.poisson_rate
                }, is_best, model_dir)
        writer.add_scalars("Loss", {
            "train_loss": train_loss,
            "val_loss": val_loss
        }, epoch)
        writer.add_scalars("Accuracy", {
            "train_acc": train_acc,
            "val_acc": val_acc
        }, epoch)
        writer.add_scalar("Poisson_Rate", train_loader.dataset.pdist.rate, epoch)
def setup_and_run(args, criterion, device, train_loader, test_loader, val_loader, logging, results):
    """Build the requested architecture/optimizer/scheduler, then train
    (unless ``args.eval``) and finally evaluate on ``args.eval_set``.

    :param args: parsed CLI namespace (architecture, optimizer, lr options,
        eval/resume checkpoints, ...).
    :param criterion: loss function used by ``train_step``.
    :param device: torch device for model and data.
    :param train_loader/test_loader/val_loader: data loaders.
    :param logging: logging module/logger used for progress output.
    :param results: results accumulator with ``add()``/``save()``.

    Unrecognized architecture/optimizer/LR-decay names print a message and
    call ``exit()``. Updates the module-global ``BEST_ACC``.
    """
    global BEST_ACC
    print("\n#### Running REF ####")
    # architecture
    if args.architecture == "MLP":
        model = models.MLP(args.input_dim, args.hidden_dim, args.output_dim).to(device)
    elif args.architecture == "LENET300":
        model = models.LeNet300(args.input_dim, args.output_dim).to(device)
    elif args.architecture == "LENET5":
        model = models.LeNet5(args.input_channels, args.im_size, args.output_dim).to(device)
    elif "VGG" in args.architecture:
        assert (args.architecture == "VGG11" or args.architecture == "VGG13"
                or args.architecture == "VGG16" or args.architecture == "VGG19")
        model = models.VGG(args.architecture, args.input_channels, args.im_size,
                           args.output_dim).to(device)
    elif args.architecture == "RESNET18":
        model = models.ResNet18(args.input_channels, args.im_size, args.output_dim).to(device)
    elif args.architecture == "RESNET34":
        model = models.ResNet34(args.input_channels, args.im_size, args.output_dim).to(device)
    elif args.architecture == "RESNET50":
        model = models.ResNet50(args.input_channels, args.im_size, args.output_dim).to(device)
    elif args.architecture == "RESNET101":
        model = models.ResNet101(args.input_channels, args.im_size, args.output_dim).to(device)
    elif args.architecture == "RESNET152":
        model = models.ResNet152(args.input_channels, args.im_size, args.output_dim).to(device)
    else:
        print('Architecture type "{0}" not recognized, exiting ...'.format(
            args.architecture))
        exit()
    # optimizer
    if args.optimizer == "ADAM":
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate,
                               weight_decay=args.weight_decay)
    elif args.optimizer == "SGD":
        optimizer = optim.SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            nesterov=args.nesterov,
            weight_decay=args.weight_decay,
        )
    else:
        print('Optimizer type "{0}" not recognized, exiting ...'.format(
            args.optimizer))
        exit()
    # lr-scheduler
    if args.lr_decay == "STEP":
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_scale)
    elif args.lr_decay == "EXP":
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_scale)
    elif args.lr_decay == "MSTEP":
        # Milestones arrive as a comma-separated string, e.g. "80,120".
        x = args.lr_interval.split(",")
        lri = [int(v) for v in x]
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=lri,
                                                   gamma=args.lr_scale)
        args.lr_interval = 1  # lr_interval handled in scheduler!
    else:
        print('LR decay type "{0}" not recognized, exiting ...'.format(
            args.lr_decay))
        exit()
    init_weights(model, xavier=True)
    logging.info(model)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("Number of parameters: %d", num_parameters)
    start_epoch = -1
    iters = 0  # total no of iterations, used to do many things!
    # optionally resume from a checkpoint
    if args.eval:
        logging.info('Loading checkpoint file "{0}" for evaluation'.format(
            args.eval))
        if not os.path.isfile(args.eval):
            print(
                'Checkpoint file "{0}" for evaluation not recognized, exiting ...'
                .format(args.eval))
            exit()
        checkpoint = torch.load(args.eval)
        model.load_state_dict(checkpoint["state_dict"])
    elif args.resume:
        checkpoint_file = args.resume
        logging.info('Loading checkpoint file "{0}" to resume'.format(
            args.resume))
        if not os.path.isfile(checkpoint_file):
            print('Checkpoint file "{0}" not recognized, exiting ...'.format(
                checkpoint_file))
            exit()
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint["epoch"]
        assert args.architecture == checkpoint["architecture"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        BEST_ACC = checkpoint["best_acc1"]
        iters = checkpoint["iters"]
        logging.debug("best_acc1: {0}, iters: {1}".format(BEST_ACC, iters))
    if not args.eval:
        logging.info("Training...")
        model.train()
        st = timer()
        for e in range(start_epoch + 1, args.num_epochs):
            for i, (data, target) in enumerate(train_loader):
                l = train_step(model, device, data, target, optimizer, criterion)
                if i % args.log_interval == 0:
                    acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
                    logging.info(
                        "Epoch: {0},\t Iter: {1},\t Loss: {loss:.5f},\t Val-Acc1: {acc1:.2f} "
                        "(Best: {best:.2f}),\t Val-Acc5: {acc5:.2f}".format(
                            e, i, loss=l, acc1=acc1, best=BEST_ACC, acc5=acc5))
                # Step the scheduler every lr_interval iterations (for MSTEP
                # the interval was forced to 1 above, so it steps every iter
                # and the milestones inside the scheduler take over).
                if iters % args.lr_interval == 0:
                    lr = args.learning_rate
                    for param_group in optimizer.param_groups:
                        lr = param_group["lr"]
                    scheduler.step()
                    for param_group in optimizer.param_groups:
                        if lr != param_group["lr"]:
                            logging.info("lr: {0}".format(
                                param_group["lr"]))  # print if changed
                iters += 1
            # save checkpoint
            acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
            results.add(
                epoch=e,
                iteration=i,
                train_loss=l,
                val_acc1=acc1,
                best_val_acc1=BEST_ACC,
            )
            util.save_checkpoint(
                {
                    "epoch": e,
                    "architecture": args.architecture,
                    "state_dict": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "best_acc1": BEST_ACC,
                    "iters": iters,
                },
                is_best=False,
                path=args.save_dir,
            )
            results.save()
        et = timer()
        logging.info("Elapsed time: {0} seconds".format(et - st))
        acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
        logging.info(
            "End of training, Val-Acc: {acc1:.2f} (Best: {best:.2f}), Val-Acc5: {acc5:.2f}"
            .format(acc1=acc1, best=BEST_ACC, acc5=acc5))
        # load saved model
        saved_model = torch.load(args.save_name)
        model.load_state_dict(saved_model["state_dict"])
        # end of training
    # eval-set
    if args.eval_set != "TRAIN" and args.eval_set != "TEST":
        print('Evaluation set "{0}" not recognized ...'.format(args.eval_set))
    logging.info("Evaluating REF on the {0} set...".format(args.eval_set))
    st = timer()
    if args.eval_set == "TRAIN":
        acc1, acc5 = evaluate(args, model, device, train_loader)
    else:
        acc1, acc5 = evaluate(args, model, device, test_loader)
    et = timer()
    logging.info("Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%".format(
        acc1=acc1, acc5=acc5))
    logging.info("Elapsed time: {0} seconds".format(et - st))
def create_model():
    """Build a fresh ResNet18 and place it on the default CUDA device."""
    network = models.ResNet18()
    return network.cuda()
# the "conv_mode" key in the bound_opts parameter when constructing your # BoundeModule object. In this test we show the difference between Patches # mode and Matrix mode in memory consumption. device = 'cuda' conv_mode = 'patches' # conv_mode can be set as 'matrix' or 'patches' seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) random.seed(seed) np.random.seed(seed) ## Step 1: Define the model # model_ori = models.model_resnet(width=1, mult=4) model_ori = models.ResNet18(in_planes=2) # model_ori.load_state_dict(torch.load("data/cifar_base_kw.pth")['state_dict'][0]) ## Step 2: Prepare dataset as usual # test_data = torchvision.datasets.MNIST("./data", train=False, download=True, transform=torchvision.transforms.ToTensor()) normalize = torchvision.transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) test_data = torchvision.datasets.CIFAR10( "./data", train=False, download=True, transform=torchvision.transforms.Compose( [torchvision.transforms.ToTensor(), normalize])) # For illustration we only use 1 image from dataset N = 1
def main():
    """Train ResNet18 on ImageNet for 10 epochs with per-epoch validation.

    Pipeline: project-local loader -> classification processor (augmentation
    on for train, off for valid) -> multiprocess iterator yielding numpy
    batches. Optimized with Adam + cosine-annealing LR; per-phase loss and
    accuracy are accumulated through ``loggers.SimpleLogger``.
    """
    batch_size = 256
    num_classes = 1000
    image_size = (128, 128)

    # --- load dataset ---
    dataset = loaders.ImageNetLoader('./datasets/ImageNet').load()
    train_dataset, valid_dataset = dataset

    # --- processors: augmentation only during training ---
    train_processor = processors.ImageNetClassificationProcessor(
        batch_size, num_classes=num_classes, enable_augmentation=True,
        image_size=image_size)
    valid_processor = processors.ImageNetClassificationProcessor(
        batch_size, num_classes=num_classes, enable_augmentation=False,
        image_size=image_size)

    # --- iterators feeding preprocessed numpy batches from worker processes ---
    train_iterator = iterators.MultiprocessIterator(train_dataset, train_processor, num_workers=4)
    valid_iterator = iterators.MultiprocessIterator(valid_dataset, valid_processor, num_workers=4)

    # --- device ---
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # --- model / loss ---
    model = models.ResNet18(input_channels=3, num_classes=num_classes).to(device)
    loss_function = losses.CrossEntropyLoss().to(device)

    # --- optimizer + cosine-annealing LR schedule ---
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20)

    # --- logger ---
    logger = loggers.SimpleLogger()

    # --- learning ---
    for epoch in range(10):
        print(f"-" * 64)
        print(f"[epoch {epoch:>4d}]")

        # ---------- training phase ----------
        phase = 'train'
        torch.set_grad_enabled(True)
        for batch_data in tqdm.tqdm(train_iterator, desc=phase):
            optimizer.zero_grad()
            batch_image = torch.from_numpy(batch_data['image']).to(device)
            batch_target = torch.from_numpy(batch_data['target']).to(device)
            batch_output = model(batch_image)
            batch_loss = loss_function(batch_output, batch_target)
            # loss may be per-sample; reduce to a scalar before backward
            batch_loss.sum().backward()
            optimizer.step()
            batch_loss = batch_loss.data.cpu().numpy()
            # targets are one-hot: recover integer class ids via argmax
            batch_label = np.argmax(batch_target.data.cpu().numpy(), axis=-1).flatten()
            batch_pred = np.argmax(batch_output.data.cpu().numpy(), axis=-1).flatten()
            logger.add_batch_loss(batch_loss, phase=phase)
            logger.add_batch_pred(batch_pred, phase=phase)
            logger.add_batch_label(batch_label, phase=phase)
        loss = logger.get_loss(phase)
        accuracy = logger.get_accuracy(phase)
        print(f"loss : {loss}")
        print(f"accuracy : {accuracy}")

        # ---------- validation phase ----------
        phase = 'valid'
        torch.set_grad_enabled(False)
        for batch_data in tqdm.tqdm(valid_iterator, desc=phase):
            # (removed a stray optimizer.zero_grad() here — no gradients are
            # produced under set_grad_enabled(False), so it was a no-op)
            batch_image = torch.from_numpy(batch_data['image']).to(device)
            batch_target = torch.from_numpy(batch_data['target']).to(device)
            batch_output = model(batch_image)
            batch_loss = loss_function(batch_output, batch_target)
            batch_loss = batch_loss.data.cpu().numpy()
            batch_label = np.argmax(batch_target.data.cpu().numpy(), axis=-1).flatten()
            batch_pred = np.argmax(batch_output.data.cpu().numpy(), axis=-1).flatten()
            logger.add_batch_loss(batch_loss, phase=phase)
            logger.add_batch_pred(batch_pred, phase=phase)
            logger.add_batch_label(batch_label, phase=phase)
        loss = logger.get_loss(phase)
        accuracy = logger.get_accuracy(phase)
        print(f"loss : {loss:.4f}")
        print(f"accuracy : {accuracy:.4f}")

        # Bug fix: the cosine-annealing scheduler was created but never
        # stepped, so the learning rate stayed at 0.01 forever. Advance it
        # once per epoch as CosineAnnealingLR expects.
        scheduler.step()
        logger.step()