Example #1
 def _get_backbone_network(self, backbone_name):
     if backbone_name == 'ResNet18':
         backbone = models.ResNet18(last_relu=False)
     elif backbone_name == 'ResNet18HighRes':
         backbone = models.ResNet18(last_relu=False, high_res=True)
     else:
         raise NotImplementedError()
     return backbone
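A minimal usage sketch for the factory above; `builder` is a hypothetical instance of whatever class defines this method, and the two backbone names are the ones handled in the snippet:

# Hypothetical call; any other name raises NotImplementedError.
backbone = builder._get_backbone_network('ResNet18HighRes')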
Example #2
def get_model(device):
    """
    :param device: instance of torch.device
    :return: An instance of torch.nn.Module
    """
    num_classes = 2
    if config["dataset"] == "Cifar100":
        num_classes = 100
    elif config["dataset"] == "Cifar10":
        num_classes = 10

    model = {
        "vgg11": lambda: models.VGG("VGG11", num_classes, batch_norm=False),
        "vgg11_bn": lambda: models.VGG("VGG11", num_classes, batch_norm=True),
        "vgg13": lambda: models.VGG("VGG13", num_classes, batch_norm=False),
        "vgg13_bn": lambda: models.VGG("VGG13", num_classes, batch_norm=True),
        "vgg16": lambda: models.VGG("VGG16", num_classes, batch_norm=False),
        "vgg16_bn": lambda: models.VGG("VGG16", num_classes, batch_norm=True),
        "vgg19": lambda: models.VGG("VGG19", num_classes, batch_norm=False),
        "vgg19_bn": lambda: models.VGG("VGG19", num_classes, batch_norm=True),
        "resnet10": lambda: models.ResNet10(num_classes=num_classes),
        "resnet18": lambda: models.ResNet18(num_classes=num_classes),
        "resnet34": lambda: models.ResNet34(num_classes=num_classes),
        "resnet50": lambda: models.ResNet50(num_classes=num_classes),
        "resnet101": lambda: models.ResNet101(num_classes=num_classes),
        "resnet152": lambda: models.ResNet152(num_classes=num_classes),
        "bert": lambda: models.BertImage(config, num_classes=num_classes),
    }[config["model"]]()

    model.to(device)
    if device == "cuda":
        model = torch.nn.DataParallel(model)
        torch.backends.cudnn.benchmark = True

    return model
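A short usage sketch for the factory above, assuming the module-level `config` dict it reads (not shown in the snippet) has already been populated:

# Hypothetical config values for illustration; the snippet reads a global `config`.
# config = {"dataset": "Cifar10", "model": "resnet18"}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = get_model(device)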
Example #3
    def test(self):
        model_oris = [
            models.model_resnet(width=1, mult=2),
            models.ResNet18(in_planes=2)
        ]
        self.result = []

        for model_ori in model_oris:
            conv_mode = 'patches'  # conv_mode can be set as 'matrix' or 'patches'

            normalize = torchvision.transforms.Normalize(
                mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010])
            test_data = torchvision.datasets.CIFAR10(
                "./data",
                train=False,
                download=True,
                transform=torchvision.transforms.Compose(
                    [torchvision.transforms.ToTensor(), normalize]))
            N = 1
            n_classes = 10

            image = torch.Tensor(test_data.data[:N]).reshape(N, 3, 32, 32)
            image = image.to(torch.float32) / 255.0

            model = BoundedModule(model_ori,
                                  image,
                                  bound_opts={"conv_mode": conv_mode})

            ptb = PerturbationLpNorm(norm=np.inf, eps=0.03)
            image = BoundedTensor(image, ptb)
            pred = model(image)
            lb, ub = model.compute_bounds(IBP=False, C=None, method='backward')
            self.result += [lb, ub]

        self.check()
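The bound-propagation classes used above look like those from the auto_LiRPA library; if so, a typical import block (an assumption, not part of the original snippet) would be:

# Assumed imports for BoundedModule / BoundedTensor / PerturbationLpNorm.
import numpy as np
import torch
import torchvision
from auto_LiRPA import BoundedModule, BoundedTensor
from auto_LiRPA.perturbations import PerturbationLpNorm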
Example #4
def main():
    global mean, std
    args = parse_args()
    if args.deterministic:
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_transforms = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean, std)
    ])
    cifar = torchvision.datasets.CIFAR10(args.data_dir,
                                         train=True,
                                         transform=input_transforms,
                                         download=True)
    indices = list(range(len(cifar)))
    train_indices = indices[:int(len(indices) * 0.9)]
    val_indices = indices[int(len(indices) * 0.9):]

    train_set = CIFARLarge(Subset(cifar, train_indices),
                           args.num_patches,
                           train=True)
    val_set = CIFARLarge(Subset(cifar, val_indices),
                         args.num_patches,
                         train=False)
    dataloaders = {
        "train":
        DataLoader(SSLTrainDataset(train_set, args.num_patches,
                                   args.num_angles),
                   shuffle=False,
                   batch_size=args.ssl_train_batch_size,
                   pin_memory=True),
        "val":
        DataLoader(SSLValDataset(val_set, args.num_patches, args.num_angles),
                   shuffle=False,
                   batch_size=args.ssl_val_batch_size,
                   pin_memory=True)
    }

    model = models.ResNet18(args.num_patches, args.num_angles)
    # model.load_state_dict(torch.load(os.path.join(args.model_dir, f"{args.model_name}")))
    # train.gen_grad_map(device, model, dataloaders, args.num_patches, args.num_angles)

    model, best_val_accuracy = train.ssl_train(device, model, dataloaders,
                                               args)
    model_name = time.ctime().replace(" ", "_").replace(":", "_")
    model_name = f"{model_name}_{best_val_accuracy:.4f}.pt"
    torch.save(model.state_dict(), os.path.join(args.model_dir, model_name))
Example #5
def config_net(net_name="VGG"):
    assert net_name in __all_models__, "Unimplemented architecture"
    if net_name == "VGG":
        return models.VGG("VGG19")
    elif net_name == "ResNet":
        return models.ResNet18()
    elif net_name == "ResNeXt":
        return models.ResNeXt29_2x64d()
    elif net_name == "MobileNet":
        return models.MobileNetV2()
    elif net_name == "DenseNet":
        return models.DenseNet121()
    elif net_name == "DPN":
        return models.DPN92()
    elif net_name == "EfficientNet":
        return models.EfficientNetB0()
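A brief usage sketch; `__all_models__` is assumed to be a module-level list containing the architecture names handled above:

# Hypothetical call; the assert rejects any name missing from __all_models__.
net = config_net("ResNet")  # returns models.ResNet18()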
Example #6
def get_invert_model(args):
    if 'ResNet18' in args.invert_model:
        model = models.ResNet18(args.model)
    elif 'ResNet152' in args.invert_model:
        model = models.ResNet152(args.model)
    elif 'ResNeXt101' in args.invert_model:
        model = models.ResNeXt101(args.model)
    elif 'VGG19' in args.invert_model:
        model = models.VGG19(args.model)
    elif 'VGG19_BN' in args.invert_model:
        model = models.VGG19_BN(args.model)
    elif 'DenseNet201' in args.invert_model:
        model = models.DenseNet201(args.model)
    else:
        raise Exception(f'{args.invert_model} not found')
    return model
Example #7
def evaluate_inversion(args, inverted_net_path):
    # Load saved inverted net
    device = 'cuda:{}'.format(
        args.gpu_ids[0]) if len(args.gpu_ids) > 0 else 'cpu'
    ckpt_dict = torch.load(inverted_net_path, map_location=device)

    # Build model, load parameters
    model_args = ckpt_dict['model_args']
    inverted_net = models.ResNet18(**model_args)
    inverted_net = nn.DataParallel(inverted_net, args.gpu_ids)
    inverted_net.load_state_dict(ckpt_dict['model_state'])

    # Debugging breakpoint left in the original; commented out so the function runs end to end.
    # import pdb
    # pdb.set_trace()

    # Get test images (CelebA)
    initial_generated_image_dir = '/deep/group/sharonz/generator/z_test_images/'
    initial_generated_image_name = '058004_crop.jpg'
    initial_generated_image = util.get_image(initial_generated_image_dir,
                                             initial_generated_image_name)
    initial_generated_image = initial_generated_image / 255.
    initial_generated_image = initial_generated_image.cuda()

    inverted_noise = inverted_net(initial_generated_image)

    if 'BigGAN' in args.model:
        class_vector = one_hot_from_int(207, batch_size=batch_size)
        class_vector = torch.from_numpy(class_vector)

        num_params = int(''.join(filter(str.isdigit, args.model)))
        generator = BigGAN.from_pretrained(f'biggan-deep-{num_params}')

        generator = generator.to(args.device)
        generated_image = generator.forward(inverted_noise, class_vector,
                                            args.truncation)

    # Get difference btw initial and subsequent generated image
    # Save both

    return
Example #8
def main():
    args = parse_args()
    if args.deterministic:
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = models.ResNet18(args.num_patches, args.num_angles)

    mnist_large = MNISTLarge(args.data_dir, args.num_patches)
    indices = list(range(len(mnist_large)))
    train_indices = indices[:int(len(indices) * 0.9)]
    val_indices = indices[int(len(indices) * 0.9):]
    dataloaders = {
        "train":
        DataLoader(SSLTrainDataset(Subset(mnist_large, train_indices),
                                   args.num_patches, args.num_angles),
                   shuffle=True,
                   batch_size=args.ssl_train_batch_size,
                   pin_memory=True),
        "val":
        DataLoader(SSLValDataset(Subset(mnist_large, val_indices),
                                 args.num_patches, args.num_angles),
                   shuffle=False,
                   batch_size=args.ssl_val_batch_size,
                   pin_memory=True)
    }

    model, best_val_accuracy = train.ssl_train(
        device, model, dataloaders, args.ssl_num_epochs, args.num_patches,
        args.num_angles, MNISTLarge.mean, MNISTLarge.std, args.learn_prd,
        args.poisson_rate)
    model_name = time.ctime().replace(" ", "_").replace(":", "_")
    model_name = f"{model_name}_{best_val_accuracy:.4f}.pt"
    torch.save(model.state_dict(), os.path.join(args.model_dir, model_name))
Example #9
def setup_and_run(args, criterion, device, train_loader, test_loader, val_loader, logging, results, summary_writer):
    global BEST_ACC
    print('\n#### Running binarized-net ####')

    # quantized levels
    if (not args.tanh and args.quant_levels != 2) or args.quant_levels > 3:
        print('Quantization levels "{0}" is invalid, exiting ...'.format(args.quant_levels))
        exit()
    # for tanh, Q_l = {-1, 0, 1}, rounding if {-1: ( ,-0.5], 0: (-0.5, 0.5), 1: [0.5, )}

    if args.zeroone and args.tanh:
        print('zeroone cannot be true while tanh is, setting zeroone False ...')
        args.zeroone = False

    # architecture
    if 'VGG' in args.architecture:
        assert(args.architecture == 'VGG11' or args.architecture == 'VGG13' or args.architecture == 'VGG16' 
                or args.architecture == 'VGG19')
        model = models.VGG(args.architecture, args.input_channels, args.im_size, args.output_dim).to(device)
    elif args.architecture == 'RESNET18':
        model = models.ResNet18(args.input_channels, args.im_size, args.output_dim).to(device)
    else:
        print('Architecture type "{0}" not recognized, exiting ...'.format(args.architecture))
        exit()

    # optimizer
    if args.optimizer == 'ADAM':
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, 
                momentum=args.momentum, nesterov=args.nesterov, weight_decay=args.weight_decay)
    else:
        print('Optimizer type "{0}" not recognized, exiting ...'.format(args.optimizer))
        exit()
    
    # lr-scheduler
    if args.lr_decay == 'STEP':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.lr_scale)
    elif args.lr_decay == 'EXP':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_scale)
    elif args.lr_decay == 'MSTEP':
        x = args.lr_interval.split(',')
        lri = [int(v) for v in x]
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=lri, gamma=args.lr_scale)
        args.lr_interval = 1    # lr_interval handled in scheduler!
    else:
        print('LR decay type "{0}" not recognized, exiting ...'.format(args.lr_decay))
        exit()

    init_weights(model, device, xavier=True)
    if not args.eval:
        logging.info(model)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    if not args.eval:
        logging.info("Number of parameters: %d", num_parameters)

    start_epoch = -1
    beta = 1    # discrete forcing scalar, used only for softmax based projection
    iters = 0   # total no of iterations, used to do many things!
    amodel = auxmodel(model)
    # optionally resume from a checkpoint
    if args.eval:
        logging.info('Loading checkpoint file "{0}" for evaluation'.format(args.eval))
        if not os.path.isfile(args.eval):
            print('Checkpoint file "{0}" for evaluation not recognized, exiting ...'.format(args.eval))
            exit()
        checkpoint = torch.load(args.eval)
        model.load_state_dict(checkpoint['state_dict'])
        beta = checkpoint['beta']
        logging.debug('beta: {0}'.format(beta))

    elif args.resume:
        checkpoint_file = args.resume
        logging.info('Loading checkpoint file "{0}" to resume'.format(args.resume))
        if not os.path.isfile(checkpoint_file):
            print('Checkpoint file "{0}" not recognized, exiting ...'.format(checkpoint_file))
            exit()
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch']
        assert(args.architecture == checkpoint['architecture'])
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        BEST_ACC = checkpoint['best_acc1']
        beta = checkpoint['beta']
        iters = checkpoint['iters']
        logging.debug('best_acc1: {0}, beta: {1}, iters: {2}'.format(BEST_ACC, beta, iters))

    batch_per_epoch = len(train_loader)

    if not args.eval:
        logging.info('Training...')
        model.train()
        st = timer()                
        for e in range(start_epoch + 1, args.num_epochs):
            for i, (data, target) in enumerate(train_loader):
                l = train_step(args, amodel, model, device, data, target, optimizer, criterion, beta=beta)
    
                if i % args.log_interval == 0:
                    acc1, acc5 = evaluate(args, amodel, model, device, val_loader, training=True, beta=beta,
                                          summary_writer=summary_writer, iterations=e*batch_per_epoch+i)
                    logging.info('Epoch: {0},\t Iter: {1},\t Loss: {loss:.5f},\t Val-Acc1: {acc1:.2f} '
                                 '(Best: {best:.2f}),\t Val-Acc5: {acc5:.2f}'.format(e, i, 
                                     loss=l, acc1=acc1, best=BEST_ACC, acc5=acc5))
    
                if iters % args.beta_interval == 0:
                    # beta = beta * args.beta_scale
                    beta = min(beta * args.beta_scale, BETAMAX)
                    optimizer.beta_mda = beta
                    logging.info('beta: {0}'.format(beta))

                if iters % args.lr_interval == 0:
                    lr = args.learning_rate
                    for param_group in optimizer.param_groups:
                        lr = param_group['lr']                        
                    scheduler.step()
                    for param_group in optimizer.param_groups:
                        if lr != param_group['lr']:
                            logging.info('lr: {0}'.format(param_group['lr']))   # print if changed
                iters += 1

            # save checkpoint
            acc1, acc5 = evaluate(args, amodel, model, device, val_loader, training=True, beta=beta)
            results.add(epoch=e, iteration=i, train_loss=l, val_acc1=acc1, best_val_acc1=BEST_ACC)
            util.save_checkpoint({'epoch': e, 'architecture': args.architecture, 'state_dict': model.state_dict(), 
                'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), 
                'best_acc1': BEST_ACC, 'iters': iters, 'beta': beta}, is_best=False, path=args.save_dir)
            results.save()
    
        et = timer()
        logging.info('Elapsed time: {0} seconds'.format(et - st))
    
        acc1, acc5 = evaluate(args, amodel, model, device, val_loader, training=True, beta=beta)
        logging.info('End of training, Val-Acc: {acc1:.2f} (Best: {best:.2f}), Val-Acc5: {acc5:.2f}'.format(acc1=acc1, 
            best=BEST_ACC, acc5=acc5))
        # load saved model
        saved_model = torch.load(args.save_name)
        model.load_state_dict(saved_model['state_dict'])
        beta = saved_model['beta']
    # end of training

    # eval-set
    if args.tanh:
        dotanh(args, model, beta=beta)
    if args.eval_set != 'TRAIN' and args.eval_set != 'TEST':
        print('Evaluation set "{0}" not recognized ...'.format(args.eval_set))

    logging.info('Evaluating fractional binarized-net on the {0} set...'.format(args.eval_set))
    st = timer()                
    if args.eval_set == 'TRAIN':
        acc1, acc5 = evaluate(args, amodel, model, device, train_loader)
    else: 
        acc1, acc5 = evaluate(args, amodel, model, device, test_loader)
    et = timer()
    logging.info('Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%'.format(acc1=acc1, acc5=acc5))
    logging.info('Elapsed time: {0} seconds'.format(et - st))

    doround(args, model)
    logging.info('Evaluating discrete binarized-net on the {0} set...'.format(args.eval_set))
    st = timer()                
    if args.eval_set == 'TRAIN':
        acc1, acc5 = evaluate(args, amodel, model, device, train_loader)
    else: 
        acc1, acc5 = evaluate(args, amodel, model, device, test_loader)
    et = timer()
    logging.info('Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%'.format(acc1=acc1, acc5=acc5))
    logging.info('Elapsed time: {0} seconds'.format(et - st))
Example #10
def main():
    start_time = time()
    torch.manual_seed(7)
    #np.random.seed(0)
    mode = 'train'
    #############
    # mode = 'test'

    transform = transforms.Compose([
        transforms.Grayscale(),
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ])

    ds = PileogramDataset(REPEATS_TRAIN, CHIMERIC_TRAIN, REGULAR_TRAIN, JUNK_TRAIN, transform=transform)
    num_samples = len(ds)
    val_size = round(num_samples * 0.2)
    train_size = num_samples - val_size
    ds_train, ds_val = random_split(ds, [train_size, val_size])
    dl_train = DataLoader(ds_train, batch_size=BATCH, shuffle=True, num_workers=2, pin_memory=True)
    dl_val = DataLoader(ds_val, batch_size=BATCH, shuffle=False, num_workers=2, pin_memory=True)

    ds_test = PileogramDataset(REPEATS_TEST, CHIMERIC_TEST, REGULAR_TEST, JUNK_TEST, transform=transform)
    dl_test = DataLoader(ds_test, batch_size=1, shuffle=False, num_workers=2, pin_memory=True)

    net = models.ResNet18(num_classes=4)
    # if device.type == 'cuda' and torch.cuda.device_count() > 1:
    #     net = nn.DataParallel(net)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # Use cuda if possible
    # device = torch.device('cpu')  # Force using cpu
    print(f"Using device: {device}")
    net.to(device)
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    # optimizer = optim.Adam(net.parameters(), lr=3e-5, betas=(0.9, 0.999))
    optimizer = optim.RMSprop(net.parameters(), lr=3e-5)
    history_train = []
    history_val = []
    acc_train = []
    acc_valid = []

    if mode == 'train':
        for epoch in range(EPOCHS):
            total_loss = 0.0
            iteration = 0
            total = 0
            correct = 0
            net.train()

            for data in dl_train:
                iteration += 1
                inputs = data['image'].to(device, non_blocking=True)
                labels = data['label'].to(device, non_blocking=True)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # running_loss += loss.item()
                total_loss += loss.item()
                total += labels.size(0)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()

            # if i % 100 == 99:
            #    print("Epoch: %2d, Step: %5d -> Loss: %.5f" %
            #          (epoch + 1, i + 1, running_loss / 100))
            #    running_loss = 0.0
            accuracy = 100*correct/total
            print(f"Epoch {epoch + 1}:\tTrain loss = {total_loss / iteration}\tAccuracy = {round(accuracy, 2)}%")
            history_train.append((epoch + 1, total_loss / iteration))
            acc_train.append((epoch+1, accuracy))

            total_loss = 0.0
            iteration = 0
            total = 0
            correct = 0
            net.eval()

            with torch.no_grad():
                for data in dl_val:
                    iteration += 1
                    images = data['image'].to(device)
                    labels = data['label'].to(device)
                    outputs = net(images)
                    loss = criterion(outputs, labels)
                    total_loss += loss.item()
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

            accuracy = 100 * correct / total
            print(f"Epoch {epoch + 1}:\tVal loss = {total_loss / iteration},\tAccuracy = {round(accuracy, 2)}%")
            history_val.append((epoch + 1, total_loss / iteration))
            acc_valid.append((epoch + 1, accuracy))

            # Save whenever validation accuracy improves on the best so far
            # (compare accuracies; the list stores (epoch, accuracy) tuples).
            if epoch == 0 or acc_valid[-1][1] > max(acc for _, acc in acc_valid[:-1]):
                torch.save(net.state_dict(), PARAM_PATH)

        training_time = time()
        print(f"Finished Training. Training time: {training_time - start_time} s")
#        visualizer.draw_training_curve(history_train, history_val)
#        visualizer.draw_accuracy_curve(acc_train, acc_valid)

    correct = 0
    total = 0
    net.load_state_dict(torch.load(PARAM_PATH))
    net.eval()
    guess_repeat = []
    guess_chim = []
    guess_regular = []
    guess_junk = []
    eval_time_start = time()

    with torch.no_grad(), open('wrong.txt', 'w') as f:
        for data in dl_test:
            images = data['image'].to(device, non_blocking=True)
            labels = data['label'].to(device, non_blocking=True)
            paths = data['path'][0]
            # print(paths)
            # print(type(paths))
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            if labels == 0:
                guess_repeat.append(predicted.item())
                if predicted.item() != 0:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)
            elif labels == 1:
                guess_chim.append(predicted.item())
                if predicted.item() != 1:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)
            elif labels == 2:
                guess_regular.append(predicted.item())
                if predicted.item() != 2:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)
            else:
                guess_junk.append(predicted.item())
                if predicted.item() != 3:
                    output = paths[:-4] + types[int(labels)] + '_' + types[predicted.item()] + paths[-4:] + '\n'
                    f.write(output)

    eval_time_end = time()
    print(f"Accuracy of the network on the test set: {100 * correct / total}%.")
    print(f"Evalutaion time: {eval_time_end - eval_time_start} s.")

    conf_repeat = (sum([l == 0 for l in guess_repeat]), sum([l == 1 for l in guess_repeat]),
                   sum([l == 2 for l in guess_repeat]), sum([l == 3 for l in guess_repeat]))
    conf_chim = (sum([l == 0 for l in guess_chim]), sum([l == 1 for l in guess_chim]),
                 sum([l == 2 for l in guess_chim]), sum([l == 3 for l in guess_chim]))
    conf_regular = (sum([l == 0 for l in guess_regular]), sum([l == 1 for l in guess_regular]),
                   sum([l == 2 for l in guess_regular]), sum([l == 3 for l in guess_regular]))
    conf_junk = (sum([l == 0 for l in guess_junk]), sum([l == 1 for l in guess_junk]),
                   sum([l == 2 for l in guess_junk]), sum([l == 3 for l in guess_junk]))

    print_confusion(conf_repeat, conf_chim, conf_regular, conf_junk)
Example #11
    def __init__(self, args):

        self.args = args

        # Creating data loaders
        kwargs = {'num_workers': 4, 'pin_memory': True}
        if args.dataset == 'MNIST':
            # setup data loader
            self.train_loader = torch.utils.data.DataLoader(
                datasets.MNIST('../data',
                               train=True,
                               download=True,
                               transform=T.ToTensor()),
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)

            self.val_loader = torch.utils.data.DataLoader(
                datasets.MNIST('../data', train=False, transform=T.ToTensor()),
                batch_size=args.batch_size,
                shuffle=False,
                **kwargs)

            # initialize model
            torch.manual_seed(args.seed)
            self.model = models.SmallCNN()

        elif args.dataset == 'CIFAR10':
            transform_train = T.Compose([
                T.Pad(4, padding_mode='reflect'),
                T.RandomCrop(32),
                T.RandomHorizontalFlip(),
                T.ToTensor()
            ])
            transform_test = T.Compose([T.ToTensor()])

            self.train_loader = torch.utils.data.DataLoader(
                datasets.CIFAR10(args.data_root,
                                 train=True,
                                 download=True,
                                 transform=transform_train),
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)
            self.val_loader = torch.utils.data.DataLoader(
                datasets.CIFAR10(args.data_root,
                                 train=False,
                                 transform=transform_test),
                batch_size=args.batch_size,
                shuffle=True,
                **kwargs)
            # initialize model
            torch.manual_seed(args.seed)
            self.model = models.ResNet18()

        self.model = torch.nn.DataParallel(self.model).cuda()
        self.optimizer = optim.SGD(self.model.parameters(),
                                   args.lr,
                                   momentum=0.9,
                                   weight_decay=args.weight_decay)
        self.lr_scheduler = optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=[60, 120, 160], gamma=0.2)

        print('Number of model parameters: {}'.format(
            sum([p.data.nelement() for p in self.model.parameters()])))

        self.save_path = args.save_path
        self.epoch = 0

        # resume from checkpoint

        ckpt_path = osp.join(self.save_path, 'checkpoint.pth')
        if osp.exists(ckpt_path):
            self._load_from_checkpoint(ckpt_path)
        elif args.restore:
            self._load_from_checkpoint(args.restore)

        cudnn.benchmark = True
        self.attacker = PGDAttacker(args.attack_eps)
Example #12
extra_loaders = []
if len(args.extra_data) > 0:
    ex_data = args.extra_data.split(':')
    ex_batch_size = args.extra_data_bsize.split(':')
    for i in range(len(ex_data)):
        if ex_data[i].split('+')[0] == 'wm':
            tmp = ex_data[i].split('+')
            _loader = getwmloader(tmp[1], int(ex_batch_size[i]), tmp[2])
        else:
            _loader, _, __ = getdataloader(ex_data[i], args.train_db_path, args.test_db_path, int(ex_batch_size[i]), 4)
        extra_loaders.append(batch_gen(_loader))    

# Loading model.
print('==> loading model...')
if args.load_path == 'resnet18':
    net = models.ResNet18(num_classes=n_classes)
else:
    checkpoint = torch.load(args.load_path)
    net = checkpoint['net']
    acc = checkpoint['acc']
start_epoch = 0  # checkpoint['epoch']

net = net.to(device)
# support cuda
if device == 'cuda':
    print('Using CUDA')
    print('Parallel training on {0} GPUs.'.format(torch.cuda.device_count()))
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    cudnn.benchmark = True

if args.wm_afs:
Example #13
        return x


if args.resume:
    print('===> Resuming from checkpoint...')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'

    checkpoint = torch.load('./checkpoint/ckpt2.t7')
    net = checkpoint['net']
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']
else:
    print('===> Building model...')
    #    net = BasicNet()
    #    net = googlenet.GoogLeNet()
    net = models.ResNet18()

if args.distributed:
    print('===> Distributed Training Mode')
    dist.init_process_group(backend=args.backend,
                            init_method=args.dist_url,
                            rank=args.rank,
                            world_size=args.world_size)

if args.distributed:
    if args.use_cuda:
        print('===> DistributedDataParallel')
        net.to(device)
        net = torch.nn.parallel.DistributedDataParallel(net)
    else:
        print('===> DistributedDataParallelCPU')
Example #14
def setup_and_run(args, criterion, device, train_loader, test_loader,
                  val_loader, logging, results):
    global BEST_ACC
    print('\n#### Running continuous-net ####')

    # architecture
    if 'VGG' in args.architecture:
        assert (args.architecture == 'VGG11' or args.architecture == 'VGG13'
                or args.architecture == 'VGG16'
                or args.architecture == 'VGG19')
        model = models.VGG(args.architecture, args.input_channels,
                           args.im_size, args.output_dim).to(device)
    elif args.architecture == 'RESNET18':
        model = models.ResNet18(args.input_channels, args.im_size,
                                args.output_dim).to(device)
    else:
        print('Architecture type "{0}" not recognized, exiting ...'.format(
            args.architecture))
        exit()

    # optimizer
    if args.optimizer == 'ADAM':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.learning_rate,
                               weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.learning_rate,
                              momentum=args.momentum,
                              nesterov=args.nesterov,
                              weight_decay=args.weight_decay)
    else:
        print('Optimizer type "{0}" not recognized, exiting ...'.format(
            args.optimizer))
        exit()

    # lr-scheduler
    if args.lr_decay == 'STEP':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=1,
                                              gamma=args.lr_scale)
    elif args.lr_decay == 'EXP':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer,
                                                     gamma=args.lr_scale)
    elif args.lr_decay == 'MSTEP':
        x = args.lr_interval.split(',')
        lri = [int(v) for v in x]
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   milestones=lri,
                                                   gamma=args.lr_scale)
        args.lr_interval = 1  # lr_interval handled in scheduler!
    else:
        print('LR decay type "{0}" not recognized, exiting ...'.format(
            args.lr_decay))
        exit()

    init_weights(model, xavier=True)
    logging.info(model)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("Number of parameters: %d", num_parameters)

    start_epoch = -1
    iters = 0  # total no of iterations, used to do many things!
    # optionally resume from a checkpoint
    if args.eval:
        logging.info('Loading checkpoint file "{0}" for evaluation'.format(
            args.eval))
        if not os.path.isfile(args.eval):
            print('Checkpoint file "{0}" for evaluation not recognized, exiting ...'.format(
                args.eval))
            exit()
        checkpoint = torch.load(args.eval)
        model.load_state_dict(checkpoint['state_dict'])

    elif args.resume:
        checkpoint_file = args.resume
        logging.info('Loading checkpoint file "{0}" to resume'.format(
            args.resume))
        if not os.path.isfile(checkpoint_file):
            print('Checkpoint file "{0}" not recognized, exiting ...'.format(
                checkpoint_file))
            exit()
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint['epoch']
        assert (args.architecture == checkpoint['architecture'])
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        BEST_ACC = checkpoint['best_acc1']
        iters = checkpoint['iters']
        logging.debug('best_acc1: {0}, iters: {1}'.format(BEST_ACC, iters))

    if not args.eval:
        logging.info('Training...')
        model.train()
        st = timer()

        for e in range(start_epoch + 1, args.num_epochs):
            for i, (data, target) in enumerate(train_loader):
                l = train_step(model, device, data, target, optimizer,
                               criterion)
                if i % args.log_interval == 0:
                    acc1, acc5 = evaluate(args,
                                          model,
                                          device,
                                          val_loader,
                                          training=True)
                    logging.info(
                        'Epoch: {0},\t Iter: {1},\t Loss: {loss:.5f},\t Val-Acc1: {acc1:.2f} '
                        '(Best: {best:.2f}),\t Val-Acc5: {acc5:.2f}'.format(
                            e, i, loss=l, acc1=acc1, best=BEST_ACC, acc5=acc5))

                if iters % args.lr_interval == 0:
                    lr = args.learning_rate
                    for param_group in optimizer.param_groups:
                        lr = param_group['lr']
                    scheduler.step()
                    for param_group in optimizer.param_groups:
                        if lr != param_group['lr']:
                            logging.info('lr: {0}'.format(
                                param_group['lr']))  # print if changed
                iters += 1

            # save checkpoint
            acc1, acc5 = evaluate(args,
                                  model,
                                  device,
                                  val_loader,
                                  training=True)
            results.add(epoch=e,
                        iteration=i,
                        train_loss=l,
                        val_acc1=acc1,
                        best_val_acc1=BEST_ACC)
            util.save_checkpoint(
                {
                    'epoch': e,
                    'architecture': args.architecture,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'best_acc1': BEST_ACC,
                    'iters': iters
                },
                is_best=False,
                path=args.save_dir)
            results.save()

        et = timer()
        logging.info('Elapsed time: {0} seconds'.format(et - st))

        acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
        logging.info(
            'End of training, Val-Acc: {acc1:.2f} (Best: {best:.2f}), Val-Acc5: {acc5:.2f}'
            .format(acc1=acc1, best=BEST_ACC, acc5=acc5))
        # load saved model
        saved_model = torch.load(args.save_name)
        model.load_state_dict(saved_model['state_dict'])
    # end of training

    # eval-set
    if args.eval_set != 'TRAIN' and args.eval_set != 'TEST':
        print('Evaluation set "{0}" not recognized ...'.format(args.eval_set))

    logging.info('Evaluating continuous-net on the {0} set...'.format(
        args.eval_set))
    st = timer()
    if args.eval_set == 'TRAIN':
        acc1, acc5 = evaluate(args, model, device, train_loader)
    else:
        acc1, acc5 = evaluate(args, model, device, test_loader)
    et = timer()
    logging.info('Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%'.format(
        acc1=acc1, acc5=acc5))
    logging.info('Elapsed time: {0} seconds'.format(et - st))
Example #15
def cl_streaming(args):
    seed = args.seed
    nr_epochs = args.nr_epochs
    beta = args.beta
    device = args.device
    method = args.method
    buffer_size = args.buffer_size
    stream_batch_size = args.stream_batch_size
    dataset = args.dataset

    if dataset == 'stream_imbalanced_splitcifar':
        nr_slots = 1
    else:
        nr_slots = 10

    generator = SplitCifar(imbalanced=dataset == 'stream_imbalanced_splitcifar')

    train_loaders = []
    test_loaders = []
    train_inds_list = []
    for i in range(generator.max_iter):
        train_inds, test_inds = generator.next_task()
        train_inds_list.append(train_inds)
        train_loaders.append(get_custom_loader(generator.train_dataset, train_inds))
        test_loaders.append(get_custom_loader(generator.test_dataset, test_inds))

    model = models.ResNet18().to(device)
    training_op = Training(model, device, nr_epochs, beta=beta)
    kernel_fn = get_kernel_fn()

    bc = bilevel_coreset.BilevelCoreset(outer_loss_fn=loss_utils.cross_entropy,
                                        inner_loss_fn=loss_utils.cross_entropy, out_dim=10, max_outer_it=1,
                                        candidate_batch_size=600, max_inner_it=300, logging_period=1000)

    def coreset_builder_fn(X, y, m, data_weights):
        return bc.build_with_representer_proxy_batch(X, y, m, kernel_fn, data_weights=data_weights,
                                                     cache_kernel=True, start_size=1, inner_reg=inner_reg)
    inner_reg = 1e-3
    if dataset == 'stream_imbalanced_splitcifar':

        if method == 'reservoir':
            training_op = reservoir_buffer(generator, stream_batch_size, buffer_size, training_op)
        elif method == 'cbrs':
            training_op = cbrs(generator, stream_batch_size, buffer_size, training_op)
        elif method == 'coreset':
            training_op = streaming_coreset(generator, stream_batch_size, buffer_size, training_op, coreset_builder_fn,
                                            nr_slots)
        else:
            raise ValueError("Invalid dataset - method combination")
    else:
        if method not in cl_methods:
            raise ValueError("Invalid dataset - method combination")
        training_op = train_with_buffer(generator, buffer_size, training_op, train_loaders, train_inds_list, model,
                                        method, device,
                                        coreset_builder_fn)

    result = get_test_accuracy(generator, test_loaders, training_op)

    filename = '{}_{}_{}_{}_{}.txt'.format(dataset, method, buffer_size, beta, seed)
    results_path = 'cl_results'
    if dataset == 'stream_imbalanced_splitcifar':
        results_path = 'streaming_results'
    if not os.path.exists(results_path):
        os.makedirs(results_path)

    with open(os.path.join(results_path, filename), 'w') as outfile:
        json.dump({'test_acc': np.mean(result), 'acc_per_task': result}, outfile)
Example #16
import models

name_to_model = {
    'LeNet': lambda args: models.LeNet(**args),
    'AlexNet': lambda args: models.AlexNet(**args),
    'MLP': lambda args: models.MLP(**args),
    'ResNet18': lambda args: models.ResNet18(**args),
    'PResNet18': lambda args: models.PResNet18(**args),
    'Permutation': lambda args: models.TensorPermutation(32, 32, **args),
    'ResNet20Original': lambda args: models.resnet20original(),
    'MobileNet': lambda args: models.MobileNet(**args),
    'ShuffleNet': lambda args: models.ShuffleNetG2(),
    'WideResNet28': lambda args: models.WideResNet28(**args),
}


def get_model(model_config):
    name = model_config['name']
    return name_to_model[name](model_config.get('args', None))
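A minimal usage sketch for the registry above; the 'args' dict is hypothetical and must be supplied as a dict, since each lambda unpacks it with **:

# Hypothetical model_config; the exact 'args' keys depend on the constructors in `models`.
model_config = {'name': 'ResNet18', 'args': {'num_classes': 10}}
net = get_model(model_config)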
Example #17
print(f'\n\n**************  start new model : {project_name} ******************')
utils.train(hyper_param_dict, model, device)
del model

#run GoogleNet with BN
model = models.GoogLeNet_w_bn()
model.to(device)
project_name = 'GoogLeNet_w_bn'
hyper_param_dict['project'] = project_name
hyper_param_dict['lr'] = 0.01
print(f'\n\n**************  start new model : {project_name} ******************')
utils.train(hyper_param_dict, model, device)
del model

# run ResNet18
model = models.ResNet18()
model.to(device)
project_name = 'ResNet18'
hyper_param_dict['project'] = project_name  
hyper_param_dict['lr'] = 0.03
hyper_param_dict['batch'] = 256
print(f'\n\n**************  start new model : {project_name} ******************')
utils.train(hyper_param_dict, model, device)
del model

# run ResNet34
model = models.ResNet34()
model.to(device)
project_name = 'ResNet34'
hyper_param_dict['project'] = project_name
print(f'\n\n**************  start new model : {project_name} ******************')
Example #18
def main():
    args = parse_args()

    if args.deterministic:
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True

    os.makedirs(args.model_dir, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean, std)])

    model = models.ResNet18(args.num_patches, args.num_angles)
    model = torch.nn.DataParallel(model)

    if args.do_ssl:
        stl_unlabeled = datasets.STL10(root=args.data_dir,
                                       split='unlabeled',
                                       transform=input_transforms,
                                       download=args.download)
        indices = list(range(len(stl_unlabeled)))
        train_indices = indices[:int(len(indices) * 0.9)]
        val_indices = indices[int(len(indices) * 0.9):]
        dataloaders = {
            "train":
            DataLoader(SSLTrainDataset(Subset(stl_unlabeled,
                                              train_indices), args.num_patches,
                                       args.num_angles, args.poisson_rate),
                       shuffle=True,
                       batch_size=args.ssl_train_batch_size,
                       pin_memory=True),
            "val":
            DataLoader(SSLValDataset(Subset(stl_unlabeled, val_indices),
                                     args.num_patches, args.num_angles),
                       shuffle=False,
                       batch_size=args.ssl_val_batch_size,
                       pin_memory=True)
        }

        # checkpoint = torch.load(os.path.join(args.model_dir, f"{args.model_name}"),
        #                         map_location=lambda storage, loc: storage.cuda(0))
        # model.load_state_dict(checkpoint['state_dict'])
        # model.load_state_dict(torch.load(os.path.join(args.model_dir, f"{args.model_name}")))
        # dataloaders["train"].dataset.set_poisson_rate(args.poisson_rate)
        args.mean, args.std = mean, std
        # train.gen_grad_map(device, model, dataloaders["val"], args)

        model, best_val_accuracy = train.ssl_train(device, model, dataloaders,
                                                   args)
        model_name = time.ctime().replace(" ", "_").replace(":", "_")
        model_name = f"{model_name}_{best_val_accuracy:.4f}.pt"
        torch.save(model.state_dict(), os.path.join(args.model_dir,
                                                    model_name))

    if args.do_sl:
        if args.model_name is None:
            raise ValueError("Model name must be specified")

        stl_train = datasets.STL10(root=args.data_dir,
                                   split='train',
                                   transform=input_transforms,
                                   download=args.download)

        args.num_classes = len(stl_train.classes)
        fold_indices = sl_train.stl_get_train_folds(
            os.path.join(args.data_dir, "stl10_binary/fold_indices.txt"))

        stl_test = datasets.STL10(root=args.data_dir,
                                  split='test',
                                  transform=input_transforms,
                                  download=args.download)
        dataloaders = {
            "test":
            DataLoader(stl_test,
                       shuffle=False,
                       batch_size=args.test_batch_size,
                       pin_memory=True)
        }

        checkpoint = torch.load(
            os.path.join(args.model_dir, f"{args.model_name}"),
            map_location=lambda storage, loc: storage.cuda(0))
        model.load_state_dict(checkpoint['state_dict'])
        # model.load_state_dict(torch.load(os.path.join(args.model_dir, f"{args.model_name}")))
        # model.init_classifier(args.num_classes, freeze_params=False)

        args.mean, args.std = mean, std
        query_img, _ = stl_train[-1]
        dataloader = DataLoader(stl_train,
                                batch_size=128,
                                shuffle=False,
                                pin_memory=True)
        top_images, top_labels = train.retrieve_topk_images(
            device, model, query_img, dataloader, args)
Example #19
def training(args,*k,**kw):
    # if use gpus
    device = torch.device("cuda:{}".format(args.gpuindex) if torch.cuda.is_available() and args.gpu else "cpu")
    print("user device: {}".format(device))

    # redis helper related
    redis_helper = redishelper.GoSGDHelper(host=args.host, port=args.port)
    redis_helper.signin()
    while redis_helper.cur_edge_num() < args.edgenum:
        time.sleep(1) # sleep 1 second

    model_score = 1.0 / args.edgenum # the initial model parameters score

    # log_file and summary path

    log_file = "{0}-{1}-edge-{2}.log".format(time.strftime('%Y%m%d-%H%M%S',time.localtime(time.time())),
    args.model,redis_helper.ID)
    log_dir = "tbruns/{0}-{1}-cifar10-edge-{2}".format(time.strftime('%Y%m%d%H%M%S',time.localtime(time.time())),args.model,redis_helper.ID)

    logger = open(log_file,'w')
    swriter = SummaryWriter(log_dir)

    # load traing data
    trainset = dataset.AGGData(root=args.dataset, train=True, download=False, transform=None)

    testset = dataset.AGGData(root=args.dataset, train=False, download=False, transform=None)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.batchsize, shuffle=False, num_workers=0)

    # construct neural network
    net = None
    if args.model == "lenet5":
        net = models.LeNet5()
    elif args.model == "resnet18":
        net = models.ResNet18()
    elif args.model == "alexnet":
        net = models.AlexNet(args.num_classes)
    elif args.model == "alexnetimg8":
        net = models.AlexNetImg8(args.num_classes)
    elif args.model == "squeezenet":
        net = models.SqueezeNet()
    elif args.model == "mobilenetv2":
        net = models.MobileNetV2()
    elif args.model == "resnet34":
        net = models.ResNet34()
    elif args.model == "resnet50":
        net = models.ResNet50()
    elif args.model == "resnet101":
        net = models.ResNet101()
    else:
        net = models.ResNet152()
    net.to(device)

    # define optimizer
    criterion = nn.CrossEntropyLoss()
    criterion_loss = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,milestones=list(args.lrschstep), gamma=0.1)

    # start training
    wallclock = 0.0
    iteration = 0 # global iterations
    for epoch in range(0,args.epoch,1):
        starteg = time.time()
        # merge parameters of other edge
        if epoch > 0:
            mintime,maxtime,param_list = redis_helper.min2max_time_params()
            print("The min/max time cost of last epoch: {}/{}".format(mintime,maxtime))
            for item in param_list:
                w1 = model_score / (model_score + item[0])
                w2 = item[0] / (model_score + item[0])

                for local,other in zip(net.parameters(),item[1]):
                    local.data = local.data * w1 + other.data.to(device) * w2
                model_score = model_score + item[0]

            while redis_helper.finish_update() is False:
                time.sleep(1.0)

        critical_extra_start = time.time()
        # identify critical training samples
        critrainset = critical_identify(net,trainset,criterion_loss,device,args)
        critrainloader = torch.utils.data.DataLoader(critrainset, batch_size=args.batchsize, shuffle=True, num_workers=0)

        critical_extra_cost = time.time() - critical_extra_start
        training_start = time.time()

        running_loss = 0.0
        record_running_loss = 0.0
        for i, data in enumerate(critrainloader, 0):
            iteration += 1
            # get the inputs
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.squeeze().to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            record_running_loss += loss.item()
            if i % 10 == 9:
                swriter.add_scalar("Training loss",record_running_loss / 10,epoch*len(critrainloader)+i)
                record_running_loss = 0.0

            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

        training_cost = time.time() - training_start

        # push time and parameters to Redis
        model_score = model_score / 2
        sel_edge_id = redis_helper.random_edge_id(can_be_self=True)
        paramls = list(map(lambda x: x.cpu(),list(net.parameters())))
        redis_helper.ins_time_params(sel_edge_id,training_cost,model_score,paramls)
        while not redis_helper.finish_push():
            time.sleep(1.0)

        wallclock += time.time() - starteg

        total, kaccuracy = validation(net,testloader,device,topk=(1,5))

        curtime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
        _header="[ {} Epoch {} /Iteration {} Wallclock {}]".format(curtime,epoch+1,iteration, wallclock)

        print('{} Accuracy of the network on the {} test images: {} %'.format(_header, total, kaccuracy_str(kaccuracy)))
        logger.write('{},{},{},{}\n'.format(epoch+1 ,iteration, wallclock, accuracy_str(kaccuracy)))
        logger.flush() # write to disk

        for item in kaccuracy:
            swriter.add_scalar("Top{}Accuracy".format(item[0]), item[1], epoch)

        # adopt learning rate of optimizer
        if args.lrscheduler:
            lr_scheduler.step()

    print('Finished Training')

    redis_helper.register_out()
    logger.close() # close log file writer

    return net
Example #20
def main_worker(gpu, ngpus_per_node, args):
    global best_acc

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    model = models.ResNet18(args.num_patches, args.num_angles)

    if args.distributed:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimiser = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            args.poisson_rate = checkpoint["poisson_rate"]
            model.load_state_dict(checkpoint['state_dict'])
            optimiser.load_state_dict(checkpoint['optimiser'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    input_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(225),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    train_dir = os.path.join(args.data, 'train')
    val_dir = os.path.join(args.data, 'val')
    imagenet_train = datasets.ImageFolder(root=train_dir, transform=input_transforms)
    train_dataset = SSLTrainDataset(imagenet_train, args.num_patches, args.num_angles)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    imagenet_val = datasets.ImageFolder(root=val_dir, transform=input_transforms)
    val_dataset = SSLValDataset(imagenet_val, args.num_patches, args.num_angles)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    writer = SummaryWriter()
    train_loader.dataset.set_poisson_rate(args.poisson_rate)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_loss, train_acc = train(train_loader, model, criterion, optimiser, epoch, args)

        # evaluate on validation set
        val_loss, val_acc = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimiser': optimiser.state_dict(),
                "poisson_rate": args.poisson_rate
            }, is_best)

        if (epoch + 1) % args.learn_prd == 0:
            args.poisson_rate += 1
            train_loader.dataset.set_poisson_rate(args.poisson_rate)

        writer.add_scalars("Loss", {"train_loss": train_loss, "val_loss": val_loss}, epoch)
        writer.add_scalars("Accuracy", {"train_acc": train_acc, "val_acc": val_acc}, epoch)
        writer.add_scalar("Poisson_Rate", train_loader.dataset.pdist.rate, epoch)

    writer.close()
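# save_checkpoint() is not defined in this snippet.  A minimal sketch of the
# conventional helper from the official PyTorch ImageNet example, matching the
# save_checkpoint(state, is_best) call above; the default filename below is an
# assumption, the real helper may write to a different location:
import shutil

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-performing checkpoint
        shutil.copyfile(filename, 'model_best.pth.tar')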
Beispiel #21
0
def main():
    global best_acc, mean, std, scale

    args = parse_args()
    args.mean, args.std, args.scale = mean, std, scale
    args.is_master = args.local_rank == 0

    if args.deterministic:
        cudnn.deterministic = True
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.is_master:
        print("opt_level = {}".format(args.opt_level))
        print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32),
              type(args.keep_batchnorm_fp32))
        print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))
        print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))
        print(f"Distributed Training Enabled: {args.distributed}")

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        # Scale learning rate based on global batch size
        # args.lr *= args.batch_size * args.world_size / 256

    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    # create model
    model = models.ResNet18(args.num_patches, args.num_angles)

    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()
    optimiser = Ranger(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss().cuda()

    # Initialize Amp.  Amp accepts either values or strings for the optional override arguments,
    # for convenient interoperation with argparse.
    model, optimiser = amp.initialize(
        model,
        optimiser,
        opt_level=args.opt_level,
        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize.  If model = DDP(model) is called
    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            global best_acc
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_acc = checkpoint['best_acc']
                args.poisson_rate = checkpoint["poisson_rate"]
                model.load_state_dict(checkpoint['state_dict'])
                optimiser.load_state_dict(checkpoint['optimiser'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    # Data loading code
    train_dir = os.path.join(args.data, 'train')
    val_dir = os.path.join(args.data, 'val')

    crop_size = 225
    val_size = 256

    imagenet_train = datasets.ImageFolder(
        root=train_dir,
        transform=transforms.Compose([
            transforms.RandomResizedCrop(crop_size),
        ]))
    train_dataset = SSLTrainDataset(imagenet_train, args.num_patches,
                                    args.num_angles, args.poisson_rate)
    imagenet_val = datasets.ImageFolder(root=val_dir,
                                        transform=transforms.Compose([
                                            transforms.Resize(val_size),
                                            transforms.CenterCrop(crop_size),
                                        ]))
    val_dataset = SSLValDataset(imagenet_val, args.num_patches,
                                args.num_angles)

    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=args.workers,
                              pin_memory=True,
                              sampler=train_sampler,
                              collate_fn=fast_collate)

    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True,
                            sampler=val_sampler,
                            collate_fn=fast_collate)

    if args.evaluate:
        val_loss, val_acc = apex_validate(val_loader, model, criterion, args)
        utils.logger.info(f"Val Loss = {val_loss}, Val Accuracy = {val_acc}")
        return

    # Create dir to save model and command-line args
    if args.is_master:
        model_dir = time.ctime().replace(" ", "_").replace(":", "_")
        model_dir = os.path.join("models", model_dir)
        os.makedirs(model_dir, exist_ok=True)
        with open(os.path.join(model_dir, "args.json"), "w") as f:
            json.dump(args.__dict__, f, indent=2)
        writer = SummaryWriter()

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_loss, train_acc = apex_train(train_loader, model, criterion,
                                           optimiser, args, epoch)

        # evaluate on validation set
        val_loss, val_acc = apex_validate(val_loader, model, criterion, args)

        if (epoch + 1) % args.learn_prd == 0:
            utils.adj_poisson_rate(train_loader, args)

        # remember best Acc and save checkpoint
        if args.is_master:
            is_best = val_acc > best_acc
            best_acc = max(val_acc, best_acc)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc': best_acc,
                    'optimiser': optimiser.state_dict(),
                    "poisson_rate": args.poisson_rate
                }, is_best, model_dir)

            writer.add_scalars("Loss", {
                "train_loss": train_loss,
                "val_loss": val_loss
            }, epoch)
            writer.add_scalars("Accuracy", {
                "train_acc": train_acc,
                "val_acc": val_acc
            }, epoch)
            writer.add_scalar("Poisson_Rate", train_loader.dataset.pdist.rate,
                              epoch)
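# fast_collate is referenced above but not defined in this snippet.  The sketch
# below follows the uint8-stacking pattern from the NVIDIA Apex ImageNet
# example and assumes each dataset item is a (PIL.Image, label) pair; the SSL
# datasets here most likely return patch tensors instead, so treat this purely
# as an illustration of the pattern, not the project's actual collate function.
def fast_collate(batch):
    imgs = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
    w, h = imgs[0].size
    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
    for i, img in enumerate(imgs):
        array = np.asarray(img, dtype=np.uint8)
        if array.ndim < 3:
            array = np.expand_dims(array, axis=-1)
        array = np.rollaxis(array, 2)  # HWC -> CHW
        tensor[i] += torch.from_numpy(array)
    return tensor, targets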
Beispiel #22
0
def setup_and_run(args, criterion, device, train_loader, test_loader,
                  val_loader, logging, results):
    global BEST_ACC
    print("\n#### Running REF ####")

    # architecture
    if args.architecture == "MLP":
        model = models.MLP(args.input_dim, args.hidden_dim,
                           args.output_dim).to(device)
    elif args.architecture == "LENET300":
        model = models.LeNet300(args.input_dim, args.output_dim).to(device)
    elif args.architecture == "LENET5":
        model = models.LeNet5(args.input_channels, args.im_size,
                              args.output_dim).to(device)
    elif "VGG" in args.architecture:
        assert (args.architecture == "VGG11" or args.architecture == "VGG13"
                or args.architecture == "VGG16"
                or args.architecture == "VGG19")
        model = models.VGG(args.architecture, args.input_channels,
                           args.im_size, args.output_dim).to(device)
    elif args.architecture == "RESNET18":
        model = models.ResNet18(args.input_channels, args.im_size,
                                args.output_dim).to(device)
    elif args.architecture == "RESNET34":
        model = models.ResNet34(args.input_channels, args.im_size,
                                args.output_dim).to(device)
    elif args.architecture == "RESNET50":
        model = models.ResNet50(args.input_channels, args.im_size,
                                args.output_dim).to(device)
    elif args.architecture == "RESNET101":
        model = models.ResNet101(args.input_channels, args.im_size,
                                 args.output_dim).to(device)
    elif args.architecture == "RESNET152":
        model = models.ResNet152(args.input_channels, args.im_size,
                                 args.output_dim).to(device)
    else:
        print('Architecture type "{0}" not recognized, exiting ...'.format(
            args.architecture))
        exit()

    # optimizer
    if args.optimizer == "ADAM":
        optimizer = optim.Adam(model.parameters(),
                               lr=args.learning_rate,
                               weight_decay=args.weight_decay)
    elif args.optimizer == "SGD":
        optimizer = optim.SGD(
            model.parameters(),
            lr=args.learning_rate,
            momentum=args.momentum,
            nesterov=args.nesterov,
            weight_decay=args.weight_decay,
        )
    else:
        print('Optimizer type "{0}" not recognized, exiting ...'.format(
            args.optimizer))
        exit()

    # lr-scheduler
    if args.lr_decay == "STEP":
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=1,
                                              gamma=args.lr_scale)
    elif args.lr_decay == "EXP":
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer,
                                                     gamma=args.lr_scale)
    elif args.lr_decay == "MSTEP":
        x = args.lr_interval.split(",")
        lri = [int(v) for v in x]
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   milestones=lri,
                                                   gamma=args.lr_scale)
        args.lr_interval = 1  # lr_interval handled in scheduler!
    else:
        print('LR decay type "{0}" not recognized, exiting ...'.format(
            args.lr_decay))
        exit()

    init_weights(model, xavier=True)
    logging.info(model)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("Number of parameters: %d", num_parameters)

    start_epoch = -1
    iters = 0  # total no of iterations, used to do many things!
    # optionally resume from a checkpoint
    if args.eval:
        logging.info('Loading checkpoint file "{0}" for evaluation'.format(
            args.eval))
        if not os.path.isfile(args.eval):
            print(
                'Checkpoint file "{0}" for evaluation not recognized, exiting ...'
                .format(args.eval))
            exit()
        checkpoint = torch.load(args.eval)
        model.load_state_dict(checkpoint["state_dict"])

    elif args.resume:
        checkpoint_file = args.resume
        logging.info('Loading checkpoint file "{0}" to resume'.format(
            args.resume))
        if not os.path.isfile(checkpoint_file):
            print('Checkpoint file "{0}" not recognized, exiting ...'.format(
                checkpoint_file))
            exit()
        checkpoint = torch.load(checkpoint_file)
        start_epoch = checkpoint["epoch"]
        assert args.architecture == checkpoint["architecture"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        BEST_ACC = checkpoint["best_acc1"]
        iters = checkpoint["iters"]
        logging.debug("best_acc1: {0}, iters: {1}".format(BEST_ACC, iters))

    if not args.eval:
        logging.info("Training...")
        model.train()
        st = timer()

        for e in range(start_epoch + 1, args.num_epochs):
            for i, (data, target) in enumerate(train_loader):
                l = train_step(model, device, data, target, optimizer,
                               criterion)
                if i % args.log_interval == 0:
                    acc1, acc5 = evaluate(args,
                                          model,
                                          device,
                                          val_loader,
                                          training=True)
                    logging.info(
                        "Epoch: {0},\t Iter: {1},\t Loss: {loss:.5f},\t Val-Acc1: {acc1:.2f} "
                        "(Best: {best:.2f}),\t Val-Acc5: {acc5:.2f}".format(
                            e, i, loss=l, acc1=acc1, best=BEST_ACC, acc5=acc5))

                if iters % args.lr_interval == 0:
                    lr = args.learning_rate
                    for param_group in optimizer.param_groups:
                        lr = param_group["lr"]
                    scheduler.step()
                    for param_group in optimizer.param_groups:
                        if lr != param_group["lr"]:
                            logging.info("lr: {0}".format(
                                param_group["lr"]))  # print if changed
                iters += 1

            # save checkpoint
            acc1, acc5 = evaluate(args,
                                  model,
                                  device,
                                  val_loader,
                                  training=True)
            results.add(
                epoch=e,
                iteration=i,
                train_loss=l,
                val_acc1=acc1,
                best_val_acc1=BEST_ACC,
            )
            util.save_checkpoint(
                {
                    "epoch": e,
                    "architecture": args.architecture,
                    "state_dict": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "best_acc1": BEST_ACC,
                    "iters": iters,
                },
                is_best=False,
                path=args.save_dir,
            )
            results.save()

        et = timer()
        logging.info("Elapsed time: {0} seconds".format(et - st))

        acc1, acc5 = evaluate(args, model, device, val_loader, training=True)
        logging.info(
            "End of training, Val-Acc: {acc1:.2f} (Best: {best:.2f}), Val-Acc5: {acc5:.2f}"
            .format(acc1=acc1, best=BEST_ACC, acc5=acc5))
        # load saved model
        saved_model = torch.load(args.save_name)
        model.load_state_dict(saved_model["state_dict"])
    # end of training

    # eval-set
    if args.eval_set != "TRAIN" and args.eval_set != "TEST":
        print('Evaluation set "{0}" not recognized ...'.format(args.eval_set))

    logging.info("Evaluating REF on the {0} set...".format(args.eval_set))
    st = timer()
    if args.eval_set == "TRAIN":
        acc1, acc5 = evaluate(args, model, device, train_loader)
    else:
        acc1, acc5 = evaluate(args, model, device, test_loader)
    et = timer()
    logging.info("Accuracy: top-1: {acc1:.2f}, top-5: {acc5:.2f}%".format(
        acc1=acc1, acc5=acc5))
    logging.info("Elapsed time: {0} seconds".format(et - st))
Beispiel #23
0
def create_model():
    return models.ResNet18().cuda()
# the "conv_mode" key in the bound_opts parameter when constructing your
# BoundeModule object.  In this test we show the difference between Patches
# mode and Matrix mode in memory consumption.

device = 'cuda'
conv_mode = 'patches'  # conv_mode can be set as 'matrix' or 'patches'

seed = 1234
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)

## Step 1: Define the model
# model_ori = models.model_resnet(width=1, mult=4)
model_ori = models.ResNet18(in_planes=2)
# model_ori.load_state_dict(torch.load("data/cifar_base_kw.pth")['state_dict'][0])

## Step 2: Prepare dataset as usual
# test_data = torchvision.datasets.MNIST("./data", train=False, download=True, transform=torchvision.transforms.ToTensor())

normalize = torchvision.transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                             std=[0.2023, 0.1994, 0.2010])
test_data = torchvision.datasets.CIFAR10(
    "./data",
    train=False,
    download=True,
    transform=torchvision.transforms.Compose(
        [torchvision.transforms.ToTensor(), normalize]))
# For illustration we only use 1 image from dataset
N = 1
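# The original example is truncated here.  A minimal, hedged continuation using
# the auto_LiRPA API (from auto_LiRPA import BoundedModule, BoundedTensor;
# from auto_LiRPA.perturbations import PerturbationLpNorm); the eps value is an
# arbitrary choice for illustration, not taken from the source:
image = torch.stack([test_data[i][0] for i in range(N)]).to(device)  # (N, 3, 32, 32), normalized
model = BoundedModule(model_ori, image, bound_opts={"conv_mode": conv_mode}, device=device)
ptb = PerturbationLpNorm(norm=np.inf, eps=0.03)
image = BoundedTensor(image, ptb)
lb, ub = model.compute_bounds(x=(image,), method="backward")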
Beispiel #25
0
def main():

    batch_size = 256
    num_classes = 1000
    image_size = (128, 128)
    """ load dataset """
    dataset = loaders.ImageNetLoader('./datasets/ImageNet').load()
    train_dataset, valid_dataset = dataset
    """ processor """
    train_processor = processors.ImageNetClassificationProcessor(
        batch_size,
        num_classes=num_classes,
        enable_augmentation=True,
        image_size=image_size)
    valid_processor = processors.ImageNetClassificationProcessor(
        batch_size,
        num_classes=num_classes,
        enable_augmentation=False,
        image_size=image_size)
    """ iterator """
    train_iterator = iterators.MultiprocessIterator(train_dataset,
                                                    train_processor,
                                                    num_workers=4)
    valid_iterator = iterators.MultiprocessIterator(valid_dataset,
                                                    valid_processor,
                                                    num_workers=4)
    """ device """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    """ model """
    model = models.ResNet18(input_channels=3,
                            num_classes=num_classes).to(device)
    """ loss """
    loss_function = losses.CrossEntropyLoss().to(device)
    """ optimizer """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20)
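    # note: remember to call scheduler.step() once per epoch, otherwise the
    # cosine annealing schedule has no effect on the learning rate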
    """ logger """
    logger = loggers.SimpleLogger()
    """ learning """
    for epoch in range(10):
        print(f"-" * 64)
        print(f"[epoch {epoch:>4d}]")
        phase = 'train'
        model.train()  # put batch-norm/dropout layers into training mode
        torch.set_grad_enabled(True)
        for batch_data in tqdm.tqdm(train_iterator, desc=phase):
            optimizer.zero_grad()
            batch_image = torch.from_numpy(batch_data['image']).to(device)
            batch_target = torch.from_numpy(batch_data['target']).to(device)
            batch_output = model(batch_image)
            batch_loss = loss_function(batch_output, batch_target)
            batch_loss.sum().backward()
            optimizer.step()
            batch_loss = batch_loss.data.cpu().numpy()
            batch_label = np.argmax(batch_target.data.cpu().numpy(),
                                    axis=-1).flatten()
            batch_pred = np.argmax(batch_output.data.cpu().numpy(),
                                   axis=-1).flatten()
            logger.add_batch_loss(batch_loss, phase=phase)
            logger.add_batch_pred(batch_pred, phase=phase)
            logger.add_batch_label(batch_label, phase=phase)
        loss = logger.get_loss(phase)
        accuracy = logger.get_accuracy(phase)
        print(f"loss : {loss}")
        print(f"accuracy : {accuracy}")
        phase = 'valid'
        model.eval()  # use running batch-norm statistics during validation
        torch.set_grad_enabled(False)
        for batch_data in tqdm.tqdm(valid_iterator, desc=phase):
            optimizer.zero_grad()
            batch_image = torch.from_numpy(batch_data['image']).to(device)
            batch_target = torch.from_numpy(batch_data['target']).to(device)
            batch_output = model(batch_image)
            batch_loss = loss_function(batch_output, batch_target)
            batch_loss = batch_loss.data.cpu().numpy()
            batch_label = np.argmax(batch_target.data.cpu().numpy(),
                                    axis=-1).flatten()
            batch_pred = np.argmax(batch_output.data.cpu().numpy(),
                                   axis=-1).flatten()
            logger.add_batch_loss(batch_loss, phase=phase)
            logger.add_batch_pred(batch_pred, phase=phase)
            logger.add_batch_label(batch_label, phase=phase)
        loss = logger.get_loss(phase)
        accuracy = logger.get_accuracy(phase)
        print(f"loss : {loss:.4f}")
        print(f"accuracy : {accuracy:.4f}")
        logger.step()