Example #1
def run():
    print('Inside run!')
    global args
    args = parser.parse_args()
    print(args)
    # add checkpoint resume option
    # load datasets
    train_dataset = SketchData(root=path,
                               train=True,
                               transform=None,
                               target_transform=None)
    val_dataset = SketchData(root=path,
                             train=False,
                             transform=None,
                             target_transform=None)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.b,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.b,
                                             shuffle=False)
    model = AlexNet()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss()

    best_prec = 0
    for epoch in range(args.epochs):
        print('Epoch: ' + str(epoch))
        adjust_learning_rate(optimizer, epoch)
        print('Adjusted learning rate')
        train(train_loader, model, criterion, optimizer, epoch)
        print('Trained!')
        precision = validate(val_loader, model, criterion)
        print('Got precision!')
        # decide is_best *before* updating best_prec, otherwise the flag
        # passed to save_checkpoint below would always be False
        is_best = precision.item() > best_prec
        best_prec = max(precision.item(), best_prec)
        print('Updated best precision!')
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'best_prec1': best_prec,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, is_best)
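# Note: this example (and Example #17) calls adjust_learning_rate() and
# save_checkpoint() without defining them. A minimal sketch in the style of
# the official PyTorch ImageNet example; the decay schedule and filenames
# here are assumptions, not the original code:
import shutil

def adjust_learning_rate(optimizer, epoch):
    """Assumed schedule: decay the learning rate by 10x every 30 epochs."""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Always save the latest state; copy it aside when it is the best so far."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')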
Example #2
    def __init__(self, context: DeepSpeedTrialContext) -> None:
        self.context = context
        self.args = AttrDict(self.context.get_hparams())
        model = AlexNet(10)
        model = PipelineModule(
            layers=join_layers(model),
            loss_fn=torch.nn.CrossEntropyLoss(),
            num_stages=self.args.pipe_parallel_size,
            partition_method=self.args.part,
            activation_checkpoint_interval=0,
        )

        ds_config = overwrite_deepspeed_config(
            self.args.deepspeed_config,
            self.args.get("overwrite_deepspeed_args", {}))
        model_engine, optimizer, _, _ = deepspeed.initialize(
            args=self.args,
            model=model,
            model_parameters=[
                p for p in model.parameters() if p.requires_grad
            ],
            config=ds_config,
        )
        self.model_engine = self.context.wrap_model_engine(model_engine)
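# join_layers() is not shown above. DeepSpeed's PipelineModule expects a flat
# list of layers/callables rather than a nested nn.Module, so a plausible
# sketch for a torchvision-style AlexNet (an assumption, not the original
# helper) looks like:
import torch

def join_layers(vision_model):
    # Flatten the conv trunk and classifier head into one list so
    # PipelineModule can partition it into pipeline stages.
    return [
        *vision_model.features,
        vision_model.avgpool,
        lambda x: torch.flatten(x, 1),  # bridge conv output to the FC head
        *vision_model.classifier,
    ]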
Example #3
class TrainNetwork():
    def __init__(self, dataset, batch_size, epochs, lr, lr_decay_epoch,
                 momentum):
        assert (dataset == 'letters' or dataset == 'mnist')

        self.dataset = dataset
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.lr_decay_epoch = lr_decay_epoch
        self.momentum = momentum

        # letters contains 27 classes, digits contains 10 classes
        num_classes = 27 if dataset == 'letters' else 10

        # Load pre learned AlexNet with changed number of output classes
        state_dict = torch.load('./trained_models/alexnet.pth')
        state_dict['classifier.6.weight'] = torch.zeros(num_classes, 4096)
        state_dict['classifier.6.bias'] = torch.zeros(num_classes)
        self.model = AlexNet(num_classes)
        self.model.load_state_dict(state_dict)

        # Use cuda if available
        if torch.cuda.is_available():
            self.model.cuda()

        # Load training dataset
        kwargs = {
            'num_workers': 1,
            'pin_memory': True
        } if torch.cuda.is_available() else {}
        self.train_loader = torch.utils.data.DataLoader(
            EMNIST('./data',
                   dataset,
                   download=True,
                   transform=transforms.Compose([
                       transforms.Lambda(correct_rotation),
                       transforms.Lambda(random_transform),
                       transforms.Resize((224, 224)),
                       transforms.RandomResizedCrop(224, (0.9, 1.1),
                                                    ratio=(0.9, 1.1)),
                       transforms.Grayscale(3),
                       transforms.ToTensor(),
                   ])),
            batch_size=batch_size,
            shuffle=True,
            **kwargs)

        # Optimizer and loss function
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.lr,
                                   momentum=self.momentum)
        self.loss_fn = nn.CrossEntropyLoss()

    def reduce_learning_rate(self, epoch):
        """
        Reduce the learning rate by factor 0.1 every lr_decay_epoch
        :param optimizer: Optimizer containing the learning rate
        :param epoch: Current epoch
        :param init_lr: Initial learning rate
        :param lr_decay_epoch: Number of epochs until learning rate gets reduced
        :return: None
        """
        lr = self.lr * (0.1**(epoch // self.lr_decay_epoch))

        if epoch % self.lr_decay_epoch == 0:
            print('LR is set to {}'.format(lr))

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def train(self, epoch):
        """
        Train the model for one epoch and save the result as a .pth file
        :param epoch: Current epoch
        :return: None
        """
        self.model.train()

        train_loss = 0
        train_correct = 0
        progress = None
        for batch_idx, (data, target) in enumerate(self.train_loader):
            # Get data and label
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)

            # Optimize using backpropagation
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss_fn(output, target)
            train_loss += loss.item()
            pred = output.data.max(1, keepdim=True)[1]
            train_correct += pred.eq(target.data.view_as(pred)).sum().item()
            loss.backward()
            self.optimizer.step()

            # Print information about current step
            current_progress = int(100 * (batch_idx + 1) * self.batch_size /
                                   len(self.train_loader.dataset))
            if current_progress != progress and current_progress % 5 == 0:
                progress = current_progress
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (batch_idx + 1) * len(data),
                    len(self.train_loader.dataset), current_progress,
                    loss.item()))

        train_loss /= (len(self.train_loader.dataset) / self.batch_size)
        train_correct /= len(self.train_loader.dataset)
        train_correct *= 100

        # Print information about current epoch
        print(
            'Train Epoch: {} \tCorrect: {:3.2f}%\tAverage loss: {:.6f}'.format(
                epoch, train_correct, train_loss))

        # Save snapshot
        torch.save(
            {
                'model': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict()
            }, './trained_models/{}_{}.pth'.format(self.dataset, epoch))

    def start(self):
        """
        Start training the network
        :return: None
        """
        for epoch in range(1, self.epochs + 1):
            self.reduce_learning_rate(epoch)
            self.train(epoch)
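# A hedged usage sketch; the hyperparameter values below are placeholders,
# not taken from the original code:
trainer = TrainNetwork(dataset='letters', batch_size=64, epochs=20,
                       lr=0.01, lr_decay_epoch=10, momentum=0.9)
trainer.start()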
Example #4
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/ckpt.pth')
    net.load_state_dict(checkpoint['model'])
    best_acc = checkpoint['accuracy']
    start_epoch = checkpoint['epoch']
else:
    best_acc = 0
    start_epoch = 0



# hyperparameters
batch_size = args.batch_size
epoch_num = args.epochs

optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)


train_loader = DataLoader(data_train, shuffle=True, batch_size=batch_size, num_workers=8)
test_loader = DataLoader(data_test, shuffle=True, batch_size=batch_size, num_workers=8)

def train(epoch):
    model.train() # set model in training mode (need this because of dropout)
    correct = 0
    loss    = 0

    for batch_id, (data, label) in enumerate(train_loader):
        data = data.to(device)
        target = label.to(device)
        
        optimizer.zero_grad()
Example #5
import torch
import torch.nn as nn
import torch.optim as optim  # needed for optim.Adam below
from torch.autograd import Variable
from data_process import MyDataset
from torchvision import transforms, utils
from torch.utils.data import DataLoader

train_data = MyDataset(txt='./data/train.txt', transform=transforms.ToTensor())
train_loader = DataLoader(train_data, batch_size=50, shuffle=True)  # returns an iterable over batches
test_data = MyDataset(txt='./data/val.txt', transform=transforms.ToTensor())
test_loader = DataLoader(test_data, batch_size=50)

model = AlexNet().cuda()  # use the GPU: move the model to video memory
print(model)
#print(list(model.parameters()))
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

# start training
for epoch in range(30):
    print('epoch {}'.format(epoch + 1))

    #training------------------------
    train_loss = 0
    train_accu = 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()  # move the batch to video memory
        out = model(batch_x)
        loss = loss_func(out, batch_y)
        train_loss += loss.item()
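        # The snippet cuts off here. A minimal continuation of the loop body
        # under the usual PyTorch pattern (a sketch, not the original code):
        optimizer.zero_grad()   # clear stale gradients
        loss.backward()         # backpropagate
        optimizer.step()        # update weights

        # track training accuracy for this batch
        pred = out.argmax(dim=1)
        train_accu += (pred == batch_y).sum().item()

    print('loss: {:.4f}, acc: {:.4f}'.format(
        train_loss / len(train_loader), train_accu / len(train_data)))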
Example #6
def train(train_loader, eval_loader, opt):
    print('==> Start training...')

    summary_writer = SummaryWriter('./runs/' + str(int(time.time())))

    is_cuda = torch.cuda.is_available()
    model = AlexNet()
    if is_cuda:
        model = model.cuda()

    optimizer = optim.SGD(
        params=model.parameters(),
        lr=opt.base_lr,
        momentum=0.9,
    )
    criterion = nn.CrossEntropyLoss()

    best_eval_acc = -0.1
    losses = AverageMeter()
    accuracies = AverageMeter()
    global_step = 0
    for epoch in range(1, opt.epochs + 1):
        # train
        model.train()
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            global_step += 1
            if is_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            losses.update(loss.item(), outputs.shape[0])
            summary_writer.add_scalar('train/loss', loss, global_step)

            _, preds = torch.max(outputs, dim=1)
            acc = preds.eq(targets).sum().item() / len(targets)
            accuracies.update(acc)
            summary_writer.add_scalar('train/acc', acc, global_step)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                  global_step)
        print(
            '==> Epoch: %d; Average Train Loss: %.4f; Average Train Acc: %.4f'
            % (epoch, losses.avg, accuracies.avg))

        # eval
        model.eval()
        losses.reset()
        accuracies.reset()
        for batch_idx, (inputs, targets) in enumerate(eval_loader):
            if is_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            losses.update(loss.item(), outputs.shape[0])

            _, preds = torch.max(outputs, dim=1)
            acc = preds.eq(targets).sum().item() / len(targets)
            accuracies.update(acc)

        summary_writer.add_scalar('eval/loss', losses.avg, global_step)
        summary_writer.add_scalar('eval/acc', accuracies.avg, global_step)
        if accuracies.avg > best_eval_acc:
            best_eval_acc = accuracies.avg
            torch.save(model, './weights/best.pt')
        print(
            '==> Epoch: %d; Average Eval Loss: %.4f; Average/Best Eval Acc: %.4f / %.4f'
            % (epoch, losses.avg, accuracies.avg, best_eval_acc))
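# AverageMeter is used here (and in several other examples) but never defined.
# The standard definition from the PyTorch ImageNet example is a reasonable
# assumption for what this snippet imports (Example #7 uses a variant that
# also takes a name and a format string):
class AverageMeter:
    """Tracks the current value and running average of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count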
Example #7
def main():

    #gpus=[4,5,6,7]
    gpus = [0]
    print("GPUs :", gpus)
    print("prepare data")
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_tfs = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])

    val_tfs = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(), normalize
    ])

    train_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/train',
                                    train_tfs)
    val_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/val', val_tfs)

    train_ld = torch.utils.data.DataLoader(train_ds,
                                           batch_size=256,
                                           shuffle=True,
                                           num_workers=4,
                                           pin_memory=True)

    val_ld = torch.utils.data.DataLoader(val_ds,
                                         batch_size=64,
                                         shuffle=False,
                                         num_workers=4,
                                         pin_memory=True)

    print("construct model")
    #model = ResNet50()
    #model=torchvision.models.AlexNet()
    model = AlexNet()
    #model = torch.nn.DataParallel(model, device_ids=gpus).cuda(gpus[0])
    model.cuda()

    criterion = nn.CrossEntropyLoss().cuda(gpus[0])
    optimizer = torch.optim.SGD(model.parameters(),
                                0.01,
                                momentum=0.875,
                                weight_decay=3.0517578125e-05)

    model.train()
    print("begin trainning")
    for epoch in range(0, 50):
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        top1 = AverageMeter('Acc@1', ':6.2f')
        top5 = AverageMeter('Acc@5', ':6.2f')

        progress = ProgressMeter(len(train_ld),
                                 [batch_time, data_time, losses, top1, top5],
                                 prefix="Epoch: [{}]".format(epoch))

        end = time.time()
        for i, (images, labels) in enumerate(train_ld):
            data_time.update(time.time() - end)
            print('image shape: ', images.shape)
            print('labels shape: ', labels.shape)
            images = images.cuda(gpus[0], non_blocking=True)
            labels = labels.cuda(gpus[0], non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # measure accuracy
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            if i % 10 == 0:
                progress.display(i)
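# accuracy(), AverageMeter and ProgressMeter come from the PyTorch ImageNet
# example; a sketch of the top-k accuracy helper this loop assumes:
def accuracy(output, target, topk=(1,)):
    """Compute precision@k for the given values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res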
Example #8
    dataset = torchvision.datasets.ImageFolder(path, transform=transformation)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=64,
                                         num_workers=0,
                                         shuffle=shuffle)
    return loader


# https://github.com/fastai/imagenette
# https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz
train_loader = train_dataset(r'data/imagenette2-320/train', shuffle=True)
val_loader = val_dataset(r'data/imagenette2-320/val', shuffle=False)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='max',
                                                       factor=0.1,
                                                       patience=2,
                                                       verbose=True,
                                                       threshold=0.1,
                                                       threshold_mode='rel',
                                                       cooldown=0,
                                                       min_lr=0,
                                                       eps=1e-08)
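
# ReduceLROnPlateau only acts when it is fed a metric: with mode='max' the
# intended pattern is to call scheduler.step() once per epoch with the
# validation accuracy. A hedged sketch (validate() is an assumed helper
# returning that accuracy):
#
#     for epoch in range(epochs):
#         train(epoch)
#         val_acc = validate(net, val_loader)
#         scheduler.step(val_acc)  # LR drops when accuracy plateaus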


def train(epoch):
    net.train()
Example #9
    num_workers=2
)

####################################################################
# 
# 			Load the model
# 
####################################################################


model = AlexNet().to(device)
criterion = nn.CrossEntropyLoss()

# Stochastic gradient descent
optimizer = optim.SGD(
	model.parameters(),
	lr=0.01,
	weight_decay=0.0005,
	momentum=0.9,
)


def test_model(model, epoch):
	
	correct = 0
	total = 0
	with torch.no_grad():

		for i, data in enumerate(test_loader, 0):
			images, labels, _ = data
			images, labels = images.to(device), labels.to(device)
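			# The loop is cut off here; a minimal continuation under the usual
			# evaluation pattern (a sketch, not the original code):
			outputs = model(images)
			_, predicted = torch.max(outputs, 1)
			total += labels.size(0)
			correct += (predicted == labels).sum().item()

	print('Epoch {}: test accuracy {:.2f}%'.format(epoch, 100.0 * correct / total))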
Example #10
        name = k[7:]  # remove `module.`
        # name = k[9:]  # remove `module.1.`
        new_check_point[name] = v
    net.load_state_dict(new_check_point)


net = net.to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True



criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)

#
# # Training
# def main():
#     AV = [[] for _ in range(5)]
#     MAV = [[] for _ in range(5)]
#     with torch.no_grad():
#         for batch_idx, (inputs, targets) in enumerate(trainloader):
#             print("Processing {} batches in {}".format(batch_idx, 25000//args.bs))
#             inputs, targets = inputs.to(device), targets.to(device)
#             outputs = net(inputs)
#             for score, t in zip(outputs, targets):
#                 if np.argmax(score) == t:
#                     AV[t].append(score.unsqueeze(dim=0))
#     for i in range(5):
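# The loop at the top of this example strips the 'module.' prefix that
# torch.nn.DataParallel adds to parameter names, so that a checkpoint saved
# from a DataParallel model can be loaded into a plain model. A self-contained
# sketch of the full recipe (the checkpoint key 'net' is an assumption):
from collections import OrderedDict

checkpoint = torch.load('./checkpoint/ckpt.pth')
new_check_point = OrderedDict()
for k, v in checkpoint['net'].items():
    name = k[7:] if k.startswith('module.') else k  # drop 'module.'
    new_check_point[name] = v
net.load_state_dict(new_check_point)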
Example #11
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
           'ship', 'truck')

# Model
print('==> Building model..')

net = AlexNet(5)

net = net.to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(),
                      lr=args.lr,
                      momentum=0.9,
                      weight_decay=5e-4)


# Training
def train(epoch):
    adjust_learning_rate(optimizer, epoch, args.lr)
    print('\nEpoch: %d   Learning rate: %f' %
          (epoch, optimizer.param_groups[0]['lr']))
    net.train()
    train_loss = 0
    correct = 0
    total = 0
Example #12
image_datasets = {
    'train': ImageFolder(train_dir, transform=data_transforms['train']),
    'val': ImageFolder(val_dir, transform=data_transforms['val']),
}

data_loader = {
    x: torch.utils.data.DataLoader(image_datasets[x],
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=num_workers)
    for x in ['train', 'val']
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),
                       lr=learning_rate,
                       weight_decay=weight_decay)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                 'min',
                                                 verbose=True)

model = model.to(device)
model = train(model,
              data_loader,
              criterion,
              optimizer,
              scheduler,
              num_epochs=num_epoches)

n_datapoints = len(model.train_epoch_loss)
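# train() is not shown above. Given the call signature and the
# model.train_epoch_loss attribute read back afterwards, a plausible skeleton
# (an assumption about the original helper, not its actual code):
def train(model, data_loader, criterion, optimizer, scheduler, num_epochs=25):
    model.train_epoch_loss = []  # per-epoch training loss, read by the caller
    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            model.train(phase == 'train')  # train/eval mode toggles dropout etc.
            running_loss = 0.0
            for inputs, labels in data_loader[phase]:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item() * inputs.size(0)
            epoch_loss = running_loss / len(data_loader[phase].dataset)
            if phase == 'train':
                model.train_epoch_loss.append(epoch_loss)
            else:
                scheduler.step(epoch_loss)  # ReduceLROnPlateau('min') wants val loss
    return model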
Example #13
def main():
    progress = default_progress()
    experiment_dir = 'experiment/miniplaces'
    # Here's our data
    train_loader = torch.utils.data.DataLoader(CachedImageFolder(
        'dataset/miniplaces/simple/train',
        transform=transforms.Compose([
            transforms.Resize(128),
            transforms.RandomCrop(119),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
        ])),
                                               batch_size=64,
                                               shuffle=True,
                                               num_workers=6,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(CachedImageFolder(
        'dataset/miniplaces/simple/val',
        transform=transforms.Compose([
            transforms.Resize(128),
            transforms.CenterCrop(119),
            transforms.ToTensor(),
            transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
        ])),
                                             batch_size=512,
                                             shuffle=False,
                                             num_workers=6,
                                             pin_memory=True)
    # Create a simplified AlexNet with half resolution.
    model = AlexNet(first_layer='conv1',
                    last_layer='fc8',
                    layer_sizes=dict(fc6=2048, fc7=2048),
                    output_channels=100,
                    half_resolution=True,
                    include_lrn=False,
                    split_groups=False).cuda()
    # Use Kaiming initialization for the weights
    for name, val in model.named_parameters():
        if 'weight' in name:
            init.kaiming_uniform_(val)
        else:
            # Init positive bias in many layers to avoid dead neurons.
            assert 'bias' in name
            init.constant_(
                val, 0 if any(
                    name.startswith(layer)
                    for layer in ['conv1', 'conv3', 'fc8']) else 1)
    # An abbreviated training schedule: 40000 batches.
    # TODO: tune these hyperparameters.
    init_lr = 0.002
    # max_iter = 40000 - 34.5% @1
    # max_iter = 50000 - 37% @1
    # max_iter = 80000 - 39.7% @1
    # max_iter = 100000 - 40.1% @1
    max_iter = 100000
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=init_lr,
        momentum=0.9,
        weight_decay=0.001)
    iter_num = 0
    best = dict(val_accuracy=0.0)
    model.train()
    # Oh, hold on.  Let's actually resume training if we already have a model.
    checkpoint_filename = 'miniplaces.pth.tar'
    best_filename = 'best_%s' % checkpoint_filename
    best_checkpoint = os.path.join(experiment_dir, best_filename)
    try_to_resume_training = False
    if try_to_resume_training and os.path.exists(best_checkpoint):
        checkpoint = torch.load(os.path.join(experiment_dir, best_filename))
        iter_num = checkpoint['iter']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best['val_accuracy'] = checkpoint['accuracy']

    def save_checkpoint(state, is_best):
        filename = os.path.join(experiment_dir, checkpoint_filename)
        ensure_dir_for(filename)
        torch.save(state, filename)
        if is_best:
            shutil.copyfile(filename,
                            os.path.join(experiment_dir, best_filename))

    def validate_and_checkpoint():
        model.eval()
        val_loss, val_acc = AverageMeter(), AverageMeter()
        for input, target in progress(val_loader):
            # Load data
            input_var, target_var = [
                Variable(d.cuda(non_blocking=True)) for d in [input, target]
            ]
            # Evaluate model
            with torch.no_grad():
                output = model(input_var)
                loss = criterion(output, target_var)
                _, pred = output.max(1)
                accuracy = (target_var.eq(pred)
                            ).data.float().sum().item() / input.size(0)
            val_loss.update(loss.data.item(), input.size(0))
            val_acc.update(accuracy, input.size(0))
            # Check accuracy
            post_progress(l=val_loss.avg, a=val_acc.avg)
        # Save checkpoint
        save_checkpoint(
            {
                'iter': iter_num,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'accuracy': val_acc.avg,
                'loss': val_loss.avg,
            }, val_acc.avg > best['val_accuracy'])
        best['val_accuracy'] = max(val_acc.avg, best['val_accuracy'])
        post_progress(v=val_acc.avg)

    # Here is our training loop.
    while iter_num < max_iter:
        for input, target in progress(train_loader):
            # Track the average training loss/accuracy for each epoch.
            train_loss, train_acc = AverageMeter(), AverageMeter()
            # Load data
            input_var, target_var = [
                Variable(d.cuda(non_blocking=True)) for d in [input, target]
            ]
            # Evaluate model
            output = model(input_var)
            loss = criterion(output, target_var)
            train_loss.update(loss.data.item(), input.size(0))
            # Perform one step of SGD
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Also check training set accuracy
            _, pred = output.max(1)
            accuracy = (target_var.eq(pred)).data.float().sum().item() / (
                input.size(0))
            train_acc.update(accuracy)
            remaining = 1 - iter_num / float(max_iter)
            post_progress(l=train_loss.avg,
                          a=train_acc.avg,
                          v=best['val_accuracy'])
            # Advance
            iter_num += 1
            if iter_num >= max_iter:
                break
            # Linear learning rate decay
            lr = init_lr * remaining
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # Occasionally check validation set accuracy and checkpoint
            if iter_num % 1000 == 0:
                validate_and_checkpoint()
                model.train()
Example #14
class Solver(object):
    def __init__(self, config):
        self.model = None
        self.name = config.name
        self.lr = config.lr
        self.momentum = config.momentum
        self.beta = config.beta
        self.max_alpha = config.max_alpha
        self.epochs = config.epochs
        self.patience = config.patience
        self.N = config.N
        self.batch_size = config.batch_size
        self.random_labels = config.random_labels
        self.use_bn = config.batchnorm
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.device = None
        self.cuda = config.cuda
        self.train_loader = None
        self.test_loader = None

    def load_data(self):
        # ToTensor scales pixel values from [0,255] to [0,1]
        mean_var = (125.3 / 255, 123.0 / 255,
                    113.9 / 255), (63.0 / 255, 62.1 / 255, 66.7 / 255)
        transform = transforms.Compose([
            transforms.CenterCrop(28),
            transforms.ToTensor(),
            transforms.Normalize(*mean_var, inplace=True)
        ])
        train_set = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 download=DOWNLOAD,
                                                 transform=transform)
        test_set = torchvision.datasets.CIFAR10(root='./data',
                                                train=False,
                                                download=DOWNLOAD,
                                                transform=transform)

        if self.random_labels:
            np.random.shuffle(train_set.targets)
            np.random.shuffle(test_set.targets)

        assert self.N <= 50000
        if self.N < 50000:
            train_set.data = train_set.data[:self.N]
            # downsize the test set to improve speed for small N
            test_set.data = test_set.data[:self.N]

        self.train_loader = torch.utils.data.DataLoader(
            dataset=train_set,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True)
        self.test_loader = torch.utils.data.DataLoader(
            dataset=test_set,
            batch_size=self.batch_size,
            shuffle=False,
            drop_last=True)

    def load_model(self):
        if self.cuda:
            self.device = torch.device('cuda')
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')

        self.model = AlexNet(device=self.device,
                             B=self.batch_size,
                             max_alpha=self.max_alpha,
                             use_bn=self.use_bn).to(self.device)

        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.lr,
                                   momentum=self.momentum)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=140)
        self.criterion = nn.NLLLoss().to(self.device)

    def getIw(self):
        # Iw should be normalized with respect to N
        # via reparameterization, we optimize alpha with only 1920 dimensions
        # but Iw should scale with the dimension of the weights
        return 7 * 7 * 64 * 384 / 1920 * self.model.getIw() / self.batch_size

    def do_batch(self, train, epoch):
        loader = self.train_loader if train else self.test_loader
        total_ce, total_Iw, total_loss = 0, 0, 0
        total_correct = 0
        total = 0
        pbar = tqdm(loader)
        num_batches = len(loader)
        for batch_num, (data, target) in enumerate(pbar):
            data, target = data.to(self.device), target.to(self.device)
            if train:
                self.optimizer.zero_grad()
            output = self.model(data)
            # NLLLoss is averaged across observations for each minibatch
            ce = self.criterion(torch.log(output + EPS), target)
            Iw = self.getIw()
            loss = ce + 0.5 * self.beta * Iw
            if train:
                loss.backward()
                self.optimizer.step()
            total_ce += ce.item()
            total_Iw += Iw.item()
            total_loss += loss.item()
            # torch.max over dim=1 returns (values, indices) across classes
            prediction = torch.max(output, 1)
            total_correct += np.sum(
                prediction[1].cpu().numpy() == target.cpu().numpy())
            total += target.size(0)

            a = self.model.get_a()
            pbar.set_description('Train' if train else 'Test')
            pbar.set_postfix(N=self.N,
                             b=self.beta,
                             ep=epoch,
                             acc=100. * total_correct / total,
                             loss=total_loss / num_batches,
                             ce=total_ce / num_batches,
                             Iw=total_Iw / num_batches,
                             a=a)
        return total_correct / total, total_loss / num_batches, total_ce / num_batches, total_Iw / num_batches, a

    def train(self, epoch):
        self.model.train()
        return self.do_batch(train=True, epoch=epoch)

    def test(self, epoch):
        self.model.eval()
        with torch.no_grad():
            return self.do_batch(train=False, epoch=epoch)

    def save(self, name=None):
        model_out_path = (name or self.name) + ".pth"
        # torch.save(self.model, model_out_path)
        # print("Checkpoint saved to {}".format(model_out_path))

    def run(self):
        self.load_data()
        self.load_model()
        results = []
        best_acc, best_ep = -1, -1
        for epoch in range(1, self.epochs + 1):
            # print("\n===> epoch: %d/200" % epoch)
            train_acc, train_loss, train_ce, train_Iw, train_a = self.train(
                epoch)
            self.scheduler.step(epoch)
            test_acc, test_loss, test_ce, test_Iw, test_a = self.test(epoch)
            results.append([
                self.N, self.beta, train_acc, test_acc, train_loss, test_loss,
                train_ce, test_ce, train_Iw, test_Iw, train_a, test_a
            ])

            if test_acc > best_acc:
                best_acc, best_ep = test_acc, epoch
            if self.patience >= 0:  # early stopping
                if best_ep < epoch - self.patience:
                    break

        with open(self.name + '.csv', 'a') as f:
            w = csv.writer(f)
            w.writerows(results)
        self.save()

        return train_acc, test_acc
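# A hedged usage sketch; config is any attribute-style namespace carrying the
# fields read in __init__ (the values below are placeholders, not originals):
from argparse import Namespace

config = Namespace(name='alexnet_cifar10', lr=0.01, momentum=0.9, beta=1e-3,
                   max_alpha=1.0, epochs=200, patience=20, N=50000,
                   batch_size=128, random_labels=False, batchnorm=True,
                   cuda=torch.cuda.is_available())
train_acc, test_acc = Solver(config).run()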
Example #15
def run_experiment(args):
    torch.manual_seed(args.seed)
    if not args.no_cuda:
        torch.cuda.manual_seed(args.seed)

    # Dataset
    if args.dataset == 'mnist':
        train_loader, test_loader, _, val_data = prepare_mnist(args)
    else:
        create_val_img_folder(args)
        train_loader, test_loader, _, val_data = prepare_imagenet(args)
    idx_to_class = {i: c for c, i in val_data.class_to_idx.items()}

    # Model & Criterion
    if args.model == 'AlexNet':
        if args.pretrained:
            model = models.__dict__['alexnet'](pretrained=True)
            # Change the last layer
            in_f = model.classifier[-1].in_features
            model.classifier[-1] = nn.Linear(in_f, args.classes)
        else:
            model = AlexNet(args.classes)
        criterion = nn.CrossEntropyLoss(reduction='sum')  # size_average=False in older PyTorch
    else:
        model = SVM(args.features, args.classes)
        criterion = MultiClassHingeLoss(margin=args.margin, size_average=False)
    if not args.no_cuda:
        model.cuda()

    # Load saved model and test on it
    if args.load:
        model.load_state_dict(torch.load(args.model_path))
        val_acc = test(model, criterion, test_loader, 0, [], [], idx_to_class,
                       args)

    # Optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters())
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum)

    total_minibatch_count = 0
    val_acc = 0
    train_losses, train_accs = [], []
    val_losses, val_accs = [], []

    # Train and test
    for epoch in range(1, args.epochs + 1):
        total_minibatch_count = train(model, criterion, optimizer,
                                      train_loader, epoch,
                                      total_minibatch_count, train_losses,
                                      train_accs, args)

        val_acc = test(model, criterion, test_loader, epoch, val_losses,
                       val_accs, idx_to_class, args)

    # Save model
    if args.save:
        if not os.path.exists(args.models_dir):
            os.makedirs(args.models_dir)
        filename = '_'.join(
            [args.prefix, args.dataset, args.model, 'model.pt'])
        torch.save(model.state_dict(), os.path.join(args.models_dir, filename))

    # Plot graphs
    fig, axes = plt.subplots(1, 4, figsize=(13, 4))
    axes[0].plot(train_losses)
    axes[0].set_title('Loss')
    axes[1].plot(train_accs)
    axes[1].set_title('Acc')
    axes[1].set_ylim([0, 1])
    axes[2].plot(val_losses)
    axes[2].set_title('Val loss')
    axes[3].plot(val_accs)
    axes[3].set_title('Val Acc')
    axes[3].set_ylim([0, 1])
    # Images don't show on Ubuntu
    # plt.tight_layout()

    # Save results
    if not os.path.exists(args.results_dir):
        os.makedirs(args.results_dir)
    filename = '_'.join([args.prefix, args.dataset, args.model, 'plot.png'])
    fig.suptitle(filename)
    fig.savefig(os.path.join(args.results_dir, filename))
Example #16
    logger = Logger('./largecnnlogs')

    lossfunction = nn.MSELoss()

    dataset = Rand_num()
    sampler = RandomSampler(dataset)
    loader = DataLoader(dataset,
                        batch_size=20,
                        sampler=sampler,
                        shuffle=False,
                        num_workers=1,
                        drop_last=True)
    net = AlexNet(3)
    #net.load_state_dict(torch.load(SAVE_PATH))
    net.cuda()
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    for epoch in range(10000):
        for i, data in enumerate(loader, 0):
            net.zero_grad()
            video, labels = data
            video = video.view(-1, 3, 227, 227)
            labels = labels.view(-1, 3)
            labels = torch.squeeze(Variable(labels.float().cuda()))
            video = torch.squeeze(Variable((video.float() / 256).cuda()))
            net.train()
            outputs = net(video)  # call the module directly rather than .forward()
            loss = lossfunction(outputs, labels)
            loss.backward()
            optimizer.step()
            if i == 0:
                # save a snapshot once per epoch, on the first batch
                torch.save(net.state_dict(), SAVE_PATH)
Example #17
def run():
    global args
    args = parser.parse_args()

    if args.resume:
        print('Resuming from checkpoint!')
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # load datasets, split train and test
    '''
    train_dataset = SketchData(root=path,
                            train=True, 
                            transform=transforms.ToTensor(),
                            target_transform=transforms.ToTensor(),
                            )
    test_dataset = SketchData(root=path,
                            train=False, 
                            transform=transforms.ToTensor(),
                            target_transform=transforms.ToTensor(),
                            )
    '''
    train_dataset = SketchData(
        root=path,
        train=True,
        transform=None,
        target_transform=None,
    )
    val_dataset = SketchData(
        root=path,
        train=False,
        transform=None,
        target_transform=None,
    )
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.b,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.b,
                                             shuffle=False)

    # create model, set parameters, optimiser, loss
    model = AlexNet()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss()

    best_prec = 0
    for epoch in range(args.epochs):
        adjust_learning_rate(optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch)
        precision = validate(val_loader, model, criterion)
        # decide is_best *before* updating best_prec, otherwise the flag
        # passed to save_checkpoint would always be False
        is_best = precision.item() > best_prec
        best_prec = max(precision.item(), best_prec)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'best_prec1': best_prec,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, is_best)
Example #18
                               pretrained=False)
    elif args.model == 'resnext':
        model = resnext50(img_channel=3, num_classes=19)
    elif args.model == 'alexnet':
        model = AlexNet(num_classes=19)
    else:
        raise ValueError("This utility can't train that kind of model.")

    logging.info("Setting dataset")
    torch.cuda.empty_cache()
    train_dataset = TrainImageDataset(
        args.data,
        224,
        224,
    )
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.CrossEntropyLoss()
    config = wandb.config

    # wandb config specification
    config.learning_rate = args.lr
    config.batch_size = args.batch_size
    config.model = args.model

    logging.info("Training...")
    train_loss, train_acc, test_acc, test_loss = train_for_classification(
        net=model,
        dataset=train_dataset,
        batch_size=args.batch_size,
        optimizer=optimizer,
Example #19
                            train=False,
                            download=True,
                            transform=transform)

# Define test batch data
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Define the loss function and optimization method (using SGD)
net = AlexNet().to(device)
criterion = nn.CrossEntropyLoss()  # cross-entropy, the usual choice for multi-class problems
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)


# Train and save model parameters
def train():

    for epoch in range(EPOCH):
        sum_loss = 0.0
        # Data read
        for i, data in enumerate(trainloader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Clear gradients
            optimizer.zero_grad()
def train_generic_model(model_name="alexnet",
                        dataset="custom",
                        num_classes=-1,
                        batch_size=8,
                        is_transform=1,
                        num_workers=2,
                        lr_decay=1,
                        l2_reg=0,
                        hdf5_path="dataset-bosch-224x224.hdf5",
                        trainset_dir="./TRAIN_data_224_v8",
                        testset_dir="./TEST_data_224_v8",
                        convert_grey=False):
    CHKPT_PATH = "./checkpoint_{}.PTH".format(model_name)
    print("CUDA:")
    print(torch.cuda.is_available())
    if is_transform:

        trans_ls = []
        if convert_grey:
            trans_ls.append(transforms.Grayscale(num_output_channels=1))
        trans_ls.extend([
            transforms.Resize((224, 224)),
            # transforms.RandomCrop((224, 224)),
            # transforms.Grayscale(num_output_channels=1),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        transform = transforms.Compose(trans_ls)
    else:
        transform = None

    print("DATASET FORMAT: {}".format(dataset))
    print("TRAINSET PATH: {}".format(trainset_dir))
    print("TESTSET PATH: {}".format(testset_dir))
    print("HDF5 PATH: {}".format(hdf5_path))
    if dataset == "custom":
        trainset = torchvision.datasets.ImageFolder(root=trainset_dir,
                                                    transform=transform)
        train_size = len(trainset)
        testset = torchvision.datasets.ImageFolder(root=testset_dir,
                                                   transform=transform)
        test_size = len(testset)
    elif dataset == "cifar":
        trainset = torchvision.datasets.CIFAR10(root="CIFAR_TRAIN_data",
                                                train=True,
                                                download=True,
                                                transform=transform)
        train_size = len(trainset)
        testset = torchvision.datasets.CIFAR10(root="CIFAR_TEST_data",
                                               train=False,
                                               download=True,
                                               transform=transform)
        test_size = len(testset)
    elif dataset == "hdf5":
        if num_workers == 1:
            trainset = Hdf5Dataset(hdf5_path,
                                   transform=transform,
                                   is_test=False)
        else:
            trainset = Hdf5DatasetMPI(hdf5_path,
                                      transform=transform,
                                      is_test=False)
        train_size = len(trainset)
        if num_workers == 1:
            testset = Hdf5Dataset(hdf5_path, transform=transform, is_test=True)
        else:
            testset = Hdf5DatasetMPI(hdf5_path,
                                     transform=transform,
                                     is_test=True)
        test_size = len(testset)

    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)

    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=num_workers)
    if model_name == "alexnet":
        net = AlexNet(num_classes=num_classes)
    elif model_name == "lenet5":
        net = LeNet5(num_classes=num_classes)
    elif model_name == "stn-alexnet":
        net = STNAlexNet(num_classes=num_classes)
    elif model_name == "stn-lenet5":
        net = LeNet5STN(num_classes=num_classes)
    elif model_name == "capsnet":
        net = CapsuleNet(num_classes=num_classes)
    elif model_name == "convneta":
        net = ConvNetA(num_classes=num_classes)
    elif model_name == "convnetb":
        net = ConvNetB(num_classes=num_classes)
    elif model_name == "convnetc":
        net = ConvNetC(num_classes=num_classes)
    elif model_name == "convnetd":
        net = ConvNetD(num_classes=num_classes)
    elif model_name == "convnete":
        net = ConvNetE(num_classes=num_classes)
    elif model_name == "convnetf":
        net = ConvNetF(num_classes=num_classes)
    elif model_name == "convnetg":
        net = ConvNetG(num_classes=num_classes)
    elif model_name == "convneth":
        net = ConvNetH(num_classes=num_classes)
    elif model_name == "convneti":
        net = ConvNetI(num_classes=num_classes)
    elif model_name == "convnetj":
        net = ConvNetJ(num_classes=num_classes)
    elif model_name == "convnetk":
        net = ConvNetK(num_classes=num_classes)
    elif model_name == "convnetl":
        net = ConvNetL(num_classes=num_classes)
    elif model_name == "convnetm":
        net = ConvNetM(num_classes=num_classes)
    elif model_name == "convnetn":
        net = ConvNetN(num_classes=num_classes)
    elif model_name == "resnet18":
        net = models.resnet18(pretrained=False, num_classes=num_classes)

    print(net)

    if torch.cuda.is_available():
        net = net.cuda()

    if model_name == "capsnet":
        criterion = CapsuleLoss()
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = optim.SGD(net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=0.9,
                          weight_decay=l2_reg)

    if lr_decay:
        scheduler = ReduceLROnPlateau(optimizer, 'min')

    best_acc = 0
    from_epoch = 0

    if os.path.exists(CHKPT_PATH):
        print("Checkpoint Found: {}".format(CHKPT_PATH))
        state = torch.load(CHKPT_PATH)
        net.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        best_acc = state['best_accuracy']
        from_epoch = state['epoch']

    for epoch in range(from_epoch, NUM_EPOCHS):
        #print("Epoch: {}/{}".format(epoch + 1, NUM_EPOCHS))
        epoch_loss = 0
        correct = 0
        for i, data in enumerate(train_loader, 0):
            #print("Train \t Epoch: {}/{} \t Batch: {}/{}".format(epoch + 1,
            #                                            NUM_EPOCHS,
            #                                            i + 1,
            #                                            ceil(train_size / BATCH_SIZE)))
            inputs, labels = data
            inputs, labels = Variable(inputs).type(torch.FloatTensor),\
                             Variable(labels).type(torch.LongTensor)

            if model_name == "capsnet":
                inputs = augmentation(inputs)
                ground_truth = torch.eye(num_classes).index_select(
                    dim=0, index=labels)

            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()

            optimizer.zero_grad()

            if model_name == "capsnet":
                classes, reconstructions = net(inputs, ground_truth)
                loss = criterion(inputs, ground_truth, classes,
                                 reconstructions)
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            if model_name != "capsnet":
                log_outputs = F.softmax(outputs, dim=1)
            else:
                log_outputs = classes
            pred = log_outputs.data.max(1, keepdim=True)[1]
            correct += pred.eq(labels.data.view_as(pred)).sum().item()

        print(
            "Epoch: {} \t Training Loss: {:.4f} \t Training Accuracy: {:.2f} \t {}/{}"
            .format(epoch + 1, epoch_loss / train_size,
                    100 * correct / train_size, correct, train_size))

        correct = 0
        test_loss = 0
        for i, data in enumerate(test_loader, 0):
            # print("Test \t Epoch: {}/{} \t Batch: {}/{}".format(epoch + 1,
            #                                             NUM_EPOCHS,
            #                                             i + 1,
            #                                             ceil(test_size / BATCH_SIZE)))
            inputs, labels = data
            inputs, labels = Variable(inputs).type(
                torch.FloatTensor), Variable(labels).type(torch.LongTensor)

            if model_name == "capsnet":
                inputs = augmentation(inputs)
                ground_truth = torch.eye(num_classes).index_select(
                    dim=0, index=labels)

            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()

            if model_name == "capsnet":
                classes, reconstructions = net(inputs)
                loss = criterion(inputs, ground_truth, classes,
                                 reconstructions)
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)

            test_loss += loss.item()

            if model_name != "capsnet":
                log_outputs = F.softmax(outputs, dim=1)
            else:
                log_outputs = classes

            pred = log_outputs.data.max(1, keepdim=True)[1]
            correct += pred.eq(labels.data.view_as(pred)).sum().item()
        print(
            "Epoch: {} \t Testing Loss: {:.4f} \t Testing Accuracy: {:.2f} \t {}/{}"
            .format(epoch + 1, test_loss / test_size,
                    100 * correct / test_size, correct, test_size))
        if correct >= best_acc:
            if not os.path.exists("./models"):
                os.mkdir("./models")
            torch.save(
                net.state_dict(),
                "./models/model-{}-{}-{}-{}-val-acc-{:.2f}-train-{}-test-{}-epoch-{}.pb"
                .format(model_name, dataset, hdf5_path, str(datetime.now()),
                        100 * correct / test_size,
                        trainset_dir.replace(" ", "_").replace("/", "_"),
                        testset_dir.replace(" ", "_").replace("/",
                                                              "_"), epoch + 1))
        best_acc = max(best_acc, correct)

        # save checkpoint path
        state = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_accuracy': best_acc
        }
        torch.save(state, CHKPT_PATH)

        if lr_decay:
            # Note that step should be called after validate()
            scheduler.step(test_loss)

    print('Finished Training')

    print("")
    print("")