class TrainNetwork():
    """Fine-tune a pre-trained AlexNet on an EMNIST split ('letters' or 'mnist')."""

    def __init__(self, dataset, batch_size, epochs, lr, lr_decay_epoch,
                 momentum):
        """
        :param dataset: Either 'letters' (27 classes) or 'mnist' (10 classes)
        :param batch_size: Number of samples per training batch
        :param epochs: Total number of epochs to train
        :param lr: Initial learning rate
        :param lr_decay_epoch: Decay the learning rate every this many epochs
        :param momentum: SGD momentum
        """
        assert (dataset == 'letters' or dataset == 'mnist')

        self.dataset = dataset
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.lr_decay_epoch = lr_decay_epoch
        self.momentum = momentum

        # letters contains 27 classes, digits contains 10 classes
        num_classes = 27 if dataset == 'letters' else 10

        # Load pre-learned AlexNet; zero out the final classifier layer so its
        # output size matches the number of classes of the chosen dataset.
        state_dict = torch.load('./trained_models/alexnet.pth')
        state_dict['classifier.6.weight'] = torch.zeros(num_classes, 4096)
        state_dict['classifier.6.bias'] = torch.zeros(num_classes)
        self.model = AlexNet(num_classes)
        self.model.load_state_dict(state_dict)

        # Use cuda if available
        if torch.cuda.is_available():
            self.model.cuda()

        # Load training dataset
        kwargs = {
            'num_workers': 1,
            'pin_memory': True
        } if torch.cuda.is_available() else {}
        self.train_loader = torch.utils.data.DataLoader(
            EMNIST('./data',
                   dataset,
                   download=True,
                   transform=transforms.Compose([
                       transforms.Lambda(correct_rotation),
                       transforms.Lambda(random_transform),
                       transforms.Resize((224, 224)),
                       transforms.RandomResizedCrop(224, (0.9, 1.1),
                                                    ratio=(0.9, 1.1)),
                       transforms.Grayscale(3),
                       transforms.ToTensor(),
                   ])),
            batch_size=batch_size,
            shuffle=True,
            **kwargs)

        # Optimizer and loss function
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.lr,
                                   momentum=self.momentum)
        self.loss_fn = nn.CrossEntropyLoss()

    def reduce_learning_rate(self, epoch):
        """
        Reduce the learning rate by factor 0.1 every lr_decay_epoch
        :param epoch: Current epoch
        :return: None
        """
        lr = self.lr * (0.1**(epoch // self.lr_decay_epoch))

        if epoch % self.lr_decay_epoch == 0:
            print('LR is set to {}'.format(lr))

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def train(self, epoch):
        """
        Train the model for one epoch and save the result as a .pth file
        :param epoch: Current epoch
        :return: None
        """
        self.model.train()

        train_loss = 0
        train_correct = 0
        progress = None
        for batch_idx, (data, target) in enumerate(self.train_loader):
            # Get data and label
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)

            # Optimize using backpropagation
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss_fn(output, target)
            # Fix: loss.data[0] was removed after PyTorch 0.4; .item() is the
            # supported way to extract a Python scalar. Same for the
            # correct-prediction count below, which was a 0-dim tensor.
            train_loss += loss.item()
            pred = output.data.max(1, keepdim=True)[1]
            train_correct += pred.eq(target.data.view_as(pred)).sum().item()
            loss.backward()
            self.optimizer.step()

            # Print information about current step. Fix: compare progress with
            # != instead of "is not" -- identity comparison of ints only works
            # by accident inside CPython's small-int cache.
            current_progress = int(100 * (batch_idx + 1) * self.batch_size /
                                   len(self.train_loader.dataset))
            if current_progress != progress and current_progress % 5 == 0:
                progress = current_progress
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (batch_idx + 1) * len(data),
                    len(self.train_loader.dataset), current_progress,
                    loss.item()))

        # Average loss per batch and accuracy in percent over the epoch.
        train_loss /= (len(self.train_loader.dataset) / self.batch_size)
        train_correct /= len(self.train_loader.dataset)
        train_correct *= 100

        # Print information about current epoch
        print(
            'Train Epoch: {} \tCorrect: {:3.2f}%\tAverage loss: {:.6f}'.format(
                epoch, train_correct, train_loss))

        # Save snapshot
        torch.save(
            {
                'model': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict()
            }, './trained_models/{}_{}.pth'.format(self.dataset, epoch))

    def start(self):
        """
        Start training the network
        :return: None
        """
        for epoch in range(1, self.epochs + 1):
            self.reduce_learning_rate(epoch)
            self.train(epoch)
Example #2
0
def main():
    """Train AlexNet on a 10-class ImageNet subset on a single GPU.

    Trains for 50 epochs with SGD, printing meter summaries every 10 batches.
    """
    gpus = [0]
    print("GPUs :", gpus)
    print("prepare data")
    # Standard ImageNet channel statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_tfs = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])

    val_tfs = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(), normalize
    ])

    train_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/train',
                                    train_tfs)
    val_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/val', val_tfs)

    train_ld = torch.utils.data.DataLoader(train_ds,
                                           batch_size=256,
                                           shuffle=True,
                                           num_workers=4,
                                           pin_memory=True)

    # NOTE(review): val_ld is constructed but never consumed in this
    # function; kept so a validation pass can be added without re-plumbing.
    val_ld = torch.utils.data.DataLoader(val_ds,
                                         batch_size=64,
                                         shuffle=False,
                                         num_workers=4,
                                         pin_memory=True)

    print("construct model")
    model = AlexNet()
    model.cuda()

    criterion = nn.CrossEntropyLoss().cuda(gpus[0])
    optimizer = torch.optim.SGD(model.parameters(),
                                0.01,
                                momentum=0.875,
                                weight_decay=3.0517578125e-05)

    model.train()
    print("begin trainning")
    for epoch in range(50):
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        top1 = AverageMeter('Acc@1', ':6.2f')
        top5 = AverageMeter('Acc@5', ':6.2f')

        progress = ProgressMeter(len(train_ld),
                                 [batch_time, data_time, losses, top1, top5],
                                 prefix="Epoch: [{}]".format(epoch))

        end = time.time()
        for i, (images, labels) in enumerate(train_ld):
            data_time.update(time.time() - end)
            # Fix: removed the per-batch debug prints of tensor shapes --
            # they flooded the log and stalled the input pipeline.
            images = images.cuda(gpus[0], non_blocking=True)
            labels = labels.cuda(gpus[0], non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # measure accuracy
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            if i % 10 == 0:
                progress.display(i)
Example #3
0
def train(train_loader, eval_loader, opt):
    """Train AlexNet with SGD, logging to TensorBoard and keeping the best
    eval-accuracy snapshot in ./weights/best.pt.

    :param train_loader: DataLoader yielding (inputs, targets) training batches
    :param eval_loader: DataLoader yielding (inputs, targets) eval batches
    :param opt: Options object providing base_lr and epochs
    """
    print('==> Start training...')

    summary_writer = SummaryWriter('./runs/' + str(int(time.time())))

    is_cuda = torch.cuda.is_available()
    model = AlexNet()
    if is_cuda:
        model = model.cuda()

    optimizer = optim.SGD(
        params=model.parameters(),
        lr=opt.base_lr,
        momentum=0.9,
    )
    criterion = nn.CrossEntropyLoss()

    best_eval_acc = -0.1
    losses = AverageMeter()
    accuracies = AverageMeter()
    global_step = 0
    for epoch in range(1, opt.epochs + 1):
        # train
        model.train()
        # Fix: reset the meters at the start of every training phase. They
        # were previously reset only before eval, so from epoch 2 onward the
        # reported train averages silently included the previous epoch's
        # eval statistics.
        losses.reset()
        accuracies.reset()
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            global_step += 1
            if is_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            losses.update(loss.item(), outputs.shape[0])
            summary_writer.add_scalar('train/loss', loss, global_step)

            _, preds = torch.max(outputs, dim=1)
            acc = preds.eq(targets).sum().item() / len(targets)
            accuracies.update(acc)
            summary_writer.add_scalar('train/acc', acc, global_step)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                  global_step)
        print(
            '==> Epoch: %d; Average Train Loss: %.4f; Average Train Acc: %.4f'
            % (epoch, losses.avg, accuracies.avg))

        # eval
        model.eval()
        losses.reset()
        accuracies.reset()
        # Fix: evaluation needs no gradients; torch.no_grad() avoids building
        # the autograd graph and does not change the computed metrics.
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(eval_loader):
                if is_cuda:
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                losses.update(loss.item(), outputs.shape[0])

                _, preds = torch.max(outputs, dim=1)
                acc = preds.eq(targets).sum().item() / len(targets)
                accuracies.update(acc)

        summary_writer.add_scalar('eval/loss', losses.avg, global_step)
        summary_writer.add_scalar('eval/acc', accuracies.avg, global_step)
        if accuracies.avg > best_eval_acc:
            best_eval_acc = accuracies.avg
            # NOTE(review): saves the whole pickled module (not a state_dict);
            # kept as-is for compatibility with existing loading code.
            torch.save(model, './weights/best.pt')
        print(
            '==> Epoch: %d; Average Eval Loss: %.4f; Average/Best Eval Acc: %.4f / %.4f'
            % (epoch, losses.avg, accuracies.avg, best_eval_acc))
Example #4
0
    # NOTE(review): fragment of a larger script -- the enclosing definition is
    # not visible in this chunk, so the code is left byte-identical.
    dataset = Rand_num()
    sampler = RandomSampler(dataset)
    loader = DataLoader(dataset,
                        batch_size=20,
                        sampler=sampler,
                        shuffle=False,
                        num_workers=1,
                        drop_last=True)
    net = AlexNet(3)
    #net.load_state_dict(torch.load(SAVE_PATH))
    net.cuda()
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    for epoch in range(10000):
        for i, data in enumerate(loader, 0):
            net.zero_grad()
            video, labels = data
            # Flatten any extra leading dims into a batch of 227x227 3-channel
            # frames; labels become rows of 3 values.
            video = video.view(-1, 3, 227, 227)
            labels = labels.view(-1, 3)
            labels = torch.squeeze(Variable(labels.float().cuda()))
            # Presumably pixel values are 0..255 and /256 scales to [0,1) --
            # TODO confirm (255 would be the conventional divisor).
            video = torch.squeeze(Variable((video.float() / 256).cuda()))
            net.train()
            outputs = net.forward(video)
            loss = lossfunction(outputs, labels)
            loss.backward()
            optimizer.step()
            # Snapshot weights and log the loss once per epoch (first batch).
            if i == 0:
                torch.save(net.state_dict(), SAVE_PATH)
                print(loss)
                logger.scalar_summary('loss', loss.data.cpu().numpy(), epoch)
Example #5
0
from alexnet import AlexNet

# Input geometry and number of output classes (dogs vs. cats -> 2).
height, width, channel, num_class = 227, 227, 3, 2

# Fix: '\d' in a non-raw string is an invalid escape sequence (SyntaxWarning
# today, a SyntaxError in future Python). Doubling the backslash yields the
# byte-identical path value without the warning.
dataset_path = 'G:\\dataset\\kaggle\\dog-vs-cat\\dogs-vs-cats-redux-kernels-edition\\train'

# Build the model around the dataset directory and start training.
alexNet = AlexNet(height, width, channel, num_class, dataset_path)
alexNet.train()
Example #6
0
def main(argv=None):
    """ Train ImageNet for a number of steps.

    Builds the TF1 graph (input pipeline on CPU, AlexNet forward/backward on
    GPU), then runs the training loop with periodic logging, summary writing
    and checkpointing. Raises ValueError if required FLAGS are missing.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')
    if not FLAGS.train_dir:
        raise ValueError(
            'You must supply the dataset directory with --train_dir')
    if not FLAGS.num_reader:
        raise ValueError('Please make num_readers at least 1')

    with tf.Graph().as_default():
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Force all input processing onto CPU in order to reserve the GPU for the forward inference and back-propagation.
        with tf.device('/cpu:0'):
            image_batch, label_batch = DataProvider.distort_input(
                FLAGS.dataset_dir,
                FLAGS.batch_size,
                FLAGS.num_reader,
                FLAGS.num_preprocess_thread,
                is_train=True)
        # Build a Graph that computes the logits predictions from the alextnet model
        logits, _ = AlexNet.train(x=image_batch,
                                  keep_prob=FLAGS.drop_out,
                                  weight_decay=FLAGS.weight_decay)

        num_batches_per_epoch = int(DataProvider.TRAIN_DATASET_SIZE /
                                    FLAGS.batch_size)

        with tf.name_scope('learning_rate'):
            learning_rate = _configure_learning_rate(num_batches_per_epoch,
                                                     global_step)
            tf.summary.scalar('learning_rate', learning_rate)

        with tf.name_scope('cross_entropy'):
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=label_batch)
            cross_entropy_mean = tf.reduce_mean(cross_entropy,
                                                name="cross_entropy")
            tf.add_to_collection('losses', cross_entropy_mean)

        with tf.name_scope('total_loss'):
            # The total loss is defined as the cross entropy loss plus all of the weight decay terms (L2 loss).
            total_loss = tf.add_n(tf.get_collection('losses'),
                                  name='total_loss')

        with tf.name_scope('optimizer'):
            optimizer = _configure_optimizer(learning_rate)
            grads = optimizer.compute_gradients(total_loss)
            apply_gradient_op = optimizer.apply_gradients(
                grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        train_op = tf.group(apply_gradient_op, variables_averages_op)

        summary_op = tf.summary.merge_all()
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

        with tf.Session(config=_configure_session()) as sess:
            sess.run(init_op)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                                   graph=sess.graph)

            max_steps = int(FLAGS.num_epochs * num_batches_per_epoch)

            # Fix: xrange is Python 2 only; range works in both and this
            # function otherwise uses Python 3 syntax (print() calls).
            for step in range(max_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, total_loss])
                duration = time.time() - start_time

                if step % FLAGS.log_every_n_steps == 0:
                    examples_per_sec = FLAGS.batch_size / duration
                    sec_per_batch = duration
                    epoch = step / num_batches_per_epoch + 1
                    format_str = (
                        '%s: Epoch %d  Step %d  Total_loss = %.2f  (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str %
                          (datetime.now(), epoch, step, loss_value,
                           examples_per_sec, sec_per_batch))

                if step % FLAGS.save_summaries_steps == 0:
                    # Visual Training Process
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if step % FLAGS.save_model_steps == 0:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

            coord.request_stop()
            coord.join(threads)
Example #7
0
def main():
    """Train a half-resolution AlexNet on miniplaces with linear LR decay.

    Runs for max_iter batches, validating and checkpointing every 1000
    iterations; the best validation-accuracy checkpoint is kept separately.
    """
    progress = default_progress()
    experiment_dir = 'experiment/miniplaces'
    # Here's our data
    train_loader = torch.utils.data.DataLoader(CachedImageFolder(
        'dataset/miniplaces/simple/train',
        transform=transforms.Compose([
            transforms.Resize(128),
            transforms.RandomCrop(119),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
        ])),
                                               batch_size=64,
                                               shuffle=True,
                                               num_workers=6,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(CachedImageFolder(
        'dataset/miniplaces/simple/val',
        transform=transforms.Compose([
            transforms.Resize(128),
            transforms.CenterCrop(119),
            transforms.ToTensor(),
            transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
        ])),
                                             batch_size=512,
                                             shuffle=False,
                                             num_workers=6,
                                             pin_memory=True)
    # Create a simplified AlexNet with half resolution.
    model = AlexNet(first_layer='conv1',
                    last_layer='fc8',
                    layer_sizes=dict(fc6=2048, fc7=2048),
                    output_channels=100,
                    half_resolution=True,
                    include_lrn=False,
                    split_groups=False).cuda()
    # Use Kaiming initialization for the weights
    for name, val in model.named_parameters():
        if 'weight' in name:
            init.kaiming_uniform_(val)
        else:
            # Init positive bias in many layers to avoid dead neurons.
            assert 'bias' in name
            init.constant_(
                val, 0 if any(
                    name.startswith(layer)
                    for layer in ['conv1', 'conv3', 'fc8']) else 1)
    # An abbreviated training schedule: 40000 batches.
    # TODO: tune these hyperparameters.
    init_lr = 0.002
    # max_iter = 40000 - 34.5% @1
    # max_iter = 50000 - 37% @1
    # max_iter = 80000 - 39.7% @1
    # max_iter = 100000 - 40.1% @1
    max_iter = 100000
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=init_lr,
        momentum=0.9,
        weight_decay=0.001)
    iter_num = 0
    best = dict(val_accuracy=0.0)
    model.train()
    # Oh, hold on.  Let's actually resume training if we already have a model.
    checkpoint_filename = 'miniplaces.pth.tar'
    best_filename = 'best_%s' % checkpoint_filename
    best_checkpoint = os.path.join(experiment_dir, best_filename)
    try_to_resume_training = False
    if try_to_resume_training and os.path.exists(best_checkpoint):
        checkpoint = torch.load(os.path.join(experiment_dir, best_filename))
        iter_num = checkpoint['iter']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best['val_accuracy'] = checkpoint['accuracy']

    def save_checkpoint(state, is_best):
        """Write the latest checkpoint; copy it to best_... when is_best."""
        filename = os.path.join(experiment_dir, checkpoint_filename)
        ensure_dir_for(filename)
        torch.save(state, filename)
        if is_best:
            shutil.copyfile(filename,
                            os.path.join(experiment_dir, best_filename))

    def validate_and_checkpoint():
        """Evaluate on the validation set, then checkpoint (tracking the best)."""
        model.eval()
        val_loss, val_acc = AverageMeter(), AverageMeter()
        for input, target in progress(val_loader):
            # Load data
            input_var, target_var = [
                Variable(d.cuda(non_blocking=True)) for d in [input, target]
            ]
            # Evaluate model
            with torch.no_grad():
                output = model(input_var)
                loss = criterion(output, target_var)
                _, pred = output.max(1)
                accuracy = (target_var.eq(pred)
                            ).data.float().sum().item() / input.size(0)
            val_loss.update(loss.data.item(), input.size(0))
            val_acc.update(accuracy, input.size(0))
            # Check accuracy
            post_progress(l=val_loss.avg, a=val_acc.avg)
        # Save checkpoint
        save_checkpoint(
            {
                'iter': iter_num,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'accuracy': val_acc.avg,
                'loss': val_loss.avg,
            }, val_acc.avg > best['val_accuracy'])
        best['val_accuracy'] = max(val_acc.avg, best['val_accuracy'])
        post_progress(v=val_acc.avg)

    # Here is our training loop.
    while iter_num < max_iter:
        # Fix: the epoch-level meters were re-created inside the batch loop,
        # so .avg only ever reflected the current batch; create them once per
        # epoch as the original comment intends.
        train_loss, train_acc = AverageMeter(), AverageMeter()
        for input, target in progress(train_loader):
            # Load data
            input_var, target_var = [
                Variable(d.cuda(non_blocking=True)) for d in [input, target]
            ]
            # Evaluate model
            output = model(input_var)
            loss = criterion(output, target_var)
            train_loss.update(loss.data.item(), input.size(0))
            # Perform one step of SGD
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Also check training set accuracy
            _, pred = output.max(1)
            accuracy = (target_var.eq(pred)).data.float().sum().item() / (
                input.size(0))
            train_acc.update(accuracy)
            remaining = 1 - iter_num / float(max_iter)
            post_progress(l=train_loss.avg,
                          a=train_acc.avg,
                          v=best['val_accuracy'])
            # Advance
            iter_num += 1
            if iter_num >= max_iter:
                break
            # Linear learning rate decay
            lr = init_lr * remaining
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # Occasionally check validation set accuracy and checkpoint
            if iter_num % 1000 == 0:
                validate_and_checkpoint()
                model.train()
Example #8
0
class Solver(object):
    """Training/evaluation harness for an information-bottleneck-style AlexNet
    on CIFAR-10.

    Hyper-parameters come from ``config``; call ``run()`` to train, evaluate
    and append the per-epoch metrics to ``<name>.csv``.
    """

    def __init__(self, config):
        self.model = None
        self.name = config.name
        self.lr = config.lr
        self.momentum = config.momentum
        # beta weighs the 0.5 * beta * Iw penalty added to the loss.
        self.beta = config.beta
        self.max_alpha = config.max_alpha
        self.epochs = config.epochs
        # Early-stopping patience in epochs; a negative value disables it.
        self.patience = config.patience
        # Number of training (and test) samples to keep; at most 50000.
        self.N = config.N
        self.batch_size = config.batch_size
        # When True, labels are shuffled to destroy image/label pairing.
        self.random_labels = config.random_labels
        self.use_bn = config.batchnorm
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.device = None
        self.cuda = config.cuda
        self.train_loader = None
        self.test_loader = None

    def load_data(self):
        """Build CIFAR-10 train/test loaders (optionally truncated to N
        samples and/or with shuffled labels)."""
        # ToTensor scales pixel values from [0,255] to [0,1]
        mean_var = (125.3 / 255, 123.0 / 255,
                    113.9 / 255), (63.0 / 255, 62.1 / 255, 66.7 / 255)
        transform = transforms.Compose([
            transforms.CenterCrop(28),
            transforms.ToTensor(),
            transforms.Normalize(*mean_var, inplace=True)
        ])
        train_set = torchvision.datasets.CIFAR10(root='./data',
                                                 train=True,
                                                 download=DOWNLOAD,
                                                 transform=transform)
        test_set = torchvision.datasets.CIFAR10(root='./data',
                                                train=False,
                                                download=DOWNLOAD,
                                                transform=transform)

        if self.random_labels:
            # Deliberately destroys the image/label correspondence
            # (memorization-style experiments).
            np.random.shuffle(train_set.targets)
            np.random.shuffle(test_set.targets)

        assert self.N <= 50000
        if self.N < 50000:
            # NOTE(review): only .data is truncated; .targets keeps its full
            # length. Dataset __len__ follows .data so indexing stays valid,
            # but the surplus targets are silently unused -- confirm intended.
            train_set.data = train_set.data[:self.N]
            # downsize the test set to improve speed for small N
            test_set.data = test_set.data[:self.N]

        self.train_loader = torch.utils.data.DataLoader(
            dataset=train_set,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True)
        self.test_loader = torch.utils.data.DataLoader(
            dataset=test_set,
            batch_size=self.batch_size,
            shuffle=False,
            drop_last=True)

    def load_model(self):
        """Instantiate model, optimizer, LR scheduler and loss on the device."""
        if self.cuda:
            self.device = torch.device('cuda')
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')

        self.model = AlexNet(device=self.device,
                             B=self.batch_size,
                             max_alpha=self.max_alpha,
                             use_bn=self.use_bn).to(self.device)

        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.lr,
                                   momentum=self.momentum)
        # Single LR decay step after 140 epochs (default gamma=0.1).
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=140)
        # NLLLoss pairs with the torch.log(output + EPS) in do_batch, i.e. the
        # model presumably emits probabilities -- TODO confirm.
        self.criterion = nn.NLLLoss().to(self.device)

    def getIw(self):
        # Iw should be normalized with respect to N
        # via reparameterization, we optimize alpha with only 1920 dimensions
        # but Iw should scale with the dimension of the weights
        return 7 * 7 * 64 * 384 / 1920 * self.model.getIw() / self.batch_size

    def do_batch(self, train, epoch):
        """Run one full pass over the train or test loader.

        :param train: True for a training pass (with backprop), False for eval
        :param epoch: Current epoch number (progress display only)
        :return: (accuracy, avg loss, avg cross-entropy, avg Iw, a)
        """
        loader = self.train_loader if train else self.test_loader
        total_ce, total_Iw, total_loss = 0, 0, 0
        total_correct = 0
        total = 0
        pbar = tqdm(loader)
        num_batches = len(loader)
        for batch_num, (data, target) in enumerate(pbar):
            data, target = data.to(self.device), target.to(self.device)
            if train:
                self.optimizer.zero_grad()
            output = self.model(data)
            # NLLLoss is averaged across observations for each minibatch
            ce = self.criterion(torch.log(output + EPS), target)
            Iw = self.getIw()
            # Total objective: cross-entropy plus the weighted Iw penalty.
            loss = ce + 0.5 * self.beta * Iw
            if train:
                loss.backward()
                self.optimizer.step()
            total_ce += ce.item()
            total_Iw += Iw.item()
            total_loss += loss.item()
            prediction = torch.max(
                output,
                1)  # second param "1" represents the dimension to be reduced
            total_correct += np.sum(
                prediction[1].cpu().numpy() == target.cpu().numpy())
            total += target.size(0)

            a = self.model.get_a()
            pbar.set_description('Train' if train else 'Test')
            pbar.set_postfix(N=self.N,
                             b=self.beta,
                             ep=epoch,
                             acc=100. * total_correct / total,
                             loss=total_loss / num_batches,
                             ce=total_ce / num_batches,
                             Iw=total_Iw / num_batches,
                             a=a)
        # NOTE(review): 'a' holds the value from the last batch and is
        # unbound (NameError) if the loader is empty -- assumes non-empty
        # loaders; confirm.
        return total_correct / total, total_loss / num_batches, total_ce / num_batches, total_Iw / num_batches, a

    def train(self, epoch):
        """Train for one epoch; returns the do_batch metric tuple."""
        self.model.train()
        return self.do_batch(train=True, epoch=epoch)

    def test(self, epoch):
        """Evaluate on the test loader with gradients disabled."""
        self.model.eval()
        with torch.no_grad():
            return self.do_batch(train=False, epoch=epoch)

    def save(self, name=None):
        # NOTE(review): saving is currently disabled -- the torch.save call
        # below is commented out, so this method only builds the path.
        model_out_path = (name or self.name) + ".pth"
        # torch.save(self.model, model_out_path)
        # print("Checkpoint saved to {}".format(model_out_path))

    def run(self):
        """Full experiment: load data/model, train with optional early
        stopping, append all per-epoch metrics to <name>.csv."""
        self.load_data()
        self.load_model()
        results = []
        best_acc, best_ep = -1, -1
        for epoch in range(1, self.epochs + 1):
            # print("\n===> epoch: %d/200" % epoch)
            train_acc, train_loss, train_ce, train_Iw, train_a = self.train(
                epoch)
            # NOTE(review): passing epoch to scheduler.step() is deprecated in
            # recent PyTorch; behavior kept as-is.
            self.scheduler.step(epoch)
            test_acc, test_loss, test_ce, test_Iw, test_a = self.test(epoch)
            results.append([
                self.N, self.beta, train_acc, test_acc, train_loss, test_loss,
                train_ce, test_ce, train_Iw, test_Iw, train_a, test_a
            ])

            if test_acc > best_acc:
                best_acc, best_ep = test_acc, epoch
            if self.patience >= 0:  # early stopping
                if best_ep < epoch - self.patience:
                    break

        with open(self.name + '.csv', 'a') as f:
            w = csv.writer(f)
            w.writerows(results)
        self.save()

        # Metrics of the final epoch (not necessarily the best one).
        return train_acc, test_acc
			help='Training Iteration Count')
	parser.add_argument('-b', action='store', dest='batch_size', type=int, 
			help='Batch Size => mini-batch')
	parser.add_argument('-r', action='store', dest='reg', type=float,
			help='Regulizer')
	parser.add_argument('-d', action='store', dest='dropout', type=float,
			help='Dropout Ratio')
	parser.add_argument('-p', action='store', dest='log_path', type=str,
			help='Log, Model Path')

	config = parser.parse_args()

	print "--------------- Config Description ---------------"
	print " -  Learning Rate : ", config.learning_rate
	print " -  Num Iterations : ", config.num_iters
	print " -  Batch Size : ", config.batch_size
	print " -  Regulizer : ", config.reg
	print " -  Dropout : ", config.dropout
	print " -  Log, Model Path : ", config.log_path
	print "--------------------------------------------------"

	dataset = data.load_gender_dataset() 

	model = AlexNet(dataset['geometry'], dataset['num_classes'], config.log_path)
	model.train(dataset['data'], dataset['label'], 
			learning_rate=config.learning_rate, num_iters=config.num_iters, 
			batch_size=config.batch_size, dropout_prob=config.dropout, 
			verbose=True)

#	model.predict(mnist.test.images, mnist.test.labels)
Example #10
0
def run_imagenet_test():
  """ Runs a test that trains a CNN to classify ImageNet data.
  Returns:
    A tuple containing the total elapsed time, and the average number of
    training iterations per second. """
  batch_size = 128
  # How many batches to have loaded into VRAM at once.
  load_batches = 5

  # Learning rate hyperparameters.
  learning_rate = 0.00001
  decay_steps = 10000
  # decay_rate of 1 means the learning rate effectively never decays.
  decay_rate = 1
  momentum = 0.9
  weight_decay = 0.0005

  # RMSProp hyperparameters -- only used by the use_rmsprop_trainer call,
  # which is commented out below; kept here so it can be re-enabled easily.
  rho = 0.9
  epsilon = 1e-6

  # Where we save the network.
  save_file = "/home/theano/training_data/alexnet.pkl"
  synsets_save_file = "/home/theano/training_data/synsets.pkl"
  # Where we load the synsets to use from.
  synset_list = "/job_files/ilsvrc16_synsets.txt"
  # Where to load and save datasets.
  dataset_path = "/home/theano/training_data/ilsvrc16_dataset"
  # Where to cache image data.
  cache_path = "/home/theano/training_data/cache"
  # Where to save downloaded synset info.
  synset_dir = "/home/theano/training_data/synsets"

  # Build the data loader, restoring previously-saved synset state if a
  # checkpoint from an earlier run exists.
  data = data_loader.ImagenetLoader(batch_size, load_batches, cache_path,
                                    dataset_path, synset_dir, synset_list)
  if os.path.exists(synsets_save_file):
    data.load(synsets_save_file)
  train = data.get_train_set()
  test = data.get_test_set()
  # Labels kept in host memory (not shared/VRAM) for the accuracy check.
  cpu_labels = data.get_non_shared_test_set()

  if os.path.exists(save_file):
    # Load from the file. NOTE(review): the loaded network presumably keeps
    # the trainer it was saved with -- only a new network gets
    # use_sgd_trainer called below; confirm in AlexNet.load.
    print "Theano: Loading network from file..."
    network = AlexNet.load(save_file, train, test, batch_size,
                           learning_rate=learning_rate)

  else:
    # Build new network.
    network = AlexNet(train, test, batch_size,
                      patch_separation=batch_size * load_batches)

    network.use_sgd_trainer(learning_rate, momentum=momentum,
                            weight_decay=weight_decay,
                            decay_rate=decay_rate,
                            decay_steps=decay_steps)
    #network.use_rmsprop_trainer(learning_rate, rho, epsilon,
    #                            decay_rate=decay_rate,
    #                            decay_steps=decay_steps)

  print "Theano: Starting ImageNet test..."

  accuracy = 0
  start_time = time.time()
  iterations = 0

  # Indices of the next batch to use within the currently-loaded VRAM chunk.
  train_batch_index = 0
  test_batch_index = 0

  while iterations < 150000:
    logger.debug("Train index, size: %d, %d" % (train_batch_index,
                                                data.get_train_batch_size()))
    logger.debug("Test index, size: %d, %d" % (test_batch_index,
                                               data.get_test_batch_size()))

    # Swap in new data if we need to.
    if (train_batch_index + 1) * batch_size > data.get_train_batch_size():
      train_batch_index = 0
      logger.info("Getting train set.")
      train = data.get_train_set()
      logger.info("Got train set.")
    # Swap in new data if we need to. The /10 suggests each test image is
    # loaded as 10 patches, so only a tenth of the loaded test set consists
    # of distinct images (Python 2 integer division) -- confirm in the
    # data_loader module.
    test_set_one_patch = data.get_test_batch_size() / 10
    if (test_batch_index + 1) * batch_size > test_set_one_patch:
      test_batch_index = 0
      logger.info("Getting test set.")
      test = data.get_test_set()
      cpu_labels = data.get_non_shared_test_set()[:]
      logger.info("Got test set.")

    # Evaluate on one test batch every 100 training iterations (including
    # iteration 0, i.e. before any training has happened).
    if iterations % 100 == 0:
      # cpu_labels contains labels for every batch currently loaded in VRAM,
      # without duplicates for additional patches.
      label_index = test_batch_index * batch_size
      top_one, top_five = network.test(test_batch_index,
                                       cpu_labels[label_index:label_index + \
                                                              batch_size])
      logger.info("Step %d, testing top 1: %f, testing top 5: %f" % \
                  (iterations, top_one, top_five))

      test_batch_index += 1

    # One SGD step on the current training batch.
    cost, rate, step = network.train(train_batch_index)
    logger.info("Training cost: %f, learning rate: %f, step: %d" % \
                (cost, rate, step))

    # Checkpoint on the same cadence as testing (every 100 iterations).
    if iterations % 100 == 0:
      print "Saving network..."
      network.save(save_file)
      # Save synset data as well.
      data.save(synsets_save_file)

    iterations += 1
    train_batch_index += 1

  # Report wall-clock throughput for the whole run.
  elapsed = time.time() - start_time
  speed = iterations / elapsed
  print("Theano: Ran %d training iterations. (%f iter/s)" % \
      (iterations, speed))
  print("Theano: Imagenet test completed in %f seconds." % (elapsed))

  data.exit_gracefully()

  return (elapsed, speed)