class TrainNetwork():
    def __init__(self, dataset, batch_size, epochs, lr, lr_decay_epoch, momentum):
        assert (dataset == 'letters' or dataset == 'mnist')
        self.dataset = dataset
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.lr_decay_epoch = lr_decay_epoch
        self.momentum = momentum

        # 'letters' contains 27 classes, 'mnist' (digits) contains 10 classes
        num_classes = 27 if dataset == 'letters' else 10

        # Load a pre-trained AlexNet and resize the final layer to num_classes
        state_dict = torch.load('./trained_models/alexnet.pth')
        state_dict['classifier.6.weight'] = torch.zeros(num_classes, 4096)
        state_dict['classifier.6.bias'] = torch.zeros(num_classes)
        self.model = AlexNet(num_classes)
        self.model.load_state_dict(state_dict)

        # Use CUDA if available
        if torch.cuda.is_available():
            self.model.cuda()

        # Load the training dataset
        kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}
        self.train_loader = torch.utils.data.DataLoader(
            EMNIST('./data', dataset, download=True,
                   transform=transforms.Compose([
                       transforms.Lambda(correct_rotation),
                       transforms.Lambda(random_transform),
                       transforms.Resize((224, 224)),
                       transforms.RandomResizedCrop(224, (0.9, 1.1), ratio=(0.9, 1.1)),
                       transforms.Grayscale(3),
                       transforms.ToTensor(),
                   ])),
            batch_size=batch_size, shuffle=True, **kwargs)

        # Optimizer and loss function
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr, momentum=self.momentum)
        self.loss_fn = nn.CrossEntropyLoss()

    def reduce_learning_rate(self, epoch):
        """
        Reduce the learning rate by a factor of 0.1 every lr_decay_epoch epochs.

        :param epoch: Current epoch
        :return: None
        """
        lr = self.lr * (0.1 ** (epoch // self.lr_decay_epoch))
        if epoch % self.lr_decay_epoch == 0:
            print('LR is set to {}'.format(lr))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def train(self, epoch):
        """
        Train the model for one epoch and save the result as a .pth file.

        :param epoch: Current epoch
        :return: None
        """
        self.model.train()
        train_loss = 0
        train_correct = 0
        progress = None
        for batch_idx, (data, target) in enumerate(self.train_loader):
            # Get data and label
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)

            # Optimize using backpropagation
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss_fn(output, target)
            train_loss += loss.item()
            pred = output.data.max(1, keepdim=True)[1]
            train_correct += pred.eq(target.data.view_as(pred)).sum().item()
            loss.backward()
            self.optimizer.step()

            # Print information about the current step
            current_progress = int(100 * (batch_idx + 1) * self.batch_size
                                   / len(self.train_loader.dataset))
            if current_progress != progress and current_progress % 5 == 0:
                progress = current_progress
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (batch_idx + 1) * len(data), len(self.train_loader.dataset),
                    current_progress, loss.item()))

        train_loss /= (len(self.train_loader.dataset) / self.batch_size)
        train_correct = 100. * train_correct / len(self.train_loader.dataset)

        # Print information about the current epoch
        print('Train Epoch: {} \tCorrect: {:3.2f}%\tAverage loss: {:.6f}'.format(
            epoch, train_correct, train_loss))

        # Save a snapshot
        torch.save({
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }, './trained_models/{}_{}.pth'.format(self.dataset, epoch))

    def start(self):
        """
        Start training the network.

        :return: None
        """
        for epoch in range(1, self.epochs + 1):
            self.reduce_learning_rate(epoch)
            self.train(epoch)
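# A minimal usage sketch (not from the source): the hyperparameter values below are
# illustrative assumptions; only the signature of TrainNetwork.__init__ above is relied on.
if __name__ == '__main__':
    trainer = TrainNetwork(dataset='letters', batch_size=64, epochs=20,
                           lr=0.01, lr_decay_epoch=5, momentum=0.9)
    trainer.start()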
def main():
    # gpus = [4, 5, 6, 7]
    gpus = [0]
    print("GPUs:", gpus)

    print("prepare data")
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_tfs = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ])
    val_tfs = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize
    ])
    train_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/train', train_tfs)
    val_ds = datasets.ImageFolder('/home/gw/data/imagenet_10/val', val_tfs)
    train_ld = torch.utils.data.DataLoader(train_ds, batch_size=256, shuffle=True,
                                           num_workers=4, pin_memory=True)
    val_ld = torch.utils.data.DataLoader(val_ds, batch_size=64, shuffle=False,
                                         num_workers=4, pin_memory=True)

    print("construct model")
    # model = ResNet50()
    # model = torchvision.models.AlexNet()
    model = AlexNet()
    # model = torch.nn.DataParallel(model, device_ids=gpus).cuda(gpus[0])
    model.cuda()

    criterion = nn.CrossEntropyLoss().cuda(gpus[0])
    optimizer = torch.optim.SGD(model.parameters(), 0.01,
                                momentum=0.875,
                                weight_decay=3.0517578125e-05)

    model.train()
    print("begin training")
    for epoch in range(0, 50):
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        top1 = AverageMeter('Acc@1', ':6.2f')
        top5 = AverageMeter('Acc@5', ':6.2f')
        progress = ProgressMeter(len(train_ld),
                                 [batch_time, data_time, losses, top1, top5],
                                 prefix="Epoch: [{}]".format(epoch))

        end = time.time()
        for i, (images, labels) in enumerate(train_ld):
            # measure data loading time
            data_time.update(time.time() - end)

            print('image shape: ', images.shape)
            print('labels shape: ', labels.shape)

            images = images.cuda(gpus[0], non_blocking=True)
            labels = labels.cuda(gpus[0], non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 10 == 0:
                progress.display(i)
def train(train_loader, eval_loader, opt):
    print('==> Start training...')
    summary_writer = SummaryWriter('./runs/' + str(int(time.time())))

    is_cuda = torch.cuda.is_available()
    model = AlexNet()
    if is_cuda:
        model = model.cuda()

    optimizer = optim.SGD(
        params=model.parameters(),
        lr=opt.base_lr,
        momentum=0.9,
    )
    criterion = nn.CrossEntropyLoss()

    best_eval_acc = -0.1
    losses = AverageMeter()
    accuracies = AverageMeter()
    global_step = 0
    for epoch in range(1, opt.epochs + 1):
        # train
        model.train()
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            global_step += 1
            if is_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            losses.update(loss.item(), outputs.shape[0])
            summary_writer.add_scalar('train/loss', loss, global_step)

            _, preds = torch.max(outputs, dim=1)
            acc = preds.eq(targets).sum().item() / len(targets)
            accuracies.update(acc)
            summary_writer.add_scalar('train/acc', acc, global_step)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step)
        print('==> Epoch: %d; Average Train Loss: %.4f; Average Train Acc: %.4f'
              % (epoch, losses.avg, accuracies.avg))

        # eval
        model.eval()
        losses.reset()
        accuracies.reset()
        for batch_idx, (inputs, targets) in enumerate(eval_loader):
            if is_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            losses.update(loss.item(), outputs.shape[0])

            _, preds = torch.max(outputs, dim=1)
            acc = preds.eq(targets).sum().item() / len(targets)
            accuracies.update(acc)
        summary_writer.add_scalar('eval/loss', losses.avg, global_step)
        summary_writer.add_scalar('eval/acc', accuracies.avg, global_step)

        if accuracies.avg > best_eval_acc:
            best_eval_acc = accuracies.avg
            torch.save(model, './weights/best.pt')
        print('==> Epoch: %d; Average Eval Loss: %.4f; Average/Best Eval Acc: %.4f / %.4f'
              % (epoch, losses.avg, accuracies.avg, best_eval_acc))
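# A minimal usage sketch (not from the source): drives train() with small random
# TensorDatasets and an argparse-style `opt` namespace. The tensor shapes, the 10-class
# label range, and the hyperparameter values are illustrative assumptions; train() itself
# only reads opt.base_lr and opt.epochs.
import torch
from types import SimpleNamespace
from torch.utils.data import DataLoader, TensorDataset

def _fake_loader(n=32):
    images = torch.randn(n, 3, 224, 224)    # AlexNet-sized RGB inputs
    labels = torch.randint(0, 10, (n,))     # assumed 10-class labels
    return DataLoader(TensorDataset(images, labels), batch_size=8, shuffle=True)

opt = SimpleNamespace(base_lr=0.01, epochs=1)
train(_fake_loader(), _fake_loader(), opt)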
import torch
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, RandomSampler

dataset = Rand_num()
sampler = RandomSampler(dataset)
loader = DataLoader(dataset, batch_size=20, sampler=sampler, shuffle=False,
                    num_workers=1, drop_last=True)
net = AlexNet(3)
# net.load_state_dict(torch.load(SAVE_PATH))
net.cuda()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(10000):
    for i, data in enumerate(loader, 0):
        net.zero_grad()
        video, labels = data
        video = video.view(-1, 3, 227, 227)
        labels = labels.view(-1, 3)
        labels = torch.squeeze(Variable(labels.float().cuda()))
        video = torch.squeeze(Variable((video.float() / 256).cuda()))
        net.train()
        outputs = net(video)
        loss = lossfunction(outputs, labels)
        loss.backward()
        optimizer.step()
        if i == 0:
            # Checkpoint and log once per epoch
            torch.save(net.state_dict(), SAVE_PATH)
            print(loss)
            logger.scalar_summary('loss', loss.data.cpu().numpy(), epoch)
from alexnet import AlexNet

height, width, channel, num_class = 227, 227, 3, 2
dataset_path = 'G:\\dataset\\kaggle\\dog-vs-cat\\dogs-vs-cats-redux-kernels-edition\\train'
alexNet = AlexNet(height, width, channel, num_class, dataset_path)
alexNet.train()
def main(argv=None):
    """ Train ImageNet for a number of steps. """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')
    if not FLAGS.train_dir:
        raise ValueError('You must supply the train directory with --train_dir')
    if not FLAGS.num_reader:
        raise ValueError('Please make num_readers at least 1')

    with tf.Graph().as_default():
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Force all input processing onto the CPU in order to reserve the GPU for
        # the forward inference and back-propagation.
        with tf.device('/cpu:0'):
            image_batch, label_batch = DataProvider.distort_input(
                FLAGS.dataset_dir, FLAGS.batch_size, FLAGS.num_reader,
                FLAGS.num_preprocess_thread, is_train=True)

        # Build a Graph that computes the logits predictions from the AlexNet model
        logits, _ = AlexNet.train(x=image_batch,
                                  keep_prob=FLAGS.drop_out,
                                  weight_decay=FLAGS.weight_decay)

        num_batches_per_epoch = int(DataProvider.TRAIN_DATASET_SIZE / FLAGS.batch_size)

        with tf.name_scope('learning_rate'):
            learning_rate = _configure_learning_rate(num_batches_per_epoch, global_step)
            tf.summary.scalar('learning_rate', learning_rate)

        with tf.name_scope('cross_entropy'):
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=label_batch)
            cross_entropy_mean = tf.reduce_mean(cross_entropy, name="cross_entropy")
            tf.add_to_collection('losses', cross_entropy_mean)

        with tf.name_scope('total_loss'):
            # The total loss is the cross-entropy loss plus all of the weight decay terms (L2 loss).
            total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

        with tf.name_scope('optimizer'):
            optimizer = _configure_optimizer(learning_rate)
            grads = optimizer.compute_gradients(total_loss)
            apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

        # with tf.name_scope('accuracy'):
        #     correct = tf.equal(tf.argmax(logits, 1), tf.argmax(label_batch, 1))
        #     accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        #     tf.summary.scalar('accuracy', accuracy)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        train_op = tf.group(apply_gradient_op, variables_averages_op)

        summary_op = tf.summary.merge_all()
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

        with tf.Session(config=_configure_session()) as sess:
            sess.run(init_op)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=sess.graph)

            max_steps = int(FLAGS.num_epochs * num_batches_per_epoch)
            for step in range(max_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, total_loss])
                duration = time.time() - start_time

                if step % FLAGS.log_every_n_steps == 0:
                    examples_per_sec = FLAGS.batch_size / duration
                    sec_per_batch = duration
                    epoch = step // num_batches_per_epoch + 1
                    format_str = ('%s: Epoch %d Step %d Total_loss = %.2f '
                                  '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), epoch, step, loss_value,
                                        examples_per_sec, sec_per_batch))

                if step % FLAGS.save_summaries_steps == 0:
                    # Visualize the training process
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if step % FLAGS.save_model_steps == 0:
                    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

            coord.request_stop()
            coord.join(threads)
def main():
    progress = default_progress()
    experiment_dir = 'experiment/miniplaces'

    # Here's our data
    train_loader = torch.utils.data.DataLoader(
        CachedImageFolder('dataset/miniplaces/simple/train',
                          transform=transforms.Compose([
                              transforms.Resize(128),
                              transforms.RandomCrop(119),
                              transforms.RandomHorizontalFlip(),
                              transforms.ToTensor(),
                              transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
                          ])),
        batch_size=64, shuffle=True, num_workers=6, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        CachedImageFolder('dataset/miniplaces/simple/val',
                          transform=transforms.Compose([
                              transforms.Resize(128),
                              transforms.CenterCrop(119),
                              transforms.ToTensor(),
                              transforms.Normalize(IMAGE_MEAN, IMAGE_STDEV)
                          ])),
        batch_size=512, shuffle=False, num_workers=6, pin_memory=True)

    # Create a simplified AlexNet with half resolution.
    model = AlexNet(first_layer='conv1',
                    last_layer='fc8',
                    layer_sizes=dict(fc6=2048, fc7=2048),
                    output_channels=100,
                    half_resolution=True,
                    include_lrn=False,
                    split_groups=False).cuda()

    # Use Kaiming initialization for the weights
    for name, val in model.named_parameters():
        if 'weight' in name:
            init.kaiming_uniform_(val)
        else:
            # Init positive bias in many layers to avoid dead neurons.
            assert 'bias' in name
            init.constant_(val, 0 if any(
                name.startswith(layer) for layer in ['conv1', 'conv3', 'fc8']) else 1)

    # An abbreviated training schedule: 100000 batches.
    # TODO: tune these hyperparameters.
    init_lr = 0.002
    # max_iter = 40000  - 34.5% @1
    # max_iter = 50000  - 37%   @1
    # max_iter = 80000  - 39.7% @1
    # max_iter = 100000 - 40.1% @1
    max_iter = 100000
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=init_lr,
                                momentum=0.9,
                                weight_decay=0.001)
    iter_num = 0
    best = dict(val_accuracy=0.0)
    model.train()

    # Resume training if we already have a model checkpoint.
    checkpoint_filename = 'miniplaces.pth.tar'
    best_filename = 'best_%s' % checkpoint_filename
    best_checkpoint = os.path.join(experiment_dir, best_filename)
    try_to_resume_training = False
    if try_to_resume_training and os.path.exists(best_checkpoint):
        checkpoint = torch.load(os.path.join(experiment_dir, best_filename))
        iter_num = checkpoint['iter']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best['val_accuracy'] = checkpoint['accuracy']

    def save_checkpoint(state, is_best):
        filename = os.path.join(experiment_dir, checkpoint_filename)
        ensure_dir_for(filename)
        torch.save(state, filename)
        if is_best:
            shutil.copyfile(filename,
                            os.path.join(experiment_dir, best_filename))

    def validate_and_checkpoint():
        model.eval()
        val_loss, val_acc = AverageMeter(), AverageMeter()
        for input, target in progress(val_loader):
            # Load data
            input_var, target_var = [Variable(d.cuda(non_blocking=True))
                                     for d in [input, target]]
            # Evaluate model
            with torch.no_grad():
                output = model(input_var)
                loss = criterion(output, target_var)
                _, pred = output.max(1)
                accuracy = (target_var.eq(pred)).data.float().sum().item() / input.size(0)
            val_loss.update(loss.data.item(), input.size(0))
            val_acc.update(accuracy, input.size(0))
            # Check accuracy
            post_progress(l=val_loss.avg, a=val_acc.avg)
        # Save checkpoint
        save_checkpoint({
            'iter': iter_num,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'accuracy': val_acc.avg,
            'loss': val_loss.avg,
        }, val_acc.avg > best['val_accuracy'])
        best['val_accuracy'] = max(val_acc.avg, best['val_accuracy'])
        post_progress(v=val_acc.avg)

    # Here is our training loop.
    while iter_num < max_iter:
        # Track the average training loss/accuracy for each epoch.
        train_loss, train_acc = AverageMeter(), AverageMeter()
        for input, target in progress(train_loader):
            # Load data
            input_var, target_var = [Variable(d.cuda(non_blocking=True))
                                     for d in [input, target]]
            # Evaluate model
            output = model(input_var)
            loss = criterion(output, target_var)
            train_loss.update(loss.data.item(), input.size(0))
            # Perform one step of SGD
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Also check training set accuracy
            _, pred = output.max(1)
            accuracy = (target_var.eq(pred)).data.float().sum().item() / input.size(0)
            train_acc.update(accuracy)
            remaining = 1 - iter_num / float(max_iter)
            post_progress(l=train_loss.avg, a=train_acc.avg,
                          v=best['val_accuracy'])
            # Advance
            iter_num += 1
            if iter_num >= max_iter:
                break
            # Linear learning rate decay
            lr = init_lr * remaining
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # Occasionally check validation set accuracy and checkpoint
            if iter_num % 1000 == 0:
                validate_and_checkpoint()
                model.train()
class Solver(object):
    def __init__(self, config):
        self.model = None
        self.name = config.name
        self.lr = config.lr
        self.momentum = config.momentum
        self.beta = config.beta
        self.max_alpha = config.max_alpha
        self.epochs = config.epochs
        self.patience = config.patience
        self.N = config.N
        self.batch_size = config.batch_size
        self.random_labels = config.random_labels
        self.use_bn = config.batchnorm
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.device = None
        self.cuda = config.cuda
        self.train_loader = None
        self.test_loader = None

    def load_data(self):
        # ToTensor scales pixel values from [0,255] to [0,1]
        mean_var = ((125.3 / 255, 123.0 / 255, 113.9 / 255),
                    (63.0 / 255, 62.1 / 255, 66.7 / 255))
        transform = transforms.Compose([
            transforms.CenterCrop(28),
            transforms.ToTensor(),
            transforms.Normalize(*mean_var, inplace=True)
        ])
        train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                                 download=DOWNLOAD, transform=transform)
        test_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                                download=DOWNLOAD, transform=transform)
        if self.random_labels:
            np.random.shuffle(train_set.targets)
            np.random.shuffle(test_set.targets)

        assert self.N <= 50000
        if self.N < 50000:
            train_set.data = train_set.data[:self.N]
            # downsize the test set to improve speed for small N
            test_set.data = test_set.data[:self.N]

        self.train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                                        batch_size=self.batch_size,
                                                        shuffle=True,
                                                        drop_last=True)
        self.test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                                       batch_size=self.batch_size,
                                                       shuffle=False,
                                                       drop_last=True)

    def load_model(self):
        if self.cuda:
            self.device = torch.device('cuda')
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')

        self.model = AlexNet(device=self.device,
                             B=self.batch_size,
                             max_alpha=self.max_alpha,
                             use_bn=self.use_bn).to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.lr,
                                   momentum=self.momentum)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=140)
        self.criterion = nn.NLLLoss().to(self.device)

    def getIw(self):
        # Iw should be normalized with respect to N.
        # Via reparameterization, we optimize alpha with only 1920 dimensions,
        # but Iw should scale with the dimension of the weights.
        return 7 * 7 * 64 * 384 / 1920 * self.model.getIw() / self.batch_size

    def do_batch(self, train, epoch):
        loader = self.train_loader if train else self.test_loader
        total_ce, total_Iw, total_loss = 0, 0, 0
        total_correct = 0
        total = 0
        pbar = tqdm(loader)
        num_batches = len(loader)
        for batch_num, (data, target) in enumerate(pbar):
            data, target = data.to(self.device), target.to(self.device)
            if train:
                self.optimizer.zero_grad()
            output = self.model(data)
            # NLLLoss is averaged across observations for each minibatch
            ce = self.criterion(torch.log(output + EPS), target)
            Iw = self.getIw()
            loss = ce + 0.5 * self.beta * Iw
            if train:
                loss.backward()
                self.optimizer.step()

            total_ce += ce.item()
            total_Iw += Iw.item()
            total_loss += loss.item()
            prediction = torch.max(output, 1)  # second param "1" represents the dimension to be reduced
            total_correct += np.sum(prediction[1].cpu().numpy() == target.cpu().numpy())
            total += target.size(0)
            a = self.model.get_a()
            pbar.set_description('Train' if train else 'Test')
            pbar.set_postfix(N=self.N, b=self.beta, ep=epoch,
                             acc=100. * total_correct / total,
                             loss=total_loss / num_batches,
                             ce=total_ce / num_batches,
                             Iw=total_Iw / num_batches,
                             a=a)
        return (total_correct / total, total_loss / num_batches,
                total_ce / num_batches, total_Iw / num_batches, a)

    def train(self, epoch):
        self.model.train()
        return self.do_batch(train=True, epoch=epoch)

    def test(self, epoch):
        self.model.eval()
        with torch.no_grad():
            return self.do_batch(train=False, epoch=epoch)

    def save(self, name=None):
        model_out_path = (name or self.name) + ".pth"
        # torch.save(self.model, model_out_path)
        # print("Checkpoint saved to {}".format(model_out_path))

    def run(self):
        self.load_data()
        self.load_model()
        results = []
        best_acc, best_ep = -1, -1
        for epoch in range(1, self.epochs + 1):
            # print("\n===> epoch: %d/200" % epoch)
            train_acc, train_loss, train_ce, train_Iw, train_a = self.train(epoch)
            self.scheduler.step(epoch)
            test_acc, test_loss, test_ce, test_Iw, test_a = self.test(epoch)
            results.append([self.N, self.beta, train_acc, test_acc, train_loss,
                            test_loss, train_ce, test_ce, train_Iw, test_Iw,
                            train_a, test_a])
            if test_acc > best_acc:
                best_acc, best_ep = test_acc, epoch
            if self.patience >= 0:
                # early stopping
                if best_ep < epoch - self.patience:
                    break

        with open(self.name + '.csv', 'a') as f:
            w = csv.writer(f)
            w.writerows(results)
        self.save()
        return train_acc, test_acc
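# A minimal usage sketch (not from the source): the config fields mirror exactly what
# Solver.__init__ reads; every value below is an illustrative assumption (the original
# presumably builds `config` via argparse).
import torch
from types import SimpleNamespace

config = SimpleNamespace(name='alexnet_cifar10', lr=0.01, momentum=0.9, beta=1e-3,
                         max_alpha=1.0, epochs=200, patience=20, N=50000,
                         batch_size=100, random_labels=False, batchnorm=True,
                         cuda=torch.cuda.is_available())
solver = Solver(config)
train_acc, test_acc = solver.run()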
                    help='Training Iteration Count')
parser.add_argument('-b', action='store', dest='batch_size', type=int,
                    help='Batch Size => mini-batch')
parser.add_argument('-r', action='store', dest='reg', type=float,
                    help='Regularizer')
parser.add_argument('-d', action='store', dest='dropout', type=float,
                    help='Dropout Ratio')
parser.add_argument('-p', action='store', dest='log_path', type=str,
                    help='Log, Model Path')
config = parser.parse_args()

print("--------------- Config Description ---------------")
print(" - Learning Rate    : ", config.learning_rate)
print(" - Num Iterations   : ", config.num_iters)
print(" - Batch Size       : ", config.batch_size)
print(" - Regularizer      : ", config.reg)
print(" - Dropout           : ", config.dropout)
print(" - Log, Model Path  : ", config.log_path)
print("--------------------------------------------------")

dataset = data.load_gender_dataset()

model = AlexNet(dataset['geometry'], dataset['num_classes'], config.log_path)
model.train(dataset['data'], dataset['label'],
            learning_rate=config.learning_rate,
            num_iters=config.num_iters,
            batch_size=config.batch_size,
            dropout_prob=config.dropout,
            verbose=True)
# model.predict(mnist.test.images, mnist.test.labels)
def run_imagenet_test():
    """ Runs a test that trains a CNN to classify ImageNet data.

    Returns:
        A tuple containing the total elapsed time and the average number of
        training iterations per second.
    """
    batch_size = 128
    # How many batches to have loaded into VRAM at once.
    load_batches = 5

    # Learning rate hyperparameters.
    learning_rate = 0.00001
    decay_steps = 10000
    decay_rate = 1
    momentum = 0.9
    weight_decay = 0.0005
    rho = 0.9
    epsilon = 1e-6

    # Where we save the network.
    save_file = "/home/theano/training_data/alexnet.pkl"
    synsets_save_file = "/home/theano/training_data/synsets.pkl"
    # Where we load the synsets to use from.
    synset_list = "/job_files/ilsvrc16_synsets.txt"
    # Where to load and save datasets.
    dataset_path = "/home/theano/training_data/ilsvrc16_dataset"
    # Where to cache image data.
    cache_path = "/home/theano/training_data/cache"
    # Where to save downloaded synset info.
    synset_dir = "/home/theano/training_data/synsets"

    data = data_loader.ImagenetLoader(batch_size, load_batches, cache_path,
                                      dataset_path, synset_dir, synset_list)
    if os.path.exists(synsets_save_file):
        data.load(synsets_save_file)

    train = data.get_train_set()
    test = data.get_test_set()
    cpu_labels = data.get_non_shared_test_set()

    if os.path.exists(save_file):
        # Load from the file.
        print("Theano: Loading network from file...")
        network = AlexNet.load(save_file, train, test, batch_size,
                               learning_rate=learning_rate)
    else:
        # Build a new network.
        network = AlexNet(train, test, batch_size,
                          patch_separation=batch_size * load_batches)
        network.use_sgd_trainer(learning_rate, momentum=momentum,
                                weight_decay=weight_decay,
                                decay_rate=decay_rate,
                                decay_steps=decay_steps)
        # network.use_rmsprop_trainer(learning_rate, rho, epsilon,
        #                             decay_rate=decay_rate,
        #                             decay_steps=decay_steps)

    print("Theano: Starting ImageNet test...")

    accuracy = 0
    start_time = time.time()
    iterations = 0

    train_batch_index = 0
    test_batch_index = 0
    while iterations < 150000:
        logger.debug("Train index, size: %d, %d" % (train_batch_index,
                                                    data.get_train_batch_size()))
        logger.debug("Test index, size: %d, %d" % (test_batch_index,
                                                   data.get_test_batch_size()))

        # Swap in new data if we need to.
        if (train_batch_index + 1) * batch_size > data.get_train_batch_size():
            train_batch_index = 0
            logger.info("Getting train set.")
            train = data.get_train_set()
            logger.info("Got train set.")

        # Swap in new data if we need to.
        test_set_one_patch = data.get_test_batch_size() // 10
        if (test_batch_index + 1) * batch_size > test_set_one_patch:
            test_batch_index = 0
            logger.info("Getting test set.")
            test = data.get_test_set()
            cpu_labels = data.get_non_shared_test_set()[:]
            logger.info("Got test set.")

        if iterations % 100 == 0:
            # cpu_labels contains labels for every batch currently loaded in VRAM,
            # without duplicates for additional patches.
            label_index = test_batch_index * batch_size
            top_one, top_five = network.test(
                test_batch_index,
                cpu_labels[label_index:label_index + batch_size])
            logger.info("Step %d, testing top 1: %f, testing top 5: %f" %
                        (iterations, top_one, top_five))
            test_batch_index += 1

        cost, rate, step = network.train(train_batch_index)
        logger.info("Training cost: %f, learning rate: %f, step: %d" %
                    (cost, rate, step))

        if iterations % 100 == 0:
            print("Saving network...")
            network.save(save_file)
            # Save synset data as well.
            data.save(synsets_save_file)

        iterations += 1
        train_batch_index += 1

    elapsed = time.time() - start_time
    speed = iterations / elapsed
    print("Theano: Ran %d training iterations. (%f iter/s)" % (iterations, speed))
    print("Theano: ImageNet test completed in %f seconds." % elapsed)

    data.exit_gracefully()

    return (elapsed, speed)