def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):
        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        except Exception:   # older TF builds lack these rewriter options
            pass

        optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)
        sess = tf.InteractiveSession(config=config)
        u.register_default_session(sess)   # since default session is thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()   # resets optimization variables (not model variables)

    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)

    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)
    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()   # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):
        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)   # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)
    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
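# The main() above reads its hyperparameters from a module-level `args` object
# defined elsewhere in this script. Below is a minimal sketch of the flag
# parser it assumes: the function name is hypothetical and the defaults are
# illustrative, not the original values; only the flag names and types are
# inferred from the uses of `args` above.
def _example_arg_parser():
    import argparse
    parser = argparse.ArgumentParser(description='KFAC training flags (sketch)')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--run', type=str, default='kfac')
    # 'run' trains with args.lr; 'record'/'test' save or check reference losses
    parser.add_argument('--mode', type=str, default='run')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--Lambda', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--kfac_batch_size', type=int, default=64)
    parser.add_argument('--num_steps', type=int, default=100)
    parser.add_argument('--validate_every_n', type=int, default=10)
    parser.add_argument('--method', type=str, default='kfac')
    parser.add_argument('--kfac_async', type=int, default=0)
    parser.add_argument('--extra_kfac_batch_advance', type=int, default=0)
    parser.add_argument('--numeric_inverse', type=int, default=0)
    return parser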
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='SGD momentum (default: 0.9)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--run', type=str, default='momentum-lenet',
                        help='name of run')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    print("using device ", device)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('/tmp/data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('/tmp/data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    logger = u.TensorboardLogger(args.run)

    model = LeNet().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)

    example_count = 0
    for epoch in range(1, args.epochs + 1):
        # training pass
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            step_start = time.perf_counter()
            data, target = data.to(device), target.to(device)
            logger.set_step(example_count)

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            # L2 regularization added directly to the loss
            for param in model.parameters():
                loss += 0.0002 * torch.sum(param * param)
            logger('loss/train', loss)

            loss.backward()
            optimizer.step()
            example_count += args.batch_size
            logger('time/step', 1000 * (time.perf_counter() - step_start))

            if batch_idx % args.log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

        # evaluation pass
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target,
                                        reduction='sum').item()  # sum up batch loss
                pred = output.max(1, keepdim=True)[1]  # index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
        error_rate = 1 - correct / len(test_loader.dataset)
        logger('loss/test', error_rate)
def main():
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("using device ", device)
    torch.manual_seed(args.seed)

    u.set_runs_directory('runs3')
    logger = u.TensorboardLogger(args.run)

    batch_size = 64
    shuffle = True
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('/tmp/data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor()])),
        batch_size=batch_size, shuffle=shuffle, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('/tmp/data', train=False,
                       transform=transforms.Compose([transforms.ToTensor()])),
        batch_size=1000, shuffle=shuffle, **kwargs)

    # input image size for the original LeNet5 is 32x32, here it is 28x28
    # W1 = 0.1 * torch.randn(1 * 5 * 5 + 1, 6)
    net = LeNet5().to(device)

    def train_loss(data, target):
        y = net(data)
        y = F.log_softmax(y, dim=1)
        loss = F.nll_loss(y, target)
        # L2 regularization added directly to the loss
        for w in net.W:
            loss += 0.0002 * torch.sum(w * w)
        return loss

    def test_loss():
        num_errs = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                y = net(data)
                _, pred = torch.max(y, dim=1)
                num_errs += torch.sum(pred != target)
        return num_errs.item() / len(test_loader.dataset)

    # one pair of Kronecker preconditioner factors per weight matrix
    Qs = [[torch.eye(w.shape[0]), torch.eye(w.shape[1])] for w in net.W]
    for i in range(len(Qs)):
        for j in range(len(Qs[i])):
            Qs[i][j] = Qs[i][j].to(device)

    step_size = 0.1   # tried 0.15, diverges
    grad_norm_clip_thr = 1e10
    TrainLoss, TestLoss = [], []
    example_count = 0
    step_time_ms = 0

    for epoch in range(10):
        for batch_idx, (data, target) in enumerate(train_loader):
            step_start = time.perf_counter()
            data, target = data.to(device), target.to(device)

            loss = train_loss(data, target)
            with u.timeit('grad'):
                grads = autograd.grad(loss, net.W, create_graph=True)
            TrainLoss.append(loss.item())
            logger.set_step(example_count)
            logger('loss/train', TrainLoss[-1])
            if batch_idx % 10 == 0:
                print(f'Epoch: {epoch}; batch: {batch_idx}; '
                      f'train loss: {TrainLoss[-1]:.2f}, '
                      f'step time: {step_time_ms:.0f}')

            with u.timeit('Hv'):
                # Hessian-vector product against a random probe vector
                # noise.normal_()
                # torch.manual_seed(args.seed)
                v = [torch.randn(w.shape).to(device) for w in net.W]
                # v = grads
                Hv = autograd.grad(grads, net.W, v)
            if args.verbose:
                print("v", v[0].mean())
                print("data", data.mean())
                print("Hv", Hv[0].mean())

            n = len(net.W)
            with torch.no_grad():
                with u.timeit('P_update'):
                    for i in range(num_updates):
                        psteps = []
                        for j in range(n):
                            q = Qs[j]
                            dw = v[j]
                            dg = Hv[j]
                            Qs[j][0], Qs[j][1], pstep = \
                                psgd.update_precond_kron_with_step(q[0], q[1], dw, dg)
                            psteps.append(pstep)
                    # print(np.array(psteps).mean())
                    logger('p_residual', np.array(psteps).mean())

                with u.timeit('g_update'):
                    pre_grads = [psgd.precond_grad_kron(q[0], q[1], g)
                                 for (q, g) in zip(Qs, grads)]
                    grad_norm = torch.sqrt(sum([torch.sum(g * g) for g in pre_grads]))

                with u.timeit('gradstep'):
                    step_adjust = min(grad_norm_clip_thr / (grad_norm + 1.2e-38), 1.0)
                    for i in range(len(net.W)):
                        net.W[i] -= step_adjust * step_size * pre_grads[i]

                total_step = step_adjust * step_size
                logger('step/adjust', step_adjust)
                logger('step/size', step_size)
                logger('step/total', total_step)
                logger('grad_norm', grad_norm)

            if args.verbose:
                print(data.mean())
                import pdb
                pdb.set_trace()

            if args.early_stop:
                sys.exit()

            example_count += batch_size
            step_time_ms = 1000 * (time.perf_counter() - step_start)
            logger('time/step', step_time_ms)

            if args.test and batch_idx >= 100:
                break
        if args.test and batch_idx >= 100:
            break

        test_loss0 = test_loss()
        TestLoss.append(test_loss0)
        logger('loss/test', test_loss0)
        step_size = (0.1 ** 0.1) * step_size   # decay step size by 10x over the 10 epochs
        print('Epoch: {}; best test loss: {}'.format(epoch, min(TestLoss)))

    if args.test:
        step_times = logger.d['time/step']
        assert step_times[-1] < 30, step_times   # should be around 20ms
        losses = logger.d['loss/train']
        assert losses[0] > 2   # around 2.3887393474578857
        assert losses[-1] < 0.5, losses
        print("Test passed")
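# For reference, a rough sketch of the Kronecker-factored preconditioning step
# used above. Assumption: for dense factors Ql, Qr this mirrors what
# psgd.precond_grad_kron computes (the library may specialize for diagonal or
# triangular factors); the function name below is hypothetical. The
# preconditioned gradient for a matrix-shaped gradient G is Ql^T Ql @ G @ Qr^T Qr.
def _precond_grad_kron_sketch(Ql, Qr, G):
    # relies on the torch import already present in this script
    return Ql.t() @ (Ql @ (G @ (Qr.t() @ Qr)))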
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):
        gpu_options = tf.GPUOptions(allow_growth=False)
        sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
        u.register_default_session(sess)   # since default session is thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()   # resets optimization variables (not model variables)

    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)

    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)
    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)
    writer = u.BufferedWriter(outfn, 60)   # get rid?

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()   # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):
        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)   # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        print("%d sec, step %d, loss %.2f, vloss %.2f" %
              (elapsed, step, loss0, vloss0))
        writer.write('%d, %f, %f, %f\n' % (step, elapsed, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            grad.update()
            with kfac.read_lock():
                grad_new.update()
            train_op.run()
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)
    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)