def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
    """Benchmark L-BFGS training of a tied-weight sigmoid autoencoder (TF eager).

    Args:
        batch_size: number of MNIST examples to train on.
        iters: maximum L-BFGS iterations (config.maxIter).
        seed: RNG seed for TF and numpy.
        cuda: place data/computation on '/gpu:0' when True.
        history: L-BFGS history length (config.nCorrection).
        verbose: print timing summary when True.

    Returns:
        The module-level global `final_loss` recorded by lbfgs().
    """
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)
    images = tf.constant(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.gpu()
    data = images
    if cuda:
        device='/gpu:0'
    else:
        device=''
    # Device scope entered manually and never exited — stays active past return.
    device_ctx = tf.device(device)
    device_ctx.__enter__()
    visible_size = 28*28
    hidden_size = 196
    # W_flat is a module-level variable reused across calls; reset to zeros first.
    initial_val = tf.zeros([visible_size*hidden_size])
    if W_flat is None:
        W_flat = tfe.Variable(initial_val, name='W_flat')
    W_flat.assign(initial_val)
    def loss_fn(w_flat):
        # Tied weights: decoder is the transpose of the encoder matrix.
        w = tf.reshape(w_flat, [visible_size, hidden_size])
        x = tf.matmul(data, w)
        x = tf.sigmoid(x)
        x = tf.matmul(x, w, transpose_b=True)
        x = tf.sigmoid(x)
        return tf.reduce_mean(tf.square(x-data))
    value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)
    def opfunc(x):  # returns (value, gradient)
        value, grads = value_and_gradients_fn(x)
        return value, grads[0]
    # initialize weights
    W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())
    state = Struct()
    config = Struct()
    config.maxIter = iters
    config.nCorrection = history
    config.verbose = True
    x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose)
    if verbose:
        u.summarize_time()
    # NOTE(review): `times` is neither defined nor appended to anywhere in this
    # function — unless a module-level `times` list exists, this line raises
    # NameError. Compare the sibling benchmark that builds a local `times` list.
    s = ','.join(["%f"%(n,) for n in times[2:]])
    print('{', s,'}')
    return final_loss
def main():
    """Train a one-hidden-layer sigmoid MNIST autoencoder with plain SGD."""
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Load one batch of MNIST images (examples along rows after transpose).
    batch = torch.Tensor(u.get_mnist_images().T)[:args.batch_size]
    if args.cuda:
        batch = batch.cuda()
    inputs = Variable(batch)

    class Net(nn.Module):
        """Bias-free sigmoid autoencoder: visible -> hidden -> visible."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Linear(args.visible_size, args.hidden_size, bias=False)
            self.decoder = nn.Linear(args.hidden_size, args.visible_size, bias=False)

        def forward(self, input):
            flat = input.view(-1, args.visible_size)
            hidden = F.sigmoid(self.encoder(flat))
            recon = F.sigmoid(self.decoder(hidden))
            return recon.view_as(input)

    net = Net()
    # Replace default Linear init with ng_init weights (transposed to match
    # nn.Linear's out x in storage layout).
    enc_w, dec_w = list(net.parameters())
    enc_w.data = torch.Tensor(u.ng_init(args.visible_size, args.hidden_size).T)
    dec_w.data = torch.Tensor(u.ng_init(args.hidden_size, args.visible_size).T)
    if args.cuda:
        net.cuda()
    net.train()

    sgd = optim.SGD(net.parameters(), lr=args.lr)
    for step in range(args.iters):
        sgd.zero_grad()
        recon = net(inputs)
        loss = F.mse_loss(recon, inputs)
        loss0 = loss.data[0]  # grab scalar before backward, as in reference run
        loss.backward()
        sgd.step()
        print("Step %3d loss %6.5f" % (step, loss0))
        u.record_time()
    u.summarize_time()
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    """Benchmark L-BFGS on a tied-weight sigmoid autoencoder in TF eager mode.

    Reuses the module-level `W_flat` variable across calls and returns the
    module-level `final_loss` that lbfgs() records.
    """
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)

    batch = tf.constant(u.get_mnist_images(batch_size).T)[:batch_size]
    if cuda:
        batch = batch.gpu()
    data = batch

    device = '/gpu:0' if cuda else ''
    device_ctx = tf.device(device)
    device_ctx.__enter__()  # entered manually; scope persists past return

    visible_size = 28 * 28
    hidden_size = 196
    zeros = tf.zeros([visible_size * hidden_size])
    if W_flat is None:
        W_flat = tfe.Variable(zeros, name='W_flat')
    W_flat.assign(zeros)

    def loss_fn(w_flat):
        """Mean squared reconstruction error with tied encoder/decoder weights."""
        w = tf.reshape(w_flat, [visible_size, hidden_size])
        hidden = tf.sigmoid(tf.matmul(data, w))
        recon = tf.sigmoid(tf.matmul(hidden, w, transpose_b=True))
        return tf.reduce_mean(tf.square(recon - data))

    value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)

    def opfunc(x):
        """Adapter for the lbfgs driver: returns (value, gradient)."""
        value, grads = value_and_gradients_fn(x)
        return value, grads[0]

    # Real weight initialization, replacing the zero placeholder above.
    W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())

    state = Struct()
    config = Struct()
    config.maxIter = iters
    config.verbose = True
    x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose)
    if verbose:
        u.summarize_time()
    return final_loss
def main():
    """Train the MNIST autoencoder with TF-eager Dense layers and SGD."""
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    batch = tf.constant(u.get_mnist_images().T)[:args.batch_size]
    if args.cuda:
        batch = batch.as_gpu_tensor()
    data = batch

    device = '/gpu:0' if args.cuda else ''
    with tf.device(device):
        encoder = tf.layers.Dense(units=args.hidden_size, use_bias=False, activation=tf.sigmoid)
        decoder = tf.layers.Dense(units=args.visible_size, use_bias=False, activation=tf.sigmoid)

        def loss_fn(inputs):
            """Mean squared reconstruction error of decode(encode(inputs))."""
            predictions = decoder(encoder(inputs))
            return tf.reduce_mean(tf.square(predictions - inputs))

        value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)

        # One forward pass materializes the layer variables, which are then
        # overwritten with ng_init weights.
        loss_fn(data)
        encoder.weights[0].assign(u.ng_init(args.visible_size, args.hidden_size))
        decoder.weights[0].assign(u.ng_init(args.hidden_size, args.visible_size))

        sgd = tf.train.GradientDescentOptimizer(learning_rate=args.lr)
        for step in range(args.iters):
            value, grads_and_vars = value_and_gradients_fn(data)
            sgd.apply_gradients(grads_and_vars)
            print("Step %3d loss %6.5f" % (step, value.numpy()))
            u.record_time()
        u.summarize_time()
def main():
    """Run 10 K-FAC steps on a small MNIST autoencoder and assert on results.

    Publishes configuration (layer sizes, data, damping) through module
    globals consumed by loss_and_grad(); asserts on final loss range and
    per-step wall-clock time.
    """
    global fs, X, n, f, dsize, lambda_
    np.random.seed(0)
    tf.set_random_seed(0)
    train_images = u.get_mnist_images()
    dsize = 1000
    fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
    lambda_ = 3e-3  # damping/regularization coefficient read by helpers
    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    n = len(fs) - 2
    X = tf.constant(train_images[:, :dsize].astype(dtype))
    # Two weight matrices: encoder (196 x 784) and decoder (784 x 196),
    # flattened into a single parameter vector Wf.
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])
    Wf = tf.constant(W0f)
    assert Wf.dtype == tf.float32
    lr = tf.constant(0.2)
    losses = []
    for step in range(10):
        loss, grad, kfac_grad = loss_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %d loss %.2f" % (step, loss0))
        losses.append(loss0)
        # Eager mode: this builds a new tensor, it does not mutate in place.
        Wf -= lr * kfac_grad
        if step >= 4:
            # Sanity check that optimization is making progress.
            assert loss < 17.6
        u.record_time()
    u.summarize_time()
    # Regression bounds on the final loss and on per-step timing.
    assert losses[-1] < 0.8
    assert losses[-1] > 0.78
    assert 20e-3 < min(u.global_time_list) < 120e-3
# Experiment configuration overrides, run-directory setup, dataset loading,
# then hand off to main().
args.advance_batch = 1
args.extra_kfac_batch_advance = 1
args.batch_size = 10000
args.dataset = 'mnist'
rundir = u.setup_experiment_run_directory(args.run)
# Persist the full argument set alongside the run for reproducibility.
with open(rundir + '/args.txt', 'w') as f:
    f.write(json.dumps(vars(args), indent=4, separators=(',', ':')))
    f.write('\n')
if args.dataset == 'cifar':
    # load data globally once
    from keras.datasets import cifar10
    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    # Flatten each image to a vector and scale pixels to [0, 1].
    X_train = X_train.astype(np.float32)
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.astype(np.float32)
    X_test = X_test.reshape((X_test.shape[0], -1))
    X_train /= 255
    X_test /= 255
    # todo: rename to better names
    train_images = X_train.T  # batch first
    test_images = X_test.T
elif args.dataset == 'mnist':
    train_images = u.get_mnist_images('train')
    test_images = u.get_mnist_images('test')
    train_images = train_images[:, :args.dataset_size]  # batch first
main()
def train(optimizer='sgd', nonlin=torch.sigmoid, kfac=True, iters=10, lr=0.2, newton_matrix='stochastic', eval_every_n_steps=1, print_interval=200):
    """Train on first 10k MNIST examples, evaluate on second 10k.

    K-FAC is implemented via a custom autograd Function (KfacAddmm) whose
    backward pass is steered by the closure variable `mode`:
      'capture'  - record activations (As) and backprops (Bs), return no
                   weight gradient;
      'kfac'     - return the whitened gradient Bs_inv·B @ (As_inv·A)^T;
      'standard' - return the ordinary gradient.

    Args:
        optimizer: 'sgd' or 'adam' (name replaced by the optimizer object).
        nonlin: elementwise activation applied after each matmul.
        kfac: if False, fall back to plain gradients.
        iters: number of training steps.
        lr: learning rate.
        newton_matrix: 'stochastic' (Gaussian targets) or 'exact' (fixed
            identity-padded targets) for the synthetic backward pass.
        eval_every_n_steps: how often to evaluate on the held-out 10k.
        print_interval: steps between loss printouts.

    Returns:
        (losses, vlosses): training and validation loss histories.
    """
    u.reset_time()
    dsize = 10000
    # model options
    dtype = np.float32
    torch_dtype = 'torch.FloatTensor'
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch_dtype = 'torch.cuda.FloatTensor'
    INVERSE_METHOD = 'numpy'  # numpy, gpu
    # Capture buffers shared with KfacAddmm.backward via closure.
    As = []
    Bs = []
    As_inv = []
    Bs_inv = []
    mode = 'capture'  # 'capture', 'kfac', 'standard'
    class KfacAddmm(Function):
        # addmm whose backward pass can capture factors or apply K-FAC whitening.
        @staticmethod
        def _get_output(ctx, arg, inplace=False):
            if inplace:
                ctx.mark_dirty(arg)
                return arg
            else:
                return arg.new().resize_as_(arg)
        @staticmethod
        def forward(ctx, add_matrix, matrix1, matrix2, beta=1, alpha=1, inplace=False):
            ctx.save_for_backward(matrix1, matrix2)
            output = KfacAddmm._get_output(ctx, add_matrix, inplace=inplace)
            return torch.addmm(beta, add_matrix, alpha, matrix1, matrix2, out=output)
        @staticmethod
        def backward(ctx, grad_output):
            matrix1, matrix2 = ctx.saved_variables
            grad_matrix1 = grad_matrix2 = None
            if mode == 'capture':
                # Prepend so lists end up ordered from first to last layer.
                Bs.insert(0, grad_output.data)
                As.insert(0, matrix2.data)
            elif mode == 'kfac':
                B = grad_output.data
                A = matrix2.data
                # pop() consumes the inverse lists from the last layer backward.
                kfac_A = As_inv.pop() @ A
                kfac_B = Bs_inv.pop() @ B
                grad_matrix1 = Variable(torch.mm(kfac_B, kfac_A.t()))
            elif mode == 'standard':
                grad_matrix1 = torch.mm(grad_output, matrix2.t())
            else:
                assert False, 'unknown mode ' + mode
            if ctx.needs_input_grad[2]:
                grad_matrix2 = torch.mm(matrix1.t(), grad_output)
            return None, grad_matrix1, grad_matrix2, None, None, None
    def kfac_matmul(mat1, mat2):
        # matmul routed through KfacAddmm so its backward obeys `mode`.
        output = Variable(mat1.data.new(mat1.data.size(0), mat2.data.size(1)))
        return KfacAddmm.apply(output, mat1, mat2, 0, 1, True)
    torch.manual_seed(1)
    np.random.seed(1)
    if use_cuda:
        torch.cuda.manual_seed(1)
    # feature sizes at each layer
    fs = [dsize, 28 * 28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28 * 28]
    n = len(fs) - 2  # number of matmuls
    class Net(nn.Module):
        # Deep autoencoder built from bare weight matrices W1..Wn.
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))
        def forward(self, input):
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(kfac_matmul(W, x))
            return x.view_as(input)
    model = Net()
    if use_cuda:
        model.cuda()
    images = u.get_mnist_images()
    train_data0 = images[:, :dsize].astype(dtype)
    train_data = Variable(torch.from_numpy(train_data0))
    test_data0 = images[:, dsize:2 * dsize].astype(dtype)
    test_data = Variable(torch.from_numpy(test_data0))
    if use_cuda:
        train_data = train_data.cuda()
        test_data = test_data.cuda()
    model.train()
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        assert False, 'unknown optimizer ' + optimizer
    noise = torch.Tensor(*train_data.data.shape).type(torch_dtype)
    # 'exact' newton matrix: identity targets padded with zeros to dsize columns.
    assert fs[-1] <= dsize
    padding = dsize - fs[-1]
    zero_mat = torch.zeros((fs[-1], padding))
    frozen = torch.cat([torch.eye(fs[-1]), zero_mat], 1).type(torch_dtype)
    covA_inv_saved = [None] * n
    losses = []
    vlosses = []
    for step in range(iters):
        mode = 'standard'
        output = model(train_data)
        if kfac:
            # Synthetic backward pass to capture activations/backprops.
            mode = 'capture'
            optimizer.zero_grad()
            del As[:], Bs[:], As_inv[:], Bs_inv[:]
            if newton_matrix == 'stochastic':
                noise.normal_()
                err_add = noise
            elif newton_matrix == 'exact':
                err_add = frozen
            else:
                assert False, 'unknown method for newton matrix ' + newton_matrix
            output_hat = Variable(output.data + err_add)
            err_hat = output_hat - output
            loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
            loss_hat.backward(retain_graph=True)
            # compute inverses
            for i in range(n):
                # first layer activations don't change, only compute once
                if i == 0 and covA_inv_saved[i] is not None:
                    covA_inv = covA_inv_saved[i]
                else:
                    covA_inv = regularized_inverse(As[i] @ As[i].t() / dsize)
                    covA_inv_saved[i] = covA_inv
                As_inv.append(covA_inv)
                # Bs carry a 1/dsize factor from loss_hat, hence * dsize here.
                covB = (Bs[i] @ Bs[i].t()) * dsize
                # alternative formula: slower but numerically better result
                # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize
                covB_inv = regularized_inverse(covB)
                Bs_inv.append(covB_inv)
            mode = 'kfac'
        else:
            mode = 'standard'
        if step % eval_every_n_steps == 0:
            # Evaluate with plain gradients disabled side effects, then restore.
            old_mode = mode
            mode = 'standard'
            test_output = model(test_data)
            test_err = test_data - test_output
            test_loss = torch.sum(test_err * test_err) / 2 / dsize
            vloss0 = test_loss.data.cpu().numpy()[0]
            vlosses.append(vloss0)
            mode = old_mode
        optimizer.zero_grad()
        err = output - train_data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()
        loss0 = loss.data.cpu().numpy()[0]
        losses.append(loss0)
        if step % print_interval == 0:
            print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()
    return losses, vlosses
# Module-level setup: enable TF eager execution, read shared CLI args, and
# publish network/config globals used by the rest of the file.
from tensorflow.contrib.eager.python import tfe
tfe.enable_eager_execution()
import common_gd
args = common_gd.args
args.cuda = not args.no_cuda and tfe.num_gpus() > 0
# for line profiling
try:
    profile  # throws an exception when profile isn't defined
except NameError:
    profile = lambda x: x  # if it's not defined simply ignore the decorator.
train_images = u.get_mnist_images()
dsize = 10000
fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
lambda_ = 3e-3  # damping coefficient
def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
n = len(fs) - 2  # number of weight matrices
dtype = np.float32
tf_dtype = tf.float32
identity_cache = {}  # memoizes identity matrices by size
def train(optimizer='sgd', kfac=True, iters=10, verbose=True):
    """Train the deep MNIST autoencoder with (optional) K-FAC whitening.

    Relies on module-level state: `mode` steers the custom backward pass of
    kfac_matmul ('standard'/'capture'/'kfac'), and the module lists As, Bs,
    As_inv, Bs_inv carry captured factors and their inverses between passes.

    Args:
        optimizer: 'sgd' or 'adam' (the name is replaced by the optimizer
            object once constructed).
        kfac: if False, the final backward uses plain gradients.
        iters: number of training steps.
        verbose: print per-step loss when True.

    Returns:
        List of per-step training losses.
    """
    global mode
    torch.manual_seed(1)
    np.random.seed(1)
    if args.cuda:
        torch.cuda.manual_seed(1)
    # feature sizes at each layer
    fs = [dsize, 28*28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28*28]
    n = len(fs) - 2  # number of matmuls
    class Net(nn.Module):
        # Deep autoencoder from bare weight matrices W1..Wn, no biases.
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n+1):
                W0 = u.ng_init(fs[i+1], fs[i])
                setattr(self, 'W'+str(i), nn.Parameter(torch.from_numpy(W0)))
        def forward(self, input):
            x = input.view(fs[1], -1)
            for i in range(1, n+1):
                W = getattr(self, 'W'+str(i))
                x = nonlin(kfac_matmul(W, x))
            return x.view_as(input)
    model = Net()
    if args.cuda:
        model.cuda()
    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()
    model.train()
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        assert False, 'unknown optimizer '+optimizer
    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    covA_inv_saved = [None]*n
    losses = []
    # BUG FIX: loop was hard-coded to range(10), silently ignoring `iters`;
    # default iters=10 preserves the original behavior.
    for step in range(iters):
        mode = 'standard'
        output = model(data)
        # Synthetic backward pass with Gaussian-perturbed targets to capture
        # activations (As) and backprops (Bs) for the K-FAC factors.
        mode = 'capture'
        optimizer.zero_grad()
        del As[:], Bs[:], As_inv[:], Bs_inv[:]
        noise.normal_()
        output_hat = Variable(output.data+noise)
        err_hat = output_hat - output
        loss_hat = torch.sum(err_hat*err_hat)/2/dsize
        loss_hat.backward(retain_graph=True)
        # compute inverses
        for i in range(n):
            # first layer activations don't change, only compute once
            if i == 0 and covA_inv_saved[i] is not None:
                covA_inv = covA_inv_saved[i]
            else:
                covA_inv = regularized_inverse(As[i] @ As[i].t()/dsize)
                covA_inv_saved[i] = covA_inv
            As_inv.append(covA_inv)
            # Bs carry a 1/dsize factor from loss_hat, hence * dsize here.
            covB = (Bs[i]@Bs[i].t())*dsize
            # alternative formula: slower but numerically better result
            # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize
            covB_inv = regularized_inverse(covB)
            Bs_inv.append(covB_inv)
        if kfac:
            mode = 'kfac'
        else:
            mode = 'standard'
        optimizer.zero_grad()
        err = output - data
        loss = torch.sum(err*err)/2/dsize
        loss.backward()
        optimizer.step()
        loss0 = loss.data.cpu().numpy()[0]
        losses.append(loss0)
        if verbose:
            print("Step %3d loss %10.9f"%(step, loss0))
        u.record_time()
    return losses
def main():
    """Run 10 K-FAC steps with capture-list backprop and assert the final loss.

    Uses module-level `forward`/`backward` lists populated by kfac_matmul's
    backward pass, plus globals `mode`, `covA_inv`, `covB_inv`.
    """
    # global forward, backward, DO_PRINT
    global mode, covA_inv, covB_inv
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    # feature sizes
    fs = [args.batch_size, 28 * 28, 196, 28 * 28]
    # number of layers
    n = len(fs) - 2
    # todo, move to more elegant backprop
    matmul = kfac_matmul
    class Net(nn.Module):
        # Autoencoder built from bare weight matrices W1..Wn.
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))
        def forward(self, input):
            x = input.view(784, -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)
    model = Net()
    if args.cuda:
        model.cuda()
    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    losses = []
    covA = [None] * n
    covA_inv = [None] * n
    covB_inv = [None] * n
    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    # TODO:
    # only do 2 passes like in eager mode
    # integrate with optimizer/same results
    # scale to deep autoencoder
    for step in range(10):
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        # Pass 1: real loss; captures activations (forward) and backprops (backward).
        output = model(data)
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward(retain_graph=True)
        backward.reverse()
        loss0 = loss.data[0]
        A = forward[:]
        B = backward[:]
        assert len(B) == n
        del forward[:]
        del backward[:]
        # Pass 2: synthetic Gaussian targets for the Fisher backprops B2.
        noise.normal_()
        synthetic_data = Variable(output.data + noise)
        err2 = output - synthetic_data
        loss2 = torch.sum(err2 * err2) / 2 / dsize
        optimizer.zero_grad()
        loss2.backward()
        B2 = backward[::-1]
        assert len(B2) == n
        # mode = 'kfac'
        # compute whitened gradient
        pre_dW = []
        for i in range(n):
            # only compute first activation once
            if i > 0:
                covA[i] = A[i] @ t(A[i]) / dsize
                covA_inv[i] = regularized_inverse(covA[i])
            else:
                if covA[i] is None:
                    covA[i] = A[i] @ t(A[i]) / dsize
                    covA_inv[i] = regularized_inverse(covA[i])
                # else:
            covB2 = B2[i] @ t(B2[i]) / dsize
            covB = B[i] @ t(B[i]) / dsize  # todo: remove
            covB_inv[i] = regularized_inverse(covB2.data)
            whitened_A = covA_inv[i] @ A[i]
            whitened_B = covB_inv[i] @ B[i].data
            pre_dW.append(whitened_B @ t(whitened_A) / dsize)
        # Manual parameter update with the preconditioned gradients.
        params = list(model.parameters())
        assert len(params) == len(pre_dW)
        for i in range(len(params)):
            params[i].data -= lr * pre_dW[i]
        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()
    loss0 = loss.data.cpu().numpy()  # [0]
    # NOTE(review): the target assignments below successively overwrite each
    # other; only the last one is effective. Historical values kept as a
    # change log — confirm the intended indentation of the final assignment.
    target = 2.360062122
    if 'Apple' in sys.version:
        target = 2.360126972
    target = 2.335654736  # after changing to torch.randn
    if args.cuda:
        target = 2.337174654
    target = 2.337215662  # switching to numpy inverse
    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
def main():
    """Graph-mode TF K-FAC on a sparse-autoencoder-style MNIST model.

    Builds the whole forward/backward graph by hand (activations A, backprops
    B, synthetic backprops B2), whitens gradients with SVD-based inverses of
    the factor covariances, runs 40 steps, and asserts on loss and timing.
    """
    np.random.seed(0)
    tf.set_random_seed(0)
    dtype = np.float32
    train_images = u.get_mnist_images()
    dsize = 10000
    patches = train_images[:, :dsize].astype(dtype)
    fs = [dsize, 28 * 28, 196, 28 * 28]
    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])
    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    dsize = f(-1)
    n = len(fs) - 2
    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: u.VarInfo}
    def init_var(val, name, trainable=False, noinit=False):
        # Wraps a TF tensor or numpy value into a Variable; numpy values are
        # fed through a placeholder at init time, and each variable gets a
        # setter op recorded in vard.
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var
    lr = init_var(0.2, "lr")
    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)
    def sigmoid(x):
        return tf.sigmoid(x)
    def d_sigmoid(y):
        # derivative of sigmoid expressed in terms of its output y
        return y * (1 - y)
    def kl(x, y):
        # KL divergence between Bernoulli(x) and Bernoulli(y)
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))
    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y
    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)
    # A[0] must never be evaluated; the Print dep makes that loudly visible.
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])
    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize
    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])
    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW
    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        # Damped factor covariances, wrapped in variables so re-running the
        # initializer refreshes them from the live graph.
        cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0])
        cov_A[i] = init_var(cov_op, "cov_A%d" % (i, ))
        cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity(B2[i].shape[0])
        cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ), do_inverses=True)
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ), do_inverses=True)
        whitened_A = vars_svd_A[i].inv @ A[i]
        whitened_B = vars_svd_B2[i].inv @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize
    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)
    loss = reconstruction
    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")
    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)
    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            u.dump(s0, "As_%d_%d" % (i, step))
        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            u.dump(s0, "Bs_%d_%d" % (i, step))
    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call
    def update_covariances():
        # Re-running the initializers re-evaluates the live cov_op graphs.
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)
    def update_svds():
        # Only these factors are refreshed each step; svd_A[1] is handled once
        # before the loop since input activations are static.
        vars_svd_A[2].update()
        vars_svd_B2[2].update()
        vars_svd_B2[1].update()
    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)
    init_op = tf.global_variables_initializer()
    # Disable graph rewriting/optimization so manual op ordering is preserved.
    from tensorflow.core.protobuf import rewriter_config_pb2
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)
    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else
    print("Running training.")
    u.reset_time()
    step_lengths = []  # keep track of learning rates
    losses = []
    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative
    def update_cov_A(i):
        sess.run(cov_A[i].initializer)
    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)
    # only update whitening matrix of input activations in the beginning
    vars_svd_A[1].update()
    for step in range(40):
        update_covariances()
        update_svds()
        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)
        lr0, loss0 = sess.run([lr, loss])
        update_params_op.run()
        advance_batch()
        losses.append(loss0)
        step_lengths.append(lr0)
        print("Step %d loss %.2f" % (step, loss0))
        u.record_time()
    # Regression bounds on the final loss and per-step timing.
    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(u.global_time_list) < 50e-3, "Time should be 40ms on 1080"
    u.summarize_time()
    print("Test passed")
def main():
    """10 K-FAC steps via mode-switched backprop; asserts the final loss.

    Uses module-level capture lists `forward`/`backward` and inverse lists
    `forward_inv`/`backward_inv` shared with kfac_matmul's backward pass,
    steered by the global `mode`.
    """
    global mode
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    # feature sizes
    fs = [dsize, 28 * 28, 196, 28 * 28]
    # number of layers
    n = len(fs) - 2
    matmul = kfac_matmul
    class Net(nn.Module):
        # Autoencoder built from bare weight matrices W1..Wn.
        def __init__(self):
            super(Net, self).__init__()
            # W1 = (np.array([[0., 1], [2, 3]])).astype(dtype)/10
            # W2 = (np.array([[4., 5], [6, 7]])).astype(dtype)/10
            # self.W1 = nn.Parameter(torch.from_numpy(W1))
            # self.W2 = nn.Parameter(torch.from_numpy(W2))
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))
        def forward(self, input):
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)
    model = Net()
    if args.cuda:
        model.cuda()
    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    covA_inv_saved = [None] * n
    for step in range(10):
        mode = 'standard'
        output = model(data)
        # Capture pass: re-run forward so activations/backprops get recorded.
        mode = 'capture'
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        del forward_inv[:]
        del backward_inv[:]
        noise.normal_()
        output_hat = Variable(output.data + noise)
        output = model(data)
        err_hat = output_hat - output
        loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
        loss_hat.backward(retain_graph=True)
        # Capture lists are filled last-layer-first; flip to layer order.
        backward.reverse()
        forward.reverse()
        assert len(backward) == n
        assert len(forward) == n
        A = forward[:]
        B = backward[:]
        # compute inverses
        for i in range(n):
            # first layer doesn't change so only compute once
            if i == 0 and covA_inv_saved[i] is not None:
                covA_inv = covA_inv_saved[i]
            else:
                covA_inv = regularized_inverse(A[i] @ t(A[i]) / dsize)
                covA_inv_saved[i] = covA_inv
            forward_inv.append(covA_inv)
            covB_inv = regularized_inverse(B[i] @ t(B[i]) / dsize)
            backward_inv.append(covB_inv)
        # K-FAC pass: backward now applies the whitened gradient.
        mode = 'kfac'
        optimizer.zero_grad()
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()
        loss0 = loss.data.cpu().numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()
    # Regression targets differ between CPU and GPU numerics.
    if args.cuda:
        target = 2.337120533
    else:
        target = 2.335612774
    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    """Benchmark PyTorch L-BFGS on a tied-weight sigmoid autoencoder.

    Uses module globals `step` and `final_loss` so the L-BFGS closure can
    record the loss at its last iteration; returns that final loss.
    """
    global step, final_loss
    step = 0
    final_loss = None
    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    visible_size = 28 * 28
    hidden_size = 196
    batch = torch.Tensor(u.get_mnist_images(batch_size).T)[:batch_size]
    if cuda:
        batch = batch.cuda()
    data = Variable(batch)

    class Net(nn.Module):
        """Tied-weight autoencoder: one matrix encodes and, transposed, decodes."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))

        def forward(self, input):
            flat = input.view(-1, visible_size)
            hidden = torch.sigmoid(torch.mm(flat, self.encoder))
            recon = torch.sigmoid(torch.mm(hidden, torch.transpose(self.encoder, 0, 1)))
            return recon.view_as(input)

    net = Net()
    # Replace the random init with ng_init weights.
    net.encoder.data = torch.Tensor(u.ng_init(visible_size, hidden_size))
    if cuda:
        net.cuda()
    net.train()

    opt = optim.LBFGS(net.parameters(), max_iter=iters, history_size=100, lr=1.0)

    def closure():
        """One L-BFGS evaluation: zero grads, forward, loss, backward."""
        global step, final_loss
        opt.zero_grad()
        recon = net(data)
        loss = F.mse_loss(recon, data)
        if verbose:
            loss0 = loss.data[0]
            print("Step %3d loss %6.5f msec %6.3f" % (step, loss0, u.last_time()))
        step += 1
        if step == iters:
            final_loss = loss.data[0]
        loss.backward()
        u.record_time()
        return loss

    opt.step(closure)
    # Extra forward pass after optimization, matching the reference run.
    loss = F.mse_loss(net(data), data)
    loss0 = loss.data[0]
    if verbose:
        u.summarize_time()
    return final_loss
def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
    """Benchmark PyTorch L-BFGS on a tied-weight autoencoder; dump step times.

    Args:
        batch_size: number of MNIST examples.
        iters: L-BFGS max_iter (also where final_loss is captured).
        seed: RNG seed.
        cuda: run on GPU when True.
        history: L-BFGS history_size.
        verbose: print per-step stats and a timing summary.

    Returns:
        Global `final_loss` recorded by the closure at iteration == iters.
    """
    global step, final_loss
    step = 0
    final_loss = None
    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)
    visible_size = 28*28
    hidden_size = 196
    images = torch.Tensor(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.cuda()
    data = Variable(images)
    class Net(nn.Module):
        # Tied weights: the same matrix encodes and, transposed, decodes.
        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))
        def forward(self, input):
            x = input.view(-1, visible_size)
            x = torch.sigmoid(torch.mm(x, self.encoder))
            x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1)))
            return x.view_as(input)
    # initialize model and weights
    model = Net()
    model.encoder.data = torch.Tensor(u.ng_init(visible_size, hidden_size))
    if cuda:
        model.cuda()
    model.train()
    optimizer = optim.LBFGS(model.parameters(), max_iter=iters, history_size=history, lr=1.0)
    times = []  # per-step wall-clock readings, collected only when verbose
    def closure():
        # One L-BFGS function evaluation: zero grads, forward, loss, backward.
        global step, final_loss
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        if verbose:
            loss0 = loss.data[0]
            times.append(u.last_time())
            print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time()))
        step+=1
        if step == iters:
            final_loss = loss.data[0]  # capture loss at the final iteration
        loss.backward()
        u.record_time()
        return loss
    optimizer.step(closure)
    # Extra forward pass to get the post-optimization loss.
    output = model(data)
    loss = F.mse_loss(output, data)
    loss0 = loss.data[0]
    if verbose:
        u.summarize_time()
    # print(times)
    # Dump timings (skipping the first two warm-up steps) in {a,b,...} form.
    s = ','.join(["%f"%(n,) for n in times[2:]])
    print('{', s,'}')
    return final_loss