def detect(self, img, min_distance=10):
    """Run ``self._detect`` on ``img`` with one timed transform per level.

    For every level index below ``self.levels`` the ``transform`` of
    ``self.syms[level]`` is wrapped with ``timeit`` (passing ``self.log`` as
    logger and ``self.gpu_name`` as name function); the wrapped callables,
    ``img`` and ``min_distance`` are then forwarded to ``self._detect``.
    """
    timed_transforms = []
    for level in xrange(self.levels):
        timed_transforms.append(
            timeit(self.syms[level].transform,
                   logger=self.log,
                   name_fn=self.gpu_name))
    return self._detect(timed_transforms, img, min_distance)
def gen_data_for(opts, task):
    """Generate training data and follow-up sub-problems for one task.

    The CNF referenced by ``task.bcnf`` is fetched from blob storage,
    deserialized into a solver and propagated.  If propagation is
    inconclusive the solver is checked:

    * SAT     -- nothing is emitted;
    * UNSAT   -- the core data of the pre-check clone, plus whatever
      ``get_more_cores`` yields, is uploaded and recorded in ``btfds``;
    * UNKNOWN -- the problem is cubed and every cube is serialized as a new
      sub-problem recorded in ``new_bcnfs``.

    Args:
        opts: options object; ``find_max_tries`` and ``find_percent_to_keep``
            are read here, the rest is forwarded to helpers.
        task: object whose ``bcnf`` attribute names the input blob.

    Returns:
        ``TaskResult(btfds=..., new_bcnfs=...)``.
    """
    bbs = BlockBlobService(account_name=auth.store_name(),
                           account_key=auth.store_key())
    sdimacs = from_blob(opts, bbs, task.bcnf, delete=False)
    ctx = solver.Context()
    s = solver.deserialize(ctx, sutil.mk_opts(opts), sdimacs)

    btfds = []
    new_bcnfs = []

    def push_tfd(tfd):
        btfds.append(to_blob(opts, bbs, sutil.tfd_to_py(tfd)))

    propagate_status = s.propagate()
    if propagate_status == solver.Status.UNKNOWN:
        # Snapshot taken before check() mutates `s`.  propagate() is called
        # outside the assert so that it still runs under `python -O`.
        s_pre_check = s.clone(ctx)
        pre_check_status = s_pre_check.propagate()
        assert pre_check_status == solver.Status.UNKNOWN

        check_status, check_time = util.timeit(s.check)
        if check_status == solver.Status.SAT:
            pass
        elif check_status == solver.Status.UNSAT:
            push_tfd(s_pre_check.to_tf_data_with_core())
            more_cores = s_pre_check.get_more_cores(
                ctx=ctx,
                max_tries=opts.find_max_tries,
                percent_to_keep=opts.find_percent_to_keep)
            for tfd in more_cores:
                push_tfd(tfd)
        else:
            assert check_status == solver.Status.UNKNOWN
            # Same reasoning as above: keep the call out of the assert.
            s_pre_cube = s.clone(ctx)
            pre_cube_status = s_pre_cube.propagate()
            assert pre_cube_status == solver.Status.UNKNOWN

            (cube_status, cubes), cube_time = util.timeit(s.cube)
            if cube_status == solver.Status.UNKNOWN:
                assert len(cubes) in [1, 2]
                random.shuffle(cubes)
                for cube in cubes:
                    s_child = s.clone(ctx)
                    s_child.add(cube)
                    new_bcnfs.append(to_blob(opts, bbs, s_child.serialize()))

    return TaskResult(btfds=btfds, new_bcnfs=new_bcnfs)
def advance_batch():
    """Re-initialize the sampled labels and advance the model's batch/step.

    Prints the model name and current step, runs the initializers of
    ``sampled_labels`` and ``_sampled_labels``, runs ``advance_batch_op``
    (timed) when ``args.advance_batch`` is truthy, and finally runs
    ``advance_step_op``.
    """
    print("Step for model(%s) is %s"%(model.name, u.eval(model.step)))
    # NOTE(review): `sess` is not used below -- confirm the call is only kept
    # for a side effect before removing it.
    sess = u.get_default_session()
    # TODO: get rid of _sampled_labels
    sessrun([sampled_labels.initializer, _sampled_labels.initializer])
    if args.advance_batch:
        with u.timeit("advance_batch"):
            sessrun(advance_batch_op)
    sessrun(advance_step_op)
def main() -> None:
    """Dispatch on the sub-command given in ``sys.argv[1]``.

    * ``call``      -- forward the remaining arguments to ``rpc.call``.
    * ``rpc-step``  -- run a single encoded step and write its encoded result
      to stdout.
    * ``stdin-rpc`` -- decode a pickled list of steps from ``sys.argv[2]``
      and run each one; a step whose ``user`` differs from the current user
      is re-executed through ``sudo -u <user>``, otherwise it runs
      in-process.  The encoded results are written to stdout joined by
      commas.

    Raises:
        AssertionError: if the sub-command is not one of the above.
    """
    cmd = sys.argv[1]
    if cmd == "call":
        rpc.call(*sys.argv[2:])
    elif cmd == "rpc-step":
        payload = sys.argv[2]
        encoded = run_step("rpc-step", payload)
        sys.stdout.write(encoded)
    elif cmd == "stdin-rpc":
        payload = sys.argv[2]
        decoded = b64decode(payload)
        # SECURITY NOTE(review): pickle.loads executes arbitrary code embedded
        # in the payload; this is only safe if the caller is fully trusted.
        steps = pickle.loads(decoded)
        results = []
        # The current user cannot change between steps, so `id` is spawned at
        # most once (on the first step) instead of once per step.
        current_user = None
        for step in steps:
            user = step['user']
            msg = step['payload']
            if current_user is None:
                current_user = check_output(
                    ["id", "--user", "--name"]).decode().strip()
            step_payload = b64encode(pickle.dumps(msg, 0)).decode()
            args = ["python3", sys.argv[0], "rpc-step", step_payload]
            if user is not None and user != current_user:
                args = ["sudo", "-u", user] + args
                with timeit("rpc-step"):
                    result = check_output(args).decode()
                    results.append(result)
            else:
                with timeit("rpc-int"):
                    result = run_step("rpc-int", step_payload)
                    results.append(result)
        sys.stdout.write(",".join(results))
    else:
        # Raised explicitly: a bare `assert 0` is stripped under `python -O`,
        # which would let an unknown command fall through silently.
        raise AssertionError("unknown command")
def _build(name: str, repo: str, commit: str):
    """Fetch ``commit`` of ``repo`` into a fresh ``build/`` checkout and build it.

    A bare mirror of ``repo`` is kept under ``~/git/<last path component>``
    and fetched from its ``origin``; ``build/`` is then initialised in the
    current directory, fetches the single commit from that local mirror,
    hard-resets to it and runs ``./services/<name>/build.sh`` with stdout
    redirected to stderr.

    Args:
        name: service name, used to locate ``services/<name>/build.sh``.
        repo: URL of the upstream repository.
        commit: commit to fetch and check out.

    Raises:
        subprocess.CalledProcessError: if the build script fails (the ``git``
            helper relies on ``check_call``, which is defined elsewhere).
    """
    exe = find_program("git", "/usr/bin")

    def git(cmd):
        log(f"{exe} {cmd}")
        return check_call([exe] + cmd.split())

    git_home = os.path.expanduser("~/git")
    ensure_dir(git_home, 0o700)
    dst = repo.split("/")[-1]
    with cd(git_home):
        local_repo = os.path.abspath(dst)
        if not os.path.exists(dst):
            git(f"init -q --bare {dst}")
        with cd(dst):
            # Lets the build checkout fetch an arbitrary reachable commit id.
            git("config --local uploadpack.allowreachablesha1inwant true")
            # Query the configured origin with an argv list rather than a
            # shell string, so the git path is never interpreted by a shell.
            # stderr is folded into stdout as getstatusoutput used to do.
            probe = subprocess.run(
                [exe, "remote", "get-url", "origin"],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
            rc, existing_repo = probe.returncode, probe.stdout
            if rc != 0:
                git(f"remote add origin {repo}")
            elif existing_repo.strip() != repo:
                git(f"remote set-url origin {repo}")
            with timeit("remote fetch"):
                git("fetch")
    cmd = "init -q build"
    git(cmd)
    with cd("build"):
        git(f"remote add origin {local_repo}")
        with timeit("local fetch"):
            git(f"fetch -q origin {commit} --depth 1")
        git(f"reset -q --hard {commit}")
        with timeit("build"):
            subprocess.check_call(f"./services/{name}/build.sh", stdout=sys.stderr)
def backtrack(image, rows, cols, all_rows, all_cols, filter_interval=25):
    """Probe every undecided cell of ``image`` with both values.

    Each cell holding '?' is tentatively set to '#' and then to '.', and
    ``consequences`` is asked whether each choice is still possible.  The
    cell ends up '.', '#' or back at '?' depending on the two answers.
    ``image`` is modified in place during probing.

    Returns:
        The ``(image, possible)`` pair from a final
        ``consequences(..., safe=True)`` call.
    """
    points = product(range(len(cols)), range(len(rows)))
    # The key equals -max(|x - w/2|, |y - h/2|), so cells whose larger axis
    # distance from the centre is greatest are visited first.
    points = sorted(points,
                    key=lambda p: min(-abs(p[0] - len(cols) / 2), -abs(p[
                        1] - len(rows) / 2)))
    counter = 0
    for x, y in points:
        if image[y][x] == '?':
            # Try '#' first.
            image[y][x] = '#'
            _, possible = consequences(image, rows, cols, all_rows, all_cols)
            # Then '.'; if '#' was impossible the cell simply stays '.'.
            image[y][x] = '.'
            if possible:
                _, possible = consequences(image, rows, cols, all_rows,
                                           all_cols)
                if not possible:
                    # '.' is impossible while '#' was possible: must be '#'.
                    image[y][x] = '#'
                else:
                    # Both values are possible: leave the cell undecided.
                    image[y][x] = '?'
            if counter >= filter_interval:
                # Periodically re-filter the domains and redraw.
                all_rows, all_cols = filter_domains(image, all_rows, all_cols,
                                                    True)
                image, possible = consequences(image,
                                               rows,
                                               cols,
                                               all_rows,
                                               all_cols,
                                               safe=True)
                # NOTE(review): timeit is called with a label only here;
                # presumably it prints elapsed time -- confirm in its definition.
                timeit('SHOW')
                draw(image)
                counter = -1  # becomes 0 after the increment below
            counter += 1
    image, possible = consequences(image,
                                   rows,
                                   cols,
                                   all_rows,
                                   all_cols,
                                   safe=True)
    return image, possible
def update_stats(self):  # todo: split into separate stats/svd updates
    """Refresh the covariance ops, then the SVDs, of every variable in self.

    Covariance update ops of the ``A`` and ``B2`` factors are run together
    in a single ``u.run`` call; the SVDs are then updated one variable at a
    time while holding ``self.write_lock()``.
    """
    # Covariances: A's op then B2's op, per variable, in iteration order.
    cov_ops = [
        op
        for var in self
        for op in (self[var].A.cov_update_op, self[var].B2.cov_update_op)
    ]
    with u.timeit("covariances"):
        u.run(cov_ops)

    # Unused below, but kept: it iterates over self exactly as before.
    corrected_vars = list(self)

    # SVDs.
    with u.timeit("svd"), self.write_lock():
        for var in self:
            # With dont_update_first_layer set, A's SVD is only computed
            # while its update counter is still zero.
            if not dont_update_first_layer or self[var].A.svd.update_counter==0:
                self[var].A.svd.update()
            self[var].B2.svd.update()
def query(self, args):
    """Evaluate ``self.guesses`` for one problem and report the timing.

    Args:
        args: object providing ``n_vars``, ``n_clauses`` and ``CL_idxs``,
            which are fed to the placeholders of the same names on ``self``.

    Returns:
        A dict with keys ``"n_secs_gpu"`` and ``"pi_core_var_logits"``, or
        ``None`` when TensorFlow raises an ``OpError`` (which is logged).
    """
    try:
        run = self.sess.run
        fetches = self.guesses
        feed = {
            self.n_vars: args.n_vars,
            self.n_clauses: args.n_clauses,
            self.CL_idxs: args.CL_idxs,
        }
        guesses, n_secs_gpu = util.timeit(run, fetches, feed_dict=feed)
        response = {
            "n_secs_gpu": n_secs_gpu,
            "pi_core_var_logits": guesses.pi_core_var_logits,
        }
    except tf.errors.OpError as e:
        util.log(kind='error', author='pyro-server', msg=str(e))
        response = None
    return response
def main():
    """Train a small fully connected net and periodically log curvature stats.

    Each outer step logs validation loss/accuracy, optionally (unless
    ``args.skip_stats``) computes per-parameter gradient and Hessian
    statistics on a fixed stats batch, and then runs ``args.train_steps``
    SGD steps.  Configuration is read from the module-level ``args``.
    """
    u.install_pdb_handler()
    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print(f"Logging to {logdir}")

    loss_type = 'CrossEntropy'

    d1 = args.data_width ** 2
    # Batch sizes cannot exceed the dataset size.
    args.stats_batch_size = min(args.stats_batch_size, args.dataset_size)
    args.train_batch_size = min(args.train_batch_size, args.dataset_size)
    n = args.stats_batch_size
    o = 10
    d = [d1, 60, 60, 60, o]  # layer widths, input to output

    # dataset_size = args.dataset_size
    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin, last_layer_linear=True)
    model = model.to(gl.device)
    u.mark_expensive(model.layers[0])    # to stop grad1/hess calculations on this layer
    print(model)

    # wandb setup is best-effort: any failure is printed and training continues.
    try:
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name, dir='/tmp/wandb.runs')
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width, dataset_size=args.dataset_size, loss_type=loss_type)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=True)
    stats_iter = u.infinite_iter(stats_loader)
    # The same stats batch is reused for every outer step.
    stats_data, stats_targets = next(stats_iter)

    test_dataset = u.TinyMNIST(data_width=args.data_width, train=False, dataset_size=args.dataset_size, loss_type=loss_type)
    test_batch_size = min(args.dataset_size, 1000)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    else:   # loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.reset_global_step()
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        # Wall time of the previous outer iteration, in milliseconds.
        if last_outer:
            u.log_scalars({"time/outer": 1000*(time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        # Validation loss on a single test batch.
        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        with u.timeit("validate"):
            if loss_type == 'CrossEntropy':
                val_accuracy, val_loss = validate(model, test_loader, f'test (stats_step {step})')
                # train_accuracy, train_loss = validate(model, train_loader, f'train (stats_step {step})')

                metrics = {'stats_step': step, 'val_accuracy': val_accuracy, 'val_loss': val_loss}
                u.log_scalars(metrics)

        data, targets = stats_data, stats_targets

        if not args.skip_stats:
            # Capture Hessian and gradient stats
            autograd_lib.enable_hooks()
            autograd_lib.clear_backprops(model)
            autograd_lib.clear_hess_backprops(model)
            with u.timeit("backprop_g"):
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward(retain_graph=True)
            with u.timeit("backprop_H"):
                autograd_lib.backprop_hess(output, hess_type=loss_type)
            autograd_lib.disable_hooks()   # TODO(y): use remove_hooks

            with u.timeit("compute_grad1"):
                autograd_lib.compute_grad1(model)
            with u.timeit("compute_hess"):
                autograd_lib.compute_hess(model)

            for (i, layer) in enumerate(model.layers):

                if hasattr(layer, 'expensive'):
                    continue

                param_names = {layer.weight: "weight", layer.bias: "bias"}
                for param in [layer.weight, layer.bias]:
                    # input/output layers are unreasonably expensive if not using Kronecker factoring
                    if d[i]*d[i+1] > 8000:
                        print(f'layer {i} is too big ({d[i],d[i+1]}), skipping stats')
                        continue

                    s = AttrDefault(str, {})  # dictionary-like object for layer stats

                    #############################
                    # Gradient stats
                    #############################
                    A_t = layer.activations
                    B_t = layer.backprops_list[0] * n

                    s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()  # proportion of activations that are zero
                    s.mean_activation = torch.mean(A_t)
                    s.mean_backprop = torch.mean(B_t)

                    # empirical Fisher
                    G = param.grad1.reshape((n, -1))  # one row per example
                    g = G.mean(dim=0, keepdim=True)   # mean gradient, kept as a row

                    u.nan_check(G)
                    with u.timeit(f'sigma-{i}'):
                        efisher = G.t() @ G / n
                        sigma = efisher - g.t() @ g  # second moment minus outer product of the mean
                        # sigma_spectrum =
                        s.sigma_l2 = u.sym_l2_norm(sigma)
                        s.sigma_erank = torch.trace(sigma)/s.sigma_l2

                    H = param.hess
                    lambda_regularizer = args.lmb * torch.eye(H.shape[0]).to(gl.device)
                    u.nan_check(H)

                    with u.timeit(f"invH-{i}"):
                        invH = torch.cholesky_inverse(H+lambda_regularizer)

                    with u.timeit(f"H_l2-{i}"):
                        s.H_l2 = u.sym_l2_norm(H)
                        s.iH_l2 = u.sym_l2_norm(invH)

                    with u.timeit(f"norms-{i}"):
                        s.H_fro = H.flatten().norm()
                        s.iH_fro = invH.flatten().norm()
                        s.grad_fro = g.flatten().norm()
                        s.param_fro = param.data.flatten().norm()

                    def loss_direction(dd: torch.Tensor, eps):
                        """loss improvement if we take step eps in direction dd"""
                        return u.to_python_scalar(eps * (dd @ g.t()) - 0.5 * eps ** 2 * dd @ H @ dd.t())

                    def curv_direction(dd: torch.Tensor):
                        """Curvature in direction dd"""
                        return u.to_python_scalar(dd @ H @ dd.t() / (dd.flatten().norm() ** 2))

                    with u.timeit(f"pinvH-{i}"):
                        pinvH = u.pinv(H)

                    with u.timeit(f'curv-{i}'):
                        s.grad_curv = curv_direction(g)  # curvature (eigenvalue) in direction g
                        ndir = g @ pinvH  # newton direction
                        s.newton_curv = curv_direction(ndir)
                        setattr(layer.weight, 'pre', pinvH)  # save Newton preconditioner
                        # 1234567 is a placeholder used when the curvature is zero.
                        s.step_openai = 1 / s.grad_curv if s.grad_curv else 1234567
                        s.step_div_inf = 2 / s.H_l2  # divergent step size for batch_size=infinity
                        s.step_div_1 = torch.tensor(2) / torch.trace(H)  # divergent step for batch_size=1

                        s.newton_fro = ndir.flatten().norm()  # frobenius norm of Newton update
                        s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)  # replace with "quadratic_form"
                        s.regret_gradient = loss_direction(g, s.step_openai)

                    with u.timeit(f'rho-{i}'):
                        s.rho, s.lyap_erank, lyap_evals = u.truncated_lyapunov_rho(H, sigma)
                        s.step_div_1_adjusted = s.step_div_1/s.rho

                    with u.timeit(f"batch-{i}"):
                        s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                        s.diversity = torch.norm(G, "fro") ** 2 / torch.norm(g) ** 2 / n  # Gradient diversity / n
                        s.noise_variance_pinv = torch.trace(pinvH @ sigma)
                        s.H_erank = torch.trace(H) / s.H_l2
                        s.batch_jain_simple = 1 + s.H_erank
                        s.batch_jain_full = 1 + s.rho * s.H_erank

                    param_name = f"{layer.name}={param_names[param]}"
                    u.log_scalars(u.nest_stats(f"{param_name}", s))

                    H_evals = u.symeig_pos_evals(H)
                    sigma_evals = u.symeig_pos_evals(sigma)
                    u.log_spectrum(f'{param_name}/hess', H_evals)
                    u.log_spectrum(f'{param_name}/sigma', sigma_evals)
                    u.log_spectrum(f'{param_name}/lyap', lyap_evals)

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                optimizer.step()
                # Weight decay applied directly to the parameters after the step.
                if args.weight_decay:
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.data.mul_(1-args.weight_decay)

                gl.increment_global_step(data.shape[0])

    gl.event_writer.close()
def main1(cnt):
    """Time ``v.save`` and then ``v.read`` on "ESV", passing ``times=cnt``."""
    for method_name in ("save", "read"):
        timeit(getattr(v, method_name), "ESV", times=cnt)
import sys

from util import timeit

sys.setrecursionlimit(10000)

if __name__ == '__main__':
    from algorithms.is_prime_number import *

    # Inputs: 30 and 1, every perfect square from 2**2 to 29**2 (none of
    # which is prime), and finally the prime 907.
    test_cases = [(30, False), (1, False)]
    test_cases.extend((root * root, False) for root in range(2, 30))
    test_cases.append((907, True))

    # Only the inputs are timed; the list is repeated 1000 times.
    candidates = [value for value, _expected in test_cases] * 1000

    timeit(lambda: [is_prime_standard(el) for el in candidates])
    timeit(lambda: [is_prime(el) for el in candidates])

    print("Tree stuff")
    from algorithms.linear_recursion import test
    test()
def test_factored_stats_golden_values():
    """Test stats from values generated by non-factored version

    Builds a single-layer linear model on a 5-example, 2x2 TinyMNIST subset,
    computes the factored statistics and compares them to the values stored
    in ``test/factored.pt``.
    """
    u.seed_random(1)
    u.install_pdb_handler()
    torch.set_default_dtype(torch.float32)

    # NOTE(review): parse_args() reads sys.argv; the parser defines no
    # options, so any extra command-line argument would make it exit.
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    args = parser.parse_args()

    logdir = u.create_local_logdir('/temp/runs/factored_test')
    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print('logging to ', logdir)

    loss_type = 'LeastSquares'

    # args is used as a plain namespace for the test configuration.
    args.data_width = 2
    args.dataset_size = 5
    args.stats_batch_size = 5
    d1 = args.data_width**2
    args.stats_batch_size = args.dataset_size
    args.stats_steps = 1

    n = args.stats_batch_size
    o = 10
    d = [d1, o]

    model = u.SimpleFullyConnected2(d, bias=False, nonlin=0)
    model = model.to(gl.device)
    print(model)

    dataset = u.TinyMNIST(data_width=args.data_width,
                          dataset_size=args.dataset_size,
                          loss_type=loss_type)
    stats_loader = torch.utils.data.DataLoader(
        dataset, batch_size=args.stats_batch_size, shuffle=False)
    stats_iter = u.infinite_iter(stats_loader)
    stats_data, stats_targets = next(stats_iter)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    else:  # loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.reset_global_step()
    last_outer = 0

    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        data, targets = stats_data, stats_targets

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)

        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)

        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)
            autograd_lib.compute_hess(model, method='kron', attr_name='hess2')

        autograd_lib.compute_stats_factored(model)

    # The model has no bias, so there is exactly one parameter tensor.
    params = list(model.parameters())
    assert len(params) == 1
    new_values = params[0].stats
    golden_values = torch.load('test/factored.pt')

    for valname in new_values:
        print("Checking ", valname)
        if valname == 'sigma_l2':
            u.check_close(new_values[valname],
                          golden_values[valname],
                          atol=1e-2)  # sigma is approximate
        elif valname == 'sigma_erank':
            u.check_close(new_values[valname],
                          golden_values[valname],
                          atol=0.11)  # 1.0 vs 1.1
        elif valname in ['rho', 'step_div_1_adjusted', 'batch_jain_full']:
            continue  # lyapunov stats weren't computed correctly in golden set
        elif valname in ['batch_openai']:
            continue  # batch sizes depend on sigma which is approximate
        elif valname in ['noise_variance_pinv']:
            pass  # went from 0.22 to 0.014 after kron factoring (0.01 with full centering, 0.3 with no centering)
        elif valname in ['sparsity']:
            pass  # had a bug in old calc (using integer arithmetic)
        else:
            u.check_close(new_values[valname],
                          golden_values[valname],
                          rtol=1e-4,
                          atol=1e-6,
                          label=valname)

    gl.event_writer.close()
def read_lock(self):
    """Yield once while holding ``self.lock``.

    Both acquiring and holding the lock happen inside a
    ``u.timeit("read_lock")`` block.
    NOTE(review): presumably wrapped by ``contextlib.contextmanager`` where
    it is defined -- the decorator is not visible here.
    """
    with u.timeit("read_lock"), self.lock:
        yield
sim.taskmanager.Task("Hunt", sim.taskmanager.TaskType.GATHER), sim.taskmanager.Task("Cook", sim.taskmanager.TaskType.ACTIVITY), sim.taskmanager.Task("Clean", sim.taskmanager.TaskType.ACTIVITY), sim.taskmanager.Task("Gather Water", sim.taskmanager.TaskType.GATHER), ] task_manager.randomly_assign(tasks) # terrain terrain = sim.world.Terrain([ sim.world.Resource("Hunt", 2, entity.Point(-55, -33, 0)), sim.world.Resource("Cook", 4, entity.Point(15, 25, 0)), sim.world.Resource("Clean", 2, entity.Point(0, 0, 0)), sim.world.Resource("Gather Water", 100, entity.Point(5, 5, 0)), ]) # renderer renderer = Renderer() event_processor = EventProcessor() # run sim world = sim.world.World(terrain, entity_manager, task_manager) while True: sim.world.run(world, renderer, event_processor) time.sleep(0.16) if __name__ == "__main__": os.system("cls") time_ran = util.timeit(run) print(f"Runtime: [bold green]{time_ran} [red]seconds")
def main():
    """Train the model with Adam on K-FAC-corrected gradients and log losses.

    Sets up a TensorFlow session, builds the main model and the ``Kfac``
    helper, runs ``args.num_steps`` training steps and, depending on
    ``args.mode``, records the losses or compares them against stored
    release values.  Configuration is read from the module-level ``args``.
    """
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):

        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        # NOTE(review): the bare except silently falls back to
        # rewrite_options=None on any error, including KeyboardInterrupt.
        except:
            pass

        optimizer_options = tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)

        sess = tf.InteractiveSession(config=config)
        u.register_default_session(
            sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    # Only 'run' mode honours args.lr; every other mode uses a fixed 0.001.
    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    # Capture the variables created by apply_gradients so they can be
    # initialized separately below.
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0

    # todo, unify the two data outputs
    # NOTE(review): outfn is not used anywhere below in this function.
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):

        # Validation loss is refreshed only every validate_every_n steps;
        # in between, the last value of vloss0 is logged again.
        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        # Synchronous mode: stats are updated inline before each train step.
        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)

    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
def main():
    """Benchmark triangular solves in SciPy, PyTorch and TensorFlow.

    For each backend an ``n x n`` Cholesky factor is built, one solve is run
    as warm-up where the code does so, and ``iters`` solves are timed under
    a backend-specific ``u.timeit`` label.  A summary is printed at the end.
    """
    u.reset_timeit()

    iters = 11
    n = 10000

    print(f"Benchmarking n={n}")

    ############################################################
    # Numpy
    ############################################################
    A = scipy.randn(n, n)  # random matrix
    A = A @ A.T  # positive definite matrix
    A = scipy.linalg.cholesky(A)  # upper diagonal matrix
    b = scipy.randn(n)

    u.reset_timeit()
    for i in range(iters):
        with u.timeit('numpy'):
            scipy.linalg.solve_triangular(A, b)

    ############################################################
    # PyTorch GPU
    ############################################################
    A = torch.randn(n, n)
    A = A @ A.t() + torch.diag(torch.ones(n))
    A = torch.potrf(A).cuda()
    b = torch.randn(n, 1).cuda()

    # prewarm
    torch.trtrs(b, A)
    for i in range(iters):
        # Synchronize before and inside the timed region so that the
        # measurement covers the whole solve.
        torch.cuda.synchronize()
        with u.timeit('Pytorch GPU'):
            result = torch.trtrs(b, A)
            torch.cuda.synchronize()
        del result

    ############################################################
    # PyTorch CPU
    ############################################################
    A = torch.randn(n, n)
    A = A @ A.t() + torch.diag(torch.ones(n))
    A = torch.potrf(A)
    b = torch.randn(n, 1)

    # prewarm
    (result, A_clone) = torch.trtrs(b, A)
    assert result.device.type == 'cpu'

    for i in range(iters):
        torch.cuda.synchronize()
        with u.timeit('Pytorch CPU'):
            result = torch.trtrs(b, A)
            torch.cuda.synchronize()
        del result

    ############################################################
    # PyTorch GPU
    ############################################################
    # Second GPU run: reuses the 'Pytorch GPU' label, and reads an element
    # of the result back instead of calling synchronize.
    A = torch.randn(n, n)
    A = A @ A.t() + torch.diag(torch.ones(n))
    A = torch.potrf(A).cuda()
    b = torch.randn(n, 1).cuda()

    # prewarm
    (result, A_clone) = torch.trtrs(b, A)
    assert result.device.type == 'cuda'

    for i in range(iters):
        torch.cuda.synchronize()
        with u.timeit('Pytorch GPU'):
            (result, dummy) = torch.trtrs(b, A)
            print(result[0, 0])
            # torch.cuda.synchronize()
        del result

    ############################################################
    # Tensorflow GPU
    ############################################################
    A = tf.random_normal((n, n)).gpu()
    b = tf.random_normal((n, 1)).gpu()
    A = A @ tf.transpose(A) + tf.diag(tf.ones(
        (n, )))  # bug, diag is needed, or Cholesky fails
    A = tf.cholesky(A)
    # bug, Should be able to do constant conversion, but fails with
    # Internal: failed to query device pointer for context: CUDA_ERROR_INVALID_VALUE
    #    A = tf.constant(A).gpu()
    #    b = tf.constant(b).gpu()

    # prewarm
    result = tf.contrib.eager.Variable(tf.zeros((n, 1)))
    result.assign(tf.linalg.triangular_solve(A, b))
    assert 'gpu' in result.device.lower()
    for i in range(iters):
        b += 1  # prevent caching
        with u.timeit('TF GPU'):
            result.assign(tf.linalg.triangular_solve(A, b))
            print(result[0, 0])

    ############################################################
    # Tensorflow CPU
    ############################################################
    A = tf.random_normal((n, n)).cpu()
    b = tf.random_normal((n, 1)).cpu()
    A = A @ tf.transpose(A) + tf.diag(tf.ones(
        (n, )))  # bug, diag is needed, or Cholesky fails
    A = tf.cholesky(A)
    A = A.cpu()
    b = b.cpu()

    # prewarm
    with tf.device('/cpu:0'):
        result = tf.contrib.eager.Variable(tf.zeros((n, 1)))
    result.assign(tf.linalg.triangular_solve(A, b))
    assert 'cpu' in result.device.lower()
    for i in range(iters):
        b += 1  # prevent caching
        with u.timeit('TF CPU'):
            result.assign(tf.linalg.triangular_solve(A, b))

    u.summarize_timeit()
# # add offset to value of (m, m) to get value at (y, x): # if square(m) is even, the max value for (m, m) lies on y-axis: # v += y - x # move towards the max axis value # if square(m) is odd, the max value for (m, m) lies on x-axis # v += x - y v = m**2 - m + 1 + (y-x if m % 2 == 0 else x-y) r.append(v) return r if __name__ == '__main__': for r in range(1, 7): tests = (f'{r} {c}' for c in range(1, 7)) print(*number_spiral(*tests), sep='\t') print() benchmark = timeit(lambda *tests: number_spiral(*tests)) benchmark('2 3', '1 1', '4 2') ''' terminal 1 2 9 10 25 26 4 3 8 11 24 27 5 6 7 12 23 28 16 15 14 13 22 29 17 18 19 20 21 30 36 35 34 33 32 31 run <lambda>('2 3', '1 1', '4 2') got [8, 1, 15] in 0.0000378000 secs. '''
def main():
    """Train a deep fully connected MNIST net, logging metrics and curvature stats.

    Parses command-line options into ``gl.args``, builds the model and data
    loaders, and for each of ``args.stats_steps`` outer steps: optionally
    runs an SWA pass, validates on test and stats data, optionally computes
    factored curvature statistics, and runs ``args.train_steps`` SGD steps.
    """
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')

    parser.add_argument('--wandb',
                        type=int,
                        default=0,
                        help='log to weights and biases')
    parser.add_argument('--autograd_check',
                        type=int,
                        default=0,
                        help='autograd correctness checks')
    parser.add_argument('--logdir',
                        type=str,
                        default='/tmp/runs/curv_train_tiny/run')

    parser.add_argument('--nonlin',
                        type=int,
                        default=1,
                        help="whether to add ReLU nonlinearity between layers")
    parser.add_argument('--bias',
                        type=int,
                        default=1,
                        help="whether to add bias between layers")

    parser.add_argument('--layer',
                        type=int,
                        default=-1,
                        help="restrict updates to this layer")
    parser.add_argument('--data_width', type=int, default=28)
    parser.add_argument('--targets_width', type=int, default=28)
    parser.add_argument(
        '--hess_samples',
        type=int,
        default=1,
        help='number of samples when sub-sampling outputs, 0 for exact hessian'
    )
    parser.add_argument('--hess_kfac',
                        type=int,
                        default=0,
                        help='whether to use KFAC approximation for hessian')
    parser.add_argument('--compute_rho',
                        type=int,
                        default=0,
                        help='use expensive method to compute rho')
    parser.add_argument('--skip_stats',
                        type=int,
                        default=1,
                        help='skip all stats collection')

    parser.add_argument('--dataset_size', type=int, default=60000)
    parser.add_argument('--train_steps',
                        type=int,
                        default=1000,
                        help="this many train steps between stat collection")
    parser.add_argument('--stats_steps',
                        type=int,
                        default=1000000,
                        help="total number of curvature stats collections")

    parser.add_argument('--full_batch',
                        type=int,
                        default=0,
                        help='do stats on the whole dataset')
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--weight_decay', type=float, default=2e-5)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--dropout', type=int, default=0)
    parser.add_argument('--swa', type=int, default=0)
    parser.add_argument('--lmb', type=float, default=1e-3)

    parser.add_argument('--train_batch_size', type=int, default=64)
    parser.add_argument('--stats_batch_size', type=int, default=10000)
    parser.add_argument('--uniform',
                        type=int,
                        default=0,
                        help='use uniform architecture (all layers same size)')
    parser.add_argument('--run_name', type=str, default='noname')

    gl.args = parser.parse_args()
    args = gl.args
    u.seed_random(1)

    gl.project_name = 'train_ciresan'
    u.setup_logdir_and_event_writer(args.run_name)
    print(f"Logging to {gl.logdir}")

    d1 = 28 * 28
    # Layer widths, input to output.
    if args.uniform:
        d = [784, 784, 784, 784, 784, 784, 10]
    else:
        d = [784, 2500, 2000, 1500, 1000, 500, 10]
    o = 10
    n = args.stats_batch_size
    model = u.SimpleFullyConnected2(d,
                                    nonlin=args.nonlin,
                                    bias=args.bias,
                                    dropout=args.dropout)
    model = model.to(gl.device)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum)
    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          original_targets=True,
                          dataset_size=args.dataset_size)
    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    # While asserts are enabled, full_batch is rejected here, so the
    # full_batch branches further down are not reached.
    assert not args.full_batch, "fixme: validation still uses stats_iter"
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.stats_batch_size,
            shuffle=False,
            drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)
    else:
        stats_iter = None

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width,
                               train=False,
                               original_targets=True,
                               dataset_size=args.dataset_size)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.stats_batch_size,
                                              shuffle=False,
                                              drop_last=False)

    loss_fn = torch.nn.CrossEntropyLoss()
    autograd_lib.add_hooks(model)
    # Hooks stay disabled except while stats are being captured.
    autograd_lib.disable_hooks()

    gl.token_count = 0
    last_outer = 0

    for step in range(args.stats_steps):
        # token_count is the number of training examples seen so far.
        epoch = gl.token_count // 60000
        print(gl.token_count)
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        # compute validation loss
        if args.swa:
            model.eval()
            with u.timeit('swa'):
                base_opt = torch.optim.SGD(model.parameters(),
                                           lr=args.lr,
                                           momentum=args.momentum)
                opt = torchcontrib.optim.SWA(base_opt,
                                             swa_start=0,
                                             swa_freq=1,
                                             swa_lr=args.lr)
                for _ in range(100):
                    # NOTE(review): zero_grad is called on `optimizer` while
                    # the step is taken with `opt` -- confirm this is intended.
                    optimizer.zero_grad()
                    data, targets = next(train_iter)
                    model.zero_grad()
                    output = model(data)
                    loss = loss_fn(output, targets)
                    loss.backward()
                    opt.step()
                opt.swap_swa_sgd()

        with u.timeit("validate"):
            val_accuracy, val_loss = validate(model, test_loader,
                                              f'test (epoch {epoch})')
            train_accuracy, train_loss = validate(model, stats_loader,
                                                  f'train (epoch {epoch})')

        # save log
        metrics = {
            'epoch': epoch,
            'val_accuracy': val_accuracy,
            'val_loss': val_loss,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'lr': optimizer.param_groups[0]['lr'],
            'momentum': optimizer.param_groups[0].get('momentum', 0)
        }
        u.log_scalars(metrics)

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        if not args.skip_stats:
            autograd_lib.enable_hooks()
            autograd_lib.clear_backprops(model)
            autograd_lib.clear_hess_backprops(model)
            with u.timeit("backprop_g"):
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward(retain_graph=True)
            with u.timeit("backprop_H"):
                autograd_lib.backprop_hess(output, hess_type='CrossEntropy')
            autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

            with u.timeit("compute_grad1"):
                autograd_lib.compute_grad1(model)
            with u.timeit("compute_hess"):
                autograd_lib.compute_hess(model,
                                          method='kron',
                                          attr_name='hess2')

            autograd_lib.compute_stats_factored(model)

            for (i, layer) in enumerate(model.layers):
                param_names = {layer.weight: "weight", layer.bias: "bias"}
                for param in [layer.weight, layer.bias]:

                    if param is None:
                        continue

                    # Only parameters that had stats attached are logged.
                    if not hasattr(param, 'stats'):
                        continue
                    s = param.stats
                    param_name = param_names[param]
                    u.log_scalars(u.nest_stats(f"{param_name}", s))

        # gradient steps
        model.train()
        last_inner = 0
        for i in range(args.train_steps):
            if last_inner:
                u.log_scalars(
                    {"time/inner": 1000 * (time.perf_counter() - last_inner)})
            last_inner = time.perf_counter()

            optimizer.zero_grad()
            data, targets = next(train_iter)
            model.zero_grad()
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward()

            optimizer.step()
            # Weight decay applied directly to the parameters after the step.
            if args.weight_decay:
                for group in optimizer.param_groups:
                    for param in group['params']:
                        param.data.mul_(1 - args.weight_decay)

            gl.token_count += data.shape[0]

    gl.event_writer.close()
from subprocess import check_output
import util
import sys
import math
from util import N, timeit
from functools import partial
import multiprocessing
import pi_serial
import pi_multiprocessing
#import pi_mpi
import pi_openCL


def do_mpi(cpus):
    """Run ``pi_mpi.py`` under ``mpiexec`` with ``cpus`` ranks and return its output.

    Args:
        cpus: number of MPI processes to launch (converted with ``int``).

    Returns:
        The raw bytes printed by the child process with trailing whitespace
        (``"\\n"`` or ``"\\r\\n"``) removed.

    Raises:
        subprocess.CalledProcessError: if ``mpiexec`` exits with a non-zero status.
    """
    # make sure pi_mpi.py prints only its result to console
    # NOTE(review): this flag is set in the current process while pi_mpi.py runs
    # in a child process started by mpiexec -- confirm it actually reaches it.
    util.mpi_verbose = False
    # Pass the command as an argument list (no shell) so nothing is re-parsed
    # by a shell.
    output = check_output(["mpiexec", "-n", str(int(cpus)), "python", "./pi_mpi.py"])
    # Strip the trailing line terminator whatever its length; the previous
    # fixed [:-2] slice removed a real character when the output ended in "\n".
    return output.rstrip()


if __name__ == '__main__':
    print("N: " + "{:,}".format(N))
    print("Pi: " + str(math.pi))
    pi_serial.run()
    pi_multiprocessing.run()
    pi_openCL.run()
    # do mpi: time one run for every process count from 1 to the CPU count
    for cpus in range(multiprocessing.cpu_count()):
        timeit(partial(do_mpi, cpus+1), "MPI with " + str(cpus+1) + " CPUs")
def main():
    """Train a fully connected net on TinyMNIST while periodically logging curvature statistics.

    Parses command-line arguments into ``gl.args``, builds the model, optimizer
    and data loaders, then repeats ``args.stats_steps`` times:

    1. validate on the test and train evaluation loaders and log the metrics;
    2. unless ``args.skip_stats`` is set, run one forward pass and four backward
       passes (Hessian, Jacobian, and two passes of the loss gradient) with
       module hooks that accumulate per-layer moments, then log the derived
       per-layer scalars, histograms and (optionally) spectra;
    3. take ``args.train_steps`` SGD steps.

    The event writer in ``gl.event_writer`` is closed at the end.
    """
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False, help='For Saving the current Model')

    parser.add_argument('--wandb', type=int, default=0, help='log to weights and biases')
    parser.add_argument('--autograd_check', type=int, default=0, help='autograd correctness checks')
    parser.add_argument('--logdir', type=str, default='/tmp/runs/curv_train_tiny/run')

    parser.add_argument('--nonlin', type=int, default=1, help="whether to add ReLU nonlinearity between layers")
    parser.add_argument('--bias', type=int, default=1, help="whether to add bias between layers")

    parser.add_argument('--layer', type=int, default=-1, help="restrict updates to this layer")
    parser.add_argument('--data_width', type=int, default=28)
    parser.add_argument('--targets_width', type=int, default=28)
    parser.add_argument('--hess_samples', type=int, default=1, help='number of samples when sub-sampling outputs, 0 for exact hessian')
    parser.add_argument('--hess_kfac', type=int, default=0, help='whether to use KFAC approximation for hessian')
    parser.add_argument('--compute_rho', type=int, default=0, help='use expensive method to compute rho')
    parser.add_argument('--skip_stats', type=int, default=0, help='skip all stats collection')

    parser.add_argument('--dataset_size', type=int, default=60000)
    parser.add_argument('--train_steps', type=int, default=100, help="this many train steps between stat collection")
    parser.add_argument('--stats_steps', type=int, default=1000000, help="total number of curvature stats collections")

    parser.add_argument('--full_batch', type=int, default=0, help='do stats on the whole dataset')
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--weight_decay', type=float, default=0)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--dropout', type=int, default=0)
    parser.add_argument('--swa', type=int, default=0)
    parser.add_argument('--lmb', type=float, default=1e-3)

    parser.add_argument('--train_batch_size', type=int, default=64)
    parser.add_argument('--stats_batch_size', type=int, default=10000)
    parser.add_argument('--stats_num_batches', type=int, default=1)
    parser.add_argument('--run_name', type=str, default='noname')
    parser.add_argument('--launch_blocking', type=int, default=0)
    parser.add_argument('--sampled', type=int, default=0)
    parser.add_argument('--curv', type=str, default='kfac', help='decomposition to use for curvature estimates: zero_order, kfac, isserlis or full')
    parser.add_argument('--log_spectra', type=int, default=0)

    u.seed_random(1)
    gl.args = parser.parse_args()
    args = gl.args
    u.seed_random(1)

    gl.project_name = 'train_ciresan'
    u.setup_logdir_and_event_writer(args.run_name)
    print(f"Logging to {gl.logdir}")

    # layer widths: 784 inputs, five hidden layers, 10 outputs
    d = [784, 2500, 2000, 1500, 1000, 500, 10]

    model = u.SimpleFullyConnected2(d, nonlin=args.nonlin, bias=args.bias, dropout=args.dropout)
    model = model.to(gl.device)
    autograd_lib.register(model)

    assert args.dataset_size >= args.stats_batch_size
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, original_targets=True, dataset_size=args.dataset_size)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=True, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    assert not args.full_batch, "fixme: validation still uses stats_iter"
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=True, drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)
    else:
        stats_iter = None

    test_dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, train=False, original_targets=True, dataset_size=args.dataset_size)
    test_eval_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=False)
    train_eval_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=False)

    loss_fn = torch.nn.CrossEntropyLoss()
    autograd_lib.add_hooks(model)
    autograd_lib.disable_hooks()

    gl.token_count = 0
    last_outer = 0
    for step in range(args.stats_steps):
        epoch = gl.token_count // 60000
        lr = optimizer.param_groups[0]['lr']
        print('token_count', gl.token_count)
        if last_outer:
            u.log_scalars({"time/outer": 1000 * (time.perf_counter() - last_outer)})
            print(f'time: {time.perf_counter() - last_outer:.2f}')
        last_outer = time.perf_counter()

        with u.timeit("validate"):
            val_accuracy, val_loss = validate(model, test_eval_loader, f'test (epoch {epoch})')
            train_accuracy, train_loss = validate(model, train_eval_loader, f'train (epoch {epoch})')

        # save log
        metrics = {'epoch': epoch, 'val_accuracy': val_accuracy, 'val_loss': val_loss,
                   'train_loss': train_loss, 'train_accuracy': train_accuracy,
                   'lr': optimizer.param_groups[0]['lr'],
                   'momentum': optimizer.param_groups[0].get('momentum', 0)}
        u.log_scalars(metrics)

        if not args.skip_stats:
            # number of samples passed through
            n = args.stats_batch_size * args.stats_num_batches

            # per-layer accumulators, recreated on every stats step
            forward_stats = defaultdict(lambda: AttrDefault(float))
            hessians = defaultdict(lambda: AttrDefault(float))
            jacobians = defaultdict(lambda: AttrDefault(float))
            fishers = defaultdict(lambda: AttrDefault(float))  # empirical fisher/gradient
            quad_fishers = defaultdict(lambda: AttrDefault(float))  # gradient statistics that depend on fisher (4th order moments)
            train_regrets = defaultdict(list)
            test_regrets1 = defaultdict(list)
            test_regrets2 = defaultdict(list)
            train_regrets_opt = defaultdict(list)
            test_regrets_opt = defaultdict(list)
            cosines = defaultdict(list)
            dot_products = defaultdict(list)
            hessians_histograms = defaultdict(lambda: AttrDefault(u.MyList))
            jacobians_histograms = defaultdict(lambda: AttrDefault(u.MyList))
            fishers_histograms = defaultdict(lambda: AttrDefault(u.MyList))
            quad_fishers_histograms = defaultdict(lambda: AttrDefault(u.MyList))

            # which accumulator the backward hook writes into; switched between
            # the backward passes below
            current = None
            current_histograms = None

            for i in range(args.stats_num_batches):
                activations = {}
                backprops = {}

                def save_activations(layer, A, _):
                    activations[layer] = A
                    forward_stats[layer].AA += torch.einsum("ni,nj->ij", A, A)

                print('forward')
                with u.timeit("stats_forward"):
                    with autograd_lib.module_hook(save_activations):
                        data, targets = next(stats_iter)
                        output = model(data)
                        loss = loss_fn(output, targets) * len(output)

                def compute_stats(layer, _, B):
                    A = activations[layer]
                    # identity, not equality: `current` is rebound to one of the
                    # accumulator dicts, and == would compare their contents
                    if current is fishers:
                        backprops[layer] = B

                    # about 27ms per layer
                    with u.timeit('compute_stats'):
                        current[layer].BB += torch.einsum("ni,nj->ij", B, B)  # TODO(y): index consistency
                        current[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)
                        current[layer].BA += torch.einsum("ni,nj->ij", B, A)
                        current[layer].a += torch.einsum("ni->i", A)
                        current[layer].b += torch.einsum("nk->k", B)
                        current[layer].norm2 += ((A * A).sum(dim=1) * (B * B).sum(dim=1)).sum()

                    # compute curvatures in direction of all gradiennts
                    if current is fishers:
                        assert args.stats_num_batches == 1, "not tested on more than one stats step, currently reusing aggregated moments"
                        hess = hessians[layer]
                        jac = jacobians[layer]
                        # the activation-side product is the same for every
                        # curvature type, so compute it once
                        A_proj = A @ forward_stats[layer].AA / n
                        Bh, Ah = B @ hess.BB / n, A_proj
                        Bj, Aj = B @ jac.BB / n, A_proj
                        norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))
                        current[layer].min_norm2 = min(norms)
                        current[layer].median_norm2 = torch.median(norms)
                        current[layer].max_norm2 = max(norms)
                        norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
                        norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))
                        hess_ratio = skip_nans(norms2_hess / norms)
                        jac_ratio = skip_nans(norms2_jac / norms)

                        current[layer].norm += norms.sum()
                        current_histograms[layer].norms.extend(torch.sqrt(norms))
                        current[layer].curv_hess += hess_ratio.sum()
                        current_histograms[layer].curv_hess.extend(hess_ratio)
                        current[layer].curv_hess_max += hess_ratio.max()
                        current[layer].curv_hess_median += hess_ratio.median()
                        current_histograms[layer].curv_jac.extend(jac_ratio)
                        current[layer].curv_jac += jac_ratio.sum()
                        current[layer].curv_jac_max += jac_ratio.max()
                        current[layer].curv_jac_median += jac_ratio.median()

                        current[layer].a_sparsity += torch.sum(A <= 0).float() / A.numel()
                        current[layer].b_sparsity += torch.sum(B <= 0).float() / B.numel()

                        current[layer].mean_activation += torch.mean(A)
                        current[layer].mean_activation2 += torch.mean(A*A)
                        current[layer].mean_backprop = torch.mean(B)
                        current[layer].mean_backprop2 = torch.mean(B*B)

                        current[layer].norms_hess += torch.sqrt(norms2_hess).sum()
                        current_histograms[layer].norms_hess.extend(torch.sqrt(norms2_hess))
                        current[layer].norms_jac += norms2_jac.sum()
                        current_histograms[layer].norms_jac.extend(torch.sqrt(norms2_jac))

                        normalized_moments = copy.copy(hessians[layer])
                        normalized_moments.AA = forward_stats[layer].AA
                        normalized_moments = u.divide_attributes(normalized_moments, n)

                        train_regrets_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=0, m=normalized_moments, approx=args.curv)
                        test_regrets1_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=1, m=normalized_moments, approx=args.curv)
                        test_regrets2_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=2, m=normalized_moments, approx=args.curv)
                        test_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=2, m=normalized_moments, approx=args.curv)
                        train_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=0, m=normalized_moments, approx=args.curv)
                        cosines_ = autograd_lib.offset_cosines(A, B)
                        train_regrets[layer].extend(train_regrets_)
                        test_regrets1[layer].extend(test_regrets1_)
                        test_regrets2[layer].extend(test_regrets2_)
                        train_regrets_opt[layer].extend(train_regrets_opt_)
                        test_regrets_opt[layer].extend(test_regrets_opt_)
                        cosines[layer].extend(cosines_)
                        dot_products[layer].extend(autograd_lib.offset_dotprod(A, B))

                    # statistics of the form g.Sigma.g
                    elif current is quad_fishers:
                        hess = hessians[layer]
                        sigma = fishers[layer]
                        jac = jacobians[layer]
                        A_proj = A @ forward_stats[layer].AA / n
                        Bs, As = B @ sigma.BB / n, A_proj
                        Bh, Ah = B @ hess.BB / n, A_proj
                        Bj, Aj = B @ jac.BB / n, A_proj
                        norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))
                        norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
                        norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))
                        norms_sigma = ((As * A).sum(dim=1) * (Bs * B).sum(dim=1))
                        sigma_ratio = skip_nans(norms_sigma / norms)
                        hess_ratio = skip_nans(norms2_hess / norms)
                        lyap_hess = skip_nans(norms_sigma / norms2_hess)
                        lyap_jac = skip_nans(norms_sigma / norms2_jac)

                        current[layer].norm += norms.sum()  # TODO(y) remove, redundant with norm2 above
                        current[layer].curv_sigma += sigma_ratio.sum()
                        current[layer].curv_sigma_max = sigma_ratio.max()
                        current[layer].curv_sigma_median = sigma_ratio.median()
                        current[layer].curv_hess += hess_ratio.sum()
                        current[layer].curv_hess_max += hess_ratio.max()
                        current[layer].lyap_hess_mean += lyap_hess.mean()
                        current[layer].lyap_hess_max = max(lyap_hess)
                        current[layer].lyap_jac_mean += lyap_jac.mean()
                        current[layer].lyap_jac_max = max(lyap_jac)

                print('backward')
                with u.timeit("backprop_H"):
                    with autograd_lib.module_hook(compute_stats):
                        current = hessians
                        current_histograms = hessians_histograms
                        autograd_lib.backward_hessian(output, loss='CrossEntropy', sampled=args.sampled, retain_graph=True)  # 600 ms
                        current = jacobians
                        current_histograms = jacobians_histograms
                        autograd_lib.backward_jacobian(output, sampled=args.sampled, retain_graph=True)  # 600 ms
                        current = fishers
                        current_histograms = fishers_histograms
                        model.zero_grad()
                        loss.backward(retain_graph=True)  # 60 ms
                        current = quad_fishers
                        current_histograms = quad_fishers_histograms
                        model.zero_grad()
                        loss.backward()  # 60 ms

            print('summarize')
            for (i, layer) in enumerate(model.layers):
                stats_dict = {'hessian': hessians, 'jacobian': jacobians, 'fisher': fishers}

                # evaluate stats from
                # https://app.wandb.ai/yaroslavvb/train_ciresan/runs/425pu650?workspace=user-yaroslavvb
                for stats_name in stats_dict:
                    s = AttrDict()
                    stats = stats_dict[stats_name][layer]

                    for key in forward_stats[layer]:
                        # print(f'copying {key} in {stats_name}, {layer}')
                        # Warn when a forward statistic replaces a value that
                        # is already set. The original warning was a bare
                        # f-string expression inside `except:` and never
                        # printed.
                        try:
                            is_unset = bool(stats[key] == float())
                        except Exception:
                            # e.g. the truth value of a multi-element tensor
                            # comparison is ambiguous: a value is already there
                            is_unset = False
                        if not is_unset:
                            print(f"Trying to overwrite {key} in {stats_name}, {layer}")
                        stats[key] = forward_stats[layer][key]

                    diag: torch.Tensor = stats.diag / n

                    # jacobian:
                    # curv in direction of gradient goes down to roughly 0.3-1
                    # maximum curvature goes up to 1000-2000
                    #
                    # Hessian:
                    # max curv goes down to 1, in direction of gradient 0.0001

                    s.diag_l2 = torch.max(diag)  # 40 - 3000 smaller than kfac l2 for jac
                    s.diag_fro = torch.norm(diag)  # jacobian grows to 0.5-1.5, rest falls, layer-5 has phase transition, layer-4 also
                    s.diag_trace = diag.sum()  # jacobian grows 0-1000 (first), 0-150 (last). Almost same as kfac_trace (771 vs 810 kfac). Jacobian has up/down phase transition
                    s.diag_average = diag.mean()

                    # normalize for mean loss
                    BB = stats.BB / n
                    AA = stats.AA / n
                    # A_evals, _ = torch.symeig(AA)   # averaging 120ms per hit, 90 hits
                    # B_evals, _ = torch.symeig(BB)

                    # s.kfac_l2 = torch.max(A_evals) * torch.max(B_evals)  # 60x larger than diag_l2. layer0/hess has down/up phase transition. layer5/jacobian has up/down phase transition
                    s.kfac_trace = torch.trace(AA) * torch.trace(BB)  # 0/hess down/up tr, 5/jac sharp phase transition
                    s.kfac_fro = torch.norm(stats.AA) * torch.norm(stats.BB)  # 0/hess has down/up tr, 5/jac up/down transition
                    # s.kfac_erank = s.kfac_trace / s.kfac_l2  # first layer has 25, rest 15, all layers go down except last, last noisy
                    # s.kfac_erank_fro = s.kfac_trace / s.kfac_fro / max(stats.BA.shape)

                    s.diversity = (stats.norm2 / n) / u.norm_squared(stats.BA / n)  # gradient diversity. Goes up 3x. Bottom layer has most diversity. Jacobian diversity much less noisy than everythingelse

                    # discrepancy of KFAC based on exact values of diagonal approximation
                    # average difference normalized by average diagonal magnitude
                    diag_kfac = torch.einsum('ll,ii->li', BB, AA)
                    s.kfac_error = (torch.abs(diag_kfac - diag)).mean() / torch.mean(diag.abs())
                    u.log_scalars(u.nest_stats(f'layer-{i}/{stats_name}', s))

                # openai batch size stat
                s = AttrDict()
                hess = hessians[layer]
                jac = jacobians[layer]
                fish = fishers[layer]
                quad_fish = quad_fishers[layer]

                # the following check passes, but is expensive
                # if args.stats_num_batches == 1:
                #    u.check_close(fisher[layer].BA, layer.weight.grad)

                def trsum(A, B):
                    return (A * B).sum()  # computes tr(AB')

                grad = fishers[layer].BA / n
                s.grad_fro = torch.norm(grad)

                # get norms
                s.lyap_hess_max = quad_fish.lyap_hess_max
                # compute_stats accumulates `lyap_*_mean` (a per-batch mean), so
                # read those fields; the former `lyap_*_sum / n` read attributes
                # that are never written.
                s.lyap_hess_ave = quad_fish.lyap_hess_mean
                s.lyap_jac_max = quad_fish.lyap_jac_max
                s.lyap_jac_ave = quad_fish.lyap_jac_mean
                s.hess_trace = hess.diag.sum() / n
                s.jac_trace = jac.diag.sum() / n

                # Version 1 of Jain stochastic rates, use Hessian for curvature
                b = args.train_batch_size

                s.hess_curv = trsum((hess.BB / n) @ grad @ (hess.AA / n), grad) / trsum(grad, grad)
                s.jac_curv = trsum((jac.BB / n) @ grad @ (jac.AA / n), grad) / trsum(grad, grad)

                # compute gradient noise statistics
                # fish.BB has /n factor twice, hence don't need extra /n on fish.AA
                # after sampling, hess_noise,jac_noise became 100x smaller, but normalized is unaffected
                s.hess_noise = (trsum(hess.AA / n, fish.AA / n) * trsum(hess.BB / n, fish.BB / n))
                s.jac_noise = (trsum(jac.AA / n, fish.AA / n) * trsum(jac.BB / n, fish.BB / n))
                s.hess_noise_centered = s.hess_noise - trsum(hess.BB / n @ grad, grad @ hess.AA / n)
                s.jac_noise_centered = s.jac_noise - trsum(jac.BB / n @ grad, grad @ jac.AA / n)
                s.openai_gradient_noise = (fish.norms_hess / n) / trsum(hess.BB / n @ grad, grad @ hess.AA / n)

                s.mean_norm = torch.sqrt(fish.norm2) / n
                s.min_norm = torch.sqrt(fish.min_norm2)
                s.median_norm = torch.sqrt(fish.median_norm2)
                s.max_norm = torch.sqrt(fish.max_norm2)
                s.enorms = u.norm_squared(grad)
                s.a_sparsity = fish.a_sparsity
                s.b_sparsity = fish.b_sparsity
                s.mean_activation = fish.mean_activation
                s.msr_activation = torch.sqrt(fish.mean_activation2)
                s.mean_backprop = fish.mean_backprop
                s.msr_backprop = torch.sqrt(fish.mean_backprop2)

                s.norms_centered = fish.norm2 / n - u.norm_squared(grad)
                s.norms_hess = fish.norms_hess / n
                s.norms_jac = fish.norms_jac / n

                s.hess_curv_grad = fish.curv_hess / n  # phase transition, hits minimum loss in layer 1, then starts going up. Other layers take longer to reach minimum. Decreases with depth.
                s.hess_curv_grad_max = fish.curv_hess_max  # phase transition, hits minimum loss in layer 1, then starts going up. Other layers take longer to reach minimum. Decreases with depth.
                s.hess_curv_grad_median = fish.curv_hess_median  # phase transition, hits minimum loss in layer 1, then starts going up. Other layers take longer to reach minimum. Decreases with depth.
                s.sigma_curv_grad = quad_fish.curv_sigma / n
                s.sigma_curv_grad_max = quad_fish.curv_sigma_max
                s.sigma_curv_grad_median = quad_fish.curv_sigma_median
                s.band_bottou = 0.5 * lr * s.sigma_curv_grad / s.hess_curv_grad
                # NOTE(review): `curv_ratio` is never accumulated in
                # compute_stats, so this presumably reads the accumulator's
                # default value -- confirm the intended source.
                s.band_bottou_stoch = 0.5 * lr * quad_fish.curv_ratio / n
                s.band_yaida = 0.25 * lr * s.mean_norm**2
                s.band_yaida_centered = 0.25 * lr * s.norms_centered

                s.jac_curv_grad = fish.curv_jac / n  # this one has much lower variance than jac_curv. Reaches peak at 10k steps, also kfac error reaches peak there. Decreases with depth except for last layer.
                s.jac_curv_grad_max = fish.curv_jac_max  # this one has much lower variance than jac_curv. Reaches peak at 10k steps, also kfac error reaches peak there. Decreases with depth except for last layer.
                s.jac_curv_grad_median = fish.curv_jac_median  # this one has much lower variance than jac_curv. Reaches peak at 10k steps, also kfac error reaches peak there. Decreases with depth except for last layer.

                # OpenAI gradient noise statistics
                s.hess_noise_normalized = s.hess_noise_centered / (fish.norms_hess / n)
                s.jac_noise_normalized = s.jac_noise / (fish.norms_jac / n)

                train_regrets_, test_regrets1_, test_regrets2_, train_regrets_opt_, test_regrets_opt_, cosines_, dot_products_ = (
                    torch.stack(r[layer]) for r in (train_regrets, test_regrets1, test_regrets2,
                                                    train_regrets_opt, test_regrets_opt, cosines, dot_products))
                s.train_regret = train_regrets_.median()  # use median because outliers make it hard to see the trend
                s.test_regret1 = test_regrets1_.median()
                s.test_regret2 = test_regrets2_.median()
                s.test_regret_opt = test_regrets_opt_.median()
                s.train_regret_opt = train_regrets_opt_.median()
                s.mean_dot_product = torch.mean(dot_products_)
                s.median_dot_product = torch.median(dot_products_)

                s.median_cosine = cosines_.median()
                s.mean_cosine = cosines_.mean()

                # get learning rates
                L1 = s.hess_curv_grad / n
                L2 = s.jac_curv_grad / n
                diversity = (fish.norm2 / n) / u.norm_squared(grad)
                robust_diversity = (fish.norm2 / n) / fish.median_norm2
                dotprod_diversity = fish.median_norm2 / s.median_dot_product
                s.lr1 = 2 / (L1 * diversity)
                s.lr2 = 2 / (L2 * diversity)
                s.lr3 = 2 / (L2 * robust_diversity)
                s.lr4 = 2 / (L2 * dotprod_diversity)

                hess_A = u.symeig_pos_evals(hess.AA / n)
                hess_B = u.symeig_pos_evals(hess.BB / n)
                fish_A = u.symeig_pos_evals(fish.AA / n)
                fish_B = u.symeig_pos_evals(fish.BB / n)
                jac_A = u.symeig_pos_evals(jac.AA / n)
                jac_B = u.symeig_pos_evals(jac.BB / n)
                u.log_scalars({f'layer-{i}/hessA_erank': erank(hess_A)})
                u.log_scalars({f'layer-{i}/hessB_erank': erank(hess_B)})
                u.log_scalars({f'layer-{i}/fishA_erank': erank(fish_A)})
                u.log_scalars({f'layer-{i}/fishB_erank': erank(fish_B)})
                u.log_scalars({f'layer-{i}/jacA_erank': erank(jac_A)})
                u.log_scalars({f'layer-{i}/jacB_erank': erank(jac_B)})
                # each histogram uses the eigenvalues of its own curvature
                # matrix; the fisher and jacobian ones previously re-logged the
                # Hessian eigenvalues
                gl.event_writer.add_histogram(f'layer-{i}/hist_hess_eig', u.outer(hess_A, hess_B).flatten(), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_fish_eig', u.outer(fish_A, fish_B).flatten(), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_jac_eig', u.outer(jac_A, jac_B).flatten(), gl.get_global_step())

                s.hess_l2 = max(hess_A) * max(hess_B)
                s.jac_l2 = max(jac_A) * max(jac_B)
                s.fish_l2 = max(fish_A) * max(fish_B)
                s.jain1_sto = 1/(s.hess_trace + 2 * s.hess_l2)
                s.jain1_det = 1/s.hess_l2
                s.jain1_lr = (1 / b) * (1/s.jain1_sto) + (b - 1) / b * (1/s.jain1_det)
                s.jain1_lr = 2 / s.jain1_lr

                s.regret_ratio = (train_regrets_opt_ / test_regrets_opt_).median()  # ratio between train and test regret, large means overfitting
                u.log_scalars(u.nest_stats(f'layer-{i}', s))

                # compute stats that would let you bound rho
                if i == 0:  # only compute this once, for output layer
                    hhh = hessians[model.layers[-1]].BB / n
                    fff = fishers[model.layers[-1]].BB / n
                    d = fff.shape[0]
                    L = u.lyapunov_spectral(hhh, 2 * fff, cond=1e-8)
                    L_evals = u.symeig_pos_evals(L)
                    Lcheap = fff @ u.pinv(hhh, cond=1e-8)
                    Lcheap_evals = u.eig_real(Lcheap)

                    u.log_scalars({f'mismatch/rho': d/erank(L_evals)})
                    u.log_scalars({f'mismatch/rho_cheap': d/erank(Lcheap_evals)})
                    u.log_scalars({f'mismatch/diagonalizability': erank(L_evals)/erank(Lcheap_evals)})  # 1 means diagonalizable
                    u.log_spectrum(f'mismatch/sigma', u.symeig_pos_evals(fff), loglog=False)
                    u.log_spectrum(f'mismatch/hess', u.symeig_pos_evals(hhh), loglog=False)
                    u.log_spectrum(f'mismatch/lyapunov', L_evals, loglog=True)
                    u.log_spectrum(f'mismatch/lyapunov_cheap', Lcheap_evals, loglog=True)

                gl.event_writer.add_histogram(f'layer-{i}/hist_grad_norms', u.to_numpy(fishers_histograms[layer].norms.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_grad_norms_hess', u.to_numpy(fishers_histograms[layer].norms_hess.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_curv_jac', u.to_numpy(fishers_histograms[layer].curv_jac.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_curv_hess', u.to_numpy(fishers_histograms[layer].curv_hess.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_cosines', u.to_numpy(cosines[layer]), gl.get_global_step())

                if args.log_spectra:
                    with u.timeit('spectrum'):
                        # 2/alpha
                        # s.jain1_lr = (1 / b) * s.jain1_sto + (b - 1) / b * s.jain1_det
                        # s.jain1_lr = 1 / s.jain1_lr

                        # hess.diag_trace, jac.diag_trace

                        # Version 2 of Jain stochastic rates, use Jacobian squared for curvature
                        s.jain2_sto = s.lyap_jac_max * s.jac_trace / s.lyap_jac_ave
                        s.jain2_det = s.jac_l2
                        s.jain2_lr = (1 / b) * s.jain2_sto + (b - 1) / b * s.jain2_det
                        s.jain2_lr = 1 / s.jain2_lr

                        u.log_spectrum(f'layer-{i}/hess_A', hess_A)
                        u.log_spectrum(f'layer-{i}/hess_B', hess_B)
                        u.log_spectrum(f'layer-{i}/hess_AB', u.outer(hess_A, hess_B).flatten())
                        u.log_spectrum(f'layer-{i}/jac_A', jac_A)
                        u.log_spectrum(f'layer-{i}/jac_B', jac_B)
                        u.log_spectrum(f'layer-{i}/fish_A', fish_A)
                        u.log_spectrum(f'layer-{i}/fish_B', fish_B)

                        u.log_scalars({f'layer-{i}/trace_ratio': fish_B.sum()/hess_B.sum()})

                        L = torch.eig(u.lyapunov_spectral(hess.BB, 2*fish.BB, cond=1e-8))[0]
                        L = L[:, 0]  # extract real part
                        L = L.sort()[0]
                        L = torch.flip(L, [0])

                        L_cheap = torch.eig(fish.BB @ u.pinv(hess.BB, cond=1e-8))[0]
                        L_cheap = L_cheap[:, 0]  # extract real part
                        L_cheap = L_cheap.sort()[0]
                        L_cheap = torch.flip(L_cheap, [0])

                        d = len(hess_B)
                        u.log_spectrum(f'layer-{i}/Lyap', L)
                        u.log_spectrum(f'layer-{i}/Lyap_cheap', L_cheap)

                        u.log_scalars({f'layer-{i}/dims': d})
                        u.log_scalars({f'layer-{i}/L_erank': erank(L)})
                        u.log_scalars({f'layer-{i}/L_cheap_erank': erank(L_cheap)})

                        u.log_scalars({f'layer-{i}/rho': d/erank(L)})
                        u.log_scalars({f'layer-{i}/rho_cheap': d/erank(L_cheap)})

        # gradient steps between two stats collections
        model.train()
        with u.timeit('train'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                optimizer.step()
                if args.weight_decay:
                    # multiplicative weight decay applied after the optimizer step
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.data.mul_(1 - args.weight_decay)

                gl.token_count += data.shape[0]

    gl.event_writer.close()
def compute_stats(layer, _, B):
    """Module hook that accumulates per-layer moment statistics into ``current``.

    Meant to be installed with ``autograd_lib.module_hook`` during a backward
    pass. Every name other than the arguments (``activations``, ``backprops``,
    ``current``, ``current_histograms``, ``hessians``, ``jacobians``,
    ``fishers``, ``quad_fishers``, ``forward_stats``, ``n``, ``lr``, ``args``
    and the regret/cosine lists) comes from the enclosing scope.

    Args:
        layer: the module the hook fired for; used as the key into every
            accumulator.
        _: unused second hook argument.
        B: rank-2 value delivered by the hook, sharing its first axis with the
            saved activations ``A`` (both are contracted over it by
            ``einsum("ni,nj->ij", B, A)``).

    Three groups of statistics are written:

    * always: second moments ``BB``, ``diag``, ``BA``, sums ``a``/``b`` and
      ``norm2``;
    * when ``current is fishers``: per-example norms, curvature ratios,
      sparsity, regrets, cosines and dot products;
    * when ``current is quad_fishers``: statistics of the form g.Sigma.g.
    """
    A = activations[layer]
    # Identity, not equality: `current` is rebound to one of the accumulator
    # dicts, whereas == would compare the dictionaries' contents.
    if current is fishers:
        backprops[layer] = B

    # about 27ms per layer
    with u.timeit('compute_stats'):
        current[layer].BB += torch.einsum("ni,nj->ij", B, B)  # TODO(y): index consistency
        current[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)
        current[layer].BA += torch.einsum("ni,nj->ij", B, A)
        current[layer].a += torch.einsum("ni->i", A)
        current[layer].b += torch.einsum("nk->k", B)
        current[layer].norm2 += ((A * A).sum(dim=1) * (B * B).sum(dim=1)).sum()

    # compute curvatures in direction of all gradiennts
    if current is fishers:
        assert args.stats_num_batches == 1, "not tested on more than one stats step, currently reusing aggregated moments"
        hess = hessians[layer]
        jac = jacobians[layer]
        # The activation-side product does not depend on the curvature type,
        # so compute it once instead of once per type.
        A_proj = A @ forward_stats[layer].AA / n
        Bh, Ah = B @ hess.BB / n, A_proj
        Bj, Aj = B @ jac.BB / n, A_proj
        norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))
        current[layer].min_norm2 = min(norms)
        current[layer].median_norm2 = torch.median(norms)
        current[layer].max_norm2 = max(norms)
        norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
        norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))
        # ratios reused by several statistics below
        hess_ratio = skip_nans(norms2_hess / norms)
        jac_ratio = skip_nans(norms2_jac / norms)

        current[layer].norm += norms.sum()
        current_histograms[layer].norms.extend(torch.sqrt(norms))
        current[layer].curv_hess += hess_ratio.sum()
        current_histograms[layer].curv_hess.extend(hess_ratio)
        current[layer].curv_hess_max += hess_ratio.max()
        current[layer].curv_hess_median += hess_ratio.median()
        current_histograms[layer].curv_jac.extend(jac_ratio)
        current[layer].curv_jac += jac_ratio.sum()
        current[layer].curv_jac_max += jac_ratio.max()
        current[layer].curv_jac_median += jac_ratio.median()

        current[layer].a_sparsity += torch.sum(A <= 0).float() / A.numel()
        current[layer].b_sparsity += torch.sum(B <= 0).float() / B.numel()

        current[layer].mean_activation += torch.mean(A)
        current[layer].mean_activation2 += torch.mean(A*A)
        current[layer].mean_backprop = torch.mean(B)
        current[layer].mean_backprop2 = torch.mean(B*B)

        current[layer].norms_hess += torch.sqrt(norms2_hess).sum()
        current_histograms[layer].norms_hess.extend(torch.sqrt(norms2_hess))
        current[layer].norms_jac += norms2_jac.sum()
        current_histograms[layer].norms_jac.extend(torch.sqrt(norms2_jac))

        # moments normalized by the sample count, with the forward second
        # moment attached, as input to the regret computations
        normalized_moments = copy.copy(hessians[layer])
        normalized_moments.AA = forward_stats[layer].AA
        normalized_moments = u.divide_attributes(normalized_moments, n)

        train_regrets_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=0, m=normalized_moments, approx=args.curv)
        test_regrets1_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=1, m=normalized_moments, approx=args.curv)
        test_regrets2_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=2, m=normalized_moments, approx=args.curv)
        test_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=2, m=normalized_moments, approx=args.curv)
        train_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=0, m=normalized_moments, approx=args.curv)
        cosines_ = autograd_lib.offset_cosines(A, B)
        train_regrets[layer].extend(train_regrets_)
        test_regrets1[layer].extend(test_regrets1_)
        test_regrets2[layer].extend(test_regrets2_)
        train_regrets_opt[layer].extend(train_regrets_opt_)
        test_regrets_opt[layer].extend(test_regrets_opt_)
        cosines[layer].extend(cosines_)
        dot_products[layer].extend(autograd_lib.offset_dotprod(A, B))

    # statistics of the form g.Sigma.g
    elif current is quad_fishers:
        hess = hessians[layer]
        sigma = fishers[layer]
        jac = jacobians[layer]
        A_proj = A @ forward_stats[layer].AA / n
        Bs, As = B @ sigma.BB / n, A_proj
        Bh, Ah = B @ hess.BB / n, A_proj
        Bj, Aj = B @ jac.BB / n, A_proj
        norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))
        norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
        norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))
        norms_sigma = ((As * A).sum(dim=1) * (Bs * B).sum(dim=1))
        sigma_ratio = skip_nans(norms_sigma / norms)
        hess_ratio = skip_nans(norms2_hess / norms)
        lyap_hess = skip_nans(norms_sigma / norms2_hess)
        lyap_jac = skip_nans(norms_sigma / norms2_jac)

        current[layer].norm += norms.sum()  # TODO(y) remove, redundant with norm2 above
        current[layer].curv_sigma += sigma_ratio.sum()
        current[layer].curv_sigma_max = sigma_ratio.max()
        current[layer].curv_sigma_median = sigma_ratio.median()
        current[layer].curv_hess += hess_ratio.sum()
        current[layer].curv_hess_max += hess_ratio.max()
        current[layer].lyap_hess_mean += lyap_hess.mean()
        current[layer].lyap_hess_max = max(lyap_hess)
        current[layer].lyap_jac_mean += lyap_jac.mean()
        current[layer].lyap_jac_max = max(lyap_jac)
def main():
    """Train a tiny fully-connected net and periodically log curvature statistics.

    Every outer step computes a validation loss, captures per-layer activations
    and backprops through module hooks, builds per-layer gradient / empirical
    Fisher / Hessian matrices from them, logs derived statistics, and then runs
    ``args.train_steps`` optimisation steps (SGD, or a Newton-style update of
    layer 1 when ``args.method == 'newton'``).

    Reads configuration from the module-level ``args`` and writes events through
    ``gl.event_writer``.
    """
    # Pick the first unused "<logdir>NN" directory so runs never overwrite each other.
    attempt_count = 0
    while os.path.exists(f"{args.logdir}{attempt_count:02d}"):
        attempt_count += 1
    logdir = f"{args.logdir}{attempt_count:02d}"

    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print(f"Logging to {run_name}")
    u.seed_random(1)

    d1 = args.data_width**2
    d2 = 10
    d3 = args.targets_width**2
    o = d3  # output dimension
    n = args.stats_batch_size
    d = [d1, d2, d3]  # layer widths; layer i maps d[i] -> d[i + 1]
    model = u.SimpleFullyConnected(d, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['d1'] = d1
            wandb.config['d2'] = d2
            wandb.config['d3'] = d3
            wandb.config['n'] = n
    except Exception as e:
        # wandb is optional; a failure here must not abort the run.
        print(f"wandb crash with {e}")

    optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)

    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          dataset_size=args.dataset_size)
    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=False,
        drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.stats_batch_size,
        shuffle=False,
        drop_last=True)
    stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width,
                               dataset_size=args.dataset_size,
                               train=False)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.stats_batch_size,
                                              shuffle=True,
                                              drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    # Flags read by the hooks below; toggled from the main loop to control
    # which forward/backward passes get recorded.
    skip_forward_hooks = False
    skip_backward_hooks = False

    def capture_activations(module: nn.Module, input: List[torch.Tensor],
                            output: torch.Tensor):
        """Forward hook: stash the layer's input and output on the module."""
        if skip_forward_hooks:
            return
        assert not hasattr(
            module, 'activations'
        ), "Seeing results of previous autograd, call util.zero_grad to clear"
        assert len(input) == 1, "this was tested for single input layers only"
        setattr(module, "activations", input[0].detach())
        setattr(module, "output", output.detach())

    def capture_backprops(module: nn.Module, _input, output):
        """Backward hook: append the output gradient to ``module.backprops``."""
        if skip_backward_hooks:
            return
        assert len(output) == 1, "this works for single variable layers only"
        if gl.backward_idx == 0:
            assert not hasattr(
                module, 'backprops'
            ), "Seeing results of previous autograd, call util.zero_grad to clear"
            setattr(module, 'backprops', [])
        # backward_idx is the position this backward pass must occupy in the list
        assert gl.backward_idx == len(module.backprops)
        module.backprops.append(output[0])

    def save_grad(param: nn.Parameter) -> Callable[[torch.Tensor], None]:
        """Hook to save gradient into 'param.saved_grad', so it can be accessed after model.zero_grad(). Only stores gradient
        if the value has not been set, call util.zero_grad to clear it."""

        def save_grad_fn(grad):
            if not hasattr(param, 'saved_grad'):
                setattr(param, 'saved_grad', grad)

        return save_grad_fn

    for layer in model.layers:
        layer.register_forward_hook(capture_activations)
        layer.register_backward_hook(capture_backprops)
        layer.weight.register_hook(save_grad(layer.weight))

    def loss_fn(data, targets):
        """Half squared error, averaged over the batch."""
        err = data - targets.view(-1, data.shape[1])
        assert len(data) == len(targets)
        return torch.sum(err * err) / 2 / len(data)

    gl.token_count = 0
    last_outer = 0
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        # compute validation loss
        skip_forward_hooks = True
        skip_backward_hooks = True
        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            print("val_loss", val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        data, targets = next(stats_iter)
        skip_forward_hooks = False
        skip_backward_hooks = False

        # get gradient values
        with u.timeit("backprop_g"):
            gl.backward_idx = 0
            u.zero_grad(model)
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)

        # get Hessian values
        skip_forward_hooks = True
        id_mat = torch.eye(o).to(gl.device)

        u.log_scalar(loss=loss.item())

        with u.timeit("backprop_H"):
            # optionally use randomized low-rank approximation of Hessian
            hess_rank = args.hess_samples if args.hess_samples else o

            for out_idx in range(hess_rank):
                model.zero_grad()
                # backprop to get section of batch output jacobian for output at position out_idx
                output = model(
                    data
                )  # opt: using autograd.grad means I don't have to zero_grad
                if args.hess_samples:
                    # random +/-1 probe vectors
                    bval = torch.LongTensor(n, o).to(gl.device).random_(
                        0, 2) * 2 - 1
                    bval = bval.float()
                else:
                    ei = id_mat[out_idx]
                    bval = torch.stack([ei] * n)
                gl.backward_idx = out_idx + 1
                output.backward(bval)
            skip_backward_hooks = True

        for (i, layer) in enumerate(model.layers):
            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch, while we need per-example loss
            B_t = layer.backprops[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]),
                              layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma) / s.sigma_l2

            #############################
            # Hessian stats
            #############################
            A_t = layer.activations
            Bh_t = [
                layer.backprops[out_idx + 1] for out_idx in range(hess_rank)
            ]
            Amat_t = torch.cat([A_t] * hess_rank, dim=0)
            Bmat_t = torch.cat(Bh_t, dim=0)

            assert Amat_t.shape == (n * hess_rank, d[i])
            assert Bmat_t.shape == (n * hess_rank, d[i + 1])

            lambda_regularizer = args.lmb * torch.eye(d[i] * d[i + 1]).to(
                gl.device)
            with u.timeit(f"khatri_H-{i}"):
                Jb = u.khatri_rao_t(
                    Bmat_t, Amat_t)  # batch Jacobian, in row-vec format

            with u.timeit(f"H-{i}"):
                H = Jb.t() @ Jb / n

            with u.timeit(f"invH-{i}"):
                # torch.cholesky_inverse takes the Cholesky *factor*, not the
                # matrix itself, so factorise the regularised Hessian first.
                # H = Jb^T Jb / n is PSD, so H + lambda*I is positive definite
                # whenever args.lmb > 0.
                invH = torch.cholesky_inverse(
                    torch.cholesky(H + lambda_regularizer))

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.jacobian_fro = Jb.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1],
                                                d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            # u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) -
                                          0.5 * eps**2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() /
                                          (dd.flatten().norm()**2))

            with u.timeit("pinvH"):
                pinvH = u.pinv(H)

            with u.timeit(f'curv-{i}'):
                s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre',
                        pinvH)  # save Newton preconditioner
                # 999 is a sentinel for "zero curvature along the gradient"
                s.step_openai = 1 / s.grad_curv if s.grad_curv else 999

                s.step_max = 2 / u.sym_l2_norm(H)
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm(
                )  # frobenius norm of Newton update
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_svd(H, sigma)
                if u.has_nan(
                        p_sigma) and args.compute_rho:  # use expensive method
                    H0 = H.cpu().detach().numpy()
                    sigma0 = sigma.cpu().detach().numpy()
                    p_sigma = scipy.linalg.solve_lyapunov(H0, sigma0)
                    p_sigma = torch.tensor(p_sigma).to(gl.device)

                if u.has_nan(p_sigma):
                    # both solvers failed; fall back to neutral values
                    s.psigma_erank = H.shape[0]
                    s.rho = 1
                else:
                    s.psigma_erank = u.sym_erank(p_sigma)
                    s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                print('openai batch', s.batch_openai)
                s.diversity = torch.norm(G, "fro")**2 / torch.norm(g)**2

                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        last_inner = 0
        for i in range(args.train_steps):
            if last_inner:
                u.log_scalars(
                    {"time/inner": 1000 * (time.perf_counter() - last_inner)})
            last_inner = time.perf_counter()

            optimizer.zero_grad()
            data, targets = next(train_iter)
            model.zero_grad()
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward()

            u.log_scalar(train_loss=loss.item())

            if args.method != 'newton':
                optimizer.step()
            else:
                for (layer_idx, layer) in enumerate(model.layers):
                    param: torch.nn.Parameter = layer.weight
                    param_data: torch.Tensor = param.data
                    # plain gradient step for every layer
                    param_data.copy_(param_data - 0.1 * param.grad)
                    if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                        continue
                    u.nan_check(layer.weight.pre)
                    u.nan_check(param.grad.flatten())
                    u.nan_check(
                        u.v2r(param.grad.flatten()) @ layer.weight.pre)
                    # additional step using the saved preconditioner (pinv of H)
                    param_new_flat = u.v2r(param_data.flatten()) - u.v2r(
                        param.grad.flatten()) @ layer.weight.pre
                    u.nan_check(param_new_flat)
                    param_data.copy_(param_new_flat.reshape(param_data.shape))

            gl.token_count += data.shape[0]

    gl.event_writer.close()
def main(opts, cluster_spec, cfg):
    """Run one task of the distributed training job.

    A ``ps`` task only joins the server (and is expected to block there).
    A ``worker`` task builds the input pipeline and loss graph, opens a
    monitored training session and then loops forever, pushing the stats of
    every step onto a queue consumed by a ``TrainPostThread``.
    """
    me = '%s:%d' % (opts.job_name, opts.task_index)

    def log_failure(template, *details):
        # Call only from inside an ``except`` clause: format_exc() reports
        # the exception currently being handled.
        trace = traceback.format_exc()
        util.log(kind='error',
                 author='train',
                 msg=template % (details + (trace, )))

    util.log(author=me,
             msg='starting @ %s' % util.get_hostname(expensive=True))
    cluster = tf.train.ClusterSpec(cluster_spec)
    server = tf.train.Server(cluster,
                             job_name=opts.job_name,
                             task_index=opts.task_index)

    if opts.job_name == "ps":
        util.log(author=me, msg='joining server')
        server.join()
        raise Exception("Expecting server.join() to block forever")

    assert (opts.job_name == "worker")
    is_chief = (opts.task_index == 0)

    # Per-step stats are handed to a background thread through this queue.
    stats_queue = Queue()
    post_thread = TrainPostThread(cfg, stats_queue)
    post_thread.start()

    with tf.device("/job:ps/task:0"):
        params = NeuroSATParams(cfg=cfg)

    worker_placement = tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % opts.task_index,
        cluster=cluster)
    with tf.device(worker_placement):
        tfr_paths = [
            os.path.join(util.gd_tfr_dir(gd_id=cfg['gd_id']), entry)
            for entry in os.listdir(util.gd_tfr_dir(gd_id=cfg['gd_id']))
        ]
        records = tf.data.TFRecordDataset(
            filenames=tfr_paths,
            compression_type="GZIP",
            num_parallel_reads=cfg['n_parallel_reads'])
        # TODO(dselsam): don't hardcode the number of shards
        # idea: extend cluster_spec to map (job, task) -> (n_shards, shard_idx)
        records = records.shard(num_shards=4, index=opts.task_index % 4)
        records = records.map(example_to_tftd,
                              num_parallel_calls=cfg['n_parallel_calls'])
        records = records.filter(
            lambda ex: 2 * ex.n_vars + ex.n_clauses < cfg['max_n_nodes'])
        records = records.repeat()
        records = records.prefetch(cfg['n_prefetch'])

        tftd = records.make_one_shot_iterator().get_next()

        net_args = NeuroSATArgs(n_vars=tftd.n_vars,
                                n_clauses=tftd.n_clauses,
                                CL_idxs=tftd.CL_idxs)
        guesses = apply_neurosat(cfg=cfg, params=params, args=net_args)

        # Targets are the core masks normalised to sum to one.
        var_targets = tf.cast(tftd.core_var_mask, tf.float32)
        var_targets = var_targets / tf.reduce_sum(var_targets)

        clause_targets = tf.cast(tftd.core_clause_mask, tf.float32)
        clause_targets = clause_targets / tf.reduce_sum(clause_targets)

        cv_loss = cfg['cv_loss_scale'] * tfutil.kldiv(
            logits=guesses.pi_core_var_logits, labels=var_targets)
        cc_loss = cfg['cc_loss_scale'] * tfutil.kldiv(
            logits=guesses.pi_core_clause_logits, labels=clause_targets)
        l2_loss = cfg['l2_loss_scale'] * tfutil.build_l2_loss()
        loss = cv_loss + cc_loss + l2_loss

        stats = Stats(dp_id=tftd.dp_id,
                      cv_loss=cv_loss,
                      cc_loss=cc_loss,
                      l2_loss=l2_loss)

        global_step = tf.train.get_or_create_global_step()
        learning_rate = tfutil.build_learning_rate(cfg, global_step)

        def train_branch():
            return tfutil.build_apply_gradients(cfg, loss, learning_rate,
                                                global_step)

        # Gradients are only applied for examples flagged as training data.
        apply_grads = tf.cond(tftd.is_train, train_branch, lambda: True)

    util.log(author=me,
             msg='creating session (train_id=%d)...' % cfg['train_id'])
    session_ctx = tf.train.MonitoredTrainingSession(
        master=server.target,
        is_chief=is_chief,
        checkpoint_dir=util.checkpoint_dir(train_id=cfg['train_id']))
    with session_ctx as mon_sess:
        util.log(author=me, msg='starting session loop')
        step = 0
        while True:
            try:
                (_, stats_v), n_secs = util.timeit(mon_sess.run,
                                                   [apply_grads, stats])
                row = tuple(map(util.de_numpify, stats_v))
                stats_queue.put(row + (cfg['train_id'], n_secs))
                step += 1
            except tf.errors.ResourceExhaustedError as e:
                log_failure("RESOURCE_EXHAUSTED\n%s\n%s", str(e))
                util.db_insert(table='tune_ooms', train_id=cfg['train_id'])
            except tf.errors.OpError as e:
                log_failure("OP_ERROR\n%s\n%s", str(e))
            except Exception as e:
                log_failure("EXCEPTION\n%s\n%s", str(e))
            except:
                log_failure("UNKNOWN\n%s")
def main():
    """Train LeNet5 on MNIST with a Kronecker-factored PSGD preconditioner.

    For every mini-batch: compute the loss gradient, a Hessian-vector product
    against a random probe, update the per-parameter preconditioner pairs in
    ``Qs``, then take a preconditioned (norm-clipped) gradient step in place
    on ``net.W``. After each epoch the test error rate is logged and the step
    size is decayed. With ``args.test`` set, the run stops after batch 100 and
    asserts on step time and training loss.

    NOTE(review): the source this was recovered from had lost its indentation;
    nesting below was reconstructed from the statements' meaning — confirm
    against the upstream file.
    """
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("using device ", device)
    torch.manual_seed(args.seed)

    u.set_runs_directory('runs3')
    logger = u.TensorboardLogger(args.run)
    batch_size = 64
    shuffle = True
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '/tmp/data',
        train=True,
        download=True,
        transform=transforms.Compose([transforms.ToTensor()])),
                                               batch_size=batch_size,
                                               shuffle=shuffle,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '/tmp/data',
        train=False,
        transform=transforms.Compose([transforms.ToTensor()])),
                                              batch_size=1000,
                                              shuffle=shuffle,
                                              **kwargs)
    """input image size for the original LeNet5 is 32x32, here is 28x28"""

    # W1 = 0.1 * torch.randn(1 * 5 * 5 + 1, 6)

    net = LeNet5().to(device)

    def train_loss(data, target):
        # negative log-likelihood plus an L2 penalty on every weight in net.W
        y = net(data)
        y = F.log_softmax(y, dim=1)
        loss = F.nll_loss(y, target)
        for w in net.W:
            loss += 0.0002 * torch.sum(w * w)
        return loss

    def test_loss():
        # fraction of misclassified examples over the whole test loader
        num_errs = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                y = net(data)
                _, pred = torch.max(y, dim=1)
                num_errs += torch.sum(pred != target)
        return num_errs.item() / len(test_loader.dataset)

    # One [left, right] pair of identity-initialised factors per weight in net.W
    Qs = [[torch.eye(w.shape[0]), torch.eye(w.shape[1])] for w in net.W]
    for i in range(len(Qs)):
        for j in range(len(Qs[i])):
            Qs[i][j] = Qs[i][j].to(device)

    step_size = 0.1  # tried 0.15, diverges
    grad_norm_clip_thr = 1e10
    TrainLoss, TestLoss = [], []
    example_count = 0
    step_time_ms = 0

    for epoch in range(10):
        for batch_idx, (data, target) in enumerate(train_loader):
            step_start = time.perf_counter()
            data, target = data.to(device), target.to(device)

            loss = train_loss(data, target)

            with u.timeit('grad'):
                # create_graph=True so the gradients can be differentiated again below
                grads = autograd.grad(loss, net.W, create_graph=True)
            TrainLoss.append(loss.item())
            logger.set_step(example_count)
            logger('loss/train', TrainLoss[-1])
            if batch_idx % 10 == 0:
                print(
                    f'Epoch: {epoch}; batch: {batch_idx}; train loss: {TrainLoss[-1]:.2f}, step time: {step_time_ms:.0f}'
                )

            with u.timeit('Hv'):
                #        noise.normal_()
                # torch.manual_seed(args.seed)
                v = [torch.randn(w.shape).to(device) for w in net.W]
                # v = grads
                Hv = autograd.grad(grads, net.W, v)

            if args.verbose:
                print("v", v[0].mean())
                print("data", data.mean())
                print("Hv", Hv[0].mean())

            n = len(net.W)
            with torch.no_grad():
                with u.timeit('P_update'):
                    # num_updates is not defined in this function; presumably a
                    # module-level setting — TODO confirm
                    for i in range(num_updates):
                        psteps = []
                        for j in range(n):
                            q = Qs[j]
                            dw = v[j]
                            dg = Hv[j]
                            Qs[j][0], Qs[j][
                                1], pstep = psgd.update_precond_kron_with_step(
                                    q[0], q[1], dw, dg)
                            psteps.append(pstep)

                        #          print(np.array(psteps).mean())
                        # NOTE(review): placed inside the num_updates loop because
                        # psteps is only bound there — confirm
                        logger('p_residual', np.array(psteps).mean())

                with u.timeit('g_update'):
                    pre_grads = [
                        psgd.precond_grad_kron(q[0], q[1], g)
                        for (q, g) in zip(Qs, grads)
                    ]
                    grad_norm = torch.sqrt(
                        sum([torch.sum(g * g) for g in pre_grads]))

                with u.timeit('gradstep'):
                    # scale the step down only when the norm exceeds the clip threshold
                    step_adjust = min(
                        grad_norm_clip_thr / (grad_norm + 1.2e-38), 1.0)
                    for i in range(len(net.W)):
                        net.W[i] -= step_adjust * step_size * pre_grads[i]

                total_step = step_adjust * step_size
                logger('step/adjust', step_adjust)
                logger('step/size', step_size)
                logger('step/total', total_step)
                logger('grad_norm', grad_norm)

            if args.verbose:
                print(data.mean())
                # NOTE(review): drops into the interactive debugger on every
                # batch in verbose mode
                import pdb
                pdb.set_trace()
            # NOTE(review): nesting of this check relative to the verbose block
            # is inferred — confirm
            if args.early_stop:
                sys.exit()

            example_count += batch_size
            step_time_ms = 1000 * (time.perf_counter() - step_start)
            logger('time/step', step_time_ms)

            if args.test and batch_idx >= 100:
                break
        # second check leaves the epoch loop as well when running as a test
        if args.test and batch_idx >= 100:
            break

        test_loss0 = test_loss()
        TestLoss.append(test_loss0)
        logger('loss/test', test_loss0)
        step_size = (0.1**0.1) * step_size
        print('Epoch: {}; best test loss: {}'.format(epoch, min(TestLoss)))

    if args.test:
        step_times = logger.d['time/step']
        assert step_times[-1] < 30, step_times  # should be around 20ms
        losses = logger.d['loss/train']
        assert losses[0] > 2  # around 2.3887393474578857
        assert losses[-1] < 0.5, losses
        print("Test passed")
return pi #========== do a benchmark run ============== def run(): for cpus in range (1, multiprocessing.cpu_count()+1): timeit(partial(calculate_pi, cpus, N), "multiprocessing with " + str(cpus) + " CPUs") #========== Main ========== if __name__=='__main__': #Define n and number of CPUs pi = 0 cpus = multiprocessing.cpu_count() print("CPU count: " + str(cpus)) print("n = " + str(N)) print("--------------------------") print("--------------------------") #Estimate Pi for i in range(1, cpus+1): print("Using " + str(i) + " CPU(s)") start = timer() #calc = calculate_pi(i+1, N) pi += timeit(partial(calculate_pi, i, N), "multiprocessing") end = timer() print("Time: " + str(end - start) + "s") print("--------------------------") print("--------------------------") print("Pi: " + str(pi/cpus)) print("Error: " + str(abs(pi/cpus - math.pi)))
for _ in tqdm(range(num_games // 2)): game = Reversi() mcts = MCTS(game) # mcts.run(20) for step in range(70): if step % 2 == 0: move = alphabeta_move(game, depth) else: mcts.run(iterations) move = mcts.get_move() # mcts.make_move(move) game.move(move) mcts = MCTS(game) if game.terminal(): winner = game.winner() if winner == 0: mcts_wins_as_second += 1 elif winner == 0.5: draws += 1 break print('winrate: {}+{}/{}, draws: {}'.format(mcts_wins_as_first, mcts_wins_as_second, num_games, draws)) if __name__ == '__main__': timeit('START') play_against_random(40, 200) # play_against_alphabeta(20, 400, 3) timeit('SHOW')
def run():
    """Time ``calculate_pi`` once for every worker count from 1 to the CPU count."""
    cpu_total = multiprocessing.cpu_count()
    for cpus in range(1, cpu_total + 1):
        job = partial(calculate_pi, cpus, N)
        label = "multiprocessing with " + str(cpus) + " CPUs"
        timeit(job, label)
def stats_runner_run():
    """Loop forever: advance the model's batch and refresh stats, timing each round.

    ``self`` is not a parameter; it is taken from the enclosing scope.
    """
    while True:
        with u.timeit("kfac_update"):
            self.model.advance_batch()
            self.update_stats()
def main():
    """Train a tiny fully-connected net and log per-layer curvature statistics.

    Gradient and Hessian quantities are captured through ``autograd_lib`` hooks
    on every outer step; derived statistics are logged per layer, followed by
    ``args.train_steps`` optimisation steps (Adam, or a Newton-style update of
    layer 1 when ``args.method == 'newton'``).

    Several ``args`` fields are overwritten below with small debugging values,
    so the command-line values for those fields are ignored.
    """
    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print(f"Logging to {run_name}")

    d1 = args.data_width ** 2
    assert args.data_width == args.targets_width
    o = d1
    n = args.stats_batch_size
    d = [d1, 30, 30, 30, 20, 30, 30, 30, d1]

    # small values for debugging
    # loss_type = 'LeastSquares'
    loss_type = 'CrossEntropy'

    args.wandb = 0
    args.stats_steps = 10
    args.train_steps = 10
    args.stats_batch_size = 10
    args.data_width = 2
    args.targets_width = 2
    args.nonlin = False
    d1 = args.data_width ** 2
    d2 = 2
    d3 = args.targets_width ** 2

    if loss_type == 'CrossEntropy':
        d3 = 10
    o = d3
    n = args.stats_batch_size
    d = [d1, d2, d3]  # layer widths; layer i maps d[i] -> d[i + 1]
    dsize = max(args.train_batch_size, args.stats_batch_size)+1

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['n'] = n
    except Exception as e:
        # wandb is optional; a failure here must not abort the run.
        print(f"wandb crash with {e}")

    #optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, dataset_size=dsize, original_targets=True)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_iter = None
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, train=False, dataset_size=dsize, original_targets=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    elif loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.token_count = 0
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars({"time/outer": 1000*(time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()   # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)

        for (i, layer) in enumerate(model.layers):

            # input/output layers are unreasonably expensive if not using Kronecker factoring
            if d[i]>50 or d[i+1]>50:
                print(f'layer {i} is too big ({d[i],d[i+1]}), skipping stats')
                continue

            if args.skip_stats:
                continue

            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch, while we need per-example loss
            B_t = layer.backprops_list[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            u.check_equal(G.reshape(layer.weight.grad1.shape), layer.weight.grad1)

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]), layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()  # proportion of activations that are zero
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma)/s.sigma_l2

            lambda_regularizer = args.lmb * torch.eye(d[i + 1]*d[i]).to(gl.device)
            H = layer.weight.hess

            with u.timeit(f"invH-{i}"):
                # torch.cholesky_inverse takes the Cholesky *factor*, not the
                # matrix itself, so factorise the regularised Hessian first.
                # Assumes H + lambda*I is positive definite, as the original
                # call already did — TODO confirm for every hess_type.
                invH = torch.cholesky_inverse(torch.cholesky(H+lambda_regularizer))

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1], d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            #  u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) - 0.5 * eps ** 2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() / (dd.flatten().norm() ** 2))

            with u.timeit(f"pinvH-{i}"):
                pinvH = u.pinv(H)

            with u.timeit(f'curv-{i}'):
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre', pinvH)  # save Newton preconditioner
                # 999 is a sentinel for "zero curvature along the gradient"
                s.step_openai = s.grad_fro**2 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / s.H_l2
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm()  # frobenius norm of Newton update
                s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)   # replace with "quadratic_form"
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_svd(H, sigma)
                if u.has_nan(p_sigma) and args.compute_rho:  # use expensive method
                    print('using expensive method')
                    # NOTE(review): interactive breakpoint left in; this blocks a
                    # non-interactive run whenever the SVD solver produces NaNs.
                    import pdb; pdb.set_trace()
                    H0, sigma0 = u.to_numpys(H, sigma)
                    p_sigma = scipy.linalg.solve_lyapunov(H0, sigma0)
                    p_sigma = torch.tensor(p_sigma).to(gl.device)

                if u.has_nan(p_sigma):
                    # import pdb; pdb.set_trace()
                    # both solvers failed; fall back to neutral values
                    s.psigma_erank = H.shape[0]
                    s.rho = 1
                else:
                    s.psigma_erank = u.sym_erank(p_sigma)
                    s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                s.diversity = torch.norm(G, "fro") ** 2 / torch.norm(g) ** 2 / n

                # Faster approaches for noise variance computation
                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                #            u.log_scalar(train_loss=loss.item())

                if args.method != 'newton':
                    optimizer.step()
                    if args.weight_decay:
                        # decoupled weight decay applied directly to the parameters
                        for group in optimizer.param_groups:
                            for param in group['params']:
                                param.data.mul_(1-args.weight_decay)
                else:
                    for (layer_idx, layer) in enumerate(model.layers):
                        param: torch.nn.Parameter = layer.weight
                        param_data: torch.Tensor = param.data
                        # plain gradient step for every layer
                        param_data.copy_(param_data - 0.1 * param.grad)
                        if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                            continue
                        u.nan_check(layer.weight.pre)
                        u.nan_check(param.grad.flatten())
                        u.nan_check(u.v2r(param.grad.flatten()) @ layer.weight.pre)
                        # additional step using the saved preconditioner (pinv of H)
                        param_new_flat = u.v2r(param_data.flatten()) - u.v2r(param.grad.flatten()) @ layer.weight.pre
                        u.nan_check(param_new_flat)
                        param_data.copy_(param_new_flat.reshape(param_data.shape))

                gl.token_count += data.shape[0]

    gl.event_writer.close()
def write_lock(self):
    """Generator that yields once while holding ``self.lock``.

    The whole acquire/hold/release span is timed under the "write_lock" label.
    """
    with u.timeit("write_lock"), self.lock:
        yield
def main():
    """Train the model with Adam on KFAC-corrected gradients and record losses.

    Sets up a TF session, the main model and a separate KFAC model, builds the
    training op from gradients passed through ``kfac.correct``, then runs
    ``args.num_steps`` steps while logging loss to the tensorboard logger and
    a CSV file. Depending on ``args.mode`` the recorded losses are dumped
    ('record') or compared against stored targets ('test').

    NOTE(review): the source this was recovered from had lost its indentation;
    nesting below was reconstructed from the statements' meaning — confirm
    against the upstream file.
    """
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):
        gpu_options = tf.GPUOptions(allow_growth=False)
        sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
        u.register_default_session(sess)   # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()    # resets optimization variables (not model variables)

    # any mode other than 'run' uses a fixed learning rate and ignores args.lr
    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    # capture the variables Adam creates so only those get initialised below
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0  # last validation loss; stays 0 until the first validation step

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv'%(args.run, args.lr, args.Lambda)
    writer = u.BufferedWriter(outfn, 60)   # get rid?

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):

        if args.validate_every_n and step%args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time()-start_time
        print("%d sec, step %d, loss %.2f, vloss %.2f" %(elapsed, step, loss0, vloss0))
        writer.write('%d, %f, %f, %f\n'%(step, elapsed, loss0, vloss0))

        # in synchronous mode the KFAC stats are refreshed inline on every step
        if args.method=='kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            grad.update()
            # NOTE(review): only grad_new.update() is placed under the read
            # lock here; the original extent of the lock is inferred — confirm
            with kfac.read_lock():
                grad_new.update()
            train_op.run()
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)

    elif args.mode == 'test':
        targets = np.loadtxt('data/'+release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
def run():
    """Time the serial pi computation under the label "serial"."""
    label = "serial"
    timeit(pi_serial, label)
def run_nn(model, loaders, model_name, prt_model=False, case=None):
    """Train and evaluate ``model``, then append a summary to a per-case text file.

    ``loaders`` is unpacked into three loaders: the first two are passed to
    ``train`` and the third to ``validate``. The summary (case, time, accuracy,
    F1) is appended to ``case<case>/<model_name>.txt``.
    """
    print('===' + model_name + '===<start>')

    trainable_count = sum(
        weight.numel() for weight in model.parameters() if weight.requires_grad)
    print('# of para: {}'.format(trainable_count))

    train_loader, val_loader, test_loader = loaders
    runner = executer(model, model_name, arg, prt_model)
    runner.train(train_loader, val_loader, case)
    runner.validate(test_loader)
    print('Accuracy: {}'.format(runner.hist['val_acc']))

    report_path = "case" + str(case) + '/' + model_name + ".txt"
    with open(report_path, "a") as report:
        print('Case:', str(case), file=report)
        print('Time:', str(timeit()), file=report)
        print('Accuracy: {}'.format(runner.hist['val_acc']), file=report)
        print('F1-Score: {}\n'.format(runner.f1), file=report)


##Experiments
#------
''' Case 0: Original setting on datasets(rt) '''
#x = rt.x
#lbl = rt.y
#x_encoded = ohe().fit_transform_pretrained(x, w2v.getDict())
#loaders = handler().split_tvt((x_encoded, lbl))
#run_nn(models.Transformer(2, 2, d, 1, 1, 2, 256), loaders, 'Transformer.pt', model, 0)
def main2(cnt):
    """Benchmark pickling the module-level object ``v`` to and from ``ESV/pickle``.

    Runs ``cPickle.dump`` and then ``cPickle.load`` through ``timeit``, passing
    ``times=cnt`` to each call.

    Both file handles are managed by ``with`` blocks: the dump file is flushed
    and closed before it is reopened for loading, and neither handle is leaked.
    """
    with open("ESV/pickle", "w") as out_file:
        timeit(cPickle.dump, v, out_file, times=cnt)
    with open("ESV/pickle") as in_file:
        timeit(cPickle.load, in_file, times=cnt)
def test_main():
    """Smoke-test the curvature-stats training loop on a tiny configuration.

    An argparse parser is built but never parsed (the `parse_args` call is
    commented out); the effective settings come from a hand-filled `AttrDict`.
    Each of `args.stats_steps` outer iterations records a validation loss,
    collects per-layer gradient/Hessian statistics, and then runs
    `args.train_steps` optimisation steps.

    The test passes when the first recorded validation loss is above 2.4 and
    the last is below 2.25.
    """
    # NOTE(review): `parser` is only constructed; nothing below reads from it.
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    parser.add_argument('--wandb',
                        type=int,
                        default=1,
                        help='log to weights and biases')
    parser.add_argument('--autograd_check',
                        type=int,
                        default=0,
                        help='autograd correctness checks')
    parser.add_argument('--logdir',
                        type=str,
                        default='/temp/runs/curv_train_tiny/run')
    parser.add_argument('--train_batch_size', type=int, default=100)
    parser.add_argument('--stats_batch_size', type=int, default=60000)
    parser.add_argument('--dataset_size', type=int, default=60000)
    parser.add_argument('--train_steps',
                        type=int,
                        default=100,
                        help="this many train steps between stat collection")
    parser.add_argument('--stats_steps',
                        type=int,
                        default=1000000,
                        help="total number of curvature stats collections")
    parser.add_argument('--nonlin',
                        type=int,
                        default=1,
                        help="whether to add ReLU nonlinearity between layers")
    parser.add_argument('--method',
                        type=str,
                        choices=['gradient', 'newton'],
                        default='gradient',
                        help="descent method, newton or gradient")
    parser.add_argument('--layer',
                        type=int,
                        default=-1,
                        help="restrict updates to this layer")
    parser.add_argument('--data_width', type=int, default=28)
    parser.add_argument('--targets_width', type=int, default=28)
    parser.add_argument('--lmb', type=float, default=1e-3)
    parser.add_argument(
        '--hess_samples',
        type=int,
        default=1,
        help='number of samples when sub-sampling outputs, 0 for exact hessian'
    )
    parser.add_argument('--hess_kfac',
                        type=int,
                        default=0,
                        help='whether to use KFAC approximation for hessian')
    parser.add_argument('--compute_rho',
                        type=int,
                        default=1,
                        help='use expensive method to compute rho')
    parser.add_argument('--skip_stats',
                        type=int,
                        default=0,
                        help='skip all stats collection')
    parser.add_argument('--full_batch',
                        type=int,
                        default=0,
                        help='do stats on the whole dataset')
    parser.add_argument('--weight_decay', type=float, default=1e-4)

    #args = parser.parse_args()
    # Hard-coded settings used instead of command-line parsing.
    args = AttrDict()
    args.lmb = 1e-3
    args.compute_rho = 1
    args.weight_decay = 1e-4
    args.method = 'gradient'
    args.logdir = '/tmp'
    args.data_width = 2
    args.targets_width = 2
    args.train_batch_size = 10
    args.full_batch = False
    args.skip_stats = False
    args.autograd_check = False

    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    #gl.event_writer = SummaryWriter(logdir)
    gl.event_writer = u.NoOp()
    # print(f"Logging to {run_name}")

    # small values for debugging
    # loss_type = 'LeastSquares'
    loss_type = 'CrossEntropy'

    args.wandb = 0
    args.stats_steps = 10
    args.train_steps = 10
    args.stats_batch_size = 10
    args.data_width = 2
    args.targets_width = 2
    args.nonlin = False
    d1 = args.data_width**2
    d2 = 2
    d3 = args.targets_width**2

    d1 = args.data_width**2
    assert args.data_width == args.targets_width
    # NOTE(review): `o` is assigned here and below but not read again in this
    # function.
    o = d1
    n = args.stats_batch_size
    d = [d1, 30, 30, 30, 20, 30, 30, 30, d1]

    # NOTE(review): the nesting of the four statements below was reconstructed
    # from a whitespace-collapsed source; with loss_type fixed to
    # 'CrossEntropy' the resulting values are the same either way.
    if loss_type == 'CrossEntropy':
        d3 = 10
    o = d3
    n = args.stats_batch_size
    d = [d1, d2, d3]
    dsize = max(args.train_batch_size, args.stats_batch_size) + 1

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin)
    model = model.to(gl.device)

    # wandb setup is best-effort: any exception is printed, not raised.
    # With args.wandb == 0 above, the `if` body is not executed.
    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    # optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          dataset_size=dsize,
                          original_targets=True)

    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=False,
        drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    # A separate stats iterator is only created when stats are not taken on
    # the whole dataset.
    stats_iter = None
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.stats_batch_size,
            shuffle=False,
            drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width,
                               train=False,
                               dataset_size=dsize,
                               original_targets=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.train_batch_size,
                                              shuffle=False,
                                              drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    # NOTE(review): `loss_fn` stays undefined for any other loss_type value.
    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    elif loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.token_count = 0
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        # Log wall time of the previous outer iteration in milliseconds
        # (skipped on the first pass, where last_outer is still 0).
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            # print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            # retain_graph=True: `output` is passed to backprop_hess right
            # after this backward pass.
            loss.backward(retain_graph=True)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)

        for (i, layer) in enumerate(model.layers):

            # input/output layers are unreasonably expensive if not using Kronecker factoring
            if d[i] > 50 or d[i + 1] > 50:
                print(
                    f'layer {i} is too big ({d[i], d[i + 1]}), skipping stats')
                continue

            if args.skip_stats:
                continue

            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch, while we need per-example loss
            B_t = layer.backprops_list[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            # Cross-check the manually built per-example gradients against
            # the ones stored on the weight as `grad1`.
            u.check_equal(G.reshape(layer.weight.grad1.shape),
                          layer.weight.grad1)

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]),
                              layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel(
            )  # proportion of activations that are zero
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma) / s.sigma_l2

            lambda_regularizer = args.lmb * torch.eye(d[i + 1] * d[i]).to(
                gl.device)
            H = layer.weight.hess

            # The regularizer is added only for this inverse; `H` itself is
            # left unmodified for the statistics below.
            with u.timeit(f"invH-{i}"):
                invH = torch.cholesky_inverse(H + lambda_regularizer)

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1],
                                                d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            #  u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            # Both helpers close over this layer's `g` and `H`.
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) -
                                          0.5 * eps**2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() /
                                          (dd.flatten().norm()**2))

            with u.timeit(f"pinvH-{i}"):
                pinvH = H.pinverse()

            with u.timeit(f'curv-{i}'):
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre',
                        pinvH)  # save Newton preconditioner
                # 999 is the fallback when the gradient-direction curvature
                # is falsy (e.g. zero), avoiding a division by zero.
                s.step_openai = s.grad_fro**2 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / s.H_l2
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm(
                )  # frobenius norm of Newton update
                s.regret_newton = u.to_python_scalar(
                    g @ pinvH @ g.t() / 2)  # replace with "quadratic_form"
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_spectral(H, sigma)

                # NOTE(review): `discrepancy` is computed but not read again
                # in this function.
                discrepancy = torch.max(abs(p_sigma - p_sigma.t()) / p_sigma)

                s.psigma_erank = u.sym_erank(p_sigma)
                s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                s.diversity = torch.norm(G, "fro")**2 / torch.norm(g)**2 / n

                # Faster approaches for noise variance computation
                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                # u.log_scalar(train_loss=loss.item())

                if args.method != 'newton':
                    optimizer.step()
                    # Weight decay is applied by hand after the optimizer
                    # step, by shrinking every parameter in place.
                    if args.weight_decay:
                        for group in optimizer.param_groups:
                            for param in group['params']:
                                param.data.mul_(1 - args.weight_decay)
                else:
                    for (layer_idx, layer) in enumerate(model.layers):
                        param: torch.nn.Parameter = layer.weight
                        param_data: torch.Tensor = param.data
                        # Every layer first takes a plain gradient step with
                        # a fixed 0.1 step size.
                        param_data.copy_(param_data - 0.1 * param.grad)
                        if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                            continue
                        u.nan_check(layer.weight.pre)
                        u.nan_check(param.grad.flatten())
                        u.nan_check(
                            u.v2r(param.grad.flatten()) @ layer.weight.pre)
                        # Preconditioned step using the `pre` matrix saved on
                        # the weight during stats collection.
                        param_new_flat = u.v2r(param_data.flatten()) - u.v2r(
                            param.grad.flatten()) @ layer.weight.pre
                        u.nan_check(param_new_flat)
                        param_data.copy_(
                            param_new_flat.reshape(param_data.shape))

                gl.token_count += data.shape[0]

    gl.event_writer.close()

    # Expected values recorded by the original author are kept in the
    # trailing comments.
    assert val_losses[0] > 2.4  # 2.4828238487243652
    assert val_losses[-1] < 2.25  # 2.20609712600708
def run():
    """Timed runs.

    Passes `calculate_pi_opencl` and then `calculate_pi_opencl_simple`, each
    bound to `N` with `partial`, to `timeit` together with a label equal to
    the function's name.
    """
    timed_targets = (
        (calculate_pi_opencl, "calculate_pi_opencl"),
        (calculate_pi_opencl_simple, "calculate_pi_opencl_simple"),
    )
    for target, label in timed_targets:
        timeit(partial(target, N), label)