def closure():
  """Loss closure for a PyTorch-style optimizer step.

  Zeroes gradients, runs a forward pass, computes reconstruction loss,
  backpropagates, and returns the loss tensor.

  Reads and mutates module-level state: `step` and `final_loss` (via
  `global`), and uses module-level `optimizer`, `model`, `data`, `F`,
  `verbose`, `iters`, `times`, and the timing util `u`.
  """
  global step, final_loss
  optimizer.zero_grad()
  output = model(data)
  # Autoencoder-style objective: reconstruct `data` from itself.
  loss = F.mse_loss(output, data)
  if verbose:
    # NOTE(review): `.data[0]` is the pre-0.4 PyTorch scalar-access idiom;
    # presumably equivalent to loss.item() on newer versions -- confirm.
    loss0 = loss.data[0]
    times.append(u.last_time())
    print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time()))
  step+=1
  if step == iters:
    # Record the last loss so callers can read it after optimization ends.
    final_loss = loss.data[0]
  loss.backward()
  u.record_time()
  return loss
def lbfgs(opfunc, x, config, state, do_verbose):
  """Port of lbfgs.lua, using TensorFlow eager mode.

  Args:
    opfunc: callable `opfunc(x) -> (f, g)` returning loss and flat gradient.
    x: current parameter tensor; updated in place via `+=`.
    config: settings object (maxIter, maxEval, tolFun, tolX, nCorrection,
      lineSearch, lineSearchOptions, learningRate, verbose).  Attributes
      that are falsy fall back to defaults (preserved from the Lua port,
      so an explicit 0/False is also replaced by the default).
    state: mutable object carrying optimizer state across calls
      (funcEval, nIter, and the saved history fields written at the end).
    do_verbose: if True, print per-iteration loss/timing.

  Returns:
    (x, f_hist, currentFuncEval): final point, history of loss values,
    and number of function evaluations performed.
  """
  global final_loss, times
  maxIter = config.maxIter or 20
  maxEval = config.maxEval or maxIter * 1.25
  tolFun = config.tolFun or 1e-5
  tolX = config.tolX or 1e-9
  nCorrection = config.nCorrection or 100
  lineSearch = config.lineSearch
  lineSearchOpts = config.lineSearchOptions
  learningRate = config.learningRate or 1
  isverbose = config.verbose or False

  # verbose function
  if isverbose:
    verbose = verbose_func
  else:
    verbose = lambda x: None

  # evaluate initial f(x) and df/dx
  f, g = opfunc(x)
  f_hist = [f]
  currentFuncEval = 1
  state.funcEval = state.funcEval + 1

  # check optimality of initial point
  tmp1 = tf.abs(g)
  if tf.reduce_sum(tmp1) <= tolFun:
    verbose("optimality condition below tolFun")
    # BUG FIX: original returned only (x, f_hist) here, while the normal
    # exit returns a 3-tuple; keep the arity consistent for callers.
    return x, f_hist, currentFuncEval

  # optimize for a max of maxIter iterations
  nIter = 0
  times = []
  while nIter < maxIter:
    # keep track of nb of iterations
    nIter = nIter + 1
    state.nIter = state.nIter + 1

    ############################################################
    ## compute gradient descent direction
    ############################################################
    if state.nIter == 1:
      d = -g
      old_dirs = []
      old_stps = []
      Hdiag = 1
    else:
      # do lbfgs update (update memory)
      y = g - g_old
      s = d * t
      ys = dot(y, s)
      if ys > 1e-10:
        # updating memory
        if len(old_dirs) == nCorrection:
          # shift history by one (limited-memory)
          del old_dirs[0]
          del old_stps[0]

        # store new direction/step
        old_dirs.append(s)
        old_stps.append(y)

        # update scale of initial Hessian approximation
        Hdiag = ys / dot(y, y)

      # compute the approximate (L-BFGS) inverse Hessian
      # multiplied by the gradient
      k = len(old_dirs)

      # need to be accessed element-by-element, so don't re-type tensor:
      ro = [0] * nCorrection
      for i in range(k):
        ro[i] = 1 / dot(old_stps[i], old_dirs[i])

      # iteration in L-BFGS loop collapsed to use just one buffer
      # need to be accessed element-by-element, so don't re-type tensor:
      al = [0] * nCorrection

      # two-loop recursion: backward pass
      q = -g
      for i in range(k - 1, -1, -1):
        al[i] = dot(old_dirs[i], q) * ro[i]
        q = q - al[i] * old_stps[i]

      # multiply by initial Hessian
      r = q * Hdiag
      # two-loop recursion: forward pass
      for i in range(k):
        be_i = dot(old_stps[i], r) * ro[i]
        r += (al[i] - be_i) * old_dirs[i]

      d = r  # final direction is in r/d (same object)

    g_old = g
    f_old = f

    ############################################################
    ## compute step length
    ############################################################
    # directional derivative
    gtd = dot(g, d)

    # check that progress can be made along that direction
    if gtd > -tolX:
      verbose("Can not make progress along direction.")
      break

    # reset initial guess for step size
    if state.nIter == 1:
      tmp1 = tf.abs(g)
      t = min(1, 1 / tf.reduce_sum(tmp1))
    else:
      t = learningRate

    # optional line search: user function
    lsFuncEval = 0
    # BUG FIX: original `isinstance(lineSearch) == types.FunctionType`
    # raised TypeError (isinstance requires two arguments).
    if lineSearch and isinstance(lineSearch, types.FunctionType):
      # perform line search, using user function
      f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd,
                                          lineSearchOpts)
      f_hist.append(f)
    else:
      # no line search, simply move with fixed-step
      x += t * d

      if nIter != maxIter:
        # re-evaluate function only if not in last iteration
        # the reason we do this: in a stochastic setting,
        # no use to re-evaluate that function here
        f, g = opfunc(x)
        lsFuncEval = 1
        f_hist.append(f)

    # update func eval
    currentFuncEval = currentFuncEval + lsFuncEval
    state.funcEval = state.funcEval + lsFuncEval

    ############################################################
    ## check conditions
    ############################################################
    if nIter == maxIter:
      break

    if currentFuncEval >= maxEval:
      # max nb of function evals
      verbose('max nb of function evals')
      break

    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
      # check optimality
      verbose('optimality condition below tolFun')
      break

    tmp1 = tf.abs(d * t)
    if tf.reduce_sum(tmp1) <= tolX:
      # step size below tolX
      verbose('step size below tolX')
      break

    if tf.abs(f - f_old) < tolX:
      # function value changing less than tolX
      verbose('function value changing less than tolX' + str(tf.abs(f - f_old)))
      break

    if do_verbose:
      print("Step %3d loss %6.5f msec %6.3f" % (nIter, f.numpy(), u.last_time()))
      u.record_time()
      times.append(u.last_time())

    if nIter == maxIter - 1:
      final_loss = f.numpy()

  # save state
  state.old_dirs = old_dirs
  state.old_stps = old_stps
  state.Hdiag = Hdiag
  state.g_old = g_old
  state.f_old = f_old
  state.t = t
  state.d = d

  return x, f_hist, currentFuncEval
def lbfgs(opfunc, x, config, state, do_verbose):
  """Port of lbfgs.lua, using TensorFlow eager mode.

  NOTE(review): this function is defined twice in this file; this second
  definition shadows the first -- consider deleting one copy.

  Args:
    opfunc: callable `opfunc(x) -> (f, g)` returning loss and flat gradient.
    x: current parameter tensor; updated in place via `+=`.
    config: settings object (maxIter, maxEval, tolFun, tolX, nCorrection,
      lineSearch, lineSearchOptions, learningRate, verbose).  Falsy
      attributes fall back to defaults (preserved from the Lua port).
    state: mutable object carrying optimizer state across calls.
    do_verbose: if True, print per-iteration loss/timing.

  Returns:
    (x, f_hist, currentFuncEval): final point, history of loss values,
    and number of function evaluations performed.
  """
  global final_loss, times
  maxIter = config.maxIter or 20
  maxEval = config.maxEval or maxIter * 1.25
  tolFun = config.tolFun or 1e-5
  tolX = config.tolX or 1e-9
  nCorrection = config.nCorrection or 100
  lineSearch = config.lineSearch
  lineSearchOpts = config.lineSearchOptions
  learningRate = config.learningRate or 1
  isverbose = config.verbose or False

  # verbose function
  if isverbose:
    verbose = verbose_func
  else:
    verbose = lambda x: None

  # evaluate initial f(x) and df/dx
  f, g = opfunc(x)
  f_hist = [f]
  currentFuncEval = 1
  state.funcEval = state.funcEval + 1

  # check optimality of initial point
  tmp1 = tf.abs(g)
  if tf.reduce_sum(tmp1) <= tolFun:
    verbose("optimality condition below tolFun")
    # BUG FIX: original returned only (x, f_hist) here, while the normal
    # exit returns a 3-tuple; keep the arity consistent for callers.
    return x, f_hist, currentFuncEval

  # optimize for a max of maxIter iterations
  nIter = 0
  times = []
  while nIter < maxIter:
    # keep track of nb of iterations
    nIter = nIter + 1
    state.nIter = state.nIter + 1

    ############################################################
    ## compute gradient descent direction
    ############################################################
    if state.nIter == 1:
      d = -g
      old_dirs = []
      old_stps = []
      Hdiag = 1
    else:
      # do lbfgs update (update memory)
      y = g - g_old
      s = d * t
      ys = dot(y, s)
      if ys > 1e-10:
        # updating memory
        if len(old_dirs) == nCorrection:
          # shift history by one (limited-memory)
          del old_dirs[0]
          del old_stps[0]

        # store new direction/step
        old_dirs.append(s)
        old_stps.append(y)

        # update scale of initial Hessian approximation
        Hdiag = ys / dot(y, y)

      # compute the approximate (L-BFGS) inverse Hessian
      # multiplied by the gradient
      k = len(old_dirs)

      # need to be accessed element-by-element, so don't re-type tensor:
      ro = [0] * nCorrection
      for i in range(k):
        ro[i] = 1 / dot(old_stps[i], old_dirs[i])

      # iteration in L-BFGS loop collapsed to use just one buffer
      # need to be accessed element-by-element, so don't re-type tensor:
      al = [0] * nCorrection

      # two-loop recursion: backward pass
      q = -g
      for i in range(k - 1, -1, -1):
        al[i] = dot(old_dirs[i], q) * ro[i]
        q = q - al[i] * old_stps[i]

      # multiply by initial Hessian
      r = q * Hdiag
      # two-loop recursion: forward pass
      for i in range(k):
        be_i = dot(old_stps[i], r) * ro[i]
        r += (al[i] - be_i) * old_dirs[i]

      d = r  # final direction is in r/d (same object)

    g_old = g
    f_old = f

    ############################################################
    ## compute step length
    ############################################################
    # directional derivative
    gtd = dot(g, d)

    # check that progress can be made along that direction
    if gtd > -tolX:
      verbose("Can not make progress along direction.")
      break

    # reset initial guess for step size
    if state.nIter == 1:
      tmp1 = tf.abs(g)
      t = min(1, 1 / tf.reduce_sum(tmp1))
    else:
      t = learningRate

    # optional line search: user function
    lsFuncEval = 0
    # BUG FIX: original `isinstance(lineSearch) == types.FunctionType`
    # raised TypeError (isinstance requires two arguments).
    if lineSearch and isinstance(lineSearch, types.FunctionType):
      # perform line search, using user function
      f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd,
                                          lineSearchOpts)
      f_hist.append(f)
    else:
      # no line search, simply move with fixed-step
      x += t * d

      if nIter != maxIter:
        # re-evaluate function only if not in last iteration
        # the reason we do this: in a stochastic setting,
        # no use to re-evaluate that function here
        f, g = opfunc(x)
        lsFuncEval = 1
        f_hist.append(f)

    # update func eval
    currentFuncEval = currentFuncEval + lsFuncEval
    state.funcEval = state.funcEval + lsFuncEval

    ############################################################
    ## check conditions
    ############################################################
    if nIter == maxIter:
      break

    if currentFuncEval >= maxEval:
      # max nb of function evals
      verbose('max nb of function evals')
      break

    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
      # check optimality
      verbose('optimality condition below tolFun')
      break

    tmp1 = tf.abs(d * t)
    if tf.reduce_sum(tmp1) <= tolX:
      # step size below tolX
      verbose('step size below tolX')
      break

    if tf.abs(f - f_old) < tolX:
      # function value changing less than tolX
      verbose('function value changing less than tolX' + str(tf.abs(f - f_old)))
      break

    if do_verbose:
      print("Step %3d loss %6.5f msec %6.3f" % (nIter, f.numpy(), u.last_time()))
      u.record_time()
      times.append(u.last_time())

    if nIter == maxIter - 1:
      final_loss = f.numpy()

  # save state
  state.old_dirs = old_dirs
  state.old_stps = old_stps
  state.Hdiag = Hdiag
  state.g_old = g_old
  state.f_old = f_old
  state.t = t
  state.d = d

  return x, f_hist, currentFuncEval
def main():
  """Driver: build session/model/KFAC, run the training loop, verify results.

  Reads configuration from the module-level `args` namespace and uses the
  project utilities `u`, `model_creator`, `Kfac`, `kfac_lib`, `IndexedGrad`,
  `sessrun`, and `release_test_fn` defined elsewhere in this file.
  """
  np.random.seed(args.seed)
  tf.set_random_seed(args.seed)

  logger = u.TensorboardLogger(args.run)

  with u.timeit("init/session"):
    rewrite_options = None
    # BUG FIX: was a bare `except: pass`, which also swallowed KeyboardInterrupt
    # and real bugs; only the missing/old-TF proto import is best-effort here.
    try:
      from tensorflow.core.protobuf import rewriter_config_pb2
      rewrite_options = rewriter_config_pb2.RewriterConfig(
          disable_model_pruning=True,
          constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
          memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    except ImportError:
      pass  # older TF without rewriter config; proceed without it

    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    gpu_options = tf.GPUOptions(allow_growth=False)
    config = tf.ConfigProto(graph_options=graph_options,
                            gpu_options=gpu_options,
                            log_device_placement=False)
    sess = tf.InteractiveSession(config=config)
    u.register_default_session(sess)  # since default session is Thread-local

  with u.timeit("init/model_init"):
    model = model_creator(args.batch_size, name="main")
    model.initialize_global_vars(verbose=True)
    model.initialize_local_vars()

  kfac_lib.numeric_inverse = args.numeric_inverse
  with u.timeit("init/kfac_init"):
    kfac = Kfac(model_creator, args.kfac_batch_size)
    kfac.model.initialize_global_vars(verbose=False)
    kfac.model.initialize_local_vars()
    kfac.Lambda.set(args.Lambda)
    kfac.reset()  # resets optimization variables (not model variables)

  if args.mode != 'run':
    opt = tf.train.AdamOptimizer(0.001)
  else:
    opt = tf.train.AdamOptimizer(args.lr)
  grads_and_vars = opt.compute_gradients(model.loss,
                                         var_list=model.trainable_vars)

  grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
  grad_new = kfac.correct(grad)
  with u.capture_vars() as adam_vars:
    train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
  with u.timeit("init/adam"):
    sessrun([v.initializer for v in adam_vars])

  losses = []
  u.record_time()

  vloss0 = 0

  # todo, unify the two data outputs
  outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

  # CLEANUP: `start_time` was assigned twice in a row; keep a single
  # initialization right before the loop that uses it.
  start_time = time.time()
  if args.extra_kfac_batch_advance:
    kfac.model.advance_batch()  # advance kfac batch

  if args.kfac_async:
    kfac.start_stats_runners()

  for step in range(args.num_steps):
    if args.validate_every_n and step % args.validate_every_n == 0:
      loss0, vloss0 = sessrun([model.loss, model.vloss])
    else:
      loss0, = sessrun([model.loss])
    losses.append(loss0)  # TODO: remove this

    logger('loss/loss', loss0, 'loss/vloss', vloss0)

    elapsed = time.time() - start_time
    start_time = time.time()
    print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
          (elapsed * 1e3, step, loss0, vloss0))

    if args.method == 'kfac' and not args.kfac_async:
      kfac.model.advance_batch()
      kfac.update_stats()

    with u.timeit("train"):
      model.advance_batch()
      with u.timeit("grad.update"):
        grad.update()
      with kfac.read_lock():
        grad_new.update()
      u.run(train_op)
      u.record_time()

    logger.next_step()

  # TODO: use u.global_runs_dir
  # TODO: get rid of u.timeit?

  with open('timelines/graphdef.txt', 'w') as f:
    f.write(str(u.get_default_graph().as_graph_def()))

  u.summarize_time()

  if args.mode == 'record':
    u.dump_with_prompt(losses, release_test_fn)
  elif args.mode == 'test':
    targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
    u.check_equal(losses, targets, rtol=1e-2)
    u.summarize_difference(losses, targets)
    assert u.last_time() < 800, "Expected 648 on GTX 1080"
vlosses.append(vloss0) step_lengths.append(lr0) ratios.append(slope_ratio) grad_norms.append(grad_norm.eval()) pre_grad_norms.append(pre_grad_norm.eval()) pre_grad_stable_norms.append(pre_grad_stable_norm.eval()) if actual_delta > 0: print("Observed increase in loss %.2f, rejecting step" % (actual_delta, )) restore_params_op.run() if step % report_frequency == 0: print( "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f, time: %.2f" % (step, loss0, target_delta, actual_delta, u.last_time())) #print("Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"%(step, loss0, target_delta, actual_delta, slope_ratio, grad_norm.eval(), pre_grad_norm.eval())) if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in: # shrink if wrong prediction, don't shrink if prediction is tiny if slope_ratio < alpha and abs( target_delta) > 1e-6 and adaptive_step: print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio)) print( "Slope optimality %.2f, shrinking learning rate to %.2f" % ( slope_ratio, lr0 * beta, )) sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})