Example #1
def closure():
  # re-evaluates the model and returns the loss for the optimizer
  global step, final_loss
  optimizer.zero_grad()
  output = model(data)
  loss = F.mse_loss(output, data)
  if verbose:
    loss0 = loss.item()  # scalar loss value (replaces deprecated loss.data[0])
    times.append(u.last_time())
    print("Step %3d loss %6.5f msec %6.3f" % (step, loss0, u.last_time()))
  step += 1
  if step == iters:
    final_loss = loss.item()
  loss.backward()
  u.record_time()
  return loss
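
The closure keeps state in module-level step and final_loss and is meant to be handed to an optimizer that may re-evaluate the loss several times per step. Below is a minimal self-contained sketch of that calling pattern with torch.optim.LBFGS; the model, data, and iteration count are hypothetical stand-ins, not the original script's:

import torch
import torch.nn.functional as F

# hypothetical stand-ins for the original script's model and data
model = torch.nn.Linear(10, 10)
data = torch.randn(32, 10)
optimizer = torch.optim.LBFGS(model.parameters(), max_iter=20)

def closure():
  # LBFGS calls this repeatedly to re-evaluate the loss and gradients
  optimizer.zero_grad()
  loss = F.mse_loss(model(data), data)
  loss.backward()
  return loss

for it in range(5):
  loss = optimizer.step(closure)  # step() returns the closure's loss
  print("iter %3d loss %6.5f" % (it, loss.item()))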
Example #2
def lbfgs(opfunc, x, config, state, do_verbose):
  """port of lbfgs.lua, using TensorFlow eager mode.
  """

  global final_loss, times
  
  maxIter = config.maxIter or 20
  maxEval = config.maxEval or maxIter*1.25
  tolFun = config.tolFun or 1e-5
  tolX = config.tolX or 1e-9
  nCorrection = config.nCorrection or 100
  lineSearch = config.lineSearch
  lineSearchOpts = config.lineSearchOptions
  learningRate = config.learningRate or 1
  isverbose = config.verbose or False

  # verbose function
  if isverbose:
    verbose = verbose_func
  else:
    verbose = lambda x: None

  # evaluate initial f(x) and df/dx
  f, g = opfunc(x)

  f_hist = [f]
  currentFuncEval = 1
  state.funcEval = state.funcEval + 1
  p = g.shape[0]

  # check optimality of initial point
  tmp1 = tf.abs(g)
  if tf.reduce_sum(tmp1) <= tolFun:
    verbose("optimality condition below tolFun")
    return x, f_hist, currentFuncEval

  # optimize for a max of maxIter iterations
  nIter = 0
  times = []
  while nIter < maxIter:
    start_time = time.time()
    
    # keep track of nb of iterations
    nIter = nIter + 1
    state.nIter = state.nIter + 1

    ############################################################
    ## compute gradient descent direction
    ############################################################
    if state.nIter == 1:
      d = -g
      old_dirs = []
      old_stps = []
      Hdiag = 1
    else:
      # do lbfgs update (update memory)
      y = g - g_old
      s = d*t
      ys = dot(y, s)
      
      if ys > 1e-10:
        # updating memory
        if len(old_dirs) == nCorrection:
          # shift history by one (limited-memory)
          del old_dirs[0]
          del old_stps[0]

        # store new direction/step
        old_dirs.append(s)
        old_stps.append(y)

        # update scale of initial Hessian approximation
        Hdiag = ys/dot(y, y)

      # compute the approximate (L-BFGS) inverse Hessian 
      # multiplied by the gradient
      k = len(old_dirs)

      # need to be accessed element-by-element, so don't re-type tensor:
      ro = [0]*nCorrection
      for i in range(k):
        ro[i] = 1/dot(old_stps[i], old_dirs[i])
        

      # iteration in L-BFGS loop collapsed to use just one buffer
      # need to be accessed element-by-element, so don't re-type tensor:
      al = [0]*nCorrection

      q = -g
      for i in range(k-1, -1, -1):
        al[i] = dot(old_dirs[i], q) * ro[i]
        q = q - al[i]*old_stps[i]

      # multiply by initial Hessian
      r = q*Hdiag
      for i in range(k):
        be_i = dot(old_stps[i], r) * ro[i]
        r += (al[i]-be_i)*old_dirs[i]
        
      d = r
      # final direction is in r/d (same object)

    g_old = g
    f_old = f
    
    ############################################################
    ## compute step length
    ############################################################
    # directional derivative
    gtd = dot(g, d)

    # check that progress can be made along that direction
    if gtd > -tolX:
      verbose("Can not make progress along direction.")
      break

    # reset initial guess for step size
    if state.nIter == 1:
      tmp1 = tf.abs(g)
      t = min(1, 1/tf.reduce_sum(tmp1))
    else:
      t = learningRate


    # optional line search: user function
    lsFuncEval = 0
    if lineSearch and isinstance(lineSearch, types.FunctionType):
      # perform line search, using user function
      f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts)
      f_hist.append(f)
    else:
      # no line search, simply move with fixed-step
      x += t*d
      
      if nIter != maxIter:
        # re-evaluate function only if not in last iteration
        # the reason we do this: in a stochastic setting,
        # no use to re-evaluate that function here
        f, g = opfunc(x)
        
        lsFuncEval = 1
        f_hist.append(f)


    # update func eval
    currentFuncEval = currentFuncEval + lsFuncEval
    state.funcEval = state.funcEval + lsFuncEval

    ############################################################
    ## check conditions
    ############################################################
    if nIter == maxIter:
      break

    if currentFuncEval >= maxEval:
      # max nb of function evals
      verbose('max nb of function evals')
      break

    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
      # check optimality
      verbose('optimality condition below tolFun')
      break
    
    tmp1 = tf.abs(d*t)
    if tf.reduce_sum(tmp1) <= tolX:
      # step size below tolX
      verbose('step size below tolX')
      break

    if tf.abs(f-f_old) < tolX:
      # function value changing less than tolX
      verbose('function value changing less than tolX'+str(tf.abs(f-f_old)))
      break

    if do_verbose:
      print("Step %3d loss %6.5f msec %6.3f"%(nIter, f.numpy(), u.last_time()))
      u.record_time()
      times.append(u.last_time())

    if nIter == maxIter - 1:
      final_loss = f.numpy()


  # save state
  state.old_dirs = old_dirs
  state.old_stps = old_stps
  state.Hdiag = Hdiag
  state.g_old = g_old
  state.f_old = f_old
  state.t = t
  state.d = d

  return x, f_hist, currentFuncEval
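
For reference, a minimal sketch of how this port might be driven in eager mode on a toy quadratic. The dot helper and the Struct config/state holder below are assumptions for illustration (the original module supplies its own); with do_verbose=False the port's u timing helpers are not exercised:

import tensorflow as tf

def dot(a, b):
  # inner-product helper assumed by lbfgs()
  return tf.reduce_sum(a * b)

class Struct(object):
  # attribute bag standing in for the Lua-style config/state tables;
  # unset fields read as None so the "config.x or default" pattern works
  def __getattr__(self, name):
    return None

config = Struct()
config.maxIter = 50
config.learningRate = 1.0
state = Struct()
state.funcEval = 0
state.nIter = 0

A = tf.constant([[3.0, 0.2], [0.2, 1.0]])

def opfunc(x):
  # returns f(x) and df/dx for a small quadratic objective
  with tf.GradientTape() as tape:
    tape.watch(x)
    f = 0.5 * dot(x, tf.linalg.matvec(A, x))
  return f, tape.gradient(f, x)

x0 = tf.constant([1.0, -2.0])
x_opt, f_hist, n_evals = lbfgs(opfunc, x0, config, state, do_verbose=False)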
Example #3
def lbfgs(opfunc, x, config, state, do_verbose):
    """port of lbfgs.lua, using TensorFlow eager mode.
  """

    global final_loss, times

    maxIter = config.maxIter or 20
    maxEval = config.maxEval or maxIter * 1.25
    tolFun = config.tolFun or 1e-5
    tolX = config.tolX or 1e-9
    nCorrection = config.nCorrection or 100
    lineSearch = config.lineSearch
    lineSearchOpts = config.lineSearchOptions
    learningRate = config.learningRate or 1
    isverbose = config.verbose or False

    # verbose function
    if isverbose:
        verbose = verbose_func
    else:
        verbose = lambda x: None

    # evaluate initial f(x) and df/dx
    f, g = opfunc(x)

    f_hist = [f]
    currentFuncEval = 1
    state.funcEval = state.funcEval + 1
    p = g.shape[0]

    # check optimality of initial point
    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
        verbose("optimality condition below tolFun")
        return x, f_hist, currentFuncEval

    # optimize for a max of maxIter iterations
    nIter = 0
    times = []
    while nIter < maxIter:
        start_time = time.time()

        # keep track of nb of iterations
        nIter = nIter + 1
        state.nIter = state.nIter + 1

        ############################################################
        ## compute gradient descent direction
        ############################################################
        if state.nIter == 1:
            d = -g
            old_dirs = []
            old_stps = []
            Hdiag = 1
        else:
            # do lbfgs update (update memory)
            y = g - g_old
            s = d * t
            ys = dot(y, s)

            if ys > 1e-10:
                # updating memory
                if len(old_dirs) == nCorrection:
                    # shift history by one (limited-memory)
                    del old_dirs[0]
                    del old_stps[0]

                # store new direction/step
                old_dirs.append(s)
                old_stps.append(y)

                # update scale of initial Hessian approximation
                Hdiag = ys / dot(y, y)

            # compute the approximate (L-BFGS) inverse Hessian
            # multiplied by the gradient
            k = len(old_dirs)

            # need to be accessed element-by-element, so don't re-type tensor:
            ro = [0] * nCorrection
            for i in range(k):
                ro[i] = 1 / dot(old_stps[i], old_dirs[i])

            # iteration in L-BFGS loop collapsed to use just one buffer
            # need to be accessed element-by-element, so don't re-type tensor:
            al = [0] * nCorrection

            q = -g
            for i in range(k - 1, -1, -1):
                al[i] = dot(old_dirs[i], q) * ro[i]
                q = q - al[i] * old_stps[i]

            # multiply by initial Hessian
            r = q * Hdiag
            for i in range(k):
                be_i = dot(old_stps[i], r) * ro[i]
                r += (al[i] - be_i) * old_dirs[i]

            d = r
            # final direction is in r/d (same object)

        g_old = g
        f_old = f

        ############################################################
        ## compute step length
        ############################################################
        # directional derivative
        gtd = dot(g, d)

        # check that progress can be made along that direction
        if gtd > -tolX:
            verbose("Can not make progress along direction.")
            break

        # reset initial guess for step size
        if state.nIter == 1:
            tmp1 = tf.abs(g)
            t = min(1, 1 / tf.reduce_sum(tmp1))
        else:
            t = learningRate

        # optional line search: user function
        lsFuncEval = 0
        if lineSearch and isinstance(lineSearch, types.FunctionType):
            # perform line search, using user function
            f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd,
                                                lineSearchOpts)
            f_hist.append(f)
        else:
            # no line search, simply move with fixed-step
            x += t * d

            if nIter != maxIter:
                # re-evaluate function only if not in last iteration
                # the reason we do this: in a stochastic setting,
                # no use to re-evaluate that function here
                f, g = opfunc(x)

                lsFuncEval = 1
                f_hist.append(f)

        # update func eval
        currentFuncEval = currentFuncEval + lsFuncEval
        state.funcEval = state.funcEval + lsFuncEval

        ############################################################
        ## check conditions
        ############################################################
        if nIter == maxIter:
            break

        if currentFuncEval >= maxEval:
            # max nb of function evals
            verbose('max nb of function evals')
            break

        tmp1 = tf.abs(g)
        if tf.reduce_sum(tmp1) <= tolFun:
            # check optimality
            verbose('optimality condition below tolFun')
            break

        tmp1 = tf.abs(d * t)
        if tf.reduce_sum(tmp1) <= tolX:
            # step size below tolX
            verbose('step size below tolX')
            break

        if tf.abs(f - f_old) < tolX:
            # function value changing less than tolX
            verbose('function value changing less than tolX' +
                    str(tf.abs(f - f_old)))
            break

        if do_verbose:
            print("Step %3d loss %6.5f msec %6.3f" %
                  (nIter, f.numpy(), u.last_time()))
            u.record_time()
            times.append(u.last_time())

        if nIter == maxIter - 1:
            final_loss = f.numpy()

    # save state
    state.old_dirs = old_dirs
    state.old_stps = old_stps
    state.Hdiag = Hdiag
    state.g_old = g_old
    state.f_old = f_old
    state.t = t
    state.d = d

    return x, f_hist, currentFuncEval
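
The lineSearch hook is only taken when the config supplies a plain function; the port then expects it to return f, g, x, t, lsFuncEval. A rough backtracking sketch of such a user function, matching that calling convention (illustrative, not part of the port):

def backtracking_line_search(opfunc, x, t, d, f, g, gtd, options=None):
    # hypothetical user-supplied lineSearch: shrink t until the Armijo
    # condition f(x + t*d) <= f + c1 * t * gtd holds (gtd is negative
    # for a descent direction)
    c1 = 1e-4
    shrink = 0.5
    lsFuncEval = 0
    for _ in range(20):
        f_new, g_new = opfunc(x + t * d)
        lsFuncEval += 1
        if f_new <= f + c1 * t * gtd:
            break
        t = t * shrink
    return f_new, g_new, x + t * d, t, lsFuncEval

Because the port tests isinstance(lineSearch, types.FunctionType), the hook must be a plain def or lambda rather than, say, a bound method or other callable object.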
Example #4
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):

        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        except Exception:
            # rewriter options may be unavailable in older TF builds
            pass

        optimizer_options = tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)

        sess = tf.InteractiveSession(config=config)
        u.register_default_session(
            sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):

        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)

    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
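
Stripped of the helper classes, the KFAC wiring in main() follows the standard TF1 pattern: compute per-variable gradients, rewrite them, then apply the rewritten list with the same optimizer. A minimal sketch of that pattern; precondition is a hypothetical stand-in for kfac.correct, not the actual correction:

import tensorflow as tf

def precondition(grad):
    # hypothetical stand-in for kfac.correct: here just a simple scaling
    return 0.5 * grad

x = tf.Variable(tf.ones([4]))
loss = tf.reduce_sum(tf.square(x - 2.0))

opt = tf.train.AdamOptimizer(0.001)
grads_and_vars = opt.compute_gradients(loss, var_list=[x])
corrected = [(precondition(g), v) for g, v in grads_and_vars]
train_op = opt.apply_gradients(corrected)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        _, loss0 = sess.run([train_op, loss])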
Example #5
        vlosses.append(vloss0)
        step_lengths.append(lr0)
        ratios.append(slope_ratio)
        grad_norms.append(grad_norm.eval())
        pre_grad_norms.append(pre_grad_norm.eval())
        pre_grad_stable_norms.append(pre_grad_stable_norm.eval())

        if actual_delta > 0:
            print("Observed increase in loss %.2f, rejecting step" %
                  (actual_delta, ))
            restore_params_op.run()

        if step % report_frequency == 0:
            print(
                "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f, time: %.2f"
                % (step, loss0, target_delta, actual_delta, u.last_time()))

            #print("Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"%(step, loss0, target_delta, actual_delta, slope_ratio, grad_norm.eval(), pre_grad_norm.eval()))

        if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
            # shrink if wrong prediction, don't shrink if prediction is tiny
            if slope_ratio < alpha and abs(
                    target_delta) > 1e-6 and adaptive_step:
                print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
                print(
                    "Slope optimality %.2f, shrinking learning rate to %.2f" %
                    (
                        slope_ratio,
                        lr0 * beta,
                    ))
                sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})
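
This fragment shrinks the learning rate when slope_ratio (how well the realized decrease matched the slope-based prediction) falls below alpha and the predicted decrease was not negligible. A small pure-Python sketch of that rule; the alpha and beta defaults are illustrative, and slope_ratio and target_delta are taken as given, as in the fragment:

def adapt_learning_rate(lr0, slope_ratio, target_delta, alpha=0.3, beta=0.5):
    # shrink lr when the step's realized benefit (slope_ratio) falls short
    # of the linear prediction, but leave tiny predicted decreases alone
    if slope_ratio < alpha and abs(target_delta) > 1e-6:
        return lr0 * beta
    return lr0

lr0 = adapt_learning_rate(lr0, slope_ratio, target_delta)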