Example #1
def _configure_optimizer(learning_rate):
  """Configures the optimizer used for training.

  Args:
    learning_rate: A scalar or `Tensor` learning rate.

  Returns:
    An instance of an optimizer.

  Raises:
    ValueError: if FLAGS.optimizer is not recognized.
  """
  if FLAGS.optimizer == 'adadelta':
    optimizer = tf.train.AdadeltaOptimizer(
        learning_rate,
        rho=FLAGS.adadelta_rho,
        epsilon=FLAGS.opt_epsilon)
  elif FLAGS.optimizer == 'adagrad':
    optimizer = tf.train.AdagradOptimizer(
        learning_rate,
        initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
  elif FLAGS.optimizer == 'adam':
    optimizer = tf.train.AdamOptimizer(
        learning_rate,
        beta1=FLAGS.adam_beta1,
        beta2=FLAGS.adam_beta2,
        epsilon=FLAGS.opt_epsilon)
  elif FLAGS.optimizer == 'ftrl':
    optimizer = tf.train.FtrlOptimizer(
        learning_rate,
        learning_rate_power=FLAGS.ftrl_learning_rate_power,
        initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
        l1_regularization_strength=FLAGS.ftrl_l1,
        l2_regularization_strength=FLAGS.ftrl_l2)
  elif FLAGS.optimizer == 'momentum':
    optimizer = tf.train.MomentumOptimizer(
        learning_rate,
        momentum=FLAGS.momentum,
        name='Momentum')
  elif FLAGS.optimizer == 'rmsprop':
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate,
        decay=FLAGS.rmsprop_decay,
        momentum=FLAGS.rmsprop_momentum,
        epsilon=FLAGS.opt_epsilon)
  elif FLAGS.optimizer == 'sgd':
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  elif FLAGS.optimizer == 'yellowfin':
    optimizer = YFOptimizer(lr=1.0, mu=0.0)
  else:
    raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
  return optimizer
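Note: `FLAGS` is a module-level flag object from the surrounding training script and is not defined in this snippet. The sketch below is only a hypothetical way to wire `_configure_optimizer` up in a TF1 script; the flag defaults and the toy loss are illustrative assumptions, not the original code.

# Minimal sketch, assuming _configure_optimizer from above is in scope.
# Flag names mirror the ones it reads; defaults and the loss are made up.
import tensorflow as tf

tf.app.flags.DEFINE_string('optimizer', 'rmsprop', 'Optimizer to use.')
tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'RMSProp decay term.')
tf.app.flags.DEFINE_float('rmsprop_momentum', 0.9, 'RMSProp momentum.')
tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Optimizer epsilon term.')
FLAGS = tf.app.flags.FLAGS


def main(_):
    # Toy least-squares problem, just to have something to minimize.
    x = tf.constant([[1.0, 2.0]])
    y = tf.constant([[1.0]])
    w = tf.Variable(tf.zeros([2, 1]))
    loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

    optimizer = _configure_optimizer(learning_rate=0.01)
    train_op = optimizer.minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(100):
            _, loss_val = sess.run([train_op, loss])
        print('final loss:', loss_val)


if __name__ == '__main__':
    tf.app.run()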
Example #2
def test_measurement():
    # Toy model: loss = w * x + b with synthetic, hand-fed gradients.
    opt = YFOptimizer(zero_debias=False)
    w = tf.Variable(np.ones([n_dim]), dtype=tf.float32, name="w",
                    trainable=True)
    b = tf.Variable(np.ones([1], dtype=np.float32), dtype=tf.float32,
                    name="b", trainable=True)
    x = tf.constant(np.ones([n_dim], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()

    w_grad_val = tf.placeholder(tf.float32, shape=(n_dim, ))
    b_grad_val = tf.placeholder(tf.float32, shape=(1, ))
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        for i in range(n_iter):
            feed_dict = {
                w_grad_val: (i + 1) * np.ones([n_dim], dtype=np.float32),
                b_grad_val: (i + 1) * np.ones([1], dtype=np.float32),
            }
            res = sess.run(
                [opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                 opt._dist_to_opt_avg, apply_op],
                feed_dict=feed_dict)

            # Hand-computed EMA targets (beta = 0.999, no zero-debias).
            g_norm_squared_avg = (0.999 * g_norm_squared_avg +
                                  0.001 * np.sum(((i + 1) * np.ones([n_dim + 1]))**2))
            g_norm_avg = (0.999 * g_norm_avg +
                          0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1])))
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)

            target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
            target_h_min = (0.999 * target_h_min +
                            0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1))
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = (0.999 * target_dist +
                           0.001 * g_norm_avg / g_norm_squared_avg)

            # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
            #   " var ", res[3], target_var, " dist ", res[4], target_dist
            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
    print("sync measurement test passed!")
Example #3
def test_lr_mu():
    # Same toy model; gradients live in variables and are assigned each step
    # so that the tuned learning rate and momentum can be checked as well.
    opt = YFOptimizer(zero_debias=False)
    w = tf.Variable(np.ones([n_dim]), dtype=tf.float32, name="w",
                    trainable=True)
    b = tf.Variable(np.ones([1], dtype=np.float32), dtype=tf.float32,
                    name="b", trainable=True)
    x = tf.constant(np.ones([n_dim], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()

    w_grad_val = tf.Variable(np.zeros([n_dim]), dtype=tf.float32,
                             trainable=False)
    b_grad_val = tf.Variable(np.zeros([1]), dtype=tf.float32,
                             trainable=False)
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        target_lr = 0.1
        target_mu = 0.0
        for i in range(n_iter):

            sess.run(tf.assign(
                w_grad_val, (i + 1) * np.ones([n_dim], dtype=np.float32)))
            sess.run(tf.assign(
                b_grad_val, (i + 1) * np.ones([1], dtype=np.float32)))

            res = sess.run([
                opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                opt._dist_to_opt_avg, opt._lr_var, opt._mu_var, apply_op
            ])

            res[5] = opt._lr_var.eval()
            res[6] = opt._mu_var.eval()

            g_norm_squared_avg = (0.999 * g_norm_squared_avg +
                                  0.001 * np.sum(((i + 1) * np.ones([n_dim + 1]))**2))
            g_norm_avg = (0.999 * g_norm_avg +
                          0.001 * np.linalg.norm((i + 1) * np.ones([n_dim + 1])))
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)

            target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
            target_h_min = (0.999 * target_h_min +
                            0.001 * max(1, i + 2 - 20)**2 * (n_dim + 1))
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = (0.999 * target_dist +
                           0.001 * g_norm_avg / g_norm_squared_avg)

            if i > 0:
                lr, mu = tune_everything(target_dist**2, target_var, 1,
                                         target_h_min, target_h_max)
                target_lr = 0.999 * target_lr + 0.001 * lr
                target_mu = 0.999 * target_mu + 0.001 * mu

            # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
    #                              " var ", res[3], target_var, " dist ", res[4], target_dist
    # print "iter ", i, " lr ", res[5], target_lr, " mu ", res[6], target_mu

            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
            assert (target_lr == 0.0 or
                    np.abs(target_lr - res[5]) < np.abs(res[5]) * 1e-3)
            assert (target_mu == 0.0 or
                    np.abs(target_mu - res[6]) < np.abs(res[6]) * 5e-3)
    print("lr and mu computing test passed!")