import numpy as np
import tensorflow as tf

# n_dim and n_iter are module-level constants defined elsewhere in the test
# file; YFOptimizer and tune_everything come from the YellowFin implementation
# under test.

def test_measurement():
  opt = YFOptimizer(zero_debias=False)
  w = tf.Variable(np.ones([n_dim, ] ), dtype=tf.float32, name="w", trainable=True)
  b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32, name="b", trainable=True)
  x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
  loss = tf.multiply(w, x) + b
  tvars = tf.trainable_variables()

  w_grad_val = tf.placeholder(tf.float32, shape=(n_dim, ) )
  b_grad_val = tf.placeholder(tf.float32, shape=(1, ) )
  apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars) )

  init_op = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init_op)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    for i in range(n_iter):
      feed_dict = {w_grad_val: (i + 1) * np.ones( [n_dim, ], dtype=np.float32),
             b_grad_val: (i + 1) * np.ones( [1, ], dtype=np.float32) }
      res = sess.run( [opt._curv_win, opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt_avg, apply_op], feed_dict=feed_dict)

      g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
        + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
      g_norm_avg = 0.999 * g_norm_avg  \
        + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
      g_avg = 0.999 * g_avg + 0.001 * (i + 1)
 
      target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2*(n_dim + 1)
      target_h_min = 0.999 * target_h_min + 0.001 * max(1, i + 2 - 20)**2*(n_dim + 1)
      target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
      target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

      # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
      #   " var ", res[3], target_var, " dist ", res[4], target_dist
      assert np.abs(target_h_max - res[1] ) < np.abs(target_h_max) * 1e-3
      assert np.abs(target_h_min - res[2] ) < np.abs(target_h_min) * 1e-3
      assert np.abs(target_var - res[3] ) < np.abs(res[3] ) * 1e-3
      assert np.abs(target_dist - res[4] ) < np.abs(res[4] ) * 1e-3
  print "sync measurement test passed!"
def test_lr_mu():
  opt = YFOptimizer(zero_debias=False)
  w = tf.Variable(np.ones([n_dim, ] ), dtype=tf.float32, name="w", trainable=True)
  b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32, name="b", trainable=True)
  x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
  loss = tf.multiply(w, x) + b
  tvars = tf.trainable_variables()

  w_grad_val = tf.Variable(np.zeros( [n_dim, ] ), dtype=tf.float32, trainable=False)
  b_grad_val = tf.Variable(np.zeros([1, ] ), dtype=tf.float32, trainable=False)
  apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars) )

  init_op = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init_op)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    target_lr = 0.1
    target_mu = 0.0
    for i in range(n_iter):
      sess.run(tf.assign(w_grad_val, (i + 1) * np.ones( [n_dim, ], dtype=np.float32) ) )
      sess.run(tf.assign(b_grad_val, (i + 1) * np.ones( [1, ], dtype=np.float32) ) )

      res = sess.run( [opt._curv_win, opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt_avg,
        opt._lr_var, opt._mu_var, apply_op] )

      # Re-read lr and mu after the apply op so the asserted values reflect this step.
      res[5] = opt._lr_var.eval()
      res[6] = opt._mu_var.eval()

      g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
        + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
      g_norm_avg = 0.999 * g_norm_avg  \
        + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
      g_avg = 0.999 * g_avg + 0.001 * (i + 1)
 
      target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2*(n_dim + 1)
      target_h_min = 0.999 * target_h_min + 0.001 * max(1, i + 2 - 20)**2*(n_dim + 1)
      target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
      target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

      if i > 0:
        lr, mu = tune_everything(target_dist**2, target_var, 1, target_h_min, target_h_max)
        target_lr = 0.999 * target_lr + 0.001 * lr
        target_mu = 0.999 * target_mu + 0.001 * mu

      # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
   #                              " var ", res[3], target_var, " dist ", res[4], target_dist
      # print "iter ", i, " lr ", res[5], target_lr, " mu ", res[6], target_mu

      assert np.abs(target_h_max - res[1] ) < np.abs(target_h_max) * 1e-3
      assert np.abs(target_h_min - res[2] ) < np.abs(target_h_min) * 1e-3
      assert np.abs(target_var - res[3] ) < np.abs(res[3] ) * 1e-3
      assert np.abs(target_dist - res[4] ) < np.abs(res[4] ) * 1e-3
      assert target_lr == 0.0 or np.abs(target_lr - res[5] ) < np.abs(res[5] ) * 1e-3
      assert target_mu == 0.0 or np.abs(target_mu - res[6] ) < np.abs(res[6] ) * 5e-3 
  print "lr and mu computing test passed!"
Example #3
    def __init__(self, is_training, config, input_, opt_method='sgd'):
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to be
        # different than reported in the paper.
        def lstm_cell():
            # With the latest TensorFlow source code (as of Mar 27, 2017),
            # the BasicLSTMCell will need a reuse parameter which is unfortunately not
            # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
            # an argument check here:
            if 'reuse' in inspect.getargspec(
                    tf.contrib.rnn.BasicLSTMCell.__init__).args:
                return tf.contrib.rnn.BasicLSTMCell(
                    size,
                    forget_bias=0.0,
                    state_is_tuple=True,
                    reuse=tf.get_variable_scope().reuse)
            else:
                return tf.contrib.rnn.BasicLSTMCell(size,
                                                    forget_bias=0.0,
                                                    state_is_tuple=True)

        attn_cell = lstm_cell
        if is_training and config.keep_prob < 1:

            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.keep_prob)

        cell = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, data_type())

        with tf.device("cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size],
                                        dtype=data_type())
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # inputs = tf.unstack(inputs, num=num_steps, axis=1)
        # outputs, state = tf.nn.rnn(cell, inputs, initial_state=self._initial_state)
        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                    dtype=data_type())
        softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                    dtype=data_type())
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=data_type())])
        # self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._cost = cost = tf.reduce_sum(loss) / (batch_size * num_steps)
        self._final_state = state

        if not is_training:
            return

        self._lr = tf.Variable(0.0, trainable=False)
        self._mu = tf.Variable(0.0, trainable=False)
        self._grad_norm_thresh = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        self.tvars = tvars

        self.grads = tf.gradients(cost, tvars)

        grads_clip, self.grad_norm = tf.clip_by_global_norm(
            self.grads, self._grad_norm_thresh)
        if opt_method == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(self._lr)
            self._train_op = optimizer.apply_gradients(
                zip(grads_clip, tvars),
                global_step=tf.contrib.framework.get_or_create_global_step())

        elif opt_method == 'mom':
            print("using sgd mom")
            optimizer = tf.train.MomentumOptimizer(self._lr, self._mu)
            self._train_op = optimizer.apply_gradients(
                zip(grads_clip, tvars),
                global_step=tf.contrib.framework.get_or_create_global_step())
        elif opt_method == 'adam':
            optimizer = tf.train.AdamOptimizer(self._lr)
            self._train_op = optimizer.apply_gradients(
                zip(grads_clip, tvars),
                global_step=tf.contrib.framework.get_or_create_global_step())
        elif opt_method == 'YF':
            optimizer = YFOptimizer(lr=1.0, mu=0.0)
            self._train_op = optimizer.apply_gradients(zip(self.grads, tvars))
        else:
            raise Exception("optimizer not supported")

        self._new_lr = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

        self._new_mu = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_momentum")
        self._mu_update = tf.assign(self._mu, self._new_mu)

        self._new_grad_norm_thresh = tf.placeholder(
            tf.float32, shape=[], name="new_grad_norm_thresh")
        self._grad_norm_thresh_update = tf.assign(self._grad_norm_thresh,
                                                  self._new_grad_norm_thresh)