Example 1
    def update_variable(self, var, grad_var):
        """Apply one Adam-style update to `var` and its slot accumulators.

        The variable is stored quantized: it is decoded with `_dequantize`,
        updated, re-encoded with `_quantize`, and assigned back.

        Args:
            var: the (quantized) variable to update.
            grad_var: gradient tensor for `var`.

        Returns:
            The `tf.assign` op that writes the new quantized value to `var`.
        """
        params = self.params
        # 1-based step count so the warmup/decay formulas are well-defined
        # on the very first update.
        global_step = tf.to_float(self.global_step) + 1

        # Compute the learning rate for this step.
        lrate = params.learning_rate
        if params.learning_rate_decay_scheme == "noam":
            # Noam schedule: linear warmup, then rsqrt decay.
            lrate *= tf.minimum(
                global_step * params.learning_rate_warmup_steps**-1.5,
                global_step**-0.5)
        else:
            assert params.learning_rate_decay_scheme == "none"
            # BUG FIX: was `tf.minumum` (typo) — raised AttributeError at
            # graph-build time whenever this branch was taken.
            lrate *= tf.minimum(
                global_step / params.learning_rate_warmup_steps, 1.0)

        # Compute the second-moment denominator (bias-corrected, as in Adam).
        slots = params.slots[var.op.name]
        grad_squared = tf.square(grad_var)
        beta2_pow = tf.pow(params.beta2, global_step)
        if params.factored_second_moment_accumulator and len(var.shape) == 2:
            # Factored accumulator (Adafactor-style): keep only per-row and
            # per-column means of the squared gradients instead of the full
            # matrix, saving memory for 2-D variables.
            vr_update = tf.assign(
                slots["adam_vr"], slots["adam_vr"] * params.beta2 +
                tf.reduce_mean(grad_squared, 1, keep_dims=True) *
                (1.0 - params.beta2))
            vc_update = tf.assign(
                slots["adam_vc"], slots["adam_vc"] * params.beta2 +
                tf.reduce_mean(grad_squared, 0, keep_dims=True) *
                (1.0 - params.beta2))
            with tf.control_dependencies([vr_update, vc_update]):
                vr = tf.sqrt(slots["adam_vr"] /
                             (1.0 - beta2_pow)) + params.epsilon
                vc = tf.sqrt(slots["adam_vc"] /
                             (1.0 - beta2_pow)) + params.epsilon
                # Normalize the column factor so the row factor carries the
                # overall scale; `vr * vc` then approximates the full matrix.
                vc /= tf.reduce_mean(vc)
                denom = vr * vc
        else:
            # Standard (unfactored) Adam second-moment accumulator.
            v_update = tf.assign(
                slots["adam_v"], slots["adam_v"] * params.beta2 +
                grad_squared * (1.0 - params.beta2))
            with tf.control_dependencies([v_update]):
                denom = tf.sqrt(slots["adam_v"] /
                                (1.0 - beta2_pow)) + params.epsilon

        # First-moment (momentum) accumulator; skipped when beta1 == 0.
        if params.beta1 != 0.0:
            m_update = tf.assign(
                slots["adam_m"], slots["adam_m"] * params.beta1 + grad_var *
                (1.0 - params.beta1))
            with tf.control_dependencies([m_update]):
                grad_var = slots["adam_m"]

        # Apply the update in dequantized space, then re-quantize.
        subtrahend = lrate * grad_var / denom
        new_val = _quantize(_dequantize(var, params) - subtrahend, params)
        return tf.assign(var, new_val)
Example 2
  def update_variable(self, var, grad_var):
    """Apply one Adam-style update to `var` and its slot accumulators.

    The variable is stored quantized: it is decoded with `_dequantize`,
    updated, re-encoded with `_quantize`, and assigned back.

    Args:
      var: the (quantized) variable to update.
      grad_var: gradient tensor for `var`.

    Returns:
      The `tf.assign` op that writes the new quantized value to `var`.
    """
    params = self.params
    # 1-based step count so the warmup/decay formulas are well-defined on
    # the very first update.
    global_step = tf.to_float(self.global_step) + 1

    # Compute the learning rate for this step.
    lrate = params.learning_rate
    if params.learning_rate_decay_scheme == "noam":
      # Noam schedule: linear warmup, then rsqrt decay.
      lrate *= tf.minimum(global_step * params.learning_rate_warmup_steps**-1.5,
                          global_step**-0.5)
    else:
      assert params.learning_rate_decay_scheme == "none"
      # BUG FIX: was `tf.minumum` (typo) — raised AttributeError at
      # graph-build time whenever this branch was taken.
      lrate *= tf.minimum(global_step / params.learning_rate_warmup_steps, 1.0)

    # Compute the second-moment denominator (bias-corrected, as in Adam).
    slots = params.slots[var.op.name]
    grad_squared = tf.square(grad_var)
    beta2_pow = tf.pow(params.beta2, global_step)
    if params.factored_second_moment_accumulator and len(var.shape) == 2:
      # Factored accumulator (Adafactor-style): keep only per-row and
      # per-column means of the squared gradients instead of the full
      # matrix, saving memory for 2-D variables.
      vr_update = tf.assign(slots["adam_vr"], slots["adam_vr"] * params.beta2 +
                            tf.reduce_mean(grad_squared, 1, keep_dims=True) *
                            (1.0 - params.beta2))
      vc_update = tf.assign(slots["adam_vc"], slots["adam_vc"] * params.beta2 +
                            tf.reduce_mean(grad_squared, 0, keep_dims=True) *
                            (1.0 - params.beta2))
      with tf.control_dependencies([vr_update, vc_update]):
        vr = tf.sqrt(slots["adam_vr"] / (1.0 - beta2_pow)) + params.epsilon
        vc = tf.sqrt(slots["adam_vc"] / (1.0 - beta2_pow)) + params.epsilon
        # Normalize the column factor so the row factor carries the overall
        # scale; `vr * vc` then approximates the full matrix.
        vc /= tf.reduce_mean(vc)
        denom = vr * vc
    else:
      # Standard (unfactored) Adam second-moment accumulator.
      v_update = tf.assign(slots["adam_v"],
                           slots["adam_v"] * params.beta2 + grad_squared *
                           (1.0 - params.beta2))
      with tf.control_dependencies([v_update]):
        denom = tf.sqrt(slots["adam_v"] / (1.0 - beta2_pow)) + params.epsilon

    # First-moment (momentum) accumulator; skipped when beta1 == 0.
    if params.beta1 != 0.0:
      m_update = tf.assign(slots["adam_m"],
                           slots["adam_m"] * params.beta1 + grad_var *
                           (1.0 - params.beta1))
      with tf.control_dependencies([m_update]):
        grad_var = slots["adam_m"]

    # Apply the update in dequantized space, then re-quantize.
    subtrahend = lrate * grad_var / denom
    new_val = _quantize(_dequantize(var, params) - subtrahend, params)
    return tf.assign(var, new_val)