def __init__(self, optimizer_name, lr, hparams, use_tpu=False):
  if optimizer_name == "Adam" and use_tpu:
    # LazyAdamOptimizer does not work on TPU.
    optimizer_name = "TrueAdam"

  tf.logging.info("Using optimizer %s", optimizer_name)

  if optimizer_name == "Adam":
    # We change the default epsilon for Adam and re-scale lr.
    # Using LazyAdam as it's much faster for large vocabulary embeddings.
    self._opt = tf.contrib.opt.LazyAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "Momentum":
    self._opt = tf.train.MomentumOptimizer(
        lr,
        momentum=hparams.optimizer_momentum_momentum,
        use_nesterov=hparams.optimizer_momentum_nesterov)
  elif optimizer_name == "YellowFin":
    self._opt = yellowfin.YellowFinOptimizer(
        learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
  elif optimizer_name == "TrueAdam":
    self._opt = tf.train.AdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "Adafactor":
    self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
  else:
    self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
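# The "Adam" branch above prefers LazyAdam because, with sparse gradients over a
# large embedding table, only the rows that actually received gradient need to
# be updated. The sketch below is a minimal, simplified illustration of that
# difference using NumPy (bias correction is omitted); it is not the
# LazyAdamOptimizer implementation, and all names here are illustrative.
import numpy as np

def dense_adam_step(params, m, v, grad_rows, grad_vals,
                    lr=0.001, beta1=0.9, beta2=0.997, eps=1e-9):
  # Dense update: scatter the sparse gradient into a full matrix, then touch
  # every row of m, v and params, even rows whose gradient is zero.
  g = np.zeros_like(params)
  g[grad_rows] = grad_vals
  m[:] = beta1 * m + (1 - beta1) * g
  v[:] = beta2 * v + (1 - beta2) * g * g
  params -= lr * m / (np.sqrt(v) + eps)

def lazy_adam_step(params, m, v, grad_rows, grad_vals,
                   lr=0.001, beta1=0.9, beta2=0.997, eps=1e-9):
  # Lazy update: only the rows that received gradient are read and written,
  # which is much cheaper when the vocabulary is large and batches are small.
  m[grad_rows] = beta1 * m[grad_rows] + (1 - beta1) * grad_vals
  v[grad_rows] = beta2 * v[grad_rows] + (1 - beta2) * grad_vals * grad_vals
  params[grad_rows] -= lr * m[grad_rows] / (np.sqrt(v[grad_rows]) + eps)

# Example: an 8-row embedding table where only rows 1 and 5 get gradient.
vocab, dim = 8, 4
params = np.ones((vocab, dim))
m, v = np.zeros_like(params), np.zeros_like(params)
lazy_adam_step(params, m, v, grad_rows=[1, 5], grad_vals=np.ones((2, dim)))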
def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disable=super-init-not-called
  tf.logging.info("Using optimizer %s", optimizer_name)

  mlperf_log.transformer_print(key=mlperf_log.OPT_NAME, value=optimizer_name)
  mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                               value=hparams.optimizer_adam_beta1)
  mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                               value=hparams.optimizer_adam_beta2)
  mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                               value=hparams.optimizer_adam_epsilon)

  if optimizer_name == "Adam":
    # We change the default epsilon for Adam.
    # Using LazyAdam as it's much faster for large vocabulary embeddings.
    self._opt = tf.contrib.opt.LazyAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "MultistepAdam":
    self._opt = multistep_optimizer.MultistepAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon,
        n=hparams.optimizer_multistep_accumulate_steps)
  elif optimizer_name == "Momentum":
    self._opt = tf.train.MomentumOptimizer(
        lr,
        momentum=hparams.optimizer_momentum_momentum,
        use_nesterov=hparams.optimizer_momentum_nesterov)
  elif optimizer_name == "YellowFin":
    self._opt = yellowfin.YellowFinOptimizer(
        learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
  elif optimizer_name == "TrueAdam":
    self._opt = tf.train.AdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "AdamW":
    # OpenAI GPT used weight decay.
    # Given the internals of AdamW, a weight decay that depends on the
    # learning rate is chosen here to match the OpenAI implementation.
    # The weight decay update to each parameter is applied before the Adam
    # gradient computation, which differs from what is described in the
    # paper and in the OpenAI implementation:
    # https://arxiv.org/pdf/1711.05101.pdf
    self._opt = tf.contrib.opt.AdamWOptimizer(
        0.01 * lr,
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "Adafactor":
    self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
  else:
    self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
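# The "AdamW" branch ties the weight decay coefficient to the learning rate
# (0.01 * lr). The sketch below shows the decoupled weight-decay idea in
# isolation, assuming the decay is applied to the weights before the Adam step
# as the comment above describes. It is a minimal NumPy illustration, not the
# tf.contrib.opt.AdamWOptimizer internals; all names are hypothetical.
import numpy as np

def adamw_like_step(w, g, m, v, t, lr, beta1=0.9, beta2=0.997, eps=1e-9):
  wd = 0.01 * lr                       # weight decay scaled with the learning rate
  w -= wd * w                          # decoupled decay, applied directly to the weights
  m[:] = beta1 * m + (1 - beta1) * g   # first-moment estimate
  v[:] = beta2 * v + (1 - beta2) * g * g
  m_hat = m / (1 - beta1 ** t)         # bias correction
  v_hat = v / (1 - beta2 ** t)
  w -= lr * m_hat / (np.sqrt(v_hat) + eps)
  return w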
def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disable=super-init-not-called
  tf.logging.info("Using optimizer %s", optimizer_name)

  mlperf_log.transformer_print(key=mlperf_log.OPT_NAME, value=optimizer_name)
  mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                               value=hparams.optimizer_adam_beta1)
  mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                               value=hparams.optimizer_adam_beta2)
  mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                               value=hparams.optimizer_adam_epsilon)

  if optimizer_name == "Adam":
    # We change the default epsilon for Adam.
    # Using LazyAdam as it's much faster for large vocabulary embeddings.
    self._opt = tf.contrib.opt.LazyAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "MultistepAdam":
    self._opt = multistep_optimizer.MultistepAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon,
        n=hparams.optimizer_multistep_accumulate_steps)
  elif optimizer_name == "Momentum":
    self._opt = tf.train.MomentumOptimizer(
        lr,
        momentum=hparams.optimizer_momentum_momentum,
        use_nesterov=hparams.optimizer_momentum_nesterov)
  elif optimizer_name == "YellowFin":
    self._opt = yellowfin.YellowFinOptimizer(
        learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
  elif optimizer_name == "TrueAdam":
    self._opt = tf.train.AdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "Adafactor":
    self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
  else:
    self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
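# The "MultistepAdam" branch accumulates gradients over n mini-batches
# (optimizer_multistep_accumulate_steps) and applies a single update,
# simulating a larger effective batch. The class below is a rough,
# self-contained sketch of that accumulation pattern, not the
# multistep_optimizer.MultistepAdamOptimizer implementation; the
# `base_optimizer_step` callback is a hypothetical stand-in for a real
# Adam (or any other) update.
import numpy as np

class GradientAccumulator(object):
  """Accumulates gradients and applies one update every n calls."""

  def __init__(self, n, base_optimizer_step):
    self.n = n
    self.base_optimizer_step = base_optimizer_step
    self.step = 0
    self.acc = None

  def apply(self, params, grad):
    if self.acc is None:
      self.acc = np.zeros_like(grad)
    self.acc += grad
    self.step += 1
    if self.step % self.n == 0:
      # Apply the averaged gradient once per n mini-batches.
      self.base_optimizer_step(params, self.acc / self.n)
      self.acc[:] = 0.0

# Example with a plain SGD step as the base update.
def sgd_step(params, grad, lr=0.1):
  params -= lr * grad

accumulator = GradientAccumulator(n=4, base_optimizer_step=sgd_step)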
def adafactor(learning_rate, hparams):
  return adafactor_lib.adafactor_optimizer_from_hparams(hparams, learning_rate)
def register_adafactor(learning_rate, hparams):
  return adafactor.adafactor_optimizer_from_hparams(hparams, learning_rate)
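# The two small factories above exist so that an optimizer name can be resolved
# to a constructor at runtime. The snippet below sketches that name-to-factory
# registration pattern in generic form; the registry, decorator, and returned
# tuple are all illustrative and are not the tensor2tensor registry API.
_OPTIMIZER_FACTORIES = {}

def register(name):
  """Hypothetical decorator that records an optimizer factory under a name."""
  def decorator(fn):
    _OPTIMIZER_FACTORIES[name] = fn
    return fn
  return decorator

@register("adafactor")
def make_adafactor(learning_rate, hparams):
  # In the real code this would delegate to adafactor_optimizer_from_hparams.
  return ("adafactor", learning_rate, hparams)

def get_optimizer(name, learning_rate, hparams):
  return _OPTIMIZER_FACTORIES[name](learning_rate, hparams)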
def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disable=super-init-not-called
  tf.logging.info("Using optimizer %s", optimizer_name)

  mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                               value=optimizer_name,
                               hparams=hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.OPT_HP_ADAM_BETA1,
      value=hparams.optimizer_adam_beta1,
      hparams=hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.OPT_HP_ADAM_BETA2,
      value=hparams.optimizer_adam_beta2,
      hparams=hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.OPT_HP_ADAM_EPSILON,
      value=hparams.optimizer_adam_epsilon,
      hparams=hparams)

  if optimizer_name == "Adam":
    # We change the default epsilon for Adam.
    # Using LazyAdam as it's much faster for large vocabulary embeddings.
    self._opt = tf.contrib.opt.LazyAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "MultistepAdam":
    self._opt = multistep_optimizer.MultistepAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon,
        n=hparams.optimizer_multistep_accumulate_steps)
  elif optimizer_name == "Momentum":
    self._opt = tf.train.MomentumOptimizer(
        lr,
        momentum=hparams.optimizer_momentum_momentum,
        use_nesterov=hparams.optimizer_momentum_nesterov)
  elif optimizer_name == "YellowFin":
    self._opt = yellowfin.YellowFinOptimizer(
        learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
  elif optimizer_name == "TrueAdam":
    self._opt = tf.train.AdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "AdamW":
    # OpenAI GPT used weight decay.
    # Given the internals of AdamW, a weight decay that depends on the
    # learning rate is chosen here to match the OpenAI implementation.
    # The weight decay update to each parameter is applied before the Adam
    # gradient computation, which differs from what is described in the
    # paper and in the OpenAI implementation:
    # https://arxiv.org/pdf/1711.05101.pdf
    self._opt = tf.contrib.opt.AdamWOptimizer(
        0.01 * lr,
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "Adafactor":
    self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
  else:
    self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disable=super-init-not-called
  tf.logging.info("Using optimizer %s", optimizer_name)

  mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                               value=optimizer_name,
                               hparams=hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.OPT_HP_ADAM_BETA1,
      value=hparams.optimizer_adam_beta1,
      hparams=hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.OPT_HP_ADAM_BETA2,
      value=hparams.optimizer_adam_beta2,
      hparams=hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.OPT_HP_ADAM_EPSILON,
      value=hparams.optimizer_adam_epsilon,
      hparams=hparams)

  if optimizer_name == "Adam":
    # We change the default epsilon for Adam.
    # Using LazyAdam as it's much faster for large vocabulary embeddings.
    self._opt = tf.contrib.opt.LazyAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "MultistepAdam":
    self._opt = multistep_optimizer.MultistepAdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon,
        n=hparams.optimizer_multistep_accumulate_steps)
  elif optimizer_name == "Momentum":
    self._opt = tf.train.MomentumOptimizer(
        lr,
        momentum=hparams.optimizer_momentum_momentum,
        use_nesterov=hparams.optimizer_momentum_nesterov)
  elif optimizer_name == "YellowFin":
    self._opt = yellowfin.YellowFinOptimizer(
        learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
  elif optimizer_name == "TrueAdam":
    self._opt = tf.train.AdamOptimizer(
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "AdamW":
    # OpenAI GPT used weight decay.
    # Given the internals of AdamW, a weight decay that depends on the
    # learning rate is chosen here to match the OpenAI implementation.
    # The weight decay update to each parameter is applied before the Adam
    # gradient computation, which differs from what is described in the
    # paper and in the OpenAI implementation:
    # https://arxiv.org/pdf/1711.05101.pdf
    self._opt = tf.contrib.opt.AdamWOptimizer(
        0.01 * lr,
        lr,
        beta1=hparams.optimizer_adam_beta1,
        beta2=hparams.optimizer_adam_beta2,
        epsilon=hparams.optimizer_adam_epsilon)
  elif optimizer_name == "Adafactor":
    self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
  else:
    self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)

  if _mixed_precision_is_enabled(hparams):
    if not hparams.mixed_precision_optimizer_loss_scaler:
      tf.logging.warning("Using mixed precision without a loss scaler will "
                         "likely cause numerical errors.")
    elif hparams.mixed_precision_optimizer_loss_scaler != "exponential":
      raise ValueError("Mixed precision training only supports the "
                       "exponential loss scaler")
    else:
      tf.logging.info("Using Exponential Update Loss Scaler")
      manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
          init_loss_scale=2**15,
          incr_every_n_steps=2000,
          decr_every_n_nan_or_inf=2,
          incr_ratio=2,
          decr_ratio=0.5)
      self._opt = LossScaleOptimizer(self._opt, manager)

  self._zero_grads = hparams.optimizer_zero_grads
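# The mixed-precision branch above wraps the optimizer in a loss-scale
# optimizer driven by an exponential-update manager: the loss is multiplied by
# a large factor before backprop so small fp16 gradients do not underflow, the
# scale grows after a run of finite steps, and it shrinks when NaN/Inf
# gradients appear. The class below is a minimal sketch of that update rule
# with the same constants; it is illustrative only, not the
# tf.contrib.mixed_precision implementation.
class ExponentialLossScaler(object):
  """Tracks a dynamic loss scale for mixed-precision training (sketch)."""

  def __init__(self, init_loss_scale=2**15, incr_every_n_steps=2000,
               decr_every_n_nan_or_inf=2, incr_ratio=2.0, decr_ratio=0.5):
    self.loss_scale = float(init_loss_scale)
    self.incr_every_n_steps = incr_every_n_steps
    self.decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
    self.incr_ratio = incr_ratio
    self.decr_ratio = decr_ratio
    self.good_steps = 0
    self.bad_steps = 0

  def update(self, grads_are_finite):
    """Returns True if the (unscaled) gradients should be applied this step."""
    if grads_are_finite:
      self.good_steps += 1
      self.bad_steps = 0
      if self.good_steps >= self.incr_every_n_steps:
        self.loss_scale *= self.incr_ratio     # grow after a run of clean steps
        self.good_steps = 0
    else:
      self.bad_steps += 1
      self.good_steps = 0
      if self.bad_steps >= self.decr_every_n_nan_or_inf:
        self.loss_scale = max(1.0, self.loss_scale * self.decr_ratio)
        self.bad_steps = 0
    return grads_are_finite                    # skip the update on NaN/Inf gradients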