def optimize(loss, params, trainable_variables=None):
  """Minimizes the loss.

  Args:
    loss: The loss to minimize.
    params: A dictionary of hyperparameters.
    trainable_variables: List of variables to optimize, or ``None`` to use all
      trainable variables.

  Returns:
    The loss minimization op.
  """
  global_step = tf.train.get_or_create_global_step()
  decay_type = params.get("decay_type")
  if decay_type is not None:
    decay_fn = learning_rate_decay_fn(
        decay_type,
        params["decay_rate"],
        params["decay_steps"],
        decay_step_duration=params.get("decay_step_duration", 1),
        staircase=params.get("staircase", True),
        start_decay_steps=params.get("start_decay_steps", 0),
        minimum_learning_rate=params.get("minimum_learning_rate", 0))
  else:
    decay_fn = None

  learning_rate = float(params["learning_rate"])
  tf.logging.info("learning_rate: %s", learning_rate)

  clip_gradients = params.get("clip_gradients")
  if clip_gradients is not None:
    clip_gradients = float(clip_gradients)
  tf.logging.info("clip_gradients: %s", clip_gradients)

  optimizer_class = get_optimizer_class(params["optimizer"])
  optimizer_params = params.get("optimizer_params", {})
  if optimizer_class.__name__ == "AdafactorOptimizer":
    optimizer = optimizers.get_adafactor_optimizer_from_params(
        optimizer_class, optimizer_params)
  else:
    optimizer = lambda lr: optimizer_class(lr, **optimizer_params)

  regularization = params.get("regularization")
  if regularization is not None:
    loss += regularization_penalty(regularization["type"], regularization["scale"])

  return tf.contrib.layers.optimize_loss(
      loss,
      global_step,
      learning_rate,
      optimizer,
      clip_gradients=clip_gradients,
      learning_rate_decay_fn=decay_fn,
      variables=trainable_variables,
      name="optim",
      summaries=[
          "learning_rate",
          "global_gradient_norm",
      ],
      colocate_gradients_with_ops=True)
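
# Illustrative only: a hypothetical ``params`` dictionary covering the
# hyperparameter keys read by the ``optimize`` version above. The key names are
# taken from the lookups in the function; the values are placeholders, not
# recommended settings.
example_params = {
    "optimizer": "AdamOptimizer",
    "optimizer_params": {},
    "learning_rate": 2.0,
    "decay_type": "noam_decay",
    "decay_rate": 512,
    "decay_steps": 8000,
    "clip_gradients": 5.0,
}
# train_op = optimize(loss, example_params)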
def optimize(loss, params, mixed_precision=False):
  """Minimizes the loss.

  Args:
    loss: The loss to minimize.
    params: A dictionary of hyperparameters.
    mixed_precision: If ``True``, wraps the optimizer to maintain a float32
      copy of the weights.

  Returns:
    The loss minimization op.
  """
  regularization = params.get("regularization")
  if regularization is not None:
    loss += regularization_penalty(regularization["type"], regularization["scale"])

  global_step = tf.train.get_or_create_global_step()

  with tf.variable_scope("optim"):
    # Learning rate.
    learning_rate = tf.get_variable(
        "learning_rate", [],
        trainable=False,
        initializer=tf.constant_initializer(float(params["learning_rate"])))
    if "decay_type" in params:
      decay_fn = learning_rate_decay_fn(
          params["decay_type"],
          params["decay_rate"],
          params["decay_steps"],
          decay_step_duration=params.get("decay_step_duration", 1),
          staircase=params.get("staircase", True),
          start_decay_steps=params.get("start_decay_steps", 0),
          minimum_learning_rate=params.get("minimum_learning_rate", 0))
      learning_rate = decay_fn(learning_rate, global_step)
    tf.summary.scalar("learning_rate", learning_rate)

    # Optimizer.
    optimizer_class = get_optimizer_class(params["optimizer"])
    optimizer_params = params.get("optimizer_params", {})
    if optimizer_class.__name__ == "AdafactorOptimizer":
      optimizer = optimizers.get_adafactor_optimizer_from_params(
          optimizer_class, optimizer_params, learning_rate=learning_rate)
    else:
      optimizer = optimizer_class(learning_rate, **optimizer_params)
    if mixed_precision:
      optimizer = optimizers.MixedPrecisionOptimizerWrapper(
          optimizer, loss_scale=get_loss_scale_from_params(params))

    # Gradients.
    gradients = optimizer.compute_gradients(loss, colocate_gradients_with_ops=True)
    _summarize_gradients_norm("global_norm/gradient_norm", gradients)
    if "clip_gradients" in params:
      gradients = _clip_gradients_by_norm(gradients, float(params["clip_gradients"]))
      _summarize_gradients_norm("global_norm/clipped_gradient_norm", gradients)

    return optimizer.apply_gradients(gradients, global_step=global_step)
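
# The two gradient helpers used above are not shown in this snippet. Below is a
# minimal sketch of what they might look like, assuming ``gradients`` is the
# list of (gradient, variable) pairs returned by ``compute_gradients``; it is
# not necessarily the project's actual implementation.

import tensorflow as tf

def _summarize_gradients_norm(name, gradients):
  """Adds a scalar summary with the global norm of the gradients (sketch)."""
  tf.summary.scalar(
      name, tf.global_norm([grad for grad, _ in gradients if grad is not None]))

def _clip_gradients_by_norm(gradients, clip_norm):
  """Clips gradients by global norm, preserving the (gradient, variable) pairing (sketch)."""
  grads, variables = zip(*gradients)
  clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm)
  return list(zip(clipped_grads, variables))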
def optimize(loss, params, mixed_precision=False):
  """Minimizes the loss.

  Args:
    loss: The loss to minimize.
    params: A dictionary of hyperparameters.
    mixed_precision: If ``True``, wraps the optimizer to maintain a float32
      copy of the weights.

  Returns:
    The loss minimization op.
  """
  global_step = tf.train.get_or_create_global_step()
  decay_type = params.get("decay_type")
  if decay_type is not None:
    decay_fn = learning_rate_decay_fn(
        decay_type,
        params["decay_rate"],
        params["decay_steps"],
        decay_step_duration=params.get("decay_step_duration", 1),
        staircase=params.get("staircase", True),
        start_decay_steps=params.get("start_decay_steps", 0),
        minimum_learning_rate=params.get("minimum_learning_rate", 0))
  else:
    decay_fn = None

  learning_rate = float(params["learning_rate"])
  clip_gradients = params.get("clip_gradients")
  if clip_gradients is not None:
    clip_gradients = float(clip_gradients)

  optimizer_class = get_optimizer_class(params["optimizer"])
  optimizer_params = params.get("optimizer_params", {})
  if optimizer_class.__name__ == "AdafactorOptimizer":
    optimizer = optimizers.get_adafactor_optimizer_from_params(
        optimizer_class, optimizer_params)
  else:
    optimizer = lambda lr: optimizer_class(lr, **optimizer_params)
  if mixed_precision:
    optimizer_fn = lambda lr: optimizers.MixedPrecisionOptimizerWrapper(
        optimizer(lr), loss_scale=get_loss_scale_from_params(params))
  else:
    optimizer_fn = optimizer

  regularization = params.get("regularization")
  if regularization is not None:
    loss += regularization_penalty(regularization["type"], regularization["scale"])

  return tf.contrib.layers.optimize_loss(
      loss,
      global_step,
      learning_rate,
      optimizer_fn,
      clip_gradients=clip_gradients,
      learning_rate_decay_fn=decay_fn,
      name="optim",
      summaries=[
          "learning_rate",
          "global_gradient_norm",
      ],
      colocate_gradients_with_ops=True)
def optimize_loss(loss, params, mixed_precision=False, var_list=None, hvd=None):
  """Minimizes the loss.

  Args:
    loss: The loss to minimize.
    params: A dictionary of hyperparameters.
    mixed_precision: If ``True``, wraps the optimizer to maintain a float32
      copy of the weights.
    var_list: The variables to update.
    hvd: Optional Horovod object.

  Returns:
    The loss minimization op and a list of internal variables to initialize.
  """
  regularization = params.get("regularization")
  if regularization is not None:
    loss += regularization_penalty(
        regularization["type"], regularization["scale"], weights_list=var_list)

  global_step = tf.train.get_or_create_global_step()

  with tf.variable_scope("optim"):
    learning_rate = tf.constant(params["learning_rate"], dtype=tf.float32)
    if params.get("decay_type") is not None:
      decay_params = params.get("decay_params", {})
      if "decay_rate" in params:
        # Backward compatibility: fill params from previous options.
        decay_params["decay_rate"] = params["decay_rate"]
        decay_params["decay_steps"] = params["decay_steps"]
        decay_params["staircase"] = params.get("staircase", True)
      decay_fn = learning_rate_decay_fn_v2(
          params["decay_type"],
          decay_params=decay_params,
          decay_step_duration=params.get("decay_step_duration", 1),
          start_decay_step=params.get("start_decay_steps", 0),
          minimum_learning_rate=params.get("minimum_learning_rate", 0))
      learning_rate = decay_fn(learning_rate, global_step)
    tf.summary.scalar("learning_rate", learning_rate)

    # Optimizer.
    optimizer_class = get_optimizer_class(params["optimizer"])
    optimizer_params = params.get("optimizer_params", {})
    if optimizer_class.__name__ == "AdafactorOptimizer":
      optimizer = optimizers.get_adafactor_optimizer_from_params(
          optimizer_class, optimizer_params, learning_rate=learning_rate)
    else:
      weight_decay = params.get("weight_decay")
      if weight_decay is not None:
        optimizer_class = tf.contrib.opt.extend_with_decoupled_weight_decay(optimizer_class)
        optimizer = optimizer_class(weight_decay, learning_rate=learning_rate, **optimizer_params)
      else:
        optimizer = optimizer_class(learning_rate, **optimizer_params)
    if mixed_precision:
      from opennmt.optimizers.mixed_precision_wrapper import get_loss_scale_from_params
      optimizer = optimizers.MixedPrecisionOptimizerWrapper(
          optimizer, loss_scale=get_loss_scale_from_params(params))
    if hvd is not None:
      from opennmt.optimizers.distributed_optimizer import DistributedOptimizer
      optimizer = DistributedOptimizer.from_params(optimizer, params=params.get("horovod"))

    # Gradients.
    var_list = _get_trainable_variables(
        var_list=var_list, freeze_variables=params.get("freeze_variables"))
    gradients = optimizer.compute_gradients(
        loss, var_list=var_list, colocate_gradients_with_ops=True)
    _summarize_gradients_norm("global_norm/gradient_norm", gradients)
    if params.get("clip_gradients") is not None:
      gradients = _clip_gradients_by_norm(gradients, float(params["clip_gradients"]))
      _summarize_gradients_norm("global_norm/clipped_gradient_norm", gradients)

    return delayed_update(
        optimizer,
        gradients,
        global_step,
        accum_count=params.get("gradients_accum", 1))
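
# Illustrative only: a hypothetical ``params`` dictionary exercising the newer
# options read by the ``optimize_loss`` version above (``decay_params``,
# ``weight_decay``, ``gradients_accum``, ``freeze_variables``). Key names come
# from the lookups in the function; the values are placeholders, and
# ``freeze_variables`` is assumed here to hold variable-name patterns.
example_params = {
    "optimizer": "AdamOptimizer",
    "learning_rate": 0.0002,
    "decay_type": "exponential_decay",
    "decay_params": {"decay_rate": 0.9, "decay_steps": 10000, "staircase": True},
    "clip_gradients": 1.0,
    "weight_decay": 0.01,             # enables decoupled weight decay
    "gradients_accum": 4,             # accumulate gradients over 4 steps
    "freeze_variables": ["encoder"],  # assumed: patterns of variables to exclude
}
# Per the docstring, this version returns the train op and extra variables to initialize:
# train_op, extra_variables = optimize_loss(loss, example_params)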
def optimize_loss(loss, params, mixed_precision=False, var_list=None):
  """Minimizes the loss.

  Args:
    loss: The loss to minimize.
    params: A dictionary of hyperparameters.
    mixed_precision: If ``True``, wraps the optimizer to maintain a float32
      copy of the weights.
    var_list: The variables to update. TODO: var_list is not passed from
      model.py; is it assumed to be collected from the scope?

  Returns:
    The loss minimization op and a list of internal variables to initialize.
  """
  regularization = params.get("regularization")
  if regularization is not None:
    loss += regularization_penalty(
        regularization["type"], regularization["scale"], weights_list=var_list)  # TODO

  global_step = tf.train.get_or_create_global_step()

  with tf.variable_scope("optim"):
    # Learning rate.
    learning_rate = tf.get_variable(
        "learning_rate", [],
        trainable=False,
        initializer=tf.constant_initializer(float(params["learning_rate"])))
    if params.get("decay_type") is not None:
      decay_params = params.get("decay_params", {})
      if "decay_rate" in params:
        # Backward compatibility: fill params from previous options.
        decay_params["decay_rate"] = params["decay_rate"]
        decay_params["decay_steps"] = params["decay_steps"]
        decay_params["staircase"] = params.get("staircase", True)
      decay_fn = learning_rate_decay_fn_v2(
          params["decay_type"],
          decay_params=decay_params,
          decay_step_duration=params.get("decay_step_duration", 1),
          start_decay_step=params.get("start_decay_steps", 0),
          minimum_learning_rate=params.get("minimum_learning_rate", 0))
      learning_rate = decay_fn(learning_rate, global_step)
    tf.summary.scalar("learning_rate", learning_rate)

    # Optimizer.
    optimizer_class = get_optimizer_class(params["optimizer"])
    optimizer_params = params.get("optimizer_params", {})
    if optimizer_class.__name__ == "AdafactorOptimizer":
      optimizer = optimizers.get_adafactor_optimizer_from_params(
          optimizer_class, optimizer_params, learning_rate=learning_rate)
    else:
      optimizer = optimizer_class(learning_rate, **optimizer_params)
    if mixed_precision:
      optimizer = optimizers.MixedPrecisionOptimizerWrapper(
          optimizer, loss_scale=get_loss_scale_from_params(params))

    # Gradients.
    # gradients = optimizer.compute_gradients(loss, colocate_gradients_with_ops=True)
    gradients = optimizer.compute_gradients(
        loss, var_list=var_list, colocate_gradients_with_ops=True)  # TODO: which var_list?

    # NEW: planned "freeze" support, currently disabled and kept as a sketch.
    # freeze_params = params.get("freeze")
    # if freeze_params is not None:
    #   tf.logging.info("Optimizing selected network components: %s", freeze_params)
    #   # TODO: build the variable list to update before backprop, e.g. in a
    #   # dedicated scope as done for the learning rate, or from a separate list.
    #   # freeze_update_op() only selects the variables to update among the
    #   # trainable variables.
    #   variable_list = freeze_update_op(freeze_params)
    #   tf.logging.info("Parameters being optimized: %s", variable_list)
    #   gradients = optimizer.compute_gradients(
    #       loss, var_list=variable_list, colocate_gradients_with_ops=True)
    # else:
    #   tf.logging.info("Optimizing all network components normally.")
    #   gradients = optimizer.compute_gradients(loss, colocate_gradients_with_ops=True)

    _summarize_gradients_norm("global_norm/gradient_norm", gradients)
    if params.get("clip_gradients") is not None:
      gradients = _clip_gradients_by_norm(gradients, float(params["clip_gradients"]))
      _summarize_gradients_norm("global_norm/clipped_gradient_norm", gradients)

    return delayed_update(
        optimizer,
        gradients,
        global_step,
        accum_count=params.get("gradients_accum", 1))
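
# The disabled "freeze" sketch above needs a way to build the list of variables
# to update. A minimal sketch of what a helper like ``_get_trainable_variables``
# (used in the Horovod-enabled version) might look like, assuming
# ``freeze_variables`` is a list of variable-name patterns to exclude; the
# matching rule is an assumption, not the project's actual implementation.

import re
import tensorflow as tf

def _get_trainable_variables(var_list=None, freeze_variables=None):
  """Returns the variables to optimize, dropping those matching a freeze pattern (sketch)."""
  if var_list is None:
    var_list = tf.trainable_variables()
  if not freeze_variables:
    return var_list
  if not isinstance(freeze_variables, list):
    freeze_variables = [freeze_variables]
  patterns = [re.compile(pattern) for pattern in freeze_variables]
  kept = []
  for variable in var_list:
    if any(pattern.search(variable.name) for pattern in patterns):
      tf.logging.info("Freezing variable %s", variable.name)
    else:
      kept.append(variable)
  return kept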