def mean_baseline(_, loss):
  """Returns a bias-corrected exponential-moving-average baseline of the loss.

  The EMA tracks the reduced (mean) loss; the returned baseline divides the
  running average by a bias-correction term so early estimates are not biased
  toward the EMA's zero initialization. Uses `name` and `ema_decay` from the
  enclosing scope.
  """
  with vs.variable_scope(name, default_name="MeanBaseline"):
    mean_loss = math_ops.reduce_mean(loss)

    ema = training.ExponentialMovingAverage(decay=ema_decay)
    apply_op = ema.apply([mean_loss])

    # The bias correction needs the number of EMA updates performed so far.
    # A dedicated local counter is used because the global step may or may
    # not track exactly how many times the EMA has been updated.
    avg_var = ema.average(mean_loss)
    assert avg_var is not None
    with ops.colocate_with(avg_var):
      step_count = vs.get_variable(
          "local_ema_step", initializer=0, trainable=False)
    step_count = step_count.assign_add(1)
    correction = 1. - math_ops.pow(
        ema_decay, math_ops.cast(step_count, mean_loss.dtype))

    # Dividing inside the control-dependency context creates an op there,
    # which forces the EMA update to run whenever the baseline is evaluated.
    with ops.control_dependencies([apply_op]):
      return ema.average(mean_loss) / correction
def mean_baseline(_, loss):
  """Returns an exponential-moving-average baseline of the mean loss.

  Uses `name` and `ema_decay` from the enclosing scope.

  Fixes over the previous revision:
  - `ExponentialMovingAverage.apply` expects a *list* of tensors; it was
    given a bare tensor.
  - `ema.average` must be queried with the same tensor that was applied;
    it was queried with the raw `loss` while the reduced mean was applied,
    which returns None.
  - `ops.control_dependencies` is the correct API (not
    `control_flow_ops.control_dependencies`).
  - `array_ops.identity` creates an op inside the control-dependency
    context; without it no op is created there and the update never runs.
  """
  with ops.name_scope(name):
    reduced_loss = math_ops.reduce_mean(loss)
    ema = training.ExponentialMovingAverage(decay=ema_decay)
    update_op = ema.apply([reduced_loss])
    with ops.control_dependencies([update_op]):
      # TODO(rsepassi): Possibly implement the initialization bias correction
      # term from Adam (section 3 of https://arxiv.org/pdf/1412.6980v8.pdf).
      baseline = array_ops.identity(ema.average(reduced_loss))
    return baseline
def mean_baseline(_, loss):
  """Returns an exponential-moving-average baseline of the mean loss.

  Uses `name` and `ema_decay` from the enclosing scope.
  """
  with vs.variable_scope(name, default_name="MeanBaseline"):
    mean_loss = math_ops.reduce_mean(loss)
    moving_avg = training.ExponentialMovingAverage(decay=ema_decay)
    apply_op = moving_avg.apply([mean_loss])
    with ops.control_dependencies([apply_op]):
      # `identity` causes an op to be added inside this context, which
      # triggers the update. Removing the `identity` means nothing is
      # ever updated.
      return array_ops.identity(moving_avg.average(mean_loss))
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training
        step.
    optimizer: string, class or optimizer instance, used as trainer.
        string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'.
            Full list in OPTIMIZER_CLS_NAMES constant.
        class should be sub-class of tf.Optimizer that implements
            `compute_gradients` and `apply_gradients` functions.
        optimizer instance should be instantiation of tf.Optimizer sub-class
            and have `compute_gradients` and `apply_gradients` functions.
    clip_gradients: float or None, clips gradients by this global-norm value.
    moving_average_decay: float or None, takes into account previous loss
        to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes learning_rate and global_step
        Tensors, returns Tensor. Can be used to implement any learning rate
        decay functions. For example: tf.train.exponential_decay.
    variables: list of variables to optimize or None (all trainable are
        used).

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  # Moving average of the loss with decay.
  if moving_average_decay is not None:
    # Generate moving averages of the loss.
    loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                   name="avg")
    loss_averages_op = loss_averages.apply([loss])
    logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
    # Make all downstream uses of `loss` depend on the EMA update having run.
    loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

  # Learning rate variable, with possible decay.
  if isinstance(learning_rate,
                ops.Tensor) and len(learning_rate.get_shape()) == 0:
    lr = learning_rate
  elif isinstance(learning_rate, float):
    lr = vs.get_variable(
        "learning_rate", [],
        trainable=False,
        initializer=init_ops.constant_initializer(learning_rate))
  else:
    raise ValueError("Learning rate should be 0d Tensor or float. Got %s" %
                     str(learning_rate))
  if learning_rate_decay_fn is not None:
    lr = learning_rate_decay_fn(lr, global_step)

  # Create optimizer, given specified parameters.
  if isinstance(optimizer, six.string_types):
    if optimizer not in OPTIMIZER_CLS_NAMES:
      raise ValueError(
          "Optimizer name should be one of [%s], you provided %s." %
          (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
    opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
  elif isinstance(optimizer, type) and issubclass(optimizer,
                                                  optimizer_.Optimizer):
    opt = optimizer(learning_rate=lr)
  elif isinstance(optimizer, optimizer_.Optimizer):
    opt = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string, "
                     "subclass of Optimizer or instance of "
                     "subclass of Optimizer. Got %s." % str(optimizer))

  # All trainable variables, if specific variables are not specified.
  if variables is None:
    variables = vars_.trainable_variables()

  # Compute gradients and clip them if provided.
  gradients = opt.compute_gradients(loss, variables)
  if clip_gradients is not None:
    # Unzip the (gradient, variable) pairs: clip_by_global_norm takes a
    # flat list of gradients, then re-pair with the variables afterwards.
    gradients, variables = zip(*gradients)
    clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients,
                                                        clip_gradients)
    # `list` materializes the pairs so they can be iterated more than once
    # (the summary loop below and apply_gradients).
    gradients = list(zip(clipped_gradients, variables))

  # Add scalar summary for loss.
  logging_ops.scalar_summary("loss", loss)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in gradients:
    if isinstance(gradient, ops.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient
    # Variables without a gradient produce grad_values == None; skip them.
    if grad_values is not None:
      logging_ops.histogram_summary(variable.name, variable)
      logging_ops.histogram_summary(variable.name + "/gradients", grad_values)
      logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                    clip_ops.global_norm([grad_values]))

  # Create gradient updates.
  grad_updates = opt.apply_gradients(gradients,
                                     global_step=global_step,
                                     name="train")

  # Make sure total_loss is valid.
  final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

  # Ensure the train_tensor computes grad_updates.
  train_tensor = control_flow_ops.with_dependencies([grad_updates],
                                                    final_loss)

  return train_tensor
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  moving_average_decay=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training
        step.
    optimizer: string, class or optimizer instance, used as trainer.
        string should be name of optimizer, like 'SGD', 'Adam', 'Adagrad'.
            Full list in OPTIMIZER_CLS_NAMES constant.
        class should be sub-class of tf.Optimizer that implements
            `compute_gradients` and `apply_gradients` functions.
        optimizer instance should be instantiation of tf.Optimizer sub-class
            and have `compute_gradients` and `apply_gradients` functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by
        this value.
    gradient_multipliers: dict of variables or variable names to floats.
        If present, gradients for specified variables will be multiplied by
        given constant.
    clip_gradients: float or `None`, clips gradients by this value.
    moving_average_decay: Deprecated. float or None, takes into account
        previous loss to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
        `Tensor`s, returns `Tensor`. Can be used to implement any learning
        rate decay functions. For example: tf.train.exponential_decay.
    update_ops: list of update `Operation`s to execute at each step. If
        `None`, uses elements of UPDATE_OPS collection. The order of
        execution between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or `None` to use all trainable
        variables.
    name: The name for this operation is used to scope operations and
        summaries.
    summaries: List of internal quantities to visualize on tensorboard. If
        not set only the loss and the learning rate will be reported. The
        complete list is in OPTIMIZER_SUMMARIES.

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are ran before computing loss.
    if update_ops:
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    # Moving average of the loss with decay.
    # TODO(b/30439864): moving_average_decay should be removed.
    if moving_average_decay is not None:
      logging.warn("'moving_average_decay' is deprecated. Please use "
                   "tensorboard's builtin averaging instead.")
      # Generate moving averages of the loss.
      loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                     name="avg")
      loss_averages_op = loss_averages.apply([loss])
      logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
      # Make downstream uses of `loss` depend on the EMA update having run.
      loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

    # Learning rate variable, with possible decay.
    if (isinstance(learning_rate, ops.Tensor) and
        learning_rate.get_shape().ndims == 0):
      lr = learning_rate
    elif isinstance(learning_rate, float):
      lr = vs.get_variable(
          "learning_rate", [],
          trainable=False,
          initializer=init_ops.constant_initializer(learning_rate))
    else:
      raise ValueError("Learning rate should be 0d Tensor or float. "
                       "Got %s of type %s" % (str(learning_rate),
                                              str(type(learning_rate))))
    if summaries is None:
      summaries = ["loss", "learning_rate"]
    if learning_rate_decay_fn is not None:
      lr = learning_rate_decay_fn(lr, global_step)
      if "learning_rate" in summaries:
        logging_ops.scalar_summary("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
    elif isinstance(optimizer, type) and issubclass(optimizer,
                                                    optimizer_.Optimizer):
      opt = optimizer(learning_rate=lr)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer or instance of "
                       "subclass of Optimizer. Got %s." % str(optimizer))

    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    # Compute gradients.
    gradients = opt.compute_gradients(loss, variables)

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)

    # Optionally clip gradients by global norm.
    if clip_gradients is not None:
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)

    # Add scalar summary for loss.
    if "loss" in summaries:
      logging_ops.scalar_summary("loss", loss)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient
      # Variables without a gradient produce grad_values == None; skip them.
      if grad_values is not None:
        if "gradients" in summaries:
          logging_ops.histogram_summary(variable.name + "/gradients",
                                        grad_values)
        if "gradient_norm" in summaries:
          logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                        clip_ops.global_norm([grad_values]))

    # Create gradient updates.
    grad_updates = opt.apply_gradients(gradients,
                                       global_step=global_step,
                                       name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training
        step.
    optimizer: string or callable, used as optimizer for training. Strings
        are looked up in OPTIMIZER_CLS_NAMES; callables are invoked with
        `learning_rate=lr`.
    clip_gradients: float or None, clips gradients by this global-norm
        value.
    moving_average_decay: float or None, takes into account previous loss
        to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes learning_rate and global_step
        Tensors, returns Tensor. Can be used to implement any learning rate
        decay functions. For example: tf.train.exponential_decay.
    variables: list of variables to optimize or None (all trainable are
        used).

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  # Moving average of the loss with decay.
  if moving_average_decay is not None:
    # Generate moving averages of the loss.
    loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                   name="avg")
    loss_averages_op = loss_averages.apply([loss])
    logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
    # Make downstream uses of `loss` depend on the EMA update having run.
    loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

  # Convert optimizer into the optimizer class.
  if isinstance(optimizer, str):
    opt_cls = OPTIMIZER_CLS_NAMES[optimizer]
  elif callable(optimizer):
    opt_cls = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string or function.")

  # Learning rate variable, with possible decay.
  lr = vs.get_variable(
      "learning_rate", [],
      trainable=False,
      initializer=init_ops.constant_initializer(learning_rate))
  if learning_rate_decay_fn is not None:
    lr = learning_rate_decay_fn(lr, global_step)

  # Create optimizer.
  opt = opt_cls(learning_rate=lr)

  # All trainable variables, if specific variables are not specified.
  if variables is None:
    variables = vars_.trainable_variables()

  # Compute gradients and clip them if provided.
  gradients = opt.compute_gradients(loss, variables)
  if clip_gradients is not None:
    # BUG FIX: clip_by_global_norm expects a flat list of gradients, not
    # (gradient, variable) pairs — unzip first, then re-pair. The result is
    # materialized as a list because a bare zip iterator would be exhausted
    # by the summary loop below, leaving apply_gradients with nothing.
    grads, grad_vars = zip(*gradients)
    clipped_gradients, _ = clip_ops.clip_by_global_norm(grads,
                                                        clip_gradients)
    gradients = list(zip(clipped_gradients, grad_vars))

  # Add scalar summary for loss.
  logging_ops.scalar_summary("loss", loss)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in gradients:
    if isinstance(gradient, ops.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient
    # BUG FIX: variables with no gradient yield grad_values == None, which
    # would make the summary ops fail — skip them.
    if grad_values is not None:
      logging_ops.histogram_summary(variable.name, variable)
      logging_ops.histogram_summary(variable.name + "/gradients", grad_values)
      logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                    clip_ops.global_norm([grad_values]))

  # Create gradient updates.
  grad_updates = opt.apply_gradients(gradients,
                                     global_step=global_step,
                                     name="train")

  # Make sure total_loss is valid.
  final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

  # Ensure the train_tensor computes grad_updates.
  train_tensor = control_flow_ops.with_dependencies([grad_updates],
                                                    final_loss)

  return train_tensor
def _batch_norm(_input,
                trnFlag,
                eps=1e-3,
                name="batch_norm",
                ema_decay=0.5,
                dtype=dtypes.float32):
  """A wrapped BN operation used for 2D or 3D convolution as described in:
  https://gist.github.com/tomokishii/0ce3bdac1588b5cca9fa5fbdf6e1c412
  https://stackoverflow.com/questions/33949786/how-could-i-use-batch-normalization-in-tensorflow?answertab=votes#tab-top

  :param _input: tensor, always convolution result before Relu
  :param trnFlag: bool tensor, whether training or not
  :param eps: scalar, variance epsilon for normalization
  :param name: string, variable scope name
  :param ema_decay: scalar, moving-average decay used for BN's mean/variance
  :param dtype: tf.dtype, data type
  :return: tensor, BN result which has the same shape as _input
  :raises ValueError: if _input does not have rank 2, 4 or 5
  """
  shape = _input.get_shape().as_list()
  with variable_scope.variable_scope(name):
    beta = variable_scope.get_variable(
        "beta", [shape[-1]],
        dtype=dtype,
        initializer=init_ops.constant_initializer(0., dtype=dtype),
        trainable=True)
    gamma = variable_scope.get_variable(
        "gamma", [shape[-1]],
        dtype=dtype,
        initializer=init_ops.random_normal_initializer(
            1., 0.01, dtype=dtype, seed=20170705),
        trainable=True)

    # Moment axes: reduce over every dimension except the channel (last).
    ndims = len(shape)
    if ndims == 2:  # fc, [batch_size, num_dim]
      moment_axes = [0]
    elif ndims == 4:  # conv, [batch_size, width, height, channel]
      moment_axes = [0, 1, 2]
    elif ndims == 5:  # conv, [batch_size, depth, width, height, channel]
      moment_axes = [0, 1, 2, 3]
    else:
      # BUG FIX: `raise 'some string'` is a TypeError in Python 3 (string
      # objects are not exceptions); raise a proper exception type instead.
      raise ValueError("wrong _input shape, it must have dim of 2 or 4 or 5")
    batch_mean, batch_var = nn_impl.moments(_input, moment_axes,
                                            name="moments")

    ema = training.ExponentialMovingAverage(decay=ema_decay)

    def mean_var_with_update():
      # Update the moving averages, then return the batch statistics;
      # `identity` inside the control-dep context forces the update to run.
      ema_apply_op = ema.apply([batch_mean, batch_var])
      with ops.control_dependencies([ema_apply_op]):
        return array_ops.identity(batch_mean), array_ops.identity(batch_var)

    # Training: use (and update) batch statistics; inference: use the EMA.
    mean, var = control_flow_ops.cond(
        trnFlag, mean_var_with_update,
        lambda: (ema.average(batch_mean), ema.average(batch_var)))

    bn_out = nn_impl.batch_normalization(_input, mean, var, beta, gamma, eps)
  return bn_out