def __call__(self, step): with ops.name_scope_v2(self.name or "NoisyLinearCosineDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) initial_variance = math_ops.cast(self.initial_variance, dtype) variance_decay = math_ops.cast(self.variance_decay, dtype) num_periods = math_ops.cast(self.num_periods, dtype) alpha = math_ops.cast(self.alpha, dtype) beta = math_ops.cast(self.beta, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps variance = initial_variance / ( math_ops.pow(1.0 + global_step_recomp, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( linear_decayed + random_ops.random_normal( linear_decayed.shape, stddev=std)) completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) noisy_linear_cosine_decayed = ( (alpha + noisy_linear_decayed) * cosine_decayed + beta) return math_ops.multiply( initial_learning_rate, noisy_linear_cosine_decayed, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "PolynomialDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype end_learning_rate = math_ops.cast(self.end_learning_rate, dtype) power = math_ops.cast(self.power, dtype) global_step_recomp = math_ops.cast(step, dtype) decay_steps_recomp = math_ops.cast(self.decay_steps, dtype) if self.cycle: # Find the first multiple of decay_steps that is bigger than # global_step. If global_step is zero set the multiplier to 1 multiplier = control_flow_ops.cond( math_ops.equal(global_step_recomp, 0), lambda: 1.0, lambda: math_ops.ceil(global_step_recomp / self.decay_steps)) decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier) else: # Make sure that the global_step used is not bigger than decay_steps. global_step_recomp = math_ops.minimum(global_step_recomp, self.decay_steps) p = math_ops.div(global_step_recomp, decay_steps_recomp) return math_ops.add( math_ops.multiply(initial_learning_rate - end_learning_rate, math_ops.pow(1 - p, power)), end_learning_rate, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "ExponentialDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) decay_rate = math_ops.cast(self.decay_rate, dtype) global_step_recomp = math_ops.cast(step, dtype) p = global_step_recomp / decay_steps if self.staircase: p = math_ops.floor(p) return math_ops.multiply( initial_learning_rate, math_ops.pow(decay_rate, p), name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "CosineDecay"): initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) completed_fraction = global_step_recomp / decay_steps cosine_decayed = 0.5 * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - self.alpha) * cosine_decayed + self.alpha return math_ops.multiply(initial_learning_rate, decayed)
def __call__(self, step): with ops.name_scope_v2(self.name or "InverseTimeDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) decay_rate = math_ops.cast(self.decay_rate, dtype) global_step_recomp = math_ops.cast(step, dtype) p = global_step_recomp / decay_steps if self.staircase: p = math_ops.floor(p) const = math_ops.cast(constant_op.constant(1), dtype) denom = math_ops.add(const, math_ops.multiply(decay_rate, p)) return math_ops.div(initial_learning_rate, denom, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "LinearCosineDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) num_periods = math_ops.cast(self.num_periods, dtype) alpha = math_ops.cast(self.alpha, dtype) beta = math_ops.cast(self.beta, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta return math_ops.multiply(initial_learning_rate, linear_cosine_decayed, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "SGDRDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype first_decay_steps = math_ops.cast(self.first_decay_steps, dtype) alpha = math_ops.cast(self.alpha, dtype) t_mul = math_ops.cast(self._t_mul, dtype) m_mul = math_ops.cast(self._m_mul, dtype) global_step_recomp = math_ops.cast(step, dtype) completed_fraction = global_step_recomp / first_decay_steps def compute_step(completed_fraction, geometric=False): """Helper for `cond` operation.""" if geometric: i_restart = math_ops.floor( math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / math_ops.log(t_mul)) sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) completed_fraction -= i_restart return i_restart, completed_fraction i_restart, completed_fraction = control_flow_ops.cond( math_ops.equal(t_mul, 1.0), lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) m_fac = m_mul**i_restart cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(initial_learning_rate, decayed, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "PiecewiseConstant"): boundaries = ops.convert_n_to_tensor(self.boundaries) values = ops.convert_n_to_tensor(self.values) x_recomp = ops.convert_to_tensor(step) # Avoid explicit conversion to x's dtype. This could result in faulty # comparisons, for example if floats are converted to integers. for i, b in enumerate(boundaries): if b.dtype.base_dtype != x_recomp.dtype.base_dtype: # We can promote int32 boundaries to int64 without loss of precision. # This covers the most common case where the user passes in boundaries # as an array of Python integers. if (b.dtype.base_dtype == dtypes.int32 and x_recomp.dtype.base_dtype == dtypes.int64): b = math_ops.cast(b, x_recomp.dtype.base_dtype) boundaries[i] = b else: raise ValueError( "Boundaries (%s) must have the same dtype as x (%s)." % (b.dtype.base_dtype, x_recomp.dtype.base_dtype)) # TODO(rdipietro): Ensure that boundaries' elements strictly increases. for v in values[1:]: if v.dtype.base_dtype != values[0].dtype.base_dtype: raise ValueError( "Values must have elements all with the same dtype (%s vs %s)." % (values[0].dtype.base_dtype, v.dtype.base_dtype)) pred_fn_pairs = [] pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0])) pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1])) for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]): # Need to bind v here; can do this with lambda v=v: ... pred = (x_recomp > low) & (x_recomp <= high) pred_fn_pairs.append((pred, lambda v=v: v)) # The default isn't needed here because our conditions are mutually # exclusive and exhaustive, but tf.case requires it. default = lambda: values[0] return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
def __call__(self, step): with ops.name_scope_v2(self.name or "CustomCosineDecay"): return self.lr_fn(step)
def __call__(self, step): with ops.name_scope_v2(self.name or "PiecewiseLinear") as name: return piecewise_linear(step, self.schedule)
def fn(): with ops.name_scope_v2("name"): pass
def __call__(self, step: int):
  """Computes the learning rate for the given optimizer step.

  Args:
    step (int): the current training step.
  """
  with ops.name_scope_v2(self.name or "PolynomialDecayWithWarmup") as name:
    initial_learning_rate = ops.convert_to_tensor_v2(
        self.initial_learning_rate, name="initial_learning_rate")
    dtype = initial_learning_rate.dtype
    end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
    power = math_ops.cast(self.power, dtype)
    warm_up_steps = math_ops.cast(self.warm_up_steps, dtype)
    start_warmup_step = math_ops.cast(self.start_warmup_step, dtype)

    global_step_recomp = math_ops.cast(step, dtype)
    decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
    if self.cycle:
      # Find the first multiple of decay_steps that is bigger than
      # global_step. If global_step is zero, set the multiplier to 1.
      multiplier = control_flow_ops.cond(
          math_ops.equal(global_step_recomp, 0), lambda: 1.0,
          lambda: math_ops.ceil(global_step_recomp / self.decay_steps))
      decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step_recomp = math_ops.minimum(global_step_recomp,
                                            decay_steps_recomp)

    p = math_ops.divide(global_step_recomp, decay_steps_recomp)
    decay_learning_rate = math_ops.multiply(
        initial_learning_rate - end_learning_rate,
        math_ops.pow(1 - p, power))

    global_step_warmup = math_ops.subtract(global_step_recomp,
                                           start_warmup_step)
    warmup_percent_done = math_ops.divide(global_step_warmup, warm_up_steps)
    warmup_learning_rate = math_ops.multiply(initial_learning_rate,
                                             warmup_percent_done)

    learning_rate = control_flow_ops.cond(
        math_ops.greater(global_step_warmup, warm_up_steps),
        lambda: decay_learning_rate,
        lambda: warmup_learning_rate)
    return learning_rate
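A scalar sketch of the warmup/decay switch above for the cycle=False case; it is illustrative only and mirrors the graph code as written, including the fact that end_learning_rate is not added back after the polynomial decay.

# Plain-Python sketch of the warmup-then-decay logic above (illustrative only,
# cycle=False; the helper name is hypothetical).
def lr_with_warmup(step, initial_lr, end_lr, decay_steps, power,
                   warm_up_steps, start_warmup_step=0):
    warmup_step = step - start_warmup_step
    if warmup_step <= warm_up_steps:                  # linear warmup phase
        return initial_lr * (warmup_step / warm_up_steps)
    p = min(step, decay_steps) / decay_steps          # polynomial decay phase
    return (initial_lr - end_lr) * (1 - p) ** power   # end_lr is not added back, as above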
def zero_state(self, batch_size, dtype):
  with ops.name_scope_v2(type(self).__name__ + "ZeroState"):
    with ops.device(self._device):
      return self.cell.zero_state(batch_size, dtype)
def __init__(self,
             cell,
             input_keep_prob=1.0,
             output_keep_prob=1.0,
             state_keep_prob=1.0,
             variational_recurrent=False,
             input_size=None,
             dtype=None,
             seed=None,
             dropout_state_filter_visitor=None,
             **kwargs):
  """Create a cell with added input, state, and/or output dropout.

  If `variational_recurrent` is set to `True` (**NOT** the default behavior),
  then the same dropout mask is applied at every step, as described in:
  [A Theoretically Grounded Application of Dropout in Recurrent
  Neural Networks. Y. Gal, Z. Ghahramani](https://arxiv.org/abs/1512.05287).

  Otherwise a different dropout mask is applied at every time step.

  Note, by default (unless a custom `dropout_state_filter` is provided),
  the memory state (`c` component of any `LSTMStateTuple`) passing through
  a `DropoutWrapper` is never modified.  This behavior is described in the
  above article.

  Args:
    cell: an RNNCell; dropout is added to its inputs, states, and/or outputs.
    input_keep_prob: unit Tensor or float between 0 and 1, input keep
      probability; if it is constant and 1, no input dropout will be added.
    output_keep_prob: unit Tensor or float between 0 and 1, output keep
      probability; if it is constant and 1, no output dropout will be added.
    state_keep_prob: unit Tensor or float between 0 and 1, state keep
      probability; if it is constant and 1, no state dropout will be added.
      State dropout is performed on the outgoing states of the cell. **Note**
      the state components to which dropout is applied when `state_keep_prob`
      is in `(0, 1)` are also determined by the argument
      `dropout_state_filter_visitor` (e.g. by default dropout is never applied
      to the `c` component of an `LSTMStateTuple`).
    variational_recurrent: Python bool.  If `True`, then the same dropout
      pattern is applied across all time steps per run call.  If this
      parameter is set, `input_size` **must** be provided.
    input_size: (optional) (possibly nested tuple of) `TensorShape` objects
      containing the depth(s) of the input tensors expected to be passed in
      to the `DropoutWrapper`.  Required and used **iff**
      `variational_recurrent = True` and `input_keep_prob < 1`.
    dtype: (optional) The `dtype` of the input, state, and output tensors.
      Required and used **iff** `variational_recurrent = True`.
    seed: (optional) integer, the randomness seed.
    dropout_state_filter_visitor: (optional), default: (see below).  Function
      that takes any hierarchical level of the state and returns a scalar or
      depth=1 structure of Python booleans describing which terms in the
      state should be dropped out.  In addition, if the function returns
      `True`, dropout is applied across this sublevel.  If the function
      returns `False`, dropout is not applied across this entire sublevel.
      Default behavior: perform dropout on all terms except the memory (`c`)
      state of `LSTMStateTuple` objects, and don't try to apply dropout to
      `TensorArray` objects:
      ```
      def dropout_state_filter_visitor(s):
        if isinstance(s, LSTMStateTuple):
          # Never perform dropout on the c state.
          return LSTMStateTuple(c=False, h=True)
        elif isinstance(s, TensorArray):
          return False
        return True
      ```
    **kwargs: dict of keyword arguments for base layer.

  Raises:
    TypeError: if `cell` is not an `RNNCell`, or
      `dropout_state_filter_visitor` is provided but not `callable`.
    ValueError: if any of the keep_probs are not between 0 and 1.
  """
  super(DropoutWrapperBase, self).__init__(cell, dtype=dtype, **kwargs)

  if (dropout_state_filter_visitor is not None and
      not callable(dropout_state_filter_visitor)):
    raise TypeError("dropout_state_filter_visitor must be callable")
  self._dropout_state_filter = (
      dropout_state_filter_visitor or _default_dropout_state_filter_visitor)
  with ops.name_scope_v2("DropoutWrapperInit"):

    def tensor_and_const_value(v):
      tensor_value = ops.convert_to_tensor(v)
      const_value = tensor_util.constant_value(tensor_value)
      return (tensor_value, const_value)

    for prob, attr in [(input_keep_prob, "input_keep_prob"),
                       (state_keep_prob, "state_keep_prob"),
                       (output_keep_prob, "output_keep_prob")]:
      tensor_prob, const_prob = tensor_and_const_value(prob)
      if const_prob is not None:
        if const_prob < 0 or const_prob > 1:
          raise ValueError("Parameter %s must be between 0 and 1: %g" %
                           (attr, const_prob))
        setattr(self, "_%s" % attr, float(const_prob))
      else:
        setattr(self, "_%s" % attr, tensor_prob)

  # Set variational_recurrent, seed before running the code below
  self._variational_recurrent = variational_recurrent
  self._input_size = input_size
  self._seed = seed

  self._recurrent_input_noise = None
  self._recurrent_state_noise = None
  self._recurrent_output_noise = None

  if variational_recurrent:
    if dtype is None:
      raise ValueError(
          "When variational_recurrent=True, dtype must be provided")

    def convert_to_batch_shape(s):
      # Prepend a 1 for the batch dimension; for recurrent
      # variational dropout we use the same dropout mask for all
      # batch elements.
      return array_ops.concat(([1], tensor_shape.TensorShape(s).as_list()), 0)

    def batch_noise(s, inner_seed):
      shape = convert_to_batch_shape(s)
      return random_ops.random_uniform(shape, seed=inner_seed, dtype=dtype)

    if (not isinstance(self._input_keep_prob, numbers.Real) or
        self._input_keep_prob < 1.0):
      if input_size is None:
        raise ValueError(
            "When variational_recurrent=True and input_keep_prob < 1.0 or "
            "is unknown, input_size must be provided")
      self._recurrent_input_noise = _enumerated_map_structure_up_to(
          input_size,
          lambda i, s: batch_noise(s, inner_seed=self._gen_seed("input", i)),
          input_size)
    self._recurrent_state_noise = _enumerated_map_structure_up_to(
        cell.state_size,
        lambda i, s: batch_noise(s, inner_seed=self._gen_seed("state", i)),
        cell.state_size)
    self._recurrent_output_noise = _enumerated_map_structure_up_to(
        cell.output_size,
        lambda i, s: batch_noise(s, inner_seed=self._gen_seed("output", i)),
        cell.output_size)