def _prepare_local(self, var_device, var_dtype, apply_state):
  lr_t = tf.identity(self._get_hyper('learning_rate', var_dtype))
  beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype))
  beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype))
  local_step = tf.cast(self.iterations + 1, var_dtype)
  next_step = tf.cast(self.iterations + 2, var_dtype)
  decay_base = tf.cast(0.96, var_dtype)

  m_t = beta_1_t * (1. - 0.5 * (
      tf.pow(decay_base, self._initial_decay * local_step)))
  m_t_1 = beta_1_t * (1. - 0.5 * (
      tf.pow(decay_base, self._initial_decay * next_step)))

  m_schedule_new = tf.cast(self._m_cache_read, var_dtype) * m_t
  if var_dtype is self._m_cache.dtype:
    m_schedule_new = tf.identity(tf.compat.v1.assign(
        self._m_cache, m_schedule_new, use_locking=self._use_locking))
  m_schedule_next = m_schedule_new * m_t_1

  apply_state[(var_device, var_dtype)] = dict(
      lr_t=lr_t,
      neg_lr_t=-lr_t,
      epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
      beta_1_t=beta_1_t,
      beta_2_t=beta_2_t,
      m_t=m_t,
      m_t_1=m_t_1,
      one_minus_beta_1_t=1 - beta_1_t,
      one_minus_beta_2_t=1 - beta_2_t,
      one_minus_m_t=1. - m_t,
      one_minus_m_schedule_new=1. - m_schedule_new,
      one_minus_m_schedule_next=1. - m_schedule_next,
      v_t_prime_denominator=1. - tf.pow(beta_2_t, local_step),
  )
def get_beta_accumulators(opt, dtype):
  local_step = tf.cast(opt.iterations + 1, dtype)
  beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype)
  beta_1_power = tf.pow(beta_1_t, local_step)
  beta_2_t = tf.cast(opt._get_hyper("beta_2"), dtype)
  beta_2_power = tf.pow(beta_2_t, local_step)
  return (beta_1_power, beta_2_power)
def _cdf(self, x):
  loc = tf.convert_to_tensor(self.loc)
  scale = tf.convert_to_tensor(self.scale)
  power = tf.convert_to_tensor(self.power)
  ipower = tf.math.reciprocal(power)
  half = tf.constant(0.5, dtype=self.dtype)  # 0.5 is fp64 in numpy
  # For the CDF computation, we need to use a double-where a la:
  # https://github.com/tensorflow/probability/blob/master/discussion/where-nan.pdf
  # to avoid NaN gradients. This comes from computing (loc - x) ** power when
  # x > loc. If power is not an even integer, then this value is not defined
  # or is negative, both of which are not valid values for `igamma`.
  loc_stop_grad = tf.stop_gradient(loc)
  # Use values that are right below loc and above loc. At loc, this will
  # result in `gamma|igamma(c)(1. / power, 0.)`. This has an undefined
  # gradient at 0.
  safe_x_lt_loc = tf.where(x > loc_stop_grad, loc_stop_grad - half, x)
  safe_x_gt_loc = tf.where(x < loc_stop_grad, loc_stop_grad + half, x)
  cdf = tf.where(
      x < loc,
      half * tf.math.igammac(
          ipower, tf.pow((loc - safe_x_lt_loc) / scale, power)),
      half + half * tf.math.igamma(
          ipower, tf.pow((safe_x_gt_loc - loc) / scale, power)))
  return cdf
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(f'Optimizer cannot recognize variable {variable.name}, '
                   f'this usually means you are calling an optimizer '
                   f'previously used on a different model. Please try '
                   f'creating a new optimizer instance.')
  lr = tf.cast(self.learning_rate, variable.dtype)
  var_key = self._var_key(variable)
  accum = self._accumulators[self._index_dict[var_key]]
  linear = self._linears[self._index_dict[var_key]]

  lr_power = self.learning_rate_power
  l2_reg = self.l2_regularization_strength
  l2_reg = (l2_reg + self.beta / (2. * lr))

  # Ftrl optimizer has the same implementation for sparse and dense
  # gradients update.
  grad_to_use = (
      gradient + 2 * self.l2_shrinkage_regularization_strength * variable)
  new_accum = accum + tf.pow(gradient, 2)
  linear.assign_add(grad_to_use -
                    (tf.pow(new_accum, -lr_power) - tf.pow(accum, -lr_power)) /
                    lr * variable)
  quadratic = tf.pow(new_accum, (-lr_power)) / lr + 2 * l2_reg
  linear_clipped = tf.clip_by_value(linear, -self.l1_regularization_strength,
                                    self.l1_regularization_strength)
  variable.assign((linear_clipped - linear) / quadratic)
  accum.assign(new_accum)
def update_step(self, gradient, variable):
    """Update step given gradient and the associated model variable."""
    lr = tf.cast(self.learning_rate, variable.dtype)
    var_key = self._var_key(variable)
    accum = self._accumulators[self._index_dict[var_key]]
    linear = self._linears[self._index_dict[var_key]]

    lr_power = self.learning_rate_power
    l2_reg = self.l2_regularization_strength
    l2_reg = l2_reg + self.beta / (2.0 * lr)

    # Ftrl optimizer has the same implementation for sparse and dense
    # gradients update.
    grad_to_use = (
        gradient + 2 * self.l2_shrinkage_regularization_strength * variable
    )
    new_accum = accum + tf.pow(gradient, 2)
    linear.assign_add(
        grad_to_use
        - (tf.pow(new_accum, -lr_power) - tf.pow(accum, -lr_power))
        / lr
        * variable
    )
    quadratic = tf.pow(new_accum, (-lr_power)) / lr + 2 * l2_reg
    linear_clipped = tf.clip_by_value(
        linear,
        -self.l1_regularization_strength,
        self.l1_regularization_strength,
    )
    variable.assign((linear_clipped - linear) / quadratic)
    accum.assign(new_accum)
def update_step(self, gradient, variable):
    """Update step given gradient and the associated model variable."""
    var_dtype = variable.dtype
    lr = tf.cast(self.learning_rate, var_dtype)
    local_step = tf.cast(self.iterations + 1, var_dtype)
    next_step = tf.cast(self.iterations + 2, var_dtype)
    decay = tf.cast(0.96, var_dtype)
    beta_1 = tf.cast(self.beta_1, var_dtype)
    beta_2 = tf.cast(self.beta_2, var_dtype)
    u_t = beta_1 * (1.0 - 0.5 * (tf.pow(decay, local_step)))
    u_t_1 = beta_1 * (1.0 - 0.5 * (tf.pow(decay, next_step)))

    def get_cached_u_product():
        return self._u_product

    def compute_new_u_product():
        u_product_t = self._u_product * u_t
        self._u_product.assign(u_product_t)
        self._u_product_counter += 1
        return u_product_t

    u_product_t = tf.cond(
        self._u_product_counter == (self.iterations + 2),
        true_fn=get_cached_u_product,
        false_fn=compute_new_u_product,
    )
    u_product_t_1 = u_product_t * u_t_1
    beta_2_power = tf.pow(beta_2, local_step)

    var_key = self._var_key(variable)
    m = self._momentums[self._index_dict[var_key]]
    v = self._velocities[self._index_dict[var_key]]

    if isinstance(gradient, tf.IndexedSlices):
        # Sparse gradients.
        m.assign_add(-m * (1 - beta_1))
        m.scatter_add(
            tf.IndexedSlices(gradient.values * (1 - beta_1), gradient.indices)
        )
        v.assign_add(-v * (1 - beta_2))
        v.scatter_add(
            tf.IndexedSlices(
                tf.square(gradient.values) * (1 - beta_2), gradient.indices
            )
        )
        m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / (
            1 - u_product_t
        )
        v_hat = v / (1 - beta_2_power)
        variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
    else:
        # Dense gradients.
        m.assign_add((gradient - m) * (1 - beta_1))
        v.assign_add((tf.square(gradient) - v) * (1 - beta_2))
        m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / (
            1 - u_product_t
        )
        v_hat = v / (1 - beta_2_power)
        variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    with tf.control_dependencies(
        [tf.compat.v1.assign_add(self.iterations, 1)]
    ):
        t = tf.cast(self.iterations, backend.floatx())

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (
        1.0
        - 0.5
        * (tf.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay))
    )
    momentum_cache_t_1 = self.beta_1 * (
        1.0
        - 0.5
        * (
            tf.pow(
                backend.cast_to_floatx(0.96), (t + 1) * self.schedule_decay
            )
        )
    )
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = (
        self.m_schedule * momentum_cache_t * momentum_cache_t_1
    )
    self.updates.append((self.m_schedule, m_schedule_new))

    ms, vs = self._create_all_weights(params)

    for p, g, m, v in zip(params, grads, ms, vs):
        # the following equations given in [1]
        g_prime = g / (1.0 - m_schedule_new)
        m_t = self.beta_1 * m + (1.0 - self.beta_1) * g
        m_t_prime = m_t / (1.0 - m_schedule_next)
        v_t = self.beta_2 * v + (1.0 - self.beta_2) * tf.square(g)
        v_t_prime = v_t / (1.0 - tf.pow(self.beta_2, t))
        m_t_bar = (
            1.0 - momentum_cache_t
        ) * g_prime + momentum_cache_t_1 * m_t_prime

        self.updates.append(tf.compat.v1.assign(m, m_t))
        self.updates.append(tf.compat.v1.assign(v, v_t))

        p_t = p - self.lr * m_t_bar / (
            backend.sqrt(v_t_prime) + self.epsilon
        )
        new_p = p_t

        # Apply constraints.
        if getattr(p, "constraint", None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(tf.compat.v1.assign(p, new_p))
    return self.updates
def cosine_distance(x, y):
  """Cosine distance between vectors x and y."""
  x_norm = tf.math.sqrt(tf.reduce_sum(tf.pow(x, 2), axis=-1))
  x_norm = tf.reshape(x_norm, (-1, 1))
  y_norm = tf.math.sqrt(tf.reduce_sum(tf.pow(y, 2), axis=-1))
  y_norm = tf.reshape(y_norm, (-1, 1))
  normalized_x = x / x_norm
  normalized_y = y / y_norm
  return tf.reduce_mean(tf.reduce_sum(normalized_x * normalized_y, axis=-1))
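# Hedged usage sketch for cosine_distance above (illustrative values only):
# despite the docstring, the returned scalar is the mean cosine *similarity*
# between corresponding rows of x and y, since each row is L2-normalized
# before the dot product.
import tensorflow as tf

x = tf.constant([[1.0, 0.0], [0.0, 2.0]])
y = tf.constant([[1.0, 0.0], [0.0, -3.0]])
print(float(cosine_distance(x, y)))  # (1.0 + (-1.0)) / 2 = 0.0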
def _cdf(self, x):
  loc = tf.convert_to_tensor(self.loc)
  scale = tf.convert_to_tensor(self.scale)
  power = tf.convert_to_tensor(self.power)
  ipower = tf.math.reciprocal(power)
  half = tf.constant(0.5, dtype=self.dtype)  # 0.5 is fp64 in numpy
  cdf = tf.where(
      x < loc,
      half * tf.math.igammac(ipower, tf.pow((loc - x) / scale, power)),
      half + half * tf.math.igamma(ipower, tf.pow((x - loc) / scale, power)))
  return cdf
def update_step(self, gradient, variable):
    """Update step given gradient and the associated model variable."""
    beta_1_power = None
    beta_2_power = None
    lr = tf.cast(self.learning_rate, variable.dtype)
    local_step = tf.cast(self.iterations + 1, variable.dtype)
    beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
    beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

    var_key = self._var_key(variable)
    m = self._momentums[self._index_dict[var_key]]
    v = self._velocities[self._index_dict[var_key]]

    alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)

    # Apply step weight decay
    if (
        self.weight_decay != 0
        and variable not in self._exclude_from_weight_decay
    ):
        wd = tf.cast(self.weight_decay, variable.dtype)
        variable.assign_sub(variable * wd)

    if isinstance(gradient, tf.IndexedSlices):
        # Sparse gradients.
        m.assign_add(-m * (1 - self.beta_1))
        m.scatter_add(
            tf.IndexedSlices(
                gradient.values * (1 - self.beta_1), gradient.indices
            )
        )
        v.assign_add(-v * (1 - self.beta_2))
        v.scatter_add(
            tf.IndexedSlices(
                tf.square(gradient.values) * (1 - self.beta_2),
                gradient.indices,
            )
        )
        if self.amsgrad:
            v_hat = self._velocity_hats[self._index_dict[var_key]]
            v_hat.assign(tf.maximum(v_hat, v))
            v = v_hat
        variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
    else:
        # Dense gradients.
        m.assign_add((gradient - m) * (1 - self.beta_1))
        v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
        if self.amsgrad:
            v_hat = self._velocity_hats[self._index_dict[var_key]]
            v_hat.assign(tf.maximum(v_hat, v))
            v = v_hat
        variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(
        f'Optimizer cannot recognize variable {variable.name}, '
        f'this usually means you are calling an optimizer '
        f'previously used on a different model. Please try '
        f'creating a new optimizer instance.')
  beta_1_power = None
  beta_2_power = None
  lr = tf.cast(self.learning_rate, variable.dtype)
  local_step = tf.cast(self.iterations + 1, variable.dtype)
  beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
  beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

  var_key = self._var_key(variable)
  m = self._momentums[self._index_dict[var_key]]
  v = self._velocities[self._index_dict[var_key]]

  alpha = (lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))

  # Apply step weight decay
  if self.weight_decay != 0:
    wd = tf.cast(self.weight_decay, variable.dtype)
    # Decoupled weight decay: subtract lr * wd * variable from the variable.
    variable.assign_sub(variable * wd * lr)

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    m.assign_add(-m * (1 - self.beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - self.beta_1),
                         gradient.indices))
    v.assign_add(-v * (1 - self.beta_2))
    v.scatter_add(
        tf.IndexedSlices(
            tf.square(gradient.values) * (1 - self.beta_2), gradient.indices))
    if self.amsgrad:
      v_hat = self._velocity_hats[self._index_dict[var_key]]
      v_hat.assign(tf.maximum(v_hat, v))
      v = v_hat
    variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - self.beta_1))
    v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
    if self.amsgrad:
      v_hat = self._velocity_hats[self._index_dict[var_key]]
      v_hat.assign(tf.maximum(v_hat, v))
      v = v_hat
    variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
def geomspace(start, stop, num=50, endpoint=True, dtype=float):
  """Returns `num` values from a geometric progression.

  The ratio of any two consecutive values in the output sequence is constant.
  This is similar to `logspace`, except the endpoints are specified directly
  instead of as powers of a base.

  Args:
    start: start of the geometric progression.
    stop: end of the geometric progression. This is included in the output if
      endpoint is true.
    num: Number of values to sample. Defaults to 50.
    endpoint: Whether to include `stop` in the output. Defaults to true.
    dtype: Optional. Type of the resulting ndarray. Could be a python type, a
      NumPy type or a TensorFlow `DType`. If not provided, it is inferred from
      the input args.

  Returns:
    An ndarray.

  Raises:
    ValueError: If there is an error in the arguments.
  """
  # TODO(srbs): Check whether dtype is handled properly.
  if dtype:
    dtype = utils.to_tf_type(dtype)
  if num < 0:
    raise ValueError('Number of samples {} must be non-negative.'.format(num))
  if not num:
    return empty([0])
  if start == 0:
    raise ValueError('start: {} must be non-zero.'.format(start))
  if stop == 0:
    raise ValueError('stop: {} must be non-zero.'.format(stop))
  if np_sign(start) != np_sign(stop):
    raise ValueError('start: {} and stop: {} must have same sign.'.format(
        start, stop))
  step = 1.
  if endpoint:
    if num > 1:
      step = tf.pow((stop / start), 1 / (num - 1))
  else:
    step = tf.pow((stop / start), 1 / num)
  result = tf.cast(tf.range(num), step.dtype)
  result = tf.pow(step, result)
  result = tf.multiply(result, start)
  if dtype:
    result = tf.cast(result, dtype=dtype)
  return utils.tensor_to_ndarray(result)
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(
        f'Optimizer cannot recognize variable {variable.name}, '
        f'this usually means you are calling an optimizer '
        f'previously used on a different model. Please try '
        f'creating a new optimizer instance.')
  lr = tf.cast(self.learning_rate, variable.dtype)
  local_step = tf.cast(self.iterations + 1, variable.dtype)
  beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

  var_key = self._var_key(variable)
  m = self._m[self._index_dict[var_key]]
  u = self._u[self._index_dict[var_key]]

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    indices = gradient.indices
    m.assign_add(-m * (1 - self.beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
    u.assign(u * self.beta_2)
    u_slice = tf.gather(u, indices)
    u_slice_incremental = tf.maximum(u_slice, tf.abs(
        gradient.values)) - u_slice
    u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - self.beta_1))
    u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
def update_step(self, gradient, variable):
  """Update step given gradient and the associated model variable."""
  lr = tf.cast(self.learning_rate, variable.dtype)
  local_step = tf.cast(self.iterations + 1, variable.dtype)
  beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

  var_key = self._var_key(variable)
  m = self._m[self._index_dict[var_key]]
  u = self._u[self._index_dict[var_key]]

  if isinstance(gradient, tf.IndexedSlices):
    # Sparse gradients.
    indices = gradient.indices
    m.assign_add(-m * (1 - self.beta_1))
    m.scatter_add(
        tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
    u.assign(u * self.beta_2)
    u_slice = tf.gather(u, indices)
    u_slice_incremental = (
        tf.maximum(u_slice, tf.abs(gradient.values)) - u_slice)
    u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
  else:
    # Dense gradients.
    m.assign_add((gradient - m) * (1 - self.beta_1))
    u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
    variable.assign_sub(
        (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
def get_updates(self, loss, params):
  grads = self.get_gradients(loss, params)
  self.updates = []

  lr = self.lr
  if self.initial_decay > 0:
    lr = lr * (  # pylint: disable=g-no-augmented-assignment
        1. / (1. + self.decay * tf.cast(self.iterations,
                                        backend.dtype(self.decay))))

  with tf.control_dependencies([tf.compat.v1.assign_add(self.iterations, 1)]):
    t = tf.cast(self.iterations, backend.floatx())

  lr_t = lr / (1. - tf.pow(self.beta_1, t))

  ms, us = self._create_all_weights(params)

  for p, g, m, u in zip(params, grads, ms, us):
    m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
    u_t = tf.maximum(self.beta_2 * u, tf.abs(g))
    p_t = p - lr_t * m_t / (u_t + self.epsilon)

    self.updates.append(tf.compat.v1.assign(m, m_t))
    self.updates.append(tf.compat.v1.assign(u, u_t))
    new_p = p_t

    # Apply constraints.
    if getattr(p, 'constraint', None) is not None:
      new_p = p.constraint(new_p)

    self.updates.append(tf.compat.v1.assign(p, new_p))
  return self.updates
def __call__(self, step):
  with tf.name_scope(self.name or "PolynomialDecay") as name:
    initial_learning_rate = tf.convert_to_tensor(
        self.initial_learning_rate, name="initial_learning_rate")
    dtype = initial_learning_rate.dtype
    end_learning_rate = tf.cast(self.end_learning_rate, dtype)
    power = tf.cast(self.power, dtype)

    global_step_recomp = tf.cast(step, dtype)
    decay_steps_recomp = tf.cast(self.decay_steps, dtype)
    if self.cycle:
      # Find the first multiple of decay_steps that is bigger than
      # global_step. If global_step is zero set the multiplier to 1
      multiplier = tf.where(
          tf.equal(global_step_recomp, 0), 1.0,
          tf.math.ceil(global_step_recomp / self.decay_steps))
      decay_steps_recomp = tf.multiply(decay_steps_recomp, multiplier)
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step_recomp = tf.minimum(global_step_recomp, decay_steps_recomp)

    p = tf.divide(global_step_recomp, decay_steps_recomp)
    return tf.add(
        tf.multiply(initial_learning_rate - end_learning_rate,
                    tf.pow(1 - p, power)),
        end_learning_rate,
        name=name)
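# Hedged usage sketch, assuming the __call__ above is
# tf.keras.optimizers.schedules.PolynomialDecay: with cycle=False the schedule
# evaluates to
#   (lr0 - lr_end) * (1 - min(step, decay_steps) / decay_steps) ** power + lr_end.
import tensorflow as tf

schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.1, decay_steps=100,
    end_learning_rate=0.01, power=2.0)
print(float(schedule(50)))   # 0.01 + 0.09 * (1 - 0.5) ** 2 = 0.0325
print(float(schedule(200)))  # step is clamped to decay_steps -> 0.01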
def __call__(self, step):
  with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name:
    initial_learning_rate = tf.convert_to_tensor(
        self.initial_learning_rate, name="initial_learning_rate")
    dtype = initial_learning_rate.dtype
    decay_steps = tf.cast(self.decay_steps, dtype)
    initial_variance = tf.cast(self.initial_variance, dtype)
    variance_decay = tf.cast(self.variance_decay, dtype)
    num_periods = tf.cast(self.num_periods, dtype)
    alpha = tf.cast(self.alpha, dtype)
    beta = tf.cast(self.beta, dtype)

    global_step_recomp = tf.cast(step, dtype)
    global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
    linear_decayed = (decay_steps - global_step_recomp) / decay_steps
    variance = initial_variance / (
        tf.pow(1.0 + global_step_recomp, variance_decay))
    std = tf.sqrt(variance)
    noisy_linear_decayed = (
        linear_decayed + tf.random.normal(linear_decayed.shape, stddev=std))

    completed_fraction = global_step_recomp / decay_steps
    fraction = 2.0 * num_periods * completed_fraction
    cosine_decayed = 0.5 * (1.0 + tf.cos(tf.constant(math.pi) * fraction))
    noisy_linear_cosine_decayed = (
        (alpha + noisy_linear_decayed) * cosine_decayed + beta)

    return tf.multiply(
        initial_learning_rate, noisy_linear_cosine_decayed, name=name)
def mix_white_noise(audio, noise_level_db):
  _, variance = tf.nn.moments(audio, axes=[0])
  audio_rms = tf.math.sqrt(variance)
  noise_rms = tf.pow(10.0, noise_level_db / 10.0) * audio_rms
  noise = tf.random.normal(
      tf.shape(audio), mean=0.0, stddev=noise_rms, dtype=tf.float32)
  return audio + noise
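# Hedged usage sketch for mix_white_noise above: the noise RMS is set to
# 10 ** (noise_level_db / 10) times the signal RMS, so negative dB values add
# noise that is quieter than the signal. The 1-D float32 waveform below is
# illustrative only.
import tensorflow as tf

audio = tf.sin(tf.linspace(0.0, 100.0, 16000))  # synthetic 16k-sample tone
noisy = mix_white_noise(audio, noise_level_db=-20.0)
print(noisy.shape)  # (16000,)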
def _variance(self):
  tailweight = tf.convert_to_tensor(self.tailweight)
  scale = tf.convert_to_tensor(self.scale)
  # For tail < 0.5, the variance is finite. See Eq (18) in
  # https://www.hindawi.com/journals/tswj/2015/909231/
  var = (
      tf.cast(tf.pow(1. - 2. * tailweight, -3. / 2.), dtype=self.dtype) *
      tf.math.square(scale))
  # We need to put the tf.where inside the outer tf.where to ensure we never
  # hit a NaN in the gradient.
  result_where_defined = tf.where(
      tailweight < 0.5, var,
      tf.convert_to_tensor(np.inf, dtype=self.dtype))

  if self.allow_nan_stats:
    return tf.where(tailweight < 1.0, result_where_defined,
                    tf.convert_to_tensor(np.nan, self.dtype))
  else:
    return distribution_util.with_dependencies([
        assert_util.assert_greater_equal(
            tf.ones([], dtype=self.dtype), tailweight,
            message="variance not defined for components of tailweight >= 1"),
    ], result_where_defined)
def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None):
  """Returns `num` values sampled evenly on a log scale.

  Equivalent to `base ** linspace(start, stop, num, endpoint)`.

  Args:
    start: base**start is the start of the output sequence.
    stop: If `endpoint` is true and num > 1, base ** stop is included in the
      output. If `endpoint` is false, `num` + 1 values are linearly sampled in
      [start, stop] both inclusive and the last value is ignored before raising
      to power of `base`.
    num: Number of values to sample. Defaults to 50.
    endpoint: Whether to include `base ** stop` in the output. Defaults to
      true.
    base: Base of the log space.
    dtype: Optional. Type of the resulting ndarray. Could be a python type, a
      NumPy type or a TensorFlow `DType`. If not provided, it is inferred from
      the input args.
  """
  # TODO(srbs): Check whether dtype is handled properly.
  if dtype:
    dtype = utils.to_tf_type(dtype)
  result = linspace(start, stop, num=num, endpoint=endpoint)
  result = tf.pow(base, result.data)
  if dtype:
    result = utils.maybe_cast(result, dtype)
  return utils.tensor_to_ndarray(result)
def update(self, expert_dataset_iter, replay_buffer_iter):
  """Performs a single training step for critic and actor.

  Args:
    expert_dataset_iter: A TensorFlow graph iterable over expert data.
    replay_buffer_iter: A TensorFlow graph iterable over the replay buffer.
  """
  expert_states, expert_actions, _ = next(expert_dataset_iter)
  policy_states, policy_actions, _, _, _ = next(replay_buffer_iter)[0]

  policy_inputs = tf.concat([policy_states, policy_actions], -1)
  expert_inputs = tf.concat([expert_states, expert_actions], -1)

  alpha = tf.random.uniform(shape=(policy_inputs.get_shape()[0], 1))
  inter = alpha * policy_inputs + (1 - alpha) * expert_inputs

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(self.discriminator.variables)
    policy_output = self.discriminator(policy_inputs)
    expert_output = self.discriminator(expert_inputs)
    # Using the standard value for label smoothing instead of 0.25.
    classification_loss = tfgan_losses.modified_discriminator_loss(
        expert_output, policy_output, label_smoothing=0.0)

    with tf.GradientTape(watch_accessed_variables=False) as tape2:
      tape2.watch(inter)
      output = self.discriminator(inter)

    grad = tape2.gradient(output, [inter])[0]
    grad_penalty = tf.reduce_mean(tf.pow(tf.norm(grad, axis=-1) - 1, 2))

    total_loss = classification_loss + self.grad_penalty_coeff * grad_penalty

  grads = tape.gradient(total_loss, self.discriminator.variables)
  self.optimizer.apply_gradients(zip(grads, self.discriminator.variables))

  self.avg_classification_loss(classification_loss)
  self.avg_gp_loss(grad_penalty)
  self.avg_total_loss(total_loss)

  if tf.equal(self.optimizer.iterations % self.log_interval, 0):
    tf.summary.scalar(
        'train gail/classification loss',
        self.avg_classification_loss.result(),
        step=self.optimizer.iterations)
    self.avg_classification_loss.reset_states()

    tf.summary.scalar(
        'train gail/gradient penalty',
        self.avg_gp_loss.result(),
        step=self.optimizer.iterations)
    self.avg_gp_loss.reset_states()

    tf.summary.scalar(
        'train gail/loss',
        self.avg_total_loss.result(),
        step=self.optimizer.iterations)
    self.avg_total_loss.reset_states()
def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None):
  if dtype:
    dtype = utils.result_type(dtype)
  result = linspace(start, stop, num=num, endpoint=endpoint)
  result = tf.pow(base, result.data)
  if dtype:
    result = tf.cast(result, dtype)
  return arrays.tensor_to_ndarray(result)
def call(self, y_true, y_pred):
  error = tf.pow(tf.abs(tf.squeeze(y_pred) - y_true), self._power)
  target_weights, target_index = self._get_target_weights_and_indices()
  quantiles = ops.softsort(
      error, axis=0, target_weights=target_weights, **self._kwargs)
  return tf.gather(quantiles, target_index, axis=0)
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (
            1.0
            / (
                1.0
                + self.decay
                * tf.cast(self.iterations, backend.dtype(self.decay))
            )
        )

    with tf.control_dependencies(
        [tf.compat.v1.assign_add(self.iterations, 1)]
    ):
        t = tf.cast(self.iterations, backend.floatx())

    lr_t = lr * (
        backend.sqrt(1.0 - tf.pow(self.beta_2, t))
        / (1.0 - tf.pow(self.beta_1, t))
    )

    ms, vs, vhats = self._create_all_weights(params)
    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * tf.square(g)

        if self.amsgrad:
            vhat_t = tf.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon)
            self.updates.append(tf.compat.v1.assign(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon)

        self.updates.append(tf.compat.v1.assign(m, m_t))
        self.updates.append(tf.compat.v1.assign(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, "constraint", None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(tf.compat.v1.assign(p, new_p))
    return self.updates
def call(self, y_true, y_pred):
  error = tf.pow(tf.abs(tf.squeeze(y_pred) - y_true), self._power)
  width = self._end_quantile - self._start_quantile
  quantile = 0.5 * (self._end_quantile + self._start_quantile)
  return ops.softquantiles(
      error, quantile, quantile_width=width, axis=0, **self._kwargs)
def oadam_update(g, alpha, beta_1, beta_2, epsilon, t, m, v):
  """Implements 'Algorithm 1' from [1]."""
  old_m = m
  old_v = v
  m = beta_1 * m + (1. - beta_1) * g  # Biased first moment estimate.
  v = beta_2 * v + (1. - beta_2) * g * g  # Biased second raw moment estimate.
  m_hat = m / (1. - tf.pow(beta_1, t))  # Bias corrected 1st moment estimate.
  v_hat = v / (1. - tf.pow(beta_2, t))  # Bias corrected 2nd moment estimate.
  if t == 1:
    update = alpha * m_hat / (tf.sqrt(v_hat) + epsilon)
  else:
    # Old bias corrected moment estimates.
    old_m_hat = old_m / (1. - tf.pow(beta_1, (t - 1)))
    old_v_hat = old_v / (1. - tf.pow(beta_2, (t - 1)))
    update = alpha * (2 * m_hat / (tf.sqrt(v_hat) + epsilon) -
                      old_m_hat / (tf.sqrt(old_v_hat) + epsilon))
  return update, m, v
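# Hedged usage sketch for oadam_update above (optimistic Adam): apply the
# returned update to a toy quadratic, feeding the returned (m, v) state back
# in on each step. The variable name `params` is illustrative only.
import tensorflow as tf

params = tf.Variable([1.0, -2.0])
m = tf.zeros_like(params)
v = tf.zeros_like(params)
for t in range(1, 6):
    grad = 2.0 * params  # gradient of sum(params ** 2)
    update, m, v = oadam_update(
        grad, alpha=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
        t=float(t), m=m, v=v)
    params.assign_sub(update)
print(params.numpy())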
def geomspace(start, stop, num=50, endpoint=True, dtype=float):  # pylint: disable=missing-docstring
  if dtype:
    dtype = utils.result_type(dtype)
  if num < 0:
    raise ValueError('Number of samples {} must be non-negative.'.format(num))
  if not num:
    return empty([0])
  step = 1.
  if endpoint:
    if num > 1:
      step = tf.pow((stop / start), 1 / (num - 1))
  else:
    step = tf.pow((stop / start), 1 / num)
  result = tf.cast(tf.range(num), step.dtype)
  result = tf.pow(step, result)
  result = tf.multiply(result, start)
  if dtype:
    result = tf.cast(result, dtype=dtype)
  return arrays_lib.tensor_to_ndarray(result)
def test_trimmed(self, start, end, power):
  loss_fn = losses.TrimmedRegressionLoss(
      start_quantile=start, end_quantile=end, power=power)
  loss = loss_fn(self._y_true, self._y_pred)
  start_index = int(start * self._num_points)
  end_index = int(end * self._num_points)
  selected = tf.pow(self._values[start_index:end_index], power)
  expected_loss = tf.math.reduce_mean(selected)
  self.assertAllClose(loss, expected_loss, 0.2, 0.2)
def _log_prob(self, x):
  loc = tf.convert_to_tensor(self.loc)
  scale = tf.convert_to_tensor(self.scale)
  power = tf.convert_to_tensor(self.power)
  one = tf.constant(1., dtype=self.dtype)
  two = tf.constant(2., dtype=self.dtype)
  log_normalization = (tf.math.log(two) + tf.math.log(scale) +
                       tf.math.lgamma(one + tf.math.reciprocal(power)))
  log_unnormalized = -tf.pow(tf.abs(x - loc) / scale, power)
  return log_unnormalized - log_normalization
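# Hedged cross-check for _log_prob above: it implements the generalized normal
# log-density
#   log p(x) = -(|x - loc| / scale) ** power
#              - log(2 * scale * Gamma(1 + 1 / power)),
# which agrees with scipy.stats.gennorm (shape parameter == power). Values
# below are illustrative only.
import numpy as np
from scipy.special import gammaln
from scipy.stats import gennorm

x, loc, scale, power = 0.7, 0.0, 1.5, 3.0
manual = -abs((x - loc) / scale) ** power - (
    np.log(2.0) + np.log(scale) + gammaln(1.0 + 1.0 / power))
print(np.isclose(manual, gennorm.logpdf(x, power, loc=loc, scale=scale)))  # True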
def _prepare_local(self, var_device, var_dtype, apply_state):
  super(Adam, self)._prepare_local(var_device, var_dtype, apply_state)

  local_step = tf.cast(self.iterations + 1, var_dtype)
  beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype))
  beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype))
  beta_1_power = tf.pow(beta_1_t, local_step)
  beta_2_power = tf.pow(beta_2_t, local_step)
  lr = (apply_state[(var_device, var_dtype)]['lr_t'] *
        (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
  apply_state[(var_device, var_dtype)].update(
      dict(
          lr=lr,
          epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
          beta_1_t=beta_1_t,
          beta_1_power=beta_1_power,
          one_minus_beta_1_t=1 - beta_1_t,
          beta_2_t=beta_2_t,
          beta_2_power=beta_2_power,
          one_minus_beta_2_t=1 - beta_2_t))