def students_t_nll(x, df, scale):
  """Negative log-likelihood of a generalized Student's t (no TFP needed).

  The density is evaluated with zero location, so `x` is the deviation from
  the mode.

  Args:
    x: Value(s) at which the negative log-density is evaluated.
    df: Degrees of freedom.
    scale: Scale parameter; only `|scale|` enters the normalizer.

  Returns:
    The negative log-density of `x`.
  """
  standardized_sq = (x / scale)**2.
  kernel_term = (df + 1.) * tf.math.log1p(standardized_sq / df)
  # The terms are accumulated in the same order as a single left-to-right sum.
  nll = 0.5 * (kernel_term + tf.math.log(df))
  nll = nll + tf.math.log(tf.abs(scale))
  nll = nll + tf.math.lgamma(0.5 * df)
  nll = nll - tf.math.lgamma(0.5 * df + 0.5)
  return nll + 0.5 * np.log(np.pi)
def __init__(self,
             state_dim,
             action_dim,
             log_interval,
             actor_lr=1e-3,
             critic_lr=1e-3,
             alpha_init=1.0,
             learn_alpha=True,
             algae_alpha=1.0,
             use_dqn=True,
             use_init_states=True,
             exponent=2.0):
  """Creates networks.

  Args:
    state_dim: State size.
    action_dim: Action size.
    log_interval: Log losses every N steps.
    actor_lr: Actor learning rate.
    critic_lr: Critic learning rate.
    alpha_init: Initial temperature value for causal entropy regularization.
    learn_alpha: Whether to learn alpha or not.
    algae_alpha: Algae regularization weight.
    use_dqn: Whether to use double networks for target value.
    use_init_states: Whether to use initial states in objective.
    exponent: Exponent p of function f(x) = |x|^p / p.

  Raises:
    ValueError: If `exponent` is not strictly greater than 1.
  """
  # Validate first so that a bad exponent fails before any network, optimizer
  # or variable has been built.
  if exponent <= 1:
    raise ValueError(
        'Exponent must be greater than 1, but received %f.' % exponent)

  self.actor = Actor(state_dim, action_dim)
  self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

  self.avg_actor_loss = tf.keras.metrics.Mean('actor_loss', dtype=tf.float32)
  self.avg_alpha_loss = tf.keras.metrics.Mean('alpha_loss', dtype=tf.float32)
  self.avg_actor_entropy = tf.keras.metrics.Mean(
      'actor_entropy', dtype=tf.float32)
  self.avg_alpha = tf.keras.metrics.Mean('alpha', dtype=tf.float32)
  self.avg_lambda = tf.keras.metrics.Mean('lambda', dtype=tf.float32)
  self.use_init_states = use_init_states

  if use_dqn:
    self.critic = DoubleCritic(state_dim, action_dim)
    self.critic_target = DoubleCritic(state_dim, action_dim)
  else:
    self.critic = Critic(state_dim, action_dim)
    self.critic_target = Critic(state_dim, action_dim)
  # tau=1.0 requests a full (not partial) update of the target from the critic.
  soft_update(self.critic, self.critic_target, tau=1.0)

  self._lambda = tf.Variable(0.0, trainable=True)
  self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
  self.avg_critic_loss = tf.keras.metrics.Mean(
      'critic_loss', dtype=tf.float32)

  # The temperature is stored in log space.
  self.log_alpha = tf.Variable(tf.math.log(alpha_init), trainable=True)
  self.learn_alpha = learn_alpha
  self.alpha_optimizer = tf.keras.optimizers.Adam()

  self.log_interval = log_interval
  self.algae_alpha = algae_alpha
  self.use_dqn = use_dqn

  self.exponent = exponent
  # f(x) = |x|^p / p. `fgrad` is x^(p-1) evaluated on the residual clipped to
  # [0, 1e6].
  self.f = lambda resid: tf.pow(tf.abs(resid), self.exponent) / self.exponent
  clip_resid = lambda resid: tf.clip_by_value(resid, 0.0, 1e6)
  self.fgrad = lambda resid: tf.pow(clip_resid(resid), self.exponent - 1)
def sym_exp_sigmoid(x, width=8.0):
  """Symmetrical version of exp_sigmoid centered at (0, 1e-7).

  `exp_sigmoid` is applied to an even function of `x`, so the result is the
  same for `x` and `-x`.

  Args:
    x: Input value(s); converted with `tf_float32`.
    width: Multiplier applied to `|x| / 2 - 1` before `exp_sigmoid`.

  Returns:
    `exp_sigmoid(width * (|x| / 2 - 1))`.
  """
  magnitude = tf.abs(tf_float32(x))
  shifted = magnitude / 2.0 - 1.0
  return exp_sigmoid(width * shifted)
def __init__(self,
             level_scale_prior=None,
             initial_level_prior=None,
             observed_time_series=None,
             name=None):
  """Specify a local level model.

  Args:
    level_scale_prior: optional `tfd.Distribution` instance specifying a prior
      on the `level_scale` parameter. If `None`, a heuristic default prior is
      constructed based on the provided `observed_time_series`.
      Default value: `None`.
    initial_level_prior: optional `tfd.Distribution` instance specifying a
      prior on the initial level. If `None`, a heuristic default prior is
      constructed based on the provided `observed_time_series`.
      Default value: `None`.
    observed_time_series: optional `float` `Tensor` of shape
      `batch_shape + [T, 1]` (omitting the trailing unit dimension is also
      supported when `T > 1`), specifying an observed time series. Any priors
      not explicitly set will be given default values according to the scale
      of the observed time series (or batch of time series). May optionally
      be an instance of `tfp.sts.MaskedTimeSeries`, which includes a mask
      `Tensor` to specify timesteps with missing observations.
      Default value: `None`.
    name: the name of this model component.
      Default value: 'LocalLevel'.
  """
  with tf1.name_scope(
      name, 'LocalLevel', values=[observed_time_series]) as name:

    dtype = dtype_util.common_dtype([level_scale_prior, initial_level_prior])

    # `observed_stddev` is needed below to scale the `level_scale` bijector
    # even when both priors are supplied, so the statistics are always
    # defined here rather than only when a prior is missing.
    if observed_time_series is not None:
      _, observed_stddev, observed_initial = (
          sts_util.empirical_statistics(observed_time_series))
    else:
      observed_stddev, observed_initial = (
          tf.convert_to_tensor(value=1., dtype=dtype),
          tf.convert_to_tensor(value=0., dtype=dtype))

    # Heuristic default priors. Overriding these may dramatically
    # change inference performance and results.
    if level_scale_prior is None:
      level_scale_prior = tfd.LogNormal(
          loc=tf.math.log(.05 * observed_stddev),
          scale=3.,
          name='level_scale_prior')
    if initial_level_prior is None:
      self._initial_state_prior = tfd.MultivariateNormalDiag(
          loc=observed_initial[..., tf.newaxis],
          scale_diag=(
              tf.abs(observed_initial) + observed_stddev)[..., tf.newaxis],
          name='initial_level_prior')
    else:
      # Lift the user-supplied prior to a distribution with a trailing
      # length-1 event dimension, built from its mean and stddev.
      self._initial_state_prior = tfd.MultivariateNormalDiag(
          loc=initial_level_prior.mean()[..., tf.newaxis],
          scale_diag=initial_level_prior.stddev()[..., tf.newaxis])

    super(LocalLevel, self).__init__(
        parameters=[
            Parameter('level_scale', level_scale_prior,
                      tfb.Chain([tfb.AffineScalar(scale=observed_stddev),
                                 tfb.Softplus()])),
        ],
        latent_size=1,
        name=name)
def interpolate(times,
                interval_values,
                interval_times,
                validate_args=False,
                dtype=None,
                name=None):
  """Performs the monotone convex interpolation.

  The monotone convex method is a scheme devised by Hagan and West (Ref [1]).
  It is a commonly used method to interpolate interest rate yield curves. For
  more details see Refs [1, 2].

  It is important to point out that the monotone convex method *does not* solve
  the standard interpolation problem but a modified one as described below.

  Suppose we are given a strictly increasing sequence of scalars (which we will
  refer to as time) `[t_1, t_2, ... t_n]` and a set of values
  `[f_1, f_2, ... f_n]`.
  The aim is to find a function `f(t)` defined on the interval `[0, t_n]` which
  satisfies (in addition to continuity and positivity conditions) the following

  ```None
    Integral[f(u), t_{i-1} <= u <= t_i] = f_i,  with t_0 = 0

  ```

  In the context of interest rate curve building, `f(t)` corresponds to the
  instantaneous forward rate at time `t` and the `f_i` correspond to the
  discrete forward rates that apply to the time period `[t_{i-1}, t_i]`.
  Furthermore, the integral of the forward curve is related to the yield curve
  by

  ```None
    Integral[f(u), 0 <= u <= t] = r(t) * t

  ```

  where `r(t)` is the interest rate that applies between `[0, t]` (the yield of
  a zero coupon bond paying a unit of currency at time `t`).

  This function computes both the interpolated value and the integral along
  the segment containing the supplied time. Specifically, given a time `t` such
  that `t_k <= t <= t_{k+1}`, this function computes the interpolated value
  `f(t)` and the value `Integral[f(u), t_k <= u <= t]`.

  This implementation of the method currently supports batching along the
  interpolation times but not along the interpolated curves (i.e. it is possible
  to evaluate the `f(t)` for `t` as a vector of times but not build multiple
  curves at the same time).

  ### Example

  ```python
  interval_times = tf.constant([0.25, 0.5, 1.0, 2.0, 3.0], dtype=dtype)
  interval_values = tf.constant([0.05, 0.051, 0.052, 0.053, 0.055],
                                dtype=dtype)
  times = tf.constant([0.25, 0.5, 1.0, 2.0, 3.0, 1.1], dtype=dtype)
  # Returns the following two values:
  # interpolated = [0.0505, 0.05133333, 0.05233333, 0.054, 0.0555, 0.05241]
  # integrated =  [0, 0, 0, 0, 0.055, 0.005237]
  # Note that the first four integrated values are zero. This is because
  # those interpolation times are at the start of their containing interval.
  # The fifth value (i.e. at 3.0) is not zero because this is the last
  # interval (i.e. it is the integral from 2.0 to 3.0).
  interpolated, integrated = interpolate(
      times, interval_values, interval_times)
  ```

  ### References:

  [1]: Patrick Hagan & Graeme West. Interpolation Methods for Curve
    Construction. Applied Mathematical Finance. Vol 13, No. 2, pp 89-129.
    June 2006.
    https://www.researchgate.net/publication/24071726_Interpolation_Methods_for_Curve_Construction
  [2]: Patrick Hagan & Graeme West. Methods for Constructing a Yield Curve.
    Wilmott Magazine, pp. 70-81. May 2008.

  Args:
    times: Non-negative rank 1 `Tensor` of any size. The times for which the
      interpolation has to be performed.
    interval_values: Rank 1 `Tensor` of the same shape and dtype as
      `interval_times`. The values associated to each of the intervals
      specified by the `interval_times`. Must have size at least 2.
    interval_times: Strictly positive rank 1 `Tensor` of real dtype containing
      increasing values. The endpoints of the intervals (i.e. `t_i` above.).
      Note that the left end point of the first interval is implicitly assumed
      to be 0. Must have size at least 2.
    validate_args: Python bool. If true, adds control dependencies asserting
      that `times` is non-negative and that `interval_times` is positive.
      Default value: False
    dtype: `tf.Dtype` to use when converting arguments to `Tensor`s. If not
      supplied, the default Tensorflow conversion will take place. Note that
      this argument does not do any casting.
      Default value: None.
    name: Python `str` name prefixed to Ops created by this class.
      Default value: None which is mapped to the default name 'interpolate'.

  Returns:
    A 2-tuple containing
      interpolated_values: Rank 1 `Tensor` of the same size and dtype as the
        `times`. The interpolated values at the supplied times.
      integrated_values: Rank 1 `Tensor` of the same size and dtype as the
        `times`. The integral of the interpolated function. The integral is
        computed from the largest interval time that is smaller than the time
        up to the given time.
  """
  with tf.compat.v1.name_scope(
      name,
      default_name='interpolate',
      values=[times, interval_times, interval_values]):
    times = tf.convert_to_tensor(times, dtype=dtype, name='times')
    interval_times = tf.convert_to_tensor(
        interval_times, dtype=dtype, name='interval_times')
    interval_values = tf.convert_to_tensor(
        interval_values, dtype=dtype, name='interval_values')

    # The assertion ops are only created when validation was requested.
    assertions = [
        tf.compat.v1.debugging.assert_non_negative(times),
        tf.compat.v1.debugging.assert_positive(interval_times)
    ] if validate_args else []

    with tf.compat.v1.control_dependencies(assertions):
      # Step 1: Find the values at the endpoints.
      endpoint_values = _interpolate_adjacent(interval_times, interval_values)
      endpoint_times = tf.concat([[0.0], interval_times], axis=0)
      intervals = piecewise.find_interval_index(
          times, endpoint_times, last_interval_is_closed=True)
      # Comparing to the notation used in the paper:
      # f_left -> f_{i-1}
      # f_right -> f_i
      # t_left -> t_{i-1}
      # t_right -> t_i
      # fd -> f^d_i
      # g0 -> g0
      # g1 -> g1
      # g1plus2g0 -> g1 + 2 g0 (boundary line A)
      # g0plus2g1 -> g0 + 2 g1 (boundary line B)
      # x -> x
      f_left = tf.gather(endpoint_values, intervals)
      f_right = tf.gather(endpoint_values, intervals + 1)
      # fd is the discrete forward associated to each interval.
      fd = tf.gather(interval_values, intervals)
      t_left = tf.gather(endpoint_times, intervals)
      t_right = tf.gather(endpoint_times, intervals + 1)
      interval_lengths = t_right - t_left
      # Position of each time inside its interval, rescaled by its length.
      x = (times - t_left) / interval_lengths

      # TODO(b/140410758): The calculation below can be done more efficiently
      # if we instead do the following:
      # 1. Subdivide the regions further so that each subregion corresponds
      #   to a single quadratic in x. (Region 2, 3 and 4 get divided into 2
      #   pieces for a total of 7 cases.
      # 2. For each interval (i.e. [t_i, t{i+1}]) the case that applies to
      #   a point falling in that region can be decided and the corresponding
      #   quadratic coefficients computed once and for all.
      # 3. The above information is built once for the supplied forwards.
      # 4. Given the above information and a set of times to interpolate for,
      #   we map each time to the appropriate interval and compute the
      #   quadratic function value using that x.

      g0 = f_left - fd
      g1 = f_right - fd
      g1plus2g0 = g1 + 2 * g0
      g0plus2g1 = g0 + 2 * g1

      result = tf.zeros_like(times)
      integrated = tf.zeros_like(times)

      # The method uses quadratic splines to do the interpolation.
      # The specific spline used depends on the relationship between the
      # boundary values (`g0` and `g1` above).
      # The two dimensional plane determined by these two values is divided
      # into four wedge sections referred to as region 1, 2, 3 and 4 below.
      # For details of how the regions are defined, see Fig. 4 in Ref [2].
      # The regions are applied in order 1, 2, 3, 4, so a later region
      # overrides an earlier one wherever both masks are set.
      for region_fn in (_region_1, _region_2, _region_3, _region_4):
        in_region, region_value, region_integral = region_fn(
            g1plus2g0, g0plus2g1, g0, g1, x)
        result = tf.where(in_region, region_value, result)
        integrated = tf.where(in_region, region_integral, integrated)

      # g0 = g1 = 0 requires special handling. Checking if the values are
      # legitimately zero requires we pay close attention to the numerical
      # precision issues. The tolerance is 1.1 times the gap between `fd` and
      # the next representable value in the direction of the endpoint.
      g0_eps = tf.abs(tf.math.nextafter(fd, f_left) - fd) * 1.1
      g1_eps = tf.abs(tf.math.nextafter(fd, f_right) - fd) * 1.1

      is_origin = ((tf.abs(g0) <= g0_eps) & (tf.abs(g1) <= g1_eps))

      result = tf.where(is_origin, tf.zeros_like(result), result)
      integrated = tf.where(is_origin, tf.zeros_like(integrated), integrated)

      return (result + fd, (integrated + fd * x) * interval_lengths)
def test_abs_square(self, dtype):
  """Checks `rk_util.abs_square` against `square(abs(.))` on complex inputs."""
  complex_samples = np.array([1 + 2j, 0.3 - 1j, 3.5 - 3.7j])
  inputs = tf.cast(complex_samples, dtype)
  expected = tf.math.square(tf.abs(inputs))
  actual = rk_util.abs_square(inputs)
  self.assertAllClose(actual, expected)
def reduce_weighted_logsumexp(logx,
                              w=None,
                              axis=None,
                              keep_dims=False,
                              return_sign=False,
                              name=None):
  """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  If all weights `w` are known to be positive, it is more efficient to directly
  use `reduce_logsumexp`, i.e., `tf.reduce_logsumexp(logx + tf.math.log(w))` is
  more efficient than `du.reduce_weighted_logsumexp(logx, w)`.

  Reduces `logx` along the dimensions given in `axis`. Unless `keep_dims` is
  true, the rank of the tensor is reduced by 1 for each entry in `axis`. If
  `keep_dims` is true, the reduced dimensions are retained with length 1.

  If `axis` has no entries, all dimensions are reduced, and a tensor with a
  single element is returned.

  This function is more numerically stable than log(sum(w * exp(input))). It
  avoids overflows caused by taking the exp of large inputs and underflows
  caused by taking the log of small inputs.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`.
    axis: The dimensions to reduce. If `None` (the default), reduces all
      dimensions. Must be in the range `[-rank(logx), rank(logx))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
  with tf.name_scope(name or 'reduce_weighted_logsumexp'):
    logx = tf.convert_to_tensor(logx, name='logx')

    # Unweighted case: plain logsumexp, whose sign is always +1.
    if w is None:
      lswe = tf.reduce_logsumexp(logx, axis=axis, keepdims=keep_dims)
      return (lswe, tf.ones_like(lswe)) if return_sign else lswe

    w = tf.convert_to_tensor(w, dtype=logx.dtype, name='w')
    log_abs_terms = logx + tf.math.log(tf.abs(w))
    shift = tf.reduce_max(log_abs_terms, axis=axis, keepdims=True)
    # If the largest element is `-inf` or `inf` then we don't bother subtracting
    # off the max. We do this because otherwise we'd get `inf - inf = NaN`. That
    # this is ok follows from the fact that we're actually free to subtract any
    # value we like, so long as we add it back after taking the `log(sum(...))`.
    shift = tf.where(
        tf.math.is_inf(shift), tf.zeros([], shift.dtype), shift)
    signed_terms = tf.sign(w) * tf.exp(log_abs_terms - shift)
    signed_sum = tf.reduce_sum(signed_terms, axis=axis, keepdims=keep_dims)
    if not keep_dims:
      # The shift was computed with keepdims=True so that it broadcasts
      # against the terms; drop those dimensions to match the reduced sum.
      shift = tf.squeeze(shift, axis)
    sgn = tf.sign(signed_sum)
    lswe = shift + tf.math.log(sgn * signed_sum)
    return (lswe, sgn) if return_sign else lswe
def expected_calibration_error(num_bins,
                               logits=None,
                               labels_true=None,
                               labels_predicted=None,
                               name=None):
  """Compute the Expected Calibration Error (ECE).

  This method implements equation (3) in [1]. In this equation the probability
  of the decided label being correct is used to estimate the calibration
  property of the predictor.

  Note: a trade-off exists between using a small number of `num_bins` and the
  estimation reliability of the ECE. In particular, this method may produce
  unreliable ECE estimates in case there are few samples available in some
  bins. As an alternative to this method, consider also using
  `bayesian_expected_calibration_error`.

  #### References
  [1]: Chuan Guo, Geoff Pleiss, Yu Sun, Kilian Q. Weinberger,
       On Calibration of Modern Neural Networks.
       Proceedings of the 34th International Conference on Machine Learning
       (ICML 2017).
       arXiv:1706.04599
       https://arxiv.org/pdf/1706.04599.pdf

  Args:
    num_bins: int, number of probability bins, e.g. 10.
    logits: Tensor, (n,nlabels), with logits for n instances and nlabels.
    labels_true: Tensor, (n,), with tf.int32 or tf.int64 elements containing
      ground truth class labels in the range [0, nlabels).
    labels_predicted: Tensor, (n,), with tf.int32 or tf.int64 elements
      containing decisions of the predictive system. If `None`, we will use
      the argmax decision using the `logits`.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    ece: Tensor, scalar, tf.float32.
  """
  with tf.name_scope(name or 'expected_calibration_error'):
    logits = tf.convert_to_tensor(logits)
    labels_true = tf.convert_to_tensor(labels_true)
    if labels_predicted is not None:
      labels_predicted = tf.convert_to_tensor(labels_predicted)

    # Compute empirical counts over the events defined by the sets
    # {incorrect,correct}x{0,1,..,num_bins-1}, as well as the empirical averages
    # of predicted probabilities in each probability bin.
    raw_counts, pmean_observed = _compute_calibration_bin_statistics(
        num_bins,
        logits=logits,
        labels_true=labels_true,
        labels_predicted=labels_predicted)
    counts = tf.cast(raw_counts, tf.float32)

    # Marginal probability of observing each probability bin.
    per_bin_total = tf.reduce_sum(counts, axis=0)
    bin_probability = per_bin_total / tf.reduce_sum(per_bin_total)

    # Probability of a correct decision given the bin. The smallest positive
    # normal float32 is added to the denominator so empty bins do not divide
    # by zero.
    tiny = np.finfo(np.float32).tiny
    correct_given_bin = counts[1, :] / (per_bin_total + tiny)

    # Compute the ECE statistic as defined in reference [1].
    calibration_gap = tf.abs(correct_given_bin - pmean_observed)
    ece = tf.reduce_sum(bin_probability * calibration_gap)

  return ece
def abs_func(x):
  """Returns the sum of absolute values of `x` over its last axis."""
  magnitudes = tf.abs(x)
  return tf.reduce_sum(input_tensor=magnitudes, axis=-1)
def _process(job):
  """Turns one `(file_num, rf)` job into a `(file_num, features)` example.

  Reads the metadata for `file_num` from an HDF5 `.mat` file under `_BUCKET`,
  orders and stacks the RF records by their `'i'` field, filters them with the
  `bf_params` coefficients and derives the representation selected by
  `self.builder_config.name`.

  NOTE: `self` is not a parameter of this function; it is resolved from the
  enclosing scope.

  Args:
    job: A pair `(file_num, rf)`. `rf` is an iterable of mappings with the
      keys `'i'`, `'nowall'` and `'wall'`.

  Yields:
    A single `(file_num, {'data': varying, 'params': common})` tuple.
  """
  file_num, rf = tuple(job)
  metadata_filepath = '{}/results/val_{}/structs_42.mat'.format(
      _BUCKET, file_num)
  target_jpeg_filepath = '{}/images/val_{}.JPEG'.format(
      _BUCKET, file_num + 5000)
  body_jpeg_filepath = '{}/images/val_{}.JPEG'.format(_BUCKET, file_num)

  # Every dataset is copied into memory while the file is open, so both
  # handles can be released (also on error) before the heavy processing.
  with tf.io.gfile.GFile(metadata_filepath, 'rb') as metadata_file:
    with h5py.File(metadata_file, mode='r') as m:

      def _squeezed(key):
        """Loads the dataset at `key` into memory and squeezes it."""
        return np.squeeze(m.get(key)[()])

      elements = m.get('/xdc/out')[()].shape[-1]
      on_elements = [
          np.arange(i, _SUB_APERTURE_SIZE + i, dtype=np.int64)
          for i in range(len(_TX_POS))
      ]
      # Loop-invariant: read the delays once instead of once per transmit.
      delays = np.squeeze(m.get('xdc/delays'))[()]
      tx_delays = np.zeros((len(_TX_POS), elements))
      on_bool = np.zeros((len(_TX_POS), elements), dtype=bool)
      for i, on_ele in enumerate(on_elements):
        tx_delays[i, on_ele] = delays
        on_bool[i, on_ele] = True

      common = {
          'c': _squeezed('acq_params/c'),
          'fs': _squeezed('acq_params/fs'),
          'tiny_imagenet': {
              'target_number': file_num + 5000,
              'body_wall_number': file_num,
              'target': target_jpeg_filepath,
              'clutter_wall': body_jpeg_filepath
          },
          'probe': {
              'tx_delays': tx_delays,
              'element_on_mask': on_bool,
              # Read twice on purpose so the two entries do not share memory.
              'element_positions': np.transpose(m.get('xdc/out')[()]),
              'rx_positions': np.transpose(m.get('xdc/out')[()]),
              'tx_positions': np.transpose(
                  np.stack([
                      _TX_POS,
                      np.zeros_like(_TX_POS),
                      np.zeros_like(_TX_POS)
                  ])),
              'impulse': _squeezed('xdc/impulse'),
              'pulse': _squeezed('xdc/pulse'),
              'excitation': _squeezed('xdc/excitation'),
              'a': _squeezed('bf_params/a'),
              'b': _squeezed('bf_params/b'),
              'pitch': _squeezed('xdc/pitch'),
              't0': _squeezed('xdc/t0'),
              'f0_hz': _squeezed('acq_params/f0'),
              'focus_depth': np.squeeze(m.get('xdc/focus')[()][2])
          },
          'sim': {
              'B': _squeezed('input_vars/B'),
              'cmap': _squeezed('field_maps/cmap'),
              'atten': _squeezed('input_vars/atten'),
              'cfl': _squeezed('input_vars/cfl'),
              'ncycles': _squeezed('input_vars/ncycles'),
              'omega0': _squeezed('input_vars/omega0'),
              'p0': _squeezed('input_vars/p0'),
              'ppw': _squeezed('input_vars/ppw'),
              'rho': _squeezed('input_vars/rho'),
              'td': _squeezed('input_vars/td'),
              'v': _squeezed('input_vars/v'),
              'grid': {
                  'dT': _squeezed('grid_vars/dT'),
                  'dY': _squeezed('grid_vars/dY'),
                  'dZ': _squeezed('grid_vars/dZ'),
                  'nT': _squeezed('grid_vars/nT'),
                  'nY': _squeezed('grid_vars/nY'),
                  'nZ': _squeezed('grid_vars/nZ'),
                  'wY': _squeezed('input_vars/wY'),
                  'wZ': _squeezed('input_vars/wZ'),
              }
          }
      }

  a = common['probe']['a'].astype(np.float32)
  b = common['probe']['b'].astype(np.float32)
  fs = common['fs'].astype(np.float32)
  f0 = common['probe']['f0_hz'].astype(np.float32)

  # Stack the records in increasing order of their 'i' field.
  rf = list(rf)
  ordering = np.argsort(np.array([r['i'] for r in rf]))
  nowall = np.stack([rf[idx]['nowall'] for idx in ordering])
  wall = np.stack([rf[idx]['wall'] for idx in ordering])
  rf = None  # Drop the reference to the raw records.

  nowall = signal.lfilter(b, a, nowall, axis=-1)
  wall = signal.lfilter(b, a, wall, axis=-1)

  def _iq_features(nowall_rf, wall_rf):
    """Demodulates both signals and splits them into float16 real/imag."""
    nowall_iq = self.tf_demodulate(nowall_rf, fs, f0, axis=-1)
    wall_iq = self.tf_demodulate(wall_rf, fs, f0, axis=-1)
    return {
        'without_wall_real': tf.cast(tf.math.real(nowall_iq), tf.float16),
        'without_wall_imag': tf.cast(tf.math.imag(nowall_iq), tf.float16),
        'with_wall_real': tf.cast(tf.math.real(wall_iq), tf.float16),
        'with_wall_imag': tf.cast(tf.math.imag(wall_iq), tf.float16)
    }

  # Compare config names by value; `is` on strings tests object identity.
  config_name = self.builder_config.name
  if config_name == 'channel':
    varying = {'without_wall': nowall, 'with_wall': wall}
  elif config_name == 'iq_channel':
    varying = _iq_features(nowall, wall)
  elif config_name in ('dynamic_rx_beamformed', 'b_mode',
                       'iq_dynamic_rx_beamformed'):
    s = self.beamform_dynamic_rx(
        nowall.shape,
        common['probe']['tx_positions'],
        common['probe']['rx_positions'],
        s0=common['probe']['t0'] * common['fs'],
        fs=common['fs'],
        data_dtype=tf.float64)
    s = tf.cast(s, tf.float32)
    nowall = self.apply_delays(s, nowall)[..., 3117:6234]
    wall = self.apply_delays(s, wall)[..., 3117:6234]
    if config_name == 'iq_dynamic_rx_beamformed':
      varying = _iq_features(nowall, wall)
    elif config_name == 'b_mode':
      on_mask = tf.cast(
          common['probe']['element_on_mask'][..., None], tf.float32)

      def _b_mode(delayed):
        """Masked sum over axis 1, envelope, peak-normalize, then `tf_db`."""
        env = tf.abs(
            self.tf_hilbert(tf.reduce_sum(delayed * on_mask, axis=1), axis=-1))
        env = env / tf.reduce_max(env)
        return self.tf_db(env)

      varying = {
          'without_wall': tf.cast(_b_mode(nowall), tf.float16),
          'with_wall': tf.cast(_b_mode(wall), tf.float16)
      }
    # NOTE(review): 'dynamic_rx_beamformed' itself never assigns `varying`, so
    # the `yield` below fails with an unbound-variable error for that config,
    # as it always has. Confirm the intended features and add them here.

  beam.metrics.Metrics.counter('results', "rf-processed").inc()
  yield file_num, {'data': varying, 'params': common}
def __init__(self,
             order,
             coefficients_prior=None,
             level_scale_prior=None,
             initial_state_prior=None,
             coefficient_constraining_bijector=None,
             observed_time_series=None,
             name=None):
  """Specify an autoregressive model.

  Args:
    order: scalar Python positive `int` specifying the number of past
      timesteps to regress on.
    coefficients_prior: optional `tfd.Distribution` instance specifying a
      prior on the `coefficients` parameter. If `None`, a default standard
      normal (`tfd.MultivariateNormalDiag(scale_diag=tf.ones([order]))`) prior
      is used.
      Default value: `None`.
    level_scale_prior: optional `tfd.Distribution` instance specifying a prior
      on the `level_scale` parameter. If `None`, a heuristic default prior is
      constructed based on the provided `observed_time_series`.
      Default value: `None`.
    initial_state_prior: optional `tfd.Distribution` instance specifying a
      prior on the initial state, corresponding to the values of the process
      at a set of size `order` of imagined timesteps before the initial step.
      If `None`, a heuristic default prior is constructed based on the
      provided `observed_time_series`.
      Default value: `None`.
    coefficient_constraining_bijector: optional `tfb.Bijector` instance
      representing a constraining mapping for the autoregressive coefficients.
      For example, `tfb.Tanh()` constrains the coefficients to lie in
      `(-1, 1)`, while `tfb.Softplus()` constrains them to be positive, and
      `tfb.Identity()` implies no constraint. If `None`, the default behavior
      constrains the coefficients to lie in `(-1, 1)` using a `Tanh` bijector.
      Default value: `None`.
    observed_time_series: optional `float` `Tensor` of shape
      `batch_shape + [T, 1]` (omitting the trailing unit dimension is also
      supported when `T > 1`), specifying an observed time series. Any priors
      not explicitly set will be given default values according to the scale
      of the observed time series (or batch of time series). May optionally
      be an instance of `tfp.sts.MaskedTimeSeries`, which includes a mask
      `Tensor` to specify timesteps with missing observations.
      Default value: `None`.
    name: the name of this model component.
      Default value: 'Autoregressive'.

  Raises:
    ValueError: If the event shape of `coefficients_prior` is fully defined
      and its first dimension differs from `order`.
  """
  with tf1.name_scope(
      name, 'Autoregressive', values=[observed_time_series]) as name:
    if observed_time_series is not None:
      masked_time_series = (
          sts_util.canonicalize_observed_time_series_with_mask(
              observed_time_series))
    else:
      masked_time_series = None

    series_values = None
    if masked_time_series is not None:
      series_values = masked_time_series.time_series
    dtype = dtype_util.common_dtype(
        [series_values, coefficients_prior, level_scale_prior,
         initial_state_prior],
        dtype_hint=tf.float32)

    if observed_time_series is not None:
      _, observed_stddev, observed_initial = sts_util.empirical_statistics(
          masked_time_series)
    else:
      observed_stddev = tf.convert_to_tensor(value=1., dtype=dtype)
      observed_initial = tf.convert_to_tensor(value=0., dtype=dtype)

    # Ones of shape `shape(observed_initial) + [order]`, used to broadcast the
    # default priors over the batch and the `order` lags.
    ones_shape = tf.concat(
        [tf.shape(input=observed_initial), [order]], axis=0)
    batch_ones = tf.ones(ones_shape, dtype=dtype)

    # Heuristic default priors. Overriding these may dramatically
    # change inference performance and results.
    if coefficients_prior is None:
      coefficients_prior = tfd.MultivariateNormalDiag(scale_diag=batch_ones)
    if level_scale_prior is None:
      level_scale_prior = tfd.LogNormal(
          loc=tf.math.log(0.05 * observed_stddev), scale=3.)

    coefficients_event_shape = coefficients_prior.event_shape
    if (coefficients_event_shape.is_fully_defined() and
        order != coefficients_event_shape[0]):
      raise ValueError("Prior dimension {} doesn't match order {}.".format(
          coefficients_prior.event_shape[0], order))

    if initial_state_prior is None:
      default_scale = tf.abs(observed_initial) + observed_stddev
      initial_state_prior = tfd.MultivariateNormalDiag(
          loc=observed_initial[..., tf.newaxis] * batch_ones,
          scale_diag=default_scale[..., tf.newaxis] * batch_ones)

    self._order = order
    self._coefficients_prior = coefficients_prior
    self._level_scale_prior = level_scale_prior
    self._initial_state_prior = initial_state_prior

    if coefficient_constraining_bijector is None:
      coefficient_constraining_bijector = tfb.Tanh()

    coefficients_parameter = Parameter(
        'coefficients', coefficients_prior, coefficient_constraining_bijector)
    level_scale_parameter = Parameter(
        'level_scale', level_scale_prior,
        tfb.Chain([tfb.AffineScalar(scale=observed_stddev), tfb.Softplus()]))
    super(Autoregressive, self).__init__(
        parameters=[coefficients_parameter, level_scale_parameter],
        latent_size=order,
        name=name)
def _forward_log_det_jacobian(self, x):
  """Returns `-2 * log|x|`, after passing `x` through `_maybe_assert_valid`."""
  validated = self._maybe_assert_valid(x)
  log_magnitude = tf.math.log(tf.abs(validated))
  return -2. * log_magnitude
def _forward(self, x):
  """Forward transformation: the elementwise absolute value of `x`."""
  magnitude = tf.abs(x)
  return magnitude
def _forward_log_det_jacobian(self, x):
  """Returns `log|power| + (power - 1) * log|x|`.

  The second term uses `xlogy`, so it evaluates to zero where `power == 1`.
  """
  power = tf.cast(self.power, x.dtype)
  assertions = self._assertions(x, power=power)
  # The ops are built inside the context so that they depend on the checks.
  with tf.control_dependencies(assertions):
    log_abs_power = tf.math.log(tf.abs(power))
    scaled_log_abs_x = tf.math.xlogy(power - 1., tf.math.abs(x))
    return log_abs_power + scaled_log_abs_x
def _sample_n(self, n, seed=None):
  """Draws `n` samples by sampling around a fixed axis and then rotating.

  The first coordinate is sampled either by `_sample_3d` (when the event size
  is 3) or by a rejection loop. The remaining coordinates are a normalized
  Gaussian draw scaled to give unit-length vectors. The result is passed
  through `_rotate` with the distribution's mean direction.

  Args:
    n: Number of samples; used as the leading dimension of the sample shape.
    seed: Seed, split into one stream for the first coordinate and one for
      the remaining coordinates.

  Returns:
    The rotated samples, i.e. `self._rotate(samples, mean_direction=...)`.
  """
  dim0_seed, otherdims_seed = samplers.split_seed(
      seed, salt='von_mises_fisher')
  # The sampling strategy relies on the fact that vMF variates are symmetric
  # about the mean direction. Accordingly, if we have a sampling strategy for
  # the away-from-mean angle, then we can uniformly sample the remaining
  # dimensions on the S^{dim-2} sphere, and rotate these samples from a
  # (1, 0, 0, ..., 0)-mode distribution into the target orientation.
  #
  # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
  # von-Mises distributed `x` value in [-1, 1], then uniformly select what
  # amounts to a "up" or "down" additional degree of freedom after unit
  # normalizing, followed by a final rotation to the desired mean direction
  # from a basis of (1, 0).
  #
  # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
  # unit sphere over which the distribution is uniform, in particular the
  # circle where x = \hat{x} intersects the unit sphere. We pick a point on
  # that circle, then rotate to the desired mean direction from a basis of
  # (1, 0, 0).
  mean_direction = tf.convert_to_tensor(self.mean_direction)
  concentration = tf.convert_to_tensor(self.concentration)
  # Prefer the statically known event size; fall back to the dynamic one.
  event_dim = (
      tf.compat.dimension_value(self.event_shape[0]) or
      self._event_shape_tensor(mean_direction=mean_direction)[0])

  sample_batch_shape = ps.concat(
      [[n],
       self._batch_shape_tensor(
           mean_direction=mean_direction, concentration=concentration)],
      axis=0)
  dim = tf.cast(event_dim - 1, self.dtype)
  if event_dim == 3:
    samples_dim0 = self._sample_3d(n,
                                   mean_direction=mean_direction,
                                   concentration=concentration,
                                   seed=dim0_seed)
  else:
    # Wood'94 provides a rejection algorithm to sample the x coordinate.
    # Wood'94 definition of b:
    # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
    # https://stats.stackexchange.com/questions/156729 suggests:
    b = dim / (2 * concentration +
               tf.sqrt(4 * concentration**2 + dim**2))
    # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
    #     https://github.com/nicola-decao/s-vae-tf/
    x = (1 - b) / (1 + b)
    c = concentration * x + dim * tf.math.log1p(-x**2)
    beta = beta_lib.Beta(dim / 2, dim / 2)

    def cond_fn(w, should_continue, seed):
      # Keep looping while any element is still awaiting an accepted draw.
      del w, seed
      return tf.reduce_any(should_continue)

    def body_fn(w, should_continue, seed):
      """While loop body for sampling the angle `w`."""
      beta_seed, unif_seed, next_seed = samplers.split_seed(seed, n=3)
      z = beta.sample(sample_shape=sample_batch_shape, seed=beta_seed)
      # set_shape needed here because of b/139013403
      tensorshape_util.set_shape(z, w.shape)
      # Only elements that are still pending receive a new proposal.
      w = tf.where(should_continue,
                   (1. - (1. + b) * z) / (1. - (1. - b) * z),
                   w)
      if not self.allow_nan_stats:
        w = tf.debugging.check_numerics(w, 'w')
      unif = samplers.uniform(
          sample_batch_shape, seed=unif_seed, dtype=self.dtype)
      # set_shape needed here because of b/139013403
      tensorshape_util.set_shape(unif, w.shape)
      should_continue = should_continue & (
          concentration * w + dim * tf.math.log1p(-x * w) - c <
          # Use log1p(-unif) to prevent log(0) and ensure that log(1) is
          # possible.
          tf.math.log1p(-unif))
      return w, should_continue, next_seed

    w = tf.zeros(sample_batch_shape, dtype=self.dtype)
    should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
    samples_dim0, _, _ = tf.while_loop(
        cond=cond_fn, body=body_fn,
        loop_vars=(w, should_continue, dim0_seed))
    samples_dim0 = samples_dim0[..., tf.newaxis]

  # NOTE(review): this reads `self._allow_nan_stats` while the checks below
  # read `self.allow_nan_stats`; presumably the same flag -- confirm.
  if not self._allow_nan_stats:
    # Verify samples are w/in -1, 1, with useful error output tensors (top
    # value rather than all values).
    with tf.control_dependencies([
        assert_util.assert_less_equal(
            samples_dim0, dtype_util.as_numpy_dtype(self.dtype)(1.01)),
        assert_util.assert_greater_equal(
            samples_dim0, dtype_util.as_numpy_dtype(self.dtype)(-1.01)),
    ]):
      samples_dim0 = tf.identity(samples_dim0)
  samples_otherdims_shape = ps.concat(
      [sample_batch_shape, [event_dim - 1]], axis=0)
  unit_otherdims = tf.math.l2_normalize(
      samplers.normal(
          samples_otherdims_shape, seed=otherdims_seed, dtype=self.dtype),
      axis=-1)
  samples = tf.concat(
      [
          samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
          tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
      ],
      axis=-1)
  samples = tf.math.l2_normalize(samples, axis=-1)
  if not self.allow_nan_stats:
    samples = tf.debugging.check_numerics(samples, 'samples')

  # Runtime assert that samples are unit length.
  if not self.allow_nan_stats:
    # `top_k` with its default k keeps only the largest deviation.
    worst, _ = tf.math.top_k(
        tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
    with tf.control_dependencies([
        assert_util.assert_near(
            dtype_util.as_numpy_dtype(self.dtype)(0),
            worst,
            atol=1e-4,
            summarize=100)
    ]):
      samples = tf.identity(samples)
  # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0).
  # Now, we move the mode to `self.mean_direction` using a rotation matrix.
  if not self.allow_nan_stats:
    # Assert that the basis vector rotates to the mean direction, as expected.
    basis = tf.cast(
        tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0), self.dtype)
    with tf.control_dependencies([
        assert_util.assert_less(
            tf.linalg.norm(
                self._rotate(basis, mean_direction=mean_direction) -
                mean_direction,
                axis=-1),
            dtype_util.as_numpy_dtype(self.dtype)(1e-5))
    ]):
      return self._rotate(samples, mean_direction=mean_direction)
  return self._rotate(samples, mean_direction=mean_direction)
def error(cost, f, g, eps, b):
    """Return the largest relative gap between the transport sums and `b`.

    `transport(cost, f, g, eps)` is summed over axis 1 and compared
    elementwise with `b`; the maximum of `|sum - b| / b` is returned.
    """
    summed_plan = tf.math.reduce_sum(transport(cost, f, g, eps), axis=1)
    relative_gap = tf.abs(summed_plan - b) / b
    return tf.reduce_max(relative_gap[:])
def __init__(self,
             level_scale_prior=None,
             slope_scale_prior=None,
             initial_level_prior=None,
             initial_slope_prior=None,
             observed_time_series=None,
             name=None):
    """Build a local linear trend component.

    Every prior left as `None` is replaced by a heuristic default derived
    from the empirical statistics of `observed_time_series` (or from a unit
    scale and zero initial value when no series is given).

    Args:
      level_scale_prior: optional `tfd.Distribution` used as the prior on
        the `level_scale` parameter.
        Default value: `None`.
      slope_scale_prior: optional `tfd.Distribution` used as the prior on
        the `slope_scale` parameter.
        Default value: `None`.
      initial_level_prior: optional `tfd.Distribution` used as the prior on
        the initial level.
        Default value: `None`.
      initial_slope_prior: optional `tfd.Distribution` used as the prior on
        the initial slope.
        Default value: `None`.
      observed_time_series: optional `float` `Tensor` of shape
        `batch_shape + [T, 1]` (the trailing unit dimension may be omitted
        when `T > 1`) holding an observed time series. `NaN`s are treated
        as missing observations; missingness may also be given explicitly
        through a `tfp.sts.MaskedTimeSeries`. The scale of this series (or
        batch of series) determines the default priors.
        Default value: `None`.
      name: the name of this model component.
        Default value: 'LocalLinearTrend'.
    """
    # Must stay first: snapshots the constructor arguments before any other
    # local name is bound.
    init_parameters = dict(locals())

    with tf.name_scope(name or 'LocalLinearTrend') as name:
        if observed_time_series is None:
            observed_stddev, observed_initial = 1., 0.
        else:
            _, observed_stddev, observed_initial = (
                sts_util.empirical_statistics(observed_time_series))

        # Heuristic defaults. Replacing them can change inference speed and
        # results substantially.
        if level_scale_prior is None:
            level_scale_prior = tfd.LogNormal(
                loc=tf.math.log(.05 * observed_stddev),
                scale=3.,
                name='level_scale_prior')
        if slope_scale_prior is None:
            slope_scale_prior = tfd.LogNormal(
                loc=tf.math.log(.05 * observed_stddev),
                scale=3.,
                name='slope_scale_prior')
        if initial_level_prior is None:
            initial_level_prior = tfd.Normal(
                loc=observed_initial,
                scale=tf.abs(observed_initial) + observed_stddev,
                name='initial_level_prior')
        if initial_slope_prior is None:
            initial_slope_prior = tfd.Normal(
                loc=0.,
                scale=observed_stddev,
                name='initial_slope_prior')

        tf.debugging.assert_same_float_dtype([
            level_scale_prior,
            slope_scale_prior,
            initial_level_prior,
            initial_slope_prior,
        ])

        # The latent state is (level, slope); its prior is the product of the
        # two scalar initial priors, matched by mean and stddev.
        state_priors = (initial_level_prior, initial_slope_prior)
        initial_loc = tf.stack(
            [prior.mean() for prior in state_priors], axis=-1)
        initial_scale_diag = tf.stack(
            [prior.stddev() for prior in state_priors], axis=-1)
        self._initial_state_prior = tfd.MultivariateNormalDiag(
            loc=initial_loc, scale_diag=initial_scale_diag)

        scaled_softplus = tfb.Chain(
            [tfb.Scale(scale=observed_stddev), tfb.Softplus()])
        trend_parameters = [
            Parameter('level_scale', level_scale_prior, scaled_softplus),
            Parameter('slope_scale', slope_scale_prior, scaled_softplus),
        ]
        super(LocalLinearTrend, self).__init__(
            parameters=trend_parameters,
            latent_size=2,
            init_parameters=init_parameters,
            name=name)
def active(datadir, prop, tsne_train, tsne_val, tsne_test, ytrain_dft,
           yval_dft, ytest_dft, maxiters, amp, length_scale, rate):
    """
    adam.active(datadir, prop, tsne_train, tsne_val, tsne_test,
                ytrain_dft, yval_dft, ytest_dft, maxiters, amp,
                length_scale, rate)

    A Gaussian Process (GP) with a Matern One Half kernel.

    When `maxiters > 0`, the kernel hyperparameters are optimised with Adam
    by minimising the negative log-likelihood of the training targets. After
    every step the GP is evaluated on the validation set, and the
    hyperparameters of the step with the lowest validation MAE are used to
    build the kernel that predicts the test set. When `maxiters <= 0`, the
    given `amp` and `length_scale` are used directly (cast to float64).

    Side effects: prints/logs progress and writes `.npy` result files into
    `datadir`.

    Inputs:
        datadir- Directory into which results are written into.
        prop- Optical property of interest (not used inside this function).
        tsne_train- Latent points for the training set.
        tsne_val- Latent points for the validation set.
        tsne_test- Latent points for the test set.
        ytrain_dft- DFT-calculated set for training.
        yval_dft- DFT-calculated data for validation.
        ytest_dft- DFT-calculated data for testing.
        maxiters- Number of iterations for optimising hyperparameters.
        amp- Maximum value of the kernel.
        length_scale- The width of the kernel.
        rate- Learning rate for Adam optimisation.

    Outputs (a 13-tuple):
        1- Loss at every optimisation step (None if maxiters <= 0).
        2- Kernel amplitude at every step (None if maxiters <= 0).
        3- Kernel length scale at every step (None if maxiters <= 0).
        4- Kernel amplitude used for the test prediction.
        5- Kernel length scale used for the test prediction.
        6- GP prediction (posterior mean) on the test set.
        7- Standard deviation of the GP prediction.
        8- Variance of the GP prediction.
        9- Lowest validation MAE (None if maxiters <= 0).
        10- MAE on the test set.
        11- MSE on the test set.
        12- Standard deviation of the absolute error on the test set.
        13- Pearson correlation coefficient between the DFT-calculated
            and GP-predicted optical property.
    """
    # convert_index_points returns (points, feature_ndims, ...); call it once
    # per data set instead of twice for the training set.
    train_converted = convert_index_points(tsne_train)
    latent_train = train_converted[0]
    feature_ndims = train_converted[1]
    latent_val = convert_index_points(tsne_val)[0]
    latent_test = convert_index_points(tsne_test)[0]

    # Define the DFT-calculated values
    ytrain_dft = tf.constant(ytrain_dft, dtype=tf.float64)
    yval_dft = tf.constant(yval_dft, dtype=tf.float64)
    ytest_dft = tf.constant(ytest_dft, dtype=tf.float64)

    if maxiters <= 0:
        amp = tf.cast(amp, tf.float64)
        length_scale = tf.cast(length_scale, tf.float64)
        print("Prior on the amplitude of the kernel = %.4f" % amp.numpy())
        print("Prior on the width of the kernel = %.4f"
              % length_scale.numpy())
        print("No bijector is applied to the priors ...")

        # Build the optimised kernel using the input hyperparameters
        Optkernel = tfk.MaternOneHalf(amp, length_scale,
                                      feature_ndims=feature_ndims)
        gprm_dft = tfd.GaussianProcessRegressionModel(
            kernel=Optkernel,
            index_points=latent_test,
            observation_index_points=latent_train,
            observations=ytrain_dft)
    else:
        print(
            "Requested optimisation with Adam algorithm at learning rate %s"
            % rate)
        print("Number of iterations = %s" % maxiters)
        print("Prior on the amplitude of the kernel = %s" % amp)
        print("Prior on the width of the kernel = %s" % length_scale)

        optimizer = tf.optimizers.Adam(learning_rate=rate)

        # Create trainable variables and apply a positive constraint
        amp = tfp.util.TransformedVariable(initial_value=amp,
                                           bijector=tfb.Exp(),
                                           name="amp",
                                           dtype=tf.float64)
        length_scale = tfp.util.TransformedVariable(
            initial_value=length_scale,
            bijector=tfb.Exp(),
            name="length_scale",
            dtype=tf.float64)

        def trainables():
            return [
                var.trainable_variables[0] for var in [amp, length_scale]
            ]

        logging.info(
            "Training GP on the training set to minimise MAE on the validation set ..."
        )

        @tf.function
        def loss_fn():
            kernel = tfk.MaternOneHalf(amp, length_scale,
                                       feature_ndims=feature_ndims)
            gp = tfd.GaussianProcess(kernel=kernel,
                                     index_points=latent_train)
            return -gp.log_prob(ytrain_dft)

        OptLoss = np.array([])
        OptAmp = np.array([])
        OptLength = np.array([])
        Optmae_val = np.array([])
        Optmse_val = np.array([])
        Optsae_val = np.array([])

        for i in tf.range(maxiters):
            with tf.GradientTape() as tape:
                loss = loss_fn()
            grads = tape.gradient(loss, trainables())
            optimizer.apply_gradients(zip(grads, trainables()))

            OptLoss = np.append(OptLoss, loss.numpy())
            OptAmp = np.append(OptAmp, amp._value().numpy())
            OptLength = np.append(OptLength,
                                  length_scale._value().numpy())

            gprm = tfd.GaussianProcessRegressionModel(
                kernel=tfk.MaternOneHalf(OptAmp[i], OptLength[i],
                                         feature_ndims=feature_ndims),
                index_points=latent_val,
                observation_index_points=latent_train,
                observations=ytrain_dft)

            # Evaluate the posterior mean once per step; every call to
            # gprm.mean() would otherwise redo the GP solve.
            val_mean = gprm.mean().numpy()
            Optmae_val = np.append(Optmae_val,
                                   tf.losses.MAE(yval_dft, val_mean))
            Optmse_val = np.append(Optmse_val,
                                   tf.losses.MSE(yval_dft, val_mean))
            Optsae_val = np.append(
                Optsae_val, np.std(np.abs(val_mean - yval_dft.numpy())))

            if i % 10 == 0 or i + 1 == maxiters:
                # The stddev is only needed for the progress report.
                val_std = gprm.stddev().numpy()
                print(
                    "At step %d: loss=%.4f, amplitude=%.4f, length_scale=%.4f, mae=%.4f, mse=%.4f, sae=%.4f, min(std) = %.4f, max(std) = %.4f"
                    % (i, OptLoss[i], OptAmp[i], OptLength[i],
                       Optmae_val[i], Optmse_val[i], Optsae_val[i],
                       min(val_std), max(val_std)))

        # Step with the lowest validation MAE selects the hyperparameters.
        best = np.argmin(Optmae_val)
        logging.info("Best-fitted parameters:")
        print(" amplitude: %.4f" % OptAmp[best])
        print(" length_scale: %.4f" % OptLength[best])

        logging.info(
            "Building optimised kernel using the optimised hyperparameters ..."
        )
        logging.info("GP predicting the test set ...")
        Optkernel = tfk.MaternOneHalf(OptAmp[best], OptLength[best],
                                      feature_ndims=feature_ndims)
        gprm_dft = tfd.GaussianProcessRegressionModel(
            kernel=Optkernel,
            index_points=latent_test,
            observation_index_points=latent_train,
            observations=ytrain_dft)

    # Evaluate the test-set posterior once and reuse it for the metrics, the
    # saved files and the return value.
    test_mean = gprm_dft.mean().numpy()
    test_std = gprm_dft.stddev().numpy()
    test_var = gprm_dft.variance().numpy()
    ytest_np = ytest_dft.numpy()

    # Compute the Pearson correlation coefficient, MAE, MSE and
    # standard deviation on the absolute error (SAE) on the test set
    mae_test = tf.losses.MAE(ytest_np, test_mean)
    mse_test = tf.losses.MSE(ytest_np, test_mean)
    sae_test = tf.math.reduce_std(tf.abs(test_mean - ytest_np))
    R, _ = pearsonr(x=ytest_np, y=test_mean)  # p-value is not reported

    print(
        "Prediction: mae = %.4f, mse = %.4f, sae = %.4f, min(std) = %.4f, max(std) = %.4f, R = %.4f"
        % (mae_test, mse_test, sae_test, min(test_std), max(test_std), R))

    logging.info("Writing results to file ...")
    if maxiters > 0:
        np.save("%s/OptLoss.npy" % datadir, OptLoss)
        np.save("%s/OptAmp.npy" % datadir, OptAmp)
        np.save("%s/OptLength.npy" % datadir, OptLength)
        np.save("%s/Optmae_val.npy" % datadir, Optmae_val)
        np.save("%s/Optmse_val.npy" % datadir, Optmse_val)
        np.save("%s/Optsae_val.npy" % datadir, Optsae_val)

    np.save("%s/ytrain.npy" % datadir, ytrain_dft.numpy())
    np.save("%s/yval.npy" % datadir, yval_dft.numpy())
    np.save("%s/ytest.npy" % datadir, ytest_np)
    np.save("%s/gp_mean.npy" % datadir, test_mean)
    np.save("%s/gp_stddev.npy" % datadir, test_std)
    np.save("%s/gp_variance.npy" % datadir, test_var)

    # Lets predict the test DFT values and estimate the
    # uncertainties on the prediction. Since a log-loss
    # was minimised, the variance is a better measure of
    # the uncertainty. For more information, go to
    # https://www.kdnuggets.com/2018/10/introduction-active-learning.html
    if maxiters <= 0:
        return (None, None, None, amp, length_scale, test_mean, test_std,
                test_var, None, mae_test, mse_test, sae_test, R)
    return (OptLoss, OptAmp, OptLength, OptAmp[best], OptLength[best],
            test_mean, test_std, test_var, min(Optmae_val), mae_test,
            mse_test, sae_test, R)
def soft_threshold(x, threshold, name=None):
    """Soft Thresholding operator.

    This operator is defined by the equations

    ```none
                                  { x[i] - gamma,  x[i] >  gamma
    SoftThreshold(x, gamma)[i] =  { 0,             -gamma <= x[i] <= gamma
                                  { x[i] + gamma,  x[i] < -gamma
    ```

    In the context of proximal gradient methods, we have

    ```none
    SoftThreshold(x, gamma) = prox_{gamma L1}(x)
    ```

    where `prox` is the proximity operator. Thus the soft thresholding
    operator is used in proximal gradient descent for optimizing a smooth
    function with (non-smooth) L1 regularization, as outlined below.

    The proximity operator is defined as:

    ```none
    prox_r(x) = argmin{ r(z) + 0.5 ||x - z||_2**2 : z },
    ```

    where `r` is a (weakly) convex function, not necessarily differentiable.
    Because the L2 norm is strictly convex, the above argmin is unique.

    One important application of the proximity operator is as follows. Let
    `L` be a convex and differentiable function with Lipschitz-continuous
    gradient. Let `R` be a convex lower semicontinuous function which is
    possibly nondifferentiable. Let `gamma` be an arbitrary positive real.
    Then

    ```none
    x_star = argmin{ L(x) + R(x) : x }
    ```

    if and only if the fixed-point equation is satisfied:

    ```none
    x_star = prox_{gamma R}(x_star - gamma grad L(x_star))
    ```

    Proximal gradient descent thus typically consists of choosing an initial
    value `x^{(0)}` and repeatedly applying the update

    ```none
    x^{(k+1)} = prox_{gamma^{(k)} R}(x^{(k)} - gamma^{(k)} grad L(x^{(k)}))
    ```

    where `gamma` is allowed to vary from iteration to iteration.
    Specializing to the case where `R(x) = ||x||_1`, we minimize
    `L(x) + ||x||_1` by repeatedly applying the update

    ```none
    x^{(k+1)} = SoftThreshold(x^{(k)} - gamma grad L(x^{(k)}), gamma)
    ```

    (This idea can also be extended to second-order approximations, although
    the multivariate case does not have a known closed form like above.)

    Args:
      x: `float` `Tensor` representing the input to the SoftThreshold
        function.
      threshold: nonnegative scalar, `float` `Tensor` representing the radius
        of the interval on which each coordinate of SoftThreshold takes the
        value zero. Denoted `gamma` above.
      name: Python string indicating the name of the TensorFlow operation.
        Default value: `'soft_threshold'`.

    Returns:
      softthreshold: `float` `Tensor` with the same shape and dtype as `x`,
        representing the value of the SoftThreshold function.

    #### References

    [1]: Yu, Yao-Liang. The Proximity Operator.
         https://www.cs.cmu.edu/~suvrit/teach/yaoliang_proximity.pdf

    [2]: Wikipedia Contributors. Proximal gradient methods for learning.
         _Wikipedia, The Free Encyclopedia_, 2018.
         https://en.wikipedia.org/wiki/Proximal_gradient_methods_for_learning
    """
    # https://math.stackexchange.com/questions/471339/derivation-of-soft-thresholding-operator
    with tf.name_scope(name or 'soft_threshold'):
        x = tf.convert_to_tensor(x, name='x')
        threshold = tf.convert_to_tensor(threshold,
                                         dtype=x.dtype,
                                         name='threshold')
        # sign(x) * max(|x| - gamma, 0): shrink every coordinate towards zero
        # by gamma and clamp the ones inside [-gamma, gamma] to exactly zero.
        return tf.sign(x) * tf.maximum(tf.abs(x) - threshold, 0.)
def __call__(self, x):
    """Return `self.l1` times the sum of the absolute values of `x`."""
    weight = self.l1
    abs_total = tf.reduce_sum(tf.abs(x))
    return weight * abs_total
def call(self, inputs):
    """Propagate first and second moments through the dense layer.

    If none of `inputs`, `self.kernel` and `self.bias` is a
    `random_variable.RandomVariable`, the parent class's `call` is used
    unchanged. Otherwise the mean and covariance of the pre-activation are
    computed from the moments returned by `get_moments`, and, when the
    activation is ReLU, transformed into the moments after the activation.

    Args:
      inputs: layer input; either a plain value accepted by the parent
        `call` or an object `get_moments` can extract moments from.

    Returns:
      The parent `call` result in the non-random case; otherwise
      `generated_random_variables.MultivariateNormalFullCovariance(mean,
      covariance)`.

    Raises:
      NotImplementedError: if the activation is not ReLU, linear or `None`.
    """
    # Nothing random involved: fall back to the ordinary dense computation.
    if (not isinstance(inputs, random_variable.RandomVariable) and
            not isinstance(self.kernel, random_variable.RandomVariable) and
            not isinstance(self.bias, random_variable.RandomVariable)):
        return super(DenseDVI, self).call(inputs)
    self.call_weights()
    inputs_mean, inputs_variance, inputs_covariance = get_moments(inputs)
    kernel_mean, kernel_variance, _ = get_moments(self.kernel)
    if self.use_bias:
        bias_mean, _, bias_covariance = get_moments(self.bias)

    # E[outputs] = E[inputs] * E[kernel] + E[bias]
    mean = tf.tensordot(inputs_mean, kernel_mean, [[-1], [0]])
    if self.use_bias:
        mean = tf.nn.bias_add(mean, bias_mean)

    # Cov = E[inputs**2] Cov(kernel) + E[W]^T Cov(inputs) E[W] + Cov(bias)
    # For first term, assume Cov(kernel) = 0 on off-diagonals so we only
    # compute diagonal term.
    covariance_diag = tf.tensordot(inputs_variance + inputs_mean**2,
                                   kernel_variance, [[-1], [0]])
    # Compute quadratic form E[W]^T Cov E[W] from right-to-left. First is
    # [..., features, features], [features, units] -> [..., features, units].
    cov_w = tf.tensordot(inputs_covariance, kernel_mean, [[-1], [0]])
    # Next is [..., features, units], [features, units] -> [..., units, units].
    w_cov_w = tf.tensordot(cov_w, kernel_mean, [[-2], [0]])
    covariance = w_cov_w
    if self.use_bias:
        covariance += bias_covariance
    # Add the diagonal-only first term onto the diagonal of the full matrix.
    covariance = tf.linalg.set_diag(
        covariance, tf.linalg.diag_part(covariance) + covariance_diag)

    if self.activation in (tf.keras.activations.relu, tf.nn.relu):
        # Compute activation's moments with variable names from Wu et al. (2018).
        variance = tf.linalg.diag_part(covariance)
        scale = tf.sqrt(variance)
        # epsilon keeps the division finite when the variance is zero.
        mu = mean / (scale + tf.keras.backend.epsilon())
        mean = scale * soft_relu(mu)

        pairwise_variances = (tf.expand_dims(variance, -1) *
                              tf.expand_dims(variance, -2)
                              )  # [..., units, units]
        rho = covariance / tf.sqrt(pairwise_variances +
                                   tf.keras.backend.epsilon())
        # Clip strictly inside (-1, 1) so sqrt(1 - rho**2) below stays real
        # and nonzero.
        rho = tf.clip_by_value(rho,
                               -1. / (1. + tf.keras.backend.epsilon()),
                               1. / (1. + tf.keras.backend.epsilon()))
        s = covariance / (rho + tf.keras.backend.epsilon())
        mu1 = tf.expand_dims(mu, -1)  # [..., units, 1]
        mu2 = tf.linalg.matrix_transpose(mu1)  # [..., 1, units]
        a = (soft_relu(mu1) * soft_relu(mu2) +
             rho * tfp.distributions.Normal(0., 1.).cdf(mu1) *
             tfp.distributions.Normal(0., 1.).cdf(mu2))
        gh = tf.asinh(rho)
        bar_rho = tf.sqrt(1. - rho**2)
        gr = gh + rho / (1. + bar_rho)
        # Include numerically stable versions of gr and rho when multiplying or
        # dividing them. The sign of gr*rho and rho/gr is always positive.
        safe_gr = tf.abs(gr) + 0.5 * tf.keras.backend.epsilon()
        safe_rho = tf.abs(rho) + tf.keras.backend.epsilon()
        exp_negative_q = gr / (
            2. * math.pi) * tf.exp(-safe_rho /
                                   (2. * safe_gr * (1 + bar_rho)) +
                                   (gh - rho) /
                                   (safe_gr * safe_rho) * mu1 * mu2)
        covariance = s * (a + exp_negative_q)
    elif self.activation not in (tf.keras.activations.linear, None):
        raise NotImplementedError(
            'Activation is {}. Deterministic variational '
            'inference is only available if activation is '
            'ReLU or None.'.format(self.activation))

    return generated_random_variables.MultivariateNormalFullCovariance(
        mean, covariance)
def lp_distance(x, y, p=1):
    """Mean l_p distance between `x` and `y`.

    The l_p norm of `x - y` is taken along the last axis, and the result is
    then averaged along the (new) last axis.
    """
    powered_gaps = tf.math.pow(tf.abs(x - y), p)
    norms = tf.math.pow(tf.reduce_sum(powered_gaps, axis=-1), 1. / p)
    return tf.reduce_mean(norms, axis=-1)
def _forward_log_det_jacobian(self, unused_x):
    """Sum of log|diagonal of `self.lower_upper`| over the last axis.

    The input is ignored: the value depends only on `self.lower_upper`.
    """
    diagonal_magnitudes = tf.abs(tf.linalg.diag_part(self.lower_upper))
    log_magnitudes = tf.math.log(diagonal_magnitudes)
    return tf.reduce_sum(log_magnitudes, axis=-1)
def compute_mag(audio, size=6144, overlap=0.75, pad_end=True):
    """Absolute value of the STFT of `audio`, converted with `tf_float32`.

    `size` is forwarded to `stft` as `frame_size`; `overlap` and `pad_end`
    are forwarded unchanged.
    """
    spectrum = stft(audio, frame_size=size, overlap=overlap, pad_end=pad_end)
    magnitude = tf.abs(spectrum)
    return tf_float32(magnitude)
def find_root_chandrupatla(objective_fn,
                           low,
                           high,
                           position_tolerance=1e-8,
                           value_tolerance=0.,
                           max_iterations=50,
                           stopping_policy_fn=tf.reduce_all,
                           validate_args=False,
                           name='find_root_chandrupatla'):
    r"""Finds root(s) of a scalar function using Chandrupatla's method.

    Chandrupatla's method [1, 2] is a root-finding algorithm that is guaranteed
    to converge if a root lies within the given bounds. It generalizes the
    [bisection method](https://en.wikipedia.org/wiki/Bisection_method); at each
    step it chooses to perform either bisection or inverse quadratic
    interpolation. This makes it similar in spirit to [Brent's method](
    https://en.wikipedia.org/wiki/Brent%27s_method), which also considers steps
    that use the secant method, but Chandrupatla's method is simpler and often
    converges at least as quickly [3].

    Args:
      objective_fn: Python callable for which roots are searched. It must be a
        callable of a single variable. `objective_fn` must return a `Tensor`
        with shape `batch_shape` and dtype matching `low` and `high`.
      low: Float `Tensor` of shape `batch_shape` representing a lower
        bound(s) on the value of a root(s).
      high: Float `Tensor` of shape `batch_shape` representing an upper
        bound(s) on the value of a root(s).
      position_tolerance: Optional `Tensor` representing the maximum absolute
        error in the positions of the estimated roots. Shape must broadcast
        with `batch_shape`.
        Default value: `1e-8`.
      value_tolerance: Optional `Tensor` representing the absolute error
        allowed in the value of the objective function. If the absolute value
        of `objective_fn` is less than or equal to `value_tolerance` at a
        given position, then that position is considered a root for the
        function. Shape must broadcast with `batch_shape`.
        Default value: `0.`.
      max_iterations: Optional `Tensor` or Python integer specifying the
        maximum number of steps to perform. Shape must broadcast with
        `batch_shape`.
        Default value: `50`.
      stopping_policy_fn: Python `callable` controlling the algorithm
        termination. It must be a callable accepting a `Tensor` of booleans
        with the same shape as `low` and `high` (denoting whether each search
        is finished), and returning a scalar boolean `Tensor` indicating
        whether the overall search should stop. Typical values are
        `tf.reduce_all` (which returns only when the search is finished for
        all points), and `tf.reduce_any` (which returns as soon as the search
        is finished for any point).
        Default value: `tf.reduce_all` (returns only when the search is
        finished for all points).
      validate_args: Python `bool` indicating whether to validate arguments.
        When `True`, asserts that the objective has different signs at `low`
        and `high`.
        Default value: `False`.
      name: Python `str` name prefixed to ops created by this function.
        Default value: 'find_root_chandrupatla'.

    Returns:
      root_search_results: A Python `namedtuple` containing the following
        items:
        estimated_root: `Tensor` containing the endpoint of the final
          bracketing interval at which the objective has the smaller
          absolute value. If the search was successful within the specified
          tolerance, this position is a root of the objective function.
        objective_at_estimated_root: `Tensor` containing the value of the
          objective function at `estimated_root`. If the search was
          successful within the specified tolerance, then this is close
          to 0.
        num_iterations: The number of iterations performed.

    #### References

    [1] Tirupathi R. Chandrupatla. A new hybrid quadratic/bisection algorithm
        for finding the zero of a nonlinear function without using
        derivatives. _Advances in Engineering Software_, 28.3:145-149, 1997.
    [2] Philipp OJ Scherer. Computational Physics. _Springer Berlin_,
        Heidelberg, 2010.
        Section 6.1.7.3 https://books.google.com/books?id=cC-8BAAAQBAJ&pg=PA95
    [3] Jason Sachs. Ten Little Algorithms, Part 5: Quadratic Extremum
        Interpolation and Chandrupatla's Method (2015).
        https://www.embeddedrelated.com/showarticle/855.php
    """

    ################################################
    # Loop variables used by Chandrupatla's method:
    #
    # a: endpoint of an interval `[min(a, b), max(a, b)]` containing the
    #    root. There is no guarantee as to which of `a` and `b` is larger.
    # b: endpoint of an interval `[min(a, b), max(a, b)]` containing the
    #    root. There is no guarantee as to which of `a` and `b` is larger.
    # f_a: value of the objective at `a`.
    # f_b: value of the objective at `b`.
    # t: the next position to be evaluated as the coefficient of a convex
    #    combination of `a` and `b` (i.e., a value in the unit interval).
    # num_iterations: integer number of steps taken so far.
    # converged: boolean indicating whether each batch element has converged.
    #
    # All variables have the same shape `batch_shape`.
    #
    # Both closures below read `max_iterations`, which is rebound to a Tensor
    # inside the name scope before the loop is built.

    def _should_continue(a, b, f_a, f_b, t, num_iterations, converged):
        del a, b, f_a, f_b, t  # Unused.
        # An element is finished once it converged or ran out of iterations;
        # the policy reduces the per-element flags to a single stop decision.
        all_converged = stopping_policy_fn(
            tf.logical_or(converged, num_iterations >= max_iterations))
        return ~all_converged

    def _body(a, b, f_a, f_b, t, num_iterations, converged):
        """One step of Chandrupatla's method for root finding."""
        previous_loop_vars = (a, b, f_a, f_b, t, num_iterations, converged)
        finalized_elements = tf.logical_or(converged,
                                           num_iterations >= max_iterations)

        # Evaluate the new point.
        x_new = (1 - t) * a + t * b
        f_new = objective_fn(x_new)
        # Tighten the bounds.
        a, b, c, f_a, f_b, f_c = _structure_broadcasting_where(
            tf.equal(tf.math.sign(f_new), tf.math.sign(f_a)),
            (x_new, b, a, f_new, f_b, f_a),
            (x_new, a, b, f_new, f_a, f_b))

        # Check for convergence.
        f_best = tf.where(tf.abs(f_a) < tf.abs(f_b), f_a, f_b)
        interval_tolerance = position_tolerance / (tf.abs(b - c))
        converged = tf.logical_or(interval_tolerance > 0.5,
                                  tf.math.abs(f_best) <= value_tolerance)

        # Propose next point to evaluate.
        xi = (a - b) / (c - b)
        phi = (f_a - f_b) / (f_c - f_b)
        t = tf.where(
            # Condition for inverse quadratic interpolation.
            tf.logical_and(1 - tf.math.sqrt(1 - xi) < phi,
                           tf.math.sqrt(xi) > phi),
            # Propose a point by inverse quadratic interpolation.
            (f_a / (f_b - f_a) * f_c / (f_b - f_c) +
             (c - a) / (b - a) * f_a / (f_c - f_a) * f_b / (f_c - f_b)),
            # Otherwise, just cut the interval in half (bisection).
            0.5)
        # Constrain the proposal to the current interval (0 < t < 1).
        t = tf.minimum(tf.maximum(t, interval_tolerance),
                       1 - interval_tolerance)

        # Update elements that haven't converged.
        return _structure_broadcasting_where(
            finalized_elements, previous_loop_vars,
            (a, b, f_a, f_b, t, num_iterations + 1, converged))

    with tf.name_scope(name):
        max_iterations = tf.convert_to_tensor(max_iterations,
                                              name='max_iterations',
                                              dtype_hint=tf.int32)
        a = tf.convert_to_tensor(low, name='lower_bound')
        b = tf.convert_to_tensor(high, name='upper_bound')
        f_a, f_b = objective_fn(a), objective_fn(b)
        batch_shape = ps.broadcast_shape(ps.shape(f_a), ps.shape(f_b))

        assertions = []
        if validate_args:
            assertions += [
                assert_util.assert_none_equal(
                    tf.math.sign(f_a),
                    tf.math.sign(f_b),
                    message='Bounds must be on different sides of a root.')
            ]

        with tf.control_dependencies(assertions):
            # Start with t = 0.5 (a bisection step), zero iterations taken and
            # nothing converged; everything is broadcast to `batch_shape`.
            initial_loop_vars = [
                a, b, f_a, f_b,
                tf.cast(0.5, dtype=f_a.dtype),
                tf.cast(0, dtype=max_iterations.dtype), False
            ]
            a, b, f_a, f_b, _, num_iterations, _ = tf.while_loop(
                _should_continue,
                _body,
                loop_vars=tf.nest.map_structure(
                    lambda x: tf.broadcast_to(x, batch_shape),
                    initial_loop_vars))

        # Report whichever endpoint has the smaller objective magnitude.
        x_best, f_best = _structure_broadcasting_where(
            tf.abs(f_a) < tf.abs(f_b), (a, f_a), (b, f_b))
    return RootSearchResults(estimated_root=x_best,
                             objective_at_estimated_root=f_best,
                             num_iterations=num_iterations)
def _my_mae(y_true, y_pred):
    """Mean absolute difference between `y_pred` and `y_true` on the last axis."""
    abs_error = tf.abs(y_pred - y_true)
    return keras.backend.mean(abs_error, axis=-1)
def _forward_log_det_jacobian(self, x):
    """Return the log of the absolute scale.

    `self.log_scale` is returned as-is when it is set; otherwise the log of
    `|self.scale|` is computed. When neither is set the result is `None`.
    The argument `x` is not used.
    """
    if self.log_scale is not None:
        return self.log_scale
    if self.scale is None:
        return None
    return tf.math.log(tf.abs(self.scale))
def __init__(self,
             num_seasons,
             num_steps_per_season=1,
             allow_drift=True,
             drift_scale_prior=None,
             initial_effect_prior=None,
             constrain_mean_effect_to_zero=True,
             observed_time_series=None,
             name=None):
    """Specify a seasonal effects model.

    Args:
      num_seasons: Scalar Python `int` number of seasons.
      num_steps_per_season: Python `int` number of steps in each
        season. This may be either a scalar (shape `[]`), in which case all
        seasons have the same length, or a NumPy array of shape
        `[num_seasons]`, in which seasons have different length, but remain
        constant around different cycles, or a NumPy array of shape
        `[num_cycles, num_seasons]`, in which num_steps_per_season for each
        season also varies in different cycle (e.g., a 4 years cycle with
        leap day).
        Default value: 1.
      allow_drift: optional Python `bool` specifying whether the seasonal
        effects can drift over time. Setting this to `False`
        removes the `drift_scale` parameter from the model. This is
        mathematically equivalent to
        `drift_scale_prior = tfd.Deterministic(0.)`, but removing drift
        directly is preferred because it avoids the use of a degenerate
        prior.
        Default value: `True`.
      drift_scale_prior: optional `tfd.Distribution` instance specifying a
        prior on the `drift_scale` parameter. If `None`, a heuristic default
        prior is constructed based on the provided `observed_time_series`.
        Default value: `None`.
      initial_effect_prior: optional `tfd.Distribution` instance specifying a
        normal prior on the initial effect of each season. This may be either
        a scalar `tfd.Normal` prior, in which case it applies independently
        to every season, or it may be multivariate normal (e.g.,
        `tfd.MultivariateNormalDiag`) with event shape `[num_seasons]`, in
        which case it specifies a joint prior across all seasons. If `None`,
        a heuristic default prior is constructed based on the provided
        `observed_time_series`.
        Default value: `None`.
      constrain_mean_effect_to_zero: if `True`, use a model parameterization
        that constrains the mean effect across all seasons to be zero. This
        constraint is generally helpful in identifying the contributions of
        different model components and can lead to more interpretable
        posterior decompositions. It may be undesirable if you plan to
        directly examine the latent space of the underlying state space
        model.
        Default value: `True`.
      observed_time_series: optional `float` `Tensor` of shape
        `batch_shape + [T, 1]` (omitting the trailing unit dimension is also
        supported when `T > 1`), specifying an observed time series.
        Any priors not explicitly set will be given default values according
        to the scale of the observed time series (or batch of time series).
        May optionally be an instance of `tfp.sts.MaskedTimeSeries`, which
        includes a mask `Tensor` to specify timesteps with missing
        observations.
        Default value: `None`.
      name: the name of this model component.
        Default value: 'Seasonal'.
    """

    with tf1.name_scope(name, 'Seasonal',
                        values=[observed_time_series]) as name:

        # Without a series, fall back to unit stddev and a zero initial value.
        _, observed_stddev, observed_initial = (
            sts_util.empirical_statistics(observed_time_series)
            if observed_time_series is not None else (0., 1., 0.))

        # Heuristic default priors. Overriding these may dramatically
        # change inference performance and results.
        if drift_scale_prior is None:
            drift_scale_prior = tfd.LogNormal(loc=tf.math.log(
                .01 * observed_stddev), scale=3.)
        if initial_effect_prior is None:
            initial_effect_prior = tfd.Normal(
                loc=observed_initial,
                scale=tf.abs(observed_initial) + observed_stddev)

        dtype = tf.debugging.assert_same_float_dtype(
            [drift_scale_prior, initial_effect_prior])

        # A scalar Normal prior is replicated across all seasons; any other
        # prior is used as given for the joint initial state.
        if isinstance(initial_effect_prior, tfd.Normal):
            initial_state_prior = tfd.MultivariateNormalDiag(
                loc=tf.stack([initial_effect_prior.mean()] * num_seasons,
                             axis=-1),
                scale_diag=tf.stack([initial_effect_prior.stddev()] *
                                    num_seasons,
                                    axis=-1))
        else:
            initial_state_prior = initial_effect_prior

        if constrain_mean_effect_to_zero:
            # Transform the prior to the residual parameterization used by
            # `ConstrainedSeasonalStateSpaceModel`, imposing a zero-sum constraint.
            # This doesn't change the marginal prior on individual effects, but
            # does introduce dependence between the effects.
            (effects_to_residuals,
             _) = build_effects_to_residuals_matrix(num_seasons, dtype=dtype)
            effects_to_residuals_linop = tf.linalg.LinearOperatorFullMatrix(
                effects_to_residuals
            )  # Use linop so that matmul broadcasts.
            initial_state_prior_loc = effects_to_residuals_linop.matvec(
                initial_state_prior.mean())
            initial_state_prior_scale_linop = effects_to_residuals_linop.matmul(
                initial_state_prior.scale)  # returns LinearOperator
            # Covariance of the transformed prior is S S^T, with S the
            # transformed scale operator computed above.
            initial_state_prior = tfd.MultivariateNormalFullCovariance(
                loc=initial_state_prior_loc,
                covariance_matrix=initial_state_prior_scale_linop.matmul(
                    initial_state_prior_scale_linop.to_dense(),
                    adjoint_arg=True))

        self._constrain_mean_effect_to_zero = constrain_mean_effect_to_zero
        self._initial_state_prior = initial_state_prior
        self._num_seasons = num_seasons
        self._num_steps_per_season = num_steps_per_season

        parameters = []
        if allow_drift:
            # NOTE(review): `tfb.AffineScalar` appears to be deprecated in later
            # TFP releases in favor of `tfb.Scale` — confirm against the TFP
            # version this file targets before changing it.
            parameters.append(
                Parameter(
                    'drift_scale', drift_scale_prior,
                    tfb.Chain([
                        tfb.AffineScalar(scale=observed_stddev),
                        tfb.Softplus()
                    ])))
        self._allow_drift = allow_drift

        # The zero-mean constraint removes one degree of freedom from the
        # latent state.
        super(Seasonal, self).__init__(
            parameters,
            latent_size=(num_seasons - 1 if
                         self.constrain_mean_effect_to_zero else num_seasons),
            name=name)
def sinc(x, threshold=1e-20):
    """Normalized sinc, sin(pi * x) / (pi * x), with its peak at zero.

    Entries whose magnitude is below `threshold` are replaced by `threshold`
    before the division so that the quotient stays finite at zero.
    """
    x = tf_float32(x)
    too_small = tf.abs(x) < threshold
    floor_value = threshold * tf.ones_like(x)
    angle = np.pi * tf.where(too_small, floor_value, x)
    return tf.sin(angle) / angle
def my_regularizer(weights):
    """Return the sum of the absolute values of `weights`."""
    magnitudes = tf.abs(weights)
    return tf.reduce_sum(magnitudes)