from collections import OrderedDict

import numpy as np
import theano as th
import theano.tensor as T

# sloika_dtype: the project-wide floating point dtype, assumed to be imported or
# defined elsewhere in the module.


def adam(network, loss, rate, decay, epsilon=1e-8, clip=5.0, mrate=0.0005):
    """ADAMski optimiser

    Similar to the ADAM optimiser but with momentum phased in gradually from 0,
    as having lower momentum at the start of training seems to be beneficial.
    See: https://www.cs.toronto.edu/~hinton/absps/guideTR.pdf page 10

    :param network: network to optimise
    :param loss: loss function to optimise over
    :param rate: rate (step size) for optimiser
    :param decay: pair of decays for the estimates of gradient and curvature
    :param epsilon: small parameter to prevent reciprocal of variance exploding
    :param clip: absolute value at which gradients are clipped
    :param mrate: rate at which momentum is increased. None = ADAM optimiser

    :returns: a dictionary containing update functions for Tensors
    """
    assert all(d > 0.0 for d in decay), "Decay must be positive"
    assert all(d < 1.0 for d in decay), "Decay must be less than one"
    assert mrate is None or mrate > 0.0, "Rate of momentum increase must be positive"

    if mrate is not None:
        _M_RATE = -np.float_(mrate).astype(sloika_dtype)
        _M_P = np.exp(_M_RATE)
        _M_K = (1.0 - decay[0]) * decay[0] * _M_P / (1.0 - _M_P * decay[0])
        _M_K = np.float_(_M_K).astype(sloika_dtype)
    else:
        _M_RATE = -np.float_(1e30).astype(sloika_dtype)
        _M_P = np.float_(0.0).astype(sloika_dtype)
        _M_K = np.float_(0.0).astype(sloika_dtype)

    params = network.params()
    updates = OrderedDict()
    gradients = th.grad(loss, params)

    ldecay = np.log(decay, dtype=sloika_dtype)
    t = th.shared(np.float32(0.0).astype(sloika_dtype))
    lr_t = th.shared(np.float32(0.0).astype(sloika_dtype))
    momentum_decay = th.shared(np.float32(0.0).astype(sloika_dtype))

    updates[t] = t + 1.0
    momentum_factor = _M_K * T.expm1(t * (ldecay[0] + _M_RATE)) - T.expm1(updates[t] * ldecay[0])
    updates[lr_t] = rate * T.sqrt(-T.expm1(updates[t] * ldecay[1])) / momentum_factor
    updates[momentum_decay] = -decay[0] * T.expm1(updates[t] * _M_RATE)

    for param, grad in zip(params, gradients):
        val = param.get_value(borrow=True)
        momentum = th.shared(np.zeros(val.shape, dtype=val.dtype))
        variance = th.shared(np.zeros(val.shape, dtype=val.dtype))

        grad_clip = T.clip(grad, -clip, clip)
        updates[momentum] = updates[momentum_decay] * momentum + (1.0 - decay[0]) * grad_clip
        updates[variance] = decay[1] * variance + (1.0 - decay[1]) * T.sqr(grad_clip)
        updates[param] = param - updates[lr_t] * updates[momentum] / (T.sqrt(updates[variance]) + epsilon)

    return updates
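# A minimal numpy sketch (not from the original code) of the momentum phase-in
# schedule implied by
#     updates[momentum_decay] = -decay[0] * T.expm1(updates[t] * _M_RATE)
# with _M_RATE = -mrate: the effective momentum starts near 0 and approaches
# decay[0] as the step counter grows.  The decay and mrate values below are
# illustrative only.
import numpy as np

decay0, mrate = 0.9, 0.0005
steps = np.array([1.0, 10.0, 100.0, 1000.0, 10000.0])
effective_momentum = -decay0 * np.expm1(-mrate * steps)   # decay0 * (1 - exp(-mrate * t))
print(effective_momentum)   # rises from ~0.00045 towards 0.9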
from theano import tensor


def elu(x):
    """Exponential Linear Unit :math:`\\varphi(x) = (x > 0) ? x : e^x - 1`

    The Exponential Linear Unit (ELU) was introduced in [1]_. Compared to the
    linear rectifier :func:`rectify`, it has a mean activation closer to zero
    and nonzero gradient for negative input, which can help convergence.
    Compared to the leaky rectifier :class:`LeakyRectify`, it saturates for
    highly negative inputs.

    Parameters
    ----------
    x : float32
        The activation (the summed, weighted input of a neuron).

    Returns
    -------
    float32
        The output of the exponential linear unit for the activation.

    Notes
    -----
    In [1]_, an additional parameter :math:`\\alpha` controls the (negative)
    saturation value for negative inputs, but is set to 1 for all experiments.
    It is omitted here.

    References
    ----------
    .. [1] Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter (2015):
       Fast and Accurate Deep Network Learning by Exponential Linear Units
       (ELUs), http://arxiv.org/abs/1511.07289
    """
    return tensor.switch(x > 0, x, tensor.expm1(x))
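# A hedged numpy illustration (not part of the library code above) of why the
# negative branch uses expm1(x) rather than exp(x) - 1: close to zero the naive
# form loses accuracy to cancellation, while expm1 does not.
import numpy as np

x = np.float32(-1e-7)
print(np.exp(x) - 1)   # ~ -1.19e-07 in float32: cancellation error
print(np.expm1(x))     # ~ -1.00e-07: accurate

def elu_np(x):
    # numpy mirror of the Theano expression above
    return np.where(x > 0, x, np.expm1(x))

print(elu_np(np.array([-2.0, -0.5, 0.0, 0.5, 2.0])))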
def __init__(self, incoming, sample_rate, frame_len, num_bands, min_freq,
             max_freq, trainable=True, **kwargs):
    super(MelBankLayer, self).__init__(incoming, **kwargs)

    # mel-spaced peak frequencies
    min_mel = 1127 * np.log1p(min_freq / 700.0)
    max_mel = 1127 * np.log1p(max_freq / 700.0)
    spacing = (max_mel - min_mel) / (num_bands + 1)
    spaces = np.ones(num_bands + 2) * spacing
    spaces[0] = min_mel
    spaces = theano.shared(lasagne.utils.floatX(spaces))  # learned param
    peaks_mel = spaces.cumsum()

    # create parameter as a vector of real-valued peak bins
    peaks_hz = 700 * T.expm1(peaks_mel / 1127)
    peaks_bin = peaks_hz * frame_len / sample_rate
    self.peaks = self.add_param(peaks_bin, shape=(num_bands + 2,),
                                name='peaks', trainable=trainable,
                                regularizable=False)

    # store what else is needed
    self.num_bands = num_bands
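# A worked example (not part of the layer; the parameter values are assumptions)
# of the mel-spaced peak computation above, done in plain numpy.  Note that the
# cumsum over `spaces` is simply an evenly spaced grid between min_mel and max_mel.
import numpy as np

sample_rate, frame_len = 22050, 1024
num_bands, min_freq, max_freq = 24, 27.5, 8000.0

min_mel = 1127 * np.log1p(min_freq / 700.0)
max_mel = 1127 * np.log1p(max_freq / 700.0)
peaks_mel = np.linspace(min_mel, max_mel, num_bands + 2)
peaks_hz = 700 * np.expm1(peaks_mel / 1127)     # invert the mel scale
peaks_bin = peaks_hz * frame_len / sample_rate  # convert Hz to FFT bin positions
print(peaks_bin[:5])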
from theano.tensor import switch, expm1


def selu(x):
    """Scaled exponential linear units as proposed in [1].

    [1] - https://arxiv.org/pdf/1706.02515.pdf
    """
    alpha = 1.6732632423543772848170429916717
    lam = 1.0507009873554804934193349852946
    return lam * switch(x >= 0.0, x, alpha * expm1(x))
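# A rough numpy check (an illustration, not from [1]) that the alpha and lam
# constants approximately preserve zero mean and unit variance for standard
# normal inputs, which is the self-normalising property they are derived for.
import numpy as np

alpha = 1.6732632423543772848170429916717
lam = 1.0507009873554804934193349852946

rng = np.random.default_rng(0)
x = rng.standard_normal(1_000_000)
y = lam * np.where(x >= 0.0, x, alpha * np.expm1(x))
print(y.mean(), y.var())   # both close to 0 and 1 respectively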
import theano.tensor as tt


def log1mexp(x):
    """Return log(1 - exp(-x)).

    This function is numerically more stable than the naive approach. For
    details, see
    https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf
    """
    return tt.switch(tt.lt(x, 0.683),
                     tt.log(-tt.expm1(-x)),
                     tt.log1p(-tt.exp(-x)))
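# A small numpy demonstration (added for illustration) of the instability this
# function avoids: for tiny positive x, 1 - exp(-x) rounds to 0 and its log is
# -inf, while log(-expm1(-x)) keeps full precision.
import numpy as np

x = 1e-20
print(np.log1p(-np.exp(-x)))   # -inf: exp(-x) rounds to exactly 1
print(np.log(-np.expm1(-x)))   # ~ -46.05, close to log(x) as expected for small x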
def log1mexp(x):
    r"""Return log(1 - exp(-x)).

    This function is numerically more stable than the naive approach. For
    details, see
    https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf

    References
    ----------
    .. [Machler2012] Martin Mächler (2012). "Accurately computing
       `\log(1-\exp(- \mid a \mid))` Assessed by the Rmpfr package"
    """
    return tt.switch(tt.lt(x, 0.6931471805599453),
                     tt.log(-tt.expm1(-x)),
                     tt.log1p(-tt.exp(-x)))
import theano.tensor as TT


def selu(x):
    # Scaled exponential linear unit; alpha and scale are the constants derived
    # in Klambauer et al. (2017), https://arxiv.org/abs/1706.02515.
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale * TT.switch(x > 0, x, alpha * TT.expm1(x))
def rmspe(y_true, y_pred):
    # Root mean squared percentage error; expm1 (the inverse of log1p) maps the
    # inputs back to the original scale before the relative error is taken.
    y_true = T.expm1(y_true)
    y_pred = T.expm1(y_pred)
    return T.sqrt(T.sqr((y_true - y_pred) / y_true).mean(axis=-1))
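# A hedged usage sketch (the numbers are made up): this metric assumes y_true
# and y_pred are log1p-transformed targets, which T.expm1 maps back to the
# original scale before the relative error is computed.
import numpy as np
import theano
import theano.tensor as T

y_true = T.matrix('y_true')
y_pred = T.matrix('y_pred')
rmspe_fn = theano.function([y_true, y_pred], rmspe(y_true, y_pred))

targets = np.log1p(np.array([[100.0, 250.0, 40.0]], dtype=theano.config.floatX))
preds = np.log1p(np.array([[110.0, 240.0, 38.0]], dtype=theano.config.floatX))
print(rmspe_fn(targets, preds))   # per-row root mean squared percentage error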
def blackbody_lambda(lam, temperature):
    """
    Compute the blackbody flux as a function of wavelength `lam` in mks units
    """
    # Planck's law: B_lambda(T) = 2 h c^2 / lam^5 / (exp(h c / (lam * k_B * T)) - 1);
    # `two`, `hc2`, `h`, `c` and `k_B` are module-level constants not shown here.
    return (two * hc2 / tt.pow(lam, 5) /
            tt.expm1(h * c / (lam * k_B * temperature)))
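# An illustrative numpy evaluation (not from the original module) of the Planck
# law the expression above implements,
#     B_lambda(T) = 2 h c^2 / lam^5 / (exp(h c / (lam * k_B * T)) - 1),
# using SI values for the constants; the module-level names `two` and `hc2` are
# assumed to hold 2.0 and h * c**2.
import numpy as np

h = 6.62607015e-34    # Planck constant, J s
c = 2.99792458e8      # speed of light, m / s
k_B = 1.380649e-23    # Boltzmann constant, J / K

def blackbody_lambda_np(lam, temperature):
    return 2 * h * c ** 2 / lam ** 5 / np.expm1(h * c / (lam * k_B * temperature))

lam = np.linspace(100e-9, 3000e-9, 2901)   # 100 nm to 3000 nm
peak = lam[np.argmax(blackbody_lambda_np(lam, 5772.0))]
print(peak)   # ~ 5.0e-7 m, consistent with Wien's law for a Sun-like temperature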
def inv_temp_cond_prob_func(self, inv_temp, delta):
    return tt.switch(tt.eq(delta, 0.),
                     tt.ones_like(delta),
                     -tt.exp(-inv_temp * delta) * delta / tt.expm1(-delta))
def inv_temp_cond_prob_0_1(self, delta):
    prob_0 = tt.switch(tt.eq(delta, 0.),
                       tt.ones_like(delta),
                       -delta / tt.expm1(-delta))
    prob_1 = tt.switch(tt.eq(delta, 0.),
                       tt.ones_like(delta),
                       delta / tt.expm1(delta))
    return prob_0, prob_1
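# A small numpy check (added for illustration) of the removable singularity the
# tt.switch above guards against: as delta -> 0, both -delta / expm1(-delta)
# and delta / expm1(delta) tend to 1, which is the value returned on the
# eq(delta, 0) branch.
import numpy as np

for delta in (1e-1, 1e-4, 1e-8):
    print(-delta / np.expm1(-delta), delta / np.expm1(delta))
# both columns approach 1.0 as delta shrinks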
def __call__(self, x):
    return self.scale * tensor.switch(x > 0.0, x, self.scale_neg * tensor.expm1(x))
def elu(x):
    """Exponential Linear Unit

    See https://arxiv.org/pdf/1511.07289.pdf
    """
    return T.switch(x > 0, x, T.expm1(x))