import numpy as np
import theano
import theano.tensor as T

# note: separateLR and grad_monitor are helper functions defined elsewhere in this module


def update_fun(param, grad, dataset, history, opt, learnParams, params):
    """ Compute the parameter update from the gradient:
        adaptive step sizes, learning rate, momentum etc. """
    epsilon = np.asarray(0.0, dtype=theano.config.floatX)

    # specification of the learning rate, (hyper)parameter specific
    globalLR1, globalLR2, momentParam1, momentParam2 = learnParams
    assert dataset in ['T1', 'T2']
    lr = globalLR1 if dataset == 'T1' else separateLR(params, param.name,
                                                      globalLR1, globalLR2)

    # update with plain SGD (no Adam)
    if opt is None:
        updates = []
        if params.trackGrads:
            updates, trackGrads = grad_monitor(param, grad, updates, params, opt)
            other = [grad]
        else:
            trackGrads = []
            other = [grad]
        up = -lr * grad

    # update with Adam
    else:
        up, updates, trackGrads, other = opt.up(param, grad, params, lr, dataset)

    # dictionary param -> grad (first time around)
    if params.useT2 and dataset == 'T1':
        history['grad'][param] = grad
        history['up'][param] = up

    # momentum
    if params.use_momentum:
        oldup = theano.shared(np.asarray(param.get_value() * 0., dtype='float32'),
                              broadcastable=param.broadcastable,
                              name='oldup_%s' % param.name)
        momentParam = momentParam1 if dataset == 'T1' else momentParam2
        up += momentParam * oldup
        updates += [(oldup, up)]

    # new parameter
    newparam = param + up

    # min value (assumption: all hyperparameters are >= 0)
    if dataset == 'T2':
        newparam = T.maximum(epsilon, newparam)

    updates += [(param, newparam)]
    adamGrad = [other]

    return updates, trackGrads, adamGrad
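# --- Illustrative usage (not part of the original module) --------------------
# A minimal, self-contained sketch of how update_fun can be exercised with
# plain SGD (opt=None). The names `_DemoParams` and `demo_sgd_steps`, the toy
# quadratic cost, and the chosen learning rates are assumptions for
# illustration only; the repo wires update_fun into its training functions
# elsewhere.
class _DemoParams(object):
    trackGrads = False
    useT2 = False
    use_momentum = False


def demo_sgd_steps(n_steps=5):
    W = theano.shared(np.ones((3,), dtype='float32'), name='W')
    cost = T.sum(W ** 2)
    grad = T.grad(cost, W)
    history = {'grad': {}, 'up': {}}
    # (globalLR1, globalLR2, momentParam1, momentParam2)
    learnParams = (np.float32(0.1), np.float32(0.01),
                   np.float32(0.9), np.float32(0.9))
    updates, _, _ = update_fun(W, grad, 'T1', history, opt=None,
                               learnParams=learnParams, params=_DemoParams())
    step = theano.function([], cost, updates=updates)
    # each call applies W <- W - lr * dcost/dW, so the cost should shrink
    return [float(step()) for _ in range(n_steps)]
# ------------------------------------------------------------------------------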
def up(self, param, grad, params, lr=1e-4, dataset='T1'):
    """ Adam update for a single parameter (method of the optimizer
        object passed to update_fun as `opt`). """
    zero = np.float32(0.)
    one = np.float32(1.)
    updates = []
    trackGrads = []
    other = []

    # initialize Adam shared variables (first and second moment estimates)
    m = theano.shared(np.float32(param.get_value()) * zero, name="m_%s" % param.name)
    v = theano.shared(np.float32(param.get_value()) * zero, name="v_%s" % param.name)

    # bias-correction factors and decayed beta1 for the current step self.i
    fix1 = one - self.b1 ** self.i
    fix2 = one - self.b2 ** self.i
    b1_t = self.b1 * self.lam ** (self.i - 1)
    lr_t = lr * (T.sqrt(fix2) / fix1)

    m_t = ((one - b1_t) * grad) + (b1_t * m)
    # m_t = ((one - self.b1) * grad) + (self.b1 * m)
    v_t = ((one - self.b2) * T.sqr(grad)) + (self.b2 * v)
    g_t = m_t / (T.sqrt(v_t) + self.e)
    p_t = -(lr_t * g_t)

    # update Adam shared variables
    updates.append((m, m_t))
    updates.append((v, v_t))

    # in case of gradient tracking
    if params.trackGrads:
        updates, trackGrads = grad_monitor(param, grad, updates, params,
                                           'adam', g_t, m, v, self.e)

    # if approximating gradC2 with Adam
    if params.avC2grad in ['adam', 'momentum']:
        other = g_t * (T.sqrt(fix2) / fix1)  # alt: -lr_t*g_t or m_t

    return p_t, updates, trackGrads, other
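# --- Illustrative Adam state container (not part of the original module) -----
# A minimal sketch of the state the `up` method above expects on `self`
# (b1, b2, e, lam, i). The class name `AdamSketch`, the default values, and
# keeping the step counter `i` as a plain float are assumptions; the repo's
# actual optimizer class may differ (e.g. it may hold `i` in a shared
# variable and increment it once per training step).
class AdamSketch(object):
    def __init__(self, b1=0.9, b2=0.999, e=1e-8, lam=1. - 1e-8):
        self.b1 = np.float32(b1)    # decay rate of the first-moment estimate m
        self.b2 = np.float32(b2)    # decay rate of the second-moment estimate v
        self.e = np.float32(e)      # numerical stabilizer in the denominator
        self.lam = np.float32(lam)  # per-step decay applied to b1
        self.i = np.float32(1.)     # timestep used in the bias-correction terms

    up = up  # reuse the module-level method defined above as a bound method
# ------------------------------------------------------------------------------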