def updates(self, cost):
    """Build the update pairs for one adaptive-rate step on ``self.param``.

    The gradient of ``cost`` with respect to ``self.param`` and the value
    returned by ``hessian_diagonal`` are folded into three running
    statistics (gradient, squared gradient, absolute second-order term).
    A per-element step size and a new memory size ``tau`` are derived
    from those statistics.

    :param cost: expression differentiated with respect to ``self.param``.
    :returns: list of eight ``(target, new_value)`` pairs, in this order:
        ``g_avg``, ``v_avg``, ``h_avg``, ``tau``, ``last_grad``,
        ``last_grad2``, ``last_rate`` and ``param``.
    """
    first_deriv = T.grad(cost, self.param)
    # NOTE(review): judging by its name, this yields the diagonal of the
    # Hessian of ``cost`` -- the helper is not visible here, confirm.
    second_deriv = hessian_diagonal(cost, self.param, grad=first_deriv)

    # Memory constants: weight of the fresh sample vs. the stored statistic.
    sample_weight = 1.0 / self.tau
    history_weight = 1.0 - sample_weight

    def blend(stored, sample):
        # Weighted mix of the stored running statistic and the new sample.
        return history_weight * stored + sample_weight * sample

    def finite_or(candidate, fallback):
        # Element-wise: keep ``candidate`` unless it is inf/nan.
        return T.switch(T.isinf(candidate) | T.isnan(candidate),
                        fallback,
                        candidate)

    next_g_avg = blend(self.g_avg, first_deriv)
    next_v_avg = blend(self.v_avg, first_deriv**2)
    next_h_avg = blend(self.h_avg, T.abs_(second_deriv))

    g_avg_squared = next_g_avg**2

    # Step size; falls back to the fixed learning rate where the ratio is
    # not finite (e.g. a zero denominator).
    step = finite_or(g_avg_squared / (next_v_avg * next_h_avg),
                     self.learning_rate)

    # Memory size; left unchanged where the new value is not finite.
    next_tau = finite_or((1 - g_avg_squared / next_v_avg) * self.tau + 1,
                         self.tau)

    return [
        (self.g_avg, next_g_avg),
        (self.v_avg, next_v_avg),
        (self.h_avg, next_h_avg),
        (self.tau, next_tau),
        (self.last_grad, first_deriv),
        (self.last_grad2, second_deriv),
        (self.last_rate, step),
        (self.param, self.param - step * first_deriv),
    ]
def updates(self, cost):
    """Return the ``(target, new_value)`` pairs for one optimisation step.

    Running statistics of the gradient, the squared gradient and the
    absolute value of the ``hessian_diagonal`` result are refreshed with
    weights derived from ``self.tau``. From them an element-wise step size
    and the next ``tau`` are computed. Non-finite step sizes are replaced
    by ``self.learning_rate``; non-finite ``tau`` values keep the old
    ``self.tau``.

    :param cost: expression whose gradient w.r.t. ``self.param`` is taken.
    :returns: list of pairs updating ``g_avg``, ``v_avg``, ``h_avg``,
        ``tau``, ``last_grad``, ``last_grad2``, ``last_rate`` and ``param``
        (in that order).
    """
    d_cost = T.grad(cost, self.param)
    # presumably the Hessian diagonal of ``cost`` -- helper defined elsewhere
    d2_cost = hessian_diagonal(cost, self.param, grad=d_cost)

    # calculate memory constants
    w_new = 1.0 / self.tau
    w_old = 1.0 - w_new

    # refreshed running statistics
    g_bar = w_old * self.g_avg + w_new * d_cost          # gradient
    v_bar = w_old * self.v_avg + w_new * d_cost**2       # squared gradient
    h_bar = w_old * self.h_avg + w_new * T.abs_(d2_cost)  # |second-order term|

    # candidate step size, guarded against inf/nan
    raw_rate = (g_bar**2) / (v_bar * h_bar)
    rate_is_bad = T.isinf(raw_rate) | T.isnan(raw_rate)
    safe_rate = T.switch(rate_is_bad, self.learning_rate, raw_rate)

    # candidate memory size, guarded against nan/inf
    raw_tau = (1 - (g_bar**2) / v_bar) * self.tau + 1
    tau_is_bad = T.isnan(raw_tau) | T.isinf(raw_tau)
    safe_tau = T.switch(tau_is_bad, self.tau, raw_tau)

    stepped_param = self.param - safe_rate * d_cost

    return [(self.g_avg, g_bar),
            (self.v_avg, v_bar),
            (self.h_avg, h_bar),
            (self.tau, safe_tau),
            (self.last_grad, d_cost),
            (self.last_grad2, d2_cost),
            (self.last_rate, safe_rate),
            (self.param, stepped_param)]
def burn_in_updates(self, cost):
    """Build the update pairs that accumulate burn-in statistics.

    Unlike ``updates``, nothing is averaged here and ``self.param`` is not
    touched: the gradient, the absolute ``hessian_diagonal`` value and the
    squared gradient are simply added onto ``g_avg``, ``h_avg`` and
    ``v_avg``, and the counter ``N`` is incremented by one.

    :param cost: expression differentiated with respect to ``self.param``.
    :returns: list of four ``(target, new_value)`` pairs for ``g_avg``,
        ``h_avg``, ``v_avg`` and ``N`` (in that order).
    """
    grad = T.grad(cost, self.param)
    # NOTE(review): presumably the Hessian diagonal of ``cost`` -- the
    # helper is defined outside this block, confirm.
    grad2 = hessian_diagonal(cost, self.param, grad=grad)

    # Emitted once per call of this method. The parenthesised
    # single-argument form prints the same text on Python 2 and is valid
    # syntax on Python 3; the 1-tuple keeps ``%s`` safe for any operand.
    print('burn in updates for %s' % (self.param,))

    return [
        (self.g_avg, self.g_avg + grad),
        (self.h_avg, self.h_avg + T.abs_(grad2)),
        (self.v_avg, self.v_avg + grad**2),
        (self.N, self.N + 1),
    ]
def burn_in_updates(self, cost):
    """Return the accumulation updates used while burning in.

    Each returned pair adds the current sample onto a stored sum:
    the gradient onto ``g_avg``, the absolute ``hessian_diagonal`` result
    onto ``h_avg``, the squared gradient onto ``v_avg``, and ``1`` onto the
    counter ``N``. ``self.param`` itself is not updated.

    :param cost: expression whose gradient w.r.t. ``self.param`` is taken.
    :returns: list of ``(target, new_value)`` pairs for ``g_avg``,
        ``h_avg``, ``v_avg`` and ``N``.
    """
    grad = T.grad(cost, self.param)
    # presumably the diagonal of the Hessian -- helper not visible here
    grad2 = hessian_diagonal(cost, self.param, grad=grad)

    # Function-call form of print: identical output on Python 2 for a
    # single argument, and no longer a syntax error on Python 3.
    print('burn in updates for %s' % (self.param,))

    g_sum = self.g_avg + grad
    h_sum = self.h_avg + T.abs_(grad2)
    v_sum = self.v_avg + grad**2
    return [
        (self.g_avg, g_sum),
        (self.h_avg, h_sum),
        (self.v_avg, v_sum),
        (self.N, self.N + 1),
    ]