def adadelta(allparams, nat_stepsize, num_epochs, seq_len, num_seqs=None,
             rho=0.95, epsilon=1e-6, num_samples=1, permute=True):
    # data, num_datapoints, val_and_grad, split_into_batches, and callback
    # are assumed to come from the enclosing scope (a sketch of the
    # container helpers follows this listing)
    natparams, params = allparams[:1], allparams[1:]
    sum_gsq = zeros_like(params)  # accumulated sq. grads
    sum_usq = zeros_like(params)  # accumulated sq. updates
    accumulate = lambda a, b: add(scale(rho, a), scale(1 - rho, b))
    for epoch in xrange(num_epochs):
        vals = []
        batches, num_batches = split_into_batches(data, seq_len, num_seqs)
        for y in batches:
            val, grad = scale(
                1. / num_datapoints,
                val_and_grad(y, num_batches, num_samples, *allparams))
            natgrad, grad = grad[:1], grad[1:]
            sum_gsq = accumulate(sum_gsq, square(grad))
            diag_scaling = div(sqrt(add_scalar(epsilon, sum_usq)),
                               sqrt(add_scalar(epsilon, sum_gsq)))
            update = mul(diag_scaling, grad)
            sum_usq = accumulate(sum_usq, square(update))
            natparams = add(natparams, scale(nat_stepsize, natgrad))
            params = add(params, update)
            allparams = concat(natparams, params)
            vals.append(val)
        if callback: callback(epoch, vals, natgrad, allparams)
    return allparams
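# --- Hedged sketch (not from the source): the optimizers in this listing
# rely on container-level helpers (zeros_like, add, sub, scale, mul, div,
# square, sqrt, add_scalar, concat) that are never defined here. One
# plausible reading is that they map numpy operations over arbitrarily
# nested tuples of arrays:

import numpy as np

def _map(f, x):
    # recursively apply f to every leaf of a nested tuple structure
    return tuple(_map(f, xi) for xi in x) if isinstance(x, tuple) else f(x)

def _map2(f, x, y):
    # recursively apply a binary f to paired leaves of matching structures
    if isinstance(x, tuple):
        return tuple(_map2(f, xi, yi) for xi, yi in zip(x, y))
    return f(x, y)

zeros_like = lambda x: _map(np.zeros_like, x)
square     = lambda x: _map(np.square, x)
sqrt       = lambda x: _map(np.sqrt, x)
scale      = lambda c, x: _map(lambda a: c * a, x)
add_scalar = lambda c, x: _map(lambda a: c + a, x)
add        = lambda x, y: _map2(np.add, x, y)
sub        = lambda x, y: _map2(np.subtract, x, y)
mul        = lambda x, y: _map2(np.multiply, x, y)
div        = lambda x, y: _map2(np.divide, x, y)
concat     = lambda x, y: tuple(x) + tuple(y)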
def __init__(self, params, lr=0.0001, usage=0.7, decay=0.99, pullback=0.0):
    # the default value for pullback is missing in the source; 0.0 assumed
    self.param_list = params
    self.lr = lr
    self.usage = usage
    self.decay = decay
    self.pullback = pullback
    self.lagged = zeros_like(params)
def step(self):
    # DirectSum, ddot, and dnorm are assumed from the enclosing scope
    # (see the sketch after this method)
    grad = DirectSum([x.grad for x in self.param_list])
    # decayed overlap between the current and the lagged gradient
    lag_comp = self.decay * ddot(grad, self.lagged)
    # gradient step whose effective rate grows with that overlap
    step = grad * (self.lr + lag_comp / pow(dnorm(grad), 1.5))
    # subtract a pullback along the lagged gradient
    step -= self.lagged * self.pullback
    # shrink the lagged gradient along the overlap, then decay it
    self.lagged -= self.lagged * lag_comp / dnorm(grad)
    self.lagged *= self.decay
    # apply the step to each parameter in place
    for p, s in zip(self.param_list, step):
        p -= s
    # accumulate the current gradient into the lagged state
    self.lagged += grad
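# --- Hedged sketch (not from the source): DirectSum, ddot, and dnorm are
# undefined in the class above. One plausible reading is a direct sum of
# parameter blocks with blockwise arithmetic and the induced inner product:

import numpy as np

class DirectSum(list):
    # a list of arrays that broadcasts arithmetic blockwise
    def _zip(self, other, op):
        if isinstance(other, DirectSum):
            return DirectSum(op(a, b) for a, b in zip(self, other))
        return DirectSum(op(a, other) for a in self)  # scalar case
    def __add__(self, other): return self._zip(other, lambda a, b: a + b)
    def __sub__(self, other): return self._zip(other, lambda a, b: a - b)
    def __mul__(self, other): return self._zip(other, lambda a, b: a * b)
    def __truediv__(self, other): return self._zip(other, lambda a, b: a / b)
    __div__ = __truediv__  # Python 2 compatibility
    # override list's in-place operators so += / *= stay blockwise
    def __iadd__(self, other): return self + other
    def __isub__(self, other): return self - other
    def __imul__(self, other): return self * other
    __radd__ = __add__
    __rmul__ = __mul__

def ddot(x, y):
    # inner product on the direct sum: sum of blockwise dot products
    return sum(np.vdot(a, b) for a, b in zip(x, y))

def dnorm(x):
    # norm induced by ddot
    return np.sqrt(ddot(x, x))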
def adam(allparams, nat_stepsize, stepsize, num_epochs, seq_len,
         num_seqs=None, b1=0.9, b2=0.999, eps=1e-8, num_samples=1):
    # data, num_datapoints, val_and_grad, split_into_batches, and callback
    # are assumed to come from the enclosing scope
    natparams, params = allparams[:1], allparams[1:]
    m = zeros_like(params)
    v = zeros_like(params)
    i = 0
    accumulate = lambda rho, a, b: add(scale(1 - rho, a), scale(rho, b))
    for epoch in xrange(num_epochs):
        vals = []
        batches, num_batches = split_into_batches(data, seq_len, num_seqs)
        for y in batches:
            val, grad = scale(
                1. / num_datapoints,
                val_and_grad(y, num_batches, num_samples, *allparams))
            natgrad, grad = grad[:1], grad[1:]
            m = accumulate(b1, grad, m)              # first moment estimate
            v = accumulate(b2, square(grad), v)      # second moment estimate
            mhat = scale(1. / (1 - b1**(i + 1)), m)  # bias correction
            vhat = scale(1. / (1 - b2**(i + 1)), v)
            update = scale(stepsize, div(mhat, add_scalar(eps, sqrt(vhat))))
            natparams = add(natparams, scale(nat_stepsize, natgrad))
            params = add(params, update)
            allparams = concat(natparams, params)
            vals.append(val)
            i += 1
        if callback: callback(epoch, vals, natgrad, allparams)
    return allparams
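# --- Hedged sketch (not from the source): split_into_batches is undefined
# in the two epoch-based optimizers above. Consistent with how it is called,
# one plausible version chops the data into length-seq_len subsequences
# (optionally subsampling num_seqs of them) and reports the minibatch count:

import numpy as np

def split_into_batches(data, seq_len, num_seqs=None):
    starts = np.arange(0, data.shape[0] - seq_len + 1, seq_len)
    if num_seqs is not None:
        starts = np.random.permutation(starts)[:num_seqs]
    batches = [data[s:s + seq_len] for s in starts]
    return batches, len(batches)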
def adam(gradfun, allparams, num_iters, step_size, b1=0.9, b2=0.999, eps=1e-8):
    natparams, params = allparams[:1], allparams[1:]
    m = zeros_like(params)
    v = zeros_like(params)
    accumulate = lambda rho, a, b: add(scale(1 - rho, a), scale(rho, b))
    for i in xrange(num_iters):
        grad = gradfun(allparams, i)
        natgrad, grad = grad[:1], grad[1:]
        m = accumulate(b1, grad, m)              # first moment estimate
        v = accumulate(b2, square(grad), v)      # second moment estimate
        mhat = scale(1. / (1 - b1**(i + 1)), m)  # bias correction
        vhat = scale(1. / (1 - b2**(i + 1)), v)
        update = scale(step_size, div(mhat, add_scalar(eps, sqrt(vhat))))
        natparams = sub(natparams, scale(step_size, natgrad))
        params = sub(params, update)
        allparams = concat(natparams, params)
    return allparams
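# --- Usage sketch (not from the source): a self-contained, flat-numpy
# version of the same bias-corrected Adam update, minimizing a toy
# quadratic. It mirrors the update rule above without the natural-parameter
# split or the nested-container helpers.

import numpy as np

def adam_flat(gradfun, x, num_iters, step_size, b1=0.9, b2=0.999, eps=1e-8):
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    for i in range(num_iters):
        g = gradfun(x)
        m = b1 * m + (1 - b1) * g         # first moment estimate
        v = b2 * v + (1 - b2) * g**2      # second moment estimate
        mhat = m / (1 - b1**(i + 1))      # bias correction
        vhat = v / (1 - b2**(i + 1))
        x = x - step_size * mhat / (eps + np.sqrt(vhat))
    return x

# minimize f(x) = ||x - 3||^2, whose gradient is 2(x - 3)
x_opt = adam_flat(lambda x: 2 * (x - 3.), np.zeros(5), 2000, 0.1)
print(x_opt)  # should be close to [3. 3. 3. 3. 3.]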