def updates(self, cost):
    """Build vSGD-style updates: adaptive per-element rate from moving
    averages of the gradient, its square, and the |Hessian diagonal|."""
    grad = T.grad(cost, self.param)
    grad2 = hessian_diagonal(cost, self.param, grad=grad)
    # Memory constants for the exponential moving averages.
    decay = 1.0 / self.tau
    keep = 1.0 - decay
    # Moving averages of grad, grad**2 and |diag(H)|.
    g_avg_new = keep * self.g_avg + decay * grad
    v_avg_new = keep * self.v_avg + decay * grad ** 2
    h_avg_new = keep * self.h_avg + decay * T.abs_(grad2)
    rate_unsafe = (g_avg_new ** 2) / (v_avg_new * h_avg_new)
    # Fall back to the fixed learning rate where the estimate blew up.
    rate = T.switch(T.isinf(rate_unsafe) | T.isnan(rate_unsafe),
                    self.learning_rate, rate_unsafe)
    tau_unsafe = (1 - (g_avg_new ** 2) / v_avg_new) * self.tau + 1
    tau_new = T.switch(T.isnan(tau_unsafe) | T.isinf(tau_unsafe),
                       self.tau, tau_unsafe)
    return [(self.g_avg, g_avg_new),
            (self.v_avg, v_avg_new),
            (self.h_avg, h_avg_new),
            (self.tau, tau_new),
            (self.last_grad, grad),
            (self.last_grad2, grad2),
            (self.last_rate, rate),
            (self.param, self.param - rate * grad)]
def updates(self, cost):
    """Return the update list for one vSGD-style step on ``self.param``."""
    grad = T.grad(cost, self.param)
    grad2 = hessian_diagonal(cost, self.param, grad=grad)
    # calculate memory constants
    tau_rec = 1.0 / self.tau
    tau_inv_rec = 1.0 - tau_rec
    # Exponential moving averages of grad, grad**2, and |diag(H)|.
    g_avg_new = tau_inv_rec * self.g_avg + tau_rec * grad
    v_avg_new = tau_inv_rec * self.v_avg + tau_rec * grad ** 2
    h_avg_new = tau_inv_rec * self.h_avg + tau_rec * T.abs_(grad2)
    rate_unsafe = (g_avg_new ** 2) / (v_avg_new * h_avg_new)
    # Guard against nan/inf rates produced by near-zero denominators.
    rate = T.switch(T.isinf(rate_unsafe) | T.isnan(rate_unsafe),
                    self.learning_rate, rate_unsafe)
    tau_unsafe = (1 - (g_avg_new ** 2) / v_avg_new) * self.tau + 1
    tau_new = T.switch(T.isnan(tau_unsafe) | T.isinf(tau_unsafe),
                       self.tau, tau_unsafe)
    new_state = [
        (self.g_avg, g_avg_new),
        (self.v_avg, v_avg_new),
        (self.h_avg, h_avg_new),
        (self.tau, tau_new),
        (self.last_grad, grad),
        (self.last_grad2, grad2),
        (self.last_rate, rate),
        (self.param, self.param - rate * grad),
    ]
    return new_state
def from_partial(self, X, dX):
    """Project a partial derivative w.r.t. an (U, S, V) factorization onto
    the tangent space, zeroing nan/inf entries of dU/dV first."""
    eps = 1e-10
    U, S, V = X
    dU, dS, dV = dX
    # 1 where dU/dV holds a nan or inf entry, 0 elsewhere.
    umask = 1 - (1 - tensor.isnan(dU)) * (1 - tensor.isinf(dU))
    vmask = 1 - (1 - tensor.isnan(dV)) * (1 - tensor.isinf(dV))
    # Singular values touched by a bad column of U or a bad row of V.
    smask = 1 - tensor.prod(1 - umask, axis=0) * tensor.prod(1 - vmask, axis=1)
    S = tensor.diag(S)
    dU = tensor.set_subtensor(dU[umask.nonzero()], 0.0)
    # Pseudo-inverse of S: invert only entries with magnitude above eps.
    S_pinv = tensor.switch(tensor.gt(abs(S), eps), 1.0 / S, 0.0)
    S_pinv = tensor.set_subtensor(S_pinv[smask.nonzero()], 0.0)
    S_pinv = tensor.diag(S_pinv)
    dV = tensor.set_subtensor(dV[vmask.nonzero()], 0.0)
    ZV = dU.dot(S_pinv)
    UtZV = dS
    ZtU = S_pinv.dot(dV)
    return (ZV - U.dot(UtZV), UtZV, ZtU - (UtZV.dot(V)))
def get_nesterov_sgd_updates(param_list, gradients, velocities, lr, mu):
    """Do SGD updates with Nesterov momentum; skip steps that go non-finite."""
    updates = []
    for p, g, v in zip(param_list, gradients, velocities):
        new_v = mu * v - lr * g
        new_p = p - mu * v + (1 + mu) * new_v
        # Non-zero when either the new parameter or velocity has nan/inf.
        has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                          T.any(T.isnan(new_v) + T.isinf(new_v)))
        # Keep the old values if anything blew up.
        updates.extend([(p, ifelse(has_non_finite, p, new_p)),
                        (v, ifelse(has_non_finite, v, new_v))])
    return updates
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8,
            max_magnitude=5.0, infDecay=0.1):
    """Adam with global-norm gradient clipping; non-finite grads decay the
    parameter instead of stepping. Returns (updates, squared_norm)."""
    updates = []
    grads = T.grad(cost, params)
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    # Rescale only when the norm exceeds the cap.
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude),
                           max_magnitude / sqrtnorm, 1.0)
    i = shared(floatX(0.0))
    i_t = i + 1.0
    # Adam bias-correction terms.
    fix1 = 1.0 - (1.0 - b1) ** i_t
    fix2 = 1.0 - (1.0 - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    # Step counter is updated once (duplicate updates are rejected by Theano).
    updates.append((i, i_t))
    return updates, norm
def clip_grad_remove_nan(grads, clip_c_shared, mt_tparams,
                         freeze_word_emb=False, only_word_att=False,
                         gated_att=False):
    """Clip grads by global norm; replace non-finite grads with 0.1*param.

    Returns (grads, norm); clipping is skipped when the threshold is <= 0.
    """
    g2 = 0.
    for g in grads:
        g2 += (g * g).sum()
    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))
    if clip_c_shared.get_value() > 0.:
        params = itemlist(mt_tparams, freeze_word_emb, only_word_att,
                          gated_att)
        new_grads = []
        for g, p in zip(grads, params):
            # Scale down when the squared norm exceeds the clip threshold.
            tmpg = tensor.switch(g2 > (clip_c_shared * clip_c_shared),
                                 g / tensor.sqrt(g2) * clip_c_shared, g)
            new_grads.append(
                tensor.switch(not_finite, np.float32(.1) * p, tmpg))
        return new_grads, tensor.sqrt(g2)
    else:
        return grads, tensor.sqrt(g2)
def exe(self, mainloop):
    """Batch-average the grads, clip them by global norm, and replace all
    grads with 0.1*param when the norm is non-finite."""
    grads = mainloop.grads
    g_norm = 0.
    for p, g in grads.items():
        g /= self.batch_size
        grads[p] = g
        g_norm += (g ** 2).sum()
    not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
    g_norm = T.sqrt(g_norm)
    # Uniform rescale factor so the global norm never exceeds self.scaler.
    scaler = self.scaler / T.maximum(self.scaler, g_norm)
    for p, g in grads.items():
        grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
    mainloop.grads = grads
def pseudograd(loss, params, srng=None, temperature=1.0e-1,
               learning_rate=1.0e-2, rho2=0.95):
    """Metropolis-style pseudo-gradient descent with momentum.

    Proposes random perturbations; accepted moves (better loss or lucky
    draw, and finite new loss) are folded into a momentum buffer.
    """
    one = T.constant(1.0)
    zero = T.constant(0.0)
    deltas = [make_normal(param, srng=srng) for param in params]
    momentum = [make_copy(param) for param in params]
    new_params = [param + learning_rate * delta
                  for param, delta in zip(params, deltas)]
    new_loss = theano.clone(loss, replace=dict(zip(params, new_params)))
    # Metropolis acceptance probability of the proposed move.
    accepting_p = T.exp((loss - new_loss) / temperature)
    u = srng.uniform(size=(), dtype=loss.dtype)
    # Reject when the draw fails or the new loss is nan/inf.
    cond = T.or_(T.or_(u > accepting_p, T.isnan(new_loss)),
                 T.isinf(new_loss))
    step = T.switch(cond, zero, one)
    updates = OrderedDict()
    for m, delta in zip(momentum, deltas):
        updates[m] = m * rho2 + (one - rho2) * delta * step
    for param, m in zip(params, momentum):
        updates[param] = param + learning_rate * m
    return updates
def get_clip_sgd_updates(self, params, cost, learning_rate, momentum,
                         rescale=5.):
    """SGD with momentum and global-norm gradient clipping.

    Bug fix: the original applied ``T.sqrt`` twice (once when summing the
    squared grads and once afterwards), so clipping compared ``rescale``
    against the 4th root of the squared norm instead of the L2 norm.
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()
    if not hasattr(self, "momentum_velocity_"):
        self.momentum_velocity_ = [0.] * len(gparams)
    # Squared global norm, then one sqrt to get the actual L2 norm.
    grad_norm = sum(T.sqr(g).sum() for g in gparams)
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    grad_norm = T.sqrt(grad_norm)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    for n, (param, gparam) in enumerate(zip(params, gparams)):
        # Clip the gradient directly, not momentum etc.; non-finite grads
        # are replaced by a decay toward the parameter.
        gparam = T.switch(not_finite, 0.1 * param,
                          gparam * (scaling_num / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = momentum * velocity - learning_rate * gparam
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates
def get_gradients(self, model, data, **kwargs):
    """Compute cost gradients, optionally clipped to ``self.max_magnitude``
    by global norm, with non-finite grads replaced by 0.1*param."""
    cost = self.expr(model=model, data=data, **kwargs)
    params = list(model.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, grads))
    if self.gradient_clipping:
        norm_gs = 0.
        for grad in gradients.values():
            norm_gs += (grad ** 2).sum()
        not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
        norm_gs = T.sqrt(norm_gs)
        # Reused as the rescale factor from here on.
        norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                           self.max_magnitude / norm_gs, 1.)
        for param, grad in gradients.items():
            gradients[param] = T.switch(not_finite, .1 * param,
                                        grad * norm_gs)
    updates = OrderedDict()
    return gradients, updates
def updates(self, params, grads, learning_rate, momentum, rescale=5.):
    """RMSProp with Nesterov-style momentum and global-norm clipping.

    Bug fix: ``T.sqrt`` was applied twice to the gradient norm, so the
    clip threshold was compared against the 4th root of the squared norm
    rather than the L2 norm itself.
    """
    # Squared global norm, then a single sqrt.
    grad_norm = sum(T.sqr(g).sum() for g in grads)
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    grad_norm = T.sqrt(grad_norm)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4
    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        # Centered RMS of the gradient, floored to avoid blow-ups.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def shade(self, shape, lights, camera):
    """Phong shading: ambient + diffuse + specular, colorized per light.

    See: http://en.wikipedia.org/wiki/Phong_reflection_model#Description
    Material params are 1d, so b/w shadings are computed first and
    converted to color afterwards.
    """
    light = lights[0]
    material = shape.material
    normals = shape.normals(camera.rays)
    ambient_light = material.ka
    # Diffuse (Lambertian) term.
    diffuse_shadings = material.kd * T.tensordot(normals,
                                                 -light.normed_dir(), 1)
    # Specular term via the reflection vector rm.
    rm = 2.0 * (T.tensordot(normals, -light.normed_dir(), 1)
                .dimshuffle(0, 1, 'x')) * normals + light.normed_dir()
    specular_shadings = material.ks * (
        T.tensordot(rm, camera.look_at, 1) ** material.shininess)
    # Phong combination, then colorize and clamp to [0, 1].
    phong_shadings = ambient_light + diffuse_shadings + specular_shadings
    colorized = (phong_shadings.dimshuffle(0, 1, 'x') *
                 material.color.dimshuffle('x', 'x', 0) *
                 light.intensity.dimshuffle('x', 'x', 0))
    clipped = T.clip(colorized, 0, 1)
    distances = shape.distance(camera.rays)
    # Rays that hit nothing (infinite distance) shade to black.
    return broadcasted_switch(T.isinf(distances), [0., 0., 0.], clipped)
def compute_step(self, param, previous_step):
    """Fall back to ``scaler * param`` when the step has any non-finite
    entry; otherwise pass the step through unchanged."""
    bad = tensor.or_(tensor.isnan(previous_step),
                     tensor.isinf(previous_step))
    has_bad = tensor.any(bad)
    step = tensor.switch(has_bad, self.scaler * param, previous_step)
    return step, []
def recurrence(log_p_curr, log_p_prev, skip_mask=None):
    """One CTC-style forward step in log space: accumulate stay, shift and
    (masked) skip transitions from the previous column."""
    if skip_mask is None:
        skip_mask = T.ones_like(log_p_curr[:, 1:-2:2])
    # Normalise by the row max and move back to p space; -inf maps to 0.
    k = T.max(log_p_prev, axis=1, keepdims=True)
    norm_p_prev = T.switch(T.isinf(log_p_prev), 0, T.exp(log_p_prev - k))
    # Stay transitions.
    acc = norm_p_prev
    # Shift transitions from the previous label position.
    acc = T.inc_subtensor(acc[:, 1:], norm_p_prev[:, :-1])
    # Skip transitions, gated by skip_mask.
    acc = T.inc_subtensor(acc[:, 3::2],
                          T.switch(skip_mask, norm_p_prev[:, 1:-2:2], 0))
    # Back to log space; zero mass means -inf (log(p) is 0 for the first
    # two terms by construction).
    return T.switch(T.eq(acc, 0), -np.inf, log_p_curr + T.log(acc) + k)
def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999,
                epsilon=1e-8, grads=None):
    """Adam updates with norm clipping and a non-finite/overflow safeguard.

    Returns (norm, grads, updates).
    """
    # Gradients
    if grads is None:
        grads = tensor.grad(loss, self.trainables)
    # Clipping by global norm.
    norm = tensor.sqrt(sum([tensor.sqr(g).sum() for g in grads]))
    mult = theanotools.clipping_multiplier(norm, max_norm)
    grads = [mult * g for g in grads]
    # Zero every gradient when the norm is nan/inf, negative, or huge.
    new_cond = tensor.or_(
        tensor.or_(tensor.isnan(norm), tensor.isinf(norm)),
        tensor.or_(norm < 0, norm > 1e10))
    grads = [tensor.switch(new_cond, np.float32(0), g) for g in grads]
    # New moment estimates and bias-corrected step size.
    t = self.time + 1
    lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)
    means_t = [beta1 * m + (1. - beta1) * g
               for g, m in zip(grads, self.means)]
    vars_t = [beta2 * v + (1. - beta2) * tensor.sqr(g)
              for g, v in zip(grads, self.vars)]
    steps = [lr_t * m_t / (tensor.sqrt(v_t) + epsilon)
             for m_t, v_t in zip(means_t, vars_t)]
    # Updates
    updates = [(x, x - step) for x, step in zip(self.trainables, steps)]
    updates += [(m, m_t) for m, m_t in zip(self.means, means_t)]
    updates += [(v, v_t) for v, v_t in zip(self.vars, vars_t)]
    updates += [(self.time, t)]
    return norm, grads, updates
def graves_rmsprop_updates(self, params, grads, learning_rate=1e-4,
                           alpha=0.9, epsilon=1e-4, chi=0.95):
    r"""Alex Graves' RMSProp [1]_.

    .. math ::
        n_{i} &= \chi * n_{i-1} + (1 - \chi) * grad^{2}\\
        g_{i} &= \chi * g_{i-1} + (1 - \chi) * grad\\
        \Delta_{i} &= \alpha * \Delta_{i-1} - learning\_rate * grad /
                      sqrt(n_{i} - g_{i}^{2} + \epsilon)\\
        w_{i} &= w_{i-1} + \Delta_{i}

    References
    ----------
    .. [1] Graves, Alex.
       "Generating Sequences With Recurrent Neural Networks", p.23
       arXiv:1308.0850
    """
    updates = []
    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Decay toward the parameter when the global norm is non-finite.
        grad = T.switch(not_finite, 0.1 * param, grad)
        old_square = self.running_square_[n]
        old_avg = self.running_avg_[n]
        old_memory = self.memory_[n]
        new_square = chi * old_square + (1. - chi) * grad ** 2
        new_avg = chi * old_avg + (1. - chi) * grad
        new_memory = (alpha * old_memory - learning_rate * grad /
                      T.sqrt(new_square - new_avg ** 2 + epsilon))
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((old_memory, new_memory))
        updates.append((param, param + new_memory))
    return updates
def surface_pts(self, rayField):
    """Return the 3-D surface intersection point for every ray.

    Bug fix: the original multiplied by an undefined name ``rays``; the
    ray directions live on the transformed field, ``rf.rays`` (matching
    the sibling ``normals`` implementation).
    """
    rf = self.w2o(rayField)
    distance = self.distance(rayField)
    # Replace "no hit" infinities with a large finite distance.
    stabilized = T.switch(T.isinf(distance), 1000, distance)
    return rf.origin + (stabilized.dimshuffle(0, 1, 'x') * rf.rays)
def compute_updates(self, training_cost, params):
    """Clip gradients by global norm, then build parameter updates with
    the optimizer named by ``self.updater``."""
    updates = []
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip by global norm; non-finite grads become 0.1 * param.
    c = numpy.float32(self.cutoff)
    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
    clip_grads = []
    for p, g in grads.items():
        clip_grads.append(
            (p, T.switch(notfinite, numpy.float32(.1) * p,
                         g * normalization)))
    grads = OrderedDict(clip_grads)

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")
    return updates
def get_gradients(self, model, data, **kwargs):
    """Return (gradients, updates) for the model's params; when clipping
    is enabled, rescale to ``self.max_magnitude`` and guard non-finites."""
    cost = self.expr(model=model, data=data, **kwargs)
    params = list(model.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, grads))
    if self.gradient_clipping:
        # Accumulate the squared global norm over every gradient.
        norm_gs = 0.
        for grad in gradients.values():
            norm_gs += (grad ** 2).sum()
        not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
        norm_gs = T.sqrt(norm_gs)
        # norm_gs becomes the rescale factor from this point on.
        norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                           self.max_magnitude / norm_gs, 1.)
        for param, grad in gradients.items():
            gradients[param] = T.switch(not_finite, .1 * param,
                                        grad * norm_gs)
    updates = OrderedDict()
    return gradients, updates
def minimize(self, loss, momentum, rescale):
    """RMSProp with Nesterov-style momentum and global-norm clipping.

    Bug fix: ``T.sqrt`` was applied twice to the gradient norm, so the
    rescale threshold was compared against the 4th root of the squared
    norm instead of the L2 norm.
    """
    super(RMSPropOptimizer, self).minimize(loss)
    grads = self.gradparams
    # Squared global norm, then a single sqrt.
    grad_norm = sum(T.sqr(g).sum() for g in grads)
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    grad_norm = T.sqrt(grad_norm)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4
    updates = []
    params = self.params
    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        # Centered RMS of the gradient, floored to avoid blow-ups.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        update = momentum * memory - self.lr * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * self.lr * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def exe(self, mainloop):
    """Batch-average the grads, then clip by global norm; when
    ``self.check_nan`` is set, non-finite norms decay params instead."""
    grads = mainloop.grads
    g_norm = 0.
    for p, g in grads.items():
        g /= T.cast(self.batch_size, dtype=theano.config.floatX)
        grads[p] = g
        g_norm += (g ** 2).sum()
    if self.check_nan:
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
    g_norm = T.sqrt(g_norm)
    scaler = self.scaler / T.maximum(self.scaler, g_norm)
    if self.check_nan:
        for p, g in grads.items():
            grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
    else:
        for p, g in grads.items():
            grads[p] = g * scaler
    mainloop.grads = grads
def compute_step(self, parameter, previous_step):
    """Shrink toward the parameter when the step sum is non-finite;
    otherwise keep the step unchanged."""
    step_sum = tensor.sum(previous_step)
    # Sum of two boolean flags; > 0 means at least one of nan/inf fired.
    not_finite = tensor.isnan(step_sum) + tensor.isinf(step_sum)
    step = tensor.switch(not_finite > 0,
                         (1 - self.scaler) * parameter, previous_step)
    return step, []
def compute_step(self, param, previous_step):
    """Replace the step with ``scaler * param`` when its L2 norm is
    nan/inf; otherwise pass it through."""
    grad_norm = l2_norm([previous_step])
    is_bad = tensor.or_(tensor.isnan(grad_norm), tensor.isinf(grad_norm))
    step = tensor.switch(is_bad, self.scaler * param, previous_step)
    return step, []
def nan_shield(parameters, deltas, other_updates):
    """Guard updates: if any delta is non-finite, decay the params by 0.9
    and freeze every other update for this step."""
    delta_sum = sum(T.sum(d) for d in deltas)
    not_finite = T.isnan(delta_sum) | T.isinf(delta_sum)
    parameter_updates = []
    for p, d in izip(parameters, deltas):
        parameter_updates.append((p, T.switch(not_finite, 0.9 * p, p - d)))
    guarded_others = []
    for p, u in other_updates:
        guarded_others.append((p, T.switch(not_finite, p, u)))
    return parameter_updates, guarded_others
def updates(self, cost, params, learning_rate=0.1, momentum=0.95,
            rescale=5.):
    """RMSProp with Nesterov-style momentum and global-norm clipping.

    Bug fix: ``T.sqrt`` was applied twice to the gradient norm, so the
    rescale threshold was compared against the 4th root of the squared
    norm instead of the L2 norm.
    """
    grads = T.grad(cost, params)
    # Squared global norm, then a single sqrt.
    grad_norm = sum(T.sqr(g).sum() for g in grads)
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    grad_norm = T.sqrt(grad_norm)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1e-4
    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        # Centered RMS of the gradient, floored to avoid blow-ups.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8,
           max_magnitude=5.0, infDecay=0.1):
    """Adam with global-norm gradient clipping; non-finite grads decay the
    parameter. Returns (updates, squared_norm)."""
    updates = []
    grads = T.grad(cost, params)
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    # Rescale factor: only shrink when the norm exceeds the cap.
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude),
                           max_magnitude / sqrtnorm, 1.)
    i = shared(floatX(0.))
    i_t = i + 1.
    # Adam bias-correction terms.
    fix1 = 1. - (1. - b1) ** i_t
    fix2 = 1. - (1. - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    # Step counter is updated once (duplicate updates are rejected by Theano).
    updates.append((i, i_t))
    return updates, norm
def __init__(self, n_visible, n_hidden=150, n_hidden_recurrent=100,
             lr=0.001, l2_norm=None, l1_norm=None):
    """Build the RNN-RBM graph, attach RMSProp updates with nan/inf
    guarding, and compile the train/generate functions."""
    (v, v_sample, cost, monitor, params, updates_train, v_t,
     updates_generate, n_steps) = build_rnnrbm(
         n_visible, n_hidden, n_hidden_recurrent, lr,
         l2_norm=l2_norm, l1_norm=l1_norm)
    for param in params:
        # v_sample is held constant (contrastive-divergence style grad).
        gradient = T.grad(cost, param, consider_constant=[v_sample])
        # Replace nan/inf gradient entries with a decay toward the param.
        not_finite = T.or_(T.isnan(gradient), T.isinf(gradient))
        gradient = T.switch(not_finite, 0.1 * param, gradient)
        # RMSProp accumulator of squared gradients.
        accu = shared_zeros('accu_' + str(param.name),
                            param.get_value(borrow=True).shape)
        accu_new = 0.9 * accu + 0.1 * gradient ** 2
        updates_train[accu] = accu_new
        updates_train[param] = param - (lr * gradient /
                                        T.sqrt(accu_new + 1e-6))
    self.params = params
    self.train_function = theano.function([v], monitor,
                                          updates=updates_train)
    self.generate_function = theano.function([n_steps], v_t,
                                             updates=updates_generate)
def lda_logp(rt, gaze, values, error_lls,
             s_condition_index, s_subject_index,
             v_condition_index, v_subject_index,
             tau_condition_index, tau_subject_index,
             gamma_condition_index, gamma_subject_index,
             t0_condition_index, t0_subject_index,
             zerotol):
    """GLAM log-likelihood: expdrift race model mixed with a per-subject
    error likelihood; nan/inf likelihoods are zeroed before the log."""
    def _sel(par, subj_idx, cond_idx):
        # Per-trial parameter value with a trailing broadcast axis.
        return par[tt.cast(subj_idx, dtype='int32'),
                   tt.cast(cond_idx, dtype='int32')][:, None]

    # compute drifts
    drift = glam.components.expdrift(
        _sel(v, v_subject_index, v_condition_index),
        _sel(tau, tau_subject_index, tau_condition_index),
        _sel(gamma, gamma_subject_index, gamma_condition_index),
        values, gaze, zerotol)
    glam_ll = glam.components.tt_wienerrace_pdf(
        rt[:, None], drift,
        _sel(s, s_subject_index, s_condition_index),
        b,
        _sel(t0, t0_subject_index, t0_condition_index),
        zerotol)
    # mix likelihoods
    mixed_ll = (1 - p_error) * glam_ll + p_error * error_lls[subject_idx]
    mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
    mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)
    return tt.sum(tt.log(mixed_ll + zerotol))
def compute_updates(self, training_cost, params):
    """Clip gradients by global norm and dispatch to the configured
    optimizer (adagrad/adadelta/rmsprop/adam)."""
    updates = []
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Global-norm clipping; non-finite grads decay the parameters.
    c = numpy.float32(self.cutoff)
    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
    grads = OrderedDict(
        (p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization))
        for p, g in grads.items())

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")
    return updates
def lda_logp(rt, gaze, values, error_ll, v_index, tau_index, gamma_index,
             s_index, t0_index, is_multiplicative, zerotol):
    """GLAM log-likelihood, choosing multiplicative vs additive drift via a
    symbolic ``ifelse``; nan/inf likelihoods are zeroed before the log."""
    def _pick(par, idx):
        # Indexed parameter row with a trailing broadcast axis.
        return par[0, tt.cast(idx, dtype='int32')][:, None]

    v_t = _pick(v, v_index)
    tau_t = _pick(tau, tau_index)
    gamma_t = _pick(gamma, gamma_index)
    # Select the right drift function (both branches exist in the graph).
    drift = ifelse(
        is_multiplicative,
        glam.components.tt_drift_multiplicative(
            v_t, tau_t, gamma_t, values, gaze, zerotol),
        glam.components.tt_drift_additive(
            v_t, tau_t, gamma_t, values, gaze, zerotol))
    glam_ll = glam.components.tt_wienerrace_pdf(
        rt[:, None], drift, _pick(s, s_index), b, _pick(t0, t0_index),
        zerotol)
    # mix likelihoods
    mixed_ll = (1 - p_error) * glam_ll + p_error * error_ll
    mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
    mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)
    return tt.log(mixed_ll + zerotol)
def find_sigma(X_shared, sigma_shared, N, perplexity, sigma_iters,
               verbose=0):
    """Binary-search per-point sigmas until the conditional entropy matches
    log(perplexity)."""
    X = T.fmatrix('X')
    sigma = T.fvector('sigma')

    target = np.log(perplexity)

    P = T.maximum(p_ij_conditional_var(X, sigma), epsilon)
    entropy = -T.sum(P * T.log(P), axis=1)

    # Binary-search interval state.
    sigmin_shared = theano.shared(np.full(N, np.sqrt(epsilon), dtype=floath))
    sigmax_shared = theano.shared(np.full(N, np.inf, dtype=floath))
    sigmin = T.fvector('sigmin')
    sigmax = T.fvector('sigmax')

    # Tighten whichever bound the entropy comparison allows.
    upmin = T.switch(T.lt(entropy, target), sigma, sigmin)
    upmax = T.switch(T.gt(entropy, target), sigma, sigmax)

    givens = {X: X_shared, sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigmin_shared, upmin), (sigmax_shared, upmax)]
    update_intervals = theano.function([], entropy, givens=givens,
                                       updates=updates)

    # While the upper bound is still infinite, double sigma; else bisect.
    upsigma = T.switch(T.isinf(sigmax), sigma * 2, (sigmin + sigmax) / 2.)

    givens = {sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigma_shared, upsigma)]
    update_sigma = theano.function([], sigma, givens=givens,
                                   updates=updates)

    for i in range(sigma_iters):
        e = update_intervals()
        update_sigma()
        if verbose:
            print('Finding sigmas... Iteration {0}/{1}: Perplexities in '
                  '[{2:.4f}, {3:.4f}].'.format(i + 1, sigma_iters,
                                               np.exp(e.min()),
                                               np.exp(e.max())),
                  end='\r')
        if np.any(np.isnan(np.exp(e))):
            raise SigmaTooLowException(
                'Invalid sigmas. The perplexity is probably too low.')
    if verbose:
        print('\nDone. Perplexities in [{0:.4f}, {1:.4f}].'.format(
            np.exp(e.min()), np.exp(e.max())))
def acc_cost(log_probs, label_mask, frame_mask, skip_mask=None):
    """Sum per-sequence log-likelihoods from the forward-backward lattice
    using a numerically stable log-sum-exp over the label axis."""
    seq_acc_logp = forward_backward_pass(log_probs, label_mask, frame_mask,
                                         skip_mask)
    # Stable log-sum-exp: -inf entries must contribute 0, not nan.
    k = T.max(seq_acc_logp, axis=2, keepdims=True)
    safe_exp = T.switch(T.isinf(seq_acc_logp), 0, T.exp(seq_acc_logp - k))
    log_sum_p = T.log(T.sum(safe_exp, axis=2)) + k.dimshuffle(0, 1)
    return T.sum(log_sum_p, axis=0)
def get_vanilla_sgd_updates(param_list, gradients, lr):
    """Do SGD updates with vanilla step rule; keep the old value for any
    parameter whose step produced nan/inf."""
    updates = []
    for p, g in zip(param_list, gradients):
        new_p = p - lr * g
        # Non-zero when the candidate value contains nan or inf.
        bad = T.any(T.isnan(new_p) + T.isinf(new_p))
        updates.append((p, ifelse(bad, p, new_p)))
    return updates
def clip(clip_size, parameters, gradients):
    """Rescale gradients so their global L2 norm is at most ``clip_size``;
    replace them with ``0.1 * param`` when the norm is non-finite.

    Bug fix: the norm was computed over the *parameters* rather than the
    gradients, so the clip factor never tracked the gradient magnitude.
    """
    grad_mag = T.sqrt(sum(T.sum(T.sqr(g)) for g in gradients))
    exploded = T.isnan(grad_mag) | T.isinf(grad_mag)
    scale = clip_size / T.maximum(clip_size, grad_mag)
    return [
        T.switch(exploded, 0.1 * p, scale * g)
        for p, g in zip(parameters, gradients)
    ]
def bfgs(inverse_hessian, weight_delta, gradient_delta, maxrho=1e4):
    """One BFGS inverse-Hessian update; rho is capped (sign preserved)
    when the curvature dot product underflows to give an infinite rho."""
    ident_matrix = cast_float(T.eye(inverse_hessian.shape[0]))
    maxrho = cast_float(maxrho)
    # rho = 1 / (y^T s).
    rho = cast_float(1.) / gradient_delta.dot(weight_delta)
    rho = ifelse(T.isinf(rho), maxrho * T.sgn(rho), rho)
    left = ident_matrix - T.outer(weight_delta, gradient_delta) * rho
    right = ident_matrix - T.outer(gradient_delta, weight_delta) * rho
    rank_one = rho * T.outer(weight_delta, weight_delta)
    return left.dot(inverse_hessian).dot(right) + rank_one
def clip(clip_size, parameters, gradients):
    """Clip gradients to a global L2 norm of at most ``clip_size``, falling
    back to ``0.1 * param`` when the norm is nan/inf.

    Bug fix: the magnitude was computed from the *parameters* instead of
    the gradients, so gradient explosions were never actually clipped.
    """
    grad_mag = T.sqrt(sum(T.sum(T.sqr(g)) for g in gradients))
    exploded = T.isnan(grad_mag) | T.isinf(grad_mag)
    scale = clip_size / T.maximum(clip_size, grad_mag)
    clipped = []
    for p, g in zip(parameters, gradients):
        clipped.append(T.switch(exploded, 0.1 * p, scale * g))
    return clipped
def replace_nans(tensor):
    """Replace non-finite entries: nan and +inf become the largest finite
    float; -inf becomes the most negative finite float.

    Bug fix: the original used ``sys.float_info.min`` for -inf, but that
    is the smallest *positive* normal float (~2.2e-308), not the most
    negative value — -inf was effectively mapped to ~0.
    """
    tensor = T.switch(T.isnan(tensor), sys.float_info.max, tensor)
    return T.switch(
        T.isinf(tensor),
        T.switch(T.lt(tensor, 0), -sys.float_info.max, sys.float_info.max),
        tensor)
def gradient_clipping(grads, tparams, clip_c=1.0):
    """Clip grads to global norm <= clip_c.

    Returns (clipped_grads, norm_is_not_finite, norm_exceeded_threshold).
    """
    g2 = 0.
    for g in grads:
        g2 += (g ** 2).sum()
    g2 = tensor.sqrt(g2)
    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))
    new_grads = [tensor.switch(g2 > clip_c, g * (clip_c / g2), g)
                 for p, g in zip(tparams.values(), grads)]
    return new_grads, not_finite, tensor.lt(clip_c, g2)
def normals(self, rayField):
    """Returns the sphere normals at each hit point."""
    rf = self.w2o(rayField)
    distance = self.distance(rayField)
    # Misses (infinite distance) collapse onto the origin.
    distance = T.switch(T.isinf(distance), 0, distance)
    hit_points = rf.origin + distance.dimshuffle(0, 1, 'x') * rf.rays
    lengths = T.sqrt(T.sum(hit_points ** 2, 2)).dimshuffle(0, 1, 'x')
    return hit_points / lengths  # need to fix
def replace_nans(tensor):
    """Map nan and +inf to the largest finite float and -inf to the most
    negative finite float.

    Bug fix: ``sys.float_info.min`` is the smallest *positive* normal
    float (~2.2e-308), not the most negative float, so -inf previously
    became ~0 instead of a very negative value.
    """
    tensor = T.switch(T.isnan(tensor), sys.float_info.max, tensor)
    return T.switch(T.isinf(tensor),
                    T.switch(T.lt(tensor, 0),
                             -sys.float_info.max,
                             sys.float_info.max),
                    tensor)
def normals(self, rayField):
    """Returns the sphere normals at each hit point."""
    rf = self.w2o(rayField)
    distance = self.distance(rayField)
    # Rays that never hit the sphere get distance 0 instead of inf.
    distance = T.switch(T.isinf(distance), 0, distance)
    projections = rf.origin + (distance.dimshuffle(0, 1, 'x') * rf.rays)
    magnitude = T.sqrt(T.sum(projections ** 2, 2)).dimshuffle(0, 1, 'x')
    return projections / magnitude  # need to fix
def step_clipping(params, gparams, scale=1.):
    """Rescale gradients so the global norm is <= scale; decay params when
    the norm is non-finite."""
    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in gparams))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    multiplier = T.switch(grad_norm < scale, 1., scale / grad_norm)
    clipped = []
    for param, gparam in izip(params, gparams):
        clipped.append(T.switch(notfinite, param * 0.1,
                                gparam * multiplier))
    return clipped
def get_output_for(self, input, **kwargs):
    """Shift to non-negative per channel, normalize over the channel axis,
    then zero out any nan/inf produced by a zero channel sum."""
    # batch_size, n_channels, n_rows, n_cols = self.input_shape
    shifted = input - input.min(axis=1, keepdims=True)
    output = shifted / shifted.sum(axis=1, keepdims=True)
    # 0/0 yields nan; x/0 yields inf — zero both.
    output = T.set_subtensor(output[T.isnan(output).nonzero()], 0)
    output = T.set_subtensor(output[T.isinf(output).nonzero()], 0)
    return output
def step_clipping(params, gparams, scale=1.0):
    """Clip gradients to a global norm of at most ``scale``; non-finite
    norms replace each gradient with ``0.1 * param``."""
    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in gparams))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    multiplier = T.switch(grad_norm < scale, 1.0, scale / grad_norm)
    params_clipping = [
        T.switch(notfinite, param * 0.1, gparam * multiplier)
        for param, gparam in izip(params, gparams)
    ]
    return params_clipping
def acc_cost(log_probs, label_mask, frame_mask, skip_mask=None):
    """Total log-likelihood over sequences via a stable log-sum-exp on the
    forward-backward lattice's label axis."""
    seq_acc_logp = forward_backward_pass(
        log_probs, label_mask, frame_mask, skip_mask
    )
    # Subtract the max before exponentiating; -inf contributes 0 mass.
    k = T.max(seq_acc_logp, axis=2, keepdims=True)
    summed = T.sum(
        T.switch(T.isinf(seq_acc_logp), 0, T.exp(seq_acc_logp - k)),
        axis=2
    )
    log_sum_p = T.log(summed) + k.dimshuffle(0, 1)
    return T.sum(log_sum_p, axis=0)
def exe(self, mainloop):
    """Per-parameter clipping: batch-average each grad, rescale it by its
    own norm, and guard against nan/inf."""
    grads = mainloop.grads
    for p, g in grads.items():
        g /= self.batch_size
        g_norm = T.sqrt((g ** 2).sum())
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
        scaler = self.scaler / T.maximum(self.scaler, g_norm)
        grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
    mainloop.grads = grads
def from_partial(self, X, dX):
    """Tangent-space projection of a partial derivative of an (U, S, V)
    triple; nan/inf entries in dU/dV are masked out first."""
    eps = 1e-10
    U, S, V = X
    dU, dS, dV = dX
    # Indicator masks: 1 where dU/dV contains nan or inf.
    umask = 1 - (1 - tensor.isnan(dU)) * (1 - tensor.isinf(dU))
    vmask = 1 - (1 - tensor.isnan(dV)) * (1 - tensor.isinf(dV))
    # U S V => U mask propagates by columns, V by rows.
    smask = 1 - tensor.prod(1 - umask, axis=0) * tensor.prod(1 - vmask,
                                                             axis=1)
    S = tensor.diag(S)
    dU = tensor.set_subtensor(dU[umask.nonzero()], 0.0)
    # Pseudo-inverse: invert only where |S| is above eps, zero elsewhere.
    S_pinv = tensor.switch(tensor.gt(abs(S), eps), 1.0 / S, 0.0)
    S_pinv = tensor.set_subtensor(S_pinv[smask.nonzero()], 0.0)
    S_pinv = tensor.diag(S_pinv)
    dV = tensor.set_subtensor(dV[vmask.nonzero()], 0.0)
    ZV = dU.dot(S_pinv)
    UtZV = dS
    ZtU = S_pinv.dot(dV)
    Zproj = (ZV - U.dot(UtZV), UtZV, ZtU - (UtZV.dot(V)))
    return Zproj
def exe(self, mainloop):
    """
    .. todo::

    WRITEME

    Clips all gradients jointly by their global L2 norm.  The norm is
    computed over batch-size-normalized gradients, but the scaling is
    applied to the raw gradients; non-finite norms trigger the
    0.1 * param pseudo-gradient fallback.
    """
    grads = mainloop.grads
    sq_norm = sum(T.sqr(g / self.batch_size).sum() for g in grads.values())
    # test finiteness on the squared norm (equivalent, and available early)
    bad = T.or_(T.isnan(sq_norm), T.isinf(sq_norm))
    norm = T.sqrt(sq_norm)
    factor = self.scaler / T.maximum(self.scaler, norm)
    for param, grad in grads.items():
        grads[param] = T.switch(bad, 0.1 * param, grad * factor)
    mainloop.grads = grads
def gradient_descent(self, loss):
    """Momentum GD with gradient clipping.

    Gradients are clipped to a global L2 norm of 5.0; a non-finite norm
    replaces each gradient with 0.1 * its parameter.  Returns an
    OrderedDict of parameter updates.
    """
    grads = T.grad(loss, self.params)
    self.momentum_velocity_ = [0.0] * len(grads)
    norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
    bad = T.or_(T.isnan(norm), T.isinf(norm))
    denom = T.maximum(5.0, norm)
    updates = OrderedDict()
    for idx, (param, g) in enumerate(zip(self.params, grads)):
        # clip (or substitute a pseudo-gradient when non-finite)
        g = T.switch(bad, 0.1 * param, g * (5.0 / denom))
        velocity = self.momentum_velocity_[idx]
        step = self.momentum * velocity - self.learning_rate * g
        self.momentum_velocity_[idx] = step
        updates[param] = param + step
    return updates
def find_sigma(X_shared, sigma_shared, N, perplexity, sigma_iters,
               metric, verbose=0):
    """Binary search on sigma for a given perplexity.

    Mutates sigma_shared in place over sigma_iters iterations, keeping a
    per-point search interval [sigmin, sigmax] in shared variables.
    Raises if any resulting perplexity is NaN.
    """
    X = T.fmatrix('X')
    sigma = T.fvector('sigma')

    # entropy target equivalent to the desired perplexity
    target = np.log(perplexity)

    P = T.maximum(p_Xp_given_X_var(X, sigma, metric), epsilon)
    entropy = -T.sum(P*T.log(P), axis=1)

    # Setting update for binary search interval
    sigmin_shared = theano.shared(np.full(N, np.sqrt(epsilon), dtype=floath))
    sigmax_shared = theano.shared(np.full(N, np.inf, dtype=floath))

    sigmin = T.fvector('sigmin')
    sigmax = T.fvector('sigmax')

    # entropy below target -> sigma too small -> raise the lower bound;
    # entropy above target -> sigma too large -> lower the upper bound
    upmin = T.switch(T.lt(entropy, target), sigma, sigmin)
    upmax = T.switch(T.gt(entropy, target), sigma, sigmax)

    givens = {X: X_shared, sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigmin_shared, upmin), (sigmax_shared, upmax)]
    update_intervals = theano.function([], entropy, givens=givens,
                                       updates=updates)

    # Setting update for sigma according to search interval:
    # while the upper bound is still +inf, double sigma; otherwise bisect
    upsigma = T.switch(T.isinf(sigmax), sigma*2, (sigmin + sigmax)/2.)

    givens = {sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigma_shared, upsigma)]
    update_sigma = theano.function([], sigma, givens=givens, updates=updates)

    # NOTE(review): assumes sigma_iters >= 1 — 'e' below is undefined
    # otherwise; confirm callers never pass 0.
    for i in range(sigma_iters):
        e = update_intervals()
        update_sigma()
        if verbose:
            print('Iteration: {0}.'.format(i+1))
            print('Perplexities in [{0:.4f}, {1:.4f}].'.format(
                np.exp(e.min()), np.exp(e.max())))

    if np.any(np.isnan(np.exp(e))):
        raise Exception('Invalid sigmas. The perplexity is probably too low.')
def bfgs(inverse_hessian, weight_delta, gradient_delta, maxrho=1e4):
    """One BFGS update of the inverse-Hessian approximation.

    rho = 1 / (gradient_delta . weight_delta); when the dot product is
    zero, rho overflows to infinity and is capped at +/- maxrho.
    """
    identity = T.eye(inverse_hessian.shape[0])
    maxrho = asfloat(maxrho)

    curvature = gradient_delta.dot(weight_delta)
    rho = asfloat(1.) / curvature
    # guard against division by zero above
    rho = ifelse(T.isinf(rho), maxrho * T.sgn(rho), rho)

    left = identity - T.outer(weight_delta, gradient_delta) * rho
    right = identity - T.outer(gradient_delta, weight_delta) * rho
    correction = rho * T.outer(weight_delta, weight_delta)
    return left.dot(inverse_hessian).dot(right) + correction
def getUpdates(self):
    """RMSProp-style adaptive updates with momentum and global-norm
    gradient clipping.

    Returns an OrderedDict mapping each parameter to its new value;
    per-parameter running statistics are kept on self and initialized
    lazily on the first call.

    Fix: the gradient norm was square-rooted twice (T.sqrt applied to
    the sum of squares, then T.sqrt applied again to the result), so
    clipping compared `rescale` against norm**0.5 rather than the actual
    L2 norm. The sum of squares is now kept un-rooted for the
    finiteness test and square-rooted exactly once, matching the other
    clipping routines in this file.
    """
    params = self.params
    lr = self.lr
    momentum = self.momentum
    rescale = self.rescale
    gparams = self.gparams
    updates = OrderedDict()

    # lazily initialize per-parameter state on first use
    if not hasattr(self, "running_average_"):
        self.running_square_ = [0.] * len(gparams)
        self.running_avg_ = [0.] * len(gparams)
        self.updates_storage_ = [0.] * len(gparams)

    if not hasattr(self, "momentum_velocity_"):
        self.momentum_velocity_ = [0.] * len(gparams)

    # Gradient clipping: finiteness of the squared norm is equivalent to
    # finiteness of the norm itself, so test before the single sqrt.
    grad_sq_norm = sum(map(lambda x: T.sqr(x).sum(), gparams))
    not_finite = T.or_(T.isnan(grad_sq_norm), T.isinf(grad_sq_norm))
    grad_norm = T.sqrt(grad_sq_norm)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    for n, (param, gparam) in enumerate(zip(params, gparams)):
        # clip, or fall back to a pseudo-gradient when non-finite
        gparam = T.switch(not_finite, 0.1 * param,
                          gparam * (scaling_num / scaling_den))

        combination_coeff = 0.9
        minimum_grad = 1e-4
        # running second moment of the gradient
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(gparam)
        # running first moment of the gradient
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * gparam
        # RMS of the centered gradient, floored to avoid huge steps
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)

        velocity = self.momentum_velocity_[n]
        update_step = momentum * velocity - lr * (gparam / rms_grad)

        self.running_square_[n] = new_square
        self.running_avg_[n] = new_avg
        self.updates_storage_[n] = update_step
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step

    return updates
def ada_delta(self, loss, rho=0.95, eps=1e-8):
    '''AdaDelta with Gradient Clipping.

    Clips gradients to a global L2 norm of 5.0 (pseudo-gradient
    0.01 * param when the norm is non-finite) and returns an
    OrderedDict of AdaDelta updates for parameters and their running
    squared-gradient / squared-delta accumulators.
    '''
    grads = T.grad(loss, self.params)
    norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
    bad = T.or_(T.isnan(norm), T.isinf(norm))
    denom = T.maximum(5.0, norm)
    updates = OrderedDict()
    for param, g, sq_old, delta_sq_old in zip(self.params, grads,
                                              self.gradients_sq,
                                              self.deltas_sq):
        g = T.switch(bad, 0.01 * param, g * (5.0 / denom))
        # accumulate squared gradient
        sq_new = rho * sq_old + (1 - rho) * g ** 2
        # AdaDelta step: ratio of RMS(delta) to RMS(gradient)
        delta = (T.sqrt(delta_sq_old + eps) / T.sqrt(sq_new + eps)) * g
        delta_sq_new = rho * delta_sq_old + (1 - rho) * delta ** 2
        updates[param] = param - delta
        updates[sq_old] = sq_new
        updates[delta_sq_old] = delta_sq_new
    return updates
def clip_gradients_norm(gradients, threshold, parameters, fix_nan = False):
    """Divide every gradient by max(global L2 norm, threshold).

    With fix_nan=True, a NaN/Inf global norm makes each gradient fall
    back to 0.1 * its parameter, logged through a Print op.
    Returns the list of rescaled gradients.
    """
    squared = T.concatenate([T.sqr(g.flatten()) for g in gradients])
    norm = T.sqrt(squared.sum())
    divisor = T.maximum(norm, threshold)
    bad = T.or_(T.isnan(norm), T.isinf(norm)) if fix_nan else None
    result = []
    for g, p in zip(gradients, parameters):
        if fix_nan:
            fallback = 0.1 * p
            logged = Print("NaN detected! Fixing with pseudogradient with mean:",
                           ["mean"])(fallback)
            result.append(T.switch(bad, logged, g / divisor))
        else:
            result.append(g / divisor)
    return result