def step(x_t, y_t, h_tm1, Wx, Wh, bh, Wy, by, lr, switch):
    h_t = relu(T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + bh)
    yo_t = relu(T.dot(h_t, Wy) + by)

    updates = OrderedDict()

    # Train the RNN: backprop (loss + DNI output)
    loss = T.mean(T.square(yo_t - y_t))
    dni_out = self.dni.output(h_t)
    for param in self.params:
        dlossdparam = T.grad(loss, param)
        dniJ = T.Lop(h_t, param, dni_out, disconnected_inputs='ignore')
        updates[param] = param - lr * T.switch(
            T.gt(switch, 0), dlossdparam + dniJ, dlossdparam)

    # Update the DNI (from the last step)
    # re-calculate the DNI prediction from the last step
    # note: can't be passed through scan or T.grad won't work
    dni_out_old = self.dni.output(h_tm1)
    # dni_target: current loss backprop'ed + new dni backprop'ed
    dni_target = T.grad(loss, h_tm1) + T.Lop(h_t, h_tm1, dni_out)
    dni_error = T.sum(T.square(dni_out_old - dni_target))
    for param in self.dni.params:
        gparam = T.grad(dni_error, param)
        updates[param] = param - lr * gparam

    return [h_t, loss, dni_error], updates
def test_multiple_outputs(self): m = tensor.matrix('m') v = tensor.vector('v') m_ = tensor.matrix('m_') v_ = tensor.vector('v_') mval = self.rng.uniform(size=(3, 7)).astype(theano.config.floatX) vval = self.rng.uniform(size=(7, )).astype(theano.config.floatX) m_val = self.rng.uniform(size=(3, 7)).astype(theano.config.floatX) v_val = self.rng.uniform(size=(7, )).astype(theano.config.floatX) rop_out1 = tensor.Rop([m, v, m + v], [m, v], [m_, v_]) assert isinstance(rop_out1, list) assert len(rop_out1) == 3 rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_]) assert isinstance(rop_out2, tuple) assert len(rop_out2) == 3 lop_out1 = tensor.Lop([m, v, m + v], (m, v), [m_, v_]) assert isinstance(lop_out1, tuple) assert len(lop_out1) == 2 lop_out2 = tensor.Lop((m, v, m + v), [m, v], [m_, v_]) assert isinstance(lop_out2, list) assert len(lop_out2) == 2 all_outs = [] for o in rop_out1, rop_out2, lop_out1, lop_out2: all_outs.extend(o) f = theano.function([m, v, m_, v_], all_outs) f(mval, vval, m_val, v_val)
def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.gc_outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.gc_outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * (nw_out + eps) elif out_operator == 'sigmoid': factor = const( options['cbs']) # * nw_out * (1 - nw_out) else: factor = const(options['cbs']) if out_operator != 'sigmoid': loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) else: tnwout = TT.nnet.sigmoid(nw_out) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) *\ tnwout * (1 - tnwout)/ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)] return [gv_args[0] + const(1)] + Gvs
def hypergrad(params_ele, params_hyper, dvalid_dtheta, loss_ele, loss_hyper,
              loss_ele_penalty=0.):
    """
    Function defining the hypergradients: gradients of the validation cost
    with respect to various hyperparameters.
    The function separates penalty hyperparameters
    (which are assumed to depend only on the weights w)
    from noise and other hyperparameters,
    to avoid dependency errors in the Lop operator.

    Inputs:
        params_ele, params_hyper :: elementary parameters and hyperparameters
        loss_ele, loss_hyper :: cross-entropy on training and validation set
        dvalid_dtheta :: gradients of the validation cost w.r.t. params_ele
        loss_ele_penalty :: penalty term on the training set
    """
    # initializations
    reg_penalty, reg_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_penalty += [regular]
        elif reg_type in noise_list:
            reg_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # separate weight parameters and gradients
    for (param, grad) in zip(params_ele, dvalid_dtheta):
        paramType, _ = param.name.split('_')
        if paramType == 'W':
            w += [param]
            dvalid_dw += [grad]

    # hyper-gradients
    if reg_penalty:
        dpenalty_dw = T.grad(loss_ele_penalty, w)
        dpenalty_dw = [-grad for grad in dpenalty_dw]
        grad_penalty = T.Lop(dpenalty_dw, reg_penalty, dvalid_dw)

    if reg_noise:
        dele_dtheta = T.grad(loss_ele, params_ele)
        dele_dtheta = [-grad for grad in dele_dtheta]
        grad_noise = T.Lop(dele_dtheta, reg_noise, dvalid_dtheta)

    # outputs
    params_hyper = reg_penalty + reg_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return params_hyper, dvalid_dgamma
def hypergrad(paramsT1, paramsT2, gradC2T1, c1, c2, p1=0., p2=0.):
    '''
    Function defining the hypergradients: gradients of the validation cost
    with respect to various hyperparameters.
    The function separates penalty hyperparameters
    (which are assumed to depend only on W)
    from noise and other hyperparameters,
    to avoid dependency errors in the Lop operator.

    Inputs:
        paramsT1, paramsT2 :: T1 and T2 parameters
        c1, c2 :: cross-entropy on training and validation set
        p1, p2 :: penalty terms on training and validation set (p2 assumed 0)
    '''
    # initializations
    rglrzPenal = []
    rglrzNoiz = []
    gradPenal = []
    gradNoiz = []
    W = []
    gradC2W = []

    # separate different types of parameters
    for rglrz in paramsT2:
        rglrzType, _ = rglrz.name.split('_')
        if rglrzType in penalList:
            rglrzPenal += [rglrz]
        elif rglrzType in noizList:
            rglrzNoiz += [rglrz]
        else:
            print 'Hypergrad not implemented for ', rglrzType

    # separate weight parameters and gradients
    for (param, grad) in zip(paramsT1, gradC2T1):
        paramType, _ = param.name.split('_')
        if paramType == 'W':
            W += [param]
            gradC2W += [grad]

    # hyper-gradients
    if rglrzPenal != []:
        gradPW = T.grad(p1, W)
        gradPW = [-grad for grad in gradPW]
        gradPenal = T.Lop(gradPW, rglrzPenal, gradC2W)

    if rglrzNoiz != []:
        gradE1T1 = T.grad(c1, paramsT1)
        gradE1T1 = [-grad for grad in gradE1T1]
        gradNoiz = T.Lop(gradE1T1, rglrzNoiz, gradC2T1)

    # outputs
    paramsT2 = rglrzPenal + rglrzNoiz
    gradC2T2 = gradPenal + gradNoiz

    return paramsT2, gradC2T2
def Gvs(self, *args):
    # Contribution of hid_sig
    nw_args1 = TT.Lop(
        self.hid_sig, self.params,
        TT.Rop(self.hid_sig, self.params, args) /
        ((1 - self.hid_sig) * self.hid_sig * self.mbs))
    # Contribution of hid_sftmax
    nw_args2 = TT.Lop(
        self.hid_sftmax, self.params,
        TT.Rop(self.hid_sftmax, self.params, args) /
        (self.hid_sftmax * self.mbs))
    return [x + y for x, y in zip(nw_args1, nw_args2)]
def reinforce_no_baseline(params, policy, cost, lr, regularising_cost=None): """ return reinforce updates @policy and @cost should be of shape (minibatch_size, 1) @policy should be the probability of the sampled actions """ log_pol = T.log(policy) if regularising_cost is None: return [(i, i - lr * gi) for i, gi in zip( params, T.Lop(f=log_pol, wrt=params, eval_points=cost))] else: return [(i, i - lr * (gi + gr)) for i, gi, gr in zip( params, T.Lop(f=log_pol, wrt=params, eval_points=cost), T.grad(regularising_cost, params))]
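A hedged usage sketch for the helper above. Every name here (the linear softmax policy, its inputs, and the learning rate) is invented for illustration and is not taken from the original project; it only shows how the returned update list can be handed to theano.function.

import numpy as np
import theano
import theano.tensor as T

n_states, n_actions = 4, 3
W = theano.shared(np.zeros((n_states, n_actions)), name='W')

states = T.dmatrix('states')    # (minibatch_size, n_states)
actions = T.ivector('actions')  # indices of the sampled actions
cost = T.dmatrix('cost')        # (minibatch_size, 1), e.g. negated returns

probs = T.nnet.softmax(states.dot(W))
# probability of the actions that were actually sampled, shape (minibatch_size, 1)
policy = probs[T.arange(actions.shape[0]), actions].reshape((-1, 1))

updates = reinforce_no_baseline([W], policy, cost, lr=0.05)
train = theano.function([states, actions, cost], policy, updates=updates)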
def compute_Gv(*args): (hid_sig, hid_sftmax) = self.get_hiddens() nw_args1 = TT.Lop( hid_sig, self.params, TT.Rop(hid_sig, self.params, args) / ((1 - hid_sig) * hid_sig * self.batchsize)) nw_args2 = TT.Lop( hid_sftmax, self.params, TT.Rop(hid_sftmax, self.params, args) / (hid_sftmax * self.batchsize)) fin_vals = [x + y for x, y in zip(nw_args1, nw_args2)] new_vals = safe_clone(fin_vals, [self.X, self.Y], [self.loc_x, self.loc_y]) return new_vals, {}
def setup(self, bottom, top): input = T.tensor4("input") v = T.matrix("v") result = T.sum(input, axis=(2, 3)) result_g = T.Lop(result, input, v) self.f = theano.function([input], result) self.b = theano.function([input, v], result_g)
def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict(zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [x for x in model.params if x in theano.gof.graph.inputs([nw_out])] loc_args = [x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out])] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const(options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params)] return [gv_args[0] + const(1)] + Gvs
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum): has_momentum = momentum.get_value() > 0.0 samples = [ default_mrng.normal(size=p.shape, avg=0, std=1, dtype=theano.config.floatX) for p in params ] HVs = T.Lop(gparams, params, samples) i = theano.shared(np.float64(0.0).astype(theano.config.floatX)) i_t = i + 1.0 omg_t = 1.0 - gamma**i_t for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs): if is_subtensor_op(p): raise Exception("ESGD subtensor update not implemented!") else: D_t = D * gamma + T.sqr(Hv) * (1.0 - gamma) if has_momentum: m_t = m * momentum + g updates[m] = m_t else: m_t = g g_t = m_t / (T.sqrt(D_t / omg_t + eps)) updates[D] = D_t updates[p] = p - lr * g_t updates[i] = i_t
def compute_Ax(x):
    # There are three ways to compute the Fisher-vector product:
    # 1. https://github.com/joschu/modular_rl/blob/master/modular_rl/trpo.py#L54
    # Use theano.gradient.disconnected_grad and call theano.tensor.grad() twice.
    # WARNING: In our case (with the attention mechanism) it is extremely slow.
    # 2. http://deeplearning.net/software/theano/tutorial/gradients.html#hessian-times-a-vector
    # Use only theano.tensor.Rop, but you will need to calculate the fixed_output outside
    # of the compiled function, because disconnected_grad will not work with Rop.
    # 3. https://github.com/pascanur/natgrad/blob/master/model_convMNIST_standard.py
    # Rop divided by the output, because the metric F is based on the gradient of log(output).
    # Here we also split the vector of parameters. Not checked, but it may be
    # faster than supplying a few vectors to minresQLP.
    xs = []
    offset = 0
    for p in params:
        shape = p.get_value().shape
        size = np.prod(shape)
        xs.append(x[offset:offset + size].reshape(shape))
        offset += size

    jvp = T.Rop(new_output, params, xs) / (
        new_output * self.batch_size * self.history + TINY)
    fvp = T.Lop(new_output, params, jvp)
    fvp = T.concatenate([g.flatten() for g in fvp])
    return [fvp], {}
def __init__(self, t_cost, t_traj_info, t_inputs, params, reg=1e-5): t_new_params = [ _np2theano(p.name, p.get_value(borrow=True)) for p in params ] t_mean = t_traj_info['act_mean'] t_mean = t_mean.reshape((-1, t_mean.shape[-1])) t_logstd = t_traj_info['act_logstd'] t_logstd = t_logstd.reshape((-1, t_logstd.shape[-1])) t_new_mean = t_traj_info['new_act_mean'] t_new_mean = t_new_mean.reshape((-1, t_new_mean.shape[-1])) t_new_logstd = t_traj_info['new_act_logstd'] t_new_logstd = t_new_logstd.reshape((-1, t_new_logstd.shape[-1])) print 'Compiling cost function ... ', s = time() self.cost = theano.function(inputs=t_inputs, outputs=t_cost, on_unused_input='ignore') print 'finished in %f seconds' % (time() - s) print 'Building cost grad function ... ', s = time() _t_cost_grad = T.grad(-t_cost, wrt=params) print 'finished in %f seconds' % (time() - s) print 'Compiling cost grad function ... ', s = time() self._cost_grad = theano.function(inputs=t_inputs, outputs=[t_cost] + _t_cost_grad, on_unused_input='ignore') print 'finished in %f seconds' % (time() - s) print 'Building Hx function ... ', s = time() mu = T.concatenate([t_new_mean, t_new_logstd], axis=-1) Jx = sum([T.Rop(mu, p, x) for (p, x) in zip(params, t_new_params)]) M = T.tile(T.eye(2), (mu.shape[0], 1, 1)) Jx = Jx.reshape((Jx.shape[0], Jx.shape[1], 1)) Jx = T.tile(Jx, (1, 1, Jx.shape[1])) MJx = Jx JMJx = [ T.Lop(MJx, p, x, disconnected_inputs='ignore') for (p, x) in zip(params, t_new_params) ] Hx = [h + reg * p for (h, p) in zip(JMJx, t_new_params)] print 'finished in %f seconds' % (time() - s) # TODO: Use mask to handle different lengths. print 'Compiling Hx function ...', s = time() self._constraint_Hx = theano.function(inputs=t_inputs + t_new_params, outputs=Hx, on_unused_input='ignore') self.constraint_Hx = lambda inputs, params: self._constraint_Hx(*( inputs + params)) print 'finished in %f seconds' % (time() - s)
def mean_weighted_grad(weights, loss): # Lop to the rescue! Here I was calling T.jacobian and trying to # broadcast things and elementwise-multiply through the resulting lists, # when a function already existed to do all of that for me... return T.Lop(loss, params, weights / T.cast(weights.shape[0], 'float32'), disconnected_inputs='ignore')
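A small self-contained check of the trick described in the comment above (the toy model and all names below are assumptions for illustration, not the surrounding code's `params` or `loss`): for a vector of per-example losses, T.Lop(loss, params, weights / N) returns the weighted mean of the per-example gradients in a single backward pass, with no explicit Jacobian.

import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.array([1.0, 2.0]), name='w')
x = T.dmatrix('x')              # (n_examples, 2)
weights = T.dvector('weights')  # one weight per example
loss = T.sqr(x.dot(w))          # per-example losses, shape (n_examples,)

weighted_grad, = T.Lop(loss, [w], weights / T.cast(weights.shape[0], 'float64'))
f = theano.function([x, weights], weighted_grad)

xv = np.random.randn(5, 2)
sv = np.random.rand(5)
# reference: explicit weighted mean of d(loss_i)/dw = 2 * (x_i . w) * x_i
per_example = 2.0 * xv.dot([1.0, 2.0])[:, None] * xv
assert np.allclose(f(xv, sv), (sv[:, None] * per_example).mean(axis=0))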
def check_rop_lop(self, y, out_shape): """ As check_mat_rop_lop, except the input is self.x which is a vector. The output is still a vector. """ # TEST ROP vx = np.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX) vv = np.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX) yv = tensor.Rop(y, self.x, self.v) rop_f = function([self.x, self.v], yv, on_unused_input="ignore") J, _ = theano.scan( lambda i, y, x: tensor.grad(y[i], x), sequences=tensor.arange(y.shape[0]), non_sequences=[y, self.x], ) sy = tensor.dot(J, self.v) scan_f = function([self.x, self.v], sy, on_unused_input="ignore") v1 = rop_f(vx, vv) v2 = scan_f(vx, vv) assert np.allclose(v1, v2), "ROP mismatch: %s %s" % (v1, v2) known_fail = False try: tensor.Rop(theano.clone(y, replace={self.x: break_op(self.x)}), self.x, self.v) except ValueError: known_fail = True # TEST LOP vx = np.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX) vv = np.asarray(self.rng.uniform(size=out_shape), theano.config.floatX) yv = tensor.Lop(y, self.x, self.v) lop_f = function([self.x, self.v], yv, on_unused_input="ignore") J, _ = theano.scan( lambda i, y, x: tensor.grad(y[i], x), sequences=tensor.arange(y.shape[0]), non_sequences=[y, self.x], ) sy = tensor.dot(self.v, J) scan_f = function([self.x, self.v], sy) v1 = lop_f(vx, vv) v2 = scan_f(vx, vv) assert np.allclose(v1, v2), "LOP mismatch: %s %s" % (v1, v2) if known_fail: pytest.skip("Rop does not handle non-differentiable inputs " "correctly. Bug exposed by fixing Add.grad method.")
def setup(self, bottom, top): import theano.tensor as T import theano x = T.dvector('x') v = T.dvector('v') y = x * 2 yg = T.Lop(y, x, v) self.f = theano.function([x], y) self.b = theano.function([x, v], yg, on_unused_input='warn')
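For reference, a minimal standalone sketch (independent of the layer above) of what T.Lop returns: the product v^T J of the output-side vector v with the Jacobian of y with respect to x, which for y = x * 2 is simply 2 * v, and which coincides with T.grad of the scalar (v * y).sum().

import numpy as np
import theano
import theano.tensor as T

x = T.dvector('x')
v = T.dvector('v')
y = x * 2

lop = T.Lop(y, x, v)            # reverse-mode product v^T J
ref = T.grad((v * y).sum(), x)  # same quantity via an ordinary gradient

f = theano.function([x, v], [lop, ref])
a, b = f(np.arange(3.0), np.ones(3))
assert np.allclose(a, b) and np.allclose(a, 2.0 * np.ones(3))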
def test_rop_lop():
    mx = tensor.matrix('mx')
    mv = tensor.matrix('mv')
    v = tensor.vector('v')
    y = matrix_inverse(mx).sum(axis=0)

    yv = tensor.Rop(y, mx, mv)
    yv2 = tensor.Rop_via_Lop(y, mx, mv)
    rop_f = function([mx, mv], [yv, yv2])

    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)

    rng = np.random.RandomState(utt.fetch_seed())
    vx = np.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = np.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = scan_f(vx, vv)
    v2, v3 = rop_f(vx, vv)

    assert _allclose(v2, v1), ('Rop mismatch: %s %s' % (v2, v1))
    assert _allclose(v3, v1), ('Rop_via_Lop mismatch: %s %s' % (v3, v1))

    raised = False
    try:
        tensor.Rop(theano.clone(y, replace={mx: break_op(mx)}), mx, mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception(('Op did not raise an error even though the function'
                         ' is not differentiable'))

    raised = False
    try:
        tensor.Rop_via_Lop(theano.clone(y, replace={mx: break_op(mx)}), mx, mv)
    except theano.gradient.NullTypeGradError:
        raised = True
    except theano.gradient.DisconnectedInputError:
        raised = True

    if not raised:
        raise Exception((
            'Rop_via_Lop for Op did not raise an error even though the function'
            ' is not differentiable'))

    vv = np.asarray(rng.uniform(size=(4, )), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def check_rop_lop(self, y, out_shape): """ As check_mat_rop_lop, except the input is self.x which is a vector. The output is still a vector. """ # TEST ROP vx = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX) vv = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX) yv = tensor.Rop(y, self.x, self.v) rop_f = function([self.x, self.v], yv, on_unused_input='ignore') J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x), sequences=tensor.arange(y.shape[0]), non_sequences=[y, self.x]) sy = tensor.dot(J, self.v) scan_f = function([self.x, self.v], sy, on_unused_input='ignore') v1 = rop_f(vx, vv) v2 = scan_f(vx, vv) assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2)) known_fail = False try: self.check_nondiff_rop( theano.clone(y, replace={self.x: break_op(self.x)})) except AssertionError: known_fail = True # TEST LOP vx = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX) vv = numpy.asarray(self.rng.uniform(size=out_shape), theano.config.floatX) yv = tensor.Lop(y, self.x, self.v) lop_f = function([self.x, self.v], yv, on_unused_input='ignore') J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x), sequences=tensor.arange(y.shape[0]), non_sequences=[y, self.x]) sy = tensor.dot(self.v, J) scan_f = function([self.x, self.v], sy) v1 = lop_f(vx, vv) v2 = scan_f(vx, vv) assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2)) if known_fail: raise KnownFailureTest( "Rop doesn't handle non-differentiable " "inputs correctly. Bug exposed by fixing Add.grad" " method.")
def __init__(self, p, inputs, s, costs): # useful data for reshaping self.shapes = [i.get_value().shape for i in p] self.sizes = map(np.prod, self.shapes) self.positions = np.cumsum([0] + self.sizes)[:-1] self.p = p self.inputs = inputs self.s = s self.costs = costs g = T.grad(costs[0], p) g = map(T.as_tensor_variable, g) # for CudaNdarray self.f_gc = theano.function(inputs, g + costs) # gradient computation self.f_cost = theano.function(inputs, costs) # quick cost evaluation symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4 coefficient = T.scalar() # this is lambda*mu # this computes the product Gv = J'HJv (G is the Gauss-Newton matrix) v = [symbolic_types[len(i)]() for i in self.shapes] Jv = T.Rop(s, p, v) HJv = T.grad(T.sum(T.grad(costs[0], s) * Jv), s, consider_constant=[Jv]) Gv = T.grad(T.sum(HJv * s), p, consider_constant=[HJv, Jv]) Gv = map(T.as_tensor_variable, Gv) # for CudaNdarray self.function_Gv = theano.function(inputs + v + [coefficient], Gv, givens={}, on_unused_input='ignore') # compute J'sqrt(diag(H))v for jacobi preconditioner r = T.matrix() sqrt_Hv = T.sqrt(T.grad(T.sum(T.grad(costs[0], s)), s)) * r J_sqrt_Hv = T.Lop(s, p, sqrt_Hv) J_sqrt_Hv = map(T.as_tensor_variable, J_sqrt_Hv) # for CudaNdarray self.function_J_sqrt_Hv = theano.function(inputs + [r], J_sqrt_Hv, givens={}, on_unused_input='ignore') # compute Hv dp = T.grad(costs[0], p) total = 0 for dp_, v_ in zip(dp, v): total += T.sum(dp_ * v_) Hv = T.grad(total, p) Hv = map(T.as_tensor_variable, Hv) # for CudaNdarray self.function_Hv = theano.function(inputs + v + [coefficient], Hv, on_unused_input='ignore')
def check_mat_rop_lop(self, y, out_shape):
    """
    Test the Rop/Lop when the input is a matrix and the output is a vector.

    :param y: the output variable of the op applied to self.mx
    :param out_shape: Used to generate a random tensor corresponding to the
        evaluation point of the Lop (i.e. the tensor with which you multiply
        the Jacobian). It should be a tuple of ints.

    If the Op has more than 1 input, one of them must be mx, while others
    must be shared variables / constants. We will test only against the
    input self.mx, so you must call check_mat_rop_lop/check_rop_lop for the
    other inputs.

    We expect all inputs/outputs to have dtype floatX.

    If you want to test an Op with an output matrix, add a sum after the Op
    you want to test.
    """
    vx = np.asarray(self.rng.uniform(size=self.mat_in_shape),
                    theano.config.floatX)
    vv = np.asarray(self.rng.uniform(size=self.mat_in_shape),
                    theano.config.floatX)
    yv = tensor.Rop(y, self.mx, self.mv)
    yv2 = tensor.Rop_via_Lop(y, self.mx, self.mv)
    rop_f = function([self.mx, self.mv], [yv, yv2], on_unused_input='ignore')
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, self.mx, self.mv])
    scan_f = function([self.mx, self.mv], sy, on_unused_input='ignore')

    v1, v2 = rop_f(vx, vv)
    v3 = scan_f(vx, vv)

    assert np.allclose(v1, v3), ('ROP mismatch: %s %s' % (v1, v3))
    assert np.allclose(v2, v3), ('ROP_VIA_LOP mismatch: %s %s' % (v2, v3))

    self.check_nondiff_rop(
        theano.clone(y, replace={self.mx: break_op(self.mx)}))

    vv = np.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)
    yv = tensor.Lop(y, self.mx, self.v)
    lop_f = function([self.mx, self.v], yv)

    sy = tensor.grad((self.v * y).sum(), self.mx)
    scan_f = function([self.mx, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert np.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def _get_updates_for(self, param, grad): D_tm1 = shared_like(param, 'D_ewma') v = self.rng.normal(param.shape) if self.hv_method == 'rop': Hv = TT.Rop(grad, param, v) if self.hv_method == 'lop': Hv = TT.Lop(grad, param, v) if self.hv_method == 'grad': Hv = TT.grad(TT.sum(grad * v), param) D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv den = TT.sqrt(D_t) + self.epsilon yield D_tm1, D_t yield param, param - grad * self.learning_rate / den
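The three hv_method branches above are interchangeable ways of forming a Hessian-vector product. A minimal standalone check (toy cost and names, not part of this class) that they agree:

import numpy as np
import theano
import theano.tensor as T

w = T.dvector('w')
v = T.dvector('v')
cost = 0.5 * T.sum(w ** 2) + T.sum(w ** 3)  # any twice-differentiable scalar cost
g = T.grad(cost, w)

hv_rop = T.Rop(g, w, v)            # forward-mode through the gradient graph
hv_lop = T.Lop(g, w, v)            # reverse-mode (the Hessian is symmetric)
hv_grad = T.grad(T.sum(g * v), w)  # "double backprop" formulation

f = theano.function([w, v], [hv_rop, hv_lop, hv_grad])
a, b, c = f(np.random.randn(4), np.random.randn(4))
assert np.allclose(a, b) and np.allclose(a, c)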
def setup(self, bottom, top):
    weights = T.matrix("weights")
    weights_bc = weights.dimshuffle((0, 1, "x", "x"))
    feats = T.tensor4("feats")
    v = T.tensor3("v")
    dot = weights_bc * feats
    result = T.sum(dot, axis=1)
    g_w, g_f = T.Lop(result, [weights, feats], v)

    self.f = theano.function([weights, feats], result)
    self.b_w = theano.function([weights, feats, v], g_w)
    self.b_f = theano.function([weights, feats, v], g_f)
def setup(self, bottom, top): small_size = bottom[0].shape[1] small = T.matrix("small") big = T.tensor4("big") v = T.tensor4("v") small_bc = small.dimshuffle(0, 1, "x", "x") small_bc = T.addbroadcast(small_bc, 0) result = big + small_bc g_small, g_big = T.Lop(result, [small, big], v) self.f = theano.function([small, big], result) self.b_small = theano.function([v], g_small) self.b_big = theano.function([v], g_big)
def step(x_t, y_t, h_tmT, Wx, Wh, bh, Wy, by, lr, switch): # manually build the graph for the inner loop... # passing correct h_tm1 is impossible in nested scans yo_t = [] h_tm1 = h_tmT for t in range(self.steps): h_t = relu(T.dot(x_t[t], Wx) + T.dot(h_tm1, Wh) + bh) yo_t.append(relu(T.dot(h_t, Wy) + by)) h_tm1 = h_t updates = OrderedDict() # Train the RNN: backprop (loss + DNI output) loss = T.mean(T.square(yo_t - y_t)) dni_out = self.dni.output(h_t) for param in self.params: dlossdparam = T.grad(loss, param) dniJ = T.Lop(h_t, param, dni_out, disconnected_inputs='ignore') updates[param] = param - lr * T.switch( T.gt(switch, 0), dlossdparam + dniJ, dlossdparam) # Update the DNI (from the last step) # re-calculate the DNI prediction from the last step # note: can't be passed through scan or T.grad won't work dni_out_old = self.dni.output(h_tmT) # dni_target: current loss backprop'ed + new dni backprop'ed dni_target = T.grad(loss,h_tmT) \ +T.Lop(h_t,h_tmT,dni_out) dni_error = T.sum(T.square(dni_out_old - dni_target)) for param in self.dni.params: gparam = T.grad(dni_error, param) updates[param] = param - lr * gparam return [h_t, loss, dni_error], updates
def gauss_vect_mult(v): """ Multiply a vector by the Gauss-Newton matrix JHJ' where J is the Jacobian between output and params and H is the Hessian between costs and output H should be diagonal and positive. Also add the ridge """ Jv = T.Rop(output, params, v) HJv = T.Rop(T.grad(opt_cost, output), output, Jv) JHJv = T.Lop(output, params, HJv) if not isinstance(JHJv, list): JHJv = [JHJv] JHJv = [a + ridge * b for a, b in zip(JHJv, v)] return JHJv
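A compile-and-run sketch of the same Rop/Lop pattern on a deliberately tiny model (the linear model, quadratic cost, and all names below are assumptions for illustration, not the original output/params/opt_cost). For output = x.dot(w) and a sum-of-squares cost, the Gauss-Newton product reduces to 2 * x^T x v, which the assertion checks.

import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.random.randn(3, 2), name='w')
x = T.dmatrix('x')
t = T.dmatrix('t')
output = T.dot(x, w)             # toy "network" output
cost = T.sqr(output - t).sum()   # quadratic cost, so H = 2 * I

v = T.dmatrix('v')                             # same shape as w
Jv = T.Rop(output, w, v)                       # forward-mode: J v
HJv = T.Rop(T.grad(cost, output), output, Jv)  # H (J v)
Gv = T.Lop(output, w, HJv)                     # reverse-mode: J' H J v

f = theano.function([x, t, v], Gv)
xv, tv, vv = np.random.randn(4, 3), np.random.randn(4, 2), np.random.randn(3, 2)
assert np.allclose(f(xv, tv, vv), 2.0 * xv.T.dot(xv).dot(vv))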
def setup(self, bottom, top):
    attention = T.tensor4("attention")
    input = T.tensor4("input")
    v = T.matrix("v")
    attention_bc = T.addbroadcast(attention, 1)
    attended = T.mul(input, attention_bc)
    result = T.sum(attended, axis=(2, 3))
    result_g_attention, result_g_input = T.Lop(result, [attention, input], v)

    self.f = theano.function([attention, input], result)
    self.b_attention = theano.function([attention, input, v],
                                       result_g_attention)
    self.b_input = theano.function([attention, input, v], result_g_input)
def parse_args(self, bottom, top): function_str = self.pythonargs[0] top_shape = self.pythonargs[1] old_function_str = self.function_str old_top_shape = self.top_shape self.function_str = function_str self.top_shape = top_shape if function_str != old_function_str or len(top_shape) != len( old_top_shape): if old_function_str != '': print( 'TheanoGPU function string different from cache: recompiling' ) import theano.tensor as T import theano from theano.sandbox.cuda.basic_ops import gpu_from_host x = [] for i in range(len(bottom)): if len(bottom[i].shape) == 1: x.append(T.vector('x%d' % i)) if len(bottom[i].shape) == 2: x.append(T.matrix('x%d' % i)) if len(bottom[i].shape) == 3: x.append(T.tensor3('x%d' % i)) if len(bottom[i].shape) == 4: x.append(T.tensor4('x%d' % i)) y = eval(function_str) self.f = theano.function(x, gpu_from_host(y), on_unused_input='ignore') if len(self.top_shape) == 1: v = T.vector('v') elif len(self.top_shape) == 2: v = T.matrix('v') elif len(self.top_shape) == 3: v = T.tensor3('v') elif len(self.top_shape) == 4: v = T.tensor4('v') self.b = [] for i in range(len(bottom)): yg = T.Lop(y, x[i], v) self.b.append( theano.function(x + [v], gpu_from_host(yg), on_unused_input='ignore'))
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a vector.
    The output is still a vector.
    """
    # TEST ROP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(J, self.v)

    scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    self.check_nondiff_rop(
        theano.clone(y, replace={self.x: break_op(self.x)}))

    # TEST LOP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(self.v, J)

    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def compute_Lx(energies, params, deltas): # expectations and derivatives are commutative. cenergies = energies - T.mean(energies) Minv = T.cast(1. / energies.shape[0], floatX) rhs_terms = [] for param_j, delta_j in zip(params, deltas): rhs_term = T.Rop(cenergies, param_j, delta_j) rhs_terms += [rhs_term] Lx_terms = [] for param_i in params: Lx_term = 0 for rhs in rhs_terms: Lx_term += Minv * T.Lop(cenergies, param_i, rhs) Lx_terms += [Lx_term] return Lx_terms
def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, cgv)) Gvs = [ ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs) ] return [gv_args[0] + const(1)] + Gvs