def compute_output(self, network, h_vw, x_vw): batch_axis = network.find_hyperparameter(["batch_axis"]) if batch_axis is None: # NOTE: this code path is not tested! jacobian = T.jacobian(h_vw.variable.ravel(), x_vw.variable) res = (jacobian ** 2).mean() res_shape = () else: batch_size = h_vw.symbolic_shape()[batch_axis] # sum across batch to avoid disconnected input error # ravel to be a vector h_var = h_vw.variable.sum(axis=batch_axis).ravel() x_var = x_vw.variable # shape of result = h_var.shape + x_var.shape jacobian = T.jacobian(h_var, x_var) # put batch axis as first dimension # adding 1 to batch axis, because len(h_var.shape) == 1 swapped_jacobian = jacobian.swapaxes(0, batch_axis + 1) # convert to a matrix and mean over elements in a batch reshaped_jacobian = swapped_jacobian.reshape((batch_size, -1)) res = (reshaped_jacobian ** 2).mean(axis=1) res_shape = (h_vw.shape[batch_axis],) network.create_variable( "default", variable=res, shape=res_shape, tags={"output"}, )
def test_dot_not_output(self): """ Test the case where the vector input to the dot is not already an output of the inner function. """ v = T.vector() m = T.matrix() output = T.dot(v, m) # Compile the function twice, once with the optimization and once # without opt_mode = mode.including("scan") f_opt = theano.function([v, m], T.jacobian(output, v), mode=opt_mode) no_opt_mode = mode.excluding("scanOp_pushout_output") f_no_opt = theano.function([v, m], T.jacobian(output, v), mode=no_opt_mode) # Ensure that the optimization was performed correctly in f_opt # The inner function of scan should have only one output and it should # not be the result of a Dot scan_node = [node for node in f_opt.maker.fgraph.toposort() if isinstance(node.op, Scan)][0] assert len(scan_node.op.outputs) == 1 assert not isinstance(scan_node.op.outputs[0], T.Dot) # Ensure that the function compiled with the optimization produces # the same results as the function compiled without v_value = numpy.random.random((4)).astype(config.floatX) m_value = numpy.random.random((4, 5)).astype(config.floatX) output_opt = f_opt(v_value, m_value) output_no_opt = f_no_opt(v_value, m_value) utt.assert_allclose(output_opt, output_no_opt)
def _gen_deriv_functions(self): ''' _gen_deriv_functions To be called by the derived class to compile all the required functions. ''' ################################## # Define some Theano derivatives # Derivative w.r.t. the hyperparameters self.th_dhyp, uhyp = theano.scan( lambda i, y, x: T.jacobian(y[i], x), sequences=T.arange(self.th_K.shape[0]), non_sequences=[self.th_K, self.th_hyp]) # Derivative w.r.t. the inputs self.th_dX, ux = theano.scan(lambda i, y, x: T.jacobian(y[i], x), sequences=T.arange(self.th_K.shape[0]), non_sequences=[self.th_K, self.th_X]) ################################## # Compilation # Kxx: self covariance matrix self.K = theano.function([self.th_X, self.th_hyp], self.th_K) # Kxy: cross covariance matrix self.Kc = theano.function([self.th_X, self.th_Xc, self.th_hyp], self.th_Kc) self.dK_dhyp = theano.function([self.th_X, self.th_hyp], self.th_dhyp, updates=uhyp) self.dK_dX = theano.function([self.th_X, self.th_hyp], self.th_dX, updates=ux)
def compute_output(self, network, h_vw, x_vw): batch_axis = network.find_hyperparameter(["batch_axis"]) if batch_axis is None: # NOTE: this code path is not tested! jacobian = T.jacobian(h_vw.variable.ravel(), x_vw.variable) res = (jacobian**2).mean() res_shape = () else: batch_size = h_vw.symbolic_shape()[batch_axis] # sum across batch to avoid disconnected input error # ravel to be a vector h_var = h_vw.variable.sum(axis=batch_axis).ravel() x_var = x_vw.variable # shape of result = h_var.shape + x_var.shape jacobian = T.jacobian(h_var, x_var) # put batch axis as first dimension # adding 1 to batch axis, because len(h_var.shape) == 1 swapped_jacobian = jacobian.swapaxes(0, batch_axis + 1) # convert to a matrix and mean over elements in a batch reshaped_jacobian = swapped_jacobian.reshape((batch_size, -1)) res = (reshaped_jacobian**2).mean(axis=1) res_shape = (h_vw.shape[batch_axis], ) network.create_vw( "default", variable=res, shape=res_shape, tags={"output"}, )
def test_dot_not_output(self): # Test the case where the vector input to the dot is not already an # output of the inner function. v = tt.vector() m = tt.matrix() output = tt.dot(v, m) # Compile the function twice, once with the optimization and once # without opt_mode = mode.including("scan") f_opt = theano.function([v, m], tt.jacobian(output, v), mode=opt_mode) no_opt_mode = mode.excluding("scanOp_pushout_output") f_no_opt = theano.function([v, m], tt.jacobian(output, v), mode=no_opt_mode) # Ensure that the optimization was performed correctly in f_opt # The inner function of scan should have only one output and it should # not be the result of a Dot scan_node = [ node for node in f_opt.maker.fgraph.toposort() if isinstance(node.op, Scan) ][0] assert len(scan_node.op.outputs) == 1 assert not isinstance(scan_node.op.outputs[0], tt.Dot) # Ensure that the function compiled with the optimization produces # the same results as the function compiled without v_value = np.random.random(4).astype(config.floatX) m_value = np.random.random((4, 5)).astype(config.floatX) output_opt = f_opt(v_value, m_value) output_no_opt = f_no_opt(v_value, m_value) utt.assert_allclose(output_opt, output_no_opt)
def augment_system(ode_func, t_n, t_m):
    '''Function to create the augmented system.

    Take a function which specifies a set of differential equations and
    return a compiled function which allows for computation of gradients
    of the differential equation's solution with respect to the parameters.

    Args:
        ode_func (function): Differential equation. Returns array-like.

    Returns:
        system (function): Augmented system of differential equations.
    '''
    # Shapes for the dydp matrix
    # TODO: Should this be int64 or another dtype?
    # t_n = tt.scalar('n', dtype='int64')
    # t_m = tt.scalar('m', dtype='int64')

    # Present state of the system
    t_y = tt.vector('y', dtype=theano.config.floatX)

    # Parameter(s). Should be a vector to allow for generalization to
    # multiparameter systems of ODEs.
    t_p = tt.vector('p', dtype=theano.config.floatX)

    # Time. Allows non-autonomous systems of ODEs to be analyzed.
    t_t = tt.scalar('t', dtype=theano.config.floatX)

    # Present state of the gradients:
    # will always be 0 unless the parameter is the initial condition.
    # Entry (i, j) is the partial of y[i] w.r.t. p[j].
    dydp_vec = tt.vector('dydp', dtype=theano.config.floatX)
    dydp = dydp_vec.reshape((t_n, t_m))

    # Stack the results of ode_func
    # TODO: Does this behave the same if the ODE is scalar?
    f_tensor = tt.stack(ode_func(t_y, t_t, t_p))

    # Now compute gradients
    J = tt.jacobian(f_tensor, t_y)
    Jdfdy = tt.dot(J, dydp)
    grad_f = tt.jacobian(f_tensor, t_p)

    # This is the time derivative of dydp
    ddt_dydp = (Jdfdy + grad_f).flatten()

    system = theano.function(
        inputs=[t_y, t_t, t_p, dydp_vec],
        outputs=[f_tensor, ddt_dydp],
        on_unused_input='ignore')

    return system
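# Hypothetical usage sketch for augment_system (not part of the original
# source): a one-state exponential-decay ODE dy/dt = -p[0] * y[0] with a
# single parameter, so n = 1 states and m = 1 parameters. The `decay`
# function and all values below are illustrative.
import numpy as np
import theano


def decay(y, t, p):
    return [-p[0] * y[0]]


system = augment_system(decay, 1, 1)
floatX = theano.config.floatX
y0 = np.array([1.0], dtype=floatX)
p = np.array([0.5], dtype=floatX)
dydp0 = np.zeros(1, dtype=floatX)
f_val, ddt_dydp = system(y0, np.asarray(0.0, dtype=floatX), p, dydp0)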
def _grad_single(self, ct, s, lnC2, GAMMI2): lnC = lnC2 GAMMI = GAMMI2 v = self.v#T.as_tensor(self.v)[:,ct:] v0 = T.as_tensor(v[v[:,0]==0, :]) v1 = T.as_tensor(v[v[:,0]==1, :]) cnp = v.shape[0] # Gradient of fE wrt the priors over final state [ofE, oxS], upd_fE_single = th.scan(fn=self._free_energy, sequences=v, non_sequences=[s,self.h,lnC,self.b]) ofE0 = ofE[v0].sum() ofE1 = ofE[v1].sum() dFE0dlnC = T.jacobian(ofE0, lnC) dFE1dlnC = T.jacobian(ofE1, lnC) dFEdlnC = T.jacobian(ofE, lnC) ofE_ = T.vector() ofE_.tag.test_value = ofE.tag.test_value # Gradient of Gamma with respect to its initial condition: GAMMA, upd_GAMMA = th.scan(fn=self._upd_gamma, outputs_info=[GAMMI], non_sequences=[ofE, self.lambd, self.alpha, self.beta, cnp], n_steps=4) dGdg = T.grad(GAMMA[-1], GAMMI) dGdfE = T.jacobian(GAMMA[-1], ofE) dGdlnC = dGdfE.dot(dFEdlnC) out1 = ofE0 out2 = ofE1 maxout = T.max([out1, out2]) exp_out1 = T.exp(GAMMA[-1]*(out1 - maxout)) exp_out2 = T.exp(GAMMA[-1]*(out2 - maxout)) norm_const = exp_out1 + exp_out2 # Derivative wrt the second output (gammi): Jac1_gammi = (-(out1-out2)*dGdg* T.exp(GAMMA[-1]*(out1+out2 - 2*maxout))/(norm_const**2)) Jac2_gammi = -Jac1_gammi # dfd1_tZ = Jac1_gammi*dCdf[1][0]+ Jac2_gammi*dCdf[1][1] # Derivative wrt first input (lnc) Jac1_lnC = (T.exp(GAMMA[-1]*(out1 + out2 - 2*maxout))/(norm_const**2)* (-dGdlnC*(out1 - out2) - GAMMA[-1]*(dFE0dlnC - dFE1dlnC))) Jac2_lnC = -Jac1_lnC Jac1 = T.concatenate([T.stack(Jac1_gammi), Jac1_lnC]) Jac2 = T.concatenate([T.stack(Jac2_gammi), Jac2_lnC]) self.debug = [Jac1_lnC, Jac2_lnC, Jac2_gammi, Jac1_gammi, dFE0dlnC, dFE1dlnC, dGdg, out1, out2, v0, v1, v, ct] return Jac1, Jac2
def compute_jacobian(errors, parameters):
    """
    Compute the Jacobian.

    Parameters
    ----------
    errors : Theano variable
        Computed MSE for each sample separately.
    parameters : list of Theano variable
        Neural network parameters (e.g. weights, biases).

    Returns
    -------
    Theano variable
    """
    n_samples = errors.shape[0]
    J = T.jacobian(errors, wrt=parameters)

    jacobians = []
    for jacobian, parameter in zip(J, parameters):
        jacobian = jacobian.reshape((n_samples, parameter.size))
        jacobians.append(jacobian)

    return T.concatenate(jacobians, axis=1)
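# Hypothetical usage sketch for compute_jacobian (the linear model, `x`, `y`,
# `W`, `b` are illustrative, not from the original source).
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')                              # (n_samples, n_features)
y = T.vector('y')                              # (n_samples,)
W = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(1, dtype=theano.config.floatX), name='b')
errors = (T.dot(x, W) + b[0] - y) ** 2         # per-sample squared error
J = compute_jacobian(errors, [W, b])           # shape (n_samples, W.size + b.size)
jacobian_fn = theano.function([x, y], J)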
def hessian(objective, argument):
    """
    Compute the directional derivative of the gradient
    (which is equal to the hessian multiplied by direction).
    """
    g = T.grad(objective, argument)

    # Create a new tensor A, which has the same type (i.e. same
    # dimensionality) as argument.
    A = argument.type()

    try:
        # First attempt efficient 'R-op', this directly calculates the
        # directional derivative of the gradient, rather than explicitly
        # calculating the hessian and then multiplying.
        R = T.Rop(g, argument, A)
    except NotImplementedError:
        shp = T.shape(argument)
        H = T.jacobian(g.flatten(), argument).reshape(
            T.concatenate([shp, shp]), 2 * A.ndim)
        R = T.tensordot(H, A, A.ndim)

    try:
        hess = theano.function([argument, A], R, on_unused_input='raise')
    except theano.compile.UnusedInputError:
        warn('Theano detected unused input - suggests hessian may be zero or '
             'constant.')
        hess = theano.function([argument, A], R, on_unused_input='ignore')

    return hess
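# Hypothetical usage sketch for hessian() (the quadratic objective and values
# below are illustrative; assumes Theano and NumPy are available).
import numpy as np
import theano
import theano.tensor as T

X = T.vector('X')
cost = T.sum(X ** 2) + T.sum(X) ** 2
hess_fn = hessian(cost, X)                  # compiled (x, a) -> Hessian(x) @ a
x0 = np.ones(3).astype(theano.config.floatX)
a0 = np.array([1.0, 0.0, 0.0]).astype(theano.config.floatX)
print(hess_fn(x0, a0))                      # [4., 2., 2.] for this objective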
def fixedPointIteration(*args): fpiIns = dict(zip(self.getFPIArgNames(), args)) gOpFreeIns = [fpiIns[k] for k in self.getGOpFreeArgNames()] gOpClampedIns = [fpiIns[k] for k in self.getGOpClampedArgNames()] gOpFreeRets = self.g(*gOpFreeIns) gOpClampedRets = self.g(*gOpClampedIns) gOpDiffs = [c-f for c,f in zip(gOpClampedRets, gOpFreeRets)] for i, (s, fi, fr, ci, cr) in enumerate(zip(self.getStateIter(), gOpFreeIns, gOpFreeRets, gOpClampedIns, gOpClampedRets)): if i>0: fpiIns["free_" +s.name] = self.rho(fi+fr) fpiIns["clamped_"+s.name] = self.rho(ci+cr) allStates = TT.concatenate([s.flatten() for s in gOpFreeRets]) allDiffs = TT.concatenate([d.flatten() for d in gOpDiffs]) for t, tName in [(fpiIns[t.name], t.name) for t in self.getThetaIter()]: J = TT.jacobian(allStates, t.flatten(), disconnected_inputs="ignore") dt = J.T.dot(allDiffs).reshape(t.shape) fpiIns[tName] += fpiIns["lr"]*dt fpiIns["i"] += 1 fpiRets = [fpiIns[k] for k in self.getFPIRetNames()] return fpiRets
def _get_updates(self): n = self.params['batch_size'] N = self.params['train_size'] prec_lik = self.params['prec_lik'] prec_prior = self.params['prec_prior'] gc_norm = self.params['gc_norm'] gamma = float(n + N) / n # compute log-likelihood error = self.model_outputs - self.true_outputs logliks = log_normal(error, prec_lik) sumloglik = logliks.sum() # compute gradient of likelihood wrt each data point grads = tensor.jacobian(expression=logliks, wrt=self.weights) grads = tensor.concatenate([g.flatten(ndim=2) for g in grads], axis=1) avg_grads = grads.mean(axis=0) dist_grads = grads - avg_grads # compute variance of gradient var_grads = (1. / (n - 1)) * tensor.dot(dist_grads.T, dist_grads) logprior = log_prior_normal(self.weights, prec_prior) grads_prior = tensor.grad(cost=logprior, wrt=self.weights) grads_prior = tensor.concatenate([g.flatten() for g in grads_prior]) # update Fisher information I_t_next = (1 - 1 / self.it) * self.I_t + 1 / self.it * var_grads # compute noise if 'B' in self.params: B = self.params['B'] else: B = gamma * I_t_next * N # B += np.eye(self.n_weights) * (10 ** -9) B_ch = slinalg.cholesky(B) noise = tensor.dot(((2. / tensor.sqrt(self.lr)) * B_ch), trng.normal((self.n_weights, 1))) # expensive inversion inv_cond_mat = gamma * N * I_t_next + (4. / self.lr) * B cond_mat = nlinalg.matrix_inverse(inv_cond_mat) updates = [] updates.append((self.I_t, I_t_next)) updates.append((self.it, self.it + 1)) # update the parameters updated_params = 2 * tensor.dot( cond_mat, grads_prior + N * avg_grads + noise.flatten()) updated_params = updated_params.flatten() last_row = 0 for p in self.weights: sub_index = np.prod(p.get_value().shape) up = updated_params[last_row:last_row + sub_index] up = up.reshape(p.shape) updates.append((p, up)) last_row += sub_index return updates, sumloglik
def logdet_dinv_num(self, y):
    # return debug(tt.log(sT.det(debug(tt.jacobian(self.inv(y), y), 'jacobian_inv'))), 'automatic_logdet_dinv')
    return tt.sum(
        debug(
            tt.log(
                tt.diag(debug(tt.jacobian(self.inv(y), y), 'jacobian_inv'))),
            'automatic_logdet_dinv'))
def grad_wrt_input(self, inputf):
    fx = theano.function(
        [self.model.layers[0].input],
        T.jacobian(self.model.layers[-1].output.flatten(),
                   self.model.layers[0].input),
        allow_input_downcast=True)
    grad = fx(inputf)
    return grad
def dM2_f_i(mx, beta, hyp, X):
    hyps = (hyp[:idims + 1], hyp[idims + 1])
    kernel_func = partial(cov.Sum, hyps, self.covs)
    k = kernel_func(mx[None, :], X).flatten()
    mean = k.dot(beta)
    dmean = tt.jacobian(mean.flatten(), mx)
    return tt.square(dmean.flatten())
def hessian(objective, argument): """ Compute the directional derivative of the gradient (which is equal to the hessian multiplied by direction). """ g = T.grad(objective, argument) # Create a new tensor A, which has the same type (i.e. same dimensionality) # as argument. A = argument.type() try: # First attempt efficient 'R-op', this directly calculates the # directional derivative of the gradient, rather than explicitly # calculating the hessian and then multiplying. R = T.Rop(g, argument, A) except NotImplementedError: shp = T.shape(argument) H = T.jacobian(g.flatten(), argument).reshape( T.concatenate([shp, shp]), 2*A.ndim) R = T.tensordot(H, A, A.ndim) try: hess = theano.function([argument, A], R, on_unused_input='raise') except theano.compile.UnusedInputError: warn('Theano detected unused input - suggests hessian may be zero or ' 'constant.') hess = theano.function([argument, A], R, on_unused_input='ignore') return hess
def auto4check2(input, dataset): a = theano.shared(value=dataset[0], name="a") b = theano.shared(value=dataset[1], name="b") c = theano.shared(value=dataset[2], name="c") x = T.vector('x') u = x[0] - 0.8 v = x[1] - (a[0] + a[1] * u ** 2 * (1 - u) ** 0.5 - a[2] * u) alpha = -b[0] + b[1] * u ** 2 * (1 + u) ** 0.5 + b[2] * u beta = c[0] * v ** 2 * (1 - c[1] * v) / (1 + c[2] * u ** 2) fx = alpha * np.e ** (-beta) g_f_x = T.jacobian(fx, x) grad = theano.function([x], g_f_x) Hessian = theano.function([x], T.hessian(fx, x)) H_alpha_x = theano.function([x], T.hessian(alpha, x)) H_beta_x = theano.function([x], T.hessian(beta, x)) J_f_alpha = theano.function([x], T.grad(fx, alpha)) J_f_beta = theano.function([x], T.grad(fx, beta)) J_alpha_x = theano.function([x], T.grad(alpha, x)) J_beta_x = theano.function([x], T.grad(beta, x)) J_f_y = [J_f_alpha(input), J_f_beta(input)] J_y_x = [J_alpha_x(input), J_beta_x(input)] # print "H_alpha_x" # print H_alpha_x(input) # print "H_beta_x" # print H_beta_x(input) # print "J_f_y" # print J_f_y # print "J_y_x" # print J_y_x # print grad(input) return Hessian(input)
def compile_tan_force(self, u_np, s_np, *args, **kargs):
    grid = u_np.grid
    grid_math = grid._math
    grid._math = T

    tensor_dim = u_np.ndim + 2
    input_data = T.TensorType('float64', (False,) * tensor_dim)()

    tensor_dim = s_np.ndim
    param = T.TensorType('float64', (False,) * tensor_dim)()
    # param = T.dvector('s')

    u_theano = grid.array(input_data.copy(), u_np.shape)
    s_theano = np.array(param.copy(), s_np.shape)
    ret = self._function(u_theano, s_theano, *args, **kargs)

    out_tan = T.jacobian(ret._data, param)

    if _VERBOSE_:
        print('tangent derived in theano mode, compiling')
    f = theano.function([input_data, param], [out_tan])
    if _VERBOSE_:
        print('tangent successfully compiled')

    grid._math = grid_math
    return f
def hypernet_elbo(X, y, loglik_primary_f, logprior_f, hypernet_f, z_noise, N, log_det_dtheta_dz_f=None): assert(X.ndim == 2 and y.ndim == 2) assert(z_noise.ndim == 1) B = X.shape[0] rescale = float(N) / B # Ensure not integer division theta = hypernet_f(z_noise) loglik = loglik_primary_f(X, y, theta) assert(loglik.ndim == 1) loglik_total = T.sum(loglik) assert(loglik_total.ndim == 0) logprior_theta = logprior_f(theta) assert(logprior_theta.ndim == 0) if log_det_dtheta_dz_f is None: # This is slower, but good for testing assert(theta.ndim == 1) # Use vector theta for this mode J = T.jacobian(theta, z_noise) penalty = log_abs_det_T(J) else: penalty = log_det_dtheta_dz_f(z_noise) assert(penalty.ndim == 0) logprior_z = 0.5 * T.dot(z_noise, z_noise) assert(logprior_z.ndim == 0) elbo = rescale * loglik_total + logprior_theta + penalty + logprior_z return elbo
def test_flow_det(flow_spec):
    z0 = tt.arange(0, 20).astype('float32')
    flow = flow_spec(dim=20, z0=z0.dimshuffle('x', 0))
    with change_flags(compute_test_value='off'):
        z1 = flow.forward.flatten()
        J = tt.jacobian(z1, z0)
        logJdet = tt.log(tt.abs_(tt.nlinalg.det(J)))
        det = flow.logdet[0]
    np.testing.assert_allclose(logJdet.eval(), det.eval(), atol=0.0001)
def test002_jacobian_matrix(): x = tensor.matrix() y = 2 * x.sum(axis=0) rng = numpy.random.RandomState(seed=utt.fetch_seed()) ev = numpy.zeros((10, 10, 10)) for dx in xrange(10): ev[dx, :, dx] = 2. # test when the jacobian is called with a tensor as wrt Jx = tensor.jacobian(y, x) f = theano.function([x], Jx) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) assert numpy.allclose(f(vx), ev) # test when the jacobian is called with a tuple as wrt Jx = tensor.jacobian(y, (x,)) assert isinstance(Jx, tuple) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) assert numpy.allclose(f(vx), ev) # test when the jacobian is called with a list as wrt Jx = tensor.jacobian(y, [x]) assert isinstance(Jx, list) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) assert numpy.allclose(f(vx), ev) # test when the jacobian is called with a list of two elements z = tensor.matrix() y = (x * z).sum(axis=1) Js = tensor.jacobian(y, [x, z]) f = theano.function([x, z], Js) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) vz = rng.uniform(size=(10, 10)).astype(theano.config.floatX) vJs = f(vx, vz) evx = numpy.zeros((10, 10, 10)) evz = numpy.zeros((10, 10, 10)) for dx in xrange(10): evx[dx, dx, :] = vx[dx, :] evz[dx, dx, :] = vz[dx, :] assert numpy.allclose(vJs[0], evz) assert numpy.allclose(vJs[1], evx)
def get_gradients(self, model, data, ** kwargs): space, sources = self.get_data_specs(model) space.validate(data) X, Y = data theano_rng = RandomStreams(seed = model.rng.randint(2 ** 15)) noise = theano_rng.random_integers(size = (X.shape[0] * model.k,), low=0, high = model.dict_size - 1) delta = model.delta(data) p = model.score(X, Y) params = model.get_params() pos_ = T.jacobian(model.score(X, Y), params, disconnected_inputs='ignore') pos_coeff = 1 - T.nnet.sigmoid(model.delta(data)) pos = [] for param in pos_: axes = [0] axes.extend(['x' for item in range(param.ndim - 1)]) pos.append(pos_coeff.dimshuffle(axes) * param) del pos_, pos_coeff noise_x = T.tile(X, (model.k, 1)) neg_ = T.jacobian(model.score(noise_x, noise), params, disconnected_inputs='ignore') neg_coeff = T.nnet.sigmoid(model.delta((noise_x, noise))) neg = [] for param in neg_: axes = [0] axes.extend(['x' for item in range(param.ndim - 1)]) tmp = neg_coeff.dimshuffle(axes) * param new_shape = [X.shape[0], model.k] new_shape.extend([tmp.shape[i] for i in range(1, tmp.ndim)]) neg.append(tmp.reshape(new_shape).sum(axis=1)) del neg_, neg_coeff grads = [(pos_ - neg_).mean(axis=0) for pos_, neg_ in zip(pos, neg)] gradients = OrderedDict(izip(params, grads)) updates = OrderedDict() return gradients, updates
def get_stat(f, thetahat):
    fhat = theano.function([theta], f)(thetahat)
    dfhat = theano.function([theta], T.jacobian(f, [theta])[0])(thetahat)
    fhatcov = np.dot(np.dot(dfhat, covhat), dfhat.transpose())
    try:
        fse = np.sqrt(np.diag(fhatcov))
    except ValueError:
        # np.diag raises ValueError when fhatcov is a scalar (0-d) covariance
        fse = np.sqrt(fhatcov)
    ftstat = fhat / fse
    return fhat, fse, ftstat
def test_jacobian_matrix(): x = tensor.matrix() y = 2 * x.sum(axis=0) rng = np.random.RandomState(seed=utt.fetch_seed()) ev = np.zeros((10, 10, 10)) for dx in range(10): ev[dx, :, dx] = 2.0 # test when the jacobian is called with a tensor as wrt Jx = tensor.jacobian(y, x) f = theano.function([x], Jx) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) assert np.allclose(f(vx), ev) # test when the jacobian is called with a tuple as wrt Jx = tensor.jacobian(y, (x,)) assert isinstance(Jx, tuple) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) assert np.allclose(f(vx), ev) # test when the jacobian is called with a list as wrt Jx = tensor.jacobian(y, [x]) assert isinstance(Jx, list) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) assert np.allclose(f(vx), ev) # test when the jacobian is called with a list of two elements z = tensor.matrix() y = (x * z).sum(axis=1) Js = tensor.jacobian(y, [x, z]) f = theano.function([x, z], Js) vx = rng.uniform(size=(10, 10)).astype(theano.config.floatX) vz = rng.uniform(size=(10, 10)).astype(theano.config.floatX) vJs = f(vx, vz) evx = np.zeros((10, 10, 10)) evz = np.zeros((10, 10, 10)) for dx in range(10): evx[dx, dx, :] = vx[dx, :] evz[dx, dx, :] = vz[dx, :] assert np.allclose(vJs[0], evz) assert np.allclose(vJs[1], evx)
def test_vectors(self): try: import theano.tensor as T from theano import function except: return for MT in [False, True]: # Set up variables and function vals = [np.random.randn(20) for i in range(5)] f = lambda a, b, c, d, e: a + (b * c) - d**e # Set up our objects Cs = [ch.Ch(v) for v in vals] C_result = f(*Cs) C_result.MT = MT # Set up Theano equivalents Ts = T.dvectors('T1', 'T2', 'T3', 'T4', 'T5') TF = f(*Ts) T_result = function(Ts, TF) if False: import theano.gradient which = 1 theano_sse = (TF**2.).sum() theano_grad = theano.gradient.grad(theano_sse, Ts[which]) theano_fn = function(Ts, theano_grad) print(theano_fn(*vals)) C_result_grad = ch.SumOfSquares(C_result).dr_wrt(Cs[which]) print(C_result_grad) # if True: # aaa = np.linalg.solve(C_result_grad.T.dot(C_result_grad), C_result_grad.dot(np.zeros(C_result_grad.shape[1]))) # theano_hes = theano.R_obbb = theano.R_op() import pdb pdb.set_trace() # Make sure values and derivatives are equal np.testing.assert_array_equal(C_result.r, T_result(*vals)) for k in range(len(vals)): theano_derivative = function(Ts, T.jacobian(TF, Ts[k]))(*vals) our_derivative = np.array(C_result.dr_wrt(Cs[k]).todense()) #print(theano_derivative, our_derivative) # Theano produces has more nans than we do during exponentiation. # So we test only on entries where Theano is without NaN's without_nans = np.nonzero( np.logical_not(np.isnan(theano_derivative.flatten())))[0] np.testing.assert_array_equal( theano_derivative.flatten()[without_nans], our_derivative.flatten()[without_nans])
def estimate_fisher(outputs, n_outputs, parameters):
    # shape (sample_size, n_outputs, #parameters)
    grads = T.stack(*[util.batched_flatcat(T.jacobian(outputs[:, j], parameters))
                      for j in xrange(n_outputs)])
    # ravel the batch and output axes so that the product will sum
    # over the outputs *and* over the batch. divide by the batch
    # size to get the batch mean.
    grads = grads.reshape((grads.shape[0] * grads.shape[1], grads.shape[2]))
    fisher = T.dot(grads.T, grads) / grads.shape[0]
    return fisher
def Hessian(objective, *Vars, **kwargs):
    """Block-structured matrix of Jacobians of gradients; symmetric."""
    return T.concatenate([
        T.concatenate([
            T.jacobian(
                T.grad(objective, var1,
                       disconnected_inputs='ignore').reshape((-1,)),
                var2,
                disconnected_inputs='ignore').reshape((var1.size, var2.size))
            for var2 in Vars
        ], axis=1)
        for var1 in Vars
    ], axis=0)
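# Hypothetical usage sketch for the block Hessian helper above (the variables
# and objective are illustrative; assumes Theano and NumPy are available).
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
y = T.vector('y')
objective = T.sum(x ** 2) + T.sum(x * y)
H = Hessian(objective, x, y)           # (x.size + y.size) square symbolic matrix
hess_fn = theano.function([x, y], H)
vx = np.zeros(2, dtype=theano.config.floatX)
vy = np.zeros(2, dtype=theano.config.floatX)
print(hess_fn(vx, vy))                 # blocks: [[2*I, I], [I, 0]]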
def Hessian(objective, *Vars, **kwargs):
    return T.concatenate([
        T.concatenate([
            T.jacobian(
                T.grad(objective, var1,
                       disconnected_inputs='ignore').reshape(
                           (T.prod(var1.shape),)),
                var2,
                disconnected_inputs='ignore').reshape(
                    (T.prod(var1.shape), T.prod(var2.shape)))
            for var2 in Vars
        ], axis=1)
        for var1 in Vars
    ], axis=0)
def test_jacobian_vector(): x = tensor.vector() y = x * 2 rng = np.random.RandomState(seed=utt.fetch_seed()) # test when the jacobian is called with a tensor as wrt Jx = tensor.jacobian(y, x) f = theano.function([x], Jx) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) assert np.allclose(f(vx), np.eye(10) * 2) # test when the jacobian is called with a tuple as wrt Jx = tensor.jacobian(y, (x,)) assert isinstance(Jx, tuple) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) assert np.allclose(f(vx), np.eye(10) * 2) # test when the jacobian is called with a list as wrt Jx = tensor.jacobian(y, [x]) assert isinstance(Jx, list) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) assert np.allclose(f(vx), np.eye(10) * 2) # test when the jacobian is called with a list of two elements z = tensor.vector() y = x * z Js = tensor.jacobian(y, [x, z]) f = theano.function([x, z], Js) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) vz = rng.uniform(size=(10,)).astype(theano.config.floatX) vJs = f(vx, vz) evx = np.zeros((10, 10)) evz = np.zeros((10, 10)) np.fill_diagonal(evx, vx) np.fill_diagonal(evz, vz) assert np.allclose(vJs[0], evz) assert np.allclose(vJs[1], evx)
def test001_jacobian_vector(): x = tensor.vector() y = x * 2 rng = numpy.random.RandomState(seed=utt.fetch_seed()) # test when the jacobian is called with a tensor as wrt Jx = tensor.jacobian(y, x) f = theano.function([x], Jx) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) assert numpy.allclose(f(vx), numpy.eye(10) * 2) # test when the jacobian is called with a tuple as wrt Jx = tensor.jacobian(y, (x,)) assert isinstance(Jx, tuple) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) assert numpy.allclose(f(vx), numpy.eye(10) * 2) # test when the jacobian is called with a list as wrt Jx = tensor.jacobian(y, [x]) assert isinstance(Jx, list) f = theano.function([x], Jx[0]) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) assert numpy.allclose(f(vx), numpy.eye(10) * 2) # test when the jacobian is called with a list of two elements z = tensor.vector() y = x * z Js = tensor.jacobian(y, [x, z]) f = theano.function([x, z], Js) vx = rng.uniform(size=(10,)).astype(theano.config.floatX) vz = rng.uniform(size=(10,)).astype(theano.config.floatX) vJs = f(vx, vz) evx = numpy.zeros((10, 10)) evz = numpy.zeros((10, 10)) numpy.fill_diagonal(evx, vx) numpy.fill_diagonal(evz, vz) assert numpy.allclose(vJs[0], evz) assert numpy.allclose(vJs[1], evx)
def test_vectors(self): try: import theano.tensor as T from theano import function except: return for MT in [False, True]: # Set up variables and function vals = [np.random.randn(20) for i in range(5)] f = lambda a, b, c, d, e : a + (b * c) - d ** e # Set up our objects Cs = [ch.Ch(v) for v in vals] C_result = f(*Cs) C_result.MT = MT # Set up Theano equivalents Ts = T.dvectors('T1', 'T2', 'T3', 'T4', 'T5') TF = f(*Ts) T_result = function(Ts, TF) if False: import theano.gradient which = 1 theano_sse = (TF**2.).sum() theano_grad = theano.gradient.grad(theano_sse, Ts[which]) theano_fn = function(Ts, theano_grad) print theano_fn(*vals) C_result_grad = ch.SumOfSquares(C_result).dr_wrt(Cs[which]) print C_result_grad # if True: # aaa = np.linalg.solve(C_result_grad.T.dot(C_result_grad), C_result_grad.dot(np.zeros(C_result_grad.shape[1]))) # theano_hes = theano.R_obbb = theano.R_op() import pdb; pdb.set_trace() # Make sure values and derivatives are equal np.testing.assert_array_equal(C_result.r, T_result(*vals)) for k in range(len(vals)): theano_derivative = function(Ts, T.jacobian(TF, Ts[k]))(*vals) our_derivative = np.array(C_result.dr_wrt(Cs[k]).todense()) #print theano_derivative, our_derivative # Theano produces has more nans than we do during exponentiation. # So we test only on entries where Theano is without NaN's without_nans = np.nonzero(np.logical_not(np.isnan(theano_derivative.flatten())))[0] np.testing.assert_array_equal(theano_derivative.flatten()[without_nans], our_derivative.flatten()[without_nans])
def test_flow_det_local(flow_spec):
    z0 = tt.arange(0, 12).astype('float32')
    spec = flow_spec.cls.get_param_spec_for(d=12)
    params = dict()
    for k, shp in spec.items():
        params[k] = np.random.randn(1, *shp).astype('float32')
    flow = flow_spec(dim=12, z0=z0.reshape((1, 1, 12)), **params)
    assert flow.batched
    with change_flags(compute_test_value='off'):
        z1 = flow.forward.flatten()
        J = tt.jacobian(z1, z0)
        logJdet = tt.log(tt.abs_(tt.nlinalg.det(J)))
        det = flow.logdet[0]
    np.testing.assert_allclose(logJdet.eval(), det.eval(), atol=0.0001)
def get_fisher_mat():
    grad2d = []
    for p in self.model.params:
        grad2d += [T.jacobian(self.f_loss_samples, p)]
        if grad2d[-1].ndim == 2:
            grad2d[-1] = grad2d[-1].dimshuffle(0, 1, 'x')

    grad2d_vec = T.concatenate([g.flatten(2).T for g in grad2d]).T

    # tensor wise: F_p,i,j = sum_k grad2d[p,i,k]*grad2d[p,k,j]
    # just a slow reference implementation of what is below
    # F = T.mean(T.batched_dot(grad2d_vec.dimshuffle(0, 1, 'x'),
    #                          grad2d_vec.dimshuffle(0, 'x', 1)), 0) / self.over_sampling
    F = T.dot(grad2d_vec.T, grad2d_vec) / T.cast(
        grad2d_vec.shape[0], theano.config.floatX) / self.over_sampling
    return F
def jacobian_vector(expr, wrt):
    """Computes the Jacobian of a vector expression with respect to variables.

    Args:
        expr: Vector Theano tensor expression.
        wrt: List of Theano variables.

    Returns:
        Theano tensor.
    """
    try:
        return _tensor_map(lambda f: jacobian_scalar(f, wrt), expr)
    except ValueError:
        # Fallback for wider support.
        return T.stack([T.jacobian(expr, wrt, disconnected_inputs="ignore")])
def test003_jacobian_scalar(): x = tensor.scalar() y = x * 2 rng = numpy.random.RandomState(seed=utt.fetch_seed()) # test when the jacobian is called with a tensor as wrt Jx = tensor.jacobian(y, x) f = theano.function([x], Jx) vx = numpy.cast[theano.config.floatX](rng.uniform()) assert numpy.allclose(f(vx), 2) # test when the jacobian is called with a tuple as wrt Jx = tensor.jacobian(y, (x,)) assert isinstance(Jx, tuple) f = theano.function([x], Jx[0]) vx = numpy.cast[theano.config.floatX](rng.uniform()) assert numpy.allclose(f(vx), 2) # test when the jacobian is called with a list as wrt Jx = tensor.jacobian(y, [x]) assert isinstance(Jx, list) f = theano.function([x], Jx[0]) vx = numpy.cast[theano.config.floatX](rng.uniform()) assert numpy.allclose(f(vx), 2) # test when the jacobian is called with a list of two elements z = tensor.scalar() y = x * z Jx = tensor.jacobian(y, [x, z]) f = theano.function([x, z], Jx) vx = numpy.cast[theano.config.floatX](rng.uniform()) vz = numpy.cast[theano.config.floatX](rng.uniform()) vJx = f(vx, vz) assert numpy.allclose(vJx[0], vz) assert numpy.allclose(vJx[1], vx)
def test_jacobian_scalar(): x = tensor.scalar() y = x * 2 rng = np.random.RandomState(seed=utt.fetch_seed()) # test when the jacobian is called with a tensor as wrt Jx = tensor.jacobian(y, x) f = theano.function([x], Jx) vx = np.cast[theano.config.floatX](rng.uniform()) assert np.allclose(f(vx), 2) # test when the jacobian is called with a tuple as wrt Jx = tensor.jacobian(y, (x,)) assert isinstance(Jx, tuple) f = theano.function([x], Jx[0]) vx = np.cast[theano.config.floatX](rng.uniform()) assert np.allclose(f(vx), 2) # test when the jacobian is called with a list as wrt Jx = tensor.jacobian(y, [x]) assert isinstance(Jx, list) f = theano.function([x], Jx[0]) vx = np.cast[theano.config.floatX](rng.uniform()) assert np.allclose(f(vx), 2) # test when the jacobian is called with a list of two elements z = tensor.scalar() y = x * z Jx = tensor.jacobian(y, [x, z]) f = theano.function([x, z], Jx) vx = np.cast[theano.config.floatX](rng.uniform()) vz = np.cast[theano.config.floatX](rng.uniform()) vJx = f(vx, vz) assert np.allclose(vJx[0], vz) assert np.allclose(vJx[1], vx)
def initialize_calc_ll_gmm_hist_fun(self): meansvec = T.dvector('means') covarsvec = T.dvector('covars') weights = T.dvector('weights') gm_num = weights.shape[0] means = T.reshape(meansvec, (gm_num, meansvec.shape[0] / gm_num)) covars = T.reshape(covarsvec, (gm_num, meansvec.shape[0] / gm_num)) Yp = T.dmatrix('Yp') Yn = T.dmatrix('Yn') p_p,r_p,p_p_m = self.calc_ll_gmm(Yp, means, covars, weights) p_n,r_n,p_n_m = self.calc_ll_gmm(Yn, means, covars, weights) L, hmax, hmin, hn, hp = self.calc_hist_loss_vector(p_n, p_p) dL = T.jacobian(L, [meansvec, covarsvec, weights, Yp, Yn]) self.gmmhist_df = function([meansvec, covarsvec, weights, Yp, Yn], dL, allow_input_downcast=True) self.gmmhist_f = function([meansvec, covarsvec, weights, Yp, Yn], [L, hmax, hmin, hn, hp], allow_input_downcast=True)
def grad(self, inputs, dCdf): """ Gradient MTF """ MU = inputs[0][0] SD = inputs[0][1] # Y = self._normal(just_return = True, MU=MU, SD=SD) Y, Y_upd = th.scan(fn=self.norm_fun, sequences=self.counter, non_sequences=[MU, SD]) dYdMIn = T.jacobian(Y.sum(axis=0), inputs[0]) # dYdSD = T.jacobian(Y, SD) # return dYdMIn[0]*dCdf[0][0] + dYdMIn[1]*dCdf[0][1], # return T.as_tensor([dCdf[0][0]*dYdMIn[0][0] + dCdf[0][1]*dYdMIn[1][0], # dCdf[0][0]*dYdMIn[0][1] + dCdf[0][1]*dYdMIn[1][1]]), return T.as_tensor([dCdf[0].dot(dYdMIn[:,0]), dCdf[0].dot(dYdMIn[:,1])]),
def L_op(self, inputs, output, grads):
    # from IPython import embed; embed()
    if not hasattr(self, 'precomputed_grads'):
        grad_integrators = T.jacobian(self._expr, self._extra_vars)
        self.precomputed_grads = [
            IntegrateVectorizedGeneralized(gi, self._var, self.bins,
                                           *self._extra_vars)
            for gi in grad_integrators
        ]

    out, = grads
    dargs = []
    for integrate in self.precomputed_grads:
        darg = T.dot(out, integrate(*inputs))
        # print(darg)
        dargs.append(darg)
    return dargs
def _get_updates(self): n = self.params['batch_size'] N = self.params['train_size'] prec_lik = self.params['prec_lik'] prec_prior = self.params['prec_prior'] gc_norm = self.params['gc_norm'] alpha = self.params['alpha'] mu = self.params['mu'] use_gamma = self.params['use_gamma'] # compute log-likelihood error = self.model_outputs - self.true_outputs logliks = log_normal(error, prec_lik) sumloglik = logliks.sum() meanloglik = sumloglik / n # compute gradients grads = tensor.grad(cost=meanloglik, wrt=self.weights) # update preconditioning matrix V_t_next = [ alpha * v + (1 - alpha) * g * g for g, v in zip(grads, self.V_t) ] G_t = [1. / (mu + tensor.sqrt(v)) for v in V_t_next] logprior = log_prior_normal(self.weights, prec_prior) grads_prior = tensor.grad(cost=logprior, wrt=self.weights) updates = [] [updates.append((v, v_n)) for v, v_n in zip(self.V_t, V_t_next)] for p, g, gp, gt in zip(self.weights, grads, grads_prior, G_t): # inject noise noise = tensor.sqrt(self.lr * gt) * trng.normal(p.shape) if use_gamma: # compute gamma gamma = nlinalg.extract_diag( tensor.jacobian(gt.flatten(), p).flatten(ndim=2)) gamma = gamma.reshape(p.shape) updates.append((p, p + 0.5 * self.lr * ((gt * (gp + N * g)) + gamma) + noise)) else: updates.append( (p, p + 0.5 * self.lr * (gt * (gp + N * g)) + noise)) return updates, sumloglik
def grad(self, inputs, g_outputs):
    [gz] = g_outputs
    [A] = inputs
    v = self(A)
    dexp = T.jacobian(self.exp(v).flatten(), v)
    invdexp = T.nlinalg.matrix_inverse(
        dexp.reshape((
            A.shape[0] * A.shape[1],
            v.shape[0] * v.shape[1],
        ))).reshape((
            A.shape[0],
            A.shape[1],
            v.shape[0],
            v.shape[1],
        ))
    return [T.tensordot(gz, invdexp, ((0, 1), (0, 1)))]
def __init__(self, mode='matrix', exp=None, LAtoV=None, VtoLA=None): assert mode in ['matrix', 'zeroest', 'nearest'] self.mode = mode if exp is None: exp = T.slinalg.Expm() self.exp = exp self.LAtoVf = None self.VtoLAf = None self.lossf = None self.dlossf = None if mode != 'matrix': g = T.matrix() hatxi = T.vector() xi = T.matrix() self.LAtoVf = theano.function([xi], LAtoV(xi)) self.VtoLAf = theano.function([hatxi], VtoLA(hatxi)) loss = lambda hatxi, g: T.sum((exp(VtoLA(hatxi)) - g)**2) dloss = lambda hatxi, g: T.jacobian(loss(hatxi, g), hatxi) self.lossf = theano.function([hatxi, g], loss(hatxi, g)) self.dlossf = theano.function([hatxi, g], dloss(hatxi, g))
def get_order_n_pole(order):
    """Generate a function to calculate the Fourier transform of an `order`-order pole.

    The Fourier transform of :math:`{(z-ϵ)}^{n}` is calculated, where `ϵ` is
    the position of the pole and `n` the order of the pole.

    Parameters
    ----------
    order : int
        The order of the pole.

    Returns
    -------
    order_n_pole : Callable
        The function (tau, pole, beta) -> gf_tau calculating the Fourier transform.

    """
    import theano
    import theano.tensor as T
    from theano.ifelse import ifelse
    from math import factorial

    pole = T.dscalar('pole')
    beta = T.dscalar('beta')
    # tau = T.dscalar('tau')
    tau = T.dscalar('tau')
    fermi_fct = (1 + T.tanh(-beta*pole/2))/2
    gf_tau = ifelse(
        pole > 0,
        # avoid overflows by asserting a negative exponent
        -(1 - fermi_fct)*T.exp(-pole*tau),
        -fermi_fct*T.exp(pole*(beta-tau)),
    )
    n_gf_tau = gf_tau
    for __ in range(order-1):
        # n_gf_tau = T.grad(n_gf_tau, pole)
        n_gf_tau = T.jacobian(n_gf_tau, pole)
    n_gf_tau = n_gf_tau / factorial(order-1)
    # results, __ = theano.scan(n_gf_tau.)
    func = theano.function([tau, pole, beta], n_gf_tau)
    # np.float is deprecated in recent NumPy; the builtin float is equivalent here
    return np.vectorize(func, otypes=[float])
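# Hypothetical usage sketch for get_order_n_pole (values are illustrative).
import numpy as np

gf_tau_order2 = get_order_n_pole(2)
print(gf_tau_order2(0.5, 1.0, 10.0))   # tau=0.5, pole=1.0, beta=10.0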
def compute_hessian(self, objective, argument): """ Computes the directional derivative of the gradient (which is equal to the Hessian multiplied by direction). """ g = T.grad(objective, argument) # Create a new tensor A, which has the same type (i.e. same # dimensionality) as argument. try: A = argument.type() except AttributeError: # Assume we are on the product manifold A = [arg.type() for arg in argument] try: # First attempt efficient 'R-op', this directly calculates the # directional derivative of the gradient, rather than explicitly # calculating the Hessian and then multiplying. R = T.Rop(g, argument, A) except NotImplementedError: # TODO: fix this fallback for the product manifold. shp = T.shape(argument) H = T.jacobian(g.flatten(), argument).reshape(T.concatenate([shp, shp]), 2 * A.ndim) R = T.tensordot(H, A, A.ndim) try: hess = theano.function([argument, A], R, on_unused_input="warn") except TypeError: hess_prod = theano.function(argument + A, R, on_unused_input="warn") def hess(x, a): return hess_prod(*(x + a)) return hess
def compute_hessian(self, objective, argument): """ Computes the directional derivative of the gradient (which is equal to the Hessian multiplied by direction). """ g = T.grad(objective, argument) # Create a new tensor A, which has the same type (i.e. same # dimensionality) as argument. try: A = argument.type() except AttributeError: # Assume we are on the product manifold A = [arg.type() for arg in argument] try: # First attempt efficient 'R-op', this directly calculates the # directional derivative of the gradient, rather than explicitly # calculating the Hessian and then multiplying. R = T.Rop(g, argument, A) except NotImplementedError: # TODO: fix this fallback for the product manifold. shp = T.shape(argument) H = T.jacobian(g.flatten(), argument).reshape( T.concatenate([shp, shp]), 2 * A.ndim) R = T.tensordot(H, A, A.ndim) try: hess = theano.function([argument, A], R, on_unused_input="warn") except TypeError: hess_prod = theano.function(argument + A, R, on_unused_input="warn") def hess(x, a): return hess_prod(*(x + a)) return hess
def auto4check(dataset, x, tol=1e-9, maxiter=1000):
    t0 = theano.shared(value=dataset[0], name="t0")
    a0 = theano.shared(value=dataset[1], name="a0")
    b0 = theano.shared(value=dataset[2], name="b0")
    c0 = theano.shared(value=dataset[3], name="c0")
    k = T.vector('k')
    a_t = np.e ** (-(k[0] + k[1]) * t0)
    b_t = k[0] / (k[0] + k[1]) * (1 - a_t)
    c_t = k[1] / (k[0] + k[1]) * (1 - a_t)
    f = T.sum((a0 - a_t) ** 2 + (b0 - b_t) ** 2 + (c0 - c_t) ** 2)
    F = theano.function([k], f)
    g_f_k = T.jacobian(f, k)
    j_f_k = theano.function([k], g_f_k)
    H_f_k = T.hessian(f, k)
    Hessian = theano.function([k], H_f_k)

    track, f_val = [], []
    track.append(array(x))
    f_val.append(F(x))
    g = j_f_k(x)
    i = 0
    print "Step =", i, "g=", g, "x=", x, "loss=", F(x)
    while norm(g) > tol:
        i += 1
        if i > maxiter:
            break
        G = Hessian(x)
        s = -np.linalg.solve(G, g)
        x += s
        track.append(array(x))
        f_val.append(F(x))
        g = j_f_k(x)
        print "step =", i, "g=", g, "x=", x, "loss=", F(x), "G=", G
    return x, F(x), track, f_val
def grad_hess(objective, argument): """ Compute both the gradient and the directional derivative of the gradient (which is equal to the hessian multiplied by direction). """ # TODO: Check that the hessian calculation is correct! # TODO: Make this compatible with non-matrix manifolds. g = T.grad(objective, argument) grad = compile(g, argument) # Create a new tensor A, which has the same type (i.e. same dimensionality) # as argument. A = argument.type() try: # First attempt efficient 'R-op', this directly calculates the # directional derivative of the gradient, rather than explicitly # calculating the hessian and then multiplying. print("begins") sys.stdout.flush() R = T.Rop(g, argument, A) print("ends") sys.stdout.flush() except NotImplementedError: # This will break if the manifold is not a matrix. n, p = T.shape(argument) H = T.jacobian(g.flatten(), argument).reshape([n, p, n, p], 4) R = T.tensordot(H, A) try: hess = theano.function([argument, A], R) except theano.compile.UnusedInputError: warn('Theano detected unused input - suggests hessian may be zero or ' 'constant.') hess = theano.function([argument, A], R, on_unused_input='ignore') return grad, hess
input_duration = T.scalar('input_duration') input_intensity = T.scalar('input_intensity') P = input_intensity * ((T.sgn(input_duration - t) + 1) / 2) corrected_sigmoid = \ 1 / (1 + T.exp(-T.mul(a,(T.dot(c,activation) + P) - theta))) \ - 1 / (1 + T.exp(np.multiply(a, theta))) #corrected_sigmoid = theano.function([t, activation, input_duration, input_intensity], corrected_s) #d_a = T.true_div(-activation + T.mul( 1 - T.mul(r, activation), corrected_s), tau) d_a = T.true_div(-activation + T.mul( k - T.mul(r, activation), corrected_sigmoid), tau) d_activation = theano.function(inputs=[activation, t, input_duration, input_intensity], outputs=d_a, on_unused_input='warn') J = theano.function(inputs=[activation, t, input_duration, input_intensity], outputs=T.jacobian(d_a, activation), on_unused_input='warn') activation_0 = np.array([0, 0]) t_0 = 0 t_1 = .125 dt = .0001 times = np.arange(t_0, t_1, dt) intens = 1 duration = .125 params = (duration, intens) #r = ode(d_activation).set_integrator('vode') #r.set_initial_value(activation_0, t_0).set_f_params(*params) timeseries = odeint(d_activation, activation_0, times, params, Dfun=J)
def __init__(self, params, sx2 = 1, linear_model = False, samples = 20, use_hat = False): ker, self.samples, self.params, self.KmmInv = kernel(), samples, params, {} self.use_hat = use_hat model_file_name = 'model' + ('_hat' if use_hat else '') + ('_linear' if linear_model else '') + '.save' try: print 'Trying to load model...' with open(model_file_name, 'rb') as file_handle: obj = cPickle.load(file_handle) self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d = obj self.update_KmmInv_cache() print 'Loaded!' return except: print 'Failed. Creating a new model...' Y, Z, m, ls, mu, lL, eps_MK, eps_NQ, eps_NK, KmmInv = T.dmatrices('Y', 'Z', 'm', 'ls', 'mu', 'lL', 'eps_MK', 'eps_NQ', 'eps_NK', 'KmmInv') lhyp = T.dvector('lhyp') (M, K), N, Q = mu.shape, m.shape[0], Z.shape[1] s, sl2, sf2, l = T.exp(ls), T.exp(lhyp[0]), T.exp(lhyp[1]), T.exp(lhyp[2:2+Q]) L = T.tril(lL - T.diag(T.diag(lL)) + T.diag(T.exp(T.diag(lL)))) print 'Setting up cache...' Kmm = ker.RBF(sf2, l, Z) if not linear_model else ker.LIN(sl2, Z) KmmInv_cache = sT.matrix_inverse(Kmm) self.f_Kmm = theano.function([Z, lhyp], Kmm, name='Kmm') self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache') self.update_KmmInv_cache() self.dKmm_d = {'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z), name='dKmm_dZ'), 'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp), name='dKmm_dlhyp')} print 'Setting up model...' if not self.use_hat: mu_scaled, L_scaled = sf2**0.5 * mu, sf2**0.5 * L X = m + s * eps_NQ U = mu_scaled + L_scaled.dot(eps_MK) Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X) Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X) A = KmmInv.dot(Kmn) B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0) F = A.T.dot(U) + T.maximum(B, 1e-16)[:,None]**0.5 * eps_NK F = T.concatenate((T.zeros((N,1)), F), axis=1) S = T.nnet.softmax(F) LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16))) if not linear_model: KL_U = -0.5 * (T.sum(KmmInv.T * T.sum(mu_scaled[:,None,:]*mu_scaled[None,:,:], 2)) + K * (T.sum(KmmInv.T * L_scaled.dot(L_scaled.T)) - M - 2.0*T.sum(T.log(T.diag(L_scaled))) + 2.0*T.sum(T.log(T.diag(sT.cholesky(Kmm)))))) else: KL_U = 0 #KL_U = -0.5 * T.sum(T.sum(mu_scaled * KmmInv.dot(mu_scaled), 0) + T.sum(KmmInv * L_scaled.dot(L_scaled.T)) - M # - 2.0*T.sum(T.log(T.diag(L_scaled))) + 2.0*T.sum(T.log(T.diag(sT.cholesky(Kmm))))) if not linear_model else 0 else: # mu_scaled, L_scaled = mu / sf2**0.5, L / sf2**0.5 mu_scaled, L_scaled = mu / sf2, L / sf2 X = m + s * eps_NQ U = mu_scaled + L_scaled.dot(eps_MK) Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X) Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X) B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0) F = Kmn.T.dot(U) + T.maximum(B, 1e-16)[:,None]**0.5 * eps_NK F = T.concatenate((T.zeros((N,1)), F), axis=1) S = T.nnet.softmax(F) LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16))) if not linear_model: KL_U = -0.5 * (T.sum(Kmm.T * T.sum(mu_scaled[:,None,:]*mu_scaled[None,:,:], 2)) + K * (T.sum(Kmm.T * L_scaled.dot(L_scaled.T)) - M - 2.0*T.sum(T.log(T.diag(L_scaled))) - 2.0*T.sum(T.log(T.diag(sT.cholesky(Kmm)))))) else: KL_U = 0 KL_X_all = -0.5 * T.sum((m**2.0 + s**2.0)/sx2 - 1.0 - 2.0*ls + T.log(sx2), 1) KL_X = T.sum(KL_X_all) print 'Compiling...' 
inputs = {'Y': Y, 'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL, 'lhyp': lhyp, 'KmmInv': KmmInv, 'eps_MK': eps_MK, 'eps_NQ': eps_NQ, 'eps_NK': eps_NK} z = 0.0*sum([T.sum(v) for v in inputs.values()]) # solve a bug with derivative wrt inputs not in the graph f = zip(['X', 'U', 'S', 'LS', 'KL_U', 'KL_X', 'KL_X_all'], [X, U, S, LS, KL_U, KL_X, KL_X_all]) self.f = {n: theano.function(inputs.values(), f+z, name=n, on_unused_input='ignore') for n,f in f} g = zip(['LS', 'KL_U', 'KL_X'], [LS, KL_U, KL_X]) wrt = {'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL, 'lhyp': lhyp, 'KmmInv': KmmInv} self.g = {vn: {gn: theano.function(inputs.values(), T.grad(gv+z, vv), name='d'+gn+'_d'+vn, on_unused_input='ignore') for gn,gv in g} for vn, vv in wrt.iteritems()} with open(model_file_name, 'wb') as file_handle: print 'Saving model...' sys.setrecursionlimit(2000) cPickle.dump([self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d], file_handle, protocol=cPickle.HIGHEST_PROTOCOL)
drawsallbase = (np.tile(np.arange(ndraws), (nobs,nchoice,1)).transpose() + 0.5)/ndraws draws1allbase = norminv(drawsallbase*p0allbase) p1allbase = normcdf(-(Vallbase[:,1,:] + c10[:,groupid]*draws1allbase)/c11[:,groupid]).mean(axis=0) pallbase = p0allbase*p1allbase if use_fe and use_share_moments: pstation = T.stack([pallbase[1:,np.where(stationid==i)[0]].mean(axis=1) for i in range(nstation)]).transpose().flatten()[(~nuisancexi).flatten().nonzero()[0]] pstationtrue = np.stack([dv_choice[1:,stationid==i].mean(axis=1) for i in range(nstation)]).transpose().flatten()[~nuisancexi.flatten()] obj_multiplier = T.dscalar('obj_multiplier') lagrange_multiplier = T.dvector('lagrange_multiplier') lagrange = obj_multiplier*obj + (lagrange_multiplier*pstation).sum() constr = theano.function([theta], pstation) jab = theano.function([theta], T.jacobian(pstation, [theta])) hess_constr = theano.function([theta, lagrange_multiplier, obj_multiplier], outputs=theano.gradient.hessian(lagrange, [theta])) ntheta1 = nalpha + nbeta + nallsigma nxifull = (nchoice-1)*nstation mask00 = np.ones((ntheta1, ntheta1), dtype = bool) mask01 = np.ones((ntheta1, nxi), dtype = bool) mask10 = np.ones((nxi, ntheta1), dtype = bool) mask11 = np.tile(np.eye(nstation, dtype = bool), (nchoice-1, nchoice-1))[~nuisancexi.flatten(),:][:,~nuisancexi.flatten()] maskj = np.hstack((mask10, mask11)) maskh = np.hstack((np.vstack((mask00, mask10)), np.vstack((mask01, mask11)))) def solve_constr(theta0, use_hess = False): pyipopt.set_loglevel(1)
def compute_reproj_err_d_wrapper(curr_w, o, feat):
    curr_cam = cams[o[0]]
    curr_X = X[o[1]]
    return T.jacobian(compute_reproj_err(curr_cam, curr_X, curr_w, feat),
                      [curr_cam, curr_X, curr_w])
def __init__(self,fname,constants={},sparse=False): # parse model specification with open(fname,'r') as fid: mod = json.load(fid,object_pairs_hook=OrderedDict) self.mod = mod # constants self.con_dict = OrderedDict() for name in mod['constants']: value = constants[name] self.con_dict[name] = np.array(value) if type(value) is list else value # arguments self.arg_info = OrderedDict() self.arg_dict = OrderedDict() for (name,spec) in mod['arguments'].items(): asize = spec['size'] (amin,amax) = spec['range'] agrid = np.linspace(amin,amax,asize) info = OrderedDict() info['size'] = asize info['grid'] = agrid self.arg_info[name] = info self.arg_dict[name] = agrid # parameters self.par_info = OrderedDict() self.par_sizes = [] for (name,spec) in mod['parameters'].items(): ptype = spec.get('type','scalar') psize = 1 if ptype == 'scalar' else spec['size'] info = OrderedDict() info['type'] = ptype info['size'] = psize self.par_info[name] = info self.par_sizes.append(psize) # variables self.var_info = OrderedDict() self.var_sizes = [] for (name,spec) in mod['variables'].items(): vtype = spec['type'] info = OrderedDict() info['type'] = vtype if vtype == 'scalar': vsize = 1 self.var_sizes.append(vsize) elif vtype == 'vector': vsize = spec['size'] self.var_sizes.append(vsize) elif vtype == 'function': vder = spec.get('derivs',[]) nder = len(vder) args = spec['args'] ainfo = [self.arg_info[arg] for arg in args] vsize = np.prod([ai['size'] for ai in ainfo]) info['vder'] = vder info['nder'] = nder info['args'] = args info['shape'] = [self.arg_info[a]['size'] for a in args] info['grid'] = map(lambda v: v.transpose().flatten(),np.meshgrid(*[self.arg_info[a]['grid'] for a in args])) if len(args) > 1 else [self.arg_info[args[0]]['grid']] self.var_sizes.append(vsize) self.var_sizes += sum(map(len,vder))*[vsize] info['size'] = vsize self.var_info[name] = info # totals self.n_pars = len(self.par_info) self.n_vars = len(self.var_info) self.sz_pars = np.sum(self.par_sizes) self.sz_vars = np.sum(self.var_sizes) # input vectors self.par_vec = T.dvector('parvec') self.var_vec = T.dvector('varvec') # unpack and map out variables self.par_dict = OrderedDict() piter = iter(split(self.par_vec,self.par_sizes)) for (name,info) in self.par_info.items(): ptype = info['type'] par = next(piter) if ptype == 'scalar': par = par[0] par.name = name self.par_dict[name] = par else: par.name = name self.par_dict[name] = par self.var_dict = OrderedDict() self.der_dict = OrderedDict() viter = iter(split(self.var_vec,self.var_sizes)) for (name,info) in self.var_info.items(): var = next(viter) vtype = info['type'] if vtype == 'scalar': var = var[0] var.name = name self.var_dict[name] = var elif vtype == 'vector': var.name = name self.var_dict[name] = var elif vtype == 'function': var.name = name self.var_dict[name] = var vder = info.get('vder',[]) nder = len(vder) self.der_dict[var] = {'': var} for der in vder: for s in prefixes(der): dvar = viter.next() dvar.name = name+'_'+s self.der_dict[var][s] = dvar # define operators def diff(var,*args): name = ''.join([getkey(self.arg_dict,v) for v in args]) return self.der_dict[var][name] def vslice(var,arg,point): var_name = var.name arg_name = getkey(self.arg_dict,arg) var_info = self.var_info[var_name] args = var_info['args'] (idx, _) = filter(lambda ia: ia[1]==arg_name, enumerate(args))[0] shape = var_info['shape'] idx_list = slice_dim([point],idx,shape) return var[idx_list] def grid(var,arg): var_name = var.name arg_name = getkey(self.arg_dict,arg) var_info = self.var_info[var_name] args = 
var_info['args'] (idx, _) = filter(lambda ia: ia[1]==arg_name, enumerate(args))[0] return var_info['grid'][idx] def interp(var,arg,x): i = icut(arg,x) t = np.clip((arg[i+1]-x)/(arg[i+1]-arg[i]),0.0,1.0) return t*vslice(var,arg,i) + (1.0-t)*vslice(var,arg,i+1) self.func_dict = {'diff': diff, 'slice': vslice, 'grid': grid, 'interp': interp} # combine them all self.sym_dict = merge(op_dict,self.con_dict,self.par_dict,self.var_dict,self.func_dict,self.arg_dict) # evaluate self.equations = [] # regular equations for eq in mod['equations']: self.equations.append(eval(eq,{},self.sym_dict)) # derivative relations for (name,info) in self.var_info.items(): if info['type'] == 'function': var = self.var_dict[name] size = info['size'] # derivative relations - symmetric except at 0 vder = info.get('vder','') args = info['args'] shape = info['shape'] for der in vder: v0 = '' # function value for v1 in prefixes(der): # collect argument info arg = v1[-1] (adx, _) = filter(lambda ia: ia[1]==arg, enumerate(args))[0] s = shape[adx] grid = info['grid'][adx] # generate accessors zer_idx = slice_dim([0],adx,shape) one_idx = slice_dim([1],adx,shape) beg_idx = slice_dim(range(s-2),adx,shape) mid_idx = slice_dim(range(1,s-1),adx,shape) end_idx = slice_dim(range(2,s),adx,shape) # calculate derivatives d0 = self.der_dict[var][v0] d1 = self.der_dict[var][v1] self.equations.append(d0[one_idx]-d0[zer_idx]-(grid[one_idx]-grid[zer_idx])*d1[zer_idx]) self.equations.append((d0[end_idx]-d0[beg_idx])-(grid[end_idx]-grid[beg_idx])*d1[mid_idx]) # to next level v0 = v1 # repack self.eqn_vec = T.join(0,*map(ensure_vector,self.equations)) # jacobians self.par_jac = T.jacobian(self.eqn_vec,self.par_vec) self.var_jac = T.jacobian(self.eqn_vec,self.var_vec) # sparse? if sparse: self.par_jac = S.csc_from_dense(self.par_jac) self.var_jac = S.csc_from_dense(self.var_jac) self.linsolve = spsolve else: self.linsolve = np.linalg.solve # compile print('Compiling...') self.eqn_fun = theano.function([self.par_vec,self.var_vec],self.eqn_vec) self.parjac_fun = theano.function([self.par_vec,self.var_vec],self.par_jac) self.varjac_fun = theano.function([self.par_vec,self.var_vec],self.var_jac) # newtonian path t = T.dscalar('t') start = T.dvector('start') finish = T.dvector('finish') path = (1.0-t)*start + t*finish dpath = T.jacobian(path,t) self.path_fun = theano.function([start,finish,t],path) self.dpath_fun = theano.function([start,finish,t],dpath)

def CompileTrainingFunctions(self, RPROP_penalty=0.35, RPORP_gain=0.2, SGD_LR_=5e-5, SGD_momentum_=0.9,
                             b_Override_only_SGD=False, bOverride_OnlyGPROP=False, bOverride_OnlyRPORP=False,
                             b_Override_only_RMSPROP=False, bWeightDecay=False, bHighActivationPenalty=False,
                             b_layerwise_LR=False, b_external_top_error=False,
                             b_use_clipped_gradients=False, f_clip_at=5e-3):
    """
    creates the functions for the last layer of <self.layers>
    trains all parameters included in <self.params>, i.e. ignoring the layer structure
    rmsprop and sgd share <last_grads>, so switching between them may behave a bit strangely
    """
    print "Called: CompileTrainingFunctions. You don't have to call this function, you may use .training_step() directly!"
    if len(self.params)==0:
        print "call CompileOutputFunctions() before calling CompileTrainingFunctions()!"
        return -1

    # create a list of gradients for all model parameters
    if b_external_top_error==False:
        if b_use_clipped_gradients==False:
            output_layer_Gradients = T.grad(self.output_layer_Loss, self.params, disconnected_inputs="warn")
        else:
            print "\nBE WARNED: Feature activated: use_clipped_gradients (f_clip_at =", f_clip_at, ")"
            output_layer_Gradients_tmp = T.jacobian(self.layers[-1].negative_log_likelihood_array(self.y), self.params, disconnected_inputs="warn")
            # each element has shape: (batchsize, rest...)
            output_layer_Gradients = [T.mean(T.clip(x, -np.float32(np.abs(f_clip_at)), np.float32(np.abs(f_clip_at))), axis=0) for x in output_layer_Gradients_tmp]
    else:
        self.known_top_err = T.TensorType('float32', (False,)*5, name='known_top_err')('known_top_err')
        print "predictions are last_layer.output, which is (hopefully) sigmoid!"
        print "top error is specified externally: <self.known_top_err> (batchsize,x,n_classes,y,z)"
        output_layer_Gradients = theano.gradient.grad(T.sum(self.layers[-1].output*self.known_top_err), self.params, disconnected_inputs="warn")  # .subgraph_grad()

    if b_Override_only_SGD==False:
        self.RPROP_LRs = []  # one for each parameter -> many
    self.last_grads = []
    self.gprop_grad_variance = []

    for i, para in enumerate(self.params):
        if para in self.params[:i]:
            print "Detected RNN or shared param @index =", i
            continue
        if b_Override_only_SGD==False:
            # print "warning: was 4e-5"
            self.RPROP_LRs.append(theano.shared(1e-4*np.ones(para.get_value().shape, dtype=theano.config.floatX), name=para.name+str('_RPORP'), borrow=0))
            self.gprop_grad_variance.append(theano.shared(1e-2*np.ones(para.get_value().shape, dtype=theano.config.floatX), name=para.name+str('_GPROP'), borrow=0))
        # print "WARNING change this if you want to use sgd/rmsprop"
        self.last_grads.append(theano.shared(np.zeros(para.get_value().shape, dtype=theano.config.floatX), name=para.name+str('_LG'), borrow=0))
        # self.SGD_EigHessian_perturbed_grads.append(theano.shared( zeros(para.get_value().shape,dtype=theano.config.floatX) , name=para.name+str('_pLG') , borrow=True))

    n = len(self.last_grads)
    for i, lay in enumerate(self.layers):
        low = (i*2) % n
        lay.last_grads = self.last_grads[low:low+2]

    SGD_updatesa = []
    SGD_updatesb = []
    if b_Override_only_SGD==False:
        RPROP_updates = []
    RMSPROP_updates = []

    self.SGD_global_LR.set_value(np.float32(SGD_LR_))
    if bWeightDecay:
        print "CNN::using Weight decay! \nChange via this.SGD_global_weightdecay.set_value()"
        self.SGD_global_weightdecay = theano.shared(np.asarray(0.0005).astype("float32"))
    self.SGD_momentum.set_value(np.float32(SGD_momentum_))

    if b_Override_only_SGD==False:
        assert len(self.params)==len(self.last_grads), "rnn/shared params not yet implemented in rprop/gprop"
        # print "Trading memory usage for more speed (SGD_updates_a), change it if it gets too big (removes momentum, too)."
        for param_i, grad_i, last_grad_i, pLR_i, gprop_var_i in zip(self.params, output_layer_Gradients, self.last_grads, self.RPROP_LRs, self.gprop_grad_variance):
            # capping RPROP-LR inside [1e-7,1e-2]
            print "RPROP: missing backtracking handling "
            RPROP_updates.append((pLR_i, T.minimum(T.maximum(pLR_i * (1 - np.float32(RPROP_penalty) * ((last_grad_i*grad_i) < -1e-9) + np.float32(RPORP_gain) * ((last_grad_i*grad_i) > 1e-11)), 1e-7*T.ones_like(pLR_i)), 2e-3 * T.ones_like(pLR_i))))
            RPROP_updates.append((param_i, param_i - pLR_i * grad_i/(T.abs_(grad_i) + 1e-6) - (0 if bWeightDecay==False else self.SGD_global_weightdecay*param_i)))
            RPROP_updates.append((last_grad_i, grad_i))  # RPROP_updates.append((last_grad_i, (grad_i + 0.5*last_grad_i)/1.5)) # trailing exp-mean over last gradients: smoothing. check if useful...

    if b_layerwise_LR:
        print "Using layerwise LR multiplier. Speed penalty ~ 10%. Access it via this.SGD_local_LRs (default is 1. == no modification of the global LR)."
        self.SGD_local_LRs = [theano.shared(np.float32(1.)) for x in self.params]  # one LR modifier per param group
    else:
        self.SGD_local_LRs = [1. for x in self.params]

    for param_i, grad_i, last_grad_i, local_lr_modifier in zip(self.params, output_layer_Gradients, self.last_grads, self.SGD_local_LRs):
        if len(self.params)>len(self.last_grads):
            grad_i = None
            print "grad_param::", param_i
            for i in range(len(self.params)):
                if param_i == self.params[i]:
                    print ">>", i
                    grad_i = output_layer_Gradients[i] if grad_i==None else grad_i + output_layer_Gradients[i]
        SGD_updatesa.append((last_grad_i, grad_i + last_grad_i * self.SGD_momentum))  # use this if you want to use the gradient magnitude

    for i, param_i, grad_i, last_grad_i, local_lr_modifier in zip(range(len(self.params)), self.params, output_layer_Gradients, self.last_grads, self.SGD_local_LRs):
        if bWeightDecay and (i < len(self.params)-2):  # no WeightDecay in last layer
            SGD_updatesb.append((param_i, param_i - (self.SGD_global_LR * local_lr_modifier) * last_grad_i - self.SGD_global_LR * self.SGD_global_weightdecay * param_i))
        else:
            SGD_updatesb.append((param_i, param_i - (self.SGD_global_LR * local_lr_modifier) * last_grad_i))
        RMSPROP_updates.append((last_grad_i, 0.95*last_grad_i + 0.05*(grad_i)**2))
        RMSPROP_updates.append((param_i, param_i - self.SGD_global_LR * grad_i/(T.sqrt(last_grad_i+0.000001))))
    print "RMSPROP: advice: a good LR is 2e-4 (value for <self.SGD_global_LR>)"

    if bHighActivationPenalty:
        self.HighActivationPenalty_coeff = theano.shared(np.float32(1e-4))
        print "Applying high-activation-penalty..."
        print "todo: test..."
        for lay in self.layers:
            type_ = lay.ActivationFunction
            ok = 1
            if type_=="tanh":
                grads = T.grad(T.mean((lay.output)**2), lay.params)
            elif type_=="sigmoid":
                grads = T.grad(2*T.mean((lay.output-0.5)**2), lay.params)
            elif type_=="relu":
                print "relu...todo:test"
                grads = T.grad(-T.mean((lay.output)**2), lay.params)
            else:
                print "UNSUPPORTED ActivationFunction!"
                ok = 0
            if ok:
                for param_i, grad_i in zip(lay.params, grads):
                    for i, u in enumerate(SGD_updatesb):
                        if u[0]==param_i:
                            SGD_updatesb[i] = (param_i, u[1] - (self.SGD_global_LR * self.HighActivationPenalty_coeff) * grad_i)
                            break
                    try:
                        for i, u in enumerate(RMSPROP_updates):
                            if u[0]==param_i:
                                RMSPROP_updates[i] = (param_i, u[1] - (self.SGD_global_LR * self.HighActivationPenalty_coeff) * grad_i)
                                break
                        for i, u in enumerate(RPROP_updates):
                            if u[0]==param_i:
                                RPROP_updates[i] = (param_i, u[1] - (self.SGD_global_LR * self.HighActivationPenalty_coeff) * grad_i)
                                break
                    except:
                        print "only sgd..."

    addthis = [self.z, ] if self.bUseModulatedNLL else []
    if b_external_top_error:
        addthis = addthis + [self.known_top_err]

    if bOverride_OnlyRPORP or (b_Override_only_SGD==False and bOverride_OnlyGPROP==False and b_Override_only_RMSPROP==0):
        print "compiling RPROP..."
        self.train_model_RPROP = theano.function([self.x] + ([] if b_external_top_error else [self.y]) + addthis, None if b_external_top_error else self.output_layer_Loss, updates=RPROP_updates, on_unused_input='warn')

    if b_Override_only_SGD==False and bOverride_OnlyGPROP==False and bOverride_OnlyRPORP==False:
        print "compiling RMSPROP..."
        self.train_model_RMSPROP = theano.function([self.x] + ([] if b_external_top_error else [self.y]) + addthis, None if b_external_top_error else self.output_layer_Loss, updates=RMSPROP_updates, on_unused_input='warn')

    if bOverride_OnlyGPROP==0 and b_Override_only_RMSPROP==0 and bOverride_OnlyRPORP==False:
        print "compiling SGD..."
        # a only updates last_grads, it DOES NOT change any parameters
        # you could call it 10 times and would get the same nll every time... but if momentum is != 0 then this changes the search direction
        assert len(SGD_updatesa)==len(SGD_updatesb), str(len(SGD_updatesa))+" != "+str(len(SGD_updatesb))
        self.train_model_SGD_a = theano.function([self.x] + ([] if b_external_top_error else [self.y]) + addthis, None if b_external_top_error else self.output_layer_Loss, updates=SGD_updatesa, on_unused_input='warn')  # the output is the value you get BEFORE updates....
        try:
            self.train_model_SGD_a_ext = theano.function([self.x, self.y] + addthis, [self.output_layer_Loss, self.layers[-1].class_probabilities_realshape], updates=SGD_updatesa, on_unused_input='warn')
        except:
            print "NNet.train_model_SGD_a_ext unavailable"
        # b ONLY changes the parameters
        self.train_model_SGD_b = theano.function([], None, updates=SGD_updatesb)

    return 0
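
# Minimal standalone sketch (toy model, not the class above): per-example
# gradients via T.jacobian, clipped and then averaged over the batch, which is
# the idea behind output_layer_Gradients_tmp when b_use_clipped_gradients is set.
# The linear model and squared error below are assumptions made for illustration.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')                                     # (batchsize, 3)
y = T.vector('y')                                     # (batchsize,)
W = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='W')

per_example_loss = (T.dot(x, W) - y) ** 2             # shape (batchsize,)
per_example_grads = T.jacobian(per_example_loss, W)   # shape (batchsize, 3)
f_clip_at = np.float32(5e-3)
clipped_grad = T.mean(T.clip(per_example_grads, -f_clip_at, f_clip_at), axis=0)

train_step = theano.function(
    [x, y], per_example_loss.mean(),
    updates=[(W, W - np.float32(1e-2) * clipped_grad)])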

def __init__(self, fileEmbeddings, wordEmbeddings, weights=None, contextSize=None, negative=None):
    filesCount, fileEmbeddingSize = fileEmbeddings.shape
    wordsCount, wordEmbeddingSize = wordEmbeddings.shape

    trainWeights = weights is None
    if trainWeights:
        weights = rnd2(fileEmbeddingSize + contextSize * wordEmbeddingSize, wordsCount)
    else:
        featuresCount, activationsCount = weights.shape
        contextSize = (featuresCount - fileEmbeddingSize) / wordEmbeddingSize
        negative = activationsCount - 1

    self.fileEmbeddings = theano.shared(asfx(fileEmbeddings), 'fileEmbeddings', borrow=False)
    self.wordEmbeddings = theano.shared(asfx(wordEmbeddings), 'wordEmbeddings', borrow=False)
    self.weights = theano.shared(asfx(weights), 'weights', borrow=False)

    fileIndexOffset = 0
    wordIndicesOffset = fileIndexOffset + 1
    indicesOffset = wordIndicesOffset + contextSize

    contexts = T.imatrix('contexts')
    fileIndices = contexts[:,fileIndexOffset:wordIndicesOffset]
    wordIndices = contexts[:,wordIndicesOffset:indicesOffset]
    indices = contexts[:,indicesOffset:indicesOffset + negative]

    files = self.fileEmbeddings[fileIndices]
    fileFeatures = T.flatten(files, outdim=2)
    words = self.wordEmbeddings[wordIndices]
    wordFeatures = T.flatten(words, outdim=2)
    features = T.concatenate([fileFeatures, wordFeatures], axis=1)

    subWeights = self.weights[:,indices].dimshuffle(1, 0, 2)

    probabilities = T.batched_dot(features, subWeights)

    parameters = [self.fileEmbeddings]
    subParameters = [files]
    consider_constant = [self.wordEmbeddings]

    if trainWeights:
        parameters.append(self.weights)
        subParameters.append(subWeights)
    else:
        consider_constant.append(self.weights)

    # cost = -T.mean(T.log(T.nnet.sigmoid(probabilities[:,0])) + T.sum(T.log(T.nnet.sigmoid(-probabilities[:,1:])), dtype=floatX, acc_dtype=floatX), dtype=floatX, acc_dtype=floatX)
    cost = -T.log(T.nnet.sigmoid(probabilities[:,0])) - T.sum(T.log(T.nnet.sigmoid(-probabilities[:,1:])), dtype=floatX, acc_dtype=floatX)

    learningRate = T.scalar('learningRate', dtype=floatX)

    updates = []
    for p, subP in zip(parameters, subParameters):
        if subP is not None:
            gradient = T.jacobian(cost, wrt=subP)
            update = (p, T.inc_subtensor(subP, -learningRate * gradient))
        else:
            gradient = T.jacobian(cost, wrt=p)
            update = (p, p - learningRate * gradient)

        updates.append(update)

    contextIndex = T.iscalar('contextIndex')
    self.trainingContexts = theano.shared(empty(1,1,1), 'trainingContexts', borrow=False)

    self.trainModel = theano.function(
        inputs=[contextIndex, learningRate],
        outputs=cost,
        updates=updates,
        givens={
            contexts: self.trainingContexts[:,contextIndex]
        }
    )
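
# Minimal standalone sketch (toy example, not the model above): take the gradient
# with respect to the sub-tensor of rows actually used by a batch and write it
# back with T.inc_subtensor, as in the updates loop above. T.grad on the summed
# cost is used here instead of T.jacobian to keep the shapes simple; the names
# and the squared-error cost are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as T

embeddings = theano.shared(
    np.random.randn(100, 8).astype(theano.config.floatX), 'embeddings')
indices = T.ivector('indices')            # rows touched by this batch
targets = T.matrix('targets')             # desired values for those rows

rows = embeddings[indices]                # sub-tensor of the shared matrix
cost = T.sum((rows - targets) ** 2)
gradient = T.grad(cost, wrt=rows)         # gradient w.r.t. the selected rows only

learningRate = np.float32(0.1)
train = theano.function(
    [indices, targets], cost,
    updates=[(embeddings, T.inc_subtensor(rows, -learningRate * gradient))])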

def compile(self, optimizer, metrics=[]):
    metrics += [mean_q]

    if hasattr(optimizer, '__len__'):
        if len(optimizer) != 2:
            raise ValueError('More than two optimizers provided. Please only provide a maximum of two optimizers, the first one for the actor and the second one for the critic.')
        actor_optimizer, critic_optimizer = optimizer
    else:
        actor_optimizer = optimizer
        critic_optimizer = clone_optimizer(optimizer)
    assert actor_optimizer != critic_optimizer

    if len(metrics) == 2 and hasattr(metrics[0], '__len__') and hasattr(metrics[1], '__len__'):
        actor_metrics, critic_metrics = metrics
    else:
        actor_metrics = critic_metrics = metrics

    def clipped_mse(y_true, y_pred):
        delta = K.clip(y_true - y_pred, self.delta_range[0], self.delta_range[1])
        return K.mean(K.square(delta), axis=-1)

    # Compile target networks. We only use them in feed-forward mode, hence we can pass any
    # optimizer and loss since we never use it anyway.
    self.target_actor = clone_model(self.actor, self.custom_model_objects)
    self.target_actor.compile(optimizer='sgd', loss='mse')
    self.target_critic = clone_model(self.critic, self.custom_model_objects)
    self.target_critic.compile(optimizer='sgd', loss='mse')

    # We also compile the actor. We never optimize the actor using Keras but instead compute
    # the policy gradient ourselves. However, we need the actor in feed-forward mode, hence
    # we also compile it with any optimzer and
    self.actor.compile(optimizer='sgd', loss='mse')

    # Compile the critic.
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
        critic_updates = get_soft_target_model_updates(self.target_critic, self.critic, self.target_model_update)
        critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer, critic_updates)
    self.critic.compile(optimizer=critic_optimizer, loss=clipped_mse, metrics=critic_metrics)

    # Combine actor and critic so that we can get the policy gradient.
    combined_inputs = []
    critic_inputs = []
    for i in self.critic.input:
        if i == self.critic_action_input:
            combined_inputs.append(self.actor.output)
        else:
            combined_inputs.append(i)
            critic_inputs.append(i)
    combined_output = self.critic(combined_inputs)
    if K._BACKEND == 'tensorflow':
        grads = K.gradients(combined_output, self.actor.trainable_weights)
        grads = [g / float(self.batch_size) for g in grads]  # since TF sums over the batch
    elif K._BACKEND == 'theano':
        import theano.tensor as T
        grads = T.jacobian(combined_output.flatten(), self.actor.trainable_weights)
        grads = [K.mean(g, axis=0) for g in grads]
    else:
        raise RuntimeError('Unknown Keras backend "{}".'.format(K._BACKEND))

    # We now have the gradients (`grads`) of the combined model wrt to the actor's weights and
    # the output (`output`). Compute the necessary updates using a clone of the actor's optimizer.
    clipnorm = getattr(actor_optimizer, 'clipnorm', 0.)
    clipvalue = getattr(actor_optimizer, 'clipvalue', 0.)

    def get_gradients(loss, params):
        # We want to follow the gradient, but the optimizer goes in the opposite direction to
        # minimize loss. Hence the double inversion.
        assert len(grads) == len(params)
        modified_grads = [-g for g in grads]
        if clipnorm > 0.:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in modified_grads]))
            modified_grads = [optimizers.clip_norm(g, clipnorm, norm) for g in modified_grads]
        if clipvalue > 0.:
            modified_grads = [K.clip(g, -clipvalue, clipvalue) for g in modified_grads]
        return modified_grads
    actor_optimizer.get_gradients = get_gradients
    updates = actor_optimizer.get_updates(self.actor.trainable_weights, self.actor.constraints, None)
    if self.target_model_update < 1.:
        # Include soft target model updates.
        updates += get_soft_target_model_updates(self.target_actor, self.actor, self.target_model_update)
    updates += self.actor.updates  # include other updates of the actor, e.g. for BN

    # Finally, combine it all into a callable function.
    actor_inputs = None
    if not hasattr(self.actor.input, '__len__'):
        actor_inputs = [self.actor.input]
    else:
        actor_inputs = self.actor.input
    inputs = actor_inputs + critic_inputs
    if self.uses_learning_phase:
        inputs += [K.learning_phase()]
    self.actor_train_fn = K.function(inputs, [self.actor.output], updates=updates)
    self.actor_optimizer = actor_optimizer

    self.compiled = True
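
# Minimal standalone sketch (plain Theano, no Keras): averaging a per-sample
# Jacobian over the batch, which is what the Theano branch above does with
# T.jacobian(combined_output.flatten(), ...) followed by K.mean(g, axis=0).
# The tiny linear "critic" below is an assumption made for illustration.
import numpy as np
import theano
import theano.tensor as T

obs = T.matrix('obs')                                  # (batch, 4)
W = theano.shared(np.zeros((4, 1), dtype=theano.config.floatX), name='W')
q_values = T.dot(obs, W)                               # (batch, 1)

per_sample = T.jacobian(q_values.flatten(), W)         # (batch, 4, 1)
policy_grad = per_sample.mean(axis=0)                  # averaged over the batch

grad_fn = theano.function([obs], policy_grad)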

compile_mode = 'FAST_COMPILE'
#compile_mode = 'FAST_RUN'

th.config.linker='cvm'

start = t.time()
err_ = hand_objective(params_,nbones_,base_relatives_,parents_,inverse_base_absolutes_,base_positions_, weights_,mirror_factor_,points_,correspondences_)
f = th.function([params_,nbones_,base_relatives_,parents_,inverse_base_absolutes_,base_positions_, weights_,mirror_factor_,points_,correspondences_], err_, mode=compile_mode)
end = t.time()
tf_compile = (end - start)
print("tf_compile: %f" % tf_compile)

start = t.time()
jac = T.jacobian(T.flatten(err_),[params_])
fjac = th.function([params_,nbones_,base_relatives_,parents_,inverse_base_absolutes_,base_positions_, weights_,mirror_factor_,points_,correspondences_], jac, mode=compile_mode)
end = t.time()
tJ_compile = (end - start)
print("tJ_compile: %f" % tJ_compile)

ntasks = (len(sys.argv)-1)//5
for task_id in range(ntasks):
    print("task_id: %i" % task_id)

    argv_idx = task_id*5 + 1
    dir_in = sys.argv[argv_idx]
    dir_out = sys.argv[argv_idx+1]
    fn = sys.argv[argv_idx+2]
    nruns_f = int(sys.argv[argv_idx+3])
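
# Minimal standalone sketch (toy stand-in for hand_objective): T.jacobian expects
# a vector expression, so a matrix-valued residual is flattened first, exactly as
# with T.flatten(err_) above. The outer-product "residual" is an assumption made
# purely for illustration.
import theano as th
import theano.tensor as T

params_toy = T.dvector('params_toy')
err_toy = T.outer(params_toy, params_toy)                # matrix-valued residual
jac_toy = T.jacobian(T.flatten(err_toy), [params_toy])   # list with one (n*n, n) Jacobian
fjac_toy = th.function([params_toy], jac_toy)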