def __init__(self, rng, input1, input2, n_in1, n_in2, n_hidden_layers,
             d_hidden, W1=None, W2=None):
    self.input1 = input1
    self.input2 = input2

    CouplingFunc = WarpNetwork(rng, input1, n_hidden_layers, d_hidden,
                               n_in1, n_in2)

    if W1 is None:
        W1_values = numpy.identity(n_in1, dtype=theano.config.floatX)
        W1 = theano.shared(value=W1_values, name='W1')

    if W2 is None:
        W2_values = numpy.identity(n_in2, dtype=theano.config.floatX)
        W2 = theano.shared(value=W2_values, name='W2')

    # LU parameterization: V = U * L with L forced to have a unit diagonal,
    # so log|det V| reduces to the log of U's diagonal entries.
    V1u = T.triu(W1)
    V1l = T.tril(W1)
    V1l = T.extra_ops.fill_diagonal(V1l, 1.)
    V1 = T.dot(V1u, V1l)

    V2u = T.triu(W2)
    V2l = T.tril(W2)
    V2l = T.extra_ops.fill_diagonal(V2l, 1.)
    V2 = T.dot(V2u, V2l)

    self.output1 = T.dot(input1, V1)
    self.output2 = T.dot(input2, V2) + CouplingFunc.output

    self.log_jacobian = T.log(T.abs_(T.nlinalg.ExtractDiag()(V1u))).sum() \
        + T.log(T.abs_(T.nlinalg.ExtractDiag()(V2u))).sum()

    self.params = CouplingFunc.params
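# A minimal numpy sketch (not from the source) checking the identity this
# layer relies on: for V = U @ L with U = triu(W) and L the unit-diagonal
# lower factor, det(V) = det(U) * det(L) = prod(diag(U)), so the log-Jacobian
# is just the sum of log|diag(U)|.
import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(4, 4)

U = np.triu(W)                      # upper factor, carries the diagonal
L = np.tril(W, -1) + np.eye(4)      # unit-diagonal lower factor
V = U @ L

assert np.allclose(np.log(abs(np.linalg.det(V))),
                   np.log(np.abs(np.diag(U))).sum())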
def cost(X):
    # `epsilon` (the softening temperature) comes from the enclosing scope.
    Y = T.dot(X, X.T)
    s = T.triu(Y, 1).max()
    expY = T.exp((Y - s) / epsilon)
    expY = expY - T.diag(T.diag(expY))
    u = T.sum(T.triu(expY, 1))
    return s + epsilon * T.log(u)
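# Hedged numpy sketch of the same construction: a log-sum-exp softening of
# the maximum off-diagonal Gram entry, shifted by the hard max `s` so the
# exponentials cannot overflow. The helper name `smooth_max_offdiag` is
# illustrative, not from the source.
import numpy as np

def smooth_max_offdiag(X, epsilon=0.1):
    """Log-sum-exp softening of max_{i<j} (X X^T)_{ij}."""
    Y = X @ X.T
    iu = np.triu_indices_from(Y, k=1)
    s = Y[iu].max()                      # shift for numerical stability
    return s + epsilon * np.log(np.exp((Y[iu] - s) / epsilon).sum())

X = np.random.RandomState(0).randn(5, 3)
Y = X @ X.T
# as epsilon -> 0 the smooth value approaches the hard maximum
print(smooth_max_offdiag(X, 1e-3), Y[np.triu_indices_from(Y, 1)].max())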
def gaussian_chol(mean, logvar, chol, sample=None):
    if sample is not None:
        raise Exception('Not implemented')
    diag = gaussian_diag(mean, logvar)
    # strictly upper-triangular mask, broadcast over the batch dimension
    mask = T.shape_padleft(T.triu(T.ones_like(chol[0]), 1))
    sample = diag.sample + T.batched_dot(diag.sample, chol * mask)
    return RandomVariable(sample, diag.logp, diag.entr,
                          mean=mean, logvar=logvar)
def grad(self, inputs, output_gradients):
    """
    Reverse-mode gradient updates for matrix solve operation c = A \\ b.

    Symbolic expression for updates taken from [1]_.

    References
    ----------
    .. [1] M. B. Giles, "An extended collection of matrix derivative
       results for forward and reverse mode automatic differentiation",
       http://eprints.maths.ox.ac.uk/1079/
    """
    A, b = inputs
    c = self(A, b)
    c_bar = output_gradients[0]
    trans_map = {
        'lower_triangular': 'upper_triangular',
        'upper_triangular': 'lower_triangular'
    }
    trans_solve_op = Solve(
        # update A_structure and lower to account for a transpose operation
        A_structure=trans_map.get(self.A_structure, self.A_structure),
        lower=not self.lower
    )
    b_bar = trans_solve_op(A.T, c_bar)
    # force outer product if vector second input
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
    if self.A_structure == 'lower_triangular':
        A_bar = tensor.tril(A_bar)
    elif self.A_structure == 'upper_triangular':
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
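# Illustrative finite-difference check (numpy only, not part of the Op) of
# Giles' adjoints used above: for c = A^{-1} b, b_bar = A^{-T} c_bar and
# A_bar = -b_bar c^T, masked to the triangular structure of A.
import numpy as np

rng = np.random.RandomState(0)
A = np.tril(rng.randn(4, 4)) + 4 * np.eye(4)   # well-conditioned, lower triangular
b = rng.randn(4)
c_bar = rng.randn(4)                           # incoming gradient w.r.t. c

c = np.linalg.solve(A, b)
b_bar = np.linalg.solve(A.T, c_bar)
A_bar = np.tril(-np.outer(b_bar, c))           # keep A's triangular structure

# finite-difference check of one lower-triangular entry
eps = 1e-6
dA = np.zeros_like(A)
dA[2, 1] = eps
fd = (c_bar @ np.linalg.solve(A + dA, b) - c_bar @ c) / eps
assert np.allclose(A_bar[2, 1], fd, atol=1e-4)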
def L_op(self, inputs, outputs, gradients):
    # Modified from theano/tensor/slinalg.py
    # No handling for on_error = 'nan'
    dz = gradients[0]
    chol_x = outputs[0]

    # this is for nan mode
    #
    # ok = ~tensor.any(tensor.isnan(chol_x))
    # chol_x = tensor.switch(ok, chol_x, 1)
    # dz = tensor.switch(ok, dz, 1)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return gpu_solve_upper_triangular(
            outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
    else:
        grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
    return [grad]
def L_op(self, inputs, outputs, output_gradients):
    r"""
    Reverse-mode gradient updates for matrix solve operation c = A \ b.

    Symbolic expression for updates taken from [#]_.

    References
    ----------
    .. [#] M. B. Giles, "An extended collection of matrix derivative
       results for forward and reverse mode automatic differentiation",
       http://eprints.maths.ox.ac.uk/1079/
    """
    A, b = inputs
    c = outputs[0]
    c_bar = output_gradients[0]
    trans_map = {
        "lower_triangular": "upper_triangular",
        "upper_triangular": "lower_triangular",
    }
    trans_solve_op = Solve(
        # update A_structure and lower to account for a transpose operation
        A_structure=trans_map.get(self.A_structure, self.A_structure),
        lower=not self.lower,
    )
    b_bar = trans_solve_op(A.T, c_bar)
    # force outer product if vector second input
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
    if self.A_structure == "lower_triangular":
        A_bar = tensor.tril(A_bar)
    elif self.A_structure == "upper_triangular":
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
def TopAccuracy2C(pred=None, truth=None, symmetric=False):
    M1s = T.ones_like(truth, dtype=np.int8)
    # band masks by sequence separation: long-range (>= 24), medium+long
    # (>= 12), short+medium+long (>= 6), and their set differences
    LRsel = T.triu(M1s, 24)
    MLRsel = T.triu(M1s, 12)
    SMLRsel = T.triu(M1s, 6)
    MRsel = MLRsel - LRsel
    SRsel = SMLRsel - MLRsel

    dataLen = truth.shape[0]

    pred0 = pred[:, :, 0]
    if symmetric:
        avg_pred = (pred0 + pred0.dimshuffle(1, 0)) / 2.0
    else:
        avg_pred = pred0

    # pred_truth = T.concatenate((avg_pred, truth.dimshuffle(0, 1, 'x')), axis=2)
    pred_truth = T.stack([avg_pred, T.cast(truth, 'int32')], axis=2)

    accuracyList = []
    for Rsel in [LRsel, MRsel, MLRsel, SRsel]:
        selected_pred_truth = pred_truth[Rsel.nonzero()]

        # sort by the predicted value for label 0, largest to smallest
        selected_pred_truth_sorted = selected_pred_truth[(
            selected_pred_truth[:, 0]).argsort()[::-1]]

        # `topRatio` is taken from the enclosing scope
        numTops = T.minimum(T.iround(dataLen * topRatio),
                            selected_pred_truth_sorted.shape[0])

        selected_sorted_truth = T.cast(
            selected_pred_truth_sorted[:, -1], 'int32')
        numTruths = T.bincount(selected_sorted_truth, minlength=2)
        numCorrects = T.bincount(selected_sorted_truth[0:numTops], minlength=2)

        # numTops = T.minimum(numTops, numTruths[0])
        accuracyList.append(
            T.stack([
                numCorrects[0] * 1. / (numTops + 0.001), numTops, numTruths[0]
            ], axis=0))

    return T.stacklists(accuracyList)
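# Numpy sketch (assumed semantics) of the band masks built above: T.triu
# with offsets 6/12/24 splits contacts into short-, medium-, and long-range
# sequence-separation bins.
import numpy as np

L = 30
ones = np.ones((L, L), dtype=np.int8)
long_range = np.triu(ones, 24)                       # |i - j| >= 24
medium_range = np.triu(ones, 12) - long_range        # 12 <= |i - j| < 24
short_range = np.triu(ones, 6) - np.triu(ones, 12)   # 6 <= |i - j| < 12

i, j = 3, 17   # separation 14 -> medium-range bin
assert medium_range[i, j] == 1 and long_range[i, j] == 0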
def __init__(self, input, n_in, n_out):
    batchSize, seqLen, _ = input.shape

    import collections
    if isinstance(n_out, collections.Sequence):
        LRembedLayer = EmbeddingLayer(input, n_in, n_out[2])
        MRembedLayer = EmbeddingLayer(input, n_in, n_out[1])
        SRembedLayer = EmbeddingLayer(input, n_in, n_out[0])
        n_out_max = max(n_out)
    else:
        LRembedLayer = EmbeddingLayer(input, n_in, n_out)
        MRembedLayer = EmbeddingLayer(input, n_in, n_out)
        SRembedLayer = EmbeddingLayer(input, n_in, n_out)
        n_out_max = n_out

    self.layers = [LRembedLayer, MRembedLayer, SRembedLayer]

    M1s = T.ones((seqLen, seqLen))
    # symmetric band masks built from both triangles
    Sep24Mat = T.triu(M1s, 24) + T.tril(M1s, -24)
    Sep12Mat = T.triu(M1s, 12) + T.tril(M1s, -12)
    Sep6Mat = T.triu(M1s, 6) + T.tril(M1s, -6)

    LRsel = Sep24Mat.dimshuffle('x', 0, 1, 'x')
    MRsel = (Sep12Mat - Sep24Mat).dimshuffle('x', 0, 1, 'x')
    SRsel = (Sep6Mat - Sep12Mat).dimshuffle('x', 0, 1, 'x')
    selections = [LRsel, MRsel, SRsel]

    self.output = T.zeros((batchSize, seqLen, seqLen, n_out_max),
                          dtype=theano.config.floatX)
    for emLayer, sel in zip(self.layers, selections):
        l_n_out = emLayer.n_out
        self.output = T.inc_subtensor(self.output[:, :, :, :l_n_out],
                                      T.mul(emLayer.output, sel))

    self.pcenters = 0
    self.params = []
    self.paramL1 = 0
    self.paramL2 = 0
    for layer in [LRembedLayer, MRembedLayer, SRembedLayer]:
        self.params += layer.params
        self.paramL1 += layer.paramL1
        self.paramL2 += layer.paramL2
        self.pcenters += layer.pcenters

    self.n_out = n_out_max
def rank_loss(scores):
    # Images
    diag = T.diag(scores)
    diff_img = scores - diag.dimshuffle(0, 'x') + 1
    max_img = T.maximum(0, diff_img)
    triu_img = T.triu(max_img, 1)
    til_img = T.tril(max_img, -1)
    res_img = T.sum(triu_img) + T.sum(til_img)

    # Sentences
    diff_sent = scores.T - diag.dimshuffle(0, 'x') + 1
    max_sent = T.maximum(0, diff_sent)
    triu_sent = T.triu(max_sent, 1)
    til_sent = T.tril(max_sent, -1)
    res_sent = T.sum(triu_sent) + T.sum(til_sent)

    # Return the bidirectional margin loss. The original returned
    # T.log(T.sum(scores) + 0.01), which ignored the ranking terms and
    # appears to be a debugging leftover.
    return res_img + res_sent
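# Reference numpy version (illustrative, margin fixed to 1) of the
# bidirectional max-margin term computed above; the diagonal holds the
# matching image-sentence pairs, and triu/tril exclude it.
import numpy as np

def rank_loss_np(scores, margin=1.0):
    """Sum of hinge violations where an off-diagonal score comes within
    `margin` of the matching (diagonal) score, in both directions."""
    diag = np.diag(scores)
    viol_img = np.maximum(0, scores - diag[:, None] + margin)
    viol_sent = np.maximum(0, scores.T - diag[:, None] + margin)
    off = ~np.eye(scores.shape[0], dtype=bool)   # exclude the diagonal
    return viol_img[off].sum() + viol_sent[off].sum()

scores = np.array([[5.0, 1.0], [0.5, 4.0]])      # matching pairs score highest
print(rank_loss_np(scores))                      # all margins satisfied -> 0.0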
def check_u(m, k=0):
    m_symb = T.matrix(dtype=m.dtype)
    k_symb = T.iscalar()
    f = theano.function([m_symb, k_symb],
                        T.triu(m_symb, k_symb),
                        mode=mode_with_gpu)
    result = f(m, k)
    assert np.allclose(result, np.triu(m, k))
    assert result.dtype == np.dtype(dtype)
    assert any([isinstance(node.op, GpuTri)
                for node in f.maker.fgraph.toposort()])
def lower_lower(self):
    '''Evaluates the intractable term in the lower bound
    which itself must be lower bounded'''
    a = self.get_aux_mult()

    reversed_cum_probs = T.extra_ops.cumsum(a[:, ::-1], 1)
    dot_prod_m = T.dot(reversed_cum_probs, self.digams_1p2)
    dot_prod_mp1 = T.dot(
        T.concatenate((reversed_cum_probs[:, 1:], T.zeros((self.K, 1))), 1),
        self.digams[:, 0])

    # final entropy term; adding strictly-upper ones keeps the log at
    # log(1) = 0 outside the lower triangle
    triu_ones = T.triu(T.ones_like(a)) - T.eye(self.K)
    aloga = T.sum(T.tril(a) * T.log(T.tril(a) + triu_ones), 1)

    return T.dot(a, self.digams[:, 1]) + dot_prod_m + dot_prod_mp1 - aloga
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # Replace the cholesky decomposition with 1 if there are nans
    # or solve_upper_triangular will throw a ValueError.
    if self.on_error == 'nan':
        ok = ~tensor.any(tensor.isnan(chol_x))
        chol_x = tensor.switch(ok, chol_x, 1)
        dz = tensor.switch(ok, dz, 1)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
    else:
        grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

    if self.on_error == 'nan':
        return [tensor.switch(ok, grad, np.nan)]
    else:
        return [grad]
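# Standalone numpy transcription of the backward rule above (Murray 2016),
# with a finite-difference spot check. np.linalg.cholesky reads only the
# lower triangle, so the rule reports the combined sensitivity of a
# symmetric pair (i, j)/(j, i) in the lower entry G[i, j].
import numpy as np

def chol_backward(L, dL):
    """Maps dloss/dL to dloss/dSigma for Sigma = L L^T."""
    M = L.T @ dL
    P = np.tril(M) - np.diag(np.diag(M)) / 2
    S = np.linalg.solve(L.T, np.linalg.solve(L.T, P.T).T)  # L^{-T} P L^{-1}
    return np.tril(S + S.T) - np.diag(np.diag(S))

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
Sigma = A @ A.T + 4 * np.eye(4)
L = np.linalg.cholesky(Sigma)
dL = np.tril(rng.randn(4, 4))        # incoming gradient w.r.t. L

G = chol_backward(L, dL)

eps = 1e-6
E = np.zeros_like(Sigma)
E[2, 1] = E[1, 2] = eps              # symmetric perturbation
fd = (np.sum(dL * np.linalg.cholesky(Sigma + E)) - np.sum(dL * L)) / eps
assert np.allclose(G[2, 1], fd, atol=1e-4)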
def L_op(self, inputs, outputs, output_gradients):
    # Modified from theano/tensor/slinalg.py
    A, b = inputs
    c = outputs[0]
    c_bar = output_gradients[0]

    trans_solve_op = GpuCublasTriangularSolve(not self.lower)
    b_bar = trans_solve_op(A.T, c_bar)

    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

    if self.lower:
        A_bar = tensor.tril(A_bar)
    else:
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
def triangularize_network(layers, force_diag=False):
    n_layers, rem = divmod(len(layers) + 1, 4)
    assert rem == 0
    assert n_layers > 0
    assert (n_layers - 1, aL_PARAM) not in layers

    layers_LU = layers.copy()
    for nn in xrange(n_layers):
        LL, UL = layers[(nn, LL_PARAM)], layers[(nn, UL_PARAM)]
        LL_diag = T.nlinalg.alloc_diag(T.nlinalg.extract_diag(LL))
        layers_LU[(nn, LL_PARAM)] = \
            ifelse(force_diag, LL_diag, T.tril(LL))
        layers_LU[(nn, UL_PARAM)] = \
            ifelse(force_diag, T.eye(UL.shape[0]), T.triu(UL))
    return layers_LU, n_layers
def calMAP(_k):
    # `dist`, `te_lab`, `tr_lab` and `length` come from the enclosing scope.
    inx = T.argsort(dist, axis=1)
    # A = (te_lab == tr_lab[inx[:, 0:_k].reshape([-1])].reshape([length, _k])).astype('float32')
    A = T.eq(te_lab,
             tr_lab[inx[:, 0:_k].reshape([-1])].reshape([length, _k])
             ).astype('float32')
    U = T.triu(T.ones([_k, _k]))
    B = T.dot(A, U)     # running count of hits within the top-i
    B *= A              # keep the count only at relevant ranks
    r = T.sum(A, axis=1)
    p = T.sum(B / (T.arange(1, _k + 1).astype('float32')), axis=1)
    r, p = theano.function([], [r, p])()
    p = p[r.nonzero()]
    r = r[r.nonzero()]
    res = T.sum(p / r)
    res /= (_k * length)
    res = theano.function([], res)()
    return res
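# What `T.dot(A, U)` with an upper-triangular ones matrix does here, shown
# in numpy for a single query: it cumulates hits, which then yields the
# precision-at-relevant-ranks terms of average precision.
import numpy as np

A = np.array([[1, 0, 1, 1]], dtype=float)   # relevance of the top-4 retrieved items
U = np.triu(np.ones((4, 4)))

cum_hits = A @ U                                  # [[1, 1, 2, 3]]
prec_at_hit = (A * cum_hits) / np.arange(1, 5)    # precision only at relevant ranks
ap = prec_at_hit.sum() / A.sum()                  # AP = (1/1 + 2/3 + 3/4) / 3
print(cum_hits, ap)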
def grad(self, inputs, g_outputs):
    r"""The gradient function should return

    .. math:: \sum_n\left(W_n\frac{\partial\,w_n}{\partial a_{ij}} +
              \sum_k V_{nk}\frac{\partial\,v_{nk}}{\partial a_{ij}}\right),

    where [:math:`W`, :math:`V`] corresponds to ``g_outputs``,
    :math:`a` to ``inputs``, and :math:`(w, v)=\mbox{eig}(a)`.

    Analytic formulae for eigensystem gradients are well-known in
    perturbation theory:

    .. math:: \frac{\partial\,w_n}{\partial a_{ij}} = v_{in}\,v_{jn}

    .. math:: \frac{\partial\,v_{kn}}{\partial a_{ij}} =
              \sum_{m\ne n}\frac{v_{km}v_{jn}}{w_n-w_m}

    Code derived from theano.nlinalg.Eigh and doi=10.1.1.192.9105
    """
    x, = inputs
    w, v = self(x)
    # Replace gradients wrt disconnected variables with
    # zeros. This is a work-around for issue #1063.
    W, V = _zero_disconnected([w, v], g_outputs)

    N = x.shape[0]

    # W part
    gW = T.tensordot(v, v * W[numpy.newaxis, :], (1, 1))

    # V part
    vv = v[:, :, numpy.newaxis, numpy.newaxis] \
        * v[numpy.newaxis, numpy.newaxis, :, :]
    minusww = -w[:, numpy.newaxis] + w[numpy.newaxis, :]
    minuswwinv = 1 / (minusww + T.eye(N))
    minuswwinv = T.triu(minuswwinv, 1) + T.tril(minuswwinv, -1)  # remove diagonal
    c = (vv * minuswwinv[numpy.newaxis, :, numpy.newaxis, :]).dimshuffle(
        (1, 3, 0, 2))
    vc = T.tensordot(v, c, (1, 0))
    gV = T.tensordot(V, vc, ((0, 1), (0, 1)))

    g = gW + gV
    res = (g.T + g) / 2
    return [res]
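# Finite-difference check (numpy, illustrative) of the eigenvalue part of
# the docstring formula, dw_n/da_ij = v_in v_jn: for a symmetric
# perturbation of the (i, j)/(j, i) pair the derivative doubles.
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
A = (A + A.T) / 2 + np.diag([0., 3., 6., 9.])   # symmetric, well-separated eigenvalues
w, v = np.linalg.eigh(A)

n, i, j = 1, 2, 0
analytic = 2 * v[i, n] * v[j, n]

eps = 1e-6
E = np.zeros_like(A)
E[i, j] = E[j, i] = eps
fd = (np.linalg.eigh(A + E)[0][n] - w[n]) / eps
assert np.allclose(analytic, fd, atol=1e-4)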
def calc_feats(self, h):
    """
    :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim
    :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
    """
    h = h.dimshuffle(1, 0, 2)
    n_words = h.shape[1]

    m = T.triu(T.ones(shape=(n_words, n_words)))
    indices = m.nonzero()

    # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
    h_i = h[:, indices[0]]
    h_j = h[:, indices[1]]
    h_diff = h_i - h_j
    h_add = h_i + h_j
    return T.concatenate([h_add, h_diff], axis=2)
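# The triu-nonzero trick above enumerates all spans (i, j) with i <= j in
# row-major order; a quick numpy illustration:
import numpy as np

n_words = 4
m = np.triu(np.ones((n_words, n_words)))
starts, ends = m.nonzero()   # n_words*(n_words+1)/2 = 10 spans
print(list(zip(starts, ends)))
# [(0, 0), (0, 1), (0, 2), (0, 3), (1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]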
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)
    ok = tt.all(tt.nlinalg.diag(chol_x) > 0)
    chol_x = tt.switch(ok, chol_x, tt.fill_diagonal(chol_x, 1))
    dz = tt.switch(ok, dz, floatX(1))

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tt.tril(mtx) - tt.diag(tt.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        solve = tt.slinalg.Solve(A_structure="upper_triangular")
        return solve(outer.T, solve(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tt.tril(s + s.T) - tt.diag(tt.diagonal(s))
    else:
        grad = tt.triu(s + s.T) - tt.diag(tt.diagonal(s))
    return [tt.switch(ok, grad, floatX(np.nan))]
def span_feats(self, h):
    """
    :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim
    :return: 1D: batch_size, 2D: n_words(i), 3D: n_words(j), 4D: 2 * hidden_dim
    """
    h = h.dimshuffle(1, 0, 2)
    n_words = h.shape[1]
    pad = T.zeros(shape=(h.shape[0], 1, h.shape[2]))
    h_pad = T.concatenate([h, pad], axis=1)

    m = T.triu(T.ones(shape=(n_words, n_words)))
    indices = m.nonzero()

    # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
    h_i = h[:, indices[0]]
    h_j = h_pad[:, indices[1] + 1]
    h_diff = h_i - h_j
    h_add = h_i + h_j
    return T.concatenate([h_add, h_diff], axis=2)
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        return [tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))]
    else:
        return [tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))]
def __init__(self, weights_init, biases_init, lower=False,
             weights_prec=0., biases_prec=0.,
             weights_mean=None, biases_mean=None):
    assert weights_init.ndim == 2, 'weights_init must be 2D array.'
    assert biases_init.ndim == 1, 'biases_init must be 1D array.'
    assert weights_init.shape[0] == biases_init.shape[0], \
        'Dimensions of weights_init and biases_init must be consistent.'
    self.lower = lower
    self.weights = th.shared(weights_init, name='W')
    self.weights_tri = (tt.tril(self.weights) if lower
                        else tt.triu(self.weights))
    self.biases = th.shared(biases_init, name='b')
    self.weights_prec = weights_prec
    self.biases_prec = biases_prec
    if weights_mean is None:
        weights_mean = np.eye(weights_init.shape[0])
    if biases_mean is None:
        biases_mean = np.zeros_like(biases_init)
    self.weights_mean = (np.tril(weights_mean) if lower
                         else np.triu(weights_mean))
    self.biases_mean = biases_mean
    super(TriangularAffineLayer, self).__init__(
        [self.weights, self.biases])
def skew_frac(A):
    # skew-symmetric part built from the strict lower triangle, plus the
    # complementary upper-triangular remainder
    return tensor.tril(A, -1) - tensor.tril(A, -1).T, \
        tensor.triu(A, 0).T + tensor.triu(A, 1)
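# Quick numpy check (illustrative) of the first component: it is the
# skew-symmetric completion of A's strict lower triangle.
import numpy as np

A = np.arange(9.0).reshape(3, 3)
L = np.tril(A, -1)                 # strict lower triangle
skew = L - L.T
rest = np.triu(A, 0).T + np.triu(A, 1)

assert np.allclose(skew, -skew.T)           # skew-symmetric by construction
assert np.allclose(np.tril(skew, -1), L)    # strict lower triangle of A preserved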
def run_model(index, slide_index, Y, mFunc, Struct, Dist, n, kernel,
              lambdaw, Kf, sample_size, tune_size):
    """
    index: index of object data
    slide_index: index of slide window
    Y: time-series data
    mFunc: functional connectivity
    Struct: structural connectivity
    Dist: distance matrix of n ROIs
    n: ROI number
    kernel: "exponential" or "gaussian" or "matern52" or "matern32"
    lambdaw: weight parameter
    Kf: weight parameter
    sample_size: NUTS number
    tune_size: burn-in number
    """
    m = Dist[0].shape[0]
    k = Y.shape[1]
    n_vec = n * (n + 1) // 2
    Y_mean = []
    for i in range(n):
        Y_mean.append(np.mean(Y[i * m:(i + 1) * m, 0]))
    Y_mean = np.array(Y_mean)

    with pm.Model() as model_generator:
        # covariance matrix
        log_Sig = pm.Uniform("log_Sig", -8, 8, shape=(n, ))
        SQ = tt.diag(tt.sqrt(tt.exp(log_Sig)))
        Func_Covm = tt.dot(tt.dot(SQ, mFunc), SQ)
        Struct_Convm = tt.dot(tt.dot(SQ, Struct), SQ)

        # double fusion of structural and functional connectivity
        L_fc_vec = tt.reshape(
            tt.slinalg.cholesky(tt.squeeze(Func_Covm)).T[np.triu_indices(n)],
            (n_vec, ))
        L_st_vec = tt.reshape(
            tt.slinalg.cholesky(tt.squeeze(Struct_Convm)).T[np.triu_indices(n)],
            (n_vec, ))
        Struct_vec = tt.reshape(Struct[np.triu_indices(n)], (n_vec, ))
        rhonn = Kf*((1-lambdaw)*L_fc_vec + lambdaw*L_st_vec) + \
            (1-Kf)*((1-Struct_vec*lambdaw)*L_fc_vec +
                    Struct_vec*lambdaw*L_st_vec)

        # correlation
        Cov_temp = tt.triu(tt.ones((n, n)))
        Cov_temp = tt.set_subtensor(Cov_temp[np.triu_indices(n)], rhonn)
        Cov_mat_v = tt.dot(Cov_temp.T, Cov_temp)
        d = tt.sqrt(tt.diagonal(Cov_mat_v))
        rho = (Cov_mat_v.T / d).T / d
        rhoNew = pm.Deterministic("rhoNew", rho[np.triu_indices(n, 1)])

        # temporal correlation AR(1)
        phi_T = pm.Uniform("phi_T", 0, 1, shape=(n, ))
        sigW_T = pm.Uniform("sigW_T", 0, 100, shape=(n, ))
        B = pm.Normal("B", 0, 100, shape=(n, ))
        muW1 = Y_mean - B                                      # shifted mean
        mean_overall = muW1 / (1.0 - phi_T)                    # AR(1) mean
        tau_overall = (1.0 - tt.sqr(phi_T)) / tt.sqr(sigW_T)   # AR(1) precision
        W_T = pm.MvNormal("W_T", mu=mean_overall,
                          tau=tt.diag(tau_overall), shape=(k, n))

        # add all parts together
        one_m_vec = tt.ones((m, 1))
        one_k_vec = tt.ones((1, k))
        D = pm.MvNormal("D", mu=tt.zeros(n), cov=Cov_mat_v, shape=(n, ))
        phi_s = pm.Uniform("phi_s", 0, 20, shape=(n, ))
        spat_prec = pm.Uniform("spat_prec", 0, 100, shape=(n, ))
        H_base = pm.Normal("H_base", 0, 1, shape=(m, n))

        Mu_all = tt.zeros((m * n, k))
        if kernel == "exponential":
            for i in range(n):
                r = Dist[i] * phi_s[i]
                H_temp = tt.sqr(spat_prec[i]) * tt.exp(-r)
                L_H_temp = tt.slinalg.cholesky(H_temp)
                Mu_all = tt.set_subtensor(
                    Mu_all[m*i:m*(i+1), :],
                    B[i] + D[i] + one_m_vec*W_T[:, i] +
                    tt.dot(L_H_temp, tt.reshape(H_base[:, i], (m, 1)))*one_k_vec)
        elif kernel == "gaussian":
            for i in range(n):
                r = Dist[i] * phi_s[i]
                H_temp = tt.sqr(spat_prec[i]) * tt.exp(-tt.sqr(r) * 0.5)
                L_H_temp = tt.slinalg.cholesky(H_temp)
                Mu_all = tt.set_subtensor(
                    Mu_all[m*i:m*(i+1), :],
                    B[i] + D[i] + one_m_vec*W_T[:, i] +
                    tt.dot(L_H_temp, tt.reshape(H_base[:, i], (m, 1)))*one_k_vec)
        elif kernel == "matern52":
            for i in range(n):
                r = Dist[i] * phi_s[i]
                H_temp = tt.sqr(spat_prec[i]) * (
                    (1.0 + tt.sqrt(5.0)*r + 5.0/3.0*tt.sqr(r)) *
                    tt.exp(-1.0*tt.sqrt(5.0)*r))
                L_H_temp = tt.slinalg.cholesky(H_temp)
                Mu_all = tt.set_subtensor(
                    Mu_all[m*i:m*(i+1), :],
                    B[i] + D[i] + one_m_vec*W_T[:, i] +
                    tt.dot(L_H_temp, tt.reshape(H_base[:, i], (m, 1)))*one_k_vec)
        elif kernel == "matern32":
            for i in range(n):
                r = Dist[i] * phi_s[i]
                H_temp = tt.sqr(spat_prec[i]) * (1.0 + tt.sqrt(3.0)*r) * \
                    tt.exp(-tt.sqrt(3.0)*r)
                L_H_temp = tt.slinalg.cholesky(H_temp)
                Mu_all = tt.set_subtensor(
                    Mu_all[m*i:m*(i+1), :],
                    B[i] + D[i] + one_m_vec*W_T[:, i] +
                    tt.dot(L_H_temp, tt.reshape(H_base[:, i], (m, 1)))*one_k_vec)

        sigma_error_prec = pm.Uniform("sigma_error_prec", 0, 100)
        Y1 = pm.Normal("Y1", mu=Mu_all, sd=sigma_error_prec, observed=Y)

    with model_generator:
        step = pm.NUTS()
        trace = pm.sample(sample_size, step=step, tune=tune_size, chains=1)

    # save as pandas format and output the csv file
    save_trace = pm.trace_to_dataframe(trace)
    save_trace.to_csv(out_dir + date.today().strftime("%m_%d_%y") +
                      "_sample_size_" + str(sample_size) +
                      "_index_" + str(index) +
                      "_slide_index_" + str(slide_index) + ".csv")
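# The correlation construction above in plain numpy (illustrative): the
# fused vector is packed into an upper-triangular factor C, C^T C is
# positive semi-definite by construction, and normalizing by its diagonal
# yields a correlation matrix.
import numpy as np

n = 3
n_vec = n * (n + 1) // 2
rhonn = np.random.RandomState(0).randn(n_vec)

C = np.triu(np.ones((n, n)))
C[np.triu_indices(n)] = rhonn
Cov = C.T @ C                      # PSD by construction

d = np.sqrt(np.diag(Cov))
rho = (Cov.T / d).T / d            # normalize rows and columns
assert np.allclose(np.diag(rho), 1.0)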
def __init__(self, x_h_0, v_h_0, t_h_0, x_t_0, v_t_0, a_t_0, t_t_0,
             time_steps, exist, is_leader, x_goal, turn_vec_h, turn_vec_t,
             n_steps, lr, game_params, arch_params, solver_params, params):

    self._init_layers(params, arch_params, game_params)
    self._connect(game_params, solver_params)

    def _dist_from_rail(pos, rail_center, rail_radius):
        d = tt.sqrt(((pos - rail_center)**2).sum())
        return tt.sum((d - rail_radius)**2)

    def _step_state(x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h,
                    x_t_, v_t_, t_t_, turn_vec_t, ctrl, exist, time_step):

        a_t_e, v_t_e, x_t_e, t_t, t_h = step(x_h_, v_h_, t_h_, turn_vec_h,
                                             x_t_, v_t_, t_t_, turn_vec_h,
                                             exist, time_step)

        t_h = common.disconnected_grad(t_h)
        t_t = common.disconnected_grad(t_t)

        # approximated dynamic of the un-observed parts in the state
        a_t_a = tt.zeros(shape=(3, 2), dtype=np.float32)
        v_t_a = v_t_
        x_t_a = x_t_ + self.dt * v_t_a

        # difference in predictions
        n_v_t = v_t_e - v_t_a
        n_a_t = a_t_e - a_t_a
        n_x_t = x_t_e - x_t_a

        # disconnect the gradient of the noise signals
        n_v_t = common.disconnected_grad(n_v_t)
        n_a_t = common.disconnected_grad(n_a_t)
        n_x_t = common.disconnected_grad(n_x_t)

        # add the noise to the approximation
        a_t = a_t_a + n_a_t
        v_t = v_t_a + n_v_t
        x_t = x_t_a + n_x_t

        # update the observed part of the state
        delta_steer = ctrl[0]
        accel = ctrl[1]

        delta_steer = tt.clip(delta_steer, -np.pi / 4, np.pi / 4)
        angle = angle_ + delta_steer
        speed = speed_ + accel * self.dt
        speed = tt.clip(speed, 0, self.v_max)

        v_h_x = speed * tt.sin(angle)
        v_h_y = speed * tt.cos(angle)
        v_h = tt.stack([v_h_x, v_h_y])

        x_h = x_h_ + self.dt * v_h
        x_h = tt.clip(x_h, -self.bw, self.bw)

        return x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t

    def _recurrence(time_step, x_h_, v_h_, angle_, speed_, t_h_,
                    x_t_, v_t_, a_t_, t_t_, exist, is_leader, x_goal,
                    turn_vec_h, turn_vec_t):
        '''
        state
        1. host
            1.1 position (2) - (x,y) coordinates in cross coordinate system
            1.2 speed (2) - (v_x,v_y)
            # 1.3 acceleration (2) - (a_x,a_y)
            # 1.4 waiting time (1) - start counting on full stop,
            #     stop counting when clearing the junction
            1.5 x_goal (2) - destination position (indicates different turns)
            total = 5
        2. right lane car
            2.1 position (2) - null value = (-1,-1)
            2.2 speed (2) - null value = (0,0)
            2.3 acceleration (2) - null value = (0,0)
            2.4 waiting time (1) - null value = 0
            total = 7
        3. front lane car
            3.1 position (2)
            3.2 speed (2)
            3.3 acceleration (2)
            3.4 waiting time (1)
            total = 7
        4. target 3
            4.1 position (2)
            4.2 speed (2)
            4.3 acceleration (2)
            4.4 waiting time (1)
            total = 7
        total = 26
        '''
        # host_state_vec = tt.concatenate([x_h_, v_h_, t_h_])
        ang_spd = tt.stack([angle_, speed_])
        host_state_vec = tt.concatenate([x_h_, ang_spd, x_goal])

        # target_state_vec = tt.concatenate([tt.flatten(x_t_), tt.flatten(v_t_),
        #                                    tt.flatten(a_t_), tt.flatten(t_t_)])
        target_state_vec = tt.concatenate([
            tt.flatten(x_t_), tt.flatten(v_t_), tt.flatten(a_t_), is_leader
        ])

        state = tt.concatenate([host_state_vec, target_state_vec])

        h0 = tt.dot(state, self.W_0) + self.b_0
        relu0 = tt.nnet.relu(h0)

        h1 = tt.dot(relu0, self.W_1) + self.b_1
        relu1 = tt.nnet.relu(h1)

        h2 = tt.dot(relu1, self.W_2) + self.b_2
        relu2 = tt.nnet.relu(h2)

        a_h = tt.dot(relu2, self.W_c)

        x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t = _step_state(
            x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h,
            x_t_, v_t_, t_t_, turn_vec_t, a_h, exist, time_step)

        # cost:
        discount_factor = 0.99**time_step

        # 0. smooth driving policy
        cost_steer = discount_factor * a_h[0]**2
        cost_accel = discount_factor * a_h[1]**2

        # 1. forcing the host to move forward
        dist_from_goal = tt.mean((x_goal - x_h)**2)
        cost_progress = discount_factor * dist_from_goal

        # 2. keeping distance from in-front vehicles
        d_t_h = x_t - x_h
        h_t_dists = (d_t_h**2).sum(axis=1)
        # v_h_norm = tt.sqrt((v_h**2).sum())
        # d_t_h_norm = tt.sqrt((d_t_h**2).sum(axis=1))
        # host_targets_orientation = tt.dot(d_t_h, v_h) / (denominator + 1e-3)
        # in_front_targets = tt.nnet.sigmoid(5 * host_targets_orientation)
        # close_targets = tt.sum(tt.abs_(d_t_h))
        # cost_accident = tt.mean(in_front_targets * close_targets)
        cost_accident = tt.sum(tt.nnet.relu(self.require_distance - h_t_dists))

        # 3. rail divergence
        cost_right_rail = _dist_from_rail(
            x_h, self.right_rail_center, self.right_rail_radius) * turn_vec_h[0]
        cost_front_rail = (x_h[0] - self.lw / 2)**2 * turn_vec_h[1]
        cost_left_rail = _dist_from_rail(
            x_h, self.left_rail_center, self.left_rail_radius) * turn_vec_h[2]
        cost_rail = cost_right_rail + cost_left_rail + cost_front_rail

        return (x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t,
                cost_steer, cost_accel, cost_progress, cost_accident,
                cost_rail, a_h), t.scan_module.until(dist_from_goal < 0.001)

    [x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t,
     costs_steer, costs_accel, costs_progress, costs_accident,
     costs_rail, a_hs], scan_updates = t.scan(
        fn=_recurrence,
        sequences=time_steps,
        outputs_info=[x_h_0, v_h_0, 0., 0., t_h_0, x_t_0, v_t_0, a_t_0,
                      t_t_0, None, None, None, None, None, None],
        non_sequences=[exist, is_leader, x_goal, turn_vec_h, turn_vec_t],
        n_steps=n_steps,
        name='scan_func')

    # 3. right of way cost term
    T = x_h.shape[0]

    x_h_rpt_1 = tt.repeat(x_h, T, axis=1)              # (Tx2T)
    x_h_rpt_1_3d = x_h_rpt_1.dimshuffle(0, 1, 'x')     # (Tx2Tx1)
    x_h_3D = tt.repeat(x_h_rpt_1_3d, 3, axis=2)        # (Tx2Tx3)

    x_t_rshp_1 = tt.zeros(shape=(2 * T, 3), dtype=np.float32)    # (2Tx3)
    x_t_rshp_1_x = tt.set_subtensor(x_t_rshp_1[:T, :], x_t[:, :, 0])
    x_t_rshp_1_xy = tt.set_subtensor(x_t_rshp_1_x[T:, :], x_t[:, :, 1])
    x_t_rshp_1_3d = x_t_rshp_1_xy.dimshuffle(0, 1, 'x')          # (2Tx3x1)
    x_t_rpt_2_3d = tt.repeat(x_t_rshp_1_3d, T, axis=2)           # (2Tx3xT)
    x_t_3D = x_t_rpt_2_3d.dimshuffle(2, 0, 1)                    # (Tx2Tx3)

    # abs_diff_mat = tt.abs_(x_h_3D - x_t_3D)          # (Tx2Tx3)
    abs_diff_mat = (x_h_3D - x_t_3D)**2                # (Tx2Tx3)
    dists_mat = abs_diff_mat[:, :T, :] + abs_diff_mat[:, T:, :]  # d_x+d_y: (TxTx3)

    # punish only when cutting a leader
    host_effective_dists = (tt.triu(dists_mat[:, :, 0]) * is_leader[0] +
                            tt.triu(dists_mat[:, :, 1]) * is_leader[1] +
                            tt.triu(dists_mat[:, :, 2]) * is_leader[2])

    costs_row = tt.mean(tt.nnet.sigmoid(self.eps_row - host_effective_dists))

    self.cost_steer = tt.mean(costs_steer)
    self.cost_accel = tt.mean(costs_accel)
    self.cost_progress = tt.mean(costs_progress)
    self.cost_accident = tt.mean(costs_accident)
    self.cost_row = tt.mean(costs_row)
    self.cost_rail = tt.mean(costs_rail)

    self.weighted_cost = (
        self.w_delta_steer * self.cost_steer +
        self.w_accel * self.cost_accel +
        self.w_progress * self.cost_progress +
        self.w_accident * self.cost_accident +
        # self.w_row * self.cost_row
        self.w_rail * self.cost_rail)

    self.cost = (
        self.cost_steer +
        self.cost_accel +
        self.cost_progress +
        self.cost_accident +
        # self.cost_row
        self.cost_rail)

    objective = self.weighted_cost

    objective = common.weight_decay(objective=objective,
                                    params=self.params,
                                    l1_weight=self.l1_weight)

    objective = t.gradient.grad_clip(objective, -self.grad_clip_val,
                                     self.grad_clip_val)

    gradients = tt.grad(objective, self.params)

    self.updates = optimizers.optimizer(lr=lr, param_struct=self,
                                        gradients=gradients,
                                        solver_params=solver_params)

    self.x_h = x_h
    self.v_h = v_h
    self.x_t = x_t
    self.v_t = v_t

    self.max_a = tt.max(abs(a_hs))

    self.max_grad_val = 0
    self.grad_mean = 0
    for g in gradients:
        self.grad_mean += tt.mean(tt.abs_(g))
        self.max_grad_val = (tt.max(g) > self.max_grad_val) * tt.max(g) + \
            (tt.max(g) <= self.max_grad_val) * self.max_grad_val

    self.params_abs_norm = self._calc_params_norm()
def grad(self, inputs, g_outputs):
    r"""The gradient function should return

    .. math:: \sum_n\left(W_n\frac{\partial\,w_n}{\partial a_{ij}} +
              \sum_k V_{nk}\frac{\partial\,v_{nk}}{\partial a_{ij}}\right),

    where [:math:`W`, :math:`V`] corresponds to ``g_outputs``,
    :math:`a` to ``inputs``, and :math:`(w, v)=\mbox{eig}(a)`.

    Analytic formulae for eigensystem gradients are well-known in
    perturbation theory:

    .. math:: \frac{\partial\,w_n}{\partial a_{ij}} = v_{in}\,v_{jn}

    .. math:: \frac{\partial\,v_{kn}}{\partial a_{ij}} =
              \sum_{m\ne n}\frac{v_{km}v_{jn}}{w_n-w_m}

    Code derived from theano.nlinalg.Eigh and doi=10.1.1.192.9105
    """
    x, = inputs
    w, vr, vj = self(x)
    # Replace gradients wrt disconnected variables with
    # zeros. This is a work-around for issue #1063.
    W, Vr, Vj = _zero_disconnected([w, vr, vj], g_outputs)

    # # complex version
    # v = vr + 1j*vj
    # V = Vr + 1j*Vj
    # N = x.shape[0]
    # gW = T.tensordot(T.conj(v), v*W[numpy.newaxis, :], (1, 1))  # W part
    # vv = T.conj(v[:, :, numpy.newaxis, numpy.newaxis]) \
    #     * v[numpy.newaxis, numpy.newaxis, :, :]
    # minusww = -w[:, numpy.newaxis] + w[numpy.newaxis, :]
    # minuswwinv = 1/(minusww + T.eye(N))
    # minuswwinv = T.triu(minuswwinv, 1) + T.tril(minuswwinv, -1)  # remove diagonal
    # c = (vv*minuswwinv[numpy.newaxis, :, numpy.newaxis, :]).dimshuffle((1, 3, 0, 2))
    # vc = T.tensordot(v, c, (1, 0))
    # gV = T.tensordot(T.conj(V), vc, ((0, 1), (0, 1)))
    # g = gW + gV
    # g = T.imag(g)

    # real version
    v = vr + 1j * vj
    V = Vr + 1j * Vj
    N = x.shape[0]

    # W part
    gWr = (T.tensordot(vr, vr * W[numpy.newaxis, :], (1, 1)) +
           T.tensordot(vj, vj * W[numpy.newaxis, :], (1, 1)))
    gWj = (T.tensordot(vr, vj * W[numpy.newaxis, :], (1, 1)) -
           T.tensordot(vj, vr * W[numpy.newaxis, :], (1, 1)))

    # V part
    vvr = (vr[:, :, numpy.newaxis, numpy.newaxis] *
           vr[numpy.newaxis, numpy.newaxis, :, :] +
           vj[:, :, numpy.newaxis, numpy.newaxis] *
           vj[numpy.newaxis, numpy.newaxis, :, :])
    vvj = (vr[:, :, numpy.newaxis, numpy.newaxis] *
           vj[numpy.newaxis, numpy.newaxis, :, :] -
           vj[:, :, numpy.newaxis, numpy.newaxis] *
           vr[numpy.newaxis, numpy.newaxis, :, :])
    minusww = -w[:, numpy.newaxis] + w[numpy.newaxis, :]
    minuswwinv = 1 / (minusww + T.eye(N))
    minuswwinv = T.triu(minuswwinv, 1) + T.tril(minuswwinv, -1)  # remove diagonal
    cr = (vvr * minuswwinv[numpy.newaxis, :, numpy.newaxis, :]).dimshuffle(
        (1, 3, 0, 2))
    cj = (vvj * minuswwinv[numpy.newaxis, :, numpy.newaxis, :]).dimshuffle(
        (1, 3, 0, 2))
    vcr = (T.tensordot(vr, cr, (1, 0)) - T.tensordot(vj, cj, (1, 0)))
    vcj = (T.tensordot(vr, cj, (1, 0)) + T.tensordot(vj, cr, (1, 0)))
    gVr = (T.tensordot(Vr, vcr, ((0, 1), (0, 1))) +
           T.tensordot(Vj, vcj, ((0, 1), (0, 1))))
    gVj = (T.tensordot(Vr, vcj, ((0, 1), (0, 1))) -
           T.tensordot(Vj, vcr, ((0, 1), (0, 1))))

    g = gWj + gVj
    res = (g.T - g) / 2
    return [res]
def __init__(self, rng, input, n_in, n_batch, d_bucket,
             activation, activation_deriv,
             w=None, index_permute=None, index_permute_reverse=None):

    srng = RandomStreams(seed=234)
    n_bucket = n_in / d_bucket + 1
    self.input = input

    # randomly permute input space
    if index_permute is None:
        index_permute = srng.permutation(n=n_in)
        index_permute_reverse = T.argsort(index_permute)
    self.index_permute = index_permute
    self.index_permute_reverse = index_permute_reverse

    permuted_input = input[:, index_permute]
    self.permuted_input = permuted_input

    # initialize reflection parameters
    if w is None:
        bound = numpy.sqrt(3. / d_bucket)
        w_values = numpy.asarray(
            rng.uniform(low=-bound, high=bound,
                        size=(n_bucket, d_bucket, d_bucket)),
            dtype=theano.config.floatX)
        w = theano.shared(value=w_values, name='w')
    self.w = w

    # compute outputs and Jacobians, bucket by bucket
    log_jacobian = T.alloc(0, n_batch)
    for b in xrange(n_bucket):
        bucket_size = d_bucket
        if b == n_bucket - 1:
            bucket_size = n_in - b * d_bucket

        x_b = self.permuted_input[:, b*d_bucket:b*d_bucket + bucket_size]
        w_b = self.w[b, :bucket_size, :bucket_size]

        # LU parameterization: W = U * L with unit-diagonal L, so the
        # log-determinant is the sum of log|diag(U)|
        Upper = T.triu(w_b)
        Lower = T.tril(w_b)
        Lower = T.extra_ops.fill_diagonal(Lower, 1.)
        log_det_Upper = T.log(T.abs_(T.nlinalg.ExtractDiag()(Upper))).sum()

        W = T.dot(Upper, Lower)
        log_jacobian = log_jacobian + T.alloc(log_det_Upper, n_batch)

        lin_output_b = T.dot(x_b, W)
        if b > 0:
            lin_output = T.concatenate([lin_output, lin_output_b], axis=1)
        else:
            lin_output = lin_output_b

        if activation is not None:
            derivs = activation_deriv(lin_output_b)
            log_jacobian = log_jacobian + T.log(T.abs_(derivs)).sum(axis=1)

    self.log_jacobian = log_jacobian

    self.output = (
        lin_output if activation is None
        else activation(lin_output)
    )
    self.params = [w]
def predict_symbolic(self, mx, Sx, unroll_scan=False):
    idims = self.D
    odims = self.E
    Ms = self.sr.shape[1]
    sf2M = (self.hyp[:, idims]**2) / tt.cast(Ms, floatX)
    sn2 = self.hyp[:, idims+1]**2

    # TODO this should just fallback to the method from the SSGP class
    if Sx is None:
        # first check if we received a vector [D] or a matrix [nxD]
        if mx.ndim == 1:
            mx = mx[None, :]

        srdotx = self.sr.dot(self.X.T).transpose(0, 2, 1)
        phi_x = tt.concatenate([tt.sin(srdotx), tt.cos(srdotx)], 2)
        M = (phi_x * self.beta_ss[:, None, :]).sum(-1)
        phi_x_L = tt.stack([
            solve_lower_triangular(self.Lmm[i], phi_x[i].T)
            for i in range(odims)])
        S = sn2[:, None]*(1 + (sf2M[:, None])*(phi_x_L**2).sum(-2)) + 1e-6
        return M, S

    # precompute some variables
    srdotx = self.sr.dot(mx)
    srdotSx = self.sr.dot(Sx)
    srdotSxdotsr = tt.sum(srdotSx * self.sr, 2)
    e = tt.exp(-0.5 * srdotSxdotsr)
    cos_srdotx = tt.cos(srdotx)
    sin_srdotx = tt.sin(srdotx)
    cos_srdotx_e = cos_srdotx * e
    sin_srdotx_e = sin_srdotx * e

    # compute the mean vector
    mphi = tt.horizontal_stack(sin_srdotx_e, cos_srdotx_e)  # E x 2*Ms
    M = tt.sum(mphi * self.beta_ss, 1)

    # input output covariance
    mx_c = mx.dimshuffle(0, 'x')
    sin_srdotx_e_r = sin_srdotx_e.dimshuffle(0, 'x', 1)
    cos_srdotx_e_r = cos_srdotx_e.dimshuffle(0, 'x', 1)
    srdotSx_tr = srdotSx.transpose(0, 2, 1)
    c = tt.concatenate([mx_c*sin_srdotx_e_r + srdotSx_tr*cos_srdotx_e_r,
                        mx_c*cos_srdotx_e_r - srdotSx_tr*sin_srdotx_e_r],
                       axis=2)  # E x D x 2*Ms
    beta_ss_r = self.beta_ss.dimshuffle(0, 'x', 1)

    # input output covariance (notice this is not premultiplied by the
    # input covariance inverse)
    V = tt.sum(c * beta_ss_r, 2).T - tt.outer(mx, M)

    srdotSxdotsr_c = srdotSxdotsr.dimshuffle(0, 1, 'x')
    srdotSxdotsr_r = srdotSxdotsr.dimshuffle(0, 'x', 1)

    M2 = tt.zeros((odims, odims))

    # initialize indices
    triu_indices = np.triu_indices(odims)
    indices = [tt.as_index_variable(idx) for idx in triu_indices]

    def second_moments(i, j, M2, beta, iA, sn2, sf2M, sr, srdotSx,
                       srdotSxdotsr_c, srdotSxdotsr_r,
                       sin_srdotx, cos_srdotx, *args):
        # compute the second moments of the spectrum feature vectors
        siSxsj = srdotSx[i].dot(sr[j].T)  # Ms x Ms
        sijSxsij = -0.5*(srdotSxdotsr_c[i] + srdotSxdotsr_r[j])
        em = tt.exp(sijSxsij + siSxsj)    # MsxMs
        ep = tt.exp(sijSxsij - siSxsj)    # MsxMs
        si = sin_srdotx[i]                # Msx1
        ci = cos_srdotx[i]                # Msx1
        sj = sin_srdotx[j]                # Msx1
        cj = cos_srdotx[j]                # Msx1
        sicj = tt.outer(si, cj)           # MsxMs
        cisj = tt.outer(ci, sj)           # MsxMs
        sisj = tt.outer(si, sj)           # MsxMs
        cicj = tt.outer(ci, cj)           # MsxMs
        sm = (sicj - cisj)*em
        sp = (sicj + cisj)*ep
        cm = (sisj + cicj)*em
        cp = (cicj - sisj)*ep

        # Populate the second moment matrix of the feature vector
        Q_up = tt.concatenate([cm - cp, sm + sp], axis=1)
        Q_lo = tt.concatenate([sp - sm, cm + cp], axis=1)
        Q = tt.concatenate([Q_up, Q_lo], axis=0)

        # Compute the second moment of the output
        m2 = 0.5*matrix_dot(beta[i], Q, beta[j].T)
        m2 = theano.ifelse.ifelse(
            tt.eq(i, j),
            m2 + sn2[i]*(1.0 + sf2M[i]*tt.sum(self.iA[i]*Q)) + 1e-6,
            m2)
        M2 = tt.set_subtensor(M2[i, j], m2)
        return M2

    nseq = [self.beta_ss, self.iA, sn2, sf2M, self.sr, srdotSx,
            srdotSxdotsr_c, srdotSxdotsr_r, sin_srdotx, cos_srdotx,
            self.Lmm]

    if unroll_scan:
        from lasagne.utils import unroll_scan
        [M2_] = unroll_scan(second_moments, indices, [M2], nseq,
                            len(triu_indices[0]))
        updts = {}
    else:
        M2_, updts = theano.scan(fn=second_moments,
                                 sequences=indices,
                                 outputs_info=[M2],
                                 non_sequences=nseq,
                                 allow_gc=False,
                                 name="%s>M2_scan" % (self.name))
    M2 = M2_[-1]
    M2 = M2 + tt.triu(M2, k=1).T
    S = M2 - tt.outer(M, M)

    return M, S, V
def run_model(index, in_dir, out_dir, data_filename, func_filename,
              struct_filename, dist_filename, n, sample_size, tune_size):
    """
    index: data
    in_dir: set up work directory
    out_dir: save the trace as csv in the out directory
    data_filename: filename for time series data
    func_filename: filename for functional connectivity
    struct_filename: filename for structural connectivity
    dist_filename: filename for distance matrix of n ROIs
    n: ROI number
    sample_size: NUTS number
    tune_size: burn-in number
    """
    os.chdir(in_dir + str(index))
    Y = get_data(data_filename)
    mFunc = get_func(func_filename, n)
    Struct = get_struct(struct_filename, n)
    Dist = get_dist(dist_filename, n)

    m = Dist[0].shape[0]
    k = Y.shape[1]
    n_vec = n * (n + 1) // 2
    Y_mean = []
    for i in range(n):
        Y_mean.append(np.mean(Y[i * m:(i + 1) * m, 0]))
    Y_mean = np.array(Y_mean)

    with pm.Model() as model_generator:
        # covariance matrix
        log_Sig = pm.Uniform("log_Sig", -8, 8, shape=(n, ))
        SQ = tt.diag(tt.sqrt(tt.exp(log_Sig)))
        Func_Covm = tt.dot(tt.dot(SQ, mFunc), SQ)
        Struct_Convm = tt.dot(tt.dot(SQ, Struct), SQ)

        # double fusion of structural and functional connectivity
        L_fc_vec = tt.reshape(
            tt.slinalg.cholesky(tt.squeeze(Func_Covm)).T[np.triu_indices(n)],
            (n_vec, ))
        L_st_vec = tt.reshape(
            tt.slinalg.cholesky(tt.squeeze(Struct_Convm)).T[np.triu_indices(n)],
            (n_vec, ))
        Struct_vec = tt.reshape(Struct[np.triu_indices(n)], (n_vec, ))
        lambdaw = pm.Beta("lambdaw", alpha=1, beta=1, shape=(n_vec, ))
        Kf = pm.Beta("Kf", alpha=1, beta=1, shape=(n_vec, ))
        rhonn = Kf*((1-lambdaw)*L_fc_vec + lambdaw*L_st_vec) + \
            (1-Kf)*((1-Struct_vec*lambdaw)*L_fc_vec +
                    Struct_vec*lambdaw*L_st_vec)

        # correlation
        Cov_temp = tt.triu(tt.ones((n, n)))
        Cov_temp = tt.set_subtensor(Cov_temp[np.triu_indices(n)], rhonn)
        Cov_mat_v = tt.dot(Cov_temp.T, Cov_temp)
        d = tt.sqrt(tt.diagonal(Cov_mat_v))
        rho = (Cov_mat_v.T / d).T / d
        rhoNew = pm.Deterministic("rhoNew", rho[np.triu_indices(n, 1)])

        # temporal correlation AR(1)
        phi_T = pm.Uniform("phi_T", 0, 1, shape=(n, ))
        sigW_T = pm.Uniform("sigW_T", 0, 100, shape=(n, ))
        B = pm.Normal("B", 0, 0.01, shape=(n, ))
        muW1 = Y_mean - B                                      # shifted mean
        mean_overall = muW1 / (1.0 - phi_T)                    # AR(1) mean
        tau_overall = (1.0 - tt.sqr(phi_T)) / tt.sqr(sigW_T)   # AR(1) precision
        W_T = pm.MvNormal("W_T", mu=mean_overall,
                          tau=tt.diag(tau_overall), shape=(k, n))

        # add all parts together
        one_m_vec = tt.ones((m, 1))
        one_k_vec = tt.ones((1, k))
        D = pm.MvNormal("D", mu=tt.zeros(n), cov=Cov_mat_v, shape=(n, ))
        phi_s = pm.Uniform("phi_s", 0, 20, shape=(n, ))
        spat_prec = pm.Uniform("spat_prec", 0, 100, shape=(n, ))
        H_base = pm.Normal("H_base", 0, 1, shape=(m, n))

        Mu_all_temp = []
        for i in range(n):
            # exponential covariance function
            H_temp = tt.sqr(spat_prec[i]) * tt.exp(-phi_s[i] * Dist[i])
            L_H_temp = tt.slinalg.cholesky(H_temp)
            Mu_all_temp.append(
                B[i] + D[i] + one_m_vec * W_T[:, i] +
                tt.dot(L_H_temp, tt.reshape(H_base[:, i], (m, 1))) * one_k_vec)
        MU_all = tt.concatenate(Mu_all_temp, axis=0)

        sigma_error_prec = pm.Uniform("sigma_error_prec", 0, 100)
        Y1 = pm.Normal("Y1", mu=MU_all, sd=sigma_error_prec, observed=Y)

    with model_generator:
        step = pm.NUTS()
        trace = pm.sample(sample_size, step=step, tune=tune_size, chains=1)

    # save as pandas format and output the csv file
    save_trace = pm.trace_to_dataframe(trace)
    save_trace.to_csv(out_dir + date.today().strftime("%m_%d_%y") +
                      "_sample_size_" + str(sample_size) +
                      "_index_" + str(index) + ".csv")
def compile_theano():
    """
    This function generates theano compiled kernels for energy and force learning.

    The position of the atoms relative to the central one, and their
    chemical species, are defined by a matrix of dimension Mx5.

    Returns:
        km_ee (func): energy-energy kernel
        km_ef (func): energy-force kernel
        km_ff (func): force-force kernel
    """
    if not (os.path.exists(Mffpath / 'k3_ee_m.pickle') and
            os.path.exists(Mffpath / 'k3_ef_m.pickle') and
            os.path.exists(Mffpath / 'k3_ff_m.pickle')):
        print("Building Kernels")

        import theano.tensor as T
        from theano import function, scan
        logger.info("Started compilation of theano three body kernels")

        # --------------------------------------------------
        # INITIAL DEFINITIONS
        # --------------------------------------------------

        # positions of central atoms
        r1, r2 = T.dvectors('r1d', 'r2d')

        # positions of neighbours
        rho1, rho2 = T.dmatrices('rho1', 'rho2')

        # hyperparameter
        sig = T.dscalar('sig')

        # cutoff hyperparameters
        theta = T.dscalar('theta')
        rc = T.dscalar('rc')

        # positions of neighbours without chemical species
        rho1s = rho1[:, 0:3]
        rho2s = rho2[:, 0:3]

        alpha_1 = rho1[:, 3].flatten()
        alpha_2 = rho2[:, 3].flatten()

        alpha_j = rho1[:, 4].flatten()
        alpha_m = rho2[:, 4].flatten()

        alpha_k = rho1[:, 4].flatten()
        alpha_n = rho2[:, 4].flatten()

        # --------------------------------------------------
        # RELATIVE DISTANCES TO CENTRAL VECTOR AND BETWEEN NEIGHBOURS
        # --------------------------------------------------

        # first and second configuration
        r1j = T.sqrt(T.sum((rho1s[:, :] - r1[None, :])**2, axis=1))
        r2m = T.sqrt(T.sum((rho2s[:, :] - r2[None, :])**2, axis=1))
        rjk = T.sqrt(T.sum((rho1s[None, :, :] - rho1s[:, None, :])**2, axis=2))
        rmn = T.sqrt(T.sum((rho2s[None, :, :] - rho2s[:, None, :])**2, axis=2))

        # --------------------------------------------------
        # CHEMICAL SPECIES MASK
        # --------------------------------------------------

        # numerical kronecker
        def delta_alpha2(a1j, a2m):
            d = np.exp(-(a1j - a2m)**2 / (2 * 0.00001**2))
            return d

        # permutation 1
        delta_alphas12 = delta_alpha2(alpha_1[0], alpha_2[0])
        delta_alphasjm = delta_alpha2(alpha_j[:, None], alpha_m[None, :])
        delta_alphas_jmkn = (delta_alphasjm[:, None, :, None] *
                             delta_alphasjm[None, :, None, :])
        delta_perm1 = delta_alphas12 * delta_alphas_jmkn

        # permutation 3
        delta_alphas1m = delta_alpha2(alpha_1[0, None], alpha_m[None, :]).flatten()
        delta_alphasjn = delta_alpha2(alpha_j[:, None], alpha_n[None, :])
        delta_alphask2 = delta_alpha2(alpha_k[:, None], alpha_2[None, 0]).flatten()
        delta_perm3 = (delta_alphas1m[None, None, :, None] *
                       delta_alphasjn[:, None, None, :] *
                       delta_alphask2[None, :, None, None])

        # permutation 5
        delta_alphas1n = delta_alpha2(alpha_1[0, None], alpha_n[None, :]).flatten()
        delta_alphasj2 = delta_alpha2(alpha_j[:, None], alpha_2[None, 0]).flatten()
        delta_alphaskm = delta_alpha2(alpha_k[:, None], alpha_m[None, :])
        delta_perm5 = (delta_alphas1n[None, None, None, :] *
                       delta_alphaskm[None, :, :, None] *
                       delta_alphasj2[:, None, None, None])

        # --------------------------------------------------
        # BUILD THE KERNEL
        # --------------------------------------------------

        # Squared exp of differences
        se_1j2m = T.exp(-(r1j[:, None] - r2m[None, :])**2 / (2 * sig**2))
        se_jkmn = T.exp(-(rjk[:, :, None, None] -
                          rmn[None, None, :, :])**2 / (2 * sig**2))
        se_jk2m = T.exp(-(rjk[:, :, None] - r2m[None, None, :])**2 / (2 * sig**2))
        se_1jmn = T.exp(-(r1j[:, None, None] - rmn[None, :, :])**2 / (2 * sig**2))

        # Kernel not summed (cyclic permutations)
        k1n = (se_1j2m[:, None, :, None] * se_1j2m[None, :, None, :] * se_jkmn)
        k2n = (se_1jmn[:, None, :, :] * se_jk2m[:, :, None, :] *
               se_1j2m[None, :, :, None])
        k3n = (se_1j2m[:, None, None, :] * se_jk2m[:, :, :, None] *
               se_1jmn[None, :, :, :])

        # final shape is M1 M1 M2 M2
        ker_loc = k1n * delta_perm1 + k2n * delta_perm3 + k3n * delta_perm5

        # Faster version of cutoff (less calculations)
        cut_j = 0.5 * (1 + T.cos(np.pi * r1j / rc))
        cut_m = 0.5 * (1 + T.cos(np.pi * r2m / rc))
        cut_jk = (cut_j[:, None] * cut_j[None, :] *
                  0.5 * (1 + T.cos(np.pi * rjk / rc)))
        cut_mn = (cut_m[:, None] * cut_m[None, :] *
                  0.5 * (1 + T.cos(np.pi * rmn / rc)))

        # --------------------------------------------------
        # REMOVE DIAGONAL ELEMENTS
        # --------------------------------------------------

        # remove diagonal elements AND lower triangular ones from first configuration
        mask_jk = T.triu(T.ones_like(rjk)) - T.identity_like(rjk)

        # remove diagonal elements from second configuration
        mask_mn = T.ones_like(rmn) - T.identity_like(rmn)

        # Combine masks
        mask_jkmn = mask_jk[:, :, None, None] * mask_mn[None, None, :, :]

        # Apply mask and then apply cutoff functions
        ker_loc = ker_loc * mask_jkmn
        ker_loc = T.sum(ker_loc * cut_jk[:, :, None, None] *
                        cut_mn[None, None, :, :])

        ker_loc = T.exp(ker_loc / 20)

        # --------------------------------------------------
        # FINAL FUNCTIONS
        # --------------------------------------------------

        # energy energy kernel
        k_ee_fun = function([r1, r2, rho1, rho2, sig, theta, rc],
                            ker_loc, on_unused_input='ignore')

        # energy force kernel
        k_ef_cut = T.grad(ker_loc, r2)
        k_ef_fun = function([r1, r2, rho1, rho2, sig, theta, rc],
                            k_ef_cut, on_unused_input='ignore')

        # force force kernel
        k_ff_cut = T.grad(ker_loc, r1)
        k_ff_cut_der, updates = scan(
            lambda j, k_ff_cut, r2: T.grad(k_ff_cut[j], r2),
            sequences=T.arange(k_ff_cut.shape[0]),
            non_sequences=[k_ff_cut, r2])
        k_ff_fun = function([r1, r2, rho1, rho2, sig, theta, rc],
                            k_ff_cut_der, on_unused_input='ignore')

        # Save the compiled functions so that worker processes can load them
        # directly instead of each re-compiling the theano graphs.
        with open(Mffpath / 'k3_ee_m.pickle', 'wb') as f:
            pickle.dump(k_ee_fun, f)
        with open(Mffpath / 'k3_ef_m.pickle', 'wb') as f:
            pickle.dump(k_ef_fun, f)
        with open(Mffpath / 'k3_ff_m.pickle', 'wb') as f:
            pickle.dump(k_ff_fun, f)
    else:
        print("Loading Kernels")
        with open(Mffpath / "k3_ee_m.pickle", 'rb') as f:
            k_ee_fun = pickle.load(f)
        with open(Mffpath / "k3_ef_m.pickle", 'rb') as f:
            k_ef_fun = pickle.load(f)
        with open(Mffpath / "k3_ff_m.pickle", 'rb') as f:
            k_ff_fun = pickle.load(f)

    # WRAPPERS (we don't want to plug the position of the central element every time)

    def km_ee(conf1, conf2, sig, theta, rc):
        """
        Many body kernel for energy-energy correlation

        Args:
            conf1 (array): first configuration.
            conf2 (array): second configuration.
            sig (float): lengthscale hyperparameter theta[0]
            theta (float): cutoff decay rate hyperparameter theta[1]
            rc (float): cutoff distance hyperparameter theta[2]

        Returns:
            kernel (float): scalar valued energy-energy many-body kernel
        """
        return k_ee_fun(np.zeros(3), np.zeros(3), conf1, conf2, sig, theta, rc)

    def km_ef(conf1, conf2, sig, theta, rc):
        """
        Many body kernel for energy-force correlation

        Args:
            conf1 (array): first configuration.
            conf2 (array): second configuration.
            sig (float): lengthscale hyperparameter theta[0]
            theta (float): cutoff decay rate hyperparameter theta[1]
            rc (float): cutoff distance hyperparameter theta[2]

        Returns:
            kernel (array): 3x1 energy-force many-body kernel
        """
        return -k_ef_fun(np.zeros(3), np.zeros(3), conf1, conf2, sig, theta, rc)

    def km_ff(conf1, conf2, sig, theta, rc):
        """
        Many body kernel for force-force correlation

        Args:
            conf1 (array): first configuration.
            conf2 (array): second configuration.
            sig (float): lengthscale hyperparameter theta[0]
            theta (float): cutoff decay rate hyperparameter theta[1]
            rc (float): cutoff distance hyperparameter theta[2]

        Returns:
            kernel (matrix): 3x3 force-force many-body kernel
        """
        return k_ff_fun(np.zeros(3), np.zeros(3), conf1, conf2, sig, theta, rc)

    logger.info("Ended compilation of theano many body kernels")

    return km_ee, km_ef, km_ff
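# The pair mask used above, in numpy (illustrative): triu(ones) minus the
# identity keeps each unordered neighbour pair exactly once, with no
# self-pairs and no double counting.
import numpy as np

M = 4                                    # number of neighbours
rjk = np.random.RandomState(0).rand(M, M)

mask_jk = np.triu(np.ones_like(rjk)) - np.identity(M)
print(int(mask_jk.sum()))                # M*(M-1)/2 = 6 surviving pairs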
def predict_symbolic(self, mx, Sx, unroll_scan=False):
    idims = self.D
    odims = self.E

    # centralize inputs
    zeta = self.X - mx

    # initialize some variables
    sf2 = self.hyp[:, idims]**2
    eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
    lscales = self.hyp[:, :idims]
    iL = eyeE / lscales.dimshuffle(0, 1, 'x')

    # predictive mean
    inp = iL.dot(zeta.T).transpose(0, 2, 1)
    iLdotSx = iL.dot(Sx)
    # TODO vectorize this
    B = (iLdotSx[:, :, None, :] * iL[:, None, :, :]).sum(-1) + tt.eye(idims)
    t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
    c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
    l = tt.exp(-0.5 * tt.sum(inp * t, 2))
    lb = l * self.beta  # E x N dot E x N
    M = tt.sum(lb, 1) * c

    # input output covariance
    tiL = (t[:, :, None, :] * iL[:, None, :, :]).sum(-1)
    # tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
    V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T * c

    # predictive covariance
    logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2)
    logk_r = logk.dimshuffle(0, 'x', 1)
    logk_c = logk.dimshuffle(0, 1, 'x')
    Lambda = tt.square(iL)
    LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
    R = tt.dot(LL, Sx).transpose(0, 1, 3, 2) + tt.eye(idims)
    z_ = Lambda.dot(zeta.T).transpose(0, 2, 1)

    M2 = tt.zeros((odims, odims))

    # initialize indices
    triu_indices = np.triu_indices(odims)
    indices = [tt.as_index_variable(idx) for idx in triu_indices]

    def second_moments(i, j, M2, beta, iK, sf2, R, logk_c, logk_r,
                       z_, Sx, *args):
        # This comes from Deisenroth's thesis ( Eqs 2.51- 2.54 )
        Rij = R[i, j]
        n2 = logk_c[i] + logk_r[j]
        n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))
        Q = tt.exp(n2) / tt.sqrt(det(Rij))

        # Eq 2.55
        m2 = matrix_dot(beta[i], Q, beta[j])
        m2 = theano.ifelse.ifelse(tt.eq(i, j),
                                  m2 - tt.sum(iK[i] * Q) + sf2[i],
                                  m2)
        M2 = tt.set_subtensor(M2[i, j], m2)
        return M2

    nseq = [self.beta, self.iK, sf2, R, logk_c, logk_r, z_, Sx, self.L]

    if unroll_scan:
        from lasagne.utils import unroll_scan
        [M2_] = unroll_scan(second_moments, indices, [M2], nseq,
                            len(triu_indices[0]))
        updts = {}
    else:
        M2_, updts = theano.scan(fn=second_moments,
                                 sequences=indices,
                                 outputs_info=[M2],
                                 non_sequences=nseq,
                                 allow_gc=False,
                                 strict=True,
                                 name="%s>M2_scan" % (self.name))
    M2 = M2_[-1]
    M2 = M2 + tt.triu(M2, k=1).T
    S = M2 - tt.outer(M, M)

    return M, S, V
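# The scan above fills only the upper triangle (i <= j) of M2; numpy sketch
# of the final mirroring step that recovers the full symmetric matrix:
import numpy as np

M2 = np.triu(np.arange(1.0, 10.0).reshape(3, 3))   # only i <= j entries filled
full = M2 + np.triu(M2, 1).T                       # mirror the strict upper triangle
assert np.allclose(full, full.T)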
def predict_symbolic(self, mx, Sx=None, unroll_scan=False):
    idims = self.D
    odims = self.E

    # initialize some variables
    sf2 = self.hyp[:, idims]**2
    eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
    lscales = self.hyp[:, :idims]
    iL = eyeE / lscales.dimshuffle(0, 1, 'x')

    if Sx is None:
        # first check if we received a vector [D] or a matrix [nxD]
        if mx.ndim == 1:
            mx = mx[None, :]

        # centralize inputs
        zeta = self.X[:, None, :] - mx[None, :, :]

        # predictive mean ( we don't need to do the rest )
        inp = (iL[:, None, :, None, :] * zeta[:, None, :, :]).sum(2)
        l = tt.exp(-0.5 * tt.sum(inp**2, -1))
        lb = l * self.beta[:, :, None]  # E x N
        M = tt.sum(lb, 1).T * sf2

        # apply saturating function to the output if available
        if self.sat_func is not None:
            # saturate the output
            M = self.sat_func(M)

        return M

    # centralize inputs
    zeta = self.X - mx

    # predictive mean
    inp = iL.dot(zeta.T).transpose(0, 2, 1)
    iLdotSx = iL.dot(Sx)
    B = (iLdotSx[:, :, None, :] * iL[:, None, :, :]).sum(-1) + tt.eye(idims)
    t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
    c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
    l = tt.exp(-0.5 * tt.sum(inp * t, 2))
    lb = l * self.beta
    M = tt.sum(lb, 1) * c

    # input output covariance
    tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
    V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T * c

    # predictive covariance
    logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2)
    logk_r = logk.dimshuffle(0, 'x', 1)
    logk_c = logk.dimshuffle(0, 1, 'x')
    Lambda = tt.square(iL)
    LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
    R = tt.dot(LL, Sx).transpose(0, 1, 3, 2) + tt.eye(idims)
    z_ = Lambda.dot(zeta.T).transpose(0, 2, 1)

    M2 = tt.zeros((odims, odims))

    # initialize indices
    triu_indices = np.triu_indices(odims)
    indices = [tt.as_index_variable(idx) for idx in triu_indices]

    def second_moments(i, j, M2, beta, R, logk_c, logk_r, z_, Sx, *args):
        # This comes from Deisenroth's thesis ( Eqs 2.51- 2.54 )
        Rij = R[i, j]
        n2 = logk_c[i] + logk_r[j]
        n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))
        Q = tt.exp(n2) / tt.sqrt(det(Rij))

        # Eq 2.55
        m2 = matrix_dot(beta[i], Q, beta[j])
        m2 = theano.ifelse.ifelse(tt.eq(i, j), m2 + 1e-6, m2)
        M2 = tt.set_subtensor(M2[i, j], m2)
        return M2

    nseq = [self.beta, R, logk_c, logk_r, z_, Sx, self.iK, self.L]

    if unroll_scan:
        from lasagne.utils import unroll_scan
        [M2_] = unroll_scan(second_moments, indices, [M2], nseq,
                            len(triu_indices[0]))
        updts = {}
    else:
        M2_, updts = theano.scan(fn=second_moments,
                                 sequences=indices,
                                 outputs_info=[M2],
                                 non_sequences=nseq,
                                 allow_gc=False,
                                 strict=True,
                                 name="%s>M2_scan" % (self.name))
    M2 = M2_[-1]
    M2 = M2 + tt.triu(M2, k=1).T
    S = M2 - tt.outer(M, M)

    # apply saturating function to the output if available
    if self.sat_func is not None:
        # saturate the output
        M, S, U = self.sat_func(M, S)
        # compute the joint input output covariance
        V = V.dot(U)

    return M, S, V
def __init__(self, x_h_0, v_h_0, t_h_0, x_t_0, v_t_0, a_t_0, t_t_0, time_steps,
             exist, is_leader, x_goal, turn_vec_h, turn_vec_t, n_steps, lr,
             game_params, arch_params, solver_params, params):

    self._init_layers(params, arch_params, game_params)
    self._connect(game_params, solver_params)

    def _dist_from_rail(pos, rail_center, rail_radius):
        d = tt.sqrt(((pos - rail_center)**2).sum())
        return tt.sum((d - rail_radius)**2)

    def _step_state(x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h,
                    x_t_, v_t_, t_t_, turn_vec_t, ctrl, exist, time_step):
        a_t_e, v_t_e, x_t_e, t_t, t_h = step(x_h_, v_h_, t_h_, turn_vec_h,
                                             x_t_, v_t_, t_t_, turn_vec_t,
                                             exist, time_step)
        t_h = common.disconnected_grad(t_h)
        t_t = common.disconnected_grad(t_t)

        # approximated dynamics of the unobserved parts of the state
        a_t_a = tt.zeros(shape=(3, 2), dtype=np.float32)
        v_t_a = v_t_
        x_t_a = x_t_ + self.dt * v_t_a

        # difference between the environment's predictions and the approximation
        n_v_t = v_t_e - v_t_a
        n_a_t = a_t_e - a_t_a
        n_x_t = x_t_e - x_t_a

        # disconnect the gradient of the noise signals
        n_v_t = common.disconnected_grad(n_v_t)
        n_a_t = common.disconnected_grad(n_a_t)
        n_x_t = common.disconnected_grad(n_x_t)

        # add the noise to the approximation
        a_t = a_t_a + n_a_t
        v_t = v_t_a + n_v_t
        x_t = x_t_a + n_x_t

        # update the observed part of the state
        delta_steer = ctrl[0]
        accel = ctrl[1]
        delta_steer = tt.clip(delta_steer, -np.pi/4, np.pi/4)
        angle = angle_ + delta_steer
        speed = speed_ + accel * self.dt
        speed = tt.clip(speed, 0, self.v_max)
        v_h_x = speed * tt.sin(angle)
        v_h_y = speed * tt.cos(angle)
        v_h = tt.stack([v_h_x, v_h_y])
        x_h = x_h_ + self.dt * v_h
        x_h = tt.clip(x_h, -self.bw, self.bw)

        return x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t

    def _recurrence(time_step, x_h_, v_h_, angle_, speed_, t_h_,
                    x_t_, v_t_, a_t_, t_t_,
                    exist, is_leader, x_goal, turn_vec_h, turn_vec_t):
        '''
        state:
        1. host
           1.1 position (2) - (x, y) coordinates in the cross coordinate system
           1.2 speed (2) - (v_x, v_y)
           # 1.3 acceleration (2) - (a_x, a_y)
           # 1.4 waiting time (1) - start counting on full stop, stop counting when clearing the junction
           1.5 x_goal (2) - destination position (indicates different turns)
           total = 5
        2. right lane car
           2.1 position (2) - null value = (-1, -1)
           2.2 speed (2) - null value = (0, 0)
           2.3 acceleration (2) - null value = (0, 0)
           2.4 waiting time (1) - null value = 0
           total = 7
        3. front lane car
           3.1 position (2)
           3.2 speed (2)
           3.3 acceleration (2)
           3.4 waiting time (1)
           total = 7
        4. target 3
           4.1 position (2)
           4.2 speed (2)
           4.3 acceleration (2)
           4.4 waiting time (1)
           total = 7
        total = 26
        '''
        # host_state_vec = tt.concatenate([x_h_, v_h_, t_h_])
        ang_spd = tt.stack([angle_, speed_])
        host_state_vec = tt.concatenate([x_h_, ang_spd, x_goal])
        # target_state_vec = tt.concatenate([tt.flatten(x_t_), tt.flatten(v_t_), tt.flatten(a_t_), tt.flatten(t_t_)])
        target_state_vec = tt.concatenate([tt.flatten(x_t_), tt.flatten(v_t_),
                                           tt.flatten(a_t_), is_leader])
        state = tt.concatenate([host_state_vec, target_state_vec])

        # three-layer ReLU policy network producing the control signal
        h0 = tt.dot(state, self.W_0) + self.b_0
        relu0 = tt.nnet.relu(h0)
        h1 = tt.dot(relu0, self.W_1) + self.b_1
        relu1 = tt.nnet.relu(h1)
        h2 = tt.dot(relu1, self.W_2) + self.b_2
        relu2 = tt.nnet.relu(h2)
        a_h = tt.dot(relu2, self.W_c)

        x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t = _step_state(
            x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h,
            x_t_, v_t_, t_t_, turn_vec_t, a_h, exist, time_step)

        # costs:
        discount_factor = 0.99**time_step

        # 0. smooth driving policy
        cost_steer = discount_factor * a_h[0]**2
        cost_accel = discount_factor * a_h[1]**2

        # 1. forcing the host to move forward
        dist_from_goal = tt.mean((x_goal - x_h)**2)
        cost_progress = discount_factor * dist_from_goal

        # 2. keeping distance from the vehicles in front
        d_t_h = x_t - x_h
        h_t_dists = (d_t_h**2).sum(axis=1)
        # v_h_norm = tt.sqrt((v_h**2).sum())
        # d_t_h_norm = tt.sqrt((d_t_h**2).sum(axis=1))
        # denominator = v_h_norm * d_t_h_norm
        # host_targets_orientation = tt.dot(d_t_h, v_h) / (denominator + 1e-3)
        # in_front_targets = tt.nnet.sigmoid(5 * host_targets_orientation)
        # close_targets = tt.sum(tt.abs_(d_t_h))
        # cost_accident = tt.mean(in_front_targets * close_targets)
        cost_accident = tt.sum(tt.nnet.relu(self.require_distance - h_t_dists))

        # 3. rail divergence
        cost_right_rail = _dist_from_rail(x_h, self.right_rail_center,
                                          self.right_rail_radius) * turn_vec_h[0]
        cost_front_rail = (x_h[0] - self.lw/2)**2 * turn_vec_h[1]
        cost_left_rail = _dist_from_rail(x_h, self.left_rail_center,
                                         self.left_rail_radius) * turn_vec_h[2]
        cost_rail = cost_right_rail + cost_left_rail + cost_front_rail

        return (x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t,
                cost_steer, cost_accel, cost_progress, cost_accident,
                cost_rail, a_h), t.scan_module.until(dist_from_goal < 0.001)

    [x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t,
     costs_steer, costs_accel, costs_progress, costs_accident, costs_rail,
     a_hs], scan_updates = t.scan(fn=_recurrence,
                                  sequences=time_steps,
                                  outputs_info=[x_h_0, v_h_0, 0., 0., t_h_0,
                                                x_t_0, v_t_0, a_t_0, t_t_0,
                                                None, None, None, None, None, None],
                                  non_sequences=[exist, is_leader, x_goal,
                                                 turn_vec_h, turn_vec_t],
                                  n_steps=n_steps,
                                  name='scan_func')

    # 4. right-of-way cost term: build a (T x T x 3) tensor of squared
    # host-target distances across all pairs of time steps
    T = x_h.shape[0]
    x_h_rpt_1 = tt.repeat(x_h, T, axis=1)                     # (T x 2T)
    x_h_rpt_1_3d = x_h_rpt_1.dimshuffle(0, 1, 'x')            # (T x 2T x 1)
    x_h_3D = tt.repeat(x_h_rpt_1_3d, 3, axis=2)               # (T x 2T x 3)
    x_t_rshp_1 = tt.zeros(shape=(2*T, 3), dtype=np.float32)   # (2T x 3)
    x_t_rshp_1_x = tt.set_subtensor(x_t_rshp_1[:T, :], x_t[:, :, 0])
    x_t_rshp_1_xy = tt.set_subtensor(x_t_rshp_1_x[T:, :], x_t[:, :, 1])
    x_t_rshp_1_3d = x_t_rshp_1_xy.dimshuffle(0, 1, 'x')       # (2T x 3 x 1)
    x_t_rpt_2_3d = tt.repeat(x_t_rshp_1_3d, T, axis=2)        # (2T x 3 x T)
    x_t_3D = x_t_rpt_2_3d.dimshuffle(2, 0, 1)                 # (T x 2T x 3)
    # abs_diff_mat = tt.abs_(x_h_3D - x_t_3D)                 # (T x 2T x 3)
    abs_diff_mat = (x_h_3D - x_t_3D)**2                       # (T x 2T x 3)
    dists_mat = abs_diff_mat[:, :T, :] + abs_diff_mat[:, T:, :]  # d_x + d_y: (T x T x 3)

    # punish only when cutting off a leader
    host_effective_dists = (tt.triu(dists_mat[:, :, 0]) * is_leader[0] +
                            tt.triu(dists_mat[:, :, 1]) * is_leader[1] +
                            tt.triu(dists_mat[:, :, 2]) * is_leader[2])
    costs_row = tt.mean(tt.nnet.sigmoid(self.eps_row - host_effective_dists))

    self.cost_steer = tt.mean(costs_steer)
    self.cost_accel = tt.mean(costs_accel)
    self.cost_progress = tt.mean(costs_progress)
    self.cost_accident = tt.mean(costs_accident)
    self.cost_row = tt.mean(costs_row)
    self.cost_rail = tt.mean(costs_rail)

    self.weighted_cost = (self.w_delta_steer * self.cost_steer +
                          self.w_accel * self.cost_accel +
                          self.w_progress * self.cost_progress +
                          self.w_accident * self.cost_accident +
                          # self.w_row * self.cost_row +
                          self.w_rail * self.cost_rail)

    self.cost = (self.cost_steer +
                 self.cost_accel +
                 self.cost_progress +
                 self.cost_accident +
                 # self.cost_row +
                 self.cost_rail)

    objective = self.weighted_cost
    objective = common.weight_decay(objective=objective, params=self.params,
                                    l1_weight=self.l1_weight)
    objective = t.gradient.grad_clip(objective, -self.grad_clip_val,
                                     self.grad_clip_val)
    gradients = tt.grad(objective, self.params)
    self.updates = optimizers.optimizer(lr=lr, param_struct=self,
                                        gradients=gradients,
                                        solver_params=solver_params)

    self.x_h = x_h
    self.v_h = v_h
    self.x_t = x_t
    self.v_t = v_t
    self.max_a = tt.max(abs(a_hs))

    # running statistics of the gradients, for monitoring
    self.max_grad_val = 0
    self.grad_mean = 0
    for g in gradients:
        self.grad_mean += tt.mean(tt.abs_(g))
        self.max_grad_val = ((tt.max(g) > self.max_grad_val) * tt.max(g) +
                             (tt.max(g) <= self.max_grad_val) * self.max_grad_val)
    self.params_abs_norm = self._calc_params_norm()
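# A minimal sketch of the right-of-way masking idea above, for a single
# target (hypothetical shapes: `host_xy` and `target_xy` are (T, 2)
# trajectories; eps_row = 1.0 is an illustrative threshold). tt.triu keeps
# only the pairs (i, j) with i <= j, i.e. it compares the host at time i
# against the target at the same or later times, which is the
# "cutting off a leader" situation the cost penalizes.
import numpy as np
import theano
import theano.tensor as tt

host_xy = tt.matrix('host_xy')      # (T, 2)
target_xy = tt.matrix('target_xy')  # (T, 2)

diff = host_xy[:, None, :] - target_xy[None, :, :]    # (T, T, 2)
sq_dists = (diff**2).sum(axis=-1)                     # (T, T)
effective = tt.triu(sq_dists)                         # mask pairs with j < i
cost_row = tt.mean(tt.nnet.sigmoid(1.0 - effective))  # small distance -> high cost

f = theano.function([host_xy, target_xy], cost_row)
xs = np.random.randn(5, 2).astype(theano.config.floatX)
ys = np.random.randn(5, 2).astype(theano.config.floatX)
print(f(xs, ys))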
def compile_theano():
    """
    This function generates theano compiled kernels for energy and force learning.
    The positions of the atoms relative to the central one, and their chemical
    species, are defined by a matrix of dimension Mx5.

    Returns:
        k3_ee (func): energy-energy kernel
        k3_ef (func): energy-force kernel
        k3_ff (func): force-force kernel
    """
    if not (os.path.exists(Mffpath / 'k3_ee_s.pickle') and
            os.path.exists(Mffpath / 'k3_ef_s.pickle') and
            os.path.exists(Mffpath / 'k3_ff_s.pickle')):
        print("Building Kernels")

        import theano.tensor as T
        from theano import function, scan
        logger.info("Started compilation of theano three body kernels")

        # --------------------------------------------------
        # INITIAL DEFINITIONS
        # --------------------------------------------------

        # positions of central atoms
        r1, r2 = T.dvectors('r1d', 'r2d')
        # positions of neighbours
        rho1, rho2 = T.dmatrices('rho1', 'rho2')
        # hyperparameter
        sig = T.dscalar('sig')
        # cutoff hyperparameters (theta stays in the signature but is unused here)
        theta = T.dscalar('theta')
        rc = T.dscalar('rc')

        # positions of neighbours without chemical species
        rho1s = rho1[:, 0:3]
        rho2s = rho2[:, 0:3]

        # --------------------------------------------------
        # RELATIVE DISTANCES TO CENTRAL VECTOR AND BETWEEN NEIGHBOURS
        # --------------------------------------------------

        # first and second configuration
        r1j = T.sqrt(T.sum((rho1s[:, :] - r1[None, :])**2, axis=1))
        r2m = T.sqrt(T.sum((rho2s[:, :] - r2[None, :])**2, axis=1))
        rjk = T.sqrt(T.sum((rho1s[None, :, :] - rho1s[:, None, :])**2, axis=2))
        rmn = T.sqrt(T.sum((rho2s[None, :, :] - rho2s[:, None, :])**2, axis=2))

        # --------------------------------------------------
        # BUILD THE KERNEL
        # --------------------------------------------------

        # squared exp of differences
        se_1j2m = T.exp(-(r1j[:, None] - r2m[None, :])**2 / (2 * sig**2))
        se_jkmn = T.exp(-(rjk[:, :, None, None] - rmn[None, None, :, :])**2 /
                        (2 * sig**2))
        se_jk2m = T.exp(-(rjk[:, :, None] - r2m[None, None, :])**2 / (2 * sig**2))
        se_1jmn = T.exp(-(r1j[:, None, None] - rmn[None, :, :])**2 / (2 * sig**2))

        # kernel not summed (cyclic permutations)
        k1n = (se_1j2m[:, None, :, None] * se_1j2m[None, :, None, :] * se_jkmn)
        k2n = (se_1jmn[:, None, :, :] * se_jk2m[:, :, None, :] *
               se_1j2m[None, :, :, None])
        k3n = (se_1j2m[:, None, None, :] * se_jk2m[:, :, :, None] *
               se_1jmn[None, :, :, :])

        # final shape is M1 M1 M2 M2
        ker = k1n + k2n + k3n

        cut_j = 0.5 * (1 + T.cos(np.pi * r1j / rc)) * ((T.sgn(rc - r1j) + 1) / 2)
        cut_m = 0.5 * (1 + T.cos(np.pi * r2m / rc)) * ((T.sgn(rc - r2m) + 1) / 2)
        cut_jk = (cut_j[:, None] * cut_j[None, :] * 0.5 *
                  (1 + T.cos(np.pi * rjk / rc)) * ((T.sgn(rc - rjk) + 1) / 2))
        cut_mn = (cut_m[:, None] * cut_m[None, :] * 0.5 *
                  (1 + T.cos(np.pi * rmn / rc)) * ((T.sgn(rc - rmn) + 1) / 2))

        # --------------------------------------------------
        # REMOVE DIAGONAL ELEMENTS AND ADD CUTOFF
        # --------------------------------------------------

        # remove diagonal elements AND lower triangular ones from first configuration
        mask_jk = T.triu(T.ones_like(rjk)) - T.identity_like(rjk)
        # remove diagonal elements from second configuration
        mask_mn = T.ones_like(rmn) - T.identity_like(rmn)
        # combine masks
        mask_jkmn = mask_jk[:, :, None, None] * mask_mn[None, None, :, :]

        # apply the mask, then the cutoff functions, and sum
        ker = ker * mask_jkmn
        ker = T.sum(ker * cut_jk[:, :, None, None] * cut_mn[None, None, :, :])

        # --------------------------------------------------
        # FINAL FUNCTIONS
        # --------------------------------------------------

        # global energy-energy kernel
        k_ee_fun = function([r1, r2, rho1, rho2, sig, theta, rc], ker,
                            on_unused_input='ignore')

        # global energy-force kernel
        k_ef = T.grad(ker, r2)
        k_ef_fun = function([r1, r2, rho1, rho2, sig, theta, rc], k_ef,
                            on_unused_input='ignore')

        # local force-force kernel
        k_ff = T.grad(ker, r1)
        k_ff_der, updates = scan(lambda j, k_ff, r2: T.grad(k_ff[j], r2),
                                 sequences=T.arange(k_ff.shape[0]),
                                 non_sequences=[k_ff, r2])
        k_ff_fun = function([r1, r2, rho1, rho2, sig, theta, rc], k_ff_der,
                            on_unused_input='ignore')

        # Save the functions we want to use for multiprocessing.
        # This is necessary because Theano cannot share its automatically
        # cached compiled objects across different processes.
        with open(Mffpath / 'k3_ee_s.pickle', 'wb') as f:
            pickle.dump(k_ee_fun, f)
        with open(Mffpath / 'k3_ef_s.pickle', 'wb') as f:
            pickle.dump(k_ef_fun, f)
        with open(Mffpath / 'k3_ff_s.pickle', 'wb') as f:
            pickle.dump(k_ff_fun, f)
    else:
        print("Loading Kernels")
        with open(Mffpath / "k3_ee_s.pickle", 'rb') as f:
            k_ee_fun = pickle.load(f)
        with open(Mffpath / "k3_ef_s.pickle", 'rb') as f:
            k_ef_fun = pickle.load(f)
        with open(Mffpath / "k3_ff_s.pickle", 'rb') as f:
            k_ff_fun = pickle.load(f)

    # WRAPPERS (we don't want to plug in the position of the central atom every time)

    def k3_ee(conf1, conf2, sig, theta, rc):
        """
        Three body kernel for global energy-energy correlation

        Args:
            conf1 (array): first configuration.
            conf2 (array): second configuration.
            sig (float): lengthscale hyperparameter theta[0]
            theta (float): cutoff decay rate hyperparameter theta[1]
            rc (float): cutoff distance hyperparameter theta[2]

        Returns:
            kernel (float): scalar valued energy-energy 3-body kernel
        """
        return k_ee_fun(np.zeros(3), np.zeros(3), conf1, conf2, sig, theta, rc)

    def k3_ef(conf1, conf2, sig, theta, rc):
        """
        Three body kernel for global energy-force correlation

        Args:
            conf1 (array): first configuration.
            conf2 (array): second configuration.
            sig (float): lengthscale hyperparameter theta[0]
            theta (float): cutoff decay rate hyperparameter theta[1]
            rc (float): cutoff distance hyperparameter theta[2]

        Returns:
            kernel (array): 3x1 energy-force 3-body kernel
        """
        return -k_ef_fun(np.zeros(3), np.zeros(3), conf1, conf2, sig, theta, rc)

    def k3_ff(conf1, conf2, sig, theta, rc):
        """
        Three body kernel for local force-force correlation

        Args:
            conf1 (array): first configuration.
            conf2 (array): second configuration.
            sig (float): lengthscale hyperparameter theta[0]
            theta (float): cutoff decay rate hyperparameter theta[1]
            rc (float): cutoff distance hyperparameter theta[2]

        Returns:
            kernel (matrix): 3x3 force-force 3-body kernel
        """
        return k_ff_fun(np.zeros(3), np.zeros(3), conf1, conf2, sig, theta, rc)

    logger.info("Ended compilation of theano three body kernels")

    return k3_ee, k3_ef, k3_ff
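# A minimal usage sketch (assumptions: each configuration is an Mx5 array of
# neighbour positions plus two species columns, as the docstring describes,
# and the hyperparameter values are illustrative). Builds a small
# energy-energy Gram matrix from the compiled kernels.
import numpy as np

k3_ee, k3_ef, k3_ff = compile_theano()

rng = np.random.RandomState(0)
confs = [np.hstack([rng.uniform(-2., 2., (4, 3)), np.ones((4, 2))])
         for _ in range(3)]

sig, theta, rc = 0.5, 0.5, 3.0
gram = np.array([[k3_ee(ci, cj, sig, theta, rc) for cj in confs]
                 for ci in confs])
print(gram.shape)  # (3, 3)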
def __init__(self, rng, input, n_in, n_batch, d_bucket, activation, activation_deriv,
             w=None, index_permute=None, index_permute_reverse=None):
    srng = RandomStreams(seed=234)
    n_bucket = n_in // d_bucket + 1
    self.input = input

    # randomly permute the input space
    if index_permute is None:
        index_permute = srng.permutation(n=n_in)
        index_permute_reverse = T.argsort(index_permute)
    self.index_permute = index_permute
    self.index_permute_reverse = index_permute_reverse
    permuted_input = input[:, index_permute]
    self.permuted_input = permuted_input

    # initialize matrix parameters
    if w is None:
        bound = numpy.sqrt(3. / d_bucket)
        w_values = numpy.asarray(rng.uniform(low=-bound, high=bound,
                                             size=(n_bucket, d_bucket, d_bucket)),
                                 dtype=theano.config.floatX)
        w = theano.shared(value=w_values, name='w')
    self.w = w

    # compute outputs and Jacobians bucket by bucket
    log_jacobian = T.alloc(0, n_batch)
    for b in xrange(n_bucket):
        bucket_size = d_bucket
        if b == n_bucket - 1:
            bucket_size = n_in - b * d_bucket
        if b > 0:
            # keep the previous bucket; it is warped and added to the new input
            prev_input = x_b
        x_b = self.permuted_input[:, b * d_bucket:b * d_bucket + bucket_size]
        w_b = self.w[b, :bucket_size, :bucket_size]
        if b > 0:
            # m_b is the warped previous bucket (produced by a coupling
            # function that is not shown in this snippet)
            x_b_plus = x_b + m_b
        else:
            x_b_plus = x_b

        # LU-style parametrization: W = Upper . Lower with unit-diagonal Lower,
        # so log |det W| is the sum of log |diag(Upper)|
        Upper = T.triu(w_b)
        Lower = T.tril(w_b)
        Lower = T.extra_ops.fill_diagonal(Lower, 1.)
        log_det_Upper = T.log(T.abs_(T.nlinalg.ExtractDiag()(Upper))).sum()
        W = T.dot(Upper, Lower)

        log_jacobian = log_jacobian + T.alloc(log_det_Upper, n_batch)
        lin_output_b = T.dot(x_b_plus, W)
        if b > 0:
            lin_output = T.concatenate([lin_output, lin_output_b], axis=1)
        else:
            lin_output = lin_output_b
        if activation is not None:
            derivs = activation_deriv(lin_output_b)
            log_jacobian = log_jacobian + T.log(T.abs_(derivs)).sum(axis=1)

    self.log_jacobian = log_jacobian
    self.output = (lin_output[:, index_permute_reverse] if activation is None
                   else activation(lin_output[:, index_permute_reverse]))
    self.params = [w]
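# A minimal usage sketch (assumptions: the layer class above is called
# WarpLULayer here, a hypothetical name, and d_bucket > n_in so a single
# bucket is used and the undefined warp term m_b is never reached). tanh and
# its derivative are passed as the activation pair.
import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
rng = numpy.random.RandomState(1234)
layer = WarpLULayer(rng, x, n_in=4, n_batch=8, d_bucket=5,
                    activation=T.tanh,
                    activation_deriv=lambda z: 1. - T.tanh(z)**2)

f = theano.function([x], [layer.output, layer.log_jacobian])
out, logdet = f(numpy.random.randn(8, 4).astype(theano.config.floatX))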