def __init__(self, model, dis_updater=None, gen_updater=None):
    """Assemble the adversarial losses and compile the Theano functions.

    Parameters
    ----------
    model
        Object exposing the symbolic variables ``X``, ``Z``, ``genX``,
        ``disX`` and ``disgenX``, the parameter collections
        ``trainable_discrim_params`` / ``trainable_gen_params`` and the extra
        update collections ``other_discrim_updates`` / ``other_gen_updates``.
    dis_updater, gen_updater
        Callables invoked as ``updater(params, loss)``; their result is
        combined with the model's extra updates using ``+``.  When left as
        ``None`` a fresh ``updates.Adam(lr=sharedX(0.0002), b1=0.5,
        regularizer=updates.Regularizer(l2=1e-5))`` is created for this
        instance.

    Sets ``self._train_gen``, ``self._train_dis`` and ``self._gen``.
    """
    def _default_adam():
        # Built per call: as default *arguments* these objects (and their
        # shared learning-rate variable) were created once at definition
        # time and silently shared by every instance.
        return updates.Adam(lr=sharedX(0.0002), b1=0.5,
                            regularizer=updates.Regularizer(l2=1e-5))

    if dis_updater is None:
        dis_updater = _default_adam()
    if gen_updater is None:
        gen_updater = _default_adam()

    X = model.X
    Z = model.Z
    genX = model.genX
    disX = model.disX
    disgenX = model.disgenX

    # Discriminator: real outputs are scored against ones, generated
    # outputs against zeros.
    disX_loss = bce(disX, T.ones(disX.shape)).mean()
    disgenX_loss = bce(disgenX, T.zeros(disgenX.shape)).mean()
    # Generator: generated outputs are scored against ones.
    genX_loss = bce(disgenX, T.ones(disgenX.shape)).mean()

    dis_loss = disX_loss + disgenX_loss
    gen_loss = genX_loss

    trainable_discrim_params = model.trainable_discrim_params
    trainable_gen_params = model.trainable_gen_params

    dis_updates = (dis_updater(trainable_discrim_params, dis_loss)
                   + model.other_discrim_updates)
    gen_updates = (gen_updater(trainable_gen_params, gen_loss)
                   + model.other_gen_updates)

    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function call (Python 3).
    print('COMPILING')
    t = time()
    self._train_gen = theano.function([Z], gen_loss, updates=gen_updates)
    self._train_dis = theano.function([X, Z], dis_loss, updates=dis_updates)
    self._gen = theano.function([Z], genX)
    print('%.2f seconds to compile theano functions' % (time() - t))
# compute: creates a Generator and a Discriminator, then for `steps`
# iterations builds discriminator cost expressions, compiles a Theano function
# with the discriminator updates and prints its output. It returns `gencost`
# together with a list of (param, param - lrate * grad) pairs for the
# generator parameters.
# NOTE(review): the whole definition sits on one physical line in this view, so
# which statements belong to the `for` loop cannot be told from here.
# NOTE(review): `givens={self.x:}` has no value after the colon and is not valid
# Python; the mapped expression appears to have been lost - restore it from the
# original source before use.
# NOTE(review): `dgencost` feeds `noise_samples` (not `genX`) to
# binary_crossentropy, and `genX` is D.output(noise_samples) without going
# through G - both look unintended; confirm.
def compute(self, minibatch=1, steps=5, lrate=0.01): G = Generator(self.num_vis, self.num_hid) D = Discriminator(self.num_vis) for i in range(steps): # Sample m noise examples from Generator noise_samples = G.get_noise() # Sample m examples from data distribution data_examples = self._sample(minibatch) # Get real examples realX = D.output(data_examples) # Get generated examples genX = D.output(noise_samples) drealcost = T.mean(T.nnet.binary_crossentropy(realX, T.ones(realX.shape))) dgencost = T.mean(T.nnet.binary_crossentropy(noise_samples, T.zeros(genX.shape))) gencost = T.mean(T.nnet.binary_crossentropy(genX, T.ones(genX.shape))) cost = drealcost + dgencost updates = D.update(cost.mean()) func = theano.function([], (realX, genX), updates=updates, givens={self.x:}) print("Discriminator cost {0}: ".format(func())) noise_samples = G.get_noise() allparams = [] for param in G.params: allparams.append(param) '''for param in D.params: allparams.append(param)''' #gencost = 1 / self.num_samples * \ # T.sum(T.log(1 - D.output(G.output(noise_samples)))) grads = T.grad(T.mean(gencost), allparams) return gencost, [(oldparam, oldparam - lrate * newparam) for (oldparam, newparam) in zip(allparams, grads)]
def step_fun(self):
    """Return the compiled single-step function, building it on first use.

    The compiled function takes one ``inputs`` matrix followed by one matrix
    per (layer, state) pair and returns whatever ``self.step`` produces for
    them.  The result is cached in ``self._step_fun``.

    Raises
    ------
    NotImplementedError
        If the last gate uses attention (stacked RNNs with attention are not
        supported).
    """
    if self._step_fun is not None:
        return self._step_fun

    inputs = T.matrix('inputs')
    states_tm1 = [T.matrix('state_%d_%d_tm1' % (layer, state))
                  for layer in range(self.n_layers)
                  for state in range(self.gate0.n_states)]

    if self.gates[-1].use_attention:
        # The attention variant that used to follow this raise could never
        # execute, so it has been dropped.
        raise NotImplementedError('Stacked RNN with attention')

    # self.step receives an all-ones tensor shaped like the inputs without
    # their last axis and an all-ones tensor shaped like the first state
    # (presumably the input mask and a state mask - TODO confirm).
    step_args = ([inputs, T.ones(inputs.shape[:-1])]
                 + states_tm1
                 + [T.ones_like(states_tm1[0])])
    self._step_fun = function(
        [inputs] + states_tm1,
        self.step(*step_args),
        name='%s_step_fun' % self.name)
    return self._step_fun
# chi2_test_statistic: from model rows `M` and observation rows `Obs` it forms
# per-observation frequencies `p = Obs / Ns`, takes logs with exact zeros mapped
# to -INF, pairs every row of M with every row of Obs through Kronecker
# products, and returns `G` reshaped to (num_M, num_Obs).
# NOTE(review): several expressions have lost the head of a call and its first
# argument: `Ns =,T.ones((K,1)))`, `G = 2.0* ,kObs.T)` and
# `G =,T.ones((num_M*num_Obs,1)))`. The shapes suggest matrix products (e.g. a
# dot of `Obs` with a ones column for `Ns`) - presumably `T.dot(...)`, but the
# missing operands must be confirmed against the original source.
# NOTE(review): the commented-out correction factor (`numerator`, `q1`) is
# garbled in the same way.
def chi2_test_statistic(M, Obs, K, num_M, num_Obs): #Getting frequencies from observations Ns =,T.ones((K,1))) p = Obs/Ns #Find the zeros so we can deal with them later pZEROs = T.eq(p, 0) mZEROs = T.eq(M, 0) #log probabilities, with -INF as log(0) lnM = T.log(M + mZEROs) - INF*mZEROs lnp = T.log(p + pZEROs) - INF*pZEROs #Using kroneker products so every row of M hits every row of P in the difference klnM - kln O_ones = T.ones((num_Obs,1)) M_ones = T.ones((num_M,1)) klnM = kron(lnM,O_ones) klnP = kron(M_ones, lnp) klnP_M = klnP - klnM kObs = kron(M_ones, Obs) G = 2.0* ,kObs.T) G = G*T.identity_like(G) G =,T.ones((num_M*num_Obs,1))) G = T.reshape(G,(num_M,num_Obs)) #The following quotient improves the convergence to chi^2 by an order of magnitude #source: #numerator = 1.0/(M + 0.01),T.ones((K,1))) - T.ones((num_M,1)) #q1 = T.ones((num_M,num_Obs)) +,1.0/Ns.T/6.0)/(K-1.0) return G#/q1
# instantiate: completes `self.shape` with entries from `shape` wherever
# `self.shape` holds None, asserts that the resulting shape is fully known, and
# builds a variable filled according to `self.value` (zeros, a callable applied
# to the shape, or a constant times ones). A symbolic tensor is returned when
# `shape` is a Theano vector variable or `self.shared` is false; otherwise a
# shared variable is created with `th.shared`.
# NOTE(review): two statements are reduced to a bare ` = ` (after
# `var = T.cast(var, dtype='floatX')` and after the `th.shared(...)` call); both
# sides of the assignment were lost - presumably the variable was being named.
# Restore from the original source.
def instantiate(self, shape=None): # Parse shape shape = [None, ] * self.ndim if shape is None else shape initshape = tuple([shape[n] if givenshape is None else givenshape for n, givenshape in enumerate(self.shape)]) assert all([ishp is not None for ishp in initshape]), "Given shape information not sufficient to instantiate " \ "from ghost state." # Initialize. If shape is a tensor variable, initialize a tensor variable and return. if isinstance(shape, T.vector().__class__) or not self.shared: # Make variable var = T.zeros(shape=initshape, dtype='floatX') \ if self.value == 0. else self.value(initshape) * T.ones(shape=initshape, dtype='floatX') \ if callable(self.value) else self.value * T.ones(shape=initshape, dtype='floatX') # Safety cast var = T.cast(var, dtype='floatX') = # Warn if a shared variable is requested if self.shared: warn("Provided shape variable is a theano tensor variable, it cannot be used to initialize a shared " "variable.") # Return return var else: # Make variable var = th.shared((getattr(np, th.config.floatX)(self.value) if not callable(self.value) and not np.isscalar(self.value) else getattr(np, th.config.floatX)(self.value(initshape)) if callable(self.value) else self.value * np.ones(shape=initshape, dtype=th.config.floatX))) = # Safety cast and return return var
def _initial_part_matrix(self, part, size, deterministic):
    """Build the ``(size, length)`` initial matrix for one part.

    ``self._choose_alternative`` picks the local or the global triple
    ``(length, distribution name, deterministic map)`` according to ``part``.
    ``size`` defaults to 1 when ``None``.  ``deterministic`` may be a Python
    truth value or a symbolic variable; in the symbolic case the choice
    between the deterministic map and a random sample is made with
    ``tt.switch``.
    """
    n_rows = 1 if size is None else size
    local_choice = (self.local_size,
                    self.initial_dist_local_name,
                    self.initial_dist_local_map)
    global_choice = (self.global_size,
                     self.initial_dist_global_name,
                     self.initial_dist_global_map)
    n_cols, rng_method, point = self._choose_alternative(
        part, local_choice, global_choice)
    dtype = self.symbolic_initial_global_matrix.dtype

    if n_cols == 0:
        # in this case theano fails to compute sample of correct size
        return tt.ones((n_rows, 0), dtype)

    shape = tt.stack((tt.as_tensor(n_rows), tt.as_tensor(n_cols)))
    draw = getattr(self._rng, rng_method)

    # apply optimizations if possible: a plain Python flag needs no switch
    if isinstance(deterministic, tt.Variable):
        sample = draw(shape)
        return tt.switch(deterministic,
                         tt.ones(shape, dtype) * point,
                         sample)
    if deterministic:
        return tt.ones(shape, dtype) * point
    return draw(shape)
# pos_phase_updates (g/s/h/t variant): when `init_state` is None it builds
# initial states from the sigmoid of the biases (and `self.eta` for 't'), runs
# `self.pos_phase`, and returns the new states in an OrderedDict together with
# an empty OrderedDict of updates.
# NOTE(review): the docstring lists `init` and `restart`, which are not
# parameters of this function; the actual parameters are `init_state`,
# `n_steps` and `mean_field`.
# NOTE(review): the right-hand factor of `init_state['s'] = T.ones(...) *` is
# missing (the next assignment follows immediately), which is not valid
# Python; the lost operand must be restored from the original source.
def pos_phase_updates(self, v, init_state=None, n_steps=1, mean_field=False): """ Implements the positive phase sampling, which performs blocks Gibbs sampling in order to sample from p(g,h,x,y|v). :param v: fixed training set :param init: dictionary of initial values, or None if sampling from scratch :param n_steps: scalar, number of Gibbs steps to perform. :param restart: if False, start sampling from buffers self.pos_* """ if init_state is None: assert n_steps # start sampler from scratch init_state = OrderedDict() init_state['g'] = T.ones((self.batch_size,self.n_g)) * T.nnet.sigmoid(self.gbias) init_state['s'] = T.ones((self.batch_size,self.n_g)) * init_state['h'] = T.ones((self.batch_size,self.n_h)) * T.nnet.sigmoid(self.hbias) init_state['t'] = T.ones((self.batch_size,self.n_h)) * self.eta [new_g, new_s, new_h, new_t] = self.pos_phase(v, init_state = init_state, n_steps = n_steps, mean_field = mean_field) pos_states = OrderedDict() pos_states['g'] = new_g pos_states['s'] = new_s pos_states['h'] = new_h pos_states['t'] = new_t # update running average of positive phase activations pos_updates = OrderedDict() return pos_states, pos_updates
def sample_h_given_v_2wise(v, W, Wh, bh, nh):
    """Sample hidden units that are coupled in consecutive pairs.

    Hidden units ``2i`` and ``2i+1`` form pair ``i`` with coupling ``Wh[i]``.

    Parameters
    ----------
    v : visible input, multiplied with ``W``.
    W : visible-to-hidden weights.
    Wh : one coupling weight per hidden pair.
    bh : hidden biases.
    nh : int, number of hidden units (pairs are formed from ``nh // 2``).

    Returns
    -------
    A binomial sample drawn from ``theano_rng`` with the per-unit
    probabilities computed below.
    """
    phi = T.dot(v, W) + bh
    ephi = T.exp(phi)

    # adder[i] selects the two members of pair i, so a dot with adder.T sums
    # each pair of consecutive elements.
    adder = np.zeros((nh // 2, nh), dtype=theano.config.floatX)
    for i in range(nh // 2):
        adder[i, 2 * i] = 1
        adder[i, 2 * i + 1] = 1
    adder = theano.shared(adder)

    # With a = exp(phi_2i) and b = exp(phi_{2i+1}):
    #   wobble_i = 1 + a + b + a * b * exp(Wh_i)
    #   p(h_j = 1 | v) = (exp(phi_j) + a * b * exp(Wh_i)) / wobble_i
    # for both members j of pair i; the pair term is shared by the two.
    pairsum = T.dot(ephi, adder.T)                # a + b
    first = ephi.T[T.arange(0, nh, 2)].T          # a
    pairprod = pairsum * first - first ** 2       # (a + b) * a - a^2 = a * b
    pairterm = pairprod * T.exp(Wh)
    wobble = 1 + pairsum + pairterm

    # Repeat every per-pair value twice so it lines up with the units.
    pairterm_broadcast = kron(pairterm.dimshuffle(0, 'x'), T.ones(2))
    wobble_broadcast = kron(wobble.dimshuffle(0, 'x'), T.ones(2))
    prop_up = (ephi + pairterm_broadcast) / wobble_broadcast

    h = theano_rng.binomial(n=1, p=prop_up, dtype=theano.config.floatX,
                            size=(nh,), ndim=1)
    return h
# pos_phase_updates (g/h/s1/s0 variant): when `init_state` is None it builds
# initial 'g' and 'h' states from the sigmoid of the biases with one row per
# row of `v`, runs `self.pos_phase`, and returns an OrderedDict of updates for
# the persistent buffers (`pos_counter`, `odd_even`, `pos_g`, `pos_h`,
# `pos_s1`, `pos_s0`, `pos_s`).
# NOTE(review): the docstring lists `init` and `restart`, which are not
# parameters of this function.
# NOTE(review): in the `self.flags['pos_phase_ch']` branch the statement
# `pos_updates[] = T.cast(0.999 * + 0.001 * new_h.mean(axis=0), floatX)` has
# lost its key and the averaged operand (presumably the same attribute in both
# places - a running average of `new_h`); restore from the original source.
def pos_phase_updates(self, v, init_state=None, n_steps=1): """ Implements the positive phase sampling, which performs blocks Gibbs sampling in order to sample from p(g,h,x,y|v). :param v: fixed training set :param init: dictionary of initial values, or None if sampling from scratch :param n_steps: scalar, number of Gibbs steps to perform. :param restart: if False, start sampling from buffers self.pos_* """ if init_state is None: assert n_steps # start sampler from scratch init_state = OrderedDict() init_state['g'] = T.ones((v.shape[0], self.n_g)) * T.nnet.sigmoid(self.gbias) init_state['h'] = T.ones((v.shape[0], self.n_h)) * T.nnet.sigmoid(self.hbias) [new_g, new_h, new_s1, new_s0, crap_v, pos_counter] = self.pos_phase( v, init_state=init_state, n_steps=n_steps) # update running average of positive phase activations pos_updates = OrderedDict() pos_updates[self.pos_counter] = pos_counter pos_updates[self.odd_even] = (self.odd_even + 1) % 2 pos_updates[self.pos_g] = new_g pos_updates[self.pos_h] = new_h pos_updates[self.pos_s1] = new_s1 pos_updates[self.pos_s0] = new_s0 pos_updates[self.pos_s] = self.s_hat(new_h, new_s1, new_s0) if self.flags['pos_phase_ch']: pos_updates[] = T.cast(0.999 * + 0.001 * new_h.mean(axis=0), floatX) return pos_updates
def pos_phase_updates(self, v, l=None, init_state=None, n_steps=1, mean_field=False):
    """
    Run the positive phase and collect its states and buffer updates.

    :param v: fixed training set
    :param l: None means l is sampled; otherwise l is clamped.
    :param init_state: dictionary of initial values ('g', 'h', 'l'), or None
        to start the sampler from scratch.
    :param n_steps: scalar, number of Gibbs steps to perform.
    :param mean_field: forwarded unchanged to self.pos_phase.
    :return: (pos_states, pos_updates), both OrderedDicts.
    """
    if init_state is None:
        assert n_steps
        # start sampler from scratch: broadcast the bias activations over the batch
        rows = self.batch_size
        init_state = OrderedDict([
            ('g', T.ones((rows, self.n_g)) * T.nnet.sigmoid(self.gbias)),
            ('h', T.ones((rows, self.n_h)) * T.nnet.sigmoid(self.hbias)),
            ('l', T.ones((rows, self.n_l)) * T.nnet.softmax(self.lbias)),
        ])

    outputs = self.pos_phase(v, l=l, init_state=init_state,
                             n_steps=n_steps, mean_field=mean_field)

    # A clamped l is reported as the stored input labels, not the sampler output.
    label_state = outputs[2] if l is None else self.input_labels
    pos_states = OrderedDict([
        ('g', outputs[0]),
        ('h', outputs[1]),
        ('l', label_state),
    ])

    # The last sampler output becomes the new counter; odd_even toggles 0/1.
    pos_updates = OrderedDict([
        (self.pos_counter, outputs[-1]),
        (self.odd_even, (self.odd_even + 1) % 2),
    ])
    return pos_states, pos_updates
# _meshgrid (3D): builds three coordinate tensors x_t, y_t, z_t from
# `_linspace(-1.0, 1.0, ...)` over height, width and depth, flattens each to a
# single row, appends a row of ones and returns the 4-row concatenation
# (x, y, z, ones).
# NOTE(review): the expressions for `x_t`, `y_t` and `z_t` have unbalanced
# parentheses - the heads of the outer calls (and for `z_t` the first
# argument) were lost. By analogy with the usual 2D grid generator these are
# presumably nested `T.dot(...)` outer products, but this must be confirmed
# against the original source.
# NOTE(review): the numpy example in the comment is also truncated
# (`ones = np.ones(`) and only describes the 2D case.
def _meshgrid(height, width, depth): # This function is the grid generator from eq. (1) in reference [1]. # It is equivalent to the following numpy code: # x_t, y_t,z_t = np.meshgrid(np.linspace(-1, 1, width), # np.linspace(-1, 1, height)) # ones = np.ones( # grid = np.vstack([x_t.flatten(), y_t.flatten(), ones]) # It is implemented in Theano instead to support symbolic grid sizes. # Note: If the image size is known at layer construction time, we could # compute the meshgrid offline in numpy instead of doing it dynamically # in Theano. However, it hardly affected performance when we tried. x_t = T.reshape( _linspace(-1.0, 1.0, height).dimshuffle(0, 'x'), T.ones((1, width))), (height, width, 1)), T.ones((1, 1, depth)) ) y_t = T.reshape( T.ones((height, 1)), _linspace(-1.0, 1.0, width).dimshuffle('x', 0)), (height, width, 1)), T.ones((1, 1, depth)) ) z_t =, width, 1)), T.reshape(_linspace(-1.0, 1.0, depth), (1, 1, -1))) x_t_flat = x_t.reshape((1, -1)) y_t_flat = y_t.reshape((1, -1)) z_t_flat = z_t.reshape((1, -1)) ones = T.ones_like(x_t_flat) grid = T.concatenate([x_t_flat, y_t_flat, z_t_flat, ones], axis=0) return grid
def apply_log_domain(self, l, probs, l_len=None, probs_mask=None):
    """Negative log-probability of the labellings, with alpha in the log domain.

    Keeping the forward variable in the log domain avoids numerical
    underflow.

    Parameters
    ----------
    l : label matrix of shape (L, B); it is written into every second row of
        an (S, B) matrix with S = 2 * L + 1 whose other rows hold the blank.
    probs : tensor of shape (T, B, C + 1), scanned over its first axis; the
        last class index C is used as the blank symbol.
    l_len : per-example label lengths, shape (B,).  ``None`` means every
        label sequence uses all L positions.
    probs_mask : per-step mask of shape (T, B); where it is zero the previous
        alpha is carried over.  ``None`` means no step is masked.

    Returns
    -------
    Vector of length B with the negative log-probability per example.
    """
    def _log(a):
        # Clip first so exact zeros give log(1e-12) instead of -inf.
        return tensor.log(tensor.clip(a, 1e-12, 1e12))

    def _log_add(a, b):
        # log(exp(a) + exp(b)), evaluated relative to the larger operand.
        maximum = tensor.maximum(a, b)
        return maximum + tensor.log1p(tensor.exp(a + b - 2 * maximum))

    B = probs.shape[1]
    C = probs.shape[2] - 1
    L = l.shape[0]
    S = 2 * L + 1

    # The signature advertises None for these two, but the body used to
    # dereference them unconditionally.
    if l_len is None:
        l_len = tensor.zeros((B,), dtype='int32') + L
    if probs_mask is None:
        probs_mask = tensor.ones((probs.shape[0], B))

    # Labels interleaved with blanks: blank, l_0, blank, l_1, ..., blank.
    l_blk = C * tensor.ones((S, B), dtype='int32')
    l_blk = tensor.set_subtensor(l_blk[1::2, :], l)
    l_blk = l_blk.T  # now l_blk is B x S

    # All initial mass on the first position.
    alpha0 = _log(tensor.concatenate(
        [tensor.ones((B, 1)), tensor.zeros((B, S - 1))], axis=1))

    # Skipping a position is allowed only onto a non-blank symbol that
    # differs from the symbol two positions earlier.
    l_blk_2 = tensor.concatenate([-tensor.ones((B, 2)), l_blk[:, :-2]], axis=1)
    l_case2 = tensor.neq(l_blk, C) * tensor.neq(l_blk, l_blk_2)

    def recursion(p, p_mask, prev_alpha):
        # stay / advance by one position
        alpha_bar1 = tensor.set_subtensor(
            prev_alpha[:, 1:],
            _log_add(prev_alpha[:, 1:], prev_alpha[:, :-1]))
        # additionally advance by two positions where permitted
        alpha_bar2 = tensor.set_subtensor(
            alpha_bar1[:, 2:],
            _log_add(alpha_bar1[:, 2:], prev_alpha[:, :-2]))
        alpha_bar = tensor.switch(l_case2, alpha_bar2, alpha_bar1)

        # log-probability of the symbol at every position for this step
        batch_idx = tensor.arange(B)[:, None].repeat(S, axis=1).flatten()
        log_p = _log(p[batch_idx, l_blk.flatten()].reshape((B, S)))

        # product in the log domain is a sum
        next_alpha = alpha_bar + log_p
        return tensor.switch(p_mask[:, None], next_alpha, prev_alpha)

    alpha, _ = scan(fn=recursion,
                    sequences=[probs, probs_mask],
                    outputs_info=[alpha0])
    last_alpha = alpha[-1]

    # A labelling may end on its last label or on the blank after it.
    rows = tensor.arange(B)
    end = 2 * l_len.astype('int32')
    prob = _log_add(last_alpha[rows, end - 1], last_alpha[rows, end])

    # return the negative log probability of the labellings
    return -prob
# get_output: pads the input sequence with `self.e0` in front and `self.en`
# behind along axis 1, moves the time axis first, runs a forward scan
# (`_forward_step`, initial state c0) and a backward scan (`_backward_step`,
# initial state cn, `go_backwards=True`, result reversed), then a third scan
# that applies tanh to an affine map of the concatenated [e_t, cl_t, cr_t] and
# writes -1e10 into masked positions. The result is returned with the sample
# axis first again.
# English rendering of the inline Chinese comments, in order of appearance:
#   - "put the timestep on the first dimension" (twice)
#   - "only by casting the int32 mask to float32 do we avoid the result of
#      mask_t[:, None] * cl_t being upcast to float64 inside scan"
#   - "if it is masked, simply inherit the previous value" (twice)
#   - "note: not c0!!!"
#   - "reverse Cr"
#   - "set the masked positions to a minimal value"
#   - "make the sample axis the first dimension again"
# NOTE(review): several expressions have lost tokens and are not valid Python:
# `cn =[None,:] * ...`, the arguments of `T.nnet.sigmoid(, self.Wl) +,
# self.Wsl) )` and `T.nnet.sigmoid(, self.Wr) +, self.Wsr))`, and
# `T.tanh([e_t, cl_t, cr_t], axis=1), self.W2) + self.b2)`. They are presumably
# dot products / a concatenate, but the missing operands must be restored from
# the original source.
def get_output(self, train=False): X = self.get_input(train=train) c0 = self.c0[None,:] * T.ones((X.shape[0], self.context_dim)) cn =[None,:] * T.ones((X.shape[0], self.context_dim)) X = T.concatenate( [ T.shape_padleft(self.e0,2) * T.ones((X.shape[0], 1, X.shape[2])), X, T.shape_padleft(self.en,2) * T.ones((X.shape[0], 1, X.shape[2])), ], axis = 1 ) X = X.dimshuffle(1,0,2) # timestep 置于第一纬 # 只有将int32 mask 强制转换为 float32 才不会在scan里面将mask_t[:, None] * cl_t 结果upcast成float64 mask = T.cast(self.get_output_mask(train=train), T.config.floatX) mask = mask.dimshuffle(1,0) # timestep 置于第一纬 #theano.printing.debugprint([mask], print_type=True) def _forward_step(e_t, e_tm1, mask_t, cl_tm1): #print 'e_t:', e_t.type.ndim #print 'cl_t:', cl_tm1.type.ndim cl_t = T.nnet.sigmoid(, self.Wl) +, self.Wsl) ) cl_t = mask_t[:, None] * cl_t + (1. - mask_t[:, None]) * cl_tm1 # 如果它被mask就直接继承那个词 #theano.printing.debugprint([mask_t], print_type=True) #theano.printing.debugprint([cl_t], print_type=True) return cl_t def _backward_step(e_t, e_tp1, mask_t, cr_tp1): cr_t = T.nnet.sigmoid(, self.Wr) +, self.Wsr)) cr_t = mask_t[:, None] * cr_t + (1. - mask_t[:, None]) * cr_tp1 # 如果它被mask就直接继承那个词 return cr_t Cl, _ = theano.scan(_forward_step, sequences=[dict(input=X, taps=[0, -1]), mask], outputs_info=[ dict(initial=c0, taps=[-1]) # 注意不是c0!!! ], ) Cr, _ = theano.scan(_backward_step, sequences=[dict(input=X, taps=[0, -1]), mask], outputs_info=[ dict(initial=cn, taps=[-1]) ], go_backwards=True, ) Cr = Cr[::-1] # 翻转Cr def _concatenate_activation_step(e_t, mask_t, cl_t, cr_t): #print theano.printing.debugprint(cr_t, print_type=True) h_t = T.tanh([e_t, cl_t, cr_t], axis=1), self.W2) + self.b2) h_t = mask_t[:, None] * h_t + (1. - mask_t[:, None]) * (-10000000000.) # 将mask的地方设置为最小值 return h_t Y, _ = theano.scan(_concatenate_activation_step, sequences=[X, mask, Cl, Cr], outputs_info=None, ) return Y.dimshuffle(1,0,2) # 重置样本为第一维
# scanr (method): fills in default initial states - `self.y0` and `self.c0`
# broadcast to one row per `x.shape[1]` - and delegates to the free function
# `scanr`, forwarding the mask, the five activation functions stored on `self`
# and any extra keyword arguments.
# NOTE(review): the first positional argument of the inner call
# (`return scanr(, y0, c0, x, ...`) is missing, which is not valid Python;
# presumably the layer's weights were passed there - restore from the original
# source.
def scanr(self, x, y0=None, c0=None, mask=None, **kwargs): if y0 is None: #y0 = self.cact(self.y0) y0 = th.ones((x.shape[1],1))*self.y0 if c0 is None: c0 = th.ones((x.shape[1],1))*self.c0 return scanr(, y0, c0, x, mask=mask, iact=self.iact, fact=self.fact, oact=self.oact , gact=self.gact, cact=self.cact, **kwargs)
def result(theano, TT):
    """Add a sequence of ones to a sequence of twos with ``scan`` and evaluate.

    ``theano`` and ``TT`` are the library module and its tensor namespace,
    passed in by the caller.  Returns the value produced by calling the
    compiled, argument-free function.
    """
    ones_seq = TT.ones(10)
    twos_seq = 2 * TT.ones(10)

    def add_pair(left, right):
        return left + right

    summed, _updates = theano.scan(add_pair, sequences=[ones_seq, twos_seq])
    compiled = theano.function([], summed)
    return compiled()
def gen_img(shape_params, rotation_matrix, width, height, nsteps, res):
    """March rays through voxel grids and return the remaining intensity.

    Rays come from ``make_ro`` (directions ``rd``, origins ``ro``) for every
    rotation matrix and every pixel of ``gen_fragcoords(width, height)``.
    Each ray is clipped against the cube [0, 1]^3 and sampled ``nsteps``
    times; at every sample the intensity is multiplied by
    ``exp(-attenuation * step)`` with the attenuation read from
    ``shape_params`` at the voxel containing the sample.

    Returns a 7-tuple: the image (ones where the ray misses the cube), the
    flattened ray directions, the ray origins, the per-pixel near distance
    along the last axis, a tensor of ones shaped like the flat image, the
    flattened ray start points and ``shape_params`` itself.
    """
    raster_space = gen_fragcoords(width, height)
    rd, ro = make_ro(rotation_matrix, raster_space, width, height)
    nmatrices = rotation_matrix.shape[0]
    n_rays = nmatrices * width * height

    # Distances along each ray to the cube faces at coordinate 0 and 1.
    to_low = T.reshape(0 - ro, (nmatrices, 1, 1, 3)) / rd
    to_high = T.reshape(1 - ro, (nmatrices, 1, 1, 3)) / rd
    near_all = T.minimum(to_low, to_high)
    far_all = T.maximum(to_low, to_high)

    # Intersect the per-axis intervals, starting from [0, 10].
    t_enter = 0.0
    t_exit = 10.0
    for axis in range(3):
        near = near_all[:, :, :, axis]
        far = far_all[:, :, :, axis]
        t_enter = T.switch(near > t_enter, near, t_enter)
        t_exit = T.switch(far < t_exit, far, t_exit)
    # `near` now holds the near distances of the last axis, which are returned.

    # Shift a little bit to avoid numerical inaccuracies
    t_enter = t_enter * 1.001
    t_exit = t_exit * 0.999

    nvoxgrids = shape_params.shape[0]
    remaining = T.ones((nvoxgrids, n_rays,))
    step_size = (t_exit - t_enter) / nsteps

    start = (T.reshape(ro, (nmatrices, 1, 1, 3))
             + rd * T.reshape(t_enter, (nmatrices, width, height, 1)))
    start = T.reshape(start, (n_rays, 3))
    rd = T.reshape(rd, (n_rays, 3))
    step_sz = T.reshape(step_size, (n_rays, 1))

    for i in range(nsteps):
        sample = start + rd * step_sz * i
        # Voxel containing the sample, clamped to the grid.
        voxel = T.cast(T.clip(T.floor(sample * res), 0, res - 1), 'int32')
        voxel = T.reshape(voxel, (n_rays, 3))
        attenuation = shape_params[:, voxel[:, 0], voxel[:, 1], voxel[:, 2]]
        remaining = remaining * T.exp(-attenuation * T.flatten(step_sz))

    pixels = T.reshape(remaining, (nvoxgrids, nmatrices, width, height))
    hits_cube = t_exit > t_enter
    image = T.switch(hits_cube, pixels, T.ones_like(pixels))
    return (image, rd, ro, near, T.ones((nvoxgrids, n_rays,)), start,
            shape_params)
def f1_score(self, y):
    """Return ``[f1, precision, recall]`` for the positive (label 1) class.

    ``self.y_pred`` holds the predicted labels and ``y`` the true labels; a
    true positive is a position where both are 1 (their sum equals 2).

    precision = true positives / predicted positives
    recall    = true positives / actual positives

    Recall used to be divided by the total number of samples
    (``y.shape[0]``), which understates it whenever some samples are not
    relevant.
    """
    ones = T.ones(self.y_pred.shape)
    n_predicted_relevant = T.sum(T.eq(ones, self.y_pred))
    n_actually_relevant = T.sum(T.eq(T.ones(y.shape), y))

    two_vector = T.add(ones, ones)
    n_relevant_predicted_correctly = T.sum(
        T.eq(T.add(self.y_pred, y), two_vector))

    precision = T.true_div(n_relevant_predicted_correctly,
                           n_predicted_relevant)
    recall = T.true_div(n_relevant_predicted_correctly,
                        n_actually_relevant)
    # Harmonic mean of precision and recall.
    f1_score = T.mul(2.0, T.true_div(T.mul(precision, recall),
                                     T.add(precision, recall)))
    return [f1_score, precision, recall]
# new_attention_step: builds a feature stack `z` from the current fact `ct`,
# the memory `mem` and the question `q_q` (raw vectors, element-wise products,
# absolute differences and two terms `cWq` / `cWm`), passes it through a tanh
# layer and a sigmoid layer, and returns the first row of the sigmoid output.
# `prev_g` is not used in the visible code.
# NOTE(review): the definitions of `cWq` and `cWm` have lost several tokens
# (`cWq =, self.batch_size), dtype=floatX),, self.W_b), q_q) * T.eye(...)`), as
# have `l_1 =, z) + ...` and `l_2 =, l_1) + ...`. These are presumably dot
# products with weight matrices, but the missing operands must be restored from
# the original source.
def new_attention_step(self, ct, prev_g, mem, q_q): cWq =, self.batch_size), dtype=floatX),, self.W_b), q_q) * T.eye(n=self.batch_size, m=self.batch_size, dtype=floatX)) cWm =, self.batch_size), dtype=floatX),, self.W_b), mem) * T.eye(n=self.batch_size, m=self.batch_size, dtype=floatX)) z = T.concatenate([ct, mem, q_q, ct * q_q, ct * mem, T.abs_(ct - q_q), T.abs_(ct - mem), cWq, cWm], axis=0) l_1 =, z) + self.b_1.dimshuffle(0, 'x') l_1 = T.tanh(l_1) l_2 =, l_1) + self.b_2.dimshuffle(0, 'x') G = T.nnet.sigmoid(l_2)[0] return G
# gradient: with risk = exp(self.output), forms reverse cumulative sums of
# `input * risk` and of `risk`, indexes both with `at_risk`, flattens them, and
# combines `self.input - numerator / denominator` into the returned gradient.
# NOTE(review): the final assignment `gradient =, self.input - (...))` has lost
# the head of the call and its first argument. `observed` is otherwise unused,
# so it was presumably the first operand of a dot product - confirm against the
# original source.
def gradient(self, observed, at_risk): prediction = self.output risk = T.exp(prediction) product = self.input * (risk * T.ones((1, self.input.shape[0]))) numerator = Te.cumsum(product[::-1])[::-1][at_risk] denominator = Te.cumsum(risk[::-1])[::-1][at_risk] * T.ones((1, self.input.shape[0])) numerator = numerator.flatten() denominator = denominator.flatten() gradient =, self.input - (numerator / denominator)) return gradient
def result(theano, TT):
    """Accumulate the sum of two constant sequences with ``scan`` and evaluate.

    Each step adds the current element of a ones sequence, the current
    element of a twos sequence and the previous output; the recurrence starts
    from ``0.``.  ``theano`` and ``TT`` are the library module and its tensor
    namespace, passed in by the caller.
    """
    ones_seq = TT.ones(10)
    twos_seq = 2 * TT.ones(10)

    def accumulate(left, right, previous):
        return left + right + previous

    running, _updates = theano.scan(
        accumulate,
        sequences=[ones_seq, twos_seq],
        outputs_info=0.,
    )
    compiled = theano.function([], running)
    return compiled()
def result(theano, TT):
    """Add two constant sequences plus a fixed extra term with ``scan``.

    The extra term is supplied through ``non_sequences=1`` and therefore is
    the same at every step.  ``theano`` and ``TT`` are the library module and
    its tensor namespace, passed in by the caller.
    """
    ones_seq = TT.ones(10)
    twos_seq = 2 * TT.ones(10)

    def add_with_offset(left, right, offset):
        return left + right + offset

    shifted, _updates = theano.scan(
        add_with_offset,
        sequences=[ones_seq, twos_seq],
        non_sequences=1,
    )
    compiled = theano.function([], shifted)
    return compiled()
def backward(self, y):
    """Map ``y`` (leading axis of length Km1) to ``x`` with Km1 + 1 entries.

    With z = inverse_logit(y - log(Km1 - k)) for k = 0 .. Km1 - 1, the result
    is x[k] = z[k] * prod(1 - z[j] for j < k), and the extra last entry is
    prod(1 - z[j]) over all j.
    """
    n_breaks = y.shape[0]
    # Index along the leading axis, broadcastable over any trailing axes.
    trailing_axes = (None,) * (y.ndim - 1)
    position = tt.arange(n_breaks)[(slice(None),) + trailing_axes]
    offset = -tt.log(n_breaks - position)  # logit(1./(Km1 + 1 - k))
    fraction = inverse_logit(y + offset)

    one_row = tt.ones(y[:1].shape)
    taken = tt.concatenate([fraction, one_row])
    kept = tt.concatenate([tt.ones(y[:1].shape), 1 - fraction])
    # What is left before each position is the running product of the
    # fractions kept so far.
    remaining = tt.extra_ops.cumprod(kept, 0)
    return remaining * taken
# __init__ (sequence discriminator): concatenates `tf_states` and `rf_states`
# along axis 1 (tf first, rf second), runs one GRU layer over them, averages
# its output over axis 0, and applies two dense layers (rectify, then a linear
# single unit) followed by a sigmoid. The first `mb_size` rows of the
# classification are `p_real`, the rest `p_gen`. Costs use soft targets:
# 0.9 for real and 0.1 for generated in the discriminator cost, 0.9 for
# generated in the generator cost. Trainable parameters of both dense layers
# and the GRU are collected in `self.params`.
# NOTE(review): the `self.accuracy` expression is garbled - the second argument
# of each `T.eq(...)` reads `, 0.5).flatten())`, i.e. the comparison applied to
# `p_real` / `p_gen` against 0.5 was lost (presumably a greater-than for real
# and a less-than for generated). Restore from the original source.
def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states): tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features)) rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features)) hidden_state_features = T.specify_shape(T.concatenate([tf_states, rf_states], axis = 1), (seq_length, mb_size * 2, num_features)) gru_params_1 = init_tparams(param_init_gru(None, {}, prefix = "gru1", dim = num_hidden, nin = num_features)) #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix = "gru2", dim = num_hidden, nin = num_hidden + num_features)) #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix = "gru3", dim = num_hidden, nin = num_hidden + num_features)) gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix = 'gru1')[0] #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis = 2), None, prefix = 'gru2', backwards = True)[0] #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis = 2), None, prefix = 'gru3')[0] final_out_recc = T.specify_shape(T.mean(gru_1_out, axis = 0), (mb_size * 2, num_hidden)) h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify) #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify) #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify) h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units = 1, nonlinearity=None) h_out_1_value = h_out_1.get_output_for(final_out_recc) h_out_4_value = h_out_4.get_output_for(h_out_1_value) raw_y = h_out_4_value #raw_y = T.clip(h_out_4_value, -10.0, 10.0) classification = T.nnet.sigmoid(raw_y) #tf comes before rf. 
p_real = classification[:mb_size] p_gen = classification[mb_size:] #bce = lambda r,t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r)) self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean() self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean() self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean() self.d_cost = self.d_cost_real + self.d_cost_gen self.g_cost = self.g_cost_d self.classification = classification self.params = [] self.params += lasagne.layers.get_all_params(h_out_4,trainable=True) #self.params += lasagne.layers.get_all_params(h_out_3,trainable=True) #self.params += lasagne.layers.get_all_params(h_out_2,trainable=True) self.params += lasagne.layers.get_all_params(h_out_1,trainable=True) self.params += gru_params_1.values() #self.params += gru_params_2.values() #self.params += gru_params_3.values() self.accuracy = T.mean(T.eq(T.ones(p_real.shape).flatten(),, 0.5).flatten())) + T.mean(T.eq(T.ones(p_gen.shape).flatten(),, 0.5).flatten()))
def backward(self, y_):
    """Map ``y_`` to ``x`` along its last axis, adding one entry.

    The input is transposed so the transformed axis comes first, converted
    with z = invlogit(y + logit(1 / (Km1 + 1 - k)), self.eps) for
    k = 0 .. Km1 - 1, and turned into x[k] = z[k] * prod(1 - z[j] for j < k)
    plus a final entry prod(1 - z[j]) over all j.  The result is transposed
    back before it is returned.
    """
    y = y_.T
    n_breaks = y.shape[0]
    # Index along the leading axis, broadcastable over any trailing axes.
    trailing_axes = (None, ) * (y.ndim - 1)
    position = tt.arange(n_breaks)[(slice(None), ) + trailing_axes]
    offset = logit(1. / (n_breaks + 1 - position))  # - tt.log(Km1 - k)
    fraction = invlogit(y + offset, self.eps)

    taken = tt.concatenate([fraction, tt.ones(y[:1].shape)])
    kept = tt.concatenate([tt.ones(y[:1].shape), 1 - fraction])
    # What is left before each position is the running product of the
    # fractions kept so far.
    remaining = tt.extra_ops.cumprod(kept, 0)
    x = remaining * taken
    return x.T
def generate(self, source_sentence, representation, tw_representation, topical_embedding, content_embedding, **kwargs):
    """Delegate generation to ``self.sequence_generator``.

    The number of steps is twice ``source_sentence.shape[1]`` and the batch
    size is ``source_sentence.shape[0]``.  Both attention masks are all ones:
    the regular one is shaped like the transposed source sentence, the topical
    one is the transpose of a (batch, 10) tensor.  Extra keyword arguments are
    forwarded unchanged.
    """
    batch_size = source_sentence.shape[0]
    source_length = source_sentence.shape[1]

    attended_mask = tensor.ones(source_sentence.shape).T
    topical_attended_mask = tensor.ones([source_sentence.shape[0], 10]).T

    return self.sequence_generator.generate(
        n_steps=2 * source_length,
        batch_size=batch_size,
        attended=representation,
        attended_mask=attended_mask,
        topical_attended=tw_representation,
        topical_attended_mask=topical_attended_mask,
        topical_embeddingq=topical_embedding,
        content_embedding=content_embedding,
        **kwargs)
def Meshgrid(height, width):
    """Return a 3 x (height * width) grid of homogeneous coordinates.

    Row 0 holds the x coordinates and row 1 the y coordinates, both running
    from -1 to 1 over ``width`` and ``height`` points respectively; row 2 is
    all ones.  Built symbolically so that the sizes may be symbolic.
    """
    # Outer products replicate the 1-D ranges over the grid:
    #   (height, 1) . (1, width) -> x varies along the columns
    #   (height, 1) . (1, width) -> y varies along the rows
    x_t = T.dot(T.ones((height, 1)),
                Linspace(-1.0, 1.0, width).dimshuffle('x', 0))
    y_t = T.dot(Linspace(-1.0, 1.0, height).dimshuffle(0, 'x'),
                T.ones((1, width)))

    x_t_flat = x_t.reshape((1, -1))
    y_t_flat = y_t.reshape((1, -1))
    ones = T.ones_like(x_t_flat)
    grid = T.concatenate([x_t_flat, y_t_flat, ones], axis=0)
    return grid
def sample(self, shape):
    """
    Draw a sample by calling the parent ``sample`` with two all-ones tensors.

    Parameters
    ----------
    shape : tuple
        sets a shape of the output sample
    """
    first_ones = T.ones(shape)
    second_ones = T.ones(shape)
    parent = super(UnitGammaSample, self)
    return parent.sample(first_ones, second_ones)
def kldiv_m(self, mu, std_r, std_c):
    """Return minus the KL divergence between two matrix-variate Gaussians.

    The first distribution has mean ``mu`` (n x p) with diagonal row and
    column variances ``std_r ** 2`` and ``std_c ** 2``; the second takes its
    mean and scalar row / column standard deviations from
    ``self.get_priors()``.

    KL = 0.5 * (trace term + quadratic term - n * p + log-determinant term)

    The ``- n * p`` term was missing (the expression read ``fa + fb - + fc``),
    which also made the log-determinant term be subtracted instead of added.
    """
    pmu, pstdr, pstdc = self.get_priors()
    var_r, var_c = T.sqr(std_r), T.sqr(std_c)

    # first kl term: product of the two traces
    fa = T.sum((1. / (pstdc ** 2)) * var_c) * T.sum((1. / (pstdr ** 2)) * var_r)

    # second kl term: squared mean difference scaled by the prior variances
    prior_sigma = T.outer(T.ones((mu.shape[0],)) * (pstdr ** 2),
                          T.ones((mu.shape[1],)) * (pstdc ** 2))
    fb = T.sum(T.sqr(mu - pmu) / prior_sigma)

    # third kl term: log-determinant ratio of the row and column covariances
    fc = mu.shape[1] * (mu.shape[0] * T.log(pstdr ** 2) - T.sum(T.log(var_r))) + \
        mu.shape[0] * (mu.shape[1] * T.log(pstdc ** 2) - T.sum(T.log(var_c)))

    # number of matrix entries (n * p)
    n_entries = mu.shape[0] * mu.shape[1]
    return - 0.5 * (fa + fb - n_entries + fc)
# log_likelihood: splits the factor matrices `self.L` and `self.R` into latent
# factors (all but the last two columns), bias columns and "outer" columns,
# accumulates a log-likelihood over the non-zero entries of `self.counts`,
# subtracts a second-order Taylor approximation of ln(exp(x) + 1) built as
# low-rank factors, subtracts L2 penalties on the latent factors and penalties
# keeping the outer columns at one, and returns the negated total (a cost to
# minimise).
# NOTE(review): two statements have lost the head of a call and its first
# argument: `loglik =, self.R, self.counts.nonzero(), fast=False), np.array(...))`
# and `loglik -=[0], A[1], self.counts.nonzero(), fast=False), np.array(...))`.
# They presumably combine a sparse low-rank product with the non-zero counts,
# but the missing callees cannot be determined from this view - restore from
# the original source.
def log_likelihood(self): Users = self.L[:, :-2] Items = self.R[:, :-2] UserBiases = self.L[:, -1] ItemBiases = self.R[:, -2] UserOuter = self.L[:, -2] ItemOuter = self.R[:, -1] ## A =, Items.T) ## A += UserBiases ## A += ItemBiases.T ## B = A * self.counts ## loglik = T.sum(B) # A implicitly stored as self.L @ self.R.T # loglik = T.sum(A * self.counts) => sum over nonzeros only print('nnz size: {}'.format(self.counts.nonzero()[0].size)) loglik =, self.R, self.counts.nonzero(), fast=False), np.array(self.counts[self.counts.nonzero()]).ravel()) ## A = T.exp(A) ## A += 1 ## A = T.log(A) # There we use Taylor series ln(exp(x) + 1) = ln(2) + x/2 + x^2/8 + O(x^4) at x=0 # ln(2) const_term = (T.ones((self.num_users, 1)) * np.log(2), T.ones((self.num_items, 1))) # x/2 first_order_term = (0.5 * self.L, 0.5 * self.R) # x^2/8 second_order_term = hadamard((self.L, self.R), (self.L, self.R), self.num_factors) second_order_term = tuple(factor / 8.0 for factor in second_order_term) grouped_factors = list(zip(const_term, first_order_term, second_order_term)) A = (T.concatenate(grouped_factors[0], axis=1), T.concatenate(grouped_factors[1], axis=1)) ## A = (self.counts + 1) * A ## loglik -= T.sum(A) loglik -= sum_lowrank(A) loglik -=[0], A[1], self.counts.nonzero(), fast=False), np.array(self.counts[self.counts.nonzero()]).ravel()) # L2 regularization loglik -= 0.5 * self.reg_param * T.sum(T.square(Users)) loglik -= 0.5 * self.reg_param * T.sum(T.square(Items)) # we need strictly maintain UserOuter and ItemOuter be ones, just to ensure they properly # outer products with biases loglik -= self.num_users * T.sum(T.square(UserOuter - 1)) loglik -= self.num_items * T.sum(T.square(ItemOuter - 1)) # Return negation of LogLikelihood cause we will minimize cost return -loglik
def cost(y, y_hat_softmax, y_mask=None, mask=None):
    """
    Compute the CTC cost from the forward computation only.

    Unlike the vanilla cost, this function inserts the blank symbols into the
    targets itself before scoring them.

    Notes
    -----
    y_hat_softmax should be the output of a softmax layer (unlike
    pseudo_cost, which takes energies). Do not differentiate this cost; use
    pseudo_cost for gradients. This function is intended for monitoring the
    cost during training.

    Parameters
    ----------
    y : matrix (num_batch, target_seq_len)
        the target label sequences
    y_hat_softmax : tensor3 (num_batch, input_seq_len, num_classes + 1)
        class probability distribution sequences, potentially in log domain
    y_mask : matrix (num_batch, output_seq_len)
        indicates which values of y to use; all ones when omitted
    mask : matrix (num_batch, input_seq_len)
        indicates the lengths of the sequences in y_hat; all ones when omitted
    """
    float_x = theano.config.floatX

    # Move the time axis first (the inputs arrive batch-first).
    y_hat_softmax = y_hat_softmax.dimshuffle(1, 0, 2)
    y = y.dimshuffle(1, 0)

    y_mask = (T.ones(y.shape, dtype=float_x) if y_mask is None
              else y_mask.dimshuffle(1, 0))
    mask = (T.ones((y_hat_softmax.shape[0], y_hat_softmax.shape[1]),
                   dtype=float_x) if mask is None
            else mask.dimshuffle(1, 0))

    # The last class index doubles as the blank symbol.
    num_classes = y_hat_softmax.shape[2] - 1
    blanked_y, blanked_y_mask = _add_blanks(
        y=y, blank_symbol=num_classes, y_mask=y_mask)

    log_likelihood = sequence_log_likelihood(
        blanked_y, y_hat_softmax, blanked_y_mask, mask, num_classes)
    return -log_likelihood
# f: iterative query-reformulation graph. Query word indices `q_i` are embedded
# (index -1 maps to the appended 'UNK' row; indices <= -2 are masked out),
# then for `prm.n_iterations` rounds candidate words are embedded, combined
# with an averaged query representation, scored by a critic head (`bl`, tanh
# output) and an actor head (`prob`, a masked 2-way softmax), a word selection
# `ans` is sampled or taken greedily, and `search` is called to obtain metrics
# and the next candidates. Each round appends
# [prob, ans, metrics, bl, D_m_r, D_id_] to the returned list.
# NOTE(review): four affine maps have lost the head of the call and its first
# argument: `D_aa_ =, tparams['Ad']) + tparams['bAd']`,
# `q_aa_att =, tparams['Aq'])`, `bl =, tparams['C'+str(i)]) + ...` and
# `z =, tparams['V'+str(i)]) + ...`. They are presumably dot products of the
# variable being reassigned with the named weight, but this must be confirmed
# against the original source.
# NOTE(review): `q_a_avg` is computed but not used in the visible code.
def f(q_i, D_gt_id, tparams, is_train, trng, options): # Use search engine again to compute the reward/metrics given a query. search = Search(options) # append the unknown vector for words whose index = -1. W_ = tensor.concatenate([tparams['W'], tparams['UNK']], axis=0) q_m = (q_i > -2).astype('float32') #get embeddings for the queries q_a = W_[q_i.flatten()].reshape((q_i.shape[0], q_i.shape[1], prm.dim_emb)) * q_m[:,:,None] if len(prm.filters_query) > 0: q_aa = conv_query(q_a, tparams) else: q_aa = q_a q_a_avg = q_a.sum(1) / tensor.maximum(1., q_m.sum(1, keepdims=True)) out = [] for n_iter in range(prm.n_iterations): if n_iter == 0 and prm.q_0_fixed_until >= prm.n_iterations: prob = tensor.zeros((q_a.shape[0], prm.max_words_input, 2)) bl = tensor.zeros((q_a.shape[0],)) D_m_r = tensor.zeros((q_a.shape[0], prm.max_words_input)) else: if n_iter > 0: D_m_ = (D_i_ > -2).astype('float32') D_a_ = W_[D_i_.flatten()].reshape((D_i_.shape[0], D_i_.shape[1], D_i_.shape[2], prm.dim_emb)) * D_m_[:,:,:,None] else: D_a_ = 1. * q_a[:,None,:,:] D_m_ = 1. * q_m[:,None,:] if len(prm.filters_cand) > 0: D_aa_ = conv_cand(D_a_, tparams, 0) else: D_aa_ = D_a_ D_aa_ =, tparams['Ad']) + tparams['bAd'] if n_iter > 0: if prm.q_0_fixed_until < 2: D_a = tensor.concatenate([D_a, D_a_], axis=1) D_aa = tensor.concatenate([D_aa, D_aa_], axis=1) D_m = tensor.concatenate([D_m, D_m_], axis=1) else: D_a = D_a_ D_aa = D_aa_ D_m = D_m_ else: D_a = D_a_ D_aa = D_aa_ D_m = D_m_ D_a_r = D_a.reshape((D_a.shape[0], -1, D_a.shape[3])) D_aa_r = D_aa.reshape((D_aa.shape[0], -1, D_aa.shape[3])) D_m_r = D_m.reshape((D_m.shape[0],-1)) q_aa_avg = q_aa.sum(1) / tensor.maximum(1., q_m.sum(1, keepdims=True)) q_aa_att = q_aa_avg[:,None,:] q_aa_att =, tparams['Aq']) z = D_aa_r + q_aa_att # estimate reward based on the query. 
bl = theano.gradient.grad_scale(z, 0.1) D_m_r_c = theano.gradient.disconnected_grad(D_m_r) bl = bl.sum(1) / tensor.maximum(1., D_m_r_c.sum(1))[:,None] for i in range(len(prm.n_hidden_critic)+1): if prm.dropout > 0: bl = dropout_layer(bl, is_train, trng) bl = tensor.maximum(0., bl) bl =, tparams['C'+str(i)]) + tparams['bC'+str(i)] bl = tensor.tanh(bl) bl = bl.flatten() for i in range(len(prm.n_hidden_actor)+1): if prm.dropout > 0: z = dropout_layer(z, is_train, trng) z = tensor.maximum(0., z) z =, tparams['V'+str(i)]) + tparams['bV'+str(i)] prob = softmax_mask(z) * D_m_r[:,:,None] # if training, sample. Otherwise, pick maximum probability. s = trng.multinomial(n=1, pvals=prob.reshape((-1, 2)), dtype=prob.dtype) s = s.reshape((prob.shape[0],prob.shape[1],prob.shape[2])) #if frozen is enabled and this iteration is within its limit, pick maximum probability. if prm.frozen_until > 0: if n_iter < prm.frozen_until: s = prob res = tensor.eq(is_train,1.) * s + tensor.eq(is_train,0.) * prob # final answer & valid words ans = res.argmax(2) * D_m_r if n_iter < prm.q_0_fixed_until: ones = tensor.ones((q_a.shape[0], prm.max_words_input)) if n_iter > 0: # select everything from the original query in the first iteration. ans = tensor.concatenate([ones, ans], axis=1) else: ans = ones metrics, D_i_, D_id_, D_gt_m_ = search(ans, D_gt_id, n_iter, is_train) out.append([prob, ans, metrics, bl, D_m_r, D_id_]) return out
def set_up(self, config=None, make_prunable=False):
    """Load and initialize all Theano variables for training and decoding.

    Builds the encoder and decoder bricks selected by ``config``, the
    training cost and its computation graph, applies optional dropout and
    weight noise, logs parameter statistics, and builds the sampling model.

    NOTE(review): reconstructed from a whitespace-collapsed source in which
    ``logging.info(``, ``self.cg`` and ``x.name`` had been stripped; confirm
    the restored names against the upstream file.

    Args:
        config (dict): NMT configuration. If falsy, ``self.config`` is used;
            otherwise it is stored as ``self.config``.
        make_prunable (bool): forwarded to ``Decoder``.

    Sets:
        ``self.sampling_input``, ``self.cost``, ``self.cg``,
        ``self.training_model``, ``self.search_model``, ``self.samples``,
        ``self.encoder`` and ``self.decoder``.
    """
    if config:
        self.config = config
    else:
        config = self.config

    # Create Theano variables
    logging.debug('Creating theano variables')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence_mask = tensor.matrix('target_mask')

    # Construct model (fs439: Add NoLookup options)
    if config['dec_layers'] != 1:
        logging.fatal("Only dec_layers=1 supported.")
    logging.debug('Building RNN encoder-decoder')
    if config['src_sparse_feat_map']:
        if config['enc_layers'] != 1:
            logging.fatal("Only enc_layers=1 supported for sparse "
                          "source features.")
        source_sentence = tensor.tensor3('source')
        self.sampling_input = tensor.tensor3('input')
        encoder = NoLookupEncoder(config['enc_embed'], config['enc_nhids'])
    else:
        source_sentence = tensor.lmatrix('source')
        self.sampling_input = tensor.lmatrix('input')
        if config['enc_layers'] > 1 and not config['enc_share_weights']:
            encoder = DeepBidirectionalEncoder(
                config['src_vocab_size'],
                config['enc_embed'],
                config['enc_layers'],
                config['enc_skip_connections'],
                config['enc_nhids'])
        else:
            encoder = BidirectionalEncoder(config['src_vocab_size'],
                                           config['enc_embed'],
                                           config['enc_layers'],
                                           config['enc_skip_connections'],
                                           config['enc_nhids'])
    if config['trg_sparse_feat_map']:
        target_sentence = tensor.tensor3('target')
        decoder = NoLookupDecoder(
            config['trg_vocab_size'],
            config['dec_embed'],
            config['dec_nhids'],
            config['att_nhids'],
            config['maxout_nhids'],
            config['enc_nhids'] * 2,
            config['attention'],
            config['dec_attention_sources'],
            config['dec_readout_sources'],
            config['memory'],
            config['memory_size'],
            config['seq_len'],
            config['dec_init'])
    else:
        target_sentence = tensor.lmatrix('target')
        decoder = Decoder(config['trg_vocab_size'],
                          config['dec_embed'],
                          config['dec_nhids'],
                          config['att_nhids'],
                          config['maxout_nhids'],
                          config['enc_nhids'] * 2,
                          config['attention'],
                          config['dec_attention_sources'],
                          config['dec_readout_sources'],
                          config['memory'],
                          config['memory_size'],
                          config['seq_len'],
                          config['dec_init'],
                          make_prunable=make_prunable)
    if config['annotations'] != 'direct':
        annotators = []
        add_direct = False
        for name in config['annotations'].split(','):
            if name == 'direct':
                add_direct = True
            elif name == 'hierarchical':
                annotators.append(HierarchicalAnnotator(encoder))
            else:
                logging.fatal("Annotation strategy %s unknown" % name)
        encoder = EncoderWithAnnotators(encoder, annotators, add_direct)
    annotations, annotations_mask = encoder.apply(source_sentence,
                                                  source_sentence_mask)
    self.cost = decoder.cost(annotations,
                             annotations_mask,
                             target_sentence,
                             target_sentence_mask)

    logging.info('Creating computational graph')
    self.cg = ComputationGraph(self.cost)

    # Initialize model
    logging.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    try:
        encoder.bidir.prototype.weights_init = Orthogonal()
    except AttributeError:
        pass  # Its fine, no bidirectional encoder
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logging.info('Applying dropout')
        # NOTE(review): `intermediary_variables` is the reconstructed
        # attribute name; confirm against upstream.
        dropout_inputs = [
            x for x in self.cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        self.cg = apply_dropout(self.cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logging.info('Applying weight noise to ff layers')
        # Start from an empty list so the accumulation below works even when
        # the encoder has no lookup table, and materialize `.values()` so
        # that `+=` / `+` also work with Python 3 dict views.
        enc_params = []
        if encoder.lookup:
            enc_params += list(
                Selector(encoder.lookup).get_parameters().values())
        enc_params += list(
            Selector(encoder.fwd_fork).get_parameters().values())
        enc_params += list(
            Selector(encoder.back_fork).get_parameters().values())
        dec_params = list(Selector(
            decoder.sequence_generator.readout).get_parameters().values())
        dec_params += list(Selector(
            decoder.sequence_generator.fork).get_parameters().values())
        self.cg = apply_noise(self.cg,
                              enc_params + dec_params,
                              config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in self.cg.parameters]
    logging.debug("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logging.debug(' {:15}: {}'.format(shape, count))
    logging.debug("Total number of CG parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logging.debug("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logging.debug(' {:15}: {}'.format(value.get_value().shape, name))
    logging.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logging.info("Building model")
    self.training_model = Model(self.cost)

    logging.info("Building sampling model")
    src_shape = (self.sampling_input.shape[-2],
                 self.sampling_input.shape[-1])  # batch_size x sen_length
    # Sampling uses an all-ones mask: every source position is valid.
    sampling_representation, _ = encoder.apply(self.sampling_input,
                                               tensor.ones(src_shape))
    generated = decoder.generate(src_shape, sampling_representation)
    self.search_model = Model(generated)
    generated_outputs = VariableFilter(
        bricks=[decoder.sequence_generator],
        name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs
    self.samples = generated_outputs[1]
    self.encoder = encoder
    self.decoder = decoder
def step(
    input_n,
    cell_previous,
    hid_previous,
    visual,
    W_hid_stacked,
    W_in_stacked,
    b_stacked,
    W_cell_to_ingate,
    W_cell_to_forgetgate,
    W_cell_to_outgate,
    W_h_to_attenGate,
    W_g_to_attenGate,
    W_v_to_attenGate,
    W_s_to_attenGate,
    W_p
):
    """Compute one recurrence step of the attention-gated LSTM.

    Besides the usual LSTM cell/hidden update, an extra gate (``ggate``)
    produces ``st`` from the new cell state; attention weights over the
    visual features plus ``st`` are computed and the attended context is
    projected with ``W_p``.

    NOTE(review): reconstructed from a whitespace-collapsed source in which
    every ``T.dot(<first operand>`` was stripped. The first operands
    (``input_n``, ``hid_previous``, ``visual``, ``hid``) are inferred from the
    surviving text and must be confirmed against the upstream file.
    ``self``, ``slice_w`` and ``visual_input`` come from the enclosing scope.

    Returns:
        ``[cell, hid, It]``: new cell state, new hidden state, and the
        projected attention output.
    """
    if not self.precompute_input:
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Calculate gates pre-activations and slice
    gates = input_n + T.dot(hid_previous, W_hid_stacked)

    # Clip gradients
    if self.grad_clipping:
        gates = theano.gradient.grad_clip(
            gates, -self.grad_clipping, self.grad_clipping)

    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)
    ggate = slice_w(gates, 4)

    if self.peepholes:
        # Compute peephole connections
        ingate += cell_previous*W_cell_to_ingate
        forgetgate += cell_previous*W_cell_to_forgetgate

    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)
    # ggate gt
    ggate = self.nonlinearity_ggate(ggate)

    # Compute new cell value
    cell = forgetgate*cell_previous + ingate*cell_input

    if self.peepholes:
        outgate += cell*W_cell_to_outgate
    outgate = self.nonlinearity_outgate(outgate)

    # Compute new hidden unit activation
    hid = outgate*self.nonlinearity(cell)
    st = ggate*self.nonlinearity(cell)

    # To avoid an optimization failure of a 3-D tensor dotted with a vector,
    # e = dot(A, B) is rewritten as e = (A * B.dimshuffle('x', 'x', 0)).sum(2).
    # NOTE(review): `visual` and `hid` as first operands are inferred.
    zt_dot_A = self.nonlinearity(
        T.dot(visual, W_v_to_attenGate) +
        T.dot(
            T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'),
            T.ones((1, self.video_len))
        )
    )
    zt = zt_dot_A*W_h_to_attenGate.dimshuffle('x', 'x', 0)
    zt = zt.sum(axis=2)

    # Same rewrite for the 2-D case: dot(A, w) == (A * w[None, :]).sum(1).
    vt_dot_A = self.nonlinearity(
        T.dot(
            st, W_s_to_attenGate
        ) +
        T.dot(
            hid, W_g_to_attenGate
        )
    )
    vt = vt_dot_A*W_h_to_attenGate.dimshuffle('x', 0)
    vt = vt.sum(axis=1)
    vt = vt.dimshuffle(0, 'x')

    # Attention weights over the visual positions plus the extra `st` slot.
    alpha_hat_t = self.nonlinearity_attenGate(T.concatenate(
        [zt, vt],
        axis=-1
    ))
    # NOTE(review): `visual_input` is not a parameter of this function; it is
    # presumably captured from the enclosing scope — confirm.
    feature = T.concatenate(
        [visual_input, st.dimshuffle(0, 'x', 1)],
        axis=1
    ).dimshuffle(2, 0, 1)
    c_hat_t = T.sum(alpha_hat_t*feature, axis=-1)
    It = T.dot(
        (c_hat_t.T+hid), W_p
    )
    return [cell, hid, It]
def categorical_crossentropy_of_mean(predictions):
    """Cross-entropy between the batch-mean prediction and a uniform target.

    ``predictions`` is averaged over axis 0 (keeping the axis) and compared
    with a ``(1, num_classes)`` row whose entries are all ``1 / num_classes``,
    where ``num_classes`` is ``predictions.shape[1]``.
    """
    n_classes = predictions.shape[1]
    uniform = T.ones((1, n_classes)) / n_classes
    mean_prediction = predictions.mean(axis=0, keepdims=True)
    return categorical_crossentropy(mean_prediction, uniform)
def another_simple_model():
    """Return the simple test model with an extra all-ones potential.

    Takes element 1 of ``models.simple_model()`` and registers a 10x10
    potential named ``'pot'`` on it before returning it.
    """
    simple = models.simple_model()
    model = simple[1]
    with model:
        potential_value = tt.ones((10, 10))
        pm.Potential('pot', potential_value)
    return model
def ones(self, shape, dtype=None, name=None):
    """Return a symbolic tensor of ones with the given shape.

    A falsy ``dtype`` falls back to ``self.floatx()``. ``name`` is accepted
    for interface compatibility but is not used by this implementation.
    """
    if not dtype:
        dtype = self.floatx()
    return T.ones(shape, dtype=dtype)
def lohhla_clone_model(sample_ids,
                       tree_edges,
                       clonal_prevalence_mat,
                       cellularity,
                       ploidy_values,
                       tumour_sample_reads,
                       normal_sample_reads,
                       integercpn_info,
                       all_genotypes,
                       transition_inputs,
                       stayrate_alpha=0.9,
                       stayrate_beta=0.1,
                       sd=0.5,
                       nb_alpha=0.5,
                       iter_count=20000,
                       tune_iters=20000,
                       anchor_type='nb',
                       anchor_mode='snvcn',
                       nchains=2,
                       njobs=2):
    '''
    Build and sample the clone-tree genotype model.

    NOTE(review): reconstructed from a whitespace-collapsed source in which
    the ``tt.dot(<first operand>`` calls were stripped;
    ``clonal_prevalence_mat`` as first operand is inferred and must be
    confirmed against the upstream file.

    stayrate_alpha: Beta prior alpha-parameter on stayrate in clone tree
        Markov chain (not read by the code below, which uses a bounded Normal)
    stayrate_beta: Beta prior beta-parameter on stayrate in clone tree
        Markov chain (not read by the code below)
    all_genotypes: Dataframe of genotypes, 0-indexed; columns ``total_cn``
        and ``alt_cn`` are read
    transition_inputs: dict with keys ``valid_transitions``,
        ``num_transitions``, ``num_genotypes`` and ``cn_genotype_matrix``
    anchor_mode: ``'snvcn'`` or ``'binmedian'`` (selects coverage columns)
    anchor_type: ``'mult_factor'`` or ``'nb'`` (selects coverage likelihood)

    Returns the trace produced by ``pm.sample``.
    Raises ``Exception`` for an unknown ``anchor_mode`` or ``anchor_type``.
    '''
    num_nodes = clonal_prevalence_mat.shape[1]

    valid_transitions = transition_inputs['valid_transitions']
    num_transitions = transition_inputs['num_transitions']
    num_genotypes = transition_inputs['num_genotypes']
    cn_genotype_matrix = transition_inputs['cn_genotype_matrix']

    ## Beta-binomial dispersion (higher = less dispersed)
    dispersion = 200.

    ## Tree edges (converted from 1-based to 0-based node indices)
    edges = tree_edges.as_matrix().astype(int) - 1

    with pm.Model() as model:
        BoundedNormal = pm.Bound(pm.Normal, lower=0., upper=1.)
        stay_rate = BoundedNormal('stayrate', mu=0.75, sd=0.4)

        # Transition matrix: stay_rate on the diagonal, the remaining mass
        # spread evenly over the valid transitions of each row; genotype 0
        # is made absorbing.
        P = np.zeros(shape=(num_genotypes, num_genotypes))
        P = P + tt.eye(num_genotypes) * stay_rate

        fill_values = tt.as_tensor((1. - stay_rate) / num_transitions)
        fill_values = tt.set_subtensor(fill_values[0], 0)

        P = P + valid_transitions * fill_values[:, np.newaxis]
        P = tt.set_subtensor(P[0, 0], 1.)

        # Uniform prior over genotypes.
        PA = tt.ones(shape=(num_genotypes)) / num_genotypes

        states = CloneTreeGenotypes('genotypes',
                                    PA=PA,
                                    P=P,
                                    edges=edges,
                                    k=num_genotypes,
                                    shape=(num_nodes))

        total_cns = theano.shared(np.array(all_genotypes['total_cn'].values))
        alt_cns = theano.shared(np.array(all_genotypes['alt_cn'].values))

        total_cn = pm.Deterministic('total_cn', total_cns[states])
        alt_cn = pm.Deterministic('alt_cn', alt_cns[states])

        # Expected alt / total copies per sample, mixing tumour clones with
        # normal contamination (1 alt copy out of 2).
        sample_alt_copies = tt.dot(
            clonal_prevalence_mat,
            alt_cn) * cellularity + (1. - cellularity) * 1.
        vafs = sample_alt_copies / (
            tt.dot(clonal_prevalence_mat, total_cn) * cellularity +
            (1. - cellularity) * 2.)
        pm.Deterministic('vafs', vafs)

        alphas = vafs * dispersion
        betas = (1 - vafs) * dispersion

        ## Copy number of tumour cells (aggregated over clones, but not including normal contamination)
        tutotalcn = pm.Deterministic(
            'tutotalcn', tt.dot(clonal_prevalence_mat, total_cn))

        ## Can't be vectorized further
        for j in range(len(sample_ids)):
            current_sample = sample_ids[j]

            total_counts = integercpn_info['TumorCov_type1'][
                current_sample].values + integercpn_info['TumorCov_type2'][
                    current_sample].values
            alt_counts = integercpn_info['TumorCov_type2'][
                current_sample].values

            alpha_sel = alphas[j]
            beta_sel = betas[j]

            ## Draw alternative allele counts for HLA locus for each polymorphic site
            alt_reads = pm.BetaBinomial('x_' + str(j),
                                        alpha=alpha_sel,
                                        beta=beta_sel,
                                        n=total_counts,
                                        observed=alt_counts)

            mult_factor_mean = (tumour_sample_reads[current_sample] /
                                normal_sample_reads)

            ploidy = ploidy_values[j]
            ploidy_ratio = (tutotalcn[j] * cellularity[j] +
                            (1 - cellularity[j]) * 2) / (
                                cellularity[j] * ploidy +
                                (1 - cellularity[j]) * 2)

            if anchor_mode == 'snvcn':
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    1. / ploidy_ratio *
                    (integercpn_info['Total_TumorCov'][current_sample].values /
                     integercpn_info['Total_NormalCov'][current_sample].values)
                )
                nloci = len(
                    integercpn_info['Total_TumorCov'][current_sample].values)
                tumour_reads_observed = integercpn_info['Total_TumorCov'][
                    current_sample].values
                normal_reads_observed = integercpn_info['Total_NormalCov'][
                    current_sample].values
            elif anchor_mode == 'binmedian':
                binvar_tumour = 'combinedBinTumor'
                binvar_normal = 'combinedBinNormal'

                ## All within a bin are the same, so this is OK
                duplicated_entries = integercpn_info['binNum'][
                    current_sample].duplicated(keep='first')
                nloci = len(integercpn_info[binvar_tumour][current_sample]
                            [~duplicated_entries].values)
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    (1. / ploidy_ratio *
                     (integercpn_info[binvar_tumour][current_sample]
                      [~duplicated_entries].values /
                      integercpn_info[binvar_normal][current_sample]
                      [~duplicated_entries].values)))
                tumour_reads_observed = integercpn_info[binvar_tumour][
                    current_sample][~duplicated_entries].values
                normal_reads_observed = integercpn_info[binvar_normal][
                    current_sample][~duplicated_entries].values
            else:
                raise Exception("Invalid option specified.")

            ## Draw ploidy-corrected tumour/normal locus coverage ratio for each polymorphic site
            if anchor_type == 'mult_factor':
                mult_factor = pm.Lognormal('mult_factor_' + str(j),
                                           mu=np.log(mult_factor_mean),
                                           sd=sd,
                                           observed=mult_factor_computed,
                                           shape=(nloci))
            elif anchor_type == 'nb':
                tc_nc_ratio = pm.Deterministic(
                    'tc_nc_ratio_' + str(j),
                    (tutotalcn[j] * cellularity[j] +
                     (1 - cellularity[j]) * 2) /
                    (ploidy * cellularity[j] + (1 - cellularity[j]) * 2))
                tumoursamplecn = pm.Deterministic(
                    'tumoursamplecn_' + str(j),
                    (tutotalcn[j] * cellularity[j] +
                     (1 - cellularity[j]) * 2))
                tumour_reads_mean = pm.Deterministic(
                    'tumour_reads_mean_' + str(j),
                    tc_nc_ratio * mult_factor_mean * normal_reads_observed)
                tumour_reads = pm.NegativeBinomial(
                    'tumour_reads_' + str(j),
                    mu=tumour_reads_mean,
                    alpha=nb_alpha,
                    observed=tumour_reads_observed)
            else:
                raise Exception('Must specify a valid model type.')

        pm.Deterministic('log_prob', model.logpt)

        # Discrete genotypes use a Gibbs-style sampler, the continuous stay
        # rate a Metropolis step.
        step1 = pm.CategoricalGibbsMetropolis(vars=[states])
        step2 = pm.Metropolis(vars=[stay_rate])

        trace = pm.sample(iter_count,
                          tune=tune_iters,
                          step=[step1, step2],
                          njobs=njobs,
                          chains=nchains)

    return trace
def best_right_path_cost(pred, mask, token, blank=0):
    '''
    Cost and labels of the single best alignment path for a batch of
    sentences (min-sum dynamic programme over negative log-probabilities,
    followed by a backward pass that recovers the path).

    :param pred: (T, nb, voca_size+1) per-step class probabilities, e.g. (4,1,3)
    :param mask: (nb, T); its row sums give the valid prediction lengths
    :param token: (nb, U) target labels, -1 for NIL, e.g. (1,2)
    :param blank: index of the blank label
    :return: best_right_path_cost (nb,)
    :return: argmin_token (nb, T) labels along the best path, -1 where mask is 0
    '''
    pred_len = mask.sum(axis=-1).astype('int32')
    eps = theano.shared(np.float32(1e-35))  # keeps log() finite
    EPS = theano.shared(np.float32(35))  # initial cost of non-start positions
    t = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank: interleave a blank before every label and append a
    # final blank.
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape(
            (nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones(
            (nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.tile(T.arange(nb), (length, 1)).T,
                token_with_blank]  # (T, nb, 2U+1)
    pred = -T.log(pred + eps)

    # recurrence relation
    # sec_diag enables the skip-by-two transition only between different,
    # non-blank labels.
    # NOTE(review): m_eye(length, k=...) presumably builds a k-shifted
    # identity matrix — confirm against its definition.
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile(
        (m_eye(length) + m_eye(length, k=1)),
        (nb, 1, 1)) + T.tile(m_eye(length, k=2),
                             (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    # Allowed transitions get cost ~0, forbidden ones cost -log(eps).
    recurrence_relation = -T.log(recurrence_relation + eps).astype(floatX)

    # alpha
    alpha = T.ones_like(token_with_blank, dtype=floatX) * EPS
    alpha = T.set_subtensor(alpha[:, :2],
                            pred[0, :, :2])  ################(nb, 2U+1)

    # dynamic programming
    # (T, nb, 2U+1)
    # Each step returns the best accumulated cost per position and the
    # predecessor position that achieved it.
    [log_probability, argmin_pos_1], _ = theano.scan(lambda curr, accum: (
        (accum[:, :, None] + recurrence_relation).min(axis=1) + curr,
        (accum[:, :, None] + recurrence_relation).argmin(axis=1)),
        sequences=[pred[1:]],
        outputs_info=[alpha, None])

    # why pred_len-2?
    # The scan runs over pred[1:], so its output index i corresponds to time
    # step i+1; the last valid step pred_len-1 is therefore at index
    # pred_len-2.
    labels_1 = log_probability[pred_len - 2, T.arange(nb),
                               2 * token_len - 1]  # (nb,)
    labels_2 = log_probability[pred_len - 2, T.arange(nb),
                               2 * token_len]  # (nb,)
    concat_labels = T.concatenate([labels_1[:, None], labels_2[:, None]],
                                  axis=-1)
    argmin_labels = concat_labels.argmin(axis=-1)

    cost = concat_labels.min(axis=-1)

    min_path = T.ones((t - 1, nb), dtype=intX) * -1  # -1 for null
    min_path = T.set_subtensor(min_path[pred_len - 2, T.arange(nb)],
                               2 * token_len - 1 + argmin_labels)  # (T-1, nb)

    # Backward pass: follow the stored predecessors from the end position.
    min_full_path, _ = theano.scan(
        lambda m_path, argm_pos, m_full_path: argm_pos[
            T.arange(nb), T.maximum(m_path, m_full_path).astype('int32')].astype('int32'),
        sequences=[min_path[::-1], argmin_pos_1[::-1]],
        outputs_info=[min_path[-1]])
    argmin_pos = T.concatenate((min_full_path[::-1], min_path[-1][None, :]),
                               axis=0)  # (T, nb)
    argmin_pos = T.set_subtensor(argmin_pos[pred_len - 1, T.arange(nb)],
                                 2 * token_len - 1 + argmin_labels)

    argmin_token = token_with_blank[T.arange(nb)[None, :], argmin_pos]

    # (nb,), (nb, T)
    # `x * mask + mask - 1` keeps x where mask is 1 and yields -1 where it
    # is 0.
    return cost, (argmin_token.transpose((1, 0)) * mask + mask - 1).astype(
        'int32'
    )  # alpha, log_probability, argmin_pos_1, argmin_labels, min_path, min_full_path, argmin_pos, token_with_blank, argmin_token
def ctc_cost(pred, pred_len, token, blank=0):
    '''
    CTC cost for a batch of sentences, computed with a forward recursion in
    probability space (not log space).

    :param pred: (T, nb, voca_size+1) per-step class probabilities, e.g. (4,1,3)
    :param pred_len: (nb,) valid length of each prediction
    :param token: (nb, U) target labels, -1 for NIL, e.g. (1,2)
    :param blank: index of the blank label
    :return: ctc_cost, one value per batch entry
    '''
    eps = theano.shared(np.float32(1e-35))  # keeps the final log() finite
    Time = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank: interleave a blank before every label and append a
    # final blank.
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape(
            (nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones(
            (nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[T.arange(Time)[:, None, None], T.arange(nb)[None, :, None],
                token_with_blank[None, :, :]]  # (T, nb, 2U+1)

    # recurrence relation
    # sec_diag enables the skip-by-two transition only between different,
    # non-blank labels.
    # NOTE(review): m_eye(length, k=...) presumably builds a k-shifted
    # identity matrix — confirm against its definition.
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile(
        (m_eye(length) + m_eye(length, k=1)),
        (nb, 1, 1)) + T.tile(m_eye(length, k=2),
                             (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = recurrence_relation.astype(floatX)

    # alpha: only the first two positions can be occupied at time step 0.
    alpha = T.zeros_like(token_with_blank, dtype=floatX)
    alpha = T.set_subtensor(alpha[:, :2],
                            pred[0, :, :2])  ################(nb, 2U+1)

    # dynamic programming
    # (T, nb, 2U+1)
    probability, _ = theano.scan(lambda curr, accum: T.sum(
        accum[:, :, None] * recurrence_relation, axis=1) * curr,
        sequences=[pred[1:]],
        outputs_info=[alpha])

    # T.batched_dot(accum[:, None, :], recurrence_relation)[:, 0] * curr,
    # The scan runs over pred[1:], so the last valid time step pred_len-1 is
    # at output index pred_len-2. The two end positions (last label, final
    # blank) are summed.
    labels_2 = probability[pred_len - 2, T.arange(nb), 2 * token_len - 1]
    labels_1 = probability[pred_len - 2, T.arange(nb), 2 * token_len]
    labels_prob = labels_2 + labels_1
    cost = -T.log(labels_prob + eps)
    return cost
def top_k_right_path_cost(pred, mask, token, k, blank=0):
    '''
    Costs and labels of the k best alignment paths for a batch of sentences
    (k-best variant of the min-sum dynamic programme, followed by a backward
    pass that recovers each path).

    :param pred: (T, nb, voca_size+1) per-step class probabilities, e.g. (4,1,3)
    :param mask: (nb, T); its row sums give the valid prediction lengths
    :param token: (nb, U) target labels, -1 for NIL, e.g. (1,2)
    :param k: number of paths to keep
    :param blank: index of the blank label
    :return: top_k_path_cost (nb, k)
    :return: argmin_k_token labels along the k best paths, -1 for null;
        the final transpose((1, 0, 2)) of a (T, nb, k) tensor gives (nb, T, k)
    '''
    pred_len = mask.sum(axis=-1).astype('int32')
    eps = theano.shared(np.float32(1e-35))  # keeps log() finite
    EPS = theano.shared(np.float32(35))  # initial cost of unused slots
    t = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank: interleave a blank before every label and append a
    # final blank.
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape(
            (nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones(
            (nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.tile(T.arange(nb), (length, 1)).T,
                token_with_blank]  # (T, nb, 2U+1)
    pred = -T.log(pred + eps)

    # recurrence relation
    # NOTE(review): m_eye(length, k=...) presumably builds a k-shifted
    # identity matrix — confirm against its definition.
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile(
        (m_eye(length) + m_eye(length, k=1)),
        (nb, 1, 1)) + T.tile(m_eye(length, k=2),
                             (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = -T.log(recurrence_relation + eps).astype(floatX)

    # alpha: only slot 0 of the k slots starts with real costs.
    alpha = T.ones((nb, k, length), dtype=floatX) * EPS
    alpha = T.set_subtensor(alpha[:, 0, :2], pred[0, :, :2])  #(nb, k, 2U+1)

    def step_func_1(curr, accum):
        '''
        Forward step: keep the k cheapest predecessors for every position.

        :param curr: (nb, length)
        :param accum: (nb, k, length)
        '''
        alpha_t = (accum[:, :, :, None] +
                   recurrence_relation[:, None, :, :]).reshape(
                       (nb, k * length, length))
        accum_t = alpha_t.sort(axis=1)[:, :k, :] + curr[:, None, :]
        argmin_k_t = alpha_t.argsort(axis=1)[:, :k, :]  # from 0 to k*length
        return accum_t, argmin_k_t

    # dynamic programming
    # (T-1, nb, k, length), (T-1, nb, k, length)
    [log_probability, argmin_pos_k], _ = theano.scan(step_func_1,
                                                     sequences=[pred[1:]],
                                                     outputs_info=[alpha, None])

    # The scan runs over pred[1:], so the last valid time step pred_len-1 is
    # at output index pred_len-2.
    labels_1 = log_probability[(pred_len - 2)[:, None],
                               T.arange(nb)[:, None],
                               T.arange(k)[None, :],
                               (2 * token_len - 1)[:, None]]  # (nb, k)
    labels_2 = log_probability[(pred_len - 2)[:, None],
                               T.arange(nb)[:, None],
                               T.arange(k)[None, :],
                               (2 * token_len)[:, None]]  # (nb, k)
    concat_labels = T.concatenate([labels_1, labels_2], axis=-1)
    argmin_labels = (2 * token_len - 1)[:, None] + concat_labels.argsort(
        axis=-1)[:, :k].astype('int32') / k  # (nb, k) from 0 to 2k
    cost = concat_labels.sort(axis=-1)[:, :k]

    min_path = T.ones(
        (t - 1, nb, k), dtype=intX) * -1  # (T-1, nb, k) -1 for null
    min_path = T.set_subtensor(min_path[(pred_len - 2)[:, None],
                                        T.arange(nb)[:, None],
                                        T.arange(k)[None, :]],
                               argmin_labels +
                               T.arange(k)[None, :] * length)  # set (nb, k)

    def step_func_2(m_path, argm_pos, m_full_path):
        '''
        Backward step: look up the stored predecessor of the current position.

        :param m_path: (nb, k) min path (from 0 to k*length)
        :param argm_pos: (nb, k, length) argmin_pos_k
        :param m_full_path: (nb, k) min full path (from 0 to k*length)
        '''
        path_here = T.maximum(m_path, m_full_path).astype('int32')  # (nb, k)
        m_full_return = argm_pos.reshape(
            (nb, k * length))[T.arange(nb)[:, None],
                              path_here].astype('int32')  # (nb, k)
        return m_full_return

    # (T-1, nb, k)
    min_full_path, _ = theano.scan(
        step_func_2,
        sequences=[min_path[::-1], argmin_pos_k[::-1]],
        outputs_info=[min_path[-1]])
    # (T, nb, k)
    argmin_pos = T.concatenate((min_full_path[::-1], min_path[-1][None, :, :]),
                               axis=0)
    # (T, nb, k)
    argmin_pos = T.set_subtensor(
        argmin_pos[(pred_len - 1)[:, None],
                   T.arange(nb)[:, None],
                   T.arange(k)[None, :]],
        argmin_labels + T.arange(k)[None, :] * length)
    # (nb, k*length) -> (T, nb, k)
    argmin_token = T.tile(token_with_blank[:, None, :], (1, k, 1)).reshape(
        (nb, k * length))[T.arange(nb)[None, :, None], argmin_pos]

    # Paths whose cost stayed above EPS - 1 are reported as all -1; the
    # `x * m + m - 1` form keeps x where m is 1 and yields -1 where it is 0.
    mask_k = T.le(cost, EPS - 1)
    argmin_token = (argmin_token.transpose(
        (1, 0, 2)) * mask[:, :, None] + mask[:, :, None] - 1) * mask_k[:, None,
                                                                      :] + mask_k[:, None, :] - 1
    # (nb, k), (nb, T, k)
    return cost, argmin_token.astype(
        'int32')  #, log_probability, argmin_pos_k, min_full_path
def make_hierarchical_model(rts,
                            gaze,
                            values,
                            error_lls,
                            subject_idx,
                            v_val=None,
                            gamma_val=None,
                            s_val=None,
                            tau_val=None,
                            t0_val=None,
                            zerotol=1e-6,
                            error_weight=0.05,
                            boundary=1.,
                            gamma_bounds=(-1, 1),
                            drift='multiplicative',
                            design=dict(v=dict(),
                                        gamma=dict(),
                                        s=dict(),
                                        tau=dict(),
                                        t0=dict())):
    """Build the hierarchical PyMC3 model.

    Args:
        rts, gaze, values, error_lls: observed data passed to the custom
            likelihood.
        subject_idx: subject index per trial; used to count subjects and to
            index ``error_lls`` inside the likelihood.
        v_val, gamma_val, s_val, tau_val, t0_val: if not None, the
            corresponding parameter is fixed to this value instead of being
            given a prior.
        zerotol: small positive constant used as lower bound and to keep the
            log-likelihood finite.
        error_weight: mixture weight of the error likelihood.
        boundary: decision boundary.
        gamma_bounds: (lower, upper) bounds for gamma.
        drift: ``'multiplicative'`` or ``'additive'``.
        design: per-parameter design dicts; ``'condition_index'`` and
            ``'subject_index'`` are read for v, gamma, s, tau and t0. The
            default is never mutated here.

    Returns:
        The ``pm.Model`` instance.

    Raises:
        ValueError: if ``drift`` is not a recognized drift function.
    """
    if drift == 'multiplicative':
        is_multiplicative = True
    elif drift == 'additive':
        is_multiplicative = False
    else:
        # The error used to be constructed but never raised, which let an
        # invalid drift through as `is_multiplicative = None`.
        raise ValueError('Drift function "{}" not recognized.'.format(drift))

    n_subjects = np.unique(subject_idx).size

    with pm.Model() as glam_hierarchical:

        # Mechanics
        b = pm.Deterministic('b', tt.constant(boundary, dtype='float32'))
        p_error = pm.Deterministic('p_error',
                                   tt.constant(error_weight, dtype='float32'))

        # Parameter priors
        v = generate_hierarchical_model_parameters(parameter='v',
                                                   n_subjects=n_subjects,
                                                   design=design['v'],
                                                   mu_lower=zerotol,
                                                   mu_upper=0.0005,
                                                   sd_lower=zerotol,
                                                   sd_upper=0.0005,
                                                   bound_lower=0,
                                                   bound_upper=0.0005,
                                                   val=v_val,
                                                   testval=0.0001)

        gamma = generate_hierarchical_model_parameters(
            parameter='gamma',
            n_subjects=n_subjects,
            design=design['gamma'],
            mu_lower=gamma_bounds[0],
            mu_upper=gamma_bounds[1],
            sd_lower=zerotol,
            sd_upper=gamma_bounds[1] - gamma_bounds[0],
            bound_lower=gamma_bounds[0],
            bound_upper=gamma_bounds[1],
            val=gamma_val,
            testval=.5)

        s = generate_hierarchical_model_parameters(parameter='s',
                                                   n_subjects=n_subjects,
                                                   design=design['s'],
                                                   mu_lower=zerotol,
                                                   mu_upper=0.02,
                                                   sd_lower=zerotol,
                                                   sd_upper=0.02,
                                                   bound_lower=zerotol,
                                                   bound_upper=0.02,
                                                   val=s_val,
                                                   testval=0.0075)

        tau = generate_hierarchical_model_parameters(parameter='tau',
                                                     n_subjects=n_subjects,
                                                     design=design['tau'],
                                                     mu_lower=0,
                                                     mu_upper=5,
                                                     sd_lower=zerotol,
                                                     sd_upper=5,
                                                     bound_lower=0,
                                                     bound_upper=5,
                                                     val=tau_val,
                                                     testval=.5)

        if t0_val is None:
            t0 = pm.Uniform('t0', 0, 500, testval=50, shape=(n_subjects, 1))
        else:
            t0 = pm.Deterministic('t0', tt.ones((n_subjects, 1)) * t0_val)

        def _select(param, subject_index, condition_index):
            """Pick each trial's parameter value as a column vector."""
            return param[tt.cast(subject_index, dtype='int32'),
                         tt.cast(condition_index, dtype='int32')][:, None]

        # Likelihood
        def lda_logp(rt, gaze, values, error_lls, s_condition_index,
                     s_subject_index, v_condition_index, v_subject_index,
                     tau_condition_index, tau_subject_index,
                     gamma_condition_index, gamma_subject_index,
                     t0_condition_index, t0_subject_index, is_multiplicative,
                     zerotol):

            v_sel = _select(v, v_subject_index, v_condition_index)
            tau_sel = _select(tau, tau_subject_index, tau_condition_index)
            gamma_sel = _select(gamma, gamma_subject_index,
                                gamma_condition_index)

            # compute drifts
            drift = ifelse(
                is_multiplicative,
                glam.components.tt_drift_multiplicative(
                    v_sel, tau_sel, gamma_sel, values, gaze, zerotol),
                glam.components.tt_drift_additive(
                    v_sel, tau_sel, gamma_sel, values, gaze, zerotol))
            glam_ll = glam.components.tt_wienerrace_pdf(
                rt[:, None], drift,
                _select(s, s_subject_index, s_condition_index), b,
                _select(t0, t0_subject_index, t0_condition_index), zerotol)

            # mix likelihoods
            # NOTE: indexes with the enclosing function's `subject_idx`.
            mixed_ll = ((1 - p_error) * glam_ll +
                        p_error * error_lls[subject_idx])

            # Replace non-finite values so the log below stays defined.
            mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
            mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)
            return tt.log(mixed_ll + zerotol)

        obs = pm.DensityDist(
            'obs',
            logp=lda_logp,
            observed=dict(
                rt=rts,
                gaze=gaze,
                values=values,
                error_lls=error_lls,
                s_condition_index=design['s']['condition_index'].astype(
                    np.int32),
                s_subject_index=design['s']['subject_index'].astype(np.int32),
                v_condition_index=design['v']['condition_index'].astype(
                    np.int32),
                v_subject_index=design['v']['subject_index'].astype(np.int32),
                tau_condition_index=design['tau']['condition_index'].astype(
                    np.int32),
                tau_subject_index=design['tau']['subject_index'].astype(
                    np.int32),
                gamma_condition_index=design['gamma']
                ['condition_index'].astype(np.int32),
                gamma_subject_index=design['gamma']['subject_index'].astype(
                    np.int32),
                t0_condition_index=design['t0']['condition_index'].astype(
                    np.int32),
                t0_subject_index=design['t0']['subject_index'].astype(
                    np.int32),
                is_multiplicative=is_multiplicative,
                zerotol=zerotol))

    return glam_hierarchical
def generate_hierarchical_model_parameters(parameter, n_subjects, design,
                                           mu_lower, mu_upper, sd_lower,
                                           sd_upper, bound_lower, bound_upper,
                                           val, testval):
    """Create one model parameter as a (subjects x conditions) tensor.

    Must be called inside a ``pm.Model`` context.

    Args:
        parameter: base name of the parameter.
        n_subjects: total number of subjects.
        design: dict with key ``'conditions'`` (None for a single condition)
            and, when conditions are given, ``'subject_index'``,
            ``'condition_index'`` and (only when ``val`` is None) ``'D'``.
        mu_lower, mu_upper, sd_lower, sd_upper: bounds of the Uniform
            hyper-priors on the group mean and standard deviation.
        bound_lower, bound_upper: bounds of the subject-level Normal.
        val: None to estimate the parameter; otherwise the fixed value (one
            per condition when conditions are given).
        testval: test value for the hyper-priors.

    Returns:
        A tensor with one column per condition (a single column when
        ``design['conditions']`` is None).

    Raises:
        ValueError: if ``val`` is given but its length differs from the
            number of conditions.
    """
    if design['conditions'] is None:
        if val is None:
            mu = pm.Uniform('{}_mu'.format(parameter),
                            mu_lower,
                            mu_upper,
                            testval=testval)
            sd = pm.Uniform('{}_sd'.format(parameter),
                            sd_lower,
                            sd_upper,
                            testval=testval)
            bounded = pm.Bound(pm.Normal, bound_lower, bound_upper)
            return bounded(parameter, mu=mu, sd=sd, shape=(n_subjects, 1))
        return pm.Deterministic(parameter, tt.ones((n_subjects, 1)) * val)

    conditions = design['conditions']

    def _n_subjects_in(c):
        """Count the distinct subjects observed in condition index c."""
        return np.unique(
            design['subject_index'][design['condition_index'] == c]).size

    if val is None:
        mu = tt.stack([
            pm.Uniform('{}_{}_mu'.format(parameter, condition),
                       mu_lower,
                       mu_upper,
                       testval=testval)
            for condition in conditions
        ])
        sd = tt.stack([
            pm.Uniform('{}_{}_sd'.format(parameter, condition),
                       sd_lower,
                       sd_upper,
                       testval=testval)
            for condition in conditions
        ])
        bounded = pm.Bound(pm.Normal, bound_lower, bound_upper)
        parms = []
        for c, condition in enumerate(conditions):
            parms_tmp = bounded('{}_{}'.format(parameter, condition),
                                mu=mu[c],
                                sd=sd[c],
                                shape=(_n_subjects_in(c)))
            # Prepend a zero so that index 0 in design['D'] maps to 0.
            # presumably D[:, c] is 0 for subjects absent from the condition
            # and a 1-based position otherwise — TODO confirm.
            parms_tmp = tt.concatenate([tt.zeros(1), parms_tmp])
            parms.append(parms_tmp[design['D'][:, c]][:, None])
        return tt.concatenate(parms, axis=1)

    if len(val) != len(conditions):
        raise ValueError(
            'Number of values in {}_val does not match the number of specified {}-conditions.'
            .format(parameter, parameter))

    parms = []
    for c, condition in enumerate(conditions):
        n_in_condition = _n_subjects_in(c)
        # The shape must be passed as one tuple; `tt.ones(n, 1)` would hand
        # `1` to the dtype argument. A column vector is needed so it can be
        # padded along axis 0 and concatenated along axis 1 below.
        column = pm.Deterministic(
            '{}_{}'.format(parameter, condition),
            tt.ones((n_in_condition, 1)) * val[c])
        # make sure all elements in parms have same size
        if n_in_condition < n_subjects:
            column = tt.concatenate(
                [column, tt.zeros((n_subjects - n_in_condition, 1))],
                axis=0)
        parms.append(column)
    return tt.concatenate(parms, axis=1)
def __init__(self,
             collapse='mean',
             maxout=False,
             transpose=False,
             **kwargs):
    """Collapse the first axis of the 4-D source output into a sequence.

    NOTE(review): reconstructed from a whitespace-collapsed source in which
    the ``T.lt(Y.shape`` / ``T.gt(Y.shape`` conditions of the ``pad_`` branch
    had been stripped; confirm against the upstream file.

    Args:
        collapse: how to remove axis 0 of the source output: ``'sum'``
            (or ``True``), ``'mean'``, ``'conv'``, ``'flatten'``,
            ``'pad_<n>'`` (pad or crop axis 0 to length n and fold it into
            the feature axis), or ``False`` to leave it untouched. Any other
            value fails an assertion.
        maxout: if true, take the maximum over the feature axis first.
        transpose: if true, swap the first two axes of the source output.
        **kwargs: forwarded to the base layer constructor.
    """
    super(TwoDToOneDLayer, self).__init__(1, **kwargs)
    self.set_attr('collapse', collapse)
    self.set_attr('transpose', transpose)
    Y = self.sources[0].output
    if transpose:
        Y = Y.dimshuffle(1, 0, 2, 3)

    # index handling: mark the first `size` entries of each column with 1.
    def index_fn(index, size):
        return T.set_subtensor(index[:size], numpy.cast['int8'](1))

    index_init = T.zeros((Y.shape[2], Y.shape[1]), dtype='int8')
    self.index, _ = theano.scan(
        index_fn,
        [index_init,
         T.cast(self.sources[0].output_sizes[:, 1], "int32")])
    self.index = self.index.dimshuffle(1, 0)
    n_out = self.sources[0].attrs['n_out']

    if maxout:
        Y = Y.max(axis=3).dimshuffle(0, 1, 2, 'x')

    if collapse == 'sum' or collapse == True:
        Y = Y.sum(axis=0)
    elif collapse == 'mean':
        Y = Y.mean(axis=0)
    elif collapse == 'conv':
        from TheanoUtil import circular_convolution
        Y, _ = theano.scan(lambda x_i, x_p: circular_convolution(x_i, x_p),
                           Y, Y[0])
        Y = Y[-1]
    elif collapse == 'flatten':
        self.index = T.ones((Y.shape[0] * Y.shape[1], Y.shape[2]),
                            dtype='int8')
        Y = Y.reshape((Y.shape[0] * Y.shape[1], Y.shape[2], Y.shape[3]))
    elif str(collapse).startswith('pad_'):
        pad = numpy.int32(collapse.split('_')[-1])
        # Bring axis 0 to exactly `pad` entries: zero-pad when shorter,
        # crop when longer, keep unchanged when equal.
        Y = ifelse(
            T.lt(Y.shape[0], pad),
            T.concatenate([
                Y,
                T.zeros(
                    (pad - Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3]),
                    'float32')
            ], axis=0),
            ifelse(T.gt(Y.shape[0], pad), Y[:pad], Y))
        Y = Y.dimshuffle(1, 2, 3, 0).reshape(
            (Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[0]))
        n_out *= pad
    elif collapse != False:
        assert False, "invalid collapse mode"

    if self.attrs['batch_norm']:
        Y = self.batch_norm(Y, n_out, force_sample=False)
    self.output = Y
    self.act = [Y, Y]
    self.set_attr('n_out', n_out)
def linear_model(X, y):
    """Register a linear model with latent inputs on the current model.

    Creates Normal variables ``'X'`` (moments taken column-wise from the
    data), ``'coefs'`` (standard normal, one per column) and ``'y'`` with
    mean ``dot(X, coefs)`` and unit standard deviation.

    NOTE(review): the mean of ``'y'`` was stripped from the collapsed source;
    ``mu=tt.dot(X, coefs)`` is the reconstruction. ``y`` is not used by the
    visible code — confirm against upstream whether it should be observed.
    """
    shape = X.shape
    X = pm.Normal('X',
                  mu=np.mean(X, axis=0),
                  sd=np.std(X, axis=0),
                  shape=shape)
    coefs = pm.Normal('coefs',
                      mu=tt.zeros(shape[1]),
                      sd=tt.ones(shape[1]),
                      shape=shape[1])
    pm.Normal('y',
              mu=tt.dot(X, coefs),
              sd=tt.ones(shape[0]),
              shape=shape[0])
def partial_linear_model(X):
    """Register only the ``'X'`` and ``'coefs'`` variables of the linear model.

    ``'X'`` is Normal with column-wise mean and standard deviation of the
    data; ``'coefs'`` is standard normal with one entry per column.
    """
    dims = X.shape
    column_mean = np.mean(X, axis=0)
    column_std = np.std(X, axis=0)
    pm.Normal('X', mu=column_mean, sd=column_std, shape=dims)

    n_coefs = dims[1]
    pm.Normal('coefs',
              mu=tt.zeros(n_coefs),
              sd=tt.ones(n_coefs),
              shape=n_coefs)
def __init__(
    self,
    cell_state_mat: np.ndarray,
    X_data: np.ndarray,
    n_comb: int = 50,
    data_type: str = "float32",
    n_iter=20000,
    learning_rate=0.005,
    total_grad_norm_constraint=200,
    verbose=True,
    var_names=None,
    var_names_read=None,
    obs_names=None,
    fact_names=None,
    sample_id=None,
    gene_level_prior={"mean": 1 / 2, "sd": 1 / 4},
    gene_level_var_prior={"mean_var_ratio": 1.0},
    cell_number_prior={"cells_per_spot": 8.0, "factors_per_spot": 7.0, "combs_per_spot": 2.5},
    cell_number_var_prior={"cells_mean_var_ratio": 1.0, "factors_mean_var_ratio": 1.0, "combs_mean_var_ratio": 1.0},
    phi_hyp_prior={"mean": 3.0, "sd": 1.0},
    spot_fact_mean_var_ratio=5.0,
    exper_gene_level_mean_var_ratio=10,
):
    """Store the priors and build the PyMC3 model for several experiments.

    The first positional/keyword arguments up to ``sample_id`` are forwarded
    unchanged to the parent constructor.

    :param n_comb: number of factor combinations (columns of ``combs_factors``).
    :param sample_id: per-observation experiment label; one-hot encoded with
        ``pd.get_dummies`` to build the observation-to-experiment matrix.
    :param gene_level_prior: ``mean`` / ``sd`` of the gene-level scaling.
    :param gene_level_var_prior: extra keys merged into ``gene_level_prior``
        (``mean_var_ratio``).
    :param cell_number_prior: expected ``cells_per_spot``,
        ``factors_per_spot`` and ``combs_per_spot``.
    :param cell_number_var_prior: extra keys merged into ``cell_number_prior``.
    :param phi_hyp_prior: ``mean`` / ``sd`` of the overdispersion hyper-prior.
    :param spot_fact_mean_var_ratio: divisor applied inside the ``sigma`` of
        ``spot_factors``.
    :param exper_gene_level_mean_var_ratio: stored on the instance; not used
        by this constructor.
    """
    ############# Initialise parameters ################
    super().__init__(
        cell_state_mat,
        X_data,
        data_type,
        n_iter,
        learning_rate,
        total_grad_norm_constraint,
        verbose,
        var_names,
        var_names_read,
        obs_names,
        fact_names,
        sample_id,
    )

    # Merge into fresh dicts: the prior arguments default to shared dict
    # literals, so updating them in place would leak state between instances
    # (and would silently modify dictionaries owned by the caller).
    gene_level_prior = {**gene_level_prior, **gene_level_var_prior}

    self.gene_level_prior = gene_level_prior
    self.phi_hyp_prior = phi_hyp_prior
    self.n_comb = n_comb
    self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio
    self.exper_gene_level_mean_var_ratio = exper_gene_level_mean_var_ratio

    # generate parameters for samples
    self.spot2sample_df = pd.get_dummies(sample_id)
    # convert to np.ndarray
    self.spot2sample_mat = self.spot2sample_df.values
    self.n_exper = self.spot2sample_mat.shape[1]
    # assign extra data to dictionary with (1) shared parameters (2) input data
    self.extra_data_tt = {"spot2sample": theano.shared(self.spot2sample_mat.astype(self.data_type))}
    self.extra_data = {"spot2sample": self.spot2sample_mat.astype(self.data_type)}

    cell_number_prior = dict(cell_number_prior)
    cell_number_prior["factors_per_combs"] = (
        cell_number_prior["factors_per_spot"] / cell_number_prior["combs_per_spot"]
    )
    cell_number_prior.update(cell_number_var_prior)
    self.cell_number_prior = cell_number_prior

    ############# Define the model ################
    self.model = pm.Model()

    with self.model:

        # =====================Gene expression level scaling======================= #
        # Explains difference in expression between genes and
        # how it differs in single cell and spatial technology
        # compute hyperparameters from mean and sd
        shape = gene_level_prior["mean"] ** 2 / gene_level_prior["sd"] ** 2
        rate = gene_level_prior["mean"] / gene_level_prior["sd"] ** 2
        shape_var = shape / gene_level_prior["mean_var_ratio"]
        rate_var = rate / gene_level_prior["mean_var_ratio"]
        self.gene_level_alpha_hyp = pm.Gamma(
            "gene_level_alpha_hyp", mu=shape, sigma=np.sqrt(shape_var), shape=(1, 1)
        )
        self.gene_level_beta_hyp = pm.Gamma("gene_level_beta_hyp", mu=rate, sigma=np.sqrt(rate_var), shape=(1, 1))

        # global gene levels
        self.gene_level = pm.Gamma(
            "gene_level", self.gene_level_alpha_hyp, self.gene_level_beta_hyp, shape=(self.n_var, 1)
        )

        # Expose the cell state signatures as a tracked variable; the scaling
        # by gene_level is applied where the expected expression is computed.
        self.gene_factors = pm.Deterministic("gene_factors", self.cell_state)

        # =====================Spot factors======================= #
        # prior on spot factors reflects the number of cells, fraction of their cytoplasm captured,
        # times heterogeneity in the total number of mRNA between individual cells with each cell type
        self.cells_per_spot = pm.Gamma(
            "cells_per_spot",
            mu=cell_number_prior["cells_per_spot"],
            sigma=np.sqrt(cell_number_prior["cells_per_spot"] / cell_number_prior["cells_mean_var_ratio"]),
            shape=(self.n_obs, 1),
        )
        self.comb_per_spot = pm.Gamma(
            "combs_per_spot",
            mu=cell_number_prior["combs_per_spot"],
            sigma=np.sqrt(cell_number_prior["combs_per_spot"] / cell_number_prior["combs_mean_var_ratio"]),
            shape=(self.n_obs, 1),
        )

        shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
        rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
        self.combs_factors = pm.Gamma("combs_factors", alpha=shape, beta=rate, shape=(self.n_obs, self.n_comb))

        self.factors_per_combs = pm.Gamma(
            "factors_per_combs",
            mu=cell_number_prior["factors_per_combs"],
            sigma=np.sqrt(cell_number_prior["factors_per_combs"] / cell_number_prior["factors_mean_var_ratio"]),
            shape=(self.n_comb, 1),
        )
        c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape((1, 1))
        self.comb2fact = pm.Gamma(
            "comb2fact", alpha=c2f_shape, beta=self.factors_per_combs, shape=(self.n_comb, self.n_fact)
        )

        # Mean of the spot factors: combination weights mapped onto factors.
        spot_factors_mu = pm.math.dot(self.combs_factors, self.comb2fact)
        self.spot_factors = pm.Gamma(
            "spot_factors",
            mu=spot_factors_mu,
            sigma=pm.math.sqrt(spot_factors_mu / self.spot_fact_mean_var_ratio),
            shape=(self.n_obs, self.n_fact),
        )

        # =====================Spot-specific additive component======================= #
        # molecule contribution that cannot be explained by cell state signatures
        # these counts are distributed between all genes not just expressed genes
        self.spot_add_hyp = pm.Gamma("spot_add_hyp", 1, 1, shape=2)
        self.spot_add = pm.Gamma("spot_add", self.spot_add_hyp[0], self.spot_add_hyp[1], shape=(self.n_obs, 1))

        # =====================Gene-specific additive component ======================= #
        # per gene molecule contribution that cannot be explained by cell state signatures
        # these counts are distributed equally between all spots (e.g. background, free-floating RNA)
        self.gene_add_hyp = pm.Gamma("gene_add_hyp", 1, 1, shape=2)
        self.gene_add = pm.Gamma(
            "gene_add", self.gene_add_hyp[0], self.gene_add_hyp[1], shape=(self.n_exper, self.n_var)
        )

        # =====================Gene-specific overdispersion ======================= #
        self.phi_hyp = pm.Gamma("phi_hyp", mu=phi_hyp_prior["mean"], sigma=phi_hyp_prior["sd"], shape=(1, 1))
        self.gene_E = pm.Exponential("gene_E", self.phi_hyp, shape=(self.n_exper, self.n_var))

        # =====================Expected expression ======================= #
        # expected expression: factor contribution scaled per gene, plus the
        # experiment-specific gene background and the per-spot background
        self.mu_biol = (
            pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T
            + pm.math.dot(self.extra_data_tt["spot2sample"], self.gene_add)
            + self.spot_add
        )

        # =====================DATA likelihood ======================= #
        # Likelihood (sampling distribution) of observations & add overdispersion via NegativeBinomial / Poisson
        self.data_target = pm.NegativeBinomial(
            "data_target",
            mu=self.mu_biol,
            # per-experiment overdispersion broadcast to observations
            alpha=pm.math.dot(self.extra_data_tt["spot2sample"], 1 / tt.pow(self.gene_E, 2)),
            observed=self.x_data,
            total_size=self.X_data.shape,
        )

        # =====================Compute nUMI from each factor in spots  ======================= #
        self.nUMI_factors = pm.Deterministic(
            "nUMI_factors", (self.spot_factors * (self.gene_factors * self.gene_level).sum(0))
        )
def SIR_with_change_points(S_begin_beta,
                           I_begin_beta,
                           new_cases_obs,
                           change_points_list,
                           date_begin_simulation,
                           num_days_sim,
                           diff_data_sim,
                           N,
                           priors_dict=None,
                           weekends_modulated=False):
    """
    Build a PyMC3 SIR model whose spreading rate changes at given change points.

    Parameters
    ----------
    S_begin_beta : number
        Prior mean of the initial number of susceptible people
        (the prior is Normal with sigma = S_begin_beta / 10).
    I_begin_beta : number
        Prior mean of the initial number of infected people
        (the prior is Normal with sigma = I_begin_beta / 10).
    new_cases_obs : array
        Timeseries (day over day) of newly reported cases (not the total
        number). Must support ``len()`` and expose ``.shape``.
    change_points_list : list of dicts
        List of dictionaries, each corresponding to one change point.
        Each dict can have the following key-value pairs. If a pair is not
        provided, the respective default is used. The dictionaries are
        completed with the defaults in place.
            * pr_mean_date_begin_transient : datetime.datetime, NO default
              (required)
            * pr_median_lambda : number, same as pr_median_lambda_0 default
            * pr_sigma_lambda : number, same as pr_sigma_lambda_0 default
            * pr_sigma_date_begin_transient : number, 3
            * pr_median_transient_len : number, 3
            * pr_sigma_transient_len : number, 0.3
    date_begin_simulation : datetime.datetime
        The begin of the simulation data
    num_days_sim : integer
        Number of simulated days; must be at least
        ``len(new_cases_obs) + diff_data_sim``.
    diff_data_sim : integer
        Number of days that the simulation-begin predates the first data
        point in `new_cases_obs`.
    N : number
        The population size.
    priors_dict : dict
        Dictionary of the prior assumptions; completed with the defaults
        in place. Possible key-value pairs (and default values) are:
            * pr_beta_I_begin : number, default = 10000
            * pr_median_lambda_0 : number, default = 0.2
            * pr_sigma_lambda_0 : number, default = 0.5
            * pr_median_mu : number, default = 1/8
            * pr_sigma_mu : number, default = 0.2
            * pr_median_delay : number, default = 1
            * pr_sigma_delay : number, default = 0.2
            * pr_beta_sigma_obs : number, default = 5
            * week_end_days : tuple, default = (6,7)
            * pr_mean_weekend_factor : number, default = 0.7
            * pr_sigma_weekend_factor : number, default = 0.17
        The three weekend keys are only accepted when `weekends_modulated`
        is true. The delay and weekend priors are validated but not used
        by the model built here.
    weekends_modulated : bool
        Only controls whether the weekend priors are accepted in
        `priors_dict`.

    Returns
    -------
    : pymc3.Model
        Returns an instance of pymc3 model with the change points

    Raises
    ------
    RuntimeError
        On an unknown prior name, a change point without
        ``pr_mean_date_begin_transient``, change points that are not in
        temporal order, or a simulation shorter than the data.
    """
    if priors_dict is None:
        priors_dict = dict()

    default_priors = dict(pr_beta_I_begin=10000.0,
                          pr_median_lambda_0=0.2,
                          pr_sigma_lambda_0=0.5,
                          pr_median_mu=1 / 8,
                          pr_sigma_mu=0.2,
                          pr_median_delay=1.0,
                          pr_sigma_delay=0.2,
                          pr_beta_sigma_obs=5.0,
                          week_end_days=(6, 7),
                          pr_mean_weekend_factor=0.7,
                          pr_sigma_weekend_factor=0.17)
    default_priors_change_points = dict(
        pr_median_lambda=default_priors["pr_median_lambda_0"],
        pr_sigma_lambda=default_priors["pr_sigma_lambda_0"],
        pr_sigma_date_begin_transient=3.0,
        pr_median_transient_len=3.0,
        pr_sigma_transient_len=0.3,
        pr_mean_date_begin_transient=None,
    )

    if not weekends_modulated:
        del default_priors['week_end_days']
        del default_priors['pr_mean_weekend_factor']
        del default_priors['pr_sigma_weekend_factor']

    # Reject unknown names first, then fill in whatever was left out.
    for prior_name in priors_dict:
        if prior_name not in default_priors:
            raise RuntimeError(f"Prior with name {prior_name} not known")
    for change_point in change_points_list:
        for prior_name in change_point:
            if prior_name not in default_priors_change_points:
                raise RuntimeError(f"Prior with name {prior_name} not known")

    for prior_name, value in default_priors.items():
        priors_dict.setdefault(prior_name, value)
    for change_point in change_points_list:
        for prior_name, value in default_priors_change_points.items():
            change_point.setdefault(prior_name, value)

    if num_days_sim < len(new_cases_obs) + diff_data_sim:
        raise RuntimeError(
            "Simulation ends before the end of the data. Increase num_days_sim."
        )

    # Validate the change point dates before any random variable is created:
    # the date has no default, and comparing/subtracting None would otherwise
    # fail with an opaque TypeError half-way through the model definition.
    dt_before = date_begin_simulation
    for i_cp, change_point in enumerate(change_points_list):
        dt_begin_transient = change_point["pr_mean_date_begin_transient"]
        if dt_begin_transient is None:
            raise RuntimeError(
                f"Change point {i_cp} has no pr_mean_date_begin_transient")
        if dt_before is not None and dt_before > dt_begin_transient:
            raise RuntimeError(
                "Dates of change points are not temporally ordered")
        dt_before = dt_begin_transient

    # ------------------------------------------------------------------------------ #
    # Model and prior implementation
    # ------------------------------------------------------------------------------ #

    with pm.Model() as model:
        # all pm functions now apply on the model instance
        # true cases at begin of loaded data but we do not know the real number
        I_begin = pm.Normal(name="I_begin", mu=I_begin_beta, sigma=I_begin_beta / 10)
        S_begin = pm.Normal(name="S_begin", mu=S_begin_beta, sigma=S_begin_beta / 10)

        # fraction of people that are newly infected each day
        lambda_list = [
            pm.Lognormal(
                name="lambda_0",
                mu=np.log(priors_dict["pr_median_lambda_0"]),
                sigma=priors_dict["pr_sigma_lambda_0"],
            )
        ]
        for i, cp in enumerate(change_points_list):
            lambda_list.append(
                pm.Lognormal(
                    name=f"lambda_{i + 1}",
                    mu=np.log(cp["pr_median_lambda"]),
                    sigma=cp["pr_sigma_lambda"],
                ))

        # list of start dates of the transient periods of the change points
        tr_begin_list = []
        for i, cp in enumerate(change_points_list):
            # convert the provided date into days since the simulation begin
            prior_mean = (cp["pr_mean_date_begin_transient"] -
                          date_begin_simulation).days
            tr_begin_list.append(
                pm.Normal(
                    name=f"transient_begin_{i}",
                    mu=prior_mean,
                    sigma=cp["pr_sigma_date_begin_transient"],
                ))

        # same for transient times
        tr_len_list = []
        for i, cp in enumerate(change_points_list):
            tr_len_list.append(
                pm.Lognormal(
                    name=f"transient_len_{i}",
                    mu=np.log(cp["pr_median_transient_len"]),
                    sigma=cp["pr_sigma_transient_len"],
                ))

        # build the time-dependent spreading rate: a constant lambda_0 plus
        # one smooth step of height (lambda_after - lambda_before) per
        # change point
        lambda_t_list = [lambda_list[0] * tt.ones(num_days_sim)]
        lambda_before = lambda_list[0]

        for tr_begin, tr_len, lambda_after in zip(tr_begin_list, tr_len_list,
                                                  lambda_list[1:]):
            lambda_step = model_helper.smooth_step_function(
                start_val=0,
                end_val=1,
                t_begin=tr_begin,
                t_end=tr_begin + tr_len,
                t_total=num_days_sim,
            ) * (lambda_after - lambda_before)
            lambda_before = lambda_after
            lambda_t_list.append(lambda_step)
        lambda_t = sum(lambda_t_list)

        # fraction of people that recover each day, recovery rate mu
        mu = pm.Lognormal(
            name="mu",
            mu=np.log(priors_dict["pr_median_mu"]),
            sigma=priors_dict["pr_sigma_mu"],
        )

        # prior of the error of observed cases
        sigma_obs = pm.HalfCauchy("sigma_obs",
                                  beta=priors_dict["pr_beta_sigma_obs"])

        # -------------------------------------------------------------------------- #
        # training the model with loaded data provided as argument
        # -------------------------------------------------------------------------- #

        _, _, new_I = _SIR_model(lambda_t=lambda_t,
                                 mu=mu,
                                 S_begin=S_begin,
                                 I_begin=I_begin,
                                 N=N)

        # No reporting delay is applied: the simulated new infections are
        # compared to the observations directly.
        new_cases_inferred = new_I

        # likelihood of the model:
        # observed cases are distributed following studentT around the model.
        # we want to approximate a Poisson distribution of new cases.
        # we choose nu=4 to get heavy tails and robustness to outliers.
        num_days_data = new_cases_obs.shape[-1]
        pm.StudentT(
            name="_new_cases_studentT",
            nu=4,
            mu=new_cases_inferred[:num_days_data],
            sigma=tt.abs_(new_cases_inferred[:num_days_data] + 1)**0.5 *
            sigma_obs,  # +1 and tt.abs to avoid nans
            observed=new_cases_obs,
        )

        # add these observables to the model so we can extract a time series of them
        # later via e.g. `model.trace['lambda_t']`
        pm.Deterministic("lambda_t", lambda_t)
        pm.Deterministic("new_cases", new_cases_inferred)

    return model
max_steps = max_steps_var else: max_steps = 2 l_ans_softmax = AnsPointerLayer(mlstm, num_units=k, max_steps=max_steps, mask_input=l_passage_mask) if load_previous: print('loading previous saved model ...') # And load them again later on like this: with np.load(save_filename) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(l_ans_softmax, param_values) if not sequential: ans_mask = T.ones((1, 2)) ans_length = T.constant(2) else: ans_mask = ans_mask_var ans_length = ans_length_var # lasagne.layers.get_output produces a variable for the output of the net # prediction's shape is (n_batch, max_steps, passage_seq_len) prediction = lasagne.layers.get_output(l_ans_softmax, deterministic=False) loss, _ = categorical_crossentropy(prediction, target_var, ans_mask, ans_length) cost = loss.mean() if l2_weight > 0.: # apply l2 regularization print('apply l2 penalty to all layers, weight: {}'.format(l2_weight)) l2_penalty = lasagne.regularization.regularize_network_params(
def conv_cond_concat(x, y):
    """
    Append the conditioning tensor to ``x`` along axis 1.

    ``y`` is multiplied by a tensor of ones whose axes 0, 2 and 3 are taken
    from ``x`` and whose axis 1 is taken from ``y``, and the product is
    concatenated to ``x`` on axis 1.
    """
    x_dims = x.shape
    ones_like_y = T.ones((x_dims[0], y.shape[1], x_dims[2], x_dims[3]))
    tiled_condition = y * ones_like_y
    return T.concatenate([x, tiled_condition], axis=1)
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim,
          word_lstm_dim, word_bidirect, lr_method, pos_dim, dep_dim, pre_emb,
          pre_emb_dep, crf, cap_dim, training=True, **kwargs):
    """
    Build the network.

    Each ``*_dim`` argument enables the matching input feature when it is
    non-zero (words, characters, capitalization, POS, dependency N/V).
    ``pre_emb`` / ``pre_emb_dep`` are paths to whitespace-separated text
    embedding files used to initialise the word / dependency lookup tables
    when ``training`` is true. ``lr_method`` is ``name`` or
    ``name-key_value-key_value...``; ``crf`` selects a CRF output instead
    of a softmax.

    Returns ``(f_train, f_eval, f_eval_softmax)``; ``f_train`` is ``None``
    when ``training`` is false.
    """

    def read_pretrained(path, dim):
        # Parse "<token> v1 ... v<dim>" lines; lines of any other length
        # are counted and reported.
        print('Loading pretrained embeddings from %s...' % path)
        pretrained = {}
        emb_invalid = 0
        with codecs.open(path, 'r', 'utf-8') as emb_file:
            for line in emb_file:
                fields = line.rstrip().split()
                if len(fields) == dim + 1:
                    pretrained[fields[0]] = np.array(
                        [float(x) for x in fields[1:]]).astype(np.float32)
                else:
                    emb_invalid += 1
        if emb_invalid > 0:
            print('WARNING: %i invalid lines' % emb_invalid)
        return pretrained

    def fill_from_pretrained(weights, id_to_item, n_items, pretrained):
        # Copy pretrained vectors into `weights`, trying the exact token,
        # then its lowercase form, then lowercase with digits replaced by 0.
        c_found = c_lower = c_zeros = 0
        for i in xrange(n_items):
            word = id_to_item[i]
            lowered = word.lower()
            if word in pretrained:
                weights[i] = pretrained[word]
                c_found += 1
            elif lowered in pretrained:
                weights[i] = pretrained[lowered]
                c_lower += 1
            else:
                zeroed = re.sub(r'\d', '0', lowered)
                if zeroed in pretrained:
                    weights[i] = pretrained[zeroed]
                    c_zeros += 1
        return c_found, c_lower, c_zeros

    def report_coverage(counts, n_items, kind):
        c_found, c_lower, c_zeros = counts
        total = c_found + c_lower + c_zeros
        print(('%i / %i (%.4f%%) words have been initialized with '
               'pretrained ' + kind + '.') % (total, n_items,
                                              100. * total / n_items))
        print(('%i found directly, %i after lowercasing, '
               '%i after lowercasing + zero.') % (c_found, c_lower, c_zeros))

    # Training parameters
    n_words = len(self.id_to_word)
    n_chars = len(self.id_to_char)
    n_tags = len(self.id_to_tag)

    # Number of capitalization features
    if cap_dim:
        n_cap = 4
    if pos_dim:
        n_POS = len(self.id_to_POS)
    if dep_dim:
        n_depN = len(self.id_to_N)
        n_depV = len(self.id_to_V)

    # Network variables
    is_train = T.iscalar('is_train')
    word_ids = T.ivector(name='word_ids')
    char_for_ids = T.imatrix(name='char_for_ids')
    char_rev_ids = T.imatrix(name='char_rev_ids')
    char_pos_ids = T.ivector(name='char_pos_ids')
    probs = T.ivector(name='probs')  # declared but not fed to any function
    tag_ids = T.ivector(name='tag_ids')
    if cap_dim:
        cap_ids = T.ivector(name='cap_ids')
    if pos_dim:
        pos_ids = T.ivector(name='pos_ids')
    if dep_dim:
        N_ids = T.ivector(name='N_ids')
        V_ids = T.ivector(name='V_ids')

    # Sentence length
    s_len = (word_ids if word_dim else char_pos_ids).shape[0]

    # Final input (all word features)
    input_dim = 0
    inputs = []

    #
    # Word inputs
    #
    if word_dim:
        input_dim += word_dim
        word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
        word_input = word_layer.link(word_ids)
        inputs.append(word_input)
        # Initialize with pretrained embeddings
        if pre_emb and training:
            new_weights = word_layer.embeddings.get_value()
            pretrained = read_pretrained(pre_emb, word_dim)
            counts = fill_from_pretrained(new_weights, self.id_to_word,
                                          n_words, pretrained)
            word_layer.embeddings.set_value(new_weights)
            print('Loaded %i pretrained embeddings.' % len(pretrained))
            report_coverage(counts, n_words, 'embeddings')

    #
    # Chars inputs
    #
    if char_dim:
        input_dim += char_lstm_dim
        char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

        char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                             name='char_lstm_for')
        char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                             name='char_lstm_rev')

        char_lstm_for.link(char_layer.link(char_for_ids))
        char_lstm_rev.link(char_layer.link(char_rev_ids))

        # Pick, for every word, the hidden state at its last character.
        char_for_output = char_lstm_for.h.dimshuffle(
            (1, 0, 2))[T.arange(s_len), char_pos_ids]
        char_rev_output = char_lstm_rev.h.dimshuffle(
            (1, 0, 2))[T.arange(s_len), char_pos_ids]

        inputs.append(char_for_output)
        if char_bidirect:
            inputs.append(char_rev_output)
            input_dim += char_lstm_dim

    #
    # Capitalization feature
    #
    if cap_dim:
        input_dim += cap_dim
        cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
        inputs.append(cap_layer.link(cap_ids))

    if pos_dim:
        input_dim += pos_dim
        pos_layer = EmbeddingLayer(n_POS, pos_dim, name='pos_layer')
        inputs.append(pos_layer.link(pos_ids))

    if dep_dim:
        input_dim += dep_dim * 2
        print('#########')
        print(n_depN)
        print(n_depV)
        dep_layer_N = EmbeddingLayer(n_depN, dep_dim, name='dep_layer_N')
        dep_layer_V = EmbeddingLayer(n_depV, dep_dim, name='dep_layer_V')
        dep_input_N = dep_layer_N.link(N_ids)
        dep_input_V = dep_layer_V.link(V_ids)
        inputs.append(dep_input_N)
        inputs.append(dep_input_V)

        # Initialize with pretrained embeddings
        if pre_emb_dep and training:
            new_weights_N = dep_layer_N.embeddings.get_value()
            new_weights_V = dep_layer_V.embeddings.get_value()
            pretrained = read_pretrained(pre_emb_dep, dep_dim)

            counts = fill_from_pretrained(new_weights_N, self.id_to_N,
                                          n_depN, pretrained)
            dep_layer_N.embeddings.set_value(new_weights_N)
            report_coverage(counts, n_depN, 'dep embeddings')

            counts = fill_from_pretrained(new_weights_V, self.id_to_V,
                                          n_depV, pretrained)
            dep_layer_V.embeddings.set_value(new_weights_V)
            print('Loaded %i pretrained dep embeddings.' % len(pretrained))
            report_coverage(counts, n_depV, 'dep embeddings')

    # Prepare final input: a single feature is used as is, several are
    # concatenated along the feature axis.
    if len(inputs) != 1:
        inputs = T.concatenate(inputs, axis=1)
    else:
        inputs = inputs[0]

    #
    # Dropout on final input
    #
    if dropout:
        dropout_layer = DropoutLayer(p=dropout)
        input_train = dropout_layer.link(inputs)
        input_test = (1 - dropout) * inputs
        inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

    # LSTM for words
    word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                         name='word_lstm_for')
    word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                         name='word_lstm_rev')
    word_lstm_for.link(inputs)
    word_lstm_rev.link(inputs[::-1, :])
    word_for_output = word_lstm_for.h
    word_rev_output = word_lstm_rev.h[::-1, :]
    if word_bidirect:
        print('BUUUUUUUUUUUGGGGGGGG')
        final_output = T.concatenate([word_for_output, word_rev_output],
                                     axis=1)
        tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                 name='tanh_layer', activation='tanh')
        final_output = tanh_layer.link(final_output)
    else:
        final_output = word_for_output

    # Sentence to Named Entity tags - Score
    final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                              activation=(None if crf else 'softmax'))
    tags_scores = final_layer.link(final_output)
    tags_scores_softmax = tags_scores

    # No CRF
    if not crf:
        cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
    # CRF
    else:
        transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

        # Two extra pseudo-tags (begin / end) get a large negative score
        # everywhere except on the padding rows added below.
        small = -1000
        b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        observations = T.concatenate(
            [tags_scores, small * T.ones((s_len, 2))], axis=1)
        observations = T.concatenate([b_s, observations, e_s], axis=0)

        # Score from tags
        real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

        # Score from transitions
        b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
        real_path_score += transitions[
            padded_tags_ids[T.arange(s_len + 1)],
            padded_tags_ids[T.arange(s_len + 1) + 1]].sum()

        all_paths_scores = forward(observations, transitions)
        cost = -(real_path_score - all_paths_scores)

    # Network parameters
    params = []
    if word_dim:
        self.add_component(word_layer)
        params.extend(word_layer.params)
    if char_dim:
        self.add_component(char_layer)
        self.add_component(char_lstm_for)
        params.extend(char_layer.params)
        params.extend(char_lstm_for.params)
        if char_bidirect:
            self.add_component(char_lstm_rev)
            params.extend(char_lstm_rev.params)
    self.add_component(word_lstm_for)
    params.extend(word_lstm_for.params)
    if word_bidirect:
        print('BUUUUUGGGGGGG component')
        self.add_component(word_lstm_rev)
        params.extend(word_lstm_rev.params)
    if cap_dim:
        self.add_component(cap_layer)
        params.extend(cap_layer.params)
    if pos_dim:
        self.add_component(pos_layer)
        params.extend(pos_layer.params)
    if dep_dim:
        self.add_component(dep_layer_N)
        self.add_component(dep_layer_V)
        params.extend(dep_layer_N.params)
        params.extend(dep_layer_V.params)
    self.add_component(final_layer)
    params.extend(final_layer.params)
    if crf:
        self.add_component(transitions)
        params.append(transitions)
    if word_bidirect:
        self.add_component(tanh_layer)
        params.extend(tanh_layer.params)

    # Prepare train and eval inputs
    eval_inputs = []
    if word_dim:
        eval_inputs.append(word_ids)
    if char_dim:
        eval_inputs.append(char_for_ids)
        if char_bidirect:
            eval_inputs.append(char_rev_ids)
        eval_inputs.append(char_pos_ids)
    if cap_dim:
        eval_inputs.append(cap_ids)
    if pos_dim:
        eval_inputs.append(pos_ids)
    if dep_dim:
        eval_inputs.append(N_ids)
        eval_inputs.append(V_ids)
    train_inputs = eval_inputs + [tag_ids]

    # Parse optimization method parameters
    if "-" in lr_method:
        lr_method_name = lr_method[:lr_method.find('-')]
        lr_method_parameters = {}
        for x in lr_method[lr_method.find('-') + 1:].split('-'):
            split = x.split('_')
            assert len(split) == 2
            lr_method_parameters[split[0]] = float(split[1])
    else:
        lr_method_name = lr_method
        lr_method_parameters = {}

    # Compile training function
    print('Compiling...')
    if training:
        updates = Optimization(clip=5.0).get_updates(
            lr_method_name, cost, params, **lr_method_parameters)
        f_train = theano.function(
            inputs=train_inputs,
            outputs=cost,
            updates=updates,
            givens=({is_train: np.cast['int32'](1)} if dropout else {}))
    else:
        f_train = None

    # Compile evaluation functions; is_train is pinned to 0 only when the
    # dropout switch is part of the graph.
    eval_givens = {is_train: np.cast['int32'](0)} if dropout else {}
    if not crf:
        f_eval = theano.function(inputs=eval_inputs,
                                 outputs=tags_scores,
                                 givens=eval_givens)
    else:
        f_eval = theano.function(inputs=eval_inputs,
                                 outputs=forward(
                                     observations,
                                     transitions,
                                     viterbi=True,
                                     return_alpha=False,
                                     return_best_sequence=True),
                                 givens=eval_givens)
    f_eval_softmax = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores_softmax,
                                     givens=eval_givens)

    return f_train, f_eval, f_eval_softmax
def get_output_for(self, inputs, **kwargs):
    """Compute the attention output sequence of the layer.

    :param inputs: list of incoming expressions. ``inputs[0]`` is the
        sequence input; the mask, initial hidden state, initial cell state
        and visual input are read from the positions stored in the
        ``*_incoming_index`` / ``visual_input_index`` attributes when those
        are positive.
    :return: the per-step ``It`` values with the first two axes swapped
        back (and the step axis reversed when ``self.backwards`` is set).
    """
    # Retrieve the layer input
    seq_input = inputs[0]
    # Retrieve the optional inputs when they are supplied
    mask = None
    hid_init = None
    cell_init = None
    visual_input = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]
    if self.cell_init_incoming_index > 0:
        cell_init = inputs[self.cell_init_incoming_index]
    if self.visual_input_index > 0:
        visual_input = inputs[self.visual_input_index]

    # Treat all dimensions after the second as flattened feature dimensions
    if seq_input.ndim > 3:
        seq_input = T.flatten(seq_input, 3)

    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    seq_input = seq_input.dimshuffle(1, 0, 2)
    num_batch = seq_input.shape[1]

    # Stack the five input weight matrices (in, forget, cell, out, g gates)
    # side by side so one dot product serves all gates.
    W_in_stacked = T.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate,
         self.W_in_to_ggate], axis=1
    )

    # Same for hidden weight matrices
    W_hid_stacked = T.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate,
         self.W_hid_to_ggate], axis=1
    )

    # Stack the five gate biases into one vector
    b_stacked = T.concatenate(
        [self.b_ingate, self.b_forgetgate,
         self.b_cell, self.b_outgate,
         self.b_ggate], axis=0)

    if self.precompute_input:
        # The input is given for all time steps, so the input-to-gate
        # products can be computed once, outside of the recurrence.
        seq_input = T.dot(seq_input, W_in_stacked) + b_stacked

    # Extract the n-th gate's slice from the stacked pre-activations
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Single recurrent computation step; input_n is the n'th input vector
    def step(
        input_n,
        cell_previous,
        hid_previous,
        visual,
        W_hid_stacked,
        W_in_stacked,
        b_stacked,
        W_cell_to_ingate,
        W_cell_to_forgetgate,
        W_cell_to_outgate,
        W_h_to_attenGate,
        W_g_to_attenGate,
        W_v_to_attenGate,
        W_s_to_attenGate,
        W_p
    ):
        if not self.precompute_input:
            input_n = T.dot(input_n, W_in_stacked) + b_stacked

        # Calculate gates pre-activations and slice
        gates = input_n + T.dot(hid_previous, W_hid_stacked)

        # Clip gradients
        if self.grad_clipping:
            gates = theano.gradient.grad_clip(
                gates, -self.grad_clipping, self.grad_clipping)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)
        ggate = slice_w(gates, 4)

        if self.peepholes:
            # Compute peephole connections
            ingate += cell_previous*W_cell_to_ingate
            forgetgate += cell_previous*W_cell_to_forgetgate

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)
        ggate = self.nonlinearity_ggate(ggate)

        # Compute new cell value
        cell = forgetgate*cell_previous + ingate*cell_input

        if self.peepholes:
            outgate += cell*W_cell_to_outgate
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new hidden unit activation and the g-gated cell readout
        hid = outgate*self.nonlinearity(cell)
        st = ggate*self.nonlinearity(cell)

        # Attention scores over the visual features. The final product with
        # W_h_to_attenGate is written as broadcast-multiply + sum instead of
        # a 3-D tensor dot vector, which avoided an optimization failure.
        # NOTE(review): the operands `visual` and `hid` of the two dot
        # products below were reconstructed by analogy with the `vt`
        # computation - confirm against the reference implementation.
        zt_dot_A = self.nonlinearity(
            T.dot(visual, W_v_to_attenGate) +
            T.dot(
                T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'),
                T.ones((1, self.video_len))
            )
        )
        zt = zt_dot_A*W_h_to_attenGate.dimshuffle('x', 'x', 0)
        zt = zt.sum(axis=2)

        # Attention score of the readout `st`, computed the same way
        vt_dot_A = self.nonlinearity(
            T.dot(st, W_s_to_attenGate) +
            T.dot(hid, W_g_to_attenGate)
        )
        vt = vt_dot_A*W_h_to_attenGate.dimshuffle('x', 0)
        vt = vt.sum(axis=1)
        vt = vt.dimshuffle(0, 'x')

        alpha_hat_t = self.nonlinearity_attenGate(T.concatenate(
            [zt, vt],
            axis=-1
        ))
        # `visual` is the visual input handed to the step by scan.
        feature = T.concatenate(
            [visual, st.dimshuffle(0, 'x', 1)],
            axis=1
        ).dimshuffle(2, 0, 1)
        c_hat_t = T.sum(alpha_hat_t*feature, axis=-1)
        It = T.dot(
            (c_hat_t.T+hid), W_p
        )
        return [cell, hid, It]

    def step_unmasked(input_n, cell_previous, hid_previous, It_previous,
                      *non_seq_args):
        # scan passes the previous value of every recurrent output,
        # including It, which the core step does not consume.
        return step(input_n, cell_previous, hid_previous, *non_seq_args)

    def step_masked(input_n, mask_n, cell_previous, hid_previous,
                    It_previous, *non_seq_args):
        cell, hid, It = step(input_n, cell_previous, hid_previous,
                             *non_seq_args)

        # Skip over any input with mask 0 by copying the previous
        # state; proceed normally for any input with mask 1.
        cell = T.switch(mask_n, cell, cell_previous)
        hid = T.switch(mask_n, hid, hid_previous)
        It = T.switch(mask_n, It, It_previous)
        return [cell, hid, It]

    if mask is not None:
        # mask is given as (batch_size, seq_len). Because scan iterates
        # over first dimension, we dimshuffle to (seq_len, batch_size) and
        # add a broadcastable dimension
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [seq_input, mask]
        step_fun = step_masked
    else:
        sequences = seq_input
        step_fun = step_unmasked

    ones = T.ones((num_batch, 1))
    if not isinstance(self.cell_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        cell_init = T.dot(ones, self.cell_init)

    if not isinstance(self.hid_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        hid_init = T.dot(ones, self.hid_init)

    # Repeat the initial It value for every batch row in the same way
    It_init = T.dot(ones, self.It_init)

    # The visual input and hidden-to-hidden weights are always used in step
    non_seqs = [visual_input, W_hid_stacked]
    # The input weights and biases are only needed inside the step when the
    # input was not precomputed; placeholders keep the argument positions.
    if not self.precompute_input:
        non_seqs += [W_in_stacked, b_stacked]
    else:
        non_seqs += [(), ()]

    # The "peephole" weight matrices are only used when self.peepholes=True
    if self.peepholes:
        non_seqs += [self.W_cell_to_ingate,
                     self.W_cell_to_forgetgate,
                     self.W_cell_to_outgate]
    else:
        non_seqs += [(), (), ()]

    # The attention weights are always used in step
    non_seqs += [self.W_h_to_attenGate, self.W_g_to_attenGate,
                 self.W_v_to_attenGate, self.W_s_to_attenGate, self.W_p]

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]

        # Explicitly unroll the recurrence instead of using scan
        cell_out, hid_out, It = unroll_scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[cell_init, hid_init, It_init],
            go_backwards=self.backwards,
            non_sequences=non_seqs,
            n_steps=input_shape[1])
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        cell_out, hid_out, It = theano.scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[cell_init, hid_init, It_init],
            go_backwards=self.backwards,
            truncate_gradient=self.gradient_steps,
            non_sequences=non_seqs,
            strict=True)[0]

    # Swap the step and batch axes back
    It = It.dimshuffle(1, 0, 2)
    if self.backwards:
        It = It[:, ::-1]
    return It
# get_output_for: build the symbolic output of a time-gated ("TLSTM") LSTM layer.
#
# Inputs (all taken from the `inputs` list):
#   - inputs[0] is the layer input.
#   - mask / hid_init / cell_init are read from `inputs` only when the matching
#     `self.*_incoming_index` is > 0; otherwise they stay None.
#   - the time matrix is always read from inputs[self.time_incoming_index].
#
# What the visible code does:
#   - Six input weight blocks and six biases are concatenated into
#     W_in_stacked / b_stacked. `slice_w` slots 0-3 feed the four LSTM gates;
#     slots 4 and 5 feed timegate2 and timegate1 respectively. (The inline
#     "4*num_units" comments predate the two extra time-gate blocks.)
#   - W_t_stacked holds three time weight blocks: slot 0 is added to the
#     output-gate pre-activation, slots 1 and 2 go through the "inside"
#     nonlinearities of time gates 2 and 1.
#   - `cell` (scaled by timegate2) is the state carried to the next step,
#     while `tilde_cell` (scaled by timegate1) is what the output gate and
#     the hidden activation are computed from.
#   - `step_masked` keeps the previous cell/hid wherever mask_n is 0.
#   - The recurrence runs through `unroll_scan` or `theano.scan` depending on
#     self.unroll_scan.
#
# Returns: hid_out -- the last step when self.only_return_final is set,
# otherwise dimshuffled back to batch-major and reversed along the time axis
# when self.backwards is set.
#
# NOTE(review): several expressions in this copy are truncated and do not
# parse: `if input =`, `T.switch(, self.boundary)`, `time_input =, W_t_stacked)`,
# `input =, W_in_stacked) + b_stacked`, `gates = input_n +, W_hid_stacked)`,
# `cell_init =, self.cell_init)` and `hid_init =, self.hid_init)`. The nearby
# comments ("Dot against a 1s vector ...") suggest the missing pieces were
# dot-product calls -- confirm against the upstream file before use.
# NOTE(review): `time_seq_len` and `time_num_batch` are assigned but not used
# anywhere in the visible body.
# NOTE(review): the peephole weights are appended to `non_seqs`, but `step`
# reads them directly from `self` and ignores `*args`; presumably they are
# passed only so that scan(strict=True) accepts them -- confirm.
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # TLSTM: Define new input time_mat = inputs[self.time_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) if input = # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) #(n_time_steps, n_batch) time_input = time_mat.dimshuffle(1, 0, 'x') time_seq_len, time_num_batch, _ = time_input.shape seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate, self.W_x2_to_tg2, self.W_x1_to_tg1 ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate([ self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate, self.b2_tg2, self.b1_tg1 ], axis=0) # W_t1_to_tg1_constraint < 0 W_t1_to_tg1_constraint = T.switch(, self.boundary), self.W_t1_to_tg1, self.boundary) # Stack delta time weight matrices into a (num_inputs, 2* num_units) W_t_stacked = T.concatenate( [self.W_to_to_outgate, self.W_t2_to_tg2, W_t1_to_tg1_constraint], axis=1) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). 
input is then # (n_time_steps, n_batch, 4*num_units). time_input =, W_t_stacked) input =, W_in_stacked) + b_stacked # When theano.scan calls step, input_n will be (n_batch, 4*num_units). # We define a slicing function that extract the input to each LSTM gate def slice_w(x, start, stride=1): return x[:, start * self.num_units:(start + stride) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input # todo # insert Tm_n, weight_t_o_n in to mask_n and xell_previous def step(input_n, time_input_n, cell_previous, hid_previous, *args): if not self.precompute_input: time_input_n =, W_t_stacked) input_n =, W_in_stacked) + b_stacked tm_wto_n = slice_w(time_input_n, 0) tm_w2_n = slice_w(time_input_n, 1) tm_w1_n = slice_w(time_input_n, 2) tm_w2_n = self.nonlinearity_inside_tg2(tm_w2_n) tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n) tm2_xwb_n = slice_w(input_n, 4) tm1_xwb_n = slice_w(input_n, 5) timegate2 = self.nonlinearity_outside_tg2(tm_w2_n + tm2_xwb_n) timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n) input_n = slice_w(input_n, 0, 4) # Calculate gates pre-activations and slice gates = input_n +, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) outgate += tm_wto_n if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * timegate2 * cell_input tilde_cell = forgetgate * cell_previous + ingate * timegate1 * cell_input if self.peepholes: 
outgate += tilde_cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(tilde_cell) return [cell, hid] def step_masked(input_n, time_input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, time_input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, time_input, mask] step_fun = step_masked else: sequences = [input, time_input] step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init =, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init =, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked, W_t_stacked] else: pass if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, 
non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
def initial_outputs(self, batch_size):
    """Return ``self.initial_output`` repeated once per batch element.

    The result is ``self.initial_output`` multiplied by a length-``batch_size``
    vector of int64 ones.
    """
    per_example_ones = tensor.ones((batch_size,), dtype='int64')
    return self.initial_output * per_example_ones
def generate(self, chars):
    """Run ``self.generator`` over the looked-up representation of ``chars``.

    The generator is asked for three steps per entry along the first axis of
    ``chars``, with a batch size taken from its second axis. It attends over
    ``self.lookup.apply(chars)`` using an all-ones mask of the same shape as
    ``chars``.
    """
    step_budget = 3 * chars.shape[0]
    n_examples = chars.shape[1]
    attended = self.lookup.apply(chars)
    full_mask = tensor.ones(chars.shape)
    return self.generator.generate(
        n_steps=step_budget,
        batch_size=n_examples,
        attended=attended,
        attended_mask=full_mask,
    )
# build: assemble the tagging network and compile its Theano functions.
#
# Overview of the visible code:
#   - Asserts that at least one of word_dim / phono_char_dim / ortho_char_dim
#     is truthy, and sets n_tags = len(self.id_to_tag).
#   - Declares symbolic inputs for word vectors, forward/reverse orthographic
#     and phonological character vectors, per-word character position ids,
#     type-level and token-level sparse features, and gold tag ids.
#   - Collects per-word features in `inputs` (word projection, char-LSTM
#     states picked at the *_char_pos_ids positions, optional type-level
#     sparse features), concatenates them, optionally applies dropout switched
#     on `is_train`, and feeds forward and reverse word-level LSTMs.
#   - Optionally appends token-level sparse features and the reverse LSTM
#     output, projects through `tanh_layer`, and optionally concatenates
#     attention vectors from self.get_TDAttention_vector.
#   - Without CRF the cost is the mean categorical cross-entropy of
#     `tags_scores`; with CRF it is -(real_path_score - all_paths_scores)
#     using a (n_tags + 2) x (n_tags + 2) `transitions` matrix with added
#     start/end symbols.
#   - `lr_method` is parsed as "name" or "name-key_value-key_value..."; each
#     value is converted with float().
#
# Returns: (f_train, f_eval). f_train is None when `training` is falsy. f_eval
# outputs `tags_scores` without CRF, or the result of
# forward(..., viterbi=True, return_best_sequence=True) with CRF. `is_train`
# is supplied through `givens` only when `dropout` is truthy.
#
# NOTE(review): several expressions in this copy are truncated and do not
# parse: `word_input = inputs.append(word_input)`, `inputs.append( # Prepare
# final input`, `input_train = input_test = ...`,
# `word_lstm_rev = LSTM(...)[::-1, :])`, `final_output = else:`,
# `final_ortho_attention_input = final_output)`,
# `final_phono_attention_input = final_output)` and `tags_scores = # No CRF`.
# No call that feeds the character/word LSTMs their input is visible either,
# although their `.h` attribute is read. Presumably `.link(...)` calls were
# lost -- confirm against the upstream file.
# NOTE(review): when exactly one attention vector is collected,
# `attention_vectors` stays a Python list (it is only concatenated when
# len > 1) and is then placed inside another list passed to T.concatenate;
# when an attention flag is set but the matching *_char_dim is falsy the list
# is empty. Both cases look unintended -- confirm.
# NOTE(review): `input_dim` is increased by type_sparse_feats_input_dim even
# though a projection layer to type_sparse_feats_proj_dim is created; what is
# actually appended is truncated here, so verify the dimension bookkeeping.
# NOTE(review): the body mixes the Python 2 print statement
# (`print 'Compiling...'`) with the call form (`print("Finished Compiling")`).
def build( self, dropout, ortho_char_input_dim, # Should be inferred from the input ortho_char_dim, ortho_char_lstm_dim, char_bidirect, word_vec_input_dim, # Should be inferred from the input wvecs word_dim, # The vector size after projection of the input vector word_lstm_dim, word_bidirect, lr_method, crf, use_type_sparse_feats, type_sparse_feats_input_dim, # Can be inferred from the output of the feature extractors type_sparse_feats_proj_dim, # This is a hyper-parameter use_token_sparse_feats, token_sparse_feats_input_dim, # Can be inferred from the output of the feature extractors # token_sparse_feats_proj_dim, # This is a hyper-parameter use_ortho_attention, use_phono_attention, # use_convolution, phono_char_input_dim, # Can be inferred phono_char_dim, phono_char_lstm_dim, training=True, **kwargs): """ Build the network. """ assert word_dim or phono_char_dim or ortho_char_dim, "No input selected while building the network!" # Training parameters n_tags = len(self.id_to_tag) # Network variables is_train = T.iscalar('is_train') word_vecs = T.dmatrix( name="word_vecs") # A vector for each word in the sentence # => matrix: (len_sent, w_emb_dim) ortho_char_for_vecs = T.dtensor3( name="ortho_char_for_vecs" ) # For each char of each word in the sentence, a char vector # ortho_char_for_vecs = T.ftensor3(name="ortho_char_for_vecs") # => tensor of form: (len_sent, max_wchar_len, char_emb_dim) ortho_char_rev_vecs = T.dtensor3(name="ortho_char_rev_vecs") # ortho_char_rev_vecs = T.ftensor3(name="ortho_char_rev_vecs") # For each char of each word in the sentence, a char vector # => tensor of form: (len_sent, max_wchar_len, char_emb_dim) phono_char_for_vecs = T.dtensor3(name="phono_char_for_vecs") # phono_char_for_vecs = T.ftensor3(name="phono_char_for_vecs") # For each char of each word in the sentence, a char vector # => tensor of form: (len_sent, max_ortho_char_len, char_emb_dim) phono_char_rev_vecs = T.dtensor3(name="phono_char_rev_vecs") # phono_char_rev_vecs = 
T.ftensor3(name="phono_char_rev_vecs") # For each char of each word in the sentence, a char vector # => tensor of form: (len_sent, max_phono_char_len, char_emb_dim) ortho_char_pos_ids = T.ivector(name='ortho_char_pos_ids') # The word len for each word in the sentence => vect of form: (len_sent,) phono_char_pos_ids = T.ivector(name='phono_char_pos_ids') # The word len for each word in the sentence => vect of form: (len_sent,) type_sparse_feats = T.imatrix(name="type_sparse_feats") # Type sparse features are appended to the input to the word lstm # For each word, a vector of type level sparse feats => mat of form: (len_sent, type_sparse_dim) token_sparse_feats = T.imatrix(name="token_sparse_feats") # Token sparse features are appended to the pre-crf layer # For each word, a vector of token level sparse feats => mat of form: (len_sent, token_sparse_dim) tag_ids = T.ivector(name='tag_ids') # The tag id for each word in the sentence => vect of form: (len_sent,) # Sentence length s_len = (word_vecs if word_dim else ortho_char_pos_ids if ortho_char_dim else phono_char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = HiddenLayer(word_vec_input_dim, word_dim, activation="tanh", name="word_emb_proj") # TO DO : Try not using the bias term in the hidden layer word_input = inputs.append(word_input) # # Chars inputs # if ortho_char_dim: input_dim += ortho_char_lstm_dim ortho_char_layer = HiddenLayer(ortho_char_input_dim, ortho_char_dim, activation="tanh", name="ortho_char_emb_proj") # TO DO : Try not using bias in the hidden layer ortho_char_lstm_for = LSTM(ortho_char_dim, ortho_char_lstm_dim, with_batch=True, name='ortho_char_lstm_for') ortho_char_lstm_rev = LSTM(ortho_char_dim, ortho_char_lstm_dim, with_batch=True, name='ortho_char_lstm_rev') ortho_char_for_output = ortho_char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids] ortho_char_rev_output = 
ortho_char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids] inputs.append(ortho_char_for_output) if char_bidirect: inputs.append(ortho_char_rev_output) input_dim += ortho_char_lstm_dim if phono_char_dim: input_dim += phono_char_lstm_dim phono_char_layer = HiddenLayer(phono_char_input_dim, phono_char_dim, activation="tanh", name="phono_char_emb_proj") # TO DO : Try not using bias in the hidden layer phono_char_lstm_for = LSTM(phono_char_dim, phono_char_lstm_dim, with_batch=True, name='phono_char_lstm_for') phono_char_lstm_rev = LSTM(phono_char_dim, phono_char_lstm_dim, with_batch=True, name='phono_char_lstm_rev') phono_char_for_output = phono_char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), phono_char_pos_ids] phono_char_rev_output = phono_char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), phono_char_pos_ids] inputs.append(phono_char_for_output) if char_bidirect: inputs.append(phono_char_rev_output) input_dim += phono_char_lstm_dim # Type level sparse feats # if use_type_sparse_feats: input_dim += type_sparse_feats_input_dim type_level_sparse_layer = HiddenLayer( type_sparse_feats_input_dim, type_sparse_feats_proj_dim, activation="tanh", name='type_level_sparse_layer') # TO DO : Try not using the hidden layer here inputs.append( # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # TO DO : If using type sparse features, then apply hidden layer after concatenating all inputs else: inputs = inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = input_test = (1 - dropout) * inputs """ Drop out involves sampling a vector of bernoulli random variables with a parameter 1-p and using it as a mask So, the expected value of the dropped out input is p * (0*x) + (1-p) * (1*x) = (1-p) * x. Since biases will on average respond to the expected input value, at test time we multiply test inputs (1-p) to supply the expected test input instead. 
""" inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev')[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] lstm_outputs = [word_for_output] post_word_lstm_output_size = word_lstm_dim if use_token_sparse_feats: # token_level_sparse_layer = HiddenLayer(token_sparse_feats_input_dim, token_sparse_feats_proj_dim, # activation="tanh", # name='token_level_sparse_layer') # # TO DO : Try not using the hidden layer here # lstm_outputs.append( # post_word_lstm_output_size += token_sparse_feats_proj_dim lstm_outputs.append(token_sparse_feats) post_word_lstm_output_size += token_sparse_feats_input_dim if word_bidirect: lstm_outputs.append(word_rev_output) post_word_lstm_output_size += word_lstm_dim if len(lstm_outputs) > 1: final_output = T.concatenate(lstm_outputs, axis=1) tanh_layer = HiddenLayer(post_word_lstm_output_size, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = else: final_output = word_for_output final_pre_crf_input_size = word_lstm_dim attention_vectors = [] attention_vector_size = 0 if use_ortho_attention and ortho_char_dim: # final_ortho_attention_input_layer = HiddenLayer(post_word_lstm_output_size, ortho_char_lstm_dim, # name='final_ortho_attention_input_layer', activation='tanh') final_ortho_attention_input_layer = HiddenLayer( word_lstm_dim, ortho_char_lstm_dim, name='final_ortho_attention_input_layer', activation='tanh') final_ortho_attention_input = final_output) # Evaluating attentional vector using a linear projection from final_output since the attention vector # must be conditioned on it and dimension must match the char lstm hidden dim. 
ortho_for_attention = self.get_TDAttention_vector( final_ortho_attention_input, ortho_char_lstm_for.h.dimshuffle((1, 0, 2)), ortho_char_pos_ids) if char_bidirect: ortho_rev_attention = self.get_TDAttention_vector( final_ortho_attention_input, ortho_char_lstm_rev.h.dimshuffle((1, 0, 2)), ortho_char_pos_ids) attention_vectors.append(ortho_rev_attention) attention_vector_size += ortho_char_lstm_dim attention_vectors.append(ortho_for_attention) attention_vector_size += ortho_char_lstm_dim if use_phono_attention and phono_char_dim: # final_phono_attention_input_layer = HiddenLayer(post_word_lstm_output_size, phono_char_lstm_dim, # name='final_phono_attention_input_layer', activation='tanh') final_phono_attention_input_layer = HiddenLayer( word_lstm_dim, phono_char_lstm_dim, name='final_phono_attention_input_layer', activation='tanh') # Evaluating attentional vector using a linear projection from final_output since the attention vector # must be conditioned on it and dimension must match the char lstm hidden dim. 
final_phono_attention_input = final_output) phono_for_attention = self.get_TDAttention_vector( final_phono_attention_input, phono_char_lstm_for.h.dimshuffle((1, 0, 2)), phono_char_pos_ids) if char_bidirect: phono_rev_attention = self.get_TDAttention_vector( final_phono_attention_input, phono_char_lstm_rev.h.dimshuffle((1, 0, 2)), phono_char_pos_ids) attention_vectors.append(phono_rev_attention) attention_vector_size += phono_char_lstm_dim attention_vectors.append(phono_for_attention) attention_vector_size += phono_char_lstm_dim if len(attention_vectors) > 1: attention_vectors = T.concatenate(attention_vectors, axis=1) if use_phono_attention or use_ortho_attention: final_output = T.concatenate([final_output, attention_vectors], axis=1) post_word_lstm_output_size += attention_vector_size final_pre_crf_input_size += attention_vector_size # Sentence to Named Entity tags - Score final_layer = HiddenLayer(final_pre_crf_input_size, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') # n_tags + 2 to accommodate start and end symbols small = -1000 # = -log(inf) b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) # Score of starting at start symbol is 1 => -log(1) = 0. Score of start symbol emitting any other NER # tag is -log(inf) = small e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) # Score of ending at end symbol is 1 => -log(1) = 0. Score of end symbol emitting any other NER # tag is -log(inf) = small observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) # observations is the emission energy (-log potential) between each token and each tag. 
# Emission score of intermediate words towards start and end tags is -log(inf) observations = T.concatenate([b_s, observations, e_s], axis=0) # observations now contains the emission energies for start token, sentence tokens and end token # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Sum of energies associated with the gold tags # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() # Transition scores from label_i to label_{i+1} all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if ortho_char_dim: self.add_component(ortho_char_layer) self.add_component(ortho_char_lstm_for) params.extend(ortho_char_layer.params) params.extend(ortho_char_lstm_for.params) if char_bidirect: self.add_component(ortho_char_lstm_rev) params.extend(ortho_char_lstm_rev.params) if phono_char_dim: self.add_component(phono_char_layer) self.add_component(phono_char_lstm_for) params.extend(phono_char_layer.params) params.extend(phono_char_lstm_for.params) if char_bidirect: self.add_component(phono_char_lstm_rev) params.extend(phono_char_lstm_rev.params) if use_type_sparse_feats: self.add_component(type_level_sparse_layer) params.extend(type_level_sparse_layer.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if word_bidirect or len(lstm_outputs) > 1: self.add_component(tanh_layer) params.extend(tanh_layer.params) if use_ortho_attention and ortho_char_dim: self.add_component(final_ortho_attention_input_layer) 
params.extend(final_ortho_attention_input_layer.params) if use_phono_attention and phono_char_dim: self.add_component(final_phono_attention_input_layer) params.extend(final_phono_attention_input_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) # Prepare train and eval inputs eval_inputs = [] if word_dim: # eval_inputs.append(word_ids) eval_inputs.append(word_vecs) if ortho_char_dim: # eval_inputs.append(char_for_ids) eval_inputs.append(ortho_char_for_vecs) if char_bidirect: # eval_inputs.append(char_rev_ids) eval_inputs.append(ortho_char_rev_vecs) eval_inputs.append(ortho_char_pos_ids) if phono_char_dim: # eval_inputs.append(char_for_ids) eval_inputs.append(phono_char_for_vecs) if char_bidirect: # eval_inputs.append(char_rev_ids) eval_inputs.append(phono_char_rev_vecs) eval_inputs.append(phono_char_pos_ids) if use_type_sparse_feats: eval_inputs.append(type_sparse_feats) if use_token_sparse_feats: eval_inputs.append(token_sparse_feats) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' 
if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) print("Finished Compiling") return f_train, f_eval
# TwoDLSTMLayer.__init__: set up a 2D LSTM layer over a single 4-D source.
#
# Parameters (as used by the visible code):
#   n_out           -- passed to the superclass; multiplied by `directions`
#                      when projection == 'concat' before being stored as the
#                      'n_out' attribute.
#   collapse_output -- False, True/'sum', 'mean', 'conv', 'flatten' or
#                      'pad_<N>'; any other value trips `assert False`.
#   directions      -- must be 1, 2 or 4 (asserted).
#   projection      -- 'average' or 'concat' (asserted); decides how the
#                      per-direction outputs are merged.
#   base            -- optional list; when non-empty, parameters plus `mass`
#                      and `masks` are taken from base[0] instead of being
#                      newly created.
#
# Flow: assert exactly one source with a 4-D output; obtain biases and
# W / V_h / V_v weights per direction; apply the dropout mask when
# self.masks[0] is not None; compute Y (a zeros placeholder when
# theano.config.device starts with 'cpu', otherwise the bidirectional or
# multi-directional 2D LSTM op); merge directions; build self.index with a
# scan over sizes[:, 1]; apply the requested collapse mode; optionally apply
# batch norm; store the result in self.output.
#
# NOTE(review): `if directions >= 1:` is always true after the assert on
# `directions`, so the "3"/"4" parameters are always created or shared, even
# for 1 or 2 directions -- confirm whether `> 2` was intended.
# NOTE(review): in the non-`base` branch this method never assigns self.masks
# or self.mass, yet both are read afterwards; presumably the superclass sets
# them -- confirm.
# NOTE(review): two expressions in this copy are truncated and do not behave
# as written: `Y = Y[0] = 'Y'` (looks like `Y = Y[0]` followed by a name
# assignment) and the 'pad_' branch `ifelse([0], pad), ...`, whose comparison
# calls are missing. Confirm against the upstream file before use.
def __init__(self, n_out, collapse_output=False, directions=4, projection='average', base=None, **kwargs): if base is None: base = [] super(TwoDLSTMLayer, self).__init__(n_out, **kwargs) assert len(self.sources) == 1 source = self.sources[0] n_in = source.attrs['n_out'] X = source.output assert X.ndim == 4 sizes = source.output_sizes self.output_sizes = sizes assert directions in [1, 2, 4], "only 1, 2 or 4 directions are supported" assert projection in ['average', 'concat'], "invalid projection" if base: self.b1 = self.add_param(base[0].b1) self.b2 = self.add_param(base[0].b2) if directions >= 1: self.b3 = self.add_param(base[0].b3) self.b4 = self.add_param(base[0].b4) self.W1, self.V_h1, self.V_v1 = self.add_param( base[0].W1), self.add_param(base[0].V_h1), self.add_param( base[0].V_v1) self.W2, self.V_h2, self.V_v2 = self.add_param( base[0].W2), self.add_param(base[0].V_h2), self.add_param( base[0].V_v2) if directions >= 1: self.W3, self.V_h3, self.V_v3 = self.add_param( base[0].W3), self.add_param(base[0].V_h3), self.add_param( base[0].V_v3) self.W4, self.V_h4, self.V_v4 = self.add_param( base[0].W4), self.add_param(base[0].V_h4), self.add_param( base[0].V_v4) #self.mass = base[0].mass #self.masks = base[0].masks #self.b1 = base[0].b1 #self.b2 = base[0].b2 #if directions >= 1: # self.b3 = base[0].b3 # self.b4 = base[0].b4 #self.W1, self.V_h1, self.V_v1 = base[0].W1, base[0].V_h1, base[0].V_v1 #self.W2, self.V_h2, self.V_v2 = base[0].W2, base[0].V_h2, base[0].V_v2 #if directions >= 1: # self.W3, self.V_h3, self.V_v3 = base[0].W3, base[0].V_h3, base[0].V_v3 # self.W4, self.V_h4, self.V_v4 = base[0].W4, base[0].V_h4, base[0].V_v4 self.mass = base[0].mass self.masks = base[0].masks else: self.b1 = self.create_and_add_bias(n_out, "1") self.b2 = self.create_and_add_bias(n_out, "2") if directions >= 1: self.b3 = self.create_and_add_bias(n_out, "3") self.b4 = self.create_and_add_bias(n_out, "4") self.W1, self.V_h1, self.V_v1 = self.create_and_add_2d_lstm_weights( n_in, 
n_out, "1") self.W2, self.V_h2, self.V_v2 = self.create_and_add_2d_lstm_weights( n_in, n_out, "2") if directions >= 1: self.W3, self.V_h3, self.V_v3 = self.create_and_add_2d_lstm_weights( n_in, n_out, "3") self.W4, self.V_h4, self.V_v4 = self.create_and_add_2d_lstm_weights( n_in, n_out, "4") # dropout assert len(self.masks) == 1 mask = self.masks[0] if mask is not None: X = self.mass * mask * X if str(theano.config.device).startswith('cpu'): Y = T.zeros_like(X) if projection == 'concat': Y = Y.repeat(directions, axis=-1) n_out *= directions else: if directions <= 2: Y = BidirectionalTwoDLSTMOpInstance(X, self.W1, self.W2, self.V_h1, self.V_h2, self.V_v1, self.V_v2, self.b1, self.b2, sizes) else: Y = MultiDirectionalTwoDLSTMOpInstance( X, self.W1, self.W2, self.W3, self.W4, self.V_h1, self.V_h2, self.V_h3, self.V_h4, self.V_v1, self.V_v2, self.V_v3, self.V_v4, self.b1, self.b2, self.b3, self.b4, sizes) if directions > 1: Y = T.stack(Y[:directions], axis=-1) if projection == 'average': Y = Y.mean(axis=-1) elif projection == 'concat': Y = Y.reshape((Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[4])) n_out *= directions else: Y = Y[0] = 'Y' self.set_attr('n_out', n_out) self.set_attr('collapse_output', collapse_output) self.set_attr('directions', directions) self.set_attr('projection', projection) #index handling def index_fn(index, size): return T.set_subtensor(index[:size], numpy.cast['int8'](1)) index_init = T.zeros((Y.shape[2], Y.shape[1]), dtype='int8') self.index, _ = theano.scan( index_fn, [index_init, T.cast(sizes[:, 1], "int32")]) self.index = self.index.dimshuffle(1, 0) if collapse_output == 'sum' or collapse_output == True: Y = Y.sum(axis=0) elif collapse_output == 'mean': Y = Y.mean(axis=0) elif collapse_output == 'conv': from TheanoUtil import circular_convolution Y, _ = theano.scan(lambda x_i, x_p: circular_convolution(x_i, x_p), Y, Y[0]) Y = Y[-1] elif collapse_output == 'flatten': self.index = T.ones((Y.shape[0] * Y.shape[1], Y.shape[2]), 
dtype='int8') Y = Y.reshape((Y.shape[0] * Y.shape[1], Y.shape[2], Y.shape[3])) elif str(collapse_output).startswith('pad_'): pad = numpy.int32(collapse_output.split('_')[-1]) Y = ifelse([0], pad), T.concatenate([ Y, T.zeros( (pad - Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3]), 'float32') ], axis=0), ifelse([0], pad), Y[:pad], Y)) Y = Y.dimshuffle(1, 2, 3, 0).reshape( (Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[0])) self.attrs['n_out'] *= pad elif collapse_output != False: assert False, "invalid collapse mode" if self.attrs['batch_norm']: Y = self.batch_norm( Y, self.attrs['n_out'], index=sizes if not collapse_output else self.index, force_sample=False) self.output = Y
def broadcast_vec(x, n):
    """Repeat ``x`` along a new leading axis of length ``n``.

    Multiplies an ``(n, 1)`` column of ones by ``x``, so every row of the
    result is a copy of ``x``.

    Args:
        x: Symbolic value to repeat. Assumes ``x`` is a row matrix whose
            first dimension is 1 so the product is defined -- TODO confirm
            against callers.
        n: Number of rows in the result.

    Returns:
        The product of the ones column and ``x``.
    """
    form = TT.ones((n, 1))
    # The product with the ones column is what tiles `x`; a bare
    # `return, x)` does not parse.
    return TT.dot(form, x)
elif objective_flag == 'argmax': # argmax approximation cla_out_y_hard = cla_out_y.argmax(axis=1) dis_out_p_c = ll.get_output(dis_layers[-1], {dis_in_x:sym_x_u,dis_in_y:cla_out_y_hard}, deterministic=False) else: raise Exception('Unknown objective flags') image = ll.get_output(gen_layers[-1], {gen_in_y:sym_y_g, gen_in_z:sym_z_image}, deterministic=False) # for generation accurracy_eval = (lasagne.objectives.categorical_accuracy(cla_out_y_eval, sym_y)) # for evaluation accurracy_eval = accurracy_eval.mean() # costs bce = lasagne.objectives.binary_crossentropy dis_cost_p = bce(dis_out_p, T.ones(dis_out_p.shape)).mean() # D distincts p dis_cost_p_g = bce(dis_out_p_g, T.zeros(dis_out_p_g.shape)).mean() # D distincts p_g gen_cost_p_g = bce(dis_out_p_g, T.ones(dis_out_p_g.shape)).mean() # G fools D weight_decay_classifier = lasagne.regularization.regularize_layer_params_weighted({cla_layers[-1]:1}, lasagne.regularization.l2) # weight decay dis_cost_p_c = bce(dis_out_p_c, T.zeros(dis_out_p_c.shape)) # D distincts p_c cla_cost_p_c = bce(dis_out_p_c, T.ones(dis_out_p_c.shape)) # C fools D if objective_flag == 'integrate': # integrate weight_loss_c = T.reshape(cla_cost_p_c, (-1, num_classes)) * cla_out_y cla_cost_p_c = T.sum(weight_loss_c, axis=1).mean() weight_loss_d = T.reshape(dis_cost_p_c, (-1, num_classes)) * cla_out_y dis_cost_p_c = T.sum(weight_loss_d, axis=1).mean() elif objective_flag == 'argmax':
def reg_EPhi(self, lengthscale_trf, lengthscale_p_trf, sf_trf, S, MU,
             SIGMA_trf, U, b, N, M, i, D, order, non_rec):
    """Build the symbolic ``M x M`` regulariser term ``reg_EEPhiTPhi``.

    The spectral mean is fixed to zero, the spectral variance to one and the
    phase ``b`` to zero (only the *shapes* of ``S`` and ``b`` are read). The
    result is ``(sf_trf**2 / 2) * Z_n_U * sum_n(term_n)``, where the sum over
    the ``N`` rows of ``MU`` / ``SIGMA_trf`` is accumulated with
    ``theano.scan``.

    Args:
        lengthscale_trf: length scales, shape ``D[i]``.
        lengthscale_p_trf: period length scales, shape ``D[i]``.
        sf_trf: scalar signal scale.
        S: ``M x D[i]``; only its shape is used.
        MU: ``N x D[i]`` input means.
        SIGMA_trf: ``N x D[i]`` input variances.
        U: ``M x D[i]`` pseudo-input locations.
        b: length ``M``; only its shape is used (values are replaced by zeros).
        N: number of scan steps (rows of ``MU`` that are visited).
        M: concrete integer; sizes the ``np.zeros((M, M))`` accumulator.
        i: selects the step variant; ``i == 0`` uses the ``D``/``order``/
            ``non_rec`` aware variant.
        D, order: bounds of the trailing ``[D - order:D]`` slice used when
            ``i == 0`` and ``non_rec == 0``.
        non_rec: flag; must be a plain Python/NumPy number (see note below).

    Returns:
        Symbolic ``M x M`` matrix ``reg_EEPhiTPhi``.

    Note:
        ``i`` and ``non_rec`` are compared with ``==`` *before* entering scan.
        Previously ``non_rec == 0`` was evaluated inside the step function on
        a value that ``theano.scan`` had already converted to a symbolic
        constant; Theano variables do not overload ``==``, so that test was
        always False and the ``non_rec == 0`` branch could never run.
    """
    # lengthscale_trf    # D[i]
    # lengthscale_p_trf  # D[i]
    # sf_trf             # 1
    # S                  # M x D[i]
    # MU                 # N x D[i]
    # SIGMA_trf          # N x D[i]
    # U                  # M x D[i]
    # b                  # M
    # N                  # 1
    # M                  # 1

    # Regulariser setting: zero phase, zero spectral mean, unit spectral variance.
    b = T.zeros(T.shape(b))
    MU_S = T.zeros(T.shape(S))
    SIGMA_S_trf = T.ones(T.shape(S))

    inv_SIGMA_trf = SIGMA_trf**-1  # N x D[i]

    MU_S_hat = (lengthscale_trf**-1 * MU_S
                + 2 * np.pi * lengthscale_p_trf**-1)  # M x D[i]
    # Row vector (1 x M); the +/- with its transpose below broadcasts to M x M.
    MU_S_hat_U_b = -(MU_S_hat * U).sum(1)[None, :] + b  # 1 x M

    big_sum_minus = MU_S_hat_U_b - MU_S_hat_U_b.T  # M x M
    big_sum_plus = MU_S_hat_U_b + MU_S_hat_U_b.T  # M x M

    MU_S_hat_minus = MU_S_hat[None, :, :] - MU_S_hat[:, None, :]  # M x M x D[i]
    MU_S_hat_plus = MU_S_hat[None, :, :] + MU_S_hat[:, None, :]  # M x M x D[i]

    u_EEPhiTPhi = (U[None, :, :] - U[:, None, :])**2  # M x M x D[i]

    b_bold_denomi = (SIGMA_S_trf[None, :, :]
                     + SIGMA_S_trf[:, None, :])  # M x M x D[i]
    sum_SIGMA_S_U = SIGMA_S_trf * U  # M x D[i]
    b_bold = (sum_SIGMA_S_U[None, :, :]
              + sum_SIGMA_S_U[:, None, :]) / b_bold_denomi  # M x M x D[i]

    B = (lengthscale_trf**2)[None, None, :] / b_bold_denomi  # M x M x D[i]
    inv_B = 1 / B  # M x M x D[i]

    U_EEPhiTPhi = (lengthscale_trf**2)[None, None, :] * (
        SIGMA_S_trf[None, :, :]**-1
        + SIGMA_S_trf[:, None, :]**-1)  # M x M x D[i]

    norm_EEPhiTPhi_U_temp = lengthscale_trf[None, None, :]**2 / (
        (SIGMA_S_trf[None, :, :] * SIGMA_S_trf[:, None, :])
        * U_EEPhiTPhi)**0.5  # M x M x D[i]

    # T.exp (not np.exp): the argument is a symbolic expression, and every
    # other exponential in this function already goes through theano.tensor.
    Z_n_U_EEPhiTPhi = T.exp(
        -0.5 * (u_EEPhiTPhi / U_EEPhiTPhi).sum(2))  # M x M

    inv_B_b_bold = inv_B * b_bold  # M x M x D[i]
    inv_SIGMA_trf_MU = inv_SIGMA_trf * MU  # N x D[i]

    # Initial scan state: row counter (incremented before use) and accumulator.
    EPhiTPhi = np.zeros((M, M))
    loop = np.int64(-1)

    # Decide the step variant here, on the caller's plain Python values.
    is_i0 = i == 0
    update_tail = is_i0 and non_rec == 0

    def EPhiTPhi_step(loop, EPhiTPhi, MU, SIGMA_trf, inv_SIGMA_trf,
                      inv_SIGMA_trf_MU, inv_B, b_bold, inv_B_b_bold, B,
                      MU_S_hat_minus, MU_S_hat_plus, big_sum_minus,
                      big_sum_plus, norm_EEPhiTPhi_U_temp):
        """Add the contribution of row ``loop + 1`` to the accumulator."""
        loop = loop + 1
        MU_n = MU[loop, :]  # D[i]

        D_n = (inv_B + inv_SIGMA_trf[loop, :][None, None, :])**-1  # M x M x D[i]

        if update_tail:
            # i == 0 and non_rec == 0: only the trailing `order` dimensions
            # get the combined mean; the leading D - order keep MU_n itself.
            tail = D_n[:, :, D - order:D] * (
                inv_B_b_bold[:, :, D - order:D]
                + inv_SIGMA_trf_MU[loop, :][None, None, D - order:D]
            )  # M x M x order
            head = (MU_n[0:D - order][None, None, :]
                    + T.zeros_like(inv_B[:, :, 0:D - order]))
            d_n = T.concatenate((head, tail), axis=2)  # M x M x D[i]
        elif is_i0:
            # i == 0 and non_rec != 0: MU_n broadcast to M x M x D[i].
            d_n = MU_n[None, None, :] + T.zeros_like(inv_B)
        else:
            # i != 0: combined mean on every dimension.
            d_n = D_n * (
                inv_B_b_bold + inv_SIGMA_trf_MU[loop, :][None, None, :]
            )  # M x M x D[i]

        W = B + SIGMA_trf[loop, :][None, None, :]  # M x M x D[i]
        # Here we put det(U), det(W) together because of numeric issues
        # (prod(2) is huge for huge input dimensions).
        norm_EEPhiTPhi_U_W = (norm_EEPhiTPhi_U_temp / W**0.5).prod(2)  # M x M

        Z_n_W = T.exp(
            -0.5 * ((b_bold - MU_n[None, None, :])**2 / W).sum(2))  # M x M

        EPhiTPhi = EPhiTPhi + Z_n_W * norm_EEPhiTPhi_U_W * (
            T.exp(-0.5 * (MU_S_hat_minus**2 * D_n).sum(2))
            * T.cos((MU_S_hat_minus * d_n).sum(2) + big_sum_minus)
            + T.exp(-0.5 * (MU_S_hat_plus**2 * D_n).sum(2))
            * T.cos((MU_S_hat_plus * d_n).sum(2) + big_sum_plus))  # M x M

        return loop, EPhiTPhi

    result, _ = theano.scan(
        EPhiTPhi_step,
        outputs_info=[loop, EPhiTPhi],
        n_steps=N,
        non_sequences=[
            MU, SIGMA_trf, inv_SIGMA_trf, inv_SIGMA_trf_MU, inv_B, b_bold,
            inv_B_b_bold, B, MU_S_hat_minus, MU_S_hat_plus, big_sum_minus,
            big_sum_plus, norm_EEPhiTPhi_U_temp
        ])

    # result = [counter sequence, accumulator sequence]; take the last
    # accumulator value, i.e. the full sum over the N steps.
    EPhiTPhi_out = result[-1][-1]  # M x M

    reg_EEPhiTPhi = (sf_trf**2 / 2) * Z_n_U_EEPhiTPhi * EPhiTPhi_out  # M x M

    return reg_EEPhiTPhi
s = [] oa = T.sum(a) ob = T.sum(b) o = oa + ob # sTore all the out puts in a list to return the values to the scan function s.append(o) s.append(a) s.append(b) return s s = [] s.append(None) s.append(dict(initial=T.ones(3))) s.append(dict(initial=T.ones(4))) #output, updates = theano.scan( # op, # sequences=x, # truncate_gradient=4, # outputs_info=[None, # dict(initial=T.ones(3)), # dict(initial=T.ones(4))]) output, updates = theano.scan(op, sequences=x, truncate_gradient=4, outputs_info=s)