Example #1
    def __init__(self,model,
                 dis_updater = updates.Adam(lr=sharedX(0.0002), b1=0.5, regularizer=updates.Regularizer(l2=1e-5)),
                 gen_updater = updates.Adam(lr=sharedX(0.0002), b1=0.5, regularizer=updates.Regularizer(l2=1e-5))):

        X = model.X
        Z = model.Z
        targets = T.matrix()

        genX = model.genX

        disX = model.disX
        disgenX = model.disgenX

        disX_loss = bce(disX, T.ones(disX.shape)).mean()
        disgenX_loss = bce(disgenX, T.zeros(disgenX.shape)).mean()
        genX_loss = bce(disgenX, T.ones(disgenX.shape)).mean()

        dis_loss = disX_loss + disgenX_loss
        gen_loss = genX_loss

        trainable_discrim_params = model.trainable_discrim_params
        trainable_gen_params = model.trainable_gen_params

        dis_updates = dis_updater(trainable_discrim_params, dis_loss) + model.other_discrim_updates
        gen_updates = gen_updater(trainable_gen_params, gen_loss) + model.other_gen_updates

        print 'COMPILING'
        t = time()
        self._train_gen = theano.function([Z], gen_loss, updates=gen_updates)
        self._train_dis = theano.function([X, Z], dis_loss, updates=dis_updates)
        self._gen = theano.function([Z], genX)
        print '%.2f seconds to compile theano functions'%(time()-t)
 def compute(self, minibatch=1, steps=5, lrate=0.01):
     G = Generator(self.num_vis, self.num_hid)
     D = Discriminator(self.num_vis)
     for i in range(steps):
         # Sample m noise examples from Generator
         noise_samples = G.get_noise()
         # Sample m examples from data distribution
         data_examples = self._sample(minibatch)
         # Get real examples
         realX = D.output(data_examples)
         # Get generated examples
         genX = D.output(noise_samples)
         drealcost = T.mean(T.nnet.binary_crossentropy(realX, T.ones(realX.shape)))
          dgencost = T.mean(T.nnet.binary_crossentropy(genX, T.zeros(genX.shape)))
         gencost = T.mean(T.nnet.binary_crossentropy(genX, T.ones(genX.shape)))
         cost = drealcost + dgencost
         updates = D.update(cost.mean())
         func = theano.function([], (realX, genX), updates=updates, givens={self.x: self.data})
         print("Discriminator cost {0}: ".format(func()))
     noise_samples = G.get_noise()
     allparams = []
     for param in G.params:
         allparams.append(param)
     '''for param in D.params:
         allparams.append(param)'''
     #gencost = 1 / self.num_samples * \
     #    T.sum(T.log(1 - D.output(G.output(noise_samples))))
     grads = T.grad(T.mean(gencost), allparams)
     return gencost, [(oldparam, oldparam - lrate * newparam) for (oldparam, newparam) in zip(allparams, grads)]
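A minimal usage sketch for the routine above (hypothetical: `gan` stands for an instance of the surrounding class); the symbolic cost and the update pairs returned by `compute` could be compiled into a Theano function and called repeatedly to step the generator:

import theano

# hypothetical usage; `gan` is an instance of the class that defines compute()
gencost, gen_updates = gan.compute(minibatch=32, steps=5, lrate=0.01)
train_gen = theano.function([], gencost, updates=gen_updates)
for step in range(100):
    print("generator cost:", train_gen())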
Example #3
 def step_fun(self):
     if self._step_fun is None:
         inputs = T.matrix('inputs')
         states_tm1 = [T.matrix('state_%d_%d_tm1' % (layer, state))
                       for layer in range(self.n_layers)
                       for state in range(self.gate0.n_states)]
         if self.gates[-1].use_attention:
             raise NotImplementedError('Stacked RNN with attention')
             attended=T.tensor3('attended')
             attended_dot_u=T.tensor3('attended_dot_u')
             attention_mask=T.matrix('attention_mask')
             self._step_fun = function(
                     [inputs] + states_tm1 + [
                         attended, attended_dot_u, attention_mask],
                     self.step(*([inputs, T.ones(inputs.shape[:-1])] +
                                 states_tm1 + [T.ones_like(states_tm1[0]),
                                 attended, attended_dot_u,
                                 attention_mask])),
                     name='%s_step_fun'%self.name)
         else:
             self._step_fun = function(
                     [inputs] + states_tm1,
                     self.step(*([inputs, T.ones(inputs.shape[:-1])] +
                               states_tm1 + [T.ones_like(states_tm1[0])])),
                     name='%s_step_fun'%self.name)
     return self._step_fun
Example #4
def chi2_test_statistic(M, Obs, K, num_M, num_Obs):
    #Getting frequencies from observations
    Ns = T.dot(Obs,T.ones((K,1)))
    p = Obs/Ns
        
    #Find the zeros so we can deal with them later
    pZEROs = T.eq(p, 0)
    mZEROs = T.eq(M, 0)
    
    #log probabilities, with -INF as log(0)
    lnM = T.log(M + mZEROs) - INF*mZEROs
    lnp = T.log(p + pZEROs) - INF*pZEROs


    #Using kroneker products so every row of M hits every row of P in the difference klnM - kln
    O_ones = T.ones((num_Obs,1))
    M_ones = T.ones((num_M,1))
    klnM = kron(lnM,O_ones)
    klnP = kron(M_ones, lnp)
    klnP_M = klnP - klnM
    kObs = kron(M_ones, Obs)
    
    G = 2.0*T.dot(klnP_M ,kObs.T)
    
    G = G*T.identity_like(G)
    G = T.dot(G,T.ones((num_M*num_Obs,1)))   
    G = T.reshape(G,(num_M,num_Obs))
    
    #The following quotient improves the convergence to chi^2 by an order of magnitude
    #source: http://en.wikipedia.org/wiki/Multinomial_test
    
    #numerator = T.dot(- 1.0/(M + 0.01),T.ones((K,1))) - T.ones((num_M,1))    
    #q1 = T.ones((num_M,num_Obs)) + T.dot(numerator,1.0/Ns.T/6.0)/(K-1.0)
        
    return G#/q1 
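For a single model/observation pair, the statistic above reduces to the familiar G-test; a small NumPy sketch with made-up counts:

import numpy as np

obs = np.array([12., 8., 5.])      # observed counts over K = 3 categories
model = np.array([0.5, 0.3, 0.2])  # model (expected) probabilities
p = obs / obs.sum()                # observed frequencies
G = 2.0 * np.sum(obs * (np.log(p) - np.log(model)))
print(G)                           # compared against a chi^2 distribution with K - 1 degrees of freedom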
Example #5
    def instantiate(self, shape=None):
        # Parse shape
        shape = [None, ] * self.ndim if shape is None else shape
        initshape = tuple([shape[n] if givenshape is None else givenshape for n, givenshape in enumerate(self.shape)])
        assert all([ishp is not None for ishp in initshape]), "Given shape information not sufficient to instantiate " \
                                                              "from ghost state."

        # Initialize. If shape is a tensor variable, initialize a tensor variable and return.
        if isinstance(shape, T.vector().__class__) or not self.shared:
            # Make variable
            var = T.zeros(shape=initshape, dtype='floatX') \
                if self.value == 0. else self.value(initshape) * T.ones(shape=initshape, dtype='floatX') \
                if callable(self.value) else self.value * T.ones(shape=initshape, dtype='floatX')
            # Safety cast
            var = T.cast(var, dtype='floatX')
            var.name = self.name
            # Warn if a shared variable is requested
            if self.shared:
                warn("Provided shape variable is a theano tensor variable, it cannot be used to initialize a shared "
                     "variable.")
            # Return
            return var
        else:
            # Make variable
            var = th.shared((getattr(np, th.config.floatX)(self.value)
                             if not callable(self.value) and not np.isscalar(self.value) else
                             getattr(np, th.config.floatX)(self.value(initshape)) if callable(self.value) else
                             self.value * np.ones(shape=initshape, dtype=th.config.floatX)))

            var.name = self.name
            # Safety cast and return
            return var
Example #6
 def _initial_part_matrix(self, part, size, deterministic):
     if size is None:
         size = 1
     length, dist_name, dist_map = self._choose_alternative(
         part,
         (self.local_size, self.initial_dist_local_name, self.initial_dist_local_map),
         (self.global_size, self.initial_dist_global_name, self.initial_dist_global_map)
     )
     dtype = self.symbolic_initial_global_matrix.dtype
     if length == 0:  # in this case theano fails to compute sample of correct size
         return tt.ones((size, 0), dtype)
     length = tt.as_tensor(length)
     size = tt.as_tensor(size)
     shape = tt.stack((size, length))
     # apply optimizations if possible
     if not isinstance(deterministic, tt.Variable):
         if deterministic:
             return tt.ones(shape, dtype) * dist_map
         else:
             return getattr(self._rng, dist_name)(shape)
     else:
         sample = getattr(self._rng, dist_name)(shape)
         initial = tt.switch(
             deterministic,
             tt.ones(shape, dtype) * dist_map,
             sample
         )
         return initial
    def pos_phase_updates(self, v, init_state=None, n_steps=1, mean_field=False):
        """
        Implements the positive phase sampling, which performs block Gibbs
        sampling in order to sample from p(g,h,x,y|v).
        :param v: fixed training set
        :param init_state: dictionary of initial values, or None if sampling from scratch
        :param n_steps: scalar, number of Gibbs steps to perform.
        :param restart: if False, start sampling from buffers self.pos_*
        """
        if init_state is None:
            assert n_steps
            # start sampler from scratch
            init_state = OrderedDict()
            init_state['g'] = T.ones((self.batch_size,self.n_g)) * T.nnet.sigmoid(self.gbias)
            init_state['s'] = T.ones((self.batch_size,self.n_g)) * self.mu
            init_state['h'] = T.ones((self.batch_size,self.n_h)) * T.nnet.sigmoid(self.hbias)
            init_state['t'] = T.ones((self.batch_size,self.n_h)) * self.eta

        [new_g, new_s, new_h, new_t] = self.pos_phase(v,
                init_state = init_state,
                n_steps = n_steps,
                mean_field = mean_field)

        pos_states = OrderedDict()
        pos_states['g'] = new_g
        pos_states['s'] = new_s
        pos_states['h'] = new_h
        pos_states['t'] = new_t

        # update running average of positive phase activations
        pos_updates = OrderedDict()
        return pos_states, pos_updates
def sample_h_given_v_2wise(v, W, Wh, bh, nh):
	phi = T.dot(v, W) + bh
	ephi = T.exp(phi)

	adder = np.zeros((nh/2, nh), dtype=theano.config.floatX)
	for i in xrange(len(adder)):
		adder[i, 2*i] = 1
		adder[i, 2*i+1] = 1
	adder = theano.shared(adder)
	# wobble =  1 + exp(phi_2i) + exp(phi_{2i+1}) + exp(phi_2i + phi_{21+1} + Wh_i)
	# p(h_2i = 1 | v) = (exp(phi_2i) + exp(phi_2i + phi_{21+1} + Wh_i ) / wobble
	# p(h_{2i+1} = 1 | v) = (exp(phi_2i) + exp(phi_2i + phi_{2i+1} + Wh_i )) / wobble
	# the second term is the same in both - the pair term.  but it must be broadcasted (the kron!)
	# dotting by adder returns a vector of half the size of sums of pairs of elements

	pairsum = T.dot(ephi, adder.T)
	first = ephi.T[T.arange(0, nh, 2)].T
	pairprod = pairsum*first - first**2
	pairterm = pairprod*T.exp(Wh)

	wobble = 1 + pairsum + pairterm

	pairterm_broadcast = kron(pairterm.dimshuffle(0, 'x'), T.ones(2))
	wobble_broadcast = kron(wobble.dimshuffle(0, 'x'), T.ones(2))

	prop_up = (ephi + pairterm_broadcast) / wobble_broadcast

	h = theano_rng.binomial(n=1, p = prop_up, dtype=theano.config.floatX, size=(nh,), ndim=1)

	return h
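A brute-force NumPy check of the pairwise formulas in the comments above, assuming each pair (h_2i, h_{2i+1}) has unnormalized probability exp(a*phi_2i + b*phi_{2i+1} + a*b*Wh_i) for a, b in {0, 1}:

import numpy as np

phi0, phi1, wh = 0.3, -1.2, 0.7  # made-up pre-activations and pair coupling
wobble = 1 + np.exp(phi0) + np.exp(phi1) + np.exp(phi0 + phi1 + wh)
p_h0 = (np.exp(phi0) + np.exp(phi0 + phi1 + wh)) / wobble

# enumerate the four joint states of the pair and marginalize h_2i directly
states = [(a, b) for a in (0, 1) for b in (0, 1)]
Z = sum(np.exp(a * phi0 + b * phi1 + a * b * wh) for a, b in states)
p_h0_brute = sum(np.exp(a * phi0 + b * phi1 + a * b * wh) for a, b in states if a == 1) / Z
assert np.isclose(p_h0, p_h0_brute)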
    def pos_phase_updates(self, v, init_state=None, n_steps=1):
        """
        Implements the positive phase sampling, which performs block Gibbs
        sampling in order to sample from p(g,h,x,y|v).
        :param v: fixed training set
        :param init_state: dictionary of initial values, or None if sampling from scratch
        :param n_steps: scalar, number of Gibbs steps to perform.
        :param restart: if False, start sampling from buffers self.pos_*
        """
        if init_state is None:
            assert n_steps
            # start sampler from scratch
            init_state = OrderedDict()
            init_state['g'] = T.ones((v.shape[0], self.n_g)) * T.nnet.sigmoid(self.gbias)
            init_state['h'] = T.ones((v.shape[0], self.n_h)) * T.nnet.sigmoid(self.hbias)

        [new_g, new_h, new_s1, new_s0, crap_v, pos_counter] = self.pos_phase(
                v, init_state=init_state, n_steps=n_steps)

        # update running average of positive phase activations
        pos_updates = OrderedDict()
        pos_updates[self.pos_counter] = pos_counter
        pos_updates[self.odd_even] = (self.odd_even + 1) % 2
        pos_updates[self.pos_g] = new_g
        pos_updates[self.pos_h] = new_h
        pos_updates[self.pos_s1] = new_s1
        pos_updates[self.pos_s0] = new_s0
        pos_updates[self.pos_s]  = self.s_hat(new_h, new_s1, new_s0)
        if self.flags['pos_phase_ch']:
            pos_updates[self.ch] = T.cast(0.999 * self.ch + 0.001 * new_h.mean(axis=0), floatX)
        return pos_updates
    def pos_phase_updates(self, v, l=None, init_state=None, n_steps=1, mean_field=False):
        """
        Implements the positive phase sampling, which performs block Gibbs
        sampling in order to sample from p(g,h,x,y|v).
        :param v: fixed training set
        :param l: if None, l is sampled; otherwise l is clamped to the given value.
        :param init_state: dictionary of initial values, or None if sampling from scratch
        :param n_steps: scalar, number of Gibbs steps to perform.
        :param restart: if False, start sampling from buffers self.pos_*
        """
        if init_state is None:
            assert n_steps
            # start sampler from scratch
            init_state = OrderedDict()
            init_state['g'] = T.ones((self.batch_size,self.n_g)) * T.nnet.sigmoid(self.gbias)
            init_state['h'] = T.ones((self.batch_size,self.n_h)) * T.nnet.sigmoid(self.hbias)
            init_state['l'] = T.ones((self.batch_size,self.n_l)) * T.nnet.softmax(self.lbias)

        outputs = self.pos_phase(v, l=l,
                init_state=init_state,
                n_steps=n_steps,
                mean_field=mean_field)

        pos_states = OrderedDict()
        pos_states['g'] = outputs[0]
        pos_states['h'] = outputs[1]
        pos_states['l'] = outputs[2] if l is None else self.input_labels

        # update running average of positive phase activations
        pos_updates = OrderedDict()
        pos_updates[self.pos_counter] = outputs[-1]
        pos_updates[self.odd_even] = (self.odd_even + 1) % 2
        return pos_states, pos_updates
Example #11
def _meshgrid(height, width, depth):
    # This function is the grid generator from eq. (1) in reference [1].
    # It is equivalent to the following numpy code:
    #  x_t, y_t,z_t = np.meshgrid(np.linspace(-1, 1, width),
    #                         np.linspace(-1, 1, height))
    #  ones = np.ones(np.prod(x_t.shape))
    #  grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
    # It is implemented in Theano instead to support symbolic grid sizes.
    # Note: If the image size is known at layer construction time, we could
    # compute the meshgrid offline in numpy instead of doing it dynamically
    # in Theano. However, it hardly affected performance when we tried.
    x_t = T.dot(
        T.reshape(T.dot(
            _linspace(-1.0, 1.0, height).dimshuffle(0, 'x'),
            T.ones((1, width))), (height, width, 1)),
        T.ones((1, 1, depth))
    )
    y_t = T.dot(
        T.reshape(T.dot(
            T.ones((height, 1)),
            _linspace(-1.0, 1.0, width).dimshuffle('x', 0)), (height, width, 1)),
        T.ones((1, 1, depth))
    )
    z_t = T.dot(T.ones((height, width, 1)), T.reshape(_linspace(-1.0, 1.0, depth), (1, 1, -1)))

    x_t_flat = x_t.reshape((1, -1))
    y_t_flat = y_t.reshape((1, -1))
    z_t_flat = z_t.reshape((1, -1))
    ones = T.ones_like(x_t_flat)
    grid = T.concatenate([x_t_flat, y_t_flat, z_t_flat, ones], axis=0)
    return grid
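A NumPy sketch of the same 3-D grid, mirroring the construction above with explicit outer products (assuming `_linspace` behaves like `np.linspace`):

import numpy as np

def _meshgrid_np(height, width, depth):
    # x varies along the height axis, y along width, z along depth, as in the Theano code above
    x_t = np.linspace(-1.0, 1.0, height)[:, None, None] * np.ones((1, width, depth))
    y_t = np.ones((height, 1, depth)) * np.linspace(-1.0, 1.0, width)[None, :, None]
    z_t = np.ones((height, width, 1)) * np.linspace(-1.0, 1.0, depth)[None, None, :]
    ones = np.ones((1, height * width * depth))
    return np.vstack([x_t.reshape(1, -1), y_t.reshape(1, -1), z_t.reshape(1, -1), ones])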
Example #12
    def apply_log_domain(self, l, probs, l_len=None, probs_mask=None):
        # Does the same computation as apply, but alpha is in the log domain
        # This avoids numerical underflow issues that were not corrected in the previous version.

        def _log(a):
            return tensor.log(tensor.clip(a, 1e-12, 1e12))

        def _log_add(a, b):
            maximum = tensor.maximum(a, b)
            return (maximum + tensor.log1p(tensor.exp(a + b - 2 * maximum)))

        def _log_mul(a, b):
            return a + b

        # See comments above
        B = probs.shape[1]
        C = probs.shape[2]-1
        L = l.shape[0]
        S = 2*L+1
        
        l_blk = C * tensor.ones((S, B), dtype='int32')
        l_blk = tensor.set_subtensor(l_blk[1::2,:], l)
        l_blk = l_blk.T     # now l_blk is B x S

        alpha0 = tensor.concatenate([   tensor.ones((B, 1)),
                                        tensor.zeros((B, S-1))
                                    ], axis=1)
        alpha0 = _log(alpha0)

        l_blk_2 = tensor.concatenate([-tensor.ones((B,2)), l_blk[:,:-2]], axis=1)
        l_case2 = tensor.neq(l_blk, C) * tensor.neq(l_blk, l_blk_2)

        def recursion(p, p_mask, prev_alpha):
            prev_alpha_1 = tensor.concatenate([tensor.zeros((B,1)),prev_alpha[:,:-1]], axis=1)
            prev_alpha_2 = tensor.concatenate([tensor.zeros((B,2)),prev_alpha[:,:-2]], axis=1)

            alpha_bar1 = tensor.set_subtensor(prev_alpha[:,1:], _log_add(prev_alpha[:,1:],prev_alpha[:,:-1]))
            alpha_bar2 = tensor.set_subtensor(alpha_bar1[:,2:], _log_add(alpha_bar1[:,2:],prev_alpha[:,:-2]))

            alpha_bar = tensor.switch(l_case2, alpha_bar2, alpha_bar1)

            probs = _log(p[tensor.arange(B)[:,None].repeat(S,axis=1).flatten(), l_blk.flatten()].reshape((B,S)))
            next_alpha = _log_mul(alpha_bar, probs)
            next_alpha = tensor.switch(p_mask[:,None], next_alpha, prev_alpha)
            
            return next_alpha

        alpha, _ = scan(fn=recursion,
                             sequences=[probs, probs_mask],
                             outputs_info=[alpha0])

        last_alpha = alpha[-1]
        # last_alpha = theano.printing.Print('a-1')(last_alpha)

        prob = _log_add(last_alpha[tensor.arange(B), 2*l_len.astype('int32')-1],
                        last_alpha[tensor.arange(B), 2*l_len.astype('int32')])

        # return the negative log probability of the labellings
        return -prob
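A quick numeric sanity check of the log-domain addition used above; `_log_add(a, b)` should agree with log(exp(a) + exp(b)):

import numpy as np

def log_add(a, b):
    m = np.maximum(a, b)
    return m + np.log1p(np.exp(a + b - 2 * m))

a, b = np.log(1e-30), np.log(3e-30)
print(log_add(a, b), np.log(4e-30))  # both approximately -67.69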
    def get_output(self, train=False):
        X = self.get_input(train=train)
        c0 = self.c0[None,:] * T.ones((X.shape[0], self.context_dim))
        cn = self.cn[None,:] * T.ones((X.shape[0], self.context_dim))
        X = T.concatenate(
            [
                T.shape_padleft(self.e0,2) * T.ones((X.shape[0], 1, X.shape[2])),
                X,
                T.shape_padleft(self.en,2) * T.ones((X.shape[0], 1, X.shape[2])),
            ],
            axis = 1
        )
        X = X.dimshuffle(1,0,2) # put the timestep on the first dimension
        # the int32 mask must be cast to float32, otherwise mask_t[:, None] * cl_t gets upcast to float64 inside scan
        mask = T.cast(self.get_output_mask(train=train), T.config.floatX)
        mask = mask.dimshuffle(1,0) # put the timestep on the first dimension
        #theano.printing.debugprint([mask], print_type=True)
        def _forward_step(e_t, e_tm1, mask_t, cl_tm1):
            #print 'e_t:', e_t.type.ndim
            #print 'cl_t:', cl_tm1.type.ndim
            cl_t = T.nnet.sigmoid(
                T.dot(cl_tm1, self.Wl) + T.dot(e_tm1, self.Wsl)
            )
            cl_t = mask_t[:, None] * cl_t + (1. - mask_t[:, None]) * cl_tm1 # if the position is masked, carry over the previous context
            #theano.printing.debugprint([mask_t], print_type=True)
            #theano.printing.debugprint([cl_t], print_type=True)
            return cl_t
        def _backward_step(e_t, e_tp1, mask_t, cr_tp1):
            cr_t = T.nnet.sigmoid(
            T.dot(cr_tp1, self.Wr) + T.dot(e_tp1, self.Wsr))
            cr_t = mask_t[:, None] * cr_t + (1. - mask_t[:, None]) * cr_tp1 # if the position is masked, carry over the previous context
            return cr_t
        Cl, _ = theano.scan(_forward_step,
                        sequences=[dict(input=X, taps=[0, -1]), mask],
                        outputs_info=[
                            dict(initial=c0, taps=[-1]) # note: not c0!!!
                        ],

        )
        Cr, _ = theano.scan(_backward_step,
                            sequences=[dict(input=X, taps=[0, -1]), mask],
                            outputs_info=[
                                dict(initial=cn, taps=[-1])
                            ],
                            go_backwards=True,
        )
        Cr = Cr[::-1] # reverse Cr
        def _concatenate_activation_step(e_t, mask_t, cl_t, cr_t):
            #print theano.printing.debugprint(cr_t, print_type=True)
            h_t = T.tanh( T.dot(T.concatenate([e_t, cl_t, cr_t], axis=1), self.W2)
                       + self.b2)
            h_t = mask_t[:, None] * h_t + (1. - mask_t[:, None]) * (-10000000000.) # set masked positions to a very small value
            return h_t

        Y, _ = theano.scan(_concatenate_activation_step,
                    sequences=[X, mask, Cl, Cr],
                    outputs_info=None,
        )
        return Y.dimshuffle(1,0,2) # put the sample dimension back first
Example #14
 def scanr(self, x, y0=None, c0=None, mask=None, **kwargs):
     if y0 is None:
         #y0 = self.cact(self.y0)
         y0 = th.ones((x.shape[1],1))*self.y0
     if c0 is None:
         c0 = th.ones((x.shape[1],1))*self.c0
     return scanr(self.ws, y0, c0, x, mask=mask, iact=self.iact, fact=self.fact, oact=self.oact
                  , gact=self.gact, cact=self.cact, **kwargs)
Example #15
    def result(theano, TT):
        def fn(s1, s2):
            return s1 + s2

        outputs, _ = theano.scan(
            fn,
            sequences=[TT.ones(10), 2 * TT.ones(10)])
        return theano.function([], outputs)()
Example #16
File: cold2.py Project: zenna/ig
def gen_img(shape_params, rotation_matrix, width, height, nsteps, res):
    raster_space = gen_fragcoords(width, height)
    rd, ro = make_ro(rotation_matrix, raster_space, width, height)
    a = 0 - ro # c = 0
    b = 1 - ro # c = 1
    nmatrices = rotation_matrix.shape[0]
    tn = T.reshape(a, (nmatrices, 1, 1, 3))/rd
    tf = T.reshape(b, (nmatrices, 1, 1, 3))/rd
    tn_true = T.minimum(tn,tf)
    tf_true = T.maximum(tn,tf)
    # do X
    tn_x = tn_true[:,:,:,0]
    tf_x = tf_true[:,:,:,0]
    tmin = 0.0
    tmax = 10.0
    t0 = tmin
    t1 = tmax
    t02 = T.switch(tn_x > t0, tn_x, t0)
    t12 = T.switch(tf_x < t1, tf_x, t1)
    # y
    tn_x = tn_true[:,:,:,1]
    tf_x = tf_true[:,:,:,1]
    t03 = T.switch(tn_x > t02, tn_x, t02)
    t13 = T.switch(tf_x < t12, tf_x, t12)
    #z
    tn_x = tn_true[:,:,:,2]
    tf_x = tf_true[:,:,:,2]
    t04 = T.switch(tn_x > t03, tn_x, t03)
    t14 = T.switch(tf_x < t13, tf_x, t13)

    # Shift a little bit to avoid numerial inaccuracies
    t04 = t04*1.001
    t14 = t14*0.999

    nvoxgrids = shape_params.shape[0]
    left_over = T.ones((nvoxgrids, nmatrices * width * height,))
    step_size = (t14 - t04)/nsteps
    orig = T.reshape(ro, (nmatrices, 1, 1, 3)) + rd * T.reshape(t04,(nmatrices, width, height, 1))
    xres = yres = zres = res

    orig = T.reshape(orig, (nmatrices * width * height, 3))
    rd = T.reshape(rd, (nmatrices * width * height, 3))
    step_sz = T.reshape(step_size, (nmatrices * width * height,1))

    for i in range(nsteps):
        # print "step", i
        pos = orig + rd*step_sz*i
        voxel_indices = T.floor(pos*res)
        pruned = T.clip(voxel_indices,0,res-1)
        p_int =  T.cast(pruned, 'int32')
        indices = T.reshape(p_int, (nmatrices*width*height,3))
        attenuation = shape_params[:, indices[:,0],indices[:,1],indices[:,2]]
        left_over = left_over*T.exp(-attenuation*T.flatten(step_sz))

    img = left_over
    pixels = T.reshape(img, (nvoxgrids, nmatrices, width, height))
    mask = t14>t04
    return T.switch(t14>t04, pixels, T.ones_like(pixels)), rd, ro, tn_x, T.ones((nvoxgrids, nmatrices * width * height,)), orig, shape_params
 def f1_score(self, y):
     n_total = y.shape[0]
     n_relevant_documents_predicted = T.sum(T.eq(T.ones(self.y_pred.shape), self.y_pred))
     two_vector = T.add(T.ones(self.y_pred.shape), T.ones(self.y_pred.shape))
     n_relevant_predicted_correctly = T.sum(T.eq(T.add(self.y_pred, y), two_vector))
     precision = T.true_div(n_relevant_predicted_correctly, n_relevant_documents_predicted)
     recall = T.true_div(n_relevant_predicted_correctly, n_total)
     f1_score =  T.mul(2.0, T.true_div(T.mul(precision, recall), T.add(precision, recall)))
     return [f1_score, precision, recall]
 def new_attention_step(self, ct, prev_g, mem, q_q):
     cWq = T.dot(T.ones((1, self.batch_size), dtype=floatX), T.dot(T.dot(ct.T, self.W_b), q_q) * T.eye(n=self.batch_size, m=self.batch_size, dtype=floatX))
     cWm = T.dot(T.ones((1, self.batch_size), dtype=floatX), T.dot(T.dot(ct.T, self.W_b), mem) * T.eye(n=self.batch_size, m=self.batch_size, dtype=floatX))
     z = T.concatenate([ct, mem, q_q, ct * q_q, ct * mem, T.abs_(ct - q_q), T.abs_(ct - mem), cWq, cWm], axis=0)
     
     l_1 = T.dot(self.W_1, z) + self.b_1.dimshuffle(0, 'x')
     l_1 = T.tanh(l_1)
     l_2 = T.dot(self.W_2, l_1) + self.b_2.dimshuffle(0, 'x')
     G = T.nnet.sigmoid(l_2)[0]
     return G
Example #19
 def gradient(self, observed, at_risk):
     prediction = self.output
     risk = T.exp(prediction)
     product = self.input * (risk * T.ones((1, self.input.shape[0])))
     numerator = Te.cumsum(product[::-1])[::-1][at_risk]
     denominator = Te.cumsum(risk[::-1])[::-1][at_risk] * T.ones((1, self.input.shape[0]))
     numerator = numerator.flatten()
     denominator = denominator.flatten()
     gradient = T.dot(observed, self.input - (numerator / denominator))
     return gradient
Example #20
    def result(theano, TT):
        def fn(s1, s2, o1):
            return s1 + s2 + o1

        outputs, _ = theano.scan(
            fn,
            sequences=[TT.ones(10), 2 * TT.ones(10)],
            outputs_info=0.,
        )
        return theano.function([], outputs)()
Example #21
    def result(theano, TT):
        def fn(s1, s2, addn):
            return s1 + s2 + addn

        outputs, _ = theano.scan(
            fn,
            sequences=[TT.ones(10), 2 * TT.ones(10)],
            non_sequences=1,
        )
        return theano.function([], outputs)()
Example #22
 def backward(self, y):
     Km1 = y.shape[0]
     k = tt.arange(Km1)[(slice(None),) + (None,) * (y.ndim - 1)]
     eq_share = -tt.log(Km1 - k)  # logit(1./(Km1 + 1 - k))
     z = inverse_logit(y + eq_share)
     yl = tt.concatenate([z, tt.ones(y[:1].shape)])
     yu = tt.concatenate([tt.ones(y[:1].shape), 1 - z])
     S = tt.extra_ops.cumprod(yu, 0)
     x = S * yl
     return x
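A NumPy sketch of the same stick-breaking transform (with a hypothetical `invlogit` helper standing in for `inverse_logit`), showing that `backward` maps an unconstrained vector onto the probability simplex:

import numpy as np

def invlogit(v):
    return 1.0 / (1.0 + np.exp(-v))

Km1 = 3
y = np.random.randn(Km1)
k = np.arange(Km1)
eq_share = -np.log(Km1 - k)          # logit(1 / (Km1 + 1 - k))
z = invlogit(y + eq_share)
yl = np.concatenate([z, [1.0]])
yu = np.concatenate([[1.0], 1.0 - z])
x = np.cumprod(yu) * yl
print(x, x.sum())                    # non-negative entries that sum to 1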
Example #23
    def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states):
        
        tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features))
        rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features))

        hidden_state_features = T.specify_shape(T.concatenate([tf_states, rf_states], axis = 1), (seq_length, mb_size * 2, num_features))

        gru_params_1 = init_tparams(param_init_gru(None, {}, prefix = "gru1", dim = num_hidden, nin = num_features))
        #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix = "gru2", dim = num_hidden, nin = num_hidden + num_features))
        #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix = "gru3", dim = num_hidden, nin = num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix = 'gru1')[0]
        #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis = 2), None, prefix = 'gru2', backwards = True)[0]
        #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis = 2), None, prefix = 'gru3')[0]

        final_out_recc = T.specify_shape(T.mean(gru_1_out, axis = 0), (mb_size * 2, num_hidden))

        h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units = 1, nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out_recc)
        h_out_4_value = h_out_4.get_output_for(h_out_1_value)

        raw_y = h_out_4_value
        #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
        classification = T.nnet.sigmoid(raw_y)

        #tf comes before rf.  
        p_real =  classification[:mb_size]
        p_gen  = classification[mb_size:]

        #bce = lambda r,t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

        self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
        self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
        self.d_cost = self.d_cost_real + self.d_cost_gen
        self.g_cost = self.g_cost_d


        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_3,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_2,trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1,trainable=True)

        self.params += gru_params_1.values()
        #self.params += gru_params_2.values()
        #self.params += gru_params_3.values()

        self.accuracy = T.mean(T.eq(T.ones(p_real.shape).flatten(), T.gt(p_real, 0.5).flatten())) + T.mean(T.eq(T.ones(p_gen.shape).flatten(), T.lt(p_gen, 0.5).flatten()))
Example #24
 def backward(self, y_):
     y = y_.T
     Km1 = y.shape[0]
     k = tt.arange(Km1)[(slice(None), ) + (None, ) * (y.ndim - 1)]
     eq_share = logit(1./(Km1 + 1 - k)) #- tt.log(Km1 - k)
     z = invlogit(y + eq_share, self.eps)
     yl = tt.concatenate([z, tt.ones(y[:1].shape)])
     yu = tt.concatenate([tt.ones(y[:1].shape), 1-z])
     S = tt.extra_ops.cumprod(yu, 0)
     x = S * yl
     return x.T
Example #25
 def generate(self, source_sentence, representation,tw_representation,topical_embedding,content_embedding, **kwargs):
     return self.sequence_generator.generate(
         n_steps=2 * source_sentence.shape[1],
         batch_size=source_sentence.shape[0],
         attended=representation,
         attended_mask=tensor.ones(source_sentence.shape).T,
         topical_attended=tw_representation,
         topical_attended_mask=tensor.ones([source_sentence.shape[0],10]).T,
         topical_embeddingq=topical_embedding,
         content_embedding=content_embedding,
         **kwargs)
def Meshgrid(height, width):
    x_t = T.dot(T.ones((height, 1)),
                Linspace(-1.0, 1.0, width).dimshuffle('x', 0))
    y_t = T.dot(Linspace(-1.0, 1.0, height).dimshuffle(0, 'x'),
                T.ones((1, width)))

    x_t_flat = x_t.reshape((1, -1))
    y_t_flat = y_t.reshape((1, -1))
    ones = T.ones_like(x_t_flat)
    grid = T.concatenate([x_t_flat, y_t_flat, ones], axis=0)
    return grid
Example #27
    def sample(self, shape):
        """
        Paramaters
        --------
        shape : tuple
           sets a shape of the output sample
        """

        return super(UnitGammaSample,
                     self).sample(T.ones(shape),
                                  T.ones(shape))
 def kldiv_m(self, mu, std_r, std_c):
     pmu, pstdr, pstdc = self.get_priors()
     var_r, var_c = T.sqr(std_r), T.sqr(std_c)
     # first kl term
     fa = T.sum((1./(pstdc**2)) * var_c)*T.sum((1./(pstdr**2))*var_r)
     # second kl term
     prior_sigma = T.outer(T.ones((mu.shape[0],))*(pstdr**2), T.ones((mu.shape[1],))*(pstdc**2))
     fb = T.sum(T.sqr(mu - pmu) / prior_sigma)
     # third kl term
     fc = mu.shape[1]*(mu.shape[0]*T.log(pstdr**2) - T.sum(T.log(var_r))) + \
         mu.shape[0]*(mu.shape[1]*T.log(pstdc**2) - T.sum(T.log(var_c)))
     return - 0.5 * (fa + fb - T.prod(mu.shape) + fc)
    def log_likelihood(self):
        Users = self.L[:, :-2]
        Items = self.R[:, :-2]
        UserBiases = self.L[:, -1]
        ItemBiases = self.R[:, -2]
        UserOuter = self.L[:, -2]
        ItemOuter = self.R[:, -1]

        ## A = T.dot(Users, Items.T)
        ## A += UserBiases
        ## A += ItemBiases.T
        ## B = A * self.counts
        ## loglik = T.sum(B)

        # A implicitly stored as self.L @ self.R.T
        # loglik = T.sum(A * self.counts) => sum over nonzeros only
        print('nnz size: {}'.format(self.counts.nonzero()[0].size))
        loglik = T.dot(self.evaluate_lowrank(self.L, self.R, self.counts.nonzero(), fast=False),
                  np.array(self.counts[self.counts.nonzero()]).ravel())

        ## A = T.exp(A)
        ## A += 1
        ## A = T.log(A)
        # There we use Taylor series ln(exp(x) + 1) = ln(2) + x/2 + x^2/8 + O(x^4) at x=0
        # ln(2)
        const_term = (T.ones((self.num_users, 1)) * np.log(2), T.ones((self.num_items, 1)))
        # x/2
        first_order_term = (0.5 * self.L, 0.5 * self.R)
        # x^2/8
        second_order_term = hadamard((self.L, self.R), (self.L, self.R), self.num_factors)
        second_order_term = tuple(factor / 8.0 for factor in second_order_term)

        grouped_factors = list(zip(const_term, first_order_term, second_order_term))
        A = (T.concatenate(grouped_factors[0], axis=1), T.concatenate(grouped_factors[1], axis=1))

        ## A = (self.counts + 1) * A
        ## loglik -= T.sum(A)
        loglik -= sum_lowrank(A)
        loglik -= T.dot(self.evaluate_lowrank(A[0], A[1], self.counts.nonzero(), fast=False),
                  np.array(self.counts[self.counts.nonzero()]).ravel())


        # L2 regularization
        loglik -= 0.5 * self.reg_param * T.sum(T.square(Users))
        loglik -= 0.5 * self.reg_param * T.sum(T.square(Items))

        # we need strictly maintain UserOuter and ItemOuter be ones, just to ensure they properly
        # outer products with biases
        loglik -= self.num_users * T.sum(T.square(UserOuter - 1))
        loglik -= self.num_items * T.sum(T.square(ItemOuter - 1))

        # Return negation of LogLikelihood cause we will minimize cost
        return -loglik
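A quick check of the Taylor expansion used above, ln(exp(x) + 1) ≈ ln 2 + x/2 + x²/8 near x = 0:

import numpy as np

x = np.linspace(-0.5, 0.5, 101)
exact = np.log(np.exp(x) + 1)
approx = np.log(2) + x / 2 + x ** 2 / 8
print(np.max(np.abs(exact - approx)))  # error is O(x**4); a few 1e-4 at most on this interval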
Example #30
def cost(y, y_hat_softmax, y_mask=None, mask=None):
    """
    Computes the CTC cost using just the forward computations.
    The difference between this function and the vanilla 'cost' function
    is that this function adds blanks first.

    Notes
    -----
    y_hat should be the output from a softmax layer. This is different from
    pseudo_cost which takes energies as input.

    Do not calculate the gradient from this cost but use pseudo_cost to
    calculate the gradients. This cost function can be used to monitor the
    cost during training.


    Parameters
    ----------
    y : matrix (num_batch, target_seq_len)
        the target label sequences
    y_hat_softmax : tensor3 (num_batch, input_seq_len, num_classes + 1)
        class probability distribution sequences, potentially in log domain
    y_mask : matrix (num_batch, output_seq_len)
        indicates which values of y to use
    mask : matrix (num_batch, input_seq_len)
        indicates the lengths of the sequences in y_hat
    """

    # dimshuffle from lasagnes output format
    y_hat_softmax = y_hat_softmax.dimshuffle(1, 0, 2)
    y = y.dimshuffle(1, 0)

    if y_mask is None:
        y_mask = T.ones(y.shape,
                        dtype=theano.config.floatX)
    else:
        y_mask = y_mask.dimshuffle(1, 0)

    if mask is None:
        mask = T.ones((y_hat_softmax.shape[0], y_hat_softmax.shape[1]),
                      dtype=theano.config.floatX)
    else:
        mask = mask.dimshuffle(1, 0)

    num_classes = y_hat_softmax.shape[2] - 1
    blanked_y, blanked_y_mask = _add_blanks(
        y=y,
        blank_symbol=num_classes,
        y_mask=y_mask)
    final_cost = -sequence_log_likelihood(blanked_y, y_hat_softmax,
                                          blanked_y_mask, mask,
                                          num_classes)
    return final_cost    
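A minimal monitoring sketch for the cost above (hedged: it assumes the surrounding module's `cost` is importable and only uses the documented signature; both masks are left at their all-ones defaults):

import theano
import theano.tensor as T

y = T.imatrix('y')                     # (num_batch, target_seq_len)
y_hat_softmax = T.tensor3('y_hat')     # (num_batch, input_seq_len, num_classes + 1)
monitor = theano.function([y, y_hat_softmax], cost(y, y_hat_softmax))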
Example #31
def f(q_i, D_gt_id, tparams, is_train, trng, options):

    # Use search engine again to compute the reward/metrics given a query.
    search = Search(options)

    # append the unknown vector for words whose index = -1.
    W_ = tensor.concatenate([tparams['W'], tparams['UNK']], axis=0)

    q_m = (q_i > -2).astype('float32')

    #get embeddings for the queries
    q_a = W_[q_i.flatten()].reshape((q_i.shape[0], q_i.shape[1], prm.dim_emb)) * q_m[:,:,None]

    if len(prm.filters_query) > 0:
        q_aa = conv_query(q_a, tparams)
    else:
        q_aa = q_a

    q_a_avg = q_a.sum(1) / tensor.maximum(1., q_m.sum(1, keepdims=True))

    out = []
    for n_iter in range(prm.n_iterations):

        if n_iter == 0 and prm.q_0_fixed_until >= prm.n_iterations:
            prob = tensor.zeros((q_a.shape[0], prm.max_words_input, 2))
            bl = tensor.zeros((q_a.shape[0],))
            D_m_r = tensor.zeros((q_a.shape[0], prm.max_words_input))
        else:
            if n_iter > 0:
                D_m_ = (D_i_ > -2).astype('float32')
                D_a_ = W_[D_i_.flatten()].reshape((D_i_.shape[0], D_i_.shape[1], D_i_.shape[2], prm.dim_emb)) * D_m_[:,:,:,None]
            else:
                D_a_ = 1. * q_a[:,None,:,:]
                D_m_ = 1. * q_m[:,None,:]


            if len(prm.filters_cand) > 0:
                D_aa_ = conv_cand(D_a_, tparams, 0)
            else:
                D_aa_ = D_a_

            D_aa_ = tensor.dot(D_aa_, tparams['Ad']) + tparams['bAd']

            if n_iter > 0:
                if prm.q_0_fixed_until < 2:
                    D_a = tensor.concatenate([D_a, D_a_], axis=1)
                    D_aa = tensor.concatenate([D_aa, D_aa_], axis=1)
                    D_m = tensor.concatenate([D_m, D_m_], axis=1)
                else:
                    D_a = D_a_
                    D_aa = D_aa_
                    D_m = D_m_
            else:
                D_a = D_a_
                D_aa = D_aa_
                D_m = D_m_

            D_a_r = D_a.reshape((D_a.shape[0], -1, D_a.shape[3]))
            D_aa_r = D_aa.reshape((D_aa.shape[0], -1, D_aa.shape[3]))

            D_m_r = D_m.reshape((D_m.shape[0],-1))

       
            q_aa_avg = q_aa.sum(1) / tensor.maximum(1., q_m.sum(1, keepdims=True))
            q_aa_att = q_aa_avg[:,None,:]
            q_aa_att = tensor.dot(q_aa_att, tparams['Aq'])

            z = D_aa_r + q_aa_att

            # estimate reward based on the query.
            bl = theano.gradient.grad_scale(z, 0.1)
            D_m_r_c = theano.gradient.disconnected_grad(D_m_r)
            bl = bl.sum(1) / tensor.maximum(1., D_m_r_c.sum(1))[:,None]
            for i in range(len(prm.n_hidden_critic)+1):
                if prm.dropout > 0:
                    bl = dropout_layer(bl, is_train, trng)
                bl = tensor.maximum(0., bl)
                bl = tensor.dot(bl, tparams['C'+str(i)]) + tparams['bC'+str(i)]

            bl = tensor.tanh(bl)
            bl = bl.flatten()
    

            for i in range(len(prm.n_hidden_actor)+1):
                if prm.dropout > 0:
                    z = dropout_layer(z, is_train, trng)
                z = tensor.maximum(0., z)
                z = tensor.dot(z, tparams['V'+str(i)]) + tparams['bV'+str(i)]

            prob = softmax_mask(z) * D_m_r[:,:,None]

            # if training, sample. Otherwise, pick maximum probability.
            s = trng.multinomial(n=1, pvals=prob.reshape((-1, 2)), dtype=prob.dtype)
            s = s.reshape((prob.shape[0],prob.shape[1],prob.shape[2]))

            #if frozen is enabled and this iteration is within its limit, pick maximum probability.
            if prm.frozen_until > 0:
                if n_iter < prm.frozen_until:
                    s = prob

            res = tensor.eq(is_train,1.) * s + tensor.eq(is_train,0.) * prob

            # final answer & valid words
            ans = res.argmax(2) * D_m_r

        if n_iter < prm.q_0_fixed_until:
            ones = tensor.ones((q_a.shape[0], prm.max_words_input))
            if n_iter > 0:
                # select everything from the original query in the first iteration.
                ans = tensor.concatenate([ones, ans], axis=1)
            else:
                ans = ones

        metrics, D_i_, D_id_, D_gt_m_ = search(ans, D_gt_id, n_iter, is_train)

        out.append([prob, ans, metrics, bl, D_m_r, D_id_])

    return out
Example #32
    def set_up(self, config=None, make_prunable=False):
        """Loads and initializes all the theano variables for the
        training model and the decoding model.
        
        Args:
            config (dict): NMT configuration
        """
        if config:
            self.config = config
        else:
            config = self.config
        # Create Theano variables
        logging.debug('Creating theano variables')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence_mask = tensor.matrix('target_mask')

        # Construct model (fs439: Add NoLookup options)
        if config['dec_layers'] != 1:
            logging.fatal("Only dec_layers=1 supported.")
        logging.debug('Building RNN encoder-decoder')
        if config['src_sparse_feat_map']:
            if config['enc_layers'] != 1:
                logging.fatal("Only enc_layers=1 supported for sparse "
                              "source features.")
            source_sentence = tensor.tensor3('source')
            self.sampling_input = tensor.tensor3('input')
            encoder = NoLookupEncoder(config['enc_embed'], config['enc_nhids'])
        else:
            source_sentence = tensor.lmatrix('source')
            self.sampling_input = tensor.lmatrix('input')
            if config['enc_layers'] > 1 and not config['enc_share_weights']:
                encoder = DeepBidirectionalEncoder(
                    config['src_vocab_size'], config['enc_embed'],
                    config['enc_layers'], config['enc_skip_connections'],
                    config['enc_nhids'])
            else:
                encoder = BidirectionalEncoder(config['src_vocab_size'],
                                               config['enc_embed'],
                                               config['enc_layers'],
                                               config['enc_skip_connections'],
                                               config['enc_nhids'])
        if config['trg_sparse_feat_map']:
            target_sentence = tensor.tensor3('target')
            decoder = NoLookupDecoder(
                config['trg_vocab_size'], config['dec_embed'],
                config['dec_nhids'], config['att_nhids'],
                config['maxout_nhids'], config['enc_nhids'] * 2,
                config['attention'], config['dec_attention_sources'],
                config['dec_readout_sources'], config['memory'],
                config['memory_size'], config['seq_len'], config['dec_init'])
        else:
            target_sentence = tensor.lmatrix('target')
            decoder = Decoder(config['trg_vocab_size'],
                              config['dec_embed'],
                              config['dec_nhids'],
                              config['att_nhids'],
                              config['maxout_nhids'],
                              config['enc_nhids'] * 2,
                              config['attention'],
                              config['dec_attention_sources'],
                              config['dec_readout_sources'],
                              config['memory'],
                              config['memory_size'],
                              config['seq_len'],
                              config['dec_init'],
                              make_prunable=make_prunable)
        if config['annotations'] != 'direct':
            annotators = []
            add_direct = False
            for name in config['annotations'].split(','):
                if name == 'direct':
                    add_direct = True
                elif name == 'hierarchical':
                    annotators.append(HierarchicalAnnotator(encoder))
                else:
                    logging.fatal("Annotation strategy %s unknown" % name)
            encoder = EncoderWithAnnotators(encoder, annotators, add_direct)
        annotations, annotations_mask = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        self.cost = decoder.cost(annotations, annotations_mask,
                                 target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        self.cg = ComputationGraph(self.cost)

        # Initialize model
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        try:
            encoder.bidir.prototype.weights_init = Orthogonal()
        except AttributeError:
            pass  # Its fine, no bidirectional encoder
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [
                x for x in self.cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            self.cg = apply_dropout(self.cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            if encoder.lookup:
                enc_params = Selector(encoder.lookup).get_parameters().values()
            enc_params += Selector(encoder.fwd_fork).get_parameters().values()
            enc_params += Selector(encoder.back_fork).get_parameters().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_parameters().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_parameters().values()
            self.cg = apply_noise(self.cg, enc_params + dec_params,
                                  config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in self.cg.parameters]
        logging.debug("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.debug('    {:15}: {}'.format(shape, count))
        logging.debug("Total number of CG parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logging.debug("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.debug('    {:15}: {}'.format(value.get_value().shape,
                                                 name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")
        self.training_model = Model(self.cost)

        logging.info("Building sampling model")
        src_shape = (self.sampling_input.shape[-2],
                     self.sampling_input.shape[-1])  # batch_size x sen_length
        sampling_representation, _ = encoder.apply(self.sampling_input,
                                                   tensor.ones(src_shape))
        generated = decoder.generate(src_shape, sampling_representation)
        self.search_model = Model(generated)
        generated_outputs = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        self.samples = generated_outputs[1]
        self.encoder = encoder
        self.decoder = decoder
Example #33
        def step(
            input_n,
            cell_previous, hid_previous,
            visual,
            W_hid_stacked, W_in_stacked, b_stacked,
            W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate,
            W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate,
            W_p
        ):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)
            ggate = slice_w(gates, 4)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*W_cell_to_ingate
                forgetgate += cell_previous*W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)
            # ggate gt
            ggate = self.nonlinearity_ggate(ggate)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            st = ggate*self.nonlinearity(cell)

            # zt = T.dot(
            #     self.nonlinearity(
            #         T.dot(visual, W_v_to_attenGate) +
            #         T.dot(
            #             T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'),
            #             T.ones((1, self.video_len))
            #         )
            #     ),
            #     W_h_to_attenGate
            # )[:, :, 0]

            # to avoid an optimization failure when dotting a 3D tensor with a vector, we transform
            # e = A.dot(B) into e = A*B.dimshuffle('x', 'x', 0); e = e.sum(axis=2)
            zt_dot_A = self.nonlinearity(
                T.dot(visual, W_v_to_attenGate) +
                T.dot(
                    T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'),
                    T.ones((1, self.video_len))
                )
            )
            zt = zt_dot_A*W_h_to_attenGate.dimshuffle('x', 'x', 0)
            zt = zt.sum(axis=2)

            # vt = T.dot(
            #     self.nonlinearity(
            #         T.dot(
            #             st, W_s_to_attenGate
            #         ) +
            #         T.dot(
            #             hid, W_g_to_attenGate
            #         )
            #     ),
            #     W_h_to_attenGate
            # )

            vt_dot_A = self.nonlinearity(
                T.dot(
                    st, W_s_to_attenGate
                ) +
                T.dot(
                    hid, W_g_to_attenGate
                )
            )
            vt = vt_dot_A*W_h_to_attenGate.dimshuffle('x', 0)
            vt = vt.sum(axis=1)
            vt = vt.dimshuffle(0, 'x')

            alpha_hat_t = self.nonlinearity_attenGate(T.concatenate(
                [zt, vt],
                axis=-1
            ))
            feature = T.concatenate(
                [visual_input, st.dimshuffle(0, 'x', 1)],
                axis=1
            ).dimshuffle(2, 0, 1)
            c_hat_t = T.sum(alpha_hat_t*feature, axis=-1)
            It = T.dot(
                (c_hat_t.T+hid), W_p
            )
            return [cell, hid, It]
Example #34
def categorical_crossentropy_of_mean(predictions):
    num_cls = predictions.shape[1]
    uniform_targets = T.ones((1, num_cls)) / num_cls
    return categorical_crossentropy(predictions.mean(axis=0, keepdims=True), uniform_targets)
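In plain NumPy this is just the cross-entropy between the batch-averaged prediction and a uniform target (a sketch, using the usual -sum(target * log(pred)) definition):

import numpy as np

predictions = np.array([[0.7, 0.2, 0.1],
                        [0.1, 0.6, 0.3]])   # made-up softmax outputs for a batch of 2
num_cls = predictions.shape[1]
mean_pred = predictions.mean(axis=0)
ce = -np.sum((np.ones(num_cls) / num_cls) * np.log(mean_pred))
print(ce)  # reaches its minimum, log(num_cls), when the mean prediction is uniform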
Example #35
def another_simple_model():
    _model = models.simple_model()[1]
    with _model:
        pm.Potential('pot', tt.ones((10, 10)))
    return _model
Example #36
 def ones(self, shape, dtype=None, name=None):
     dtype = dtype or self.floatx()
     return T.ones(shape, dtype=dtype)
def lohhla_clone_model(sample_ids,
                       tree_edges,
                       clonal_prevalence_mat,
                       cellularity,
                       ploidy_values,
                       tumour_sample_reads,
                       normal_sample_reads,
                       integercpn_info,
                       all_genotypes,
                       transition_inputs,
                       stayrate_alpha=0.9,
                       stayrate_beta=0.1,
                       sd=0.5,
                       nb_alpha=0.5,
                       iter_count=20000,
                       tune_iters=20000,
                       anchor_type='nb',
                       anchor_mode='snvcn',
                       nchains=2,
                       njobs=2):
    '''
    stayrate_alpha: Beta prior alpha-parameter on stayrate in clone tree Markov chain
    stayrate_beta: Beta prior beta-parameter on stayrate in clone tree Markov chain
    all_genotypes: Dataframe of genotypes, 0-indexed
    '''
    num_nodes = clonal_prevalence_mat.shape[1]

    valid_transitions = transition_inputs['valid_transitions']
    num_transitions = transition_inputs['num_transitions']
    num_genotypes = transition_inputs['num_genotypes']
    cn_genotype_matrix = transition_inputs['cn_genotype_matrix']

    ## Beta-binomial dispersion (higher = less dispersed)
    dispersion = 200.

    ## Tree edges
    edges = tree_edges.as_matrix().astype(int) - 1

    with pm.Model() as model:
        BoundedNormal = pm.Bound(pm.Normal, lower=0., upper=1.)
        stay_rate = BoundedNormal('stayrate', mu=0.75, sd=0.4)

        P = np.zeros(shape=(num_genotypes, num_genotypes))
        P = P + tt.eye(num_genotypes) * stay_rate

        fill_values = tt.as_tensor((1. - stay_rate) / num_transitions)
        fill_values = tt.set_subtensor(fill_values[0], 0)

        P = P + valid_transitions * fill_values[:, np.newaxis]
        P = tt.set_subtensor(P[0, 0], 1.)

        A = tt.dmatrix('A')

        PA = tt.ones(shape=(num_genotypes)) / num_genotypes

        states = CloneTreeGenotypes('genotypes',
                                    PA=PA,
                                    P=P,
                                    edges=edges,
                                    k=num_genotypes,
                                    shape=(num_nodes))

        total_cns = theano.shared(np.array(all_genotypes['total_cn'].values))
        alt_cns = theano.shared(np.array(all_genotypes['alt_cn'].values))

        total_cn = pm.Deterministic('total_cn', total_cns[states])
        alt_cn = pm.Deterministic('alt_cn', alt_cns[states])

        sample_alt_copies = tt.dot(clonal_prevalence_mat, alt_cn
                                   ) * cellularity + (1. - cellularity) * 1.

        vafs = sample_alt_copies / (
            tt.dot(clonal_prevalence_mat, total_cn) * cellularity +
            (1. - cellularity) * 2.)
        pm.Deterministic('vafs', vafs)

        alphas = vafs * dispersion
        betas = (1 - vafs) * dispersion

        ## Copy number of tumour cells (aggregated over clones, but not including normal contamination)
        tutotalcn = pm.Deterministic('tutotalcn',
                                     tt.dot(clonal_prevalence_mat, total_cn))

        ## Can't be vectorized further
        for j in range(len(sample_ids)):
            current_sample = sample_ids[j]
            total_counts = integercpn_info['TumorCov_type1'][
                current_sample].values + integercpn_info['TumorCov_type2'][
                    current_sample].values
            alt_counts = integercpn_info['TumorCov_type2'][
                current_sample].values
            alpha_sel = alphas[j]
            beta_sel = betas[j]

            ## Draw alternative allele counts for HLA locus for each polymorphic site
            alt_reads = pm.BetaBinomial('x_' + str(j),
                                        alpha=alpha_sel,
                                        beta=beta_sel,
                                        n=total_counts,
                                        observed=alt_counts)

            mult_factor_mean = (tumour_sample_reads[current_sample] /
                                normal_sample_reads)

            ploidy = ploidy_values[j]
            ploidy_ratio = (tutotalcn[j] * cellularity[j] +
                            (1 - cellularity[j]) * 2) / (
                                cellularity[j] * ploidy +
                                (1 - cellularity[j]) * 2)
            if anchor_mode == 'snvcn':
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j), 1. / ploidy_ratio *
                    (integercpn_info['Total_TumorCov'][current_sample].values /
                     integercpn_info['Total_NormalCov'][current_sample].values)
                )
                nloci = len(
                    integercpn_info['Total_TumorCov'][current_sample].values)

                tumour_reads_observed = integercpn_info['Total_TumorCov'][
                    current_sample].values
                normal_reads_observed = integercpn_info['Total_NormalCov'][
                    current_sample].values
            elif anchor_mode == 'binmedian':
                binvar_tumour = 'combinedBinTumor'
                binvar_normal = 'combinedBinNormal'
                ## All within a bin are the same, so this is OK
                duplicated_entries = integercpn_info['binNum'][
                    current_sample].duplicated(keep='first')
                nloci = len(integercpn_info[binvar_tumour][current_sample]
                            [~duplicated_entries].values)

                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    (1. / ploidy_ratio *
                     (integercpn_info[binvar_tumour][current_sample]
                      [~duplicated_entries].values /
                      integercpn_info[binvar_normal][current_sample]
                      [~duplicated_entries].values)))

                tumour_reads_observed = integercpn_info[binvar_tumour][
                    current_sample][~duplicated_entries].values
                normal_reads_observed = integercpn_info[binvar_normal][
                    current_sample][~duplicated_entries].values
            else:
                raise Exception("Invalid option specified.")

            ## Draw ploidy-corrected tumour/normal locus coverage ratio for each polymorphic site

            if anchor_type == 'mult_factor':
                mult_factor = pm.Lognormal('mult_factor_' + str(j),
                                           mu=np.log(mult_factor_mean),
                                           sd=sd,
                                           observed=mult_factor_computed,
                                           shape=(nloci))
            elif anchor_type == 'nb':
                tc_nc_ratio = pm.Deterministic(
                    'tc_nc_ratio_' + str(j), (tutotalcn[j] * cellularity[j] +
                                              (1 - cellularity[j]) * 2) /
                    (ploidy * cellularity[j] + (1 - cellularity[j]) * 2))

                tumoursamplecn = pm.Deterministic(
                    'tumoursamplecn_' + str(j),
                    (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2))

                tumour_reads_mean = pm.Deterministic(
                    'tumour_reads_mean_' + str(j),
                    tc_nc_ratio * mult_factor_mean * normal_reads_observed)

                tumour_reads = pm.NegativeBinomial(
                    'tumour_reads_' + str(j),
                    mu=tumour_reads_mean,
                    alpha=nb_alpha,
                    observed=tumour_reads_observed)
            else:
                raise ValueError("Invalid anchor_type '{}'; must specify a valid model type.".format(anchor_type))

        pm.Deterministic('log_prob', model.logpt)

        step1 = pm.CategoricalGibbsMetropolis(vars=[states])
        step2 = pm.Metropolis(vars=[stay_rate])

        trace = pm.sample(iter_count,
                          tune=tune_iters,
                          step=[step1, step2],
                          njobs=njobs,
                          chains=nchains)

        return trace
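The transition matrix P above puts the stay rate on the diagonal, spreads the remaining probability mass uniformly over the valid transitions of each genotype, and makes genotype 0 absorbing. A minimal NumPy sketch of the same construction, with a hypothetical 3-genotype valid_transitions mask standing in for the real input:

import numpy as np

# Hypothetical inputs: 3 genotypes, genotype 0 is absorbing.
num_genotypes = 3
stay_rate = 0.75
valid_transitions = np.array([[0., 0., 0.],
                              [1., 0., 1.],
                              [1., 1., 0.]])         # row i -> allowed target genotypes
num_transitions = valid_transitions.sum(axis=1)      # number of transitions out of each genotype

P = np.eye(num_genotypes) * stay_rate
fill_values = (1. - stay_rate) / np.maximum(num_transitions, 1.)
fill_values[0] = 0.                                   # genotype 0 never leaves
P = P + valid_transitions * fill_values[:, None]
P[0, 0] = 1.

assert np.allclose(P.sum(axis=1), 1.)                 # every row is a probability distribution
print(P)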
Beispiel #38
0
def best_right_path_cost(pred, mask, token, blank=0):
    '''
    Best right-path cost over multiple sentences.
    :param pred: (T, nb, voca_size+1)                    (4,1,3)
    :param mask: (nb, T)
    # :param pred_len: (nb,)    pred_len of prediction        (1)
    :param token: (nb, U)    -1 for NIL                    (1,2)
    :param blank: (1)

    :return: best_right_path_cost (nb,)
    :return: argmin_token (nb, T) best path, -1 for null
    '''

    pred_len = mask.sum(axis=-1).astype('int32')
    eps = theano.shared(np.float32(1e-35))
    EPS = theano.shared(np.float32(35))

    t = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape(
            (nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones(
            (nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.tile(T.arange(nb), (length, 1)).T,
                token_with_blank]  # (T, nb, 2U+1)
    pred = -T.log(pred + eps)

    # recurrence relation
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile(
        (m_eye(length) + m_eye(length, k=1)),
        (nb, 1,
         1)) + T.tile(m_eye(length, k=2),
                      (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = -T.log(recurrence_relation + eps).astype(floatX)

    # alpha
    alpha = T.ones_like(token_with_blank, dtype=floatX) * EPS
    alpha = T.set_subtensor(alpha[:, :2],
                            pred[0, :, :2])  ################(nb, 2U+1)

    # dynamic programming
    # (T, nb, 2U+1)
    [log_probability,
     argmin_pos_1], _ = theano.scan(lambda curr, accum: (
         (accum[:, :, None] + recurrence_relation).min(axis=1) + curr,
         (accum[:, :, None] + recurrence_relation).argmin(axis=1)),
                                    sequences=[pred[1:]],
                                    outputs_info=[alpha, None])

    # index pred_len-2: the scan runs over pred[1:], so step pred_len-2 of
    # log_probability holds the costs after the last valid frame (frame pred_len-1)
    labels_1 = log_probability[pred_len - 2,
                               T.arange(nb), 2 * token_len - 1]  # (nb,)
    labels_2 = log_probability[pred_len - 2,
                               T.arange(nb), 2 * token_len]  # (nb,)
    concat_labels = T.concatenate([labels_1[:, None], labels_2[:, None]],
                                  axis=-1)
    argmin_labels = concat_labels.argmin(axis=-1)

    cost = concat_labels.min(axis=-1)

    min_path = T.ones((t - 1, nb), dtype=intX) * -1  # -1 for null
    min_path = T.set_subtensor(min_path[pred_len - 2,
                                        T.arange(nb)],
                               2 * token_len - 1 + argmin_labels)

    # (T-1, nb)
    min_full_path, _ = theano.scan(
        lambda m_path, argm_pos, m_full_path: argm_pos[
            T.arange(nb),
            T.maximum(m_path, m_full_path).astype('int32')].astype('int32'),
        sequences=[min_path[::-1], argmin_pos_1[::-1]],
        outputs_info=[min_path[-1]])
    argmin_pos = T.concatenate((min_full_path[::-1], min_path[-1][None, :]),
                               axis=0)  # (T, nb)
    argmin_pos = T.set_subtensor(argmin_pos[pred_len - 1,
                                            T.arange(nb)],
                                 2 * token_len - 1 + argmin_labels)

    argmin_token = token_with_blank[T.arange(nb)[None, :], argmin_pos]

    # (nb,), (nb, T)
    return cost, (argmin_token.transpose((1, 0)) * mask + mask - 1).astype(
        'int32'
    )  # alpha, log_probability, argmin_pos_1, argmin_labels, min_path, min_full_path, argmin_pos, token_with_blank, argmin_token
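The blank-interleaving step above turns a length-U label sequence into the length 2U+1 sequence used by the dynamic program: a blank before every label plus one trailing blank. A small NumPy sketch of just that step, assuming blank index 0 and -1 padding:

import numpy as np

blank = 0
token = np.array([[3, 5, -1]])                 # (nb, U); -1 pads to the longest sentence
nb, U = token.shape

token = token[:, :, None]                      # (nb, U, 1)
token_with_blank = np.concatenate(
    (np.ones_like(token) * blank, token), axis=2).reshape((nb, 2 * U))
token_with_blank = np.concatenate(
    (token_with_blank, np.ones((nb, 1), dtype=token.dtype) * blank), axis=1)

print(token_with_blank)                        # [[ 0  3  0  5  0 -1  0]]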
Beispiel #39
0
def ctc_cost(pred, pred_len, token, blank=0):
    '''
    CTC cost over multiple sentences.
    :param pred: (T, nb, voca_size+1)                    (4,1,3)
    :param pred_len: (nb,)    pred_len of prediction        (1)
    :param token: (nb, U)    -1 for NIL                    (1,2)
    :param blank: (1)
    :return: ctc_cost
    '''

    eps = theano.shared(np.float32(1e-35))
    Time = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape(
            (nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones(
            (nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[T.arange(Time)[:, None, None],
                T.arange(nb)[None, :, None],
                token_with_blank[None, :, :]]  # (T, nb, 2U+1)

    # recurrence relation
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile(
        (m_eye(length) + m_eye(length, k=1)),
        (nb, 1,
         1)) + T.tile(m_eye(length, k=2),
                      (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = recurrence_relation.astype(floatX)

    # alpha
    alpha = T.zeros_like(token_with_blank, dtype=floatX)
    alpha = T.set_subtensor(alpha[:, :2],
                            pred[0, :, :2])  ################(nb, 2U+1)

    # dynamic programming
    # (T, nb, 2U+1)
    probability, _ = theano.scan(lambda curr, accum: T.sum(
        accum[:, :, None] * recurrence_relation, axis=1) * curr,
                                 sequences=[pred[1:]],
                                 outputs_info=[alpha])
    # T.batched_dot(accum[:, None, :], recurrence_relation)[:, 0] * curr,

    labels_2 = probability[pred_len - 2, T.arange(nb), 2 * token_len - 1]
    labels_1 = probability[pred_len - 2, T.arange(nb), 2 * token_len]
    labels_prob = labels_2 + labels_1

    cost = -T.log(labels_prob + eps)
    return cost
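The recurrence_relation matrix used by both cost functions encodes the allowed CTC transitions: a path at position i may stay at i, advance to i+1, or skip the intervening blank to i+2 when the two neighbouring labels differ. A NumPy sketch for one toy sentence, assuming m_eye(n, k) behaves like np.eye(n, k=k):

import numpy as np

def m_eye(n, k=0):
    # stand-in for the module's m_eye: identity matrix with an offset diagonal
    return np.eye(n, k=k, dtype='float32')

blank = 0
token_with_blank = np.array([[0, 3, 0, 5, 0]])        # one sentence, labels 3 and 5
nb, length = token_with_blank.shape

# the skip transition (i-2 -> i) is only allowed onto a non-blank label that
# differs from the label two positions earlier
sec_diag = np.concatenate(
    (np.zeros((nb, 2)),
     np.not_equal(token_with_blank[:, :-2], token_with_blank[:, 2:])),
    axis=1) * np.not_equal(token_with_blank, blank)

recurrence_relation = (np.tile(m_eye(length) + m_eye(length, k=1), (nb, 1, 1))
                       + np.tile(m_eye(length, k=2), (nb, 1, 1))
                       * sec_diag[:, None, :])
print(recurrence_relation[0])                          # 1s mark allowed source -> target moves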
Beispiel #40
0
def top_k_right_path_cost(pred, mask, token, k, blank=0):
    '''
    Top-k right-path costs over multiple sentences.
    :param pred: (T, nb, voca_size+1)                    (4,1,3)
    :param mask: (nb, T)
    :param token: (nb, U)    -1 for NIL                    (1,2)
    :param k:     (1) top k paths
    :param blank: (1)

    :return: top_k_path_cost (nb, k)
    :return: argmin_k_token (nb, T, k) top k paths, -1 for null
    '''

    pred_len = mask.sum(axis=-1).astype('int32')
    eps = theano.shared(np.float32(1e-35))
    EPS = theano.shared(np.float32(35))

    t = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape(
            (nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones(
            (nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.tile(T.arange(nb), (length, 1)).T,
                token_with_blank]  # (T, nb, 2U+1)
    pred = -T.log(pred + eps)

    # recurrence relation
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile(
        (m_eye(length) + m_eye(length, k=1)),
        (nb, 1,
         1)) + T.tile(m_eye(length, k=2),
                      (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = -T.log(recurrence_relation + eps).astype(floatX)

    # alpha
    alpha = T.ones((nb, k, length), dtype=floatX) * EPS
    alpha = T.set_subtensor(alpha[:, 0, :2], pred[0, :, :2])  #(nb, k, 2U+1)

    def step_func_1(curr, accum):
        '''
        :param curr: (nb, length)
        :param accum: (nb, k, length)
        '''
        alpha_t = (accum[:, :, :, None] +
                   recurrence_relation[:, None, :, :]).reshape(
                       (nb, k * length, length))
        accum_t = alpha_t.sort(axis=1)[:, :k, :] + curr[:, None, :]
        argmin_k_t = alpha_t.argsort(axis=1)[:, :k, :]  # from 0 to k*length
        return accum_t, argmin_k_t

    # dynamic programming
    # (T-1, nb, k, length),   (T-1, nb, k, length)
    [log_probability,
     argmin_pos_k], _ = theano.scan(step_func_1,
                                    sequences=[pred[1:]],
                                    outputs_info=[alpha, None])

    labels_1 = log_probability[(pred_len - 2)[:, None],
                               T.arange(nb)[:, None],
                               T.arange(k)[None, :],
                               (2 * token_len - 1)[:, None]]  # (nb, k)
    labels_2 = log_probability[(pred_len - 2)[:, None],
                               T.arange(nb)[:, None],
                               T.arange(k)[None, :],
                               (2 * token_len)[:, None]]  # (nb, k)
    concat_labels = T.concatenate([labels_1, labels_2], axis=-1)
    argmin_labels = (2 * token_len - 1)[:, None] + concat_labels.argsort(
        axis=-1)[:, :k].astype('int32') / k  # (nb, k) from 0 to 2k
    cost = concat_labels.sort(axis=-1)[:, :k]

    min_path = T.ones(
        (t - 1, nb, k), dtype=intX) * -1  # (T-1, nb, k) -1 for null
    min_path = T.set_subtensor(min_path[(pred_len - 2)[:, None],
                                        T.arange(nb)[:, None],
                                        T.arange(k)[None, :]], argmin_labels +
                               T.arange(k)[None, :] * length)  # set (nb, k)

    def step_func_2(m_path, argm_pos, m_full_path):
        '''
        :param m_path: (nb, k) min path (from 0 to k*length)
        :param argm_pos: (nb, k, length) argmin_pos_k
        :param m_full_path: (nb, k) min full path (from 0 to k*length)
        '''
        path_here = T.maximum(m_path, m_full_path).astype('int32')  # (nb, k)
        m_full_return = argm_pos.reshape(
            (nb, k * length))[T.arange(nb)[:, None],
                              path_here].astype('int32')  # (nb, k)
        return m_full_return

    # (T-1, nb, k)
    min_full_path, _ = theano.scan(
        step_func_2,
        sequences=[min_path[::-1], argmin_pos_k[::-1]],
        outputs_info=[min_path[-1]])
    # (T, nb, k)
    argmin_pos = T.concatenate((min_full_path[::-1], min_path[-1][None, :, :]),
                               axis=0)  # (T, nb, k)
    argmin_pos = T.set_subtensor(
        argmin_pos[(pred_len - 1)[:, None],
                   T.arange(nb)[:, None],
                   T.arange(k)[None, :]],
        argmin_labels + T.arange(k)[None, :] * length)

    # (nb, k*length) -> (T, nb, k)
    argmin_token = T.tile(token_with_blank[:, None, :], (1, k, 1)).reshape(
        (nb, k * length))[T.arange(nb)[None, :, None], argmin_pos]

    mask_k = T.le(cost, EPS - 1)
    argmin_token = (argmin_token.transpose(
        (1, 0, 2)) * mask[:, :, None] + mask[:, :, None] -
                    1) * mask_k[:, None, :] + mask_k[:, None, :] - 1

    # (nb, k), (nb, T, k)
    return cost, argmin_token.astype(
        'int32')  #, log_probability, argmin_pos_k, min_full_path
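Each step of step_func_1 expands the current k best prefixes against every source position (k*length candidates per target position) and keeps the k cheapest of them, which is what bounds the memory of the top-k search. A shape-only NumPy sketch of a single step, with random stand-in costs:

import numpy as np

nb, k, length = 1, 2, 5
rng = np.random.default_rng(0)
accum = rng.random((nb, k, length))                   # -log cost of the current k best paths
recurrence_relation = rng.random((nb, length, length))
curr = rng.random((nb, length))                       # -log p of the next frame

alpha_t = (accum[:, :, :, None] + recurrence_relation[:, None, :, :]).reshape(
    (nb, k * length, length))
accum_t = np.sort(alpha_t, axis=1)[:, :k, :] + curr[:, None, :]
argmin_k_t = np.argsort(alpha_t, axis=1)[:, :k, :]    # indices in [0, k*length)
print(accum_t.shape, argmin_k_t.shape)                # (1, 2, 5) (1, 2, 5)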
Beispiel #41
0
def make_hierarchical_model(rts,
                            gaze,
                            values,
                            error_lls,
                            subject_idx,
                            v_val=None,
                            gamma_val=None,
                            s_val=None,
                            tau_val=None,
                            t0_val=None,
                            zerotol=1e-6,
                            error_weight=0.05,
                            boundary=1.,
                            gamma_bounds=(-1, 1),
                            drift='multiplicative',
                            design=dict(v=dict(),
                                        gamma=dict(),
                                        s=dict(),
                                        tau=dict(),
                                        t0=dict())):

    if drift == 'multiplicative':
        is_multiplicative = True
    elif drift == 'additive':
        is_multiplicative = False
    else:
        is_multiplicative = None
        raise ValueError('Drift function "{}" not recognized.'.format(drift))

    n_subjects = np.unique(subject_idx).size

    with pm.Model() as glam_hierarchical:

        # Mechanics
        b = pm.Deterministic('b', tt.constant(boundary, dtype='float32'))
        p_error = pm.Deterministic('p_error',
                                   tt.constant(error_weight, dtype='float32'))

        # Parameter priors
        v = generate_hierarchical_model_parameters(parameter='v',
                                                   n_subjects=n_subjects,
                                                   design=design['v'],
                                                   mu_lower=zerotol,
                                                   mu_upper=0.0005,
                                                   sd_lower=zerotol,
                                                   sd_upper=0.0005,
                                                   bound_lower=0,
                                                   bound_upper=0.0005,
                                                   val=v_val,
                                                   testval=0.0001)

        gamma = generate_hierarchical_model_parameters(
            parameter='gamma',
            n_subjects=n_subjects,
            design=design['gamma'],
            mu_lower=gamma_bounds[0],
            mu_upper=gamma_bounds[1],
            sd_lower=zerotol,
            sd_upper=gamma_bounds[1] - gamma_bounds[0],
            bound_lower=gamma_bounds[0],
            bound_upper=gamma_bounds[1],
            val=gamma_val,
            testval=.5)

        s = generate_hierarchical_model_parameters(parameter='s',
                                                   n_subjects=n_subjects,
                                                   design=design['s'],
                                                   mu_lower=zerotol,
                                                   mu_upper=0.02,
                                                   sd_lower=zerotol,
                                                   sd_upper=0.02,
                                                   bound_lower=zerotol,
                                                   bound_upper=0.02,
                                                   val=s_val,
                                                   testval=0.0075)

        tau = generate_hierarchical_model_parameters(parameter='tau',
                                                     n_subjects=n_subjects,
                                                     design=design['tau'],
                                                     mu_lower=0,
                                                     mu_upper=5,
                                                     sd_lower=zerotol,
                                                     sd_upper=5,
                                                     bound_lower=0,
                                                     bound_upper=5,
                                                     val=tau_val,
                                                     testval=.5)

        if t0_val is None:
            t0 = pm.Uniform('t0', 0, 500, testval=50, shape=(n_subjects, 1))
        else:
            t0 = pm.Deterministic('t0', tt.ones((n_subjects, 1)) * t0_val)

        # Likelihood
        def lda_logp(rt, gaze, values, error_lls, s_condition_index,
                     s_subject_index, v_condition_index, v_subject_index,
                     tau_condition_index, tau_subject_index,
                     gamma_condition_index, gamma_subject_index,
                     t0_condition_index, t0_subject_index, is_multiplicative,
                     zerotol):

            # compute drifts
            drift = ifelse(
                is_multiplicative,
                glam.components.tt_drift_multiplicative(
                    v[tt.cast(v_subject_index, dtype='int32'),
                      tt.cast(v_condition_index, dtype='int32')][:, None],
                    tau[tt.cast(tau_subject_index, dtype='int32'),
                        tt.cast(tau_condition_index, dtype='int32')][:, None],
                    gamma[tt.cast(gamma_subject_index, dtype='int32'),
                          tt.cast(gamma_condition_index, dtype='int32')][:,
                                                                         None],
                    values, gaze, zerotol),
                glam.components.tt_drift_additive(
                    v[tt.cast(v_subject_index, dtype='int32'),
                      tt.cast(v_condition_index, dtype='int32')][:, None],
                    tau[tt.cast(tau_subject_index, dtype='int32'),
                        tt.cast(tau_condition_index, dtype='int32')][:, None],
                    gamma[tt.cast(gamma_subject_index, dtype='int32'),
                          tt.cast(gamma_condition_index, dtype='int32')][:,
                                                                         None],
                    values, gaze, zerotol))

            glam_ll = glam.components.tt_wienerrace_pdf(
                rt[:, None], drift,
                s[tt.cast(s_subject_index, dtype='int32'),
                  tt.cast(s_condition_index, dtype='int32')][:, None], b,
                t0[tt.cast(t0_subject_index, dtype='int32'),
                   tt.cast(t0_condition_index, dtype='int32')][:,
                                                               None], zerotol)

            # mix likelihoods
            mixed_ll = ((1 - p_error) * glam_ll +
                        p_error * error_lls[subject_idx])

            mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
            mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)

            return tt.log(mixed_ll + zerotol)

        obs = pm.DensityDist(
            'obs',
            logp=lda_logp,
            observed=dict(
                rt=rts,
                gaze=gaze,
                values=values,
                error_lls=error_lls,
                s_condition_index=design['s']['condition_index'].astype(
                    np.int32),
                s_subject_index=design['s']['subject_index'].astype(np.int32),
                v_condition_index=design['v']['condition_index'].astype(
                    np.int32),
                v_subject_index=design['v']['subject_index'].astype(np.int32),
                tau_condition_index=design['tau']['condition_index'].astype(
                    np.int32),
                tau_subject_index=design['tau']['subject_index'].astype(
                    np.int32),
                gamma_condition_index=design['gamma']
                ['condition_index'].astype(np.int32),
                gamma_subject_index=design['gamma']['subject_index'].astype(
                    np.int32),
                t0_condition_index=design['t0']['condition_index'].astype(
                    np.int32),
                t0_subject_index=design['t0']['subject_index'].astype(
                    np.int32),
                is_multiplicative=is_multiplicative,
                zerotol=zerotol))
    return glam_hierarchical
Beispiel #42
0
def generate_hierarchical_model_parameters(parameter, n_subjects, design,
                                           mu_lower, mu_upper, sd_lower,
                                           sd_upper, bound_lower, bound_upper,
                                           val, testval):

    if (design['conditions'] is not None):
        if val is None:
            mu = tt.stack([
                pm.Uniform('{}_{}_mu'.format(parameter, condition),
                           mu_lower,
                           mu_upper,
                           testval=testval)
                for condition in design['conditions']
            ])
            sd = tt.stack([
                pm.Uniform('{}_{}_sd'.format(parameter, condition),
                           sd_lower,
                           sd_upper,
                           testval=testval)
                for condition in design['conditions']
            ])
            bounded = pm.Bound(pm.Normal, bound_lower, bound_upper)
            parms = []
            n_subjects_per_condition = []
            for c, condition in enumerate(design['conditions']):
                n_subjects_in_condition = np.unique(design['subject_index'][
                    design['condition_index'] == c]).size
                n_subjects_per_condition.append(n_subjects_in_condition)
                parms_tmp = bounded('{}_{}'.format(parameter, condition),
                                    mu=mu[c],
                                    sd=sd[c],
                                    shape=(n_subjects_in_condition))
                parms_tmp = tt.concatenate([tt.zeros(1), parms_tmp])
                parms.append(parms_tmp[design['D'][:, c]][:, None])
            parms = tt.concatenate(parms, axis=1)

        else:
            parms = []
            n_subjects_per_condition = []
            for c, condition in enumerate(design['conditions']):
                n_subjects_in_condition = np.unique(design['subject_index'][
                    design['condition_index'] == c]).size
                n_subjects_per_condition.append(n_subjects_in_condition)
                if len(val) == len(design['conditions']):
                    parms.append(
                        pm.Deterministic(
                            '{}_{}'.format(parameter, condition),
                            tt.ones((n_subjects_in_condition, 1)) * val[c]))
                else:
                    raise ValueError(
                        'Number of values in {}_val does not match the number of specified {}-conditions.'
                        .format(parameter, parameter))
            # make sure all elements in parms have same size
            for set_i, parm_set in enumerate(parms):
                if n_subjects_per_condition[set_i] < n_subjects:
                    parms[set_i] = tt.concatenate([
                        parm_set,
                        tt.zeros(
                            (n_subjects - n_subjects_per_condition[set_i], 1))
                    ],
                                                  axis=0)
            parms = tt.concatenate(parms, axis=1)

    else:
        if val is None:
            mu = pm.Uniform('{}_mu'.format(parameter),
                            mu_lower,
                            mu_upper,
                            testval=testval)
            sd = pm.Uniform('{}_sd'.format(parameter),
                            sd_lower,
                            sd_upper,
                            testval=testval)
            bounded = pm.Bound(pm.Normal, bound_lower, bound_upper)
            parms = bounded(parameter, mu=mu, sd=sd, shape=(n_subjects, 1))
        else:
            parms = pm.Deterministic(parameter, tt.ones((n_subjects, 1)) * val)

    return parms
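When conditions are specified, subjects that do not appear in a condition are mapped to a zero placeholder: a zero is prepended to the per-condition parameter vector and design['D'][:, c] then indexes either a subject's own parameter (index >= 1) or that placeholder (index 0). A hypothetical NumPy illustration of the indexing:

import numpy as np

# 4 subjects overall, 2 of them in condition c
subject_params_c = np.array([0.3, 0.7])                # parameters of the subjects in condition c
parms_tmp = np.concatenate([np.zeros(1), subject_params_c])
D_col_c = np.array([1, 0, 2, 0])                       # hypothetical design['D'][:, c]; 0 = not in condition
print(parms_tmp[D_col_c])                              # [0.3 0.  0.7 0. ]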
Beispiel #43
0
    def __init__(self,
                 collapse='mean',
                 maxout=False,
                 transpose=False,
                 **kwargs):
        super(TwoDToOneDLayer, self).__init__(1, **kwargs)
        self.set_attr('collapse', collapse)
        self.set_attr('transpose', transpose)
        Y = self.sources[0].output
        if transpose:
            Y = Y.dimshuffle(1, 0, 2, 3)

        #index handling
        def index_fn(index, size):
            return T.set_subtensor(index[:size], numpy.cast['int8'](1))

        index_init = T.zeros((Y.shape[2], Y.shape[1]), dtype='int8')
        self.index, _ = theano.scan(
            index_fn,
            [index_init,
             T.cast(self.sources[0].output_sizes[:, 1], "int32")])
        self.index = self.index.dimshuffle(1, 0)
        n_out = self.sources[0].attrs['n_out']

        if maxout:
            Y = Y.max(axis=3).dimshuffle(0, 1, 2, 'x')

        if collapse == 'sum' or collapse == True:
            Y = Y.sum(axis=0)
        elif collapse == 'mean':
            Y = Y.mean(axis=0)
        elif collapse == 'conv':
            from TheanoUtil import circular_convolution
            Y, _ = theano.scan(lambda x_i, x_p: circular_convolution(x_i, x_p),
                               Y, Y[0])
            Y = Y[-1]
        elif collapse == 'flatten':
            self.index = T.ones((Y.shape[0] * Y.shape[1], Y.shape[2]),
                                dtype='int8')
            Y = Y.reshape((Y.shape[0] * Y.shape[1], Y.shape[2], Y.shape[3]))
        elif str(collapse).startswith('pad_'):
            pad = numpy.int32(collapse.split('_')[-1])
            Y = ifelse(
                T.lt(Y.shape[0], pad),
                T.concatenate([
                    Y,
                    T.zeros(
                        (pad - Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3]),
                        'float32')
                ],
                              axis=0), ifelse(T.gt(Y.shape[0], pad), Y[:pad],
                                              Y))
            Y = Y.dimshuffle(1, 2, 3, 0).reshape(
                (Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[0]))
            n_out *= pad
        elif collapse != False:
            assert False, "invalid collapse mode"

        if self.attrs['batch_norm']:
            Y = self.batch_norm(Y, n_out, force_sample=False)
        self.output = Y
        self.act = [Y, Y]
        self.set_attr('n_out', n_out)
Beispiel #44
0
def linear_model(X, y):
    shape = X.shape
    X = pm.Normal('X', mu=np.mean(X, axis=0), sd=np.std(X, axis=0), shape=shape)
    coefs = pm.Normal('coefs', mu=tt.zeros(shape[1]), sd=tt.ones(shape[1]), shape=shape[1])
    pm.Normal('y', mu=tt.dot(X, coefs), sd=tt.ones(shape[0]), shape=shape[0])
Beispiel #45
0
def partial_linear_model(X):
    shape = X.shape
    X = pm.Normal('X', mu=np.mean(X, axis=0), sd=np.std(X, axis=0), shape=shape)
    pm.Normal('coefs', mu=tt.zeros(shape[1]), sd=tt.ones(shape[1]), shape=shape[1])
Beispiel #46
0
    def __init__(
        self,
        cell_state_mat: np.ndarray,
        X_data: np.ndarray,
        n_comb: int = 50,
        data_type: str = "float32",
        n_iter=20000,
        learning_rate=0.005,
        total_grad_norm_constraint=200,
        verbose=True,
        var_names=None,
        var_names_read=None,
        obs_names=None,
        fact_names=None,
        sample_id=None,
        gene_level_prior={"mean": 1 / 2, "sd": 1 / 4},
        gene_level_var_prior={"mean_var_ratio": 1.0},
        cell_number_prior={"cells_per_spot": 8.0, "factors_per_spot": 7.0, "combs_per_spot": 2.5},
        cell_number_var_prior={"cells_mean_var_ratio": 1.0, "factors_mean_var_ratio": 1.0, "combs_mean_var_ratio": 1.0},
        phi_hyp_prior={"mean": 3.0, "sd": 1.0},
        spot_fact_mean_var_ratio=5.0,
        exper_gene_level_mean_var_ratio=10,
    ):

        ############# Initialise parameters ################
        super().__init__(
            cell_state_mat,
            X_data,
            data_type,
            n_iter,
            learning_rate,
            total_grad_norm_constraint,
            verbose,
            var_names,
            var_names_read,
            obs_names,
            fact_names,
            sample_id,
        )

        for k in gene_level_var_prior.keys():
            gene_level_prior[k] = gene_level_var_prior[k]

        self.gene_level_prior = gene_level_prior
        self.phi_hyp_prior = phi_hyp_prior
        self.n_comb = n_comb
        self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio
        self.exper_gene_level_mean_var_ratio = exper_gene_level_mean_var_ratio

        # generate parameters for samples
        self.spot2sample_df = pd.get_dummies(sample_id)
        # convert to np.ndarray
        self.spot2sample_mat = self.spot2sample_df.values
        self.n_exper = self.spot2sample_mat.shape[1]
        # assign extra data to dictionary with (1) shared parameters (2) input data
        self.extra_data_tt = {"spot2sample": theano.shared(self.spot2sample_mat.astype(self.data_type))}
        self.extra_data = {"spot2sample": self.spot2sample_mat.astype(self.data_type)}

        cell_number_prior["factors_per_combs"] = (
            cell_number_prior["factors_per_spot"] / cell_number_prior["combs_per_spot"]
        )
        for k in cell_number_var_prior.keys():
            cell_number_prior[k] = cell_number_var_prior[k]
        self.cell_number_prior = cell_number_prior

        ############# Define the model ################
        self.model = pm.Model()

        with self.model:

            # =====================Gene expression level scaling======================= #
            # Explains difference in expression between genes and
            # how it differs in single cell and spatial technology
            # compute hyperparameters from mean and sd
            shape = gene_level_prior["mean"] ** 2 / gene_level_prior["sd"] ** 2
            rate = gene_level_prior["mean"] / gene_level_prior["sd"] ** 2
            shape_var = shape / gene_level_prior["mean_var_ratio"]
            rate_var = rate / gene_level_prior["mean_var_ratio"]
            self.gene_level_alpha_hyp = pm.Gamma(
                "gene_level_alpha_hyp", mu=shape, sigma=np.sqrt(shape_var), shape=(1, 1)
            )
            self.gene_level_beta_hyp = pm.Gamma("gene_level_beta_hyp", mu=rate, sigma=np.sqrt(rate_var), shape=(1, 1))

            # global gene levels
            self.gene_level = pm.Gamma(
                "gene_level", self.gene_level_alpha_hyp, self.gene_level_beta_hyp, shape=(self.n_var, 1)
            )
            # scale cell state factors by gene_level
            self.gene_factors = pm.Deterministic("gene_factors", self.cell_state)
            # self.gene_factors = self.cell_state
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape)
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0))

            # =====================Spot factors======================= #
            # prior on spot factors reflects the number of cells, fraction of their cytoplasm captured,
            # times heterogeneity in the total amount of mRNA between individual cells within each cell type
            self.cells_per_spot = pm.Gamma(
                "cells_per_spot",
                mu=cell_number_prior["cells_per_spot"],
                sigma=np.sqrt(cell_number_prior["cells_per_spot"] / cell_number_prior["cells_mean_var_ratio"]),
                shape=(self.n_obs, 1),
            )
            self.comb_per_spot = pm.Gamma(
                "combs_per_spot",
                mu=cell_number_prior["combs_per_spot"],
                sigma=np.sqrt(cell_number_prior["combs_per_spot"] / cell_number_prior["combs_mean_var_ratio"]),
                shape=(self.n_obs, 1),
            )

            shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
            rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
            self.combs_factors = pm.Gamma("combs_factors", alpha=shape, beta=rate, shape=(self.n_obs, self.n_comb))

            self.factors_per_combs = pm.Gamma(
                "factors_per_combs",
                mu=cell_number_prior["factors_per_combs"],
                sigma=np.sqrt(cell_number_prior["factors_per_combs"] / cell_number_prior["factors_mean_var_ratio"]),
                shape=(self.n_comb, 1),
            )
            c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape((1, 1))
            self.comb2fact = pm.Gamma(
                "comb2fact", alpha=c2f_shape, beta=self.factors_per_combs, shape=(self.n_comb, self.n_fact)
            )

            self.spot_factors = pm.Gamma(
                "spot_factors",
                mu=pm.math.dot(self.combs_factors, self.comb2fact),
                sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) / self.spot_fact_mean_var_ratio),
                shape=(self.n_obs, self.n_fact),
            )

            # =====================Spot-specific additive component======================= #
            # molecule contribution that cannot be explained by cell state signatures
            # these counts are distributed between all genes not just expressed genes
            self.spot_add_hyp = pm.Gamma("spot_add_hyp", 1, 1, shape=2)
            self.spot_add = pm.Gamma("spot_add", self.spot_add_hyp[0], self.spot_add_hyp[1], shape=(self.n_obs, 1))

            # =====================Gene-specific additive component ======================= #
            # per gene molecule contribution that cannot be explained by cell state signatures
            # these counts are distributed equally between all spots (e.g. background, free-floating RNA)
            self.gene_add_hyp = pm.Gamma("gene_add_hyp", 1, 1, shape=2)
            self.gene_add = pm.Gamma(
                "gene_add", self.gene_add_hyp[0], self.gene_add_hyp[1], shape=(self.n_exper, self.n_var)
            )

            # =====================Gene-specific overdispersion ======================= #
            self.phi_hyp = pm.Gamma("phi_hyp", mu=phi_hyp_prior["mean"], sigma=phi_hyp_prior["sd"], shape=(1, 1))
            self.gene_E = pm.Exponential("gene_E", self.phi_hyp, shape=(self.n_exper, self.n_var))

            # =====================Expected expression ======================= #
            # expected expression
            self.mu_biol = (
                pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T
                + pm.math.dot(self.extra_data_tt["spot2sample"], self.gene_add)
                + self.spot_add
            )
            # tt.printing.Print('mu_biol')(self.mu_biol.shape)

            # =====================DATA likelihood ======================= #
            # Likelihood (sampling distribution) of observations & add overdispersion via NegativeBinomial / Poisson
            self.data_target = pm.NegativeBinomial(
                "data_target",
                mu=self.mu_biol,
                alpha=pm.math.dot(self.extra_data_tt["spot2sample"], 1 / tt.pow(self.gene_E, 2)),
                observed=self.x_data,
                total_size=self.X_data.shape,
            )

            # =====================Compute nUMI from each factor in spots  ======================= #
            self.nUMI_factors = pm.Deterministic(
                "nUMI_factors", (self.spot_factors * (self.gene_factors * self.gene_level).sum(0))
            )
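The mean/sd priors above are converted to Gamma shape and rate via shape = mean**2 / sd**2 and rate = mean / sd**2, so that shape / rate recovers the prior mean and shape / rate**2 the prior variance. A quick check with the default gene_level_prior values:

mean, sd = 0.5, 0.25                 # gene_level_prior defaults
shape = mean ** 2 / sd ** 2          # 4.0
rate = mean / sd ** 2                # 8.0
print(shape / rate)                  # 0.5  -> prior mean
print((shape / rate ** 2) ** 0.5)    # 0.25 -> prior sd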
Beispiel #47
0
def SIR_with_change_points(S_begin_beta,
                           I_begin_beta,
                           new_cases_obs,
                           change_points_list,
                           date_begin_simulation,
                           num_days_sim,
                           diff_data_sim,
                           N,
                           priors_dict=None,
                           weekends_modulated=False):
    """
        Parameters
        ----------
        new_cases_obs : list or array
            Timeseries (day over day) of newly reported cases (not the total number)
        change_points_list : list of dicts
            List of dictionaries, each corresponding to one change point.
            Each dict can have the following key-value pairs. If a pair is not provided,
            the respective default is used.
                * pr_mean_date_begin_transient :     datetime.datetime, NO default
                * pr_median_lambda :                 number, same as default priors, below
                * pr_sigma_lambda :                  number, same as default priors, below
                * pr_sigma_date_begin_transient :    number, 3
                * pr_median_transient_len :          number, 3
                * pr_sigma_transient_len :           number, 0.3
        date_begin_simulation : datetime.datetime
            The date at which the simulation begins
        num_days_sim : integer
            Number of days to forecast into the future
        diff_data_sim : integer
            Number of days that the simulation-begin predates the first data point in
            `new_cases_obs`. This is necessary so the model can fit the reporting delay.
            Set this parameter to a value significantly larger than the reporting
            delay you expect to find, so that the same number of data points is
            always fitted.
        N : number
            The population size. For Germany, we used 83e6
        priors_dict : dict
            Dictionary of the prior assumptions
            Possible key-value pairs (and default values) are:
                * pr_beta_I_begin :        number, default = 100
                * pr_median_lambda_0 :     number, default = 0.4
                * pr_sigma_lambda_0 :      number, default = 0.5
                * pr_median_mu :           number, default = 1/8
                * pr_sigma_mu :            number, default = 0.2
                * pr_median_delay :        number, default = 8
                * pr_sigma_delay :         number, default = 0.2
                * pr_beta_sigma_obs :      number, default = 10
                * week_end_days :          tuple,  default = (6,7)
                * pr_mean_weekend_factor : number, default = 0.7
                * pr_sigma_weekend_factor :number, default = 0.17
        weekends_modulated : bool
            Whether to add the prior that cases are reported less on weekends. Multiplies the new case numbers on weekends
            by a number between 0 and 1, given by a prior beta distribution. The beta distribution is parametrised
            by pr_mean_weekend_factor and pr_sigma_weekend_factor.
        weekend_modulation_type : 'step' or 'abs_sine'
            Whether the weekends are modulated by a step function, which only multiplies the days given by week_end_days
            by the week_end_factor, or whether the whole week is modulated by an abs(sin(x)) function with an offset
            that has a flat prior.
        Returns
        -------
        : pymc3.Model
            Returns an instance of pymc3 model with the change points
    """
    if priors_dict is None:
        priors_dict = dict()

    default_priors = dict(pr_beta_I_begin=10000.0,
                          pr_median_lambda_0=0.2,
                          pr_sigma_lambda_0=0.5,
                          pr_median_mu=1 / 8,
                          pr_sigma_mu=0.2,
                          pr_median_delay=1.0,
                          pr_sigma_delay=0.2,
                          pr_beta_sigma_obs=5.0,
                          week_end_days=(6, 7),
                          pr_mean_weekend_factor=0.7,
                          pr_sigma_weekend_factor=0.17)
    default_priors_change_points = dict(
        pr_median_lambda=default_priors["pr_median_lambda_0"],
        pr_sigma_lambda=default_priors["pr_sigma_lambda_0"],
        pr_sigma_date_begin_transient=3.0,
        pr_median_transient_len=3.0,
        pr_sigma_transient_len=0.3,
        pr_mean_date_begin_transient=None,
    )

    if not weekends_modulated:
        del default_priors['week_end_days']
        del default_priors['pr_mean_weekend_factor']
        del default_priors['pr_sigma_weekend_factor']

    for prior_name in priors_dict.keys():
        if prior_name not in default_priors:
            raise RuntimeError(f"Prior with name {prior_name} not known")
    for change_point in change_points_list:
        for prior_name in change_point.keys():
            if prior_name not in default_priors_change_points:
                raise RuntimeError(f"Prior with name {prior_name} not known")

    for prior_name, value in default_priors.items():
        if prior_name not in priors_dict:
            priors_dict[prior_name] = value
            # print(f"{prior_name} was set to default value {value}")
    for prior_name, value in default_priors_change_points.items():
        for i_cp, change_point in enumerate(change_points_list):
            if prior_name not in change_point:
                change_point[prior_name] = value
                # print(f"{prior_name} of change point {i_cp} was set to default value {value}")

    if num_days_sim < len(new_cases_obs) + diff_data_sim:
        raise RuntimeError(
            "Simulation ends before the end of the data. Increase num_days_sim."
        )

    # ------------------------------------------------------------------------------ #
    # Model and prior implementation
    # ------------------------------------------------------------------------------ #

    with pm.Model() as model:
        # all pm functions now apply on the model instance
        # true cases at the beginning of the loaded data, but we do not know the real number
        I_begin = pm.Normal(name="I_begin",
                            mu=I_begin_beta,
                            sigma=I_begin_beta / 10)
        S_begin = pm.Normal(name="S_begin",
                            mu=S_begin_beta,
                            sigma=S_begin_beta / 10)
        # S_begin = N - I_begin

        # I_begin_print = tt.printing.Print('I_begin')(I_begin)
        # S_begin_print = tt.printing.Print('S_begin')(S_begin)
        # fraction of people that are newly infected each day
        lambda_list = []
        lambda_list.append(
            pm.Lognormal(
                name="lambda_0",
                mu=np.log(priors_dict["pr_median_lambda_0"]),
                sigma=priors_dict["pr_sigma_lambda_0"],
            ))
        for i, cp in enumerate(change_points_list):
            lambda_list.append(
                pm.Lognormal(
                    name=f"lambda_{i + 1}",
                    mu=np.log(cp["pr_median_lambda"]),
                    sigma=cp["pr_sigma_lambda"],
                ))

        # list of start dates of the transient periods of the change points
        tr_begin_list = []
        dt_before = date_begin_simulation
        for i, cp in enumerate(change_points_list):
            dt_begin_transient = cp["pr_mean_date_begin_transient"]
            if dt_before is not None and dt_before > dt_begin_transient:
                raise RuntimeError(
                    "Dates of change points are not temporally ordered")

            prior_mean = (
                dt_begin_transient - date_begin_simulation
            ).days  # - 1  # convert the provided date format (argument) into days (a number)

            tr_begin = pm.Normal(
                name=f"transient_begin_{i}",
                mu=prior_mean,
                sigma=cp["pr_sigma_date_begin_transient"],
            )
            tr_begin_list.append(tr_begin)
            dt_before = dt_begin_transient

        # same for transient times
        tr_len_list = []
        for i, cp in enumerate(change_points_list):
            tr_len = pm.Lognormal(
                name=f"transient_len_{i}",
                mu=np.log(cp["pr_median_transient_len"]),
                sigma=cp["pr_sigma_transient_len"],
            )
            tr_len_list.append(tr_len)

        # build the time-dependent spreading rate
        lambda_t_list = [lambda_list[0] * tt.ones(num_days_sim)]
        lambda_before = lambda_list[0]

        for tr_begin, tr_len, lambda_after in zip(tr_begin_list, tr_len_list,
                                                  lambda_list[1:]):
            lambda_t = model_helper.smooth_step_function(
                start_val=0,
                end_val=1,
                t_begin=tr_begin,
                t_end=tr_begin + tr_len,
                t_total=num_days_sim,
            ) * (lambda_after - lambda_before)
            lambda_before = lambda_after
            lambda_t_list.append(lambda_t)
        lambda_t = sum(lambda_t_list)

        # fraction of people that recover each day, recovery rate mu
        mu = pm.Lognormal(
            name="mu",
            mu=np.log(priors_dict["pr_median_mu"]),
            sigma=priors_dict["pr_sigma_mu"],
        )

        # delay in days between contracting the disease and being recorded
        # delay = pm.Lognormal(
        #     name="delay",
        #     mu=np.log(priors_dict["pr_median_delay"]),
        #     sigma=priors_dict["pr_sigma_delay"],
        # )

        # prior of the error of observed cases
        sigma_obs = pm.HalfCauchy("sigma_obs",
                                  beta=priors_dict["pr_beta_sigma_obs"])

        # -------------------------------------------------------------------------- #
        # training the model with loaded data provided as argument
        # -------------------------------------------------------------------------- #

        S, I, new_I = _SIR_model(lambda_t=lambda_t,
                                 mu=mu,
                                 S_begin=S_begin,
                                 I_begin=I_begin,
                                 N=N)

        # ignore this delay
        # new_cases_inferred = model_helper.delay_cases(
        #     new_I_t=new_I,
        #     len_new_I_t=num_days_sim,
        #     len_out=num_days_sim - diff_data_sim,
        #     delay=delay,
        #     delay_diff=diff_data_sim,
        # )
        new_cases_inferred = new_I

        # likelihood of the model:
        # observed cases are distributed following studentT around the model.
        # we want to approximate a Poisson distribution of new cases.
        # we choose nu=4 to get heavy tails and robustness to outliers.
        # https://www.jstor.org/stable/2290063
        num_days_data = new_cases_obs.shape[-1]
        pm.StudentT(
            name="_new_cases_studentT",
            nu=4,
            mu=new_cases_inferred[:num_days_data],
            sigma=tt.abs_(new_cases_inferred[:num_days_data] + 1)**0.5 *
            sigma_obs,  # +1 and tt.abs to avoid nans
            observed=new_cases_obs,
        )

        # add these observables to the model so we can extract a time series of them
        # later via e.g. `model.trace['lambda_t']`
        pm.Deterministic("lambda_t", lambda_t)
        pm.Deterministic("new_cases", new_cases_inferred)
    return model
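A hypothetical call to the function above; new_cases stands in for an observed daily case-count array, and the two change points only illustrate the expected dict structure (every other prior falls back to its default):

import datetime
import numpy as np

new_cases = np.array([2, 5, 8, 15, 25, 40, 66, 95, 140, 200,
                      250, 310, 400, 480, 550, 600, 630, 650, 660, 670])

change_points = [
    dict(pr_mean_date_begin_transient=datetime.datetime(2020, 3, 9),
         pr_median_lambda=0.2),
    dict(pr_mean_date_begin_transient=datetime.datetime(2020, 3, 16),
         pr_median_lambda=0.1),
]

model = SIR_with_change_points(S_begin_beta=83e6,
                               I_begin_beta=100.,
                               new_cases_obs=new_cases,
                               change_points_list=change_points,
                               date_begin_simulation=datetime.datetime(2020, 3, 1),
                               num_days_sim=len(new_cases) + 16,
                               diff_data_sim=16,
                               N=83e6)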
Beispiel #48
0
    max_steps = max_steps_var
else:
    max_steps = 2
l_ans_softmax = AnsPointerLayer(mlstm,
                                num_units=k,
                                max_steps=max_steps,
                                mask_input=l_passage_mask)
if load_previous:
    print('loading previous saved model ...')
    # And load them again later on like this:
    with np.load(save_filename) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(l_ans_softmax, param_values)

if not sequential:
    ans_mask = T.ones((1, 2))
    ans_length = T.constant(2)
else:
    ans_mask = ans_mask_var
    ans_length = ans_length_var

# lasagne.layers.get_output produces a variable for the output of the net
# prediction's shape is (n_batch, max_steps, passage_seq_len)
prediction = lasagne.layers.get_output(l_ans_softmax, deterministic=False)
loss, _ = categorical_crossentropy(prediction, target_var, ans_mask,
                                   ans_length)
cost = loss.mean()
if l2_weight > 0.:
    # apply l2 regularization
    print('apply l2 penalty to all layers, weight: {}'.format(l2_weight))
    l2_penalty = lasagne.regularization.regularize_network_params(
Beispiel #49
0
def conv_cond_concat(x, y):
    """
    concatenate conditioning vector on feature map axis
    """
    return T.concatenate([x, y * T.ones((x.shape[0], y.shape[1], x.shape[2], x.shape[3]))], axis=1)
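A possible usage sketch, following the common pattern of dimshuffling a (batch, n_cond) condition matrix to (batch, n_cond, 1, 1) so it broadcasts over the spatial dimensions; the shapes are illustrative:

import numpy as np
import theano
import theano.tensor as T

x = T.tensor4('x')                       # feature maps, (batch, channels, h, w)
y = T.matrix('y')                        # conditioning vector, (batch, n_cond)
yb = y.dimshuffle(0, 1, 'x', 'x')        # (batch, n_cond, 1, 1), broadcastable over h and w

f = theano.function([x, y], conv_cond_concat(x, yb))
out = f(np.zeros((2, 8, 4, 4), dtype=theano.config.floatX),
        np.ones((2, 3), dtype=theano.config.floatX))
print(out.shape)                         # (2, 11, 4, 4): the condition is tiled over h and w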
Beispiel #50
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pos_dim,
              dep_dim,
              pre_emb,
              pre_emb_dep,
              crf,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4
        if pos_dim:
            n_POS = len(self.id_to_POS)
        if dep_dim:
            n_depN = len(self.id_to_N)
            n_depV = len(self.id_to_V)
        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        probs = T.ivector(name='probs')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')
        if pos_dim:
            pos_ids = T.ivector(name='pos_ids')
        if dep_dim:
            N_ids = T.ivector(name='N_ids')
            V_ids = T.ivector(name='V_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))
        if pos_dim:
            input_dim += pos_dim
            pos_layer = EmbeddingLayer(n_POS, pos_dim, name='pos_layer')
            inputs.append(pos_layer.link(pos_ids))

        if dep_dim:
            input_dim += dep_dim * 2
            print 'Dependency vocabulary sizes: N = %i, V = %i' % (n_depN, n_depV)
            dep_layer_N = EmbeddingLayer(n_depN, dep_dim, name='dep_layer_N')
            dep_layer_V = EmbeddingLayer(n_depV, dep_dim, name='dep_layer_V')
            dep_input_N = dep_layer_N.link(N_ids)
            dep_input_V = dep_layer_V.link(V_ids)
            inputs.append(dep_input_N)
            inputs.append(dep_input_V)
            # Initialize with pretrained embeddings
            if pre_emb_dep and training:
                new_weights_N = dep_layer_N.embeddings.get_value()
                new_weights_V = dep_layer_V.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb_dep
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb_dep, 'r',
                                                     'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == dep_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_depN):
                    word = self.id_to_N[i]
                    if word in pretrained:
                        new_weights_N[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights_N[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights_N[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                dep_layer_N.embeddings.set_value(new_weights_N)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained dep embeddings.') % (
                        c_found + c_lower + c_zeros, n_depN, 100. *
                        (c_found + c_lower + c_zeros) / n_depN)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)
                c_found = 0
                c_lower = 0
                c_zeros = 0
                for i in xrange(n_depV):
                    word = self.id_to_V[i]
                    if word in pretrained:
                        new_weights_V[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights_V[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights_V[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                dep_layer_V.embeddings.set_value(new_weights_V)
                print 'Loaded %i pretrained dep embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained dep embeddings.') % (
                        c_found + c_lower + c_zeros, n_depV, 100. *
                        (c_found + c_lower + c_zeros) / n_depV)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)
        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)
        else:
            inputs = inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
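            # At test time, rescale by (1 - dropout) so the expected input
            # magnitude matches what the layer received during training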
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
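        # The reverse LSTM runs over the reversed input; its hidden states are
        # flipped back below so both directions align token by token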
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)
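        # Keep the per-tag scores so the evaluation functions below can return
        # them directly, in addition to the CRF-decoded sequence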
        tags_scores_softmax = tags_scores

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
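            # n_tags + 2 to accommodate the start and end symbols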

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)
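            # observations now holds the emission scores for the start token,
            # the sentence tokens and the end token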

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()
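            # Add the transition scores between consecutive (padded) gold tags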

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        if pos_dim:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)
        if dep_dim:
            self.add_component(dep_layer_N)
            self.add_component(dep_layer_V)
            params.extend(dep_layer_N.params)
            params.extend(dep_layer_V.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        if pos_dim:
            eval_inputs.append(pos_ids)
        if dep_dim:
            eval_inputs.append(N_ids)
            eval_inputs.append(V_ids)

        # train_inputs = eval_inputs + [tag_ids, probs]
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
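        # e.g. lr_method = 'sgd-lr_0.005' gives lr_method_name = 'sgd' and
        # lr_method_parameters = {'lr': 0.005}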
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
            f_eval_softmax = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores_softmax,
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
            f_eval_softmax = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores_softmax,
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {}))

        return f_train, f_eval, f_eval_softmax
Beispiel #51
0
    def get_output_for(self, inputs, **kwargs):
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        visual_input = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]
        if self.visual_input_index > 0:
            visual_input = inputs[self.visual_input_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 5*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate, self.W_in_to_ggate],
            axis=1
        )

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate, self.W_hid_to_ggate],
            axis=1
        )

        # Stack biases into a (5*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate, self.b_ggate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the input dot the weight matrices before scanning.
            # W_in_stacked is (n_features, 5*num_units), so input becomes
            # (n_time_steps, n_batch, 5*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # When theano.scan calls step, input_n will be (n_batch, 5*num_units).
        # We define a slicing function that extracts the input to each LSTM gate
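        # e.g. slice_w(x, 1) selects the columns that feed the forget gate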
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(
            input_n,
            cell_previous, hid_previous,
            visual,
            W_hid_stacked, W_in_stacked, b_stacked,
            W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate,
            W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate,
            W_p
        ):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)
            ggate = slice_w(gates, 4)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*W_cell_to_ingate
                forgetgate += cell_previous*W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)
            # ggate gt
            ggate = self.nonlinearity_ggate(ggate)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            st = ggate*self.nonlinearity(cell)

            # zt = T.dot(
            #     self.nonlinearity(
            #         T.dot(visual, W_v_to_attenGate) +
            #         T.dot(
            #             T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'),
            #             T.ones((1, self.video_len))
            #         )
            #     ),
            #     W_h_to_attenGate
            # )[:, :, 0]

            # To avoid an optimization failure when dotting a 3D tensor with a
            # vector, we rewrite e = A.dot(B) as e = A * B.dimshuffle('x', 'x', 0)
            # followed by e = e.sum(axis=2)
            zt_dot_A = self.nonlinearity(
                T.dot(visual, W_v_to_attenGate) +
                T.dot(
                    T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'),
                    T.ones((1, self.video_len))
                )
            )
            zt = zt_dot_A*W_h_to_attenGate.dimshuffle('x', 'x', 0)
            zt = zt.sum(axis=2)

            # vt = T.dot(
            #     self.nonlinearity(
            #         T.dot(
            #             st, W_s_to_attenGate
            #         ) +
            #         T.dot(
            #             hid, W_g_to_attenGate
            #         )
            #     ),
            #     W_h_to_attenGate
            # )

            vt_dot_A = self.nonlinearity(
                T.dot(
                    st, W_s_to_attenGate
                ) +
                T.dot(
                    hid, W_g_to_attenGate
                )
            )
            vt = vt_dot_A*W_h_to_attenGate.dimshuffle('x', 0)
            vt = vt.sum(axis=1)
            vt = vt.dimshuffle(0, 'x')

            alpha_hat_t = self.nonlinearity_attenGate(T.concatenate(
                [zt, vt],
                axis=-1
            ))
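            # alpha_hat_t: attention weights over the visual features plus the
            # sentinel vector st (last position)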
            feature = T.concatenate(
                [visual_input, st.dimshuffle(0, 'x', 1)],
                axis=1
            ).dimshuffle(2, 0, 1)
            c_hat_t = T.sum(alpha_hat_t*feature, axis=-1)
            It = T.dot(
                (c_hat_t.T+hid), W_p
            )
            return [cell, hid, It]

        def step_masked(
            input_n, mask_n,
            cell_previous, hid_previous, It_previous,
            visual,
            W_hid_stacked, W_in_stacked, b_stacked,
            W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate,
            W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate,
            W_p
        ):
            cell, hid, It = step(
                input_n,
                cell_previous, hid_previous,
                visual,
                W_hid_stacked, W_in_stacked, b_stacked,
                W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate,
                W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate,
                W_p
            )

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)
            It = T.switch(mask_n, It, It_previous)
            return [cell, hid, It]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        It_init = T.dot(ones, self.It_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [visual_input, W_hid_stacked]
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]
        else:
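            # Placeholders keep the step function's argument list the same length
            # when the input weights are not needed inside scan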
            non_seqs += [(), ()]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_forgetgate,
                         self.W_cell_to_outgate]
        else:
            non_seqs += [(), (), ()]

        # The attention weight matrices and the output projection W_p are
        # always needed inside the step function
        non_seqs += [self.W_h_to_attenGate, self.W_g_to_attenGate,
                     self.W_v_to_attenGate, self.W_s_to_attenGate, self.W_p]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out, It = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init, It_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function

            cell_out, hid_out, It = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init, It_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        It = It.dimshuffle(1, 0, 2)
        if self.backwards:
            It = It[:, ::-1]
        return It
Beispiel #52
0
    def get_output_for(self, inputs, **kwargs):
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # TLSTM: Define new input
        time_mat = inputs[self.time_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        if self.bn:
            input = self.bn.get_output_for(input)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        # (n_time_steps, n_batch, 1)
        time_input = time_mat.dimshuffle(1, 0, 'x')
        time_seq_len, time_num_batch, _ = time_input.shape
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 6*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,
            self.W_in_to_outgate, self.W_x2_to_tg2, self.W_x1_to_tg1
        ],
                                     axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,
            self.W_hid_to_outgate
        ],
                                      axis=1)

        # Stack biases into a (6*num_units) vector
        b_stacked = T.concatenate([
            self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate,
            self.b2_tg2, self.b1_tg1
        ],
                                  axis=0)

        # W_t1_to_tg1_constraint < 0 (enforced by clipping W_t1_to_tg1 at self.boundary)
        W_t1_to_tg1_constraint = T.switch(
            T.ge(self.W_t1_to_tg1, self.boundary), self.W_t1_to_tg1,
            self.boundary)

        # Stack the delta-time weight matrices into a (num_inputs, 3*num_units) matrix
        W_t_stacked = T.concatenate(
            [self.W_to_to_outgate, self.W_t2_to_tg2, W_t1_to_tg1_constraint],
            axis=1)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the input dot the weight matrices before scanning.
            # W_in_stacked is (n_features, 6*num_units), so input becomes
            # (n_time_steps, n_batch, 6*num_units).
            time_input = T.dot(time_input, W_t_stacked)
            input = T.dot(input, W_in_stacked) + b_stacked

        # When theano.scan calls step, input_n will be (n_batch, 6*num_units)
        # and time_input_n will be (n_batch, 3*num_units).
        # We define a slicing function that extracts the input to each LSTM gate
        def slice_w(x, start, stride=1):
            return x[:,
                     start * self.num_units:(start + stride) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        # TODO: insert Tm_n and weight_t_o_n into mask_n and cell_previous
        def step(input_n, time_input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                time_input_n = T.dot(time_input_n, W_t_stacked)
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            tm_wto_n = slice_w(time_input_n, 0)
            tm_w2_n = slice_w(time_input_n, 1)
            tm_w1_n = slice_w(time_input_n, 2)
            tm_w2_n = self.nonlinearity_inside_tg2(tm_w2_n)
            tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n)
            tm2_xwb_n = slice_w(input_n, 4)
            tm1_xwb_n = slice_w(input_n, 5)
            timegate2 = self.nonlinearity_outside_tg2(tm_w2_n + tm2_xwb_n)
            timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n)
            input_n = slice_w(input_n, 0, 4)
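            # The first 4 blocks of input_n are the standard LSTM gate
            # pre-activations (input, forget, cell, output)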

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                                  self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)
            outgate += tm_wto_n

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * timegate2 * cell_input
            tilde_cell = forgetgate * cell_previous + ingate * timegate1 * cell_input
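            # Two cell candidates: 'cell' (gated by timegate2) is carried to the
            # next step, while 'tilde_cell' (gated by timegate1) drives the output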

            if self.peepholes:
                outgate += tilde_cell * self.W_cell_to_outgate

            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(tilde_cell)
            return [cell, hid]

        def step_masked(input_n, time_input_n, mask_n, cell_previous,
                        hid_previous, *args):

            cell, hid = step(input_n, time_input_n, cell_previous,
                             hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, time_input, mask]
            step_fun = step_masked
        else:
            sequences = [input, time_input]
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]

        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate,
                self.W_cell_to_outgate
            ]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked, W_t_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            sequences=sequences,
                                            outputs_info=[cell_init, hid_init],
                                            go_backwards=self.backwards,
                                            non_sequences=non_seqs,
                                            n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Beispiel #53
0
 def initial_outputs(self, batch_size):
     return self.initial_output * tensor.ones((batch_size, ), dtype='int64')
Beispiel #54
0
 def generate(self, chars):
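     # Run the generator for 3 * input-length steps, attending over the embedded input characters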
     return self.generator.generate(n_steps=3 * chars.shape[0],
                                    batch_size=chars.shape[1],
                                    attended=self.lookup.apply(chars),
                                    attended_mask=tensor.ones(chars.shape))
Beispiel #55
0
    def build(
            self,
            dropout,
            ortho_char_input_dim,  # Should be inferred from the input
            ortho_char_dim,
            ortho_char_lstm_dim,
            char_bidirect,
            word_vec_input_dim,  # Should be inferred from the input wvecs
            word_dim,  # The vector size after projection of the input vector
            word_lstm_dim,
            word_bidirect,
            lr_method,
            crf,
            use_type_sparse_feats,
            type_sparse_feats_input_dim,  # Can be inferred from the output of the feature extractors
            type_sparse_feats_proj_dim,  # This is a hyper-parameter
            use_token_sparse_feats,
            token_sparse_feats_input_dim,  # Can be inferred from the output of the feature extractors
            # token_sparse_feats_proj_dim,  # This is a hyper-parameter
            use_ortho_attention,
            use_phono_attention,
            # use_convolution,
            phono_char_input_dim,  # Can be inferred
            phono_char_dim,
            phono_char_lstm_dim,
            training=True,
            **kwargs):
        """
        Build the network.
        """
        assert word_dim or phono_char_dim or ortho_char_dim, "No input selected while building the network!"
        # Training parameters
        n_tags = len(self.id_to_tag)

        # Network variables
        is_train = T.iscalar('is_train')
        word_vecs = T.dmatrix(
            name="word_vecs")  # A vector for each word in the sentence
        #  => matrix: (len_sent, w_emb_dim)
        ortho_char_for_vecs = T.dtensor3(
            name="ortho_char_for_vecs"
        )  # For each char of each word in the sentence, a char vector
        # ortho_char_for_vecs = T.ftensor3(name="ortho_char_for_vecs")
        # => tensor of form: (len_sent, max_wchar_len, char_emb_dim)
        ortho_char_rev_vecs = T.dtensor3(name="ortho_char_rev_vecs")
        # ortho_char_rev_vecs = T.ftensor3(name="ortho_char_rev_vecs")
        # For each char of each word in the sentence, a char vector
        # => tensor of form: (len_sent, max_wchar_len, char_emb_dim)
        phono_char_for_vecs = T.dtensor3(name="phono_char_for_vecs")
        # phono_char_for_vecs = T.ftensor3(name="phono_char_for_vecs")
        # For each char of each word in the sentence, a char vector
        # => tensor of form: (len_sent, max_ortho_char_len, char_emb_dim)
        phono_char_rev_vecs = T.dtensor3(name="phono_char_rev_vecs")
        # phono_char_rev_vecs = T.ftensor3(name="phono_char_rev_vecs")
        # For each char of each word in the sentence, a char vector
        # => tensor of form: (len_sent, max_phono_char_len, char_emb_dim)
        ortho_char_pos_ids = T.ivector(name='ortho_char_pos_ids')
        # The word len for each word in the sentence => vect of form: (len_sent,)
        phono_char_pos_ids = T.ivector(name='phono_char_pos_ids')
        # The word len for each word in the sentence => vect of form: (len_sent,)
        type_sparse_feats = T.imatrix(name="type_sparse_feats")
        # Type sparse features are appended to the input to the word lstm
        # For each word, a vector of type level sparse feats => mat of form: (len_sent, type_sparse_dim)
        token_sparse_feats = T.imatrix(name="token_sparse_feats")
        # Token sparse features are appended to the pre-crf layer
        # For each word, a vector of token level sparse feats => mat of form: (len_sent, token_sparse_dim)

        tag_ids = T.ivector(name='tag_ids')
        # The tag id for each word in the sentence => vect of form: (len_sent,)

        # Sentence length
        s_len = (word_vecs if word_dim else ortho_char_pos_ids
                 if ortho_char_dim else phono_char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = HiddenLayer(word_vec_input_dim,
                                     word_dim,
                                     activation="tanh",
                                     name="word_emb_proj")
            # TO DO : Try not using the bias term in the hidden layer
            word_input = word_layer.link(word_vecs)
            inputs.append(word_input)

        #
        # Chars inputs
        #
        if ortho_char_dim:
            input_dim += ortho_char_lstm_dim
            ortho_char_layer = HiddenLayer(ortho_char_input_dim,
                                           ortho_char_dim,
                                           activation="tanh",
                                           name="ortho_char_emb_proj")
            # TO DO : Try not using bias in the hidden layer
            ortho_char_lstm_for = LSTM(ortho_char_dim,
                                       ortho_char_lstm_dim,
                                       with_batch=True,
                                       name='ortho_char_lstm_for')
            ortho_char_lstm_rev = LSTM(ortho_char_dim,
                                       ortho_char_lstm_dim,
                                       with_batch=True,
                                       name='ortho_char_lstm_rev')
            ortho_char_lstm_for.link(
                ortho_char_layer.link(ortho_char_for_vecs))
            ortho_char_lstm_rev.link(
                ortho_char_layer.link(ortho_char_rev_vecs))

            ortho_char_for_output = ortho_char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids]
            ortho_char_rev_output = ortho_char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids]

            inputs.append(ortho_char_for_output)
            if char_bidirect:
                inputs.append(ortho_char_rev_output)
                input_dim += ortho_char_lstm_dim

        if phono_char_dim:
            input_dim += phono_char_lstm_dim
            phono_char_layer = HiddenLayer(phono_char_input_dim,
                                           phono_char_dim,
                                           activation="tanh",
                                           name="phono_char_emb_proj")
            # TO DO : Try not using bias in the hidden layer
            phono_char_lstm_for = LSTM(phono_char_dim,
                                       phono_char_lstm_dim,
                                       with_batch=True,
                                       name='phono_char_lstm_for')
            phono_char_lstm_rev = LSTM(phono_char_dim,
                                       phono_char_lstm_dim,
                                       with_batch=True,
                                       name='phono_char_lstm_rev')

            phono_char_lstm_for.link(
                phono_char_layer.link(phono_char_for_vecs))
            phono_char_lstm_rev.link(
                phono_char_layer.link(phono_char_rev_vecs))

            phono_char_for_output = phono_char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), phono_char_pos_ids]
            phono_char_rev_output = phono_char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), phono_char_pos_ids]

            inputs.append(phono_char_for_output)
            if char_bidirect:
                inputs.append(phono_char_rev_output)
                input_dim += phono_char_lstm_dim

        # Type level sparse feats
        #
        if use_type_sparse_feats:
            input_dim += type_sparse_feats_proj_dim
            type_level_sparse_layer = HiddenLayer(
                type_sparse_feats_input_dim,
                type_sparse_feats_proj_dim,
                activation="tanh",
                name='type_level_sparse_layer')
            # TO DO : Try not using the hidden layer here
            inputs.append(type_level_sparse_layer.link(type_sparse_feats))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)
            # TO DO : If using type sparse features, then apply hidden layer after concatenating all inputs
        else:
            inputs = inputs[0]
        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            """
            Drop out involves sampling a vector of bernoulli random variables with a parameter 1-p and using it as a mask
            So, the expected value of the dropped out input is p * (0*x) + (1-p) * (1*x) = (1-p) * x. Since biases will
            on average respond to the expected input value, at test time we multiply test inputs (1-p) to supply the
            expected test input instead.
            """
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        lstm_outputs = [word_for_output]
        post_word_lstm_output_size = word_lstm_dim
        if use_token_sparse_feats:
            # token_level_sparse_layer = HiddenLayer(token_sparse_feats_input_dim, token_sparse_feats_proj_dim,
            #                                       activation="tanh",
            #                                       name='token_level_sparse_layer')
            # # TO DO : Try not using the hidden layer here
            # lstm_outputs.append(token_level_sparse_layer.link(token_sparse_feats))
            # post_word_lstm_output_size += token_sparse_feats_proj_dim
            lstm_outputs.append(token_sparse_feats)
            post_word_lstm_output_size += token_sparse_feats_input_dim
        if word_bidirect:
            lstm_outputs.append(word_rev_output)
            post_word_lstm_output_size += word_lstm_dim

        if len(lstm_outputs) > 1:
            final_output = T.concatenate(lstm_outputs, axis=1)
            tanh_layer = HiddenLayer(post_word_lstm_output_size,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)

        else:
            final_output = word_for_output

        final_pre_crf_input_size = word_lstm_dim
        attention_vectors = []
        attention_vector_size = 0
        if use_ortho_attention and ortho_char_dim:
            # final_ortho_attention_input_layer = HiddenLayer(post_word_lstm_output_size, ortho_char_lstm_dim,
            #                                   name='final_ortho_attention_input_layer', activation='tanh')
            final_ortho_attention_input_layer = HiddenLayer(
                word_lstm_dim,
                ortho_char_lstm_dim,
                name='final_ortho_attention_input_layer',
                activation='tanh')
            final_ortho_attention_input = final_ortho_attention_input_layer.link(
                final_output)
            # Evaluating attentional vector using a linear projection from final_output since the attention vector
            # must be conditioned on it and dimension must match the char lstm hidden dim.
            ortho_for_attention = self.get_TDAttention_vector(
                final_ortho_attention_input,
                ortho_char_lstm_for.h.dimshuffle((1, 0, 2)),
                ortho_char_pos_ids)
            if char_bidirect:
                ortho_rev_attention = self.get_TDAttention_vector(
                    final_ortho_attention_input,
                    ortho_char_lstm_rev.h.dimshuffle((1, 0, 2)),
                    ortho_char_pos_ids)
                attention_vectors.append(ortho_rev_attention)
                attention_vector_size += ortho_char_lstm_dim
            attention_vectors.append(ortho_for_attention)
            attention_vector_size += ortho_char_lstm_dim
        if use_phono_attention and phono_char_dim:
            # final_phono_attention_input_layer = HiddenLayer(post_word_lstm_output_size, phono_char_lstm_dim,
            #                                               name='final_phono_attention_input_layer', activation='tanh')
            final_phono_attention_input_layer = HiddenLayer(
                word_lstm_dim,
                phono_char_lstm_dim,
                name='final_phono_attention_input_layer',
                activation='tanh')
            # Evaluating attentional vector using a linear projection from final_output since the attention vector
            # must be conditioned on it and dimension must match the char lstm hidden dim.
            final_phono_attention_input = final_phono_attention_input_layer.link(
                final_output)
            phono_for_attention = self.get_TDAttention_vector(
                final_phono_attention_input,
                phono_char_lstm_for.h.dimshuffle((1, 0, 2)),
                phono_char_pos_ids)
            if char_bidirect:
                phono_rev_attention = self.get_TDAttention_vector(
                    final_phono_attention_input,
                    phono_char_lstm_rev.h.dimshuffle((1, 0, 2)),
                    phono_char_pos_ids)
                attention_vectors.append(phono_rev_attention)
                attention_vector_size += phono_char_lstm_dim
            attention_vectors.append(phono_for_attention)
            attention_vector_size += phono_char_lstm_dim
        if len(attention_vectors) > 1:
            attention_vectors = T.concatenate(attention_vectors, axis=1)

        if use_phono_attention or use_ortho_attention:
            final_output = T.concatenate([final_output, attention_vectors],
                                         axis=1)
            post_word_lstm_output_size += attention_vector_size
            final_pre_crf_input_size += attention_vector_size

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(final_pre_crf_input_size,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
            # n_tags + 2 to accommodate start and end symbols

            small = -1000  # acts as log(0) = -inf
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            # The start position emits the start symbol with score log(1) = 0 and
            # any other tag with score log(0) = small
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            # The end position emits the end symbol with score log(1) = 0 and
            # any other tag with score log(0) = small
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            # observations is the emission energy (-log potential) between each token and each tag.
            # Emission score of intermediate words towards start and end tags is -log(inf)

            observations = T.concatenate([b_s, observations, e_s], axis=0)
            # observations now contains the emission energies for start token, sentence tokens and end token

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
            # Sum of energies associated with the gold tags

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()
            # Transition scores from label_i to label_{i+1}

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if ortho_char_dim:
            self.add_component(ortho_char_layer)
            self.add_component(ortho_char_lstm_for)
            params.extend(ortho_char_layer.params)
            params.extend(ortho_char_lstm_for.params)
            if char_bidirect:
                self.add_component(ortho_char_lstm_rev)
                params.extend(ortho_char_lstm_rev.params)

        if phono_char_dim:
            self.add_component(phono_char_layer)
            self.add_component(phono_char_lstm_for)
            params.extend(phono_char_layer.params)
            params.extend(phono_char_lstm_for.params)
            if char_bidirect:
                self.add_component(phono_char_lstm_rev)
                params.extend(phono_char_lstm_rev.params)

        if use_type_sparse_feats:
            self.add_component(type_level_sparse_layer)
            params.extend(type_level_sparse_layer.params)

        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)

        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)

        if word_bidirect or len(lstm_outputs) > 1:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        if use_ortho_attention and ortho_char_dim:
            self.add_component(final_ortho_attention_input_layer)
            params.extend(final_ortho_attention_input_layer.params)
        if use_phono_attention and phono_char_dim:
            self.add_component(final_phono_attention_input_layer)
            params.extend(final_phono_attention_input_layer.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            # eval_inputs.append(word_ids)
            eval_inputs.append(word_vecs)
        if ortho_char_dim:
            # eval_inputs.append(char_for_ids)
            eval_inputs.append(ortho_char_for_vecs)
            if char_bidirect:
                # eval_inputs.append(char_rev_ids)
                eval_inputs.append(ortho_char_rev_vecs)
            eval_inputs.append(ortho_char_pos_ids)
        if phono_char_dim:
            # eval_inputs.append(char_for_ids)
            eval_inputs.append(phono_char_for_vecs)
            if char_bidirect:
                # eval_inputs.append(char_rev_ids)
                eval_inputs.append(phono_char_rev_vecs)
            eval_inputs.append(phono_char_pos_ids)

        if use_type_sparse_feats:
            eval_inputs.append(type_sparse_feats)
        if use_token_sparse_feats:
            eval_inputs.append(token_sparse_feats)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        print("Finished Compiling")
        return f_train, f_eval
Beispiel #56
0
    def __init__(self,
                 n_out,
                 collapse_output=False,
                 directions=4,
                 projection='average',
                 base=None,
                 **kwargs):
        if base is None:
            base = []
        super(TwoDLSTMLayer, self).__init__(n_out, **kwargs)
        assert len(self.sources) == 1
        source = self.sources[0]
        n_in = source.attrs['n_out']
        X = source.output
        assert X.ndim == 4
        sizes = source.output_sizes
        self.output_sizes = sizes
        assert directions in [1, 2,
                              4], "only 1, 2 or 4 directions are supported"
        assert projection in ['average', 'concat'], "invalid projection"

        if base:
            self.b1 = self.add_param(base[0].b1)
            self.b2 = self.add_param(base[0].b2)
            if directions >= 1:
                self.b3 = self.add_param(base[0].b3)
                self.b4 = self.add_param(base[0].b4)
            self.W1, self.V_h1, self.V_v1 = self.add_param(
                base[0].W1), self.add_param(base[0].V_h1), self.add_param(
                    base[0].V_v1)
            self.W2, self.V_h2, self.V_v2 = self.add_param(
                base[0].W2), self.add_param(base[0].V_h2), self.add_param(
                    base[0].V_v2)
            if directions >= 1:
                self.W3, self.V_h3, self.V_v3 = self.add_param(
                    base[0].W3), self.add_param(base[0].V_h3), self.add_param(
                        base[0].V_v3)
                self.W4, self.V_h4, self.V_v4 = self.add_param(
                    base[0].W4), self.add_param(base[0].V_h4), self.add_param(
                        base[0].V_v4)
            #self.mass = base[0].mass
            #self.masks = base[0].masks
            #self.b1 = base[0].b1
            #self.b2 = base[0].b2
            #if directions >= 1:
            #  self.b3 = base[0].b3
            #  self.b4 = base[0].b4
            #self.W1, self.V_h1, self.V_v1 = base[0].W1, base[0].V_h1, base[0].V_v1
            #self.W2, self.V_h2, self.V_v2 = base[0].W2, base[0].V_h2, base[0].V_v2
            #if directions >= 1:
            #  self.W3, self.V_h3, self.V_v3 = base[0].W3, base[0].V_h3, base[0].V_v3
            #  self.W4, self.V_h4, self.V_v4 = base[0].W4, base[0].V_h4, base[0].V_v4
            self.mass = base[0].mass
            self.masks = base[0].masks
        else:
            self.b1 = self.create_and_add_bias(n_out, "1")
            self.b2 = self.create_and_add_bias(n_out, "2")
            if directions >= 1:
                self.b3 = self.create_and_add_bias(n_out, "3")
                self.b4 = self.create_and_add_bias(n_out, "4")

            self.W1, self.V_h1, self.V_v1 = self.create_and_add_2d_lstm_weights(
                n_in, n_out, "1")
            self.W2, self.V_h2, self.V_v2 = self.create_and_add_2d_lstm_weights(
                n_in, n_out, "2")
            if directions >= 1:
                self.W3, self.V_h3, self.V_v3 = self.create_and_add_2d_lstm_weights(
                    n_in, n_out, "3")
                self.W4, self.V_h4, self.V_v4 = self.create_and_add_2d_lstm_weights(
                    n_in, n_out, "4")

        # dropout
        assert len(self.masks) == 1
        mask = self.masks[0]
        if mask is not None:
            X = self.mass * mask * X

        if str(theano.config.device).startswith('cpu'):
            # the custom 2D-LSTM ops below are not used on CPU; produce a
            # zero placeholder output of the right shape instead
            Y = T.zeros_like(X)
            if projection == 'concat':
                Y = Y.repeat(directions, axis=-1)
                n_out *= directions
        else:
            if directions <= 2:
                Y = BidirectionalTwoDLSTMOpInstance(X, self.W1, self.W2,
                                                    self.V_h1, self.V_h2,
                                                    self.V_v1, self.V_v2,
                                                    self.b1, self.b2, sizes)
            else:
                Y = MultiDirectionalTwoDLSTMOpInstance(
                    X, self.W1, self.W2, self.W3, self.W4, self.V_h1,
                    self.V_h2, self.V_h3, self.V_h4, self.V_v1, self.V_v2,
                    self.V_v3, self.V_v4, self.b1, self.b2, self.b3, self.b4,
                    sizes)

            if directions > 1:
                Y = T.stack(Y[:directions], axis=-1)
                if projection == 'average':
                    Y = Y.mean(axis=-1)
                elif projection == 'concat':
                    Y = Y.reshape((Y.shape[0], Y.shape[1], Y.shape[2],
                                   Y.shape[3] * Y.shape[4]))
                    n_out *= directions
            else:
                Y = Y[0]

        Y.name = 'Y'
        self.set_attr('n_out', n_out)
        self.set_attr('collapse_output', collapse_output)
        self.set_attr('directions', directions)
        self.set_attr('projection', projection)

        #index handling
        def index_fn(index, size):
            return T.set_subtensor(index[:size], numpy.cast['int8'](1))

        index_init = T.zeros((Y.shape[2], Y.shape[1]), dtype='int8')
        self.index, _ = theano.scan(
            index_fn, [index_init, T.cast(sizes[:, 1], "int32")])
        self.index = self.index.dimshuffle(1, 0)

        if collapse_output == 'sum' or collapse_output == True:
            Y = Y.sum(axis=0)
        elif collapse_output == 'mean':
            Y = Y.mean(axis=0)
        elif collapse_output == 'conv':
            from TheanoUtil import circular_convolution
            Y, _ = theano.scan(lambda x_i, x_p: circular_convolution(x_i, x_p),
                               Y, Y[0])
            Y = Y[-1]
        elif collapse_output == 'flatten':
            self.index = T.ones((Y.shape[0] * Y.shape[1], Y.shape[2]),
                                dtype='int8')
            Y = Y.reshape((Y.shape[0] * Y.shape[1], Y.shape[2], Y.shape[3]))
        elif str(collapse_output).startswith('pad_'):
            pad = numpy.int32(collapse_output.split('_')[-1])
            Y = ifelse(
                T.lt(Y.shape[0], pad),
                T.concatenate([
                    Y,
                    T.zeros(
                        (pad - Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3]),
                        'float32')
                ],
                              axis=0), ifelse(T.gt(Y.shape[0], pad), Y[:pad],
                                              Y))
            Y = Y.dimshuffle(1, 2, 3, 0).reshape(
                (Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[0]))
            self.attrs['n_out'] *= pad
        elif collapse_output != False:
            assert False, "invalid collapse mode"

        if self.attrs['batch_norm']:
            Y = self.batch_norm(
                Y,
                self.attrs['n_out'],
                index=sizes if not collapse_output else self.index,
                force_sample=False)

        self.output = Y
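
# The index handling above builds a 0/1 mask from the per-sequence sizes via
# theano.scan and T.set_subtensor.  A standalone sketch of that pattern with
# made-up sizes (not the layer's actual tensors):
import numpy
import theano
import theano.tensor as T

seq_lens = T.ivector('seq_lens')  # one valid length per sequence in the batch
max_len = T.iscalar('max_len')    # padded length

def index_fn(index_row, size):
    # mark the first `size` positions of this row as valid
    return T.set_subtensor(index_row[:size], numpy.cast['int8'](1))

index_init = T.zeros((seq_lens.shape[0], max_len), dtype='int8')
index, _ = theano.scan(index_fn, sequences=[index_init, seq_lens])
mask = index.dimshuffle(1, 0)     # (max_len, batch), as used for self.index

f = theano.function([seq_lens, max_len], mask)
print(f(numpy.array([2, 4, 1], dtype='int32'), numpy.int32(5)))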
Example #57
0
def broadcast_vec(x, n):
    form = TT.ones((n, 1))
    return TT.dot(form, x)
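
# A quick usage check of broadcast_vec, assuming TT is theano.tensor and x is
# a 1 x d row vector: dot(ones((n, 1)), x) repeats the row n times.
import numpy as np
import theano
import theano.tensor as TT

x = TT.matrix('x')            # expected shape (1, d)
tiled = broadcast_vec(x, 3)   # shape (3, d)
f = theano.function([x], tiled)
print(f(np.array([[1., 2., 5.]], dtype=theano.config.floatX)))
# three identical rows [1. 2. 5.]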
Example #58
0
elif objective_flag == 'argmax':
    # argmax approximation
    cla_out_y_hard = cla_out_y.argmax(axis=1)
    dis_out_p_c = ll.get_output(dis_layers[-1],
                                {dis_in_x: sym_x_u, dis_in_y: cla_out_y_hard},
                                deterministic=False)
else:
    raise Exception('Unknown objective flags')

image = ll.get_output(gen_layers[-1],
                      {gen_in_y: sym_y_g, gen_in_z: sym_z_image},
                      deterministic=False)  # for generation

accuracy_eval = lasagne.objectives.categorical_accuracy(cla_out_y_eval, sym_y)  # for evaluation
accuracy_eval = accuracy_eval.mean()

# costs
bce = lasagne.objectives.binary_crossentropy

dis_cost_p = bce(dis_out_p, T.ones(dis_out_p.shape)).mean() # D distinguishes p
dis_cost_p_g = bce(dis_out_p_g, T.zeros(dis_out_p_g.shape)).mean() # D distinguishes p_g
gen_cost_p_g = bce(dis_out_p_g, T.ones(dis_out_p_g.shape)).mean() # G fools D

weight_decay_classifier = lasagne.regularization.regularize_layer_params_weighted(
    {cla_layers[-1]: 1}, lasagne.regularization.l2)  # weight decay

dis_cost_p_c = bce(dis_out_p_c, T.zeros(dis_out_p_c.shape)) # D distinguishes p_c
cla_cost_p_c = bce(dis_out_p_c, T.ones(dis_out_p_c.shape)) # C fools D

if objective_flag == 'integrate':
    # integrate
    weight_loss_c = T.reshape(cla_cost_p_c, (-1, num_classes)) * cla_out_y
    cla_cost_p_c = T.sum(weight_loss_c, axis=1).mean()
    weight_loss_d = T.reshape(dis_cost_p_c, (-1, num_classes)) * cla_out_y
    dis_cost_p_c = T.sum(weight_loss_d, axis=1).mean()
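    # The two costs above are per (sample, class) entries: dis_out_p_c
    # presumably holds one discriminator output for every candidate class of
    # each unlabeled sample.  Reshaping to (batch, num_classes) and weighting
    # by the classifier probabilities cla_out_y before summing over the class
    # axis therefore takes the expected adversarial loss under the
    # classifier's predicted distribution, and .mean() averages over the batch.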
elif objective_flag == 'argmax':
    def reg_EPhi(self, lengthscale_trf, lengthscale_p_trf, sf_trf, S, MU,
                 SIGMA_trf, U, b, N, M, i, D, order, non_rec):

        # Shapes of the arguments:
        #   lengthscale_trf    D[i]
        #   lengthscale_p_trf  D[i]
        #   sf_trf             1
        #   S                  M x D[i]
        #   MU                 N x D[i]
        #   SIGMA_trf          N x D[i]
        #   U                  M x D[i]
        #   b                  M
        #   N                  1
        #   M                  1

        b = T.zeros(T.shape(b))
        MU_S = T.zeros(T.shape(S))
        SIGMA_S_trf = T.ones(T.shape(S))

        inv_SIGMA_trf = SIGMA_trf**-1  # N x D[i]
        MU_S_hat = lengthscale_trf**-1 * MU_S + 2 * np.pi * lengthscale_p_trf**-1  # M x D[i]

        MU_S_hat_U_b = -(MU_S_hat * U).sum(1)[None, :] + b  # M x M
        big_sum_minus = MU_S_hat_U_b - MU_S_hat_U_b.T  # M x M
        big_sum_plus = MU_S_hat_U_b + MU_S_hat_U_b.T  # M x M
        MU_S_hat_minus = MU_S_hat[
            None, :, :] - MU_S_hat[:, None, :]  # M x M x D[i]
        MU_S_hat_plus = MU_S_hat[
            None, :, :] + MU_S_hat[:, None, :]  # M x M x D[i]

        u_EEPhiTPhi = (U[None, :, :] - U[:, None, :])**2  # M x M x D[i]
        b_bold_denomi = SIGMA_S_trf[
            None, :, :] + SIGMA_S_trf[:, None, :]  # M x M x D[i]
        sum_SIGMA_S_U = SIGMA_S_trf * U  # M x D[i]
        b_bold = (sum_SIGMA_S_U[None, :, :] +
                  sum_SIGMA_S_U[:, None, :]) / b_bold_denomi  # M x M x D[i]
        B = (lengthscale_trf**2)[None, None, :] / b_bold_denomi  # M x M x D[i]
        inv_B = 1 / B  # M x M x D[i]
        U_EEPhiTPhi = (lengthscale_trf**2)[None, None, :] * (
            SIGMA_S_trf[None, :, :]**-1 + SIGMA_S_trf[:, None, :]**-1
        )  # M x M x D[i]
        norm_EEPhiTPhi_U_temp = lengthscale_trf[None, None, :]**2 / (
            (SIGMA_S_trf[None, :, :] * SIGMA_S_trf[:, None, :]) *
            U_EEPhiTPhi)**0.5  # M x M x D[i]
        Z_n_U_EEPhiTPhi = T.exp(-0.5 *
                                (u_EEPhiTPhi / U_EEPhiTPhi).sum(2))  # M x M
        inv_B_b_bold = inv_B * b_bold  # M x M x D[i]
        inv_SIGMA_trf_MU = inv_SIGMA_trf * MU  # N x D[i]

        EPhiTPhi = np.zeros((M, M))
        loop = np.int64(-1)

        def EPhiTPhi_loop_i0(loop, EPhiTPhi, non_rec, D, order, MU, SIGMA_trf,
                             inv_SIGMA_trf, inv_SIGMA_trf_MU, inv_B, b_bold,
                             inv_B_b_bold, B, MU_S_hat_minus, MU_S_hat_plus,
                             big_sum_minus, big_sum_plus,
                             norm_EEPhiTPhi_U_temp):
            loop = loop + 1
            D_n = (inv_B +
                   inv_SIGMA_trf[loop, :][None, None, :])**-1  # M x M x D[i]
            if non_rec == 0:
                d_n = D_n[:, :, D - order:D] * (
                    inv_B_b_bold[:, :, D - order:D] +
                    inv_SIGMA_trf_MU[loop, :][None, None, D - order:D]
                )  # M x M x order
                d_n = T.concatenate(
                    (MU[loop, :][0:D - order][None, None, :] +
                     T.zeros_like(inv_B[:, :, 0:D - order]), d_n),
                    axis=2)  # M x M x D[i]
            else:
                d_n = MU[loop, :][None, None, :] + T.zeros_like(
                    inv_B)  # M x M x D[i]
            W = B + SIGMA_trf[loop, :][None, None, :]  # M x M x D[i]
            # M x M; det(U) and det(W) are folded into this per-dimension
            # product for numerical stability (computing prod(2) separately
            # blows up for high input dimensions)
            norm_EEPhiTPhi_U_W = (norm_EEPhiTPhi_U_temp / W**0.5).prod(2)
            Z_n_W = T.exp(
                -0.5 *
                ((b_bold - MU[loop, :][None, None, :])**2 / W).sum(2))  # M x M
            EPhiTPhi = EPhiTPhi + Z_n_W * norm_EEPhiTPhi_U_W * (
                T.exp(-0.5 * (MU_S_hat_minus**2 * D_n).sum(2)) * T.cos(
                    (MU_S_hat_minus * d_n).sum(2) + big_sum_minus) +
                T.exp(-0.5 * (MU_S_hat_plus**2 * D_n).sum(2)) * T.cos(
                    (MU_S_hat_plus * d_n).sum(2) + big_sum_plus))  # M x M
            return loop, EPhiTPhi

        def EPhiTPhi_loop_i(loop, EPhiTPhi, order, MU, SIGMA_trf,
                            inv_SIGMA_trf, inv_SIGMA_trf_MU, inv_B, b_bold,
                            inv_B_b_bold, B, MU_S_hat_minus, MU_S_hat_plus,
                            big_sum_minus, big_sum_plus,
                            norm_EEPhiTPhi_U_temp):
            loop = loop + 1
            D_n = (inv_B +
                   inv_SIGMA_trf[loop, :][None, None, :])**-1  # M x M x D[i]
            d_n = D_n * (
                inv_B_b_bold + inv_SIGMA_trf_MU[loop, :][None, None, :]
            )  # M x M x D[i]
            W = B + SIGMA_trf[loop, :][None, None, :]  # M x M x D[i]
            # M x M; det(U) and det(W) are folded into this per-dimension
            # product for numerical stability (computing prod(2) separately
            # blows up for high input dimensions)
            norm_EEPhiTPhi_U_W = (norm_EEPhiTPhi_U_temp / W**0.5).prod(2)
            Z_n_W = T.exp(
                -0.5 *
                ((b_bold - MU[loop, :][None, None, :])**2 / W).sum(2))  # M x M
            EPhiTPhi = EPhiTPhi + Z_n_W * norm_EEPhiTPhi_U_W * (
                T.exp(-0.5 * (MU_S_hat_minus**2 * D_n).sum(2)) * T.cos(
                    (MU_S_hat_minus * d_n).sum(2) + big_sum_minus) +
                T.exp(-0.5 * (MU_S_hat_plus**2 * D_n).sum(2)) * T.cos(
                    (MU_S_hat_plus * d_n).sum(2) + big_sum_plus))  # M x M
            return loop, EPhiTPhi

        if i == 0:
            result, _ = theano.scan(EPhiTPhi_loop_i0,
                                    outputs_info=[loop, EPhiTPhi],
                                    n_steps=N,
                                    non_sequences=[
                                        non_rec, D, order, MU, SIGMA_trf,
                                        inv_SIGMA_trf, inv_SIGMA_trf_MU, inv_B,
                                        b_bold, inv_B_b_bold, B,
                                        MU_S_hat_minus, MU_S_hat_plus,
                                        big_sum_minus, big_sum_plus,
                                        norm_EEPhiTPhi_U_temp
                                    ])
        else:
            result, _ = theano.scan(EPhiTPhi_loop_i,
                                    outputs_info=[loop, EPhiTPhi],
                                    n_steps=N,
                                    non_sequences=[
                                        order, MU, SIGMA_trf, inv_SIGMA_trf,
                                        inv_SIGMA_trf_MU, inv_B, b_bold,
                                        inv_B_b_bold, B, MU_S_hat_minus,
                                        MU_S_hat_plus, big_sum_minus,
                                        big_sum_plus, norm_EEPhiTPhi_U_temp
                                    ])

        EPhiTPhi_out = result[-1][-1]  # M x M

        reg_EEPhiTPhi = (sf_trf**2 /
                         2) * Z_n_U_EEPhiTPhi * EPhiTPhi_out  # M x M

        return reg_EEPhiTPhi
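
# The two loop bodies above accumulate an M x M matrix over the N data points
# with theano.scan, carrying a step counter and the running sum in
# outputs_info.  A stripped-down sketch of the same counter/accumulator
# pattern with a toy per-step term (an outer product, not the GP quantities
# computed above):
import numpy as np
import theano
import theano.tensor as T

MU_demo = T.matrix('MU_demo')   # N x D, one row consumed per scan step

def acc_step(loop, acc, MU):
    loop = loop + 1
    contrib = T.outer(MU[loop, :], MU[loop, :])  # D x D term for data point `loop`
    return loop, acc + contrib

result, _ = theano.scan(acc_step,
                        outputs_info=[np.int64(-1),
                                      T.zeros((MU_demo.shape[1],
                                               MU_demo.shape[1]))],
                        non_sequences=[MU_demo],
                        n_steps=MU_demo.shape[0])
total = result[-1][-1]          # sum_n outer(MU_demo[n], MU_demo[n])

f_acc = theano.function([MU_demo], total)
print(f_acc(np.ones((5, 3), dtype=theano.config.floatX)))  # 5 * ones((3, 3))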

def op(x, a, b):
    # scan step function (signature inferred from the scan call below): x is
    # the current sequence element and is not used; a and b are the previous
    # values of the two recurrent outputs
    s = []
    oa = T.sum(a)
    ob = T.sum(b)
    o = oa + ob

    # store all the outputs in a list to return them to the scan function
    s.append(o)
    s.append(a)
    s.append(b)

    return s


s = []
s.append(None)
s.append(dict(initial=T.ones(3)))
s.append(dict(initial=T.ones(4)))

#output, updates = theano.scan(
#            op,
#            sequences=x,
#            truncate_gradient=4,
#            outputs_info=[None,
#                          dict(initial=T.ones(3)),
#                          dict(initial=T.ones(4))])

output, updates = theano.scan(op,
                              sequences=x,
                              truncate_gradient=4,
                              outputs_info=s)
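
# A hedged end-to-end check of the outputs_info pattern above (the `_demo`
# names and the test input are assumptions for illustration): None marks the
# non-recurrent output o, while the two dict(initial=...) entries provide the
# start values that scan feeds back into op as a and b.
import numpy as np
import theano
import theano.tensor as T

x_demo = T.matrix('x_demo')   # one row per scan step; op ignores the row itself
out_demo, upd_demo = theano.scan(op,
                                 sequences=x_demo,
                                 outputs_info=[None,
                                               dict(initial=T.ones(3)),
                                               dict(initial=T.ones(4))])
f_chk = theano.function([x_demo], out_demo, updates=upd_demo)
o_seq, a_seq, b_seq = f_chk(np.zeros((6, 2), dtype=theano.config.floatX))
print(o_seq.shape, a_seq.shape, b_seq.shape)   # -> (6,) (6, 3) (6, 4)
# every step returns o = sum(a) + sum(b) = 7 and passes a and b through unchanged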