Example no. 1
    def read(self, images, center_y, center_x, delta, sigma):
        """
        Parameters
        ----------
        images : T.matrix    (shape: batch_size x img_size)
            Batch of images. Internally it will be reshaped to be a 
            (batch_size, img_height, img_width)-shaped stack of images.
        center_y : T.vector (shape: batch_size)
        center_x : T.vector (shape: batch_size)
        delta : T.vector    (shape: batch_size)
        sigma : T.vector    (shape: batch_size)

        Returns
        -------
        window : T.matrix   (shape: batch_size x N**2)
        """
        N = self.N
        batch_size = images.shape[0]

        # Reshape input into proper 2d images
        I = images.reshape( (batch_size, self.img_height, self.img_width) )

        # Get separable filterbank
        FY, FX = self.filterbank_matrices(center_y, center_x, delta, sigma)

        # apply to the batch of images
        W = T.batched_dot(T.batched_dot(FY, I), FX.transpose([0,2,1]))

        return W.reshape((batch_size, N*N))
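
# Added shape sketch (not from the original source): a minimal NumPy check of the
# separable read above, assuming FY is (batch, N, img_height), FX is (batch, N, img_width)
# and I is (batch, img_height, img_width); np.einsum stands in for T.batched_dot.
import numpy as np

batch, img_height, img_width, N = 2, 8, 10, 3
FY = np.random.rand(batch, N, img_height)   # vertical filterbank
FX = np.random.rand(batch, N, img_width)    # horizontal filterbank
I = np.random.rand(batch, img_height, img_width)

# T.batched_dot(T.batched_dot(FY, I), FX.transpose([0, 2, 1])), sample by sample:
window = np.einsum('bnh,bhw->bnw', FY, I)       # (batch, N, img_width)
window = np.einsum('bnw,bmw->bnm', window, FX)  # (batch, N, N)
assert window.shape == (batch, N, N)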
Example no. 2
def compute_psi2(lls, lsf, z, input_means, input_vars):

    ls = T.exp(lls)
    sf = T.exp(lsf)
    b = ls / casting(2.0)
    term_1 = T.prod(T.sqrt(b / (b + input_vars)), 1)

    scale = T.sqrt(4 * (2 * b[ None, : ] + 0 * input_vars))
    scaled_z = z[ None, : , : ] / scale[ : , None , : ]
    scaled_z_minus_m = scaled_z
    r2b = T.sum(scaled_z_minus_m**2, 2)[ :, None, : ] + T.sum(scaled_z_minus_m**2, 2)[ :, : , None ] - \
        2 * T.batched_dot(scaled_z_minus_m, np.transpose(scaled_z_minus_m, [ 0, 2, 1 ]))
    term_2 = T.exp(-r2b)

    scale = T.sqrt(4 * (2 * b[ None, : ] + 2 * input_vars))
    scaled_z = z[ None, : , : ] / scale[ : , None , : ]
    scaled_m = input_means / scale
    scaled_m = T.tile(scaled_m[ : , None, : ], [ 1, z.shape[ 0 ], 1])
    scaled_z_minus_m = scaled_z - scaled_m
    r2b = T.sum(scaled_z_minus_m**2, 2)[ :, None, : ] + T.sum(scaled_z_minus_m**2, 2)[ :, : , None ] + \
        2 * T.batched_dot(scaled_z_minus_m, np.transpose(scaled_z_minus_m, [ 0, 2, 1 ]))
    term_3 = T.exp(-r2b)
    
    psi2_computed = sf**casting(2.0) * term_1[ :, None, None ] * term_2 * term_3

    return T.transpose(psi2_computed, [ 1, 2, 0 ])
Example no. 3
 def energy(self):
     rho_x = rho(self.x)
     rho_h = rho(self.h)
     squared_norm = ( T.batched_dot(self.x,self.x) + T.batched_dot(self.h,self.h) ) / 2
     uni_terms    = - T.dot(rho_x, self.bx) - T.dot(rho_h, self.bh)
     bi_terms     = - T.batched_dot( T.dot(rho_x, self.W1), rho_h )
     return squared_norm + uni_terms + bi_terms
Example no. 4
  def fwd(self, x, V, A, L):
    """
    x : signal
    V : eigenvectors
    A : area 
    L : eigenvalues
    """
    V = V[:,:self.K]
    L = L[:self.K]

    L = L.dimshuffle('x','x',0)

    rho = T.sqrt(T.sum(A))
   
    # Q x 1 x K, a window for each input function
    ghat = self.activation_interp(
            T.batched_dot(T.tile(L, [self.nin,1,1]), self.Winterp))
    # Q x K x N
    V_ = T.tile(V.dimshuffle('x',1,0), [self.nin, 1, 1])
    # Q x K x N
    tmp = (ghat * V).dimshuffle(0,2,1)
    
    # Q x N x N
    transl = rho * T.batched_dot(V_.dimshuffle(0,2,1), tmp)
    transl = A.dimshuffle('x',0,'x') * transl
    
    # Q x K x N
    tmp = (V.dimshuffle(0,'x',1) * x.dimshuffle(0,1,'x')).dimshuffle(1,2,0)
    # Q x K x N
    desc = rho * T.batched_dot(tmp, transl)
    desc = T.abs_(desc)
    
    desc = desc.dimshuffle(2,0,'x',1) # BC01 format : N x Q x 1 x K
    return self.activation(theano.tensor.nnet.conv.conv2d(desc, self.W).flatten(2) + self.b)
Example no. 5
    def one_step(self, l, images):
        '''
        l = [n_examples, 5]
        images = [n_examples, height, width]
        '''

        tol = 1e-4
        g_x = self.B * (l[:, 0] + 1) / 2.
        g_y = self.A * (l[:, 1] + 1) / 2.
        delta = (max(self.A, self.B) - 1) / (self.N - 1) * T.exp(l[:, 2])
        sigma = T.exp(l[:, 3])

        mu_x = g_x.dimshuffle([0, 'x']) +\
            (self.mu_ind - self.N / 2. + 0.5) * delta.dimshuffle([0, 'x'])
        mu_y = g_y.dimshuffle([0, 'x']) +\
            (self.mu_ind - self.N / 2. + 0.5) * delta.dimshuffle([0, 'x'])

        F_x = T.exp(-((self.B_ind - mu_x.dimshuffle([0, 1, 'x']))**2) / (
            2 * (sigma.dimshuffle([0, 'x', 'x'])))**2)
        F_x = F_x / (F_x.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)

        # Compute Y filter banks##
        F_y = T.exp(-((self.A_ind - mu_y.dimshuffle([0, 1, 'x']))**2) / (
            2 * (sigma.dimshuffle([0, 'x', 'x'])))**2)
        F_y = F_y / (F_y.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)

        read = T.batched_dot(T.batched_dot(F_y, images), F_x.dimshuffle([0, 2, 1]))
        return read, g_x, g_y, delta, sigma
Example no. 6
 def propup_given_h_lag(self, vt, h_lag, hbias):
     if h_lag == self.h0:
         x = T.batched_dot(vt, self.W) + T.addbroadcast(
             T.dot(h_lag, self.Wt) + hbias, 0, 1)
     else:
         x = T.batched_dot(vt, self.W) + hbias + T.dot(h_lag, self.Wt)
     return [x, T.nnet.sigmoid(x)]
Example no. 7
    def get_output_for(self, inputs, **kwargs):

        # seq_input: (batch_size, seq_size, n_hidden_con)
        # seq_mask: (batch_size, seq_size)
        # condition: (batch_size, n_hidden_con)
        seq_input, seq_mask, condition = inputs

        if self.gate_covariance:
            update = T.nnet.sigmoid(
                T.sum(seq_input * self.w_gate, axis=-1, keepdims=True) +
                self.b_gate)
            seq_input *= update

        length_seq = seq_input.shape[1]
        if self.covariance_decay:
            decay = T.arange(1, length_seq+1)
            decay = (self.covariance_decay +
                     (length_seq-decay) * (1 - self.covariance_decay))
            decay = T.sqrt(decay)
            decay = decay.dimshuffle('x', 0, 'x')
            seq_input *= decay

        seq_input *= T.shape_padright(seq_mask)
        # (batch_size, n_hidden_question, n_hidden_question)
        covariance = T.batched_dot(seq_input.dimshuffle(0, 2, 1), seq_input)
        # (batch_size, n_hidden_question), equivalent to the following line:
        # att = T.sum(covariance * condition.dimshuffle((0, 'x', 1)), axis=2)
        att = 1000 * T.batched_dot(covariance, condition.dimshuffle((0, 1)))

        if not self.covariance_decay:
            att /= T.sum(seq_mask, axis=1, keepdims=True)
        # norm2_att = T.sum(att * condition, axis=1, keepdims=True)
        # att = 1000 * att / norm2_att

        return att
Example no. 8
 def h_given_h_lag_vt(self, vt, h_lag, hbias):
     if h_lag == self.h0:
         x = T.batched_dot(vt, self.W) + T.addbroadcast(
             T.dot(h_lag, self.Wt) + hbias.dimshuffle('x', 0), 0, 1)
     else:
         x = T.batched_dot(vt, self.W) + \
             T.dot(h_lag, self.Wt) + hbias.dimshuffle('x', 0)
     return [x, T.nnet.sigmoid(x)]
Example no. 9
 def energy_function():
     squared_norm = (
         T.batched_dot(self.x, self.x) + T.batched_dot(self.h, self.h) + T.batched_dot(self.y, self.y)
     ) / 2.0
     uni_terms = -T.dot(self.rho_x, self.bx) - T.dot(self.rho_h, self.bh) - T.dot(self.rho_y, self.by)
     bi_terms = -T.batched_dot(T.dot(self.rho_x, self.W1), self.rho_h) - T.batched_dot(
         T.dot(self.rho_h, self.W2), self.rho_y
     )
     return squared_norm + uni_terms + bi_terms
Example no. 10
 def defmodel(self):
     lhs = T.ivector("lhs")
     rhs, nrhs = T.ivectors("rhs","nrhs")
     lhsemb = self.entembs[lhs, :]
     rhsemb = self.W[rhs, :]
     nrhsemb = self.W[nrhs, :]
     pdot = T.batched_dot(lhsemb, rhsemb)
     ndot = T.batched_dot(lhsemb, nrhsemb)
     return pdot, ndot, [lhs, rhs, nrhs]
Example no. 11
    def step(self, x, states):
        h_tild_tm1 = states[0]

        B_U = states[1]
        B_W = states[2]

        if self.consume_less == 'cpu':
            x_i = x[:, :self.output_dim]
            x_f = x[:, self.output_dim: 2 * self.output_dim]
            x_c = x[:, 2 * self.output_dim: 3 * self.output_dim]
            x_o = x[:, 3 * self.output_dim: 4 * self.output_dim]
            x_new = x[:, 4 * self.output_dim:]
        else:
            x_i = K.dot(x * B_W[0], self.W_i) + self.b_i
            x_f = K.dot(x * B_W[1], self.W_f) + self.b_f
            x_c = K.dot(x * B_W[2], self.W_c) + self.b_c
            x_o = K.dot(x * B_W[3], self.W_o) + self.b_o
            x_new = x

        # self.C_tape -> BT, t-1, k
        # self.H_tape -> BT, t-1, k

        # x -> BT, k 
        # h_tild_tm1 -> BT, k       

        if self.H_tape is None:
            self.H_tape = K.zeros_like(h_tild_tm1).dimshuffle((0,'x',1))
            self.C_tape = K.zeros_like(h_tild_tm1).dimshuffle((0,'x',1))

        # s_t -> BT, t-1, 1
        t = K.shape(self.C_tape)[1]

        sum1 = K.dot(self.H_tape, self.W_h)
        sum2 = K.dot(K.repeat_elements(x_new.dimshuffle((0,'x',1)),t, axis=1), self.W_x)
        sum3 = K.dot(K.repeat_elements(h_tild_tm1.dimshuffle((0,'x',1)),t, axis=1), self.W_h_tilde)
        tanhed_sum = K.tanh(sum1 + sum2 + sum3)    
        a_t = K.dot(tanhed_sum, self.v)[:,:,0]
        s_t = K.softmax(a_t)

        h_tilde_t = T.batched_dot(self.H_tape.dimshuffle((0,2,1)), s_t.dimshuffle((0,1,'x')))[:,:,0]
        c_tilde_t = T.batched_dot(self.C_tape.dimshuffle((0,2,1)), s_t.dimshuffle((0,1,'x')))[:,:,0]

        i = self.inner_activation(x_i + K.dot(h_tilde_t * B_U[0], self.U_i))
        f = self.inner_activation(x_f + K.dot(h_tilde_t * B_U[1], self.U_f))
        c_t = f * c_tilde_t + i * self.activation(x_c + K.dot(h_tilde_t * B_U[2], self.U_c))
        o = self.inner_activation(x_o + K.dot(h_tilde_t * B_U[3], self.U_o))

        h_t = o * self.activation(c_t)

        # Add to Tape
        self.C_tape = K.concatenate([self.C_tape, c_t.dimshuffle((0,'x',1))], axis=1)
        self.H_tape = K.concatenate([self.H_tape, h_t.dimshuffle((0,'x',1))], axis=1)

        return h_t, [h_tilde_t]
Example no. 12
    def batched_cos_sim(s):
        """ from (x,y,z)-shaped pair, produce (x,y)-shaped pair that replaces the z-vector pairs by their cosine similarities """
        import theano
        import theano.tensor as T

        return theano.scan(
            fn=lambda xm, ym: T.batched_dot(xm, ym) / T.sqrt(T.batched_dot(xm, xm) * T.batched_dot(ym, ym)),
            outputs_info=None,
            sequences=s,
            non_sequences=None,
        )[0]
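
# Added note (not part of the example): on 2-D inputs T.batched_dot reduces to a
# row-wise dot product, which is what the scan body above relies on. A NumPy sketch
# of the same cosine computation for one (y, z)-shaped pair of matrices:
import numpy as np

def cos_sim_rows(xm, ym):
    num = np.einsum('ij,ij->i', xm, ym)   # row-wise dot, as in batched_dot(xm, ym)
    return num / np.sqrt(np.einsum('ij,ij->i', xm, xm) *
                         np.einsum('ij,ij->i', ym, ym))

xm, ym = np.random.rand(4, 5), np.random.rand(4, 5)
assert cos_sim_rows(xm, ym).shape == (4,)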
Example no. 13
 def free_energy_given_hid_lag(self, vt, h_lag, hbias, vbias):
     if h_lag == self.h0:
         wx_b = T.batched_dot(vt, self.W) +\
             T.addbroadcast(T.dot(h_lag, self.Wt) + hbias, 0, 1)
         vbias_term = T.batched_dot(vt, vbias)
         hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=2)
     else:
         wx_b = T.batched_dot(vt, self.W) + T.dot(h_lag, self.Wt) + \
             hbias.dimshuffle('x', 0)
         vbias_term = T.batched_dot(vt, vbias)
         hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=2)
     return -hidden_term - vbias_term
Example no. 14
def MemLayer(incomings, params, linear=0):
  '''
  incomings = (u, u_shape, A, A_shape, C, C_shape)
  '''
  ((u, u_shape), (A, A_shape), (C, C_shape)) = incomings
  p = T.switch(linear, T.batched_dot(A, u), nnet.softmax(T.batched_dot(A, u)))
  p_shape = A_shape[:2]
  # C.shape = (batch_size, num_sen, embed_size), u.shape = (batch_size, embed_size)
  # p.shape = (batch_size, num_sen); p[:, :, None] is (batch_size, num_sen, 1)
  #return (p, u_shape)
  O = (C * p[:, :, None]).sum(axis = 1)

  return ((O, u_shape), (p, p_shape))
Example no. 15
    def write(self, windows, center_y, center_x, delta, sigma):
        N = self.N
        batch_size = windows.shape[0]

        # Reshape input into proper 2d windows
        W = windows.reshape( (batch_size, N, N) )

        # Get separable filterbank
        FY, FX = self.filterbank_matrices(center_y, center_x, delta, sigma)

        # apply...
        I = T.batched_dot(T.batched_dot(FY.transpose([0,2,1]), W), FX)

        return I.reshape( (batch_size, self.img_height*self.img_width) )
Example no. 16
    def factorization(self, batchSize, argsEmbA, argsEmbB, wC, wC1, wC2):
        # l = batchSize
        # k = self.k  # embed size
        # r = self.r  # relation number
        # argEmbedsA = self.A[argsA.flatten()]  # [l,k]
        # argEmbedsB = self.A[argsB.flatten()]  # [l,k]

        # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]])  # [l,r] * [k,k,r] = [l, k, k]
        Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]])  # + self.Cb  # [l, k, k] * [l, k] = [l, k]
        Asecond = T.batched_dot(Afirst, argsEmbB)  # [l, k] * [l, k] = [l]
        # entropy = T.sum(T.log(relationProbs) * relationProbs, axis=1)  # [l,r] * [l,r] = [l]
        spFirst = T.batched_dot(wC1, argsEmbA)
        spSecond = T.batched_dot(wC2, argsEmbB)
        return Asecond + spFirst + spSecond
Example no. 17
    def _initialize_posterior_distribution(self, RecognitionParams):

        # Now actually compute the precisions (from their square roots)
        self.Lambda = T.batched_dot(self.LambdaChol, self.LambdaChol.dimshuffle(0,2,1))

        # dynamics matrix & initialize the innovations precision, xDim x xDim
        self.A         = theano.shared(value=RecognitionParams['A'].astype(theano.config.floatX)        ,name='A'        )
        self.QinvChol  = theano.shared(value=RecognitionParams['QinvChol'].astype(theano.config.floatX) ,name='QinvChol' )
        self.Q0invChol = theano.shared(value=RecognitionParams['Q0invChol'].astype(theano.config.floatX),name='Q0invChol')

        self.Qinv  = T.dot(self.QinvChol,self.QinvChol.T)
        self.Q0inv = T.dot(self.Q0invChol,self.Q0invChol.T)

        ################## put together the total precision matrix ######################

        AQinvA = T.dot(T.dot(self.A.T, self.Qinv), self.A)

        # for now we (suboptimally) replicate a bunch of times
        AQinvrep = Tsla.kron(T.ones([self.Tt-1,1,1]),-T.dot(self.A.T, self.Qinv)) # off-diagonal blocks (upper triangle)

        AQinvArep = Tsla.kron(T.ones([self.Tt-2,1,1]), AQinvA+self.Qinv)
        AQinvArepPlusQ = T.concatenate([T.shape_padleft(self.Q0inv + AQinvA), AQinvArep, T.shape_padleft(self.Qinv)])

        # This is our inverse covariance matrix: diagonal (AA) and off-diagonal (BB) blocks.
        self.AA = self.Lambda + AQinvArepPlusQ
        self.BB = AQinvrep

        # symbolic recipe for computing the diagonal (V) and
        # off-diagonal (VV) blocks of the posterior covariance
        self.V, self.VV, self.S = compute_sym_blk_tridiag(self.AA, self.BB)

        # now compute the posterior mean
        LambdaMu = T.batched_dot(self.Lambda, self.Mu) # scale by precision (no need for transpose; lambda is symmetric)

        #self.old_postX = compute_sym_blk_tridiag_inv_b(self.S,self.V,LambdaMu) # apply inverse

        # compute cholesky decomposition
        self.the_chol = blk_tridag_chol(self.AA, self.BB)
        # intermediary (mult by R^T) -
        ib = blk_chol_inv(self.the_chol[0], self.the_chol[1], LambdaMu)
        # final result (mult by R)-
        self.postX = blk_chol_inv(self.the_chol[0], self.the_chol[1], ib, lower=False, transpose=True)

        # The determinant of the covariance is the square of the determinant of the cholesky factor.
        # Determinant of the Cholesky factor is the product of the diagonal elements of the block-diagonal.
        def comp_log_det(L):
            return T.log(T.diag(L)).sum()
        self.ln_determinant = -2*theano.scan(fn=comp_log_det, sequences=self.the_chol[0])[0].sum()
Example no. 18
def attention_decoder_calc(prefix, params, layer_setting, h_e, mask_below, state_below, h_init = None, c_init = None, mask = None, training = True):
    [h_d, c_d] = lstm_calc(prefix+'_lstm', params, layer_setting['_lstm'], state_below, h_init, c_init, mask, training = training)
    alpha = attention_calc(prefix+'_attention', params, layer_setting['_attention'], h_d, h_e)
    context = T.batched_dot(alpha.dimshuffle(1,0,2), h_e.dimshuffle(1,0,2)).dimshuffle(1,0,2)
    h_d2 = feedforward_calc(prefix+'_tanh', params, layer_setting['_tanh'], T.concatenate([h_d, context], axis = 2))
    dist = feedforward_calc(prefix+'_softmax', params, layer_setting['_softmax'], h_d2)
    return h_d, c_d, dist, alpha
Example no. 19
    def __init__(self, input_group, n_in_list, emb_dim):
        print input_group
        self.n_in_list = n_in_list
        self.n_out = emb_dim

        Xs = []
        Ws = []
        bs = []
        outs = []

        self.Ws = Ws
        self.bs = bs
        self.Xs = Xs
        self.outs = outs

        for i, input in enumerate(input_group):
            x = input
            w = theano.shared(value=(numpy.random.rand(n_in_list[i], emb_dim)-0.5), borrow=True) 
            b = theano.shared(value=numpy.random.rand(emb_dim), borrow=True)
            Xs.append( x )
            Ws.append( w )
            bs.append( b )
            outs.append( T.dot(x, w) + b )

        #### Active function ####
        # TODO: just support dot(Xs[0], Xs[1]) now.
        if len(Xs)!=2:
            raise Exception('Just support 2 input group now.')
        self.Y = T.batched_dot( outs[0], outs[1] )

        # Function Definition.
        self.active = theano.function(Xs, self.Y)
Example no. 20
    def get_output_for(self, inputs, **kwargs):

        assert len(inputs) == 4

        context, question, c_mask, q_mask = inputs
        batch_size, question_len, emb_size = question.shape

        question = question.reshape(
            (batch_size * question_len, emb_size)) * self.V
        question = question.reshape((batch_size, question_len, emb_size))

        # batch_size x emb_size x context_len
        context = context.dimshuffle(0, 2, 1)

        # batch_size x question_len x context_len
        x = T.batched_dot(question, context)
        x_max = x.max(axis=2).dimshuffle(0, 1, 'x')
        esim = T.exp(x - x_max)
        esim *= c_mask.reshape((batch_size, 1, -1))

        sums = esim.sum(axis=2)
        esim /= sums.dimshuffle(0, 1, 'x')

        esim *= q_mask.reshape((batch_size, -1, 1))

        return esim.sum(axis=1)  # batch_size x context_len
Example no. 21
def cosine_similarity(x, y, eps=1e-6):
    z = T.batched_dot(x, y.dimshuffle(0, 2, 1))
    z /= T.sqrt(
        T.sum(x * x, axis=2).dimshuffle(0, 1, 'x') *
        T.sum(y * y, axis=2).dimshuffle(0, 'x', 1) + eps)

    return z
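
# Added sketch (my own, not from the source repo): the same computation in NumPy,
# assuming x is (batch, n, d) and y is (batch, m, d); the result is (batch, n, m).
import numpy as np

def cosine_similarity_np(x, y, eps=1e-6):
    z = np.einsum('bnd,bmd->bnm', x, y)   # batched_dot(x, y.dimshuffle(0, 2, 1))
    z /= np.sqrt((x * x).sum(axis=2)[:, :, None] *
                 (y * y).sum(axis=2)[:, None, :] + eps)
    return z

x, y = np.random.rand(2, 3, 4), np.random.rand(2, 5, 4)
assert cosine_similarity_np(x, y).shape == (2, 3, 5)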
Example no. 22
 def rightMostFactorization(self, batchSize, args, wC2):
     l = batchSize
     k = self.k  # embed size
     r = self.r  # relation number
     argEmbeds2 = self.A[args.flatten()]
     Asecond = T.batched_dot(wC2, argEmbeds2)
     return Asecond
Example no. 23
 def leftMostFactorization(self, batchSize, args, wC1):
     l = batchSize
     k = self.k  # embed size
     r = self.r  # relation number
     argEmbeds = self.A[args.flatten()]
     Afirst = T.batched_dot(wC1, argEmbeds)
     return Afirst
Example no. 24
def getDM_score(kb_entities, kb_relations, neg_samples_kb, opts):
    neg_samples   = opts.neg_samples
    vect_dim      = opts.vect_dim
    num_entities  = opts.num_entities
    num_relations = opts.num_relations
    l2_reg_entities = opts.l2_entity    
    l2_reg_relations = opts.l2_relation  
    '''
        While loading, some models store entity embeddings under the name entity_embeddings, and some under entity_embeddings_DM.
    '''
    entities  = Embedding(output_dim=vect_dim, input_dim=num_entities+1, init='normal',name = 'entity_embeddings', W_regularizer=l2(l2_reg_entities))
    relations = Embedding(output_dim=vect_dim, input_dim=num_relations, input_length=1,init='normal', name='relation_embeddings', W_regularizer=l2(l2_reg_relations))

    entity_vectors = entities(kb_entities)
    entity_negative_vectors = entities(neg_samples_kb)
    relation_vectors = Flatten()(relations(kb_relations))

    get_cross_1 = get_cross(0, neg_samples)
    get_cross_2 = get_cross(1, neg_samples)
    e1_cross_e2_prime = merge([entity_vectors, entity_negative_vectors], mode = get_cross_1, output_shape = (neg_samples, vect_dim))
    e1_prime_cross_e2 = merge([entity_vectors, entity_negative_vectors], mode = get_cross_2, output_shape = (neg_samples, vect_dim))
    e1_cross_e2    = Lambda(cross_e1_e2, output_shape = (vect_dim,))(entity_vectors)

    score_DM = merge([relation_vectors, e1_cross_e2], mode = lambda X : T.batched_dot(X[0], X[1]), output_shape=())
    score_DM_e2_corrupted = merge([relation_vectors, e1_cross_e2_prime], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2))
    score_DM_e1_corrupted = merge([relation_vectors, e1_prime_cross_e2], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2))

    return score_DM, score_DM_e1_corrupted, score_DM_e2_corrupted
Example no. 25
    def step(self, x, states):
        h_tm1 = states[0]
        c_tm1 = states[1]

        h_tilde = x[:,0,:]
        L = K.params['xmaxlen']
        
        M = K.tanh(self.precompute_W_y_y + x + K.repeat_elements(K.dot(h_tm1, self.U_r).dimshuffle((0,'x',1)),L, axis=1))
        alpha = K.dot(M, self.W)
        alpha = K.softmax(alpha[:,:,0]) 
        alpha = alpha.dimshuffle((0,'x',1))
        
        output = T.batched_dot(alpha,self.Y) 
        output = output[:,0,:]
        
        xt = K.concatenate([h_tilde,output],axis = 1)

        it = K.sigmoid( K.dot(xt,self.W_i) + K.dot(h_tilde,self.U_i) + self.b_i )
        ft = K.sigmoid(K.dot(xt,self.W_f) + K.dot(h_tilde,self.U_f) + self.b_f)
        ot = K.sigmoid(K.dot(xt,self.W_o) + K.dot(h_tilde,self.U_o) + self.b_o)
        c_tilde_t = K.dot(xt,self.W_c) + K.dot(h_tilde,self.U_c) + self.b_c
        c_t = ft * c_tm1 + it*K.tanh( c_tilde_t )

        h_t = ot * K.tanh(c_t)

        return h_t, [h_t,c_t]
Example no. 26
        def _fprop_step(state_below, state_below_in, state_below_z, state_below_r, 
                        state_before, W_recurrent, W_in, b,
                        W_z, U_z, b_z, W_r, U_r, b_r):
            print "state before 1", state_before, state_before.dtype, state_before.type, state_before.broadcastable
            #state_before = tensor.unbroadcast(state_before, 0)
            z = tensor.nnet.sigmoid(state_below_z + tensor.dot(state_before, U_z) + b_z)
            r = tensor.nnet.sigmoid(state_below_r + tensor.dot(state_before, U_r) + b_r)
            #print "r dim", r.type.ndim
            #W_rec = self.project1(W_recurrent, state_below)
            print "State below step", state_below, state_below.broadcastable, state_below.ndim

            print "state before 2", state_before, state_before.dtype, state_before.type, state_before.broadcastable
            W_rec = W_recurrent[state_below]
            
            bias = b[state_below]
            # !!! Move to efficient indexing
            #shape = (state_below.shape[0], state_below.shape[1], self.dim)
            pre_h = (
                state_below_in + r * tensor.batched_dot(state_before, W_rec)#.reshape(shape)
                + bias
            )
            print "pre_h dim", pre_h, pre_h.type.ndim
            #print "W_recurrent[state_below] dim", W_rec, W_rec.ndim
            # print "W_rec * state before", (state_before* W_rec).ndim

            new_h = tensor.tanh(pre_h)
            #print "new_h", new_h
            h = z * state_before + (1. - z) * new_h
            print "final h dim", h, h.type, h.broadcastable, h.ndim
            h = tensor.unbroadcast(h, 0)
            return h
Example no. 27
    def step(mask, alpha_pre, s_pre, h_pre, c_pre):
        score = T.dot(h_pre, params[join(prefix, 'W_h')])
        score = state_below + score[None, :, :]
        score = T.dot(T.tanh(score), params[join(
            prefix, 'W_f2')]) + params[join(prefix, 'b_f2')]
        shp = score.shape
        alpha = softmax_mask(T.reshape(score, [shp[1], shp[0]], ndim=2),
                             inputMask.dimshuffle(1, 0))
        context = T.batched_dot(alpha.dimshuffle(0, 'x', 1),
                                reference.dimshuffle(1, 0, 2)).dimshuffle(0, 2)

        activation = T.dot(h_pre, params[join(prefix, 'U')])
        activation += T.dot(context, params[join(prefix, 'W')]) + params[join(
            prefix, 'b')]

        activation_i = slice(activation, 0, n_out)
        activation_f = slice(activation, 1, n_out)
        activation_c = slice(activation, 2, n_out)
        activation_o = slice(activation, 3, n_out)

        i = sigmoid(activation_i)
        f = sigmoid(activation_f)
        o = sigmoid(activation_o)

        c = f * c_pre + i * tanh(activation_c)
        c = mask[:, None] * c + (1 - mask)[:, None] * c_pre

        h = o * tanh(c)
        h = mask[:, None] * h + (1 - mask)[:, None] * h_pre

        return alpha, context, h, c
Example no. 28
    def apply(self, doc, query, mask_, batch_size):
        # batch_size x doc_length x hidden_dim
        mask = mask_.flatten()
        att1 = self.image_embed.apply(doc)

        # y_q_i: the ith token of question
        #        batch_size x feature_dim
        # r_1: r_m_1
        #        batch_size x feature_dim
        # y_d: document
        #        batch_size x doc_length x feature_dim
        # y_d_m: d-to-m
        #        batch_size x doc_length x hidden_dim

        # batch_size x hidden_dim

        # batch_size x hidden_dim
        y_d = doc
        att3 = self.word_embed.apply(query)
        att = att1 + att3.dimshuffle(0, 'x', 1)
        # batch_size x doc_length x hidden_dim
        m = T.tanh(att)
        # batch_size x doc_length x 1
        s = self.m_to_s.apply(m)
        # batch_size x doc_length
        s = s.reshape((s.shape[0], s.shape[1]))
        s = self.attention_dist.apply(s)
        y_d_s = y_d.swapaxes(1, 2)
        # return batch_size x feature_dim
        r = T.batched_dot(y_d_s, s)


        # batch_size x output_dim
        return r
Example no. 29
def quadratic_saturating_loss(mx, Sx, target, Q, *args, **kwargs):
    '''
        Squashing loss penalty function
        c(x) = ( 1 - e^(-0.5*quadratic_loss(x, target)) )
    '''
    if Sx is None:
        if mx.ndim == 1:
            mx = mx[None, :]
        delta = mx - target[None, :]
        deltaQ = delta.dot(Q)
        cost = 1.0 - tt.exp(-0.5 * tt.batched_dot(deltaQ, delta))
        return cost
    else:
        # stochastic case (moment matching)
        delta = mx - target
        SxQ = Sx.dot(Q)
        EyeM = tt.eye(mx.shape[0])
        IpSxQ = EyeM + SxQ
        Ip2SxQ = EyeM + 2 * SxQ
        S1 = tt.dot(Q, matrix_inverse(IpSxQ))
        S2 = tt.dot(Q, matrix_inverse(Ip2SxQ))
        # S1 = solve(IpSxQ.T, Q.T).T
        # S2 = solve(Ip2SxQ.T, Q.T).T
        # mean
        m_cost = -tt.exp(-0.5 * delta.dot(S1).dot(delta)) / tt.sqrt(det(IpSxQ))
        # var
        s_cost = tt.exp(-delta.dot(S2).dot(delta)) / tt.sqrt(
            det(Ip2SxQ)) - m_cost**2

        return 1.0 + m_cost, s_cost
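
# Added check (not from the original source): in the deterministic branch,
# tt.batched_dot on two matrices is a row-wise dot product, so the cost uses the
# per-sample quadratic form delta_i^T Q delta_i. A NumPy sketch with made-up shapes:
import numpy as np

B, d = 4, 3
delta = np.random.rand(B, d)
Q = np.random.rand(d, d)

per_sample = np.einsum('bi,bi->b', delta.dot(Q), delta)   # batched_dot(deltaQ, delta)
reference = np.array([delta[i].dot(Q).dot(delta[i]) for i in range(B)])
assert np.allclose(per_sample, reference)
cost = 1.0 - np.exp(-0.5 * per_sample)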
Example no. 30
 def forward(self):
     z = self.z0  # sxd
     u = self.u_  # d
     w = self.w_  # d
     b = self.b  # .
     h = self.h  # f
     # h(sxd \dot d + .)  = s
     if not self.batched:
         hwz = h(z.dot(w) + b)  # s
         # sxd + (s \outer d) = sxd
         z1 = z + tt.outer(hwz, u)  # sxd
         return z1
     else:
         z = z.swapaxes(0, 1)
         # z bxsxd
         # u bxd
         # w bxd
         b = b.dimshuffle(0, "x")
         # b bx-
         hwz = h(tt.batched_dot(z, w) + b)  # bxs
         # bxsxd + (bxsx- * bx-xd) = bxsxd
         hwz = hwz.dimshuffle(0, 1, "x")  # bxsx-
         u = u.dimshuffle(0, "x", 1)  # bx-xd
         z1 = z + hwz * u  # bxsxd
         return z1.swapaxes(0, 1)  # sxbxd
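
# Added sketch (not from the source): the batched branch above in NumPy, with tanh
# standing in for h and shapes taken from the comments (s samples, b batch, d dims).
import numpy as np

s, b_, d = 3, 2, 4
z0 = np.random.rand(s, b_, d)   # sxbxd
u = np.random.rand(b_, d)
w = np.random.rand(b_, d)
bias = np.random.rand(b_, 1)

z = z0.swapaxes(0, 1)                                  # bxsxd
hwz = np.tanh(np.einsum('bsd,bd->bs', z, w) + bias)    # batched_dot(z, w) + b
z1 = z + hwz[:, :, None] * u[:, None, :]               # bxsxd
out = z1.swapaxes(0, 1)                                # sxbxd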
Example no. 31
    def make_layer(self, n_params, T_u, T_story, T_mask, rng):
        """
        Inputs:
                network params      (n_params)
                question vector     (T_u)
                story tensor        (T_story)
        Outputs: output vector      (T_o)
        """

        # ------ Encode encoder story data
        T_w2v_out = self.T_w2v[T_story] * T_mask[T_story]
        T_m = T.sum(T_w2v_out, axis=2)
        T_m_norm = T.sqrt(T.sum(T_m**2, axis=2))
        T_m = T_m / (T_m_norm.dimshuffle(0, 1, 'x') + 1e-6)
        T_m = T.dot(T_m, n_params['T_B'])

        # ------ Encode decoder story data
        T_w2v_out = self.T_w2v[T_story] * T_mask[T_story]
        T_c = T.sum(T_w2v_out, axis=2)
        T_c_norm = T.sqrt(T.sum(T_c**2, axis=2))
        T_c = T_c / (T_c_norm.dimshuffle(0, 1, 'x') + 1e-6)
        T_c = T.dot(T_c, n_params['T_B'])

        # ------ Sentence picker: tensor3-matrix product
        T_p = T.nnet.softmax(T.batched_dot(T_m, T_u))

        # ------ Sum over story decoder
        T_p_2 = T_p.dimshuffle(0, 1, 'x')
        T_o = T.sum(T_p_2 * T_c, axis=1)

        # Collect
        return T_o, T_p
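
# Added sketch (not from the original source): the sentence picker uses T.batched_dot
# between a 3-tensor and a matrix, i.e. a per-sample matrix-vector product. One memory
# hop in NumPy, with assumed shapes (batch, n_sentences, dim):
import numpy as np

batch, n_sent, dim = 2, 6, 5
T_m = np.random.rand(batch, n_sent, dim)   # encoder memories
T_c = np.random.rand(batch, n_sent, dim)   # decoder memories
T_u = np.random.rand(batch, dim)           # question vector

scores = np.einsum('bsd,bd->bs', T_m, T_u)               # batched_dot(T_m, T_u)
T_p = np.exp(scores - scores.max(axis=1, keepdims=True))
T_p /= T_p.sum(axis=1, keepdims=True)                    # softmax over sentences
T_o = (T_p[:, :, None] * T_c).sum(axis=1)                # (batch, dim)
assert T_o.shape == (batch, dim)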
Example no. 32
    def sample_XY(self,
                  X0data=None,
                  Nsamps=1,
                  Tbins=30,
                  Xdata=None,
                  withInflow=False):
        """
        TODO: Write docstring
        """
        if Xdata is None:
            Xdata = self.lat_ev_model.sample_X(X0data=X0data,
                                               Nsamps=Nsamps,
                                               Tbins=Tbins,
                                               withInflow=withInflow)
        else:
            Nsamps = Xdata.shape[0]
            Tbins = Xdata.shape[1]

        SigmaChol = T.tile(self.SigmaChol, (Nsamps * Tbins, 1, 1))
        SigmaCholN = T.batched_dot(np.random.randn(Nsamps * Tbins, self.yDim),
                                   SigmaChol)

        Musymb = theano.clone(self.MuY, replace={self.X: Xdata})
        Musymb = T.reshape(Musymb, (Nsamps * Tbins, self.yDim))

        Ysymb = SigmaCholN + Musymb
        Ysymb = T.reshape(Ysymb, (Nsamps, Tbins, self.yDim))

        Ydata = Ysymb.eval()
        return Ydata, Xdata
Example no. 33
    def call(self, input_tensors, mask=None):
        ''' wbw attention layer:
        :param ctxt (input_tensors[0]) : batch_size x T x ctxt_dim
        :param resp (input_tensors[1]) : batch_size x resp_dim
        '''

        ctxt = input_tensors[0]
        resp = input_tensors[1]
        ctxt_mask = mask[0]

        resp_w = T.dot(resp, self.resp_dense)  # bt_sz x dense_dim
        ctxt_w = T.dot(ctxt, self.ctxt_dense)  # bt_sz x T x dense_dim
        resp_w_rep = resp_w[:, None, :]  # bt_sz x T x dense_dim
        pre_alpha = T.tanh(ctxt_w + resp_w_rep)  # bt_sz x T x dense_dim
        unnorm_alpha = T.dot(pre_alpha,
                             self.alpha_dense).flatten(2)  # bt_sz x T
        if ctxt_mask is not None:
            unnorm_alpha_masked = unnorm_alpha - 1000 * (1. - ctxt_mask)
        else:
            unnorm_alpha_masked = unnorm_alpha
        alpha = T.nnet.softmax(unnorm_alpha_masked)  # bt_sz x T
        attended_ctxt = T.batched_dot(alpha.dimshuffle((0, 'x', 1)),
                                      ctxt)[:, 0, :]  # bt_sz x ctxt_dim

        if self.return_att:
            return [attended_ctxt, alpha]
        else:
            return attended_ctxt
Example no. 34
 def get_summary(self, yy):
     out = {}
     out['xsm'] = numpy.asarray(self.postX.eval({self.Input:yy}), dtype=theano.config.floatX)
     V = T.batched_dot(self.LambdaChol, self.LambdaChol.dimshuffle(0,2,1))
     out['Vsm'] = numpy.asarray(V.eval({self.Input:yy}), dtype=theano.config.floatX)
     out['VVsm'] = np.zeros([yy.shape[0]-1, self.xDim, self.xDim]).astype(theano.config.floatX)
     return out
Example no. 35
    def get_output_for(self, inputs, **kwargs):
        q = self.q
        for i in range(self.hops):
            if self.fixed_query and not i:
                u = T.dot(inputs[0], q)            
            else:
                u = T.batched_dot(inputs[0], q)

            # set masked positions to large negative value
            if len(inputs) > 1:
                u = u*inputs[1] - (1-inputs[1])*10000

            #now batch_size x post_length x 1 but need to normalize via softmax

            # normalize over post_length (->large negative values = 0)
            u = T.reshape(u, (inputs[0].shape[0], inputs[0].shape[1]))
            alpha = T.nnet.softmax(u)

            #now B x S
            o = T.dot(T.sum(inputs[0] * alpha[:,:,None], axis=1), self.W_r)
            if self.fixed_query:
                q = q + o
            else:
                q = q + o

        return q
Example no. 36
    def _step(m_, x_, h_, U):
        #preact = tensor.dot(h_, U)
        #h_: n_p * n_samples * n_h
        #U: n_p * n_h * (k/h n_h)
        preact = tensor.batched_dot(h_, U[:, :, :2 * n_h])
        preact += x_[:, :, :2 * n_h]

        z = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        r = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        m = tensor.tanh(x_[:, :, 2 * n_h:] +
                        tensor.batched_dot(h_ * r, U[:, :, 2 * n_h:]))

        h = (1. - z) * h_ + z * m
        h = m_[:, :, None] * h + (1. - m_)[:, :, None] * h_

        return h
Example no. 37
 def forward(self):
     z = self.z0  # sxd
     H = self.H  # dxd
     if self.batched:
         return tt.batched_dot(z.swapaxes(0, 1), H).swapaxes(0, 1)
     else:
         return z.dot(H)
Example no. 38
    def get_output_for(self, inputs, **kwargs):
        # inputs[0]: B x N x D
        # inputs[1]: B x Q x D
        # self.mask: B x Q

        q_shuf = inputs[1].dimshuffle(0, 2, 1)  # B x D x Q
        return T.batched_dot(inputs[0], q_shuf)  # B x N x Q
Example no. 39
  def fwd_old(self, x, V, A, L):
    """
    x : signal
    V : eigenvectors
    A : area 
    L : eigenvalues
    """
    V = V[:,:self.K]
    L = L[:self.K]

    # ghat is already a linear combination; it is faster than doing a
    # translation and modulation each time. Of course, everything is linear
    # and thus it can be done.
    ghat = self.sample_ghat(self.taus, L)

    rho = T.sqrt(T.sum(A))
    trasl = rho * T.dot(V, ghat.dimshuffle(0,'x') * V.T)
    trasl = A.dimshuffle(0,'x') * trasl
    
    # size Q x K x N, intermediate N x Q x K
    tmp = (V.dimshuffle(0,'x',1) * x.dimshuffle(0,1,'x')).dimshuffle(1,2,0)
    trasl = T.tile(trasl.dimshuffle('x',0,1), [self.nin,1,1])
    # size Q x K x N
    desc = rho * T.batched_dot(tmp, trasl)
    desc = T.abs_(desc)

    desc = desc.dimshuffle(2,0,'x',1) # BC01 format : N x Q x 1 x K
    
    return self.activation(theano.tensor.nnet.conv.conv2d(desc, self.W).flatten(2) + self.b)
Example no. 40
  def output_func(self, input):
      # P(Y|X) = softmax(W.X + b)
      q, a = input[0], input[1]

      # dot = T.batched_dot(q, T.batched_dot(a, self.W))
      out = T.batched_dot(q, T.dot(a, self.W.T)).dimshuffle(0, 'x')
      return out
Example no. 41
 def output_func(self, input):
     # P(Y|X) = softmax(W.X + b)
     q, a = input[0], input[1]
     # dot = T.batched_dot(q, T.batched_dot(a, self.W))
     dot = T.batched_dot(q, T.dot(a, self.W.T))
     out = T.concatenate([dot.dimshuffle(0, 'x'), q, a], axis=1)
     return out
Example no. 42
 def forward(self):
     z = self.z0  # sxd
     H = self.H   # dxd
     if self.batched:
         return tt.batched_dot(z.swapaxes(0, 1), H).swapaxes(0, 1)
     else:
         return z.dot(H)
Example no. 43
    def factorization(self, batchSize, argsEmbA, argsEmbB, wC):

        # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]])  # [l,r] * [k,k,r] = [l, k, k]
        Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]])  # [l, k, k] * [l, k] = [l, k]
        Asecond = T.batched_dot(Afirst, argsEmbB)  # [l, k] * [l, k] = [l]
        # entropy = T.sum(T.log(relationProbs) * relationProbs, axis=1)  # [l,r] * [l,r] = [l]
        return Asecond
Example no. 44
    def make_layer(self, n_params, T_u, T_story, T_mask, rng):
        """
        Inputs:
                network params      (n_params)
                question vector     (T_u)
                story tensor        (T_story)
        Outputs: output vector      (T_o)
        """

        # ------ Encode encoder story data
        T_w2v_out = self.T_w2v[T_story] * T_mask[T_story]
        T_m = T.sum(T_w2v_out, axis=2)
        T_m_norm = T.sqrt(T.sum(T_m ** 2, axis=2))
        T_m = T_m / (T_m_norm.dimshuffle(0, 1, 'x') + 1e-6)
        T_m = T.dot(T_m, n_params['T_B'])

        # ------ Encode decoder story data
        T_w2v_out = self.T_w2v[T_story] * T_mask[T_story]
        T_c = T.sum(T_w2v_out, axis=2)
        T_c_norm = T.sqrt(T.sum(T_c ** 2, axis=2))
        T_c = T_c / (T_c_norm.dimshuffle(0, 1, 'x') + 1e-6)
        T_c = T.dot(T_c, n_params['T_B'])

        # ------ Sentence picker: tensor3-matrix product
        T_p = T.nnet.softmax(T.batched_dot(T_m, T_u))

        # ------ Sum over story decoder
        T_p_2 = T_p.dimshuffle(0, 1, 'x')
        T_o = T.sum(T_p_2 * T_c, axis=1)

        # Collect
        return T_o, T_p
Example no. 45
    def L_op(self, inputs, outputs, output_grads):
        # Gradients computed by Op
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None

        # Gradients of original function, to compose chain rule
        grad_op = output_grads[0]
        grad_shuffle = GpuDimShuffle(
            input_broadcastable=(
                False,
                False,
                False,
            ),
            new_order=(1, 0, 2),
        )(gradients)
        grad_bdot = tt.batched_dot(grad_op, grad_shuffle)
        grad_shuffle_reverse = GpuDimShuffle(
            input_broadcastable=(
                False,
                False,
                False,
            ),
            new_order=(1, 0, 2),
        )(grad_bdot)
        return [
            grad_shuffle_reverse,
            grad_undefined(self, 1, inputs[1]),
            grad_undefined(self, 2, inputs[2]),
        ]
Example no. 46
def getDM_score_joint(kb_entities, kb_relations, neg_samples_kb, relations, opts):
    neg_samples   = opts.neg_samples
    vect_dim      = opts.vect_dim
    num_entities  = opts.num_entities
    num_relations = opts.num_relations
    l2_reg_entities = opts.l2_entity    
    # +1 for the OOV embedding.
    entities  = Embedding(output_dim=vect_dim, input_dim=num_entities+1, init='normal',name = 'entity_embeddings_DM', W_regularizer=l2(l2_reg_entities))

    entity_vectors = entities(kb_entities)
    entity_negative_vectors = entities(neg_samples_kb)
    relation_vectors = Flatten()(relations(kb_relations))


    get_cross_1 = get_cross(0, neg_samples)
    e1_cross_e2_prime = merge([entity_vectors, entity_negative_vectors], mode = get_cross_1, output_shape = (neg_samples, vect_dim))
    e1_cross_e2 = Lambda(cross_e1_e2, output_shape = (vect_dim,))(entity_vectors)

    score_DM = merge([relation_vectors, e1_cross_e2], mode = lambda X : T.batched_dot(X[0], X[1]), output_shape=())
    score_DM_e2_corrupted = merge([relation_vectors, e1_cross_e2_prime], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2))

 
    if opts.add_loss:
        get_cross_2 = get_cross(1, neg_samples)
        e1_prime_cross_e2 = merge([entity_vectors, entity_negative_vectors], mode = get_cross_2, output_shape = (neg_samples, vect_dim))
        score_DM_e1_corrupted = merge([relation_vectors, e1_prime_cross_e2], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2))

    else:
        score_DM_e1_corrupted = None


    return score_DM, score_DM_e1_corrupted, score_DM_e2_corrupted
Example no. 47
 def apply(self, idx, inp):
     '''
     :param idx: vector of indices, one per sample
     :param inp: matrix (nb_samples, dims)
     :return:
     '''
     return T.batched_dot(inp, self.W[idx-self.idxoffset])
Example no. 48
 def output_func(self, input):
     # P(Y|X) = softmax(W.X + b)
     q, a = input[0], input[1]
     # dot = T.batched_dot(q, T.batched_dot(a, self.W))
     dot = T.batched_dot(q, T.dot(a, self.W.T))
     out = T.concatenate([dot.dimshuffle(0, 'x'), q, a], axis=1)
     return out
Example no. 49
    def output_func(self, input):
        # P(Y|X) = softmax(W.X + b)
        q, a = input[0], input[1]

        # dot = T.batched_dot(q, T.batched_dot(a, self.W))
        out = T.batched_dot(q, T.dot(a, self.W.T)).dimshuffle(0, 'x')
        return out
Example no. 50
 def __call__(self, inputs, mask, h, encoder_outputs):
     """
     decoder using gru layer
     :param inputs: input word indices, (batch_size, 1)
     :param mask: mask for inputs, (batch_size, 1)
     :param h: final state, (batch_size, hidden_size)
     :param encoder_outputs: output of encoder, (batch_size, max_length, hidden_size)
     :return:
     """
     embedded = self.embedding[inputs.flatten()].reshape(
         (-1, self.hidden_size))  # batch*hidden_size
     attn_weights = T.nnet.softmax(
         self.linear_func(
             T.concatenate([embedded, h], 1), self.attn_W,
             self.attn_b))  # batch*(hidden_size*2)-> batch * max_length
     attn_weights = attn_weights.reshape((-1, 1, self.max_length))
     attn_applied = T.batched_dot(
         attn_weights, encoder_outputs
     )  # batch*1*max_length   *   batch*max_length*hidden_size -> batch*1*hidden_size
     output = T.concatenate([embedded, attn_applied[:, 0, :]],
                            1)  # b*(hidden_size*2)
     output = self.linear_func(output, self.attn_combine_W,
                               self.attn_combine_b)  # b*hidden_size
     output = output.reshape((-1, 1, self.hidden_size))
     for i in xrange(self.num_layers):
         output = ReLU(output)
         output, h = self.gru_layer(output, mask, h)
     output = T.tensordot(output, self.linear, axes=[2, 0])
     return output, h, attn_weights  # b*1*vocab_size(unscaled), b*hidden_size, b*max_length
Example no. 51
 def get_output_for(self, inputs, **kwargs):
     M = inputs[0]
     u = inputs[1]
     output = T.batched_dot(M, u)
     if self.nonlinearity is not None:
         output = self.nonlinearity(output)
     return output
Example no. 52
def mem_focus(memory, key, strength):
    """
        mem_focus(memory, key, strength) -> weighting (batchsize x M)

        produces a weighting over memory positions based on a key

        @param memory: a batchsize x N x M 3-tensor
        @param key: a batchsize x 1 x N 3-tensor. mem_focus() is expected to output a weighting for each batch element
        @param strength: a batchsize x 1 matrix, sharpens the weighting 
    """

    # dot -> batchsize x 1 x M
    dot = T.batched_dot(key, memory)

    # memory_magnitude -> batchsize x M 
    memory_magnitude = T.sqrt(T.sum(memory ** 2, axis = 1))

    # key_magnitude -> batchsize x 1 (broadcastable along axis 1)
    key_magnitude = T.addbroadcast(T.sqrt(T.sum(key ** 2, axis = 2)), 1)

    # multiplied_magnitude -> batchsize x 1 x M
    multiplied_magnitude = (memory_magnitude * key_magnitude).dimshuffle([0, 'x', 1])

    # cosine_similarity -> batchsize x 1 x M
    cosine_similarity = dot/(multiplied_magnitude + SMALL_CONSTANT)

    # strengthened_cosine_similarity -> batchsize x 1 x M
    strengthened_cosine_similarity = cosine_similarity * strength.dimshuffle([0, 1, 'x']) 

    # weighting -> batchsize x M
    weighting = T.nnet.softmax(T.flatten(strengthened_cosine_similarity, outdim = 2))

    return weighting
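
# Added sketch (not from the original source): the content-based addressing above in
# NumPy, with assumed sizes batchsize=2, N=4, M=5; softmax is written out explicitly.
import numpy as np

SMALL_CONSTANT = 1e-6
batchsize, N, M = 2, 4, 5
memory = np.random.rand(batchsize, N, M)
key = np.random.rand(batchsize, 1, N)
strength = np.random.rand(batchsize, 1)

dot = np.einsum('bin,bnm->bim', key, memory)              # batched_dot(key, memory)
mem_mag = np.sqrt((memory ** 2).sum(axis=1))              # (batchsize, M)
key_mag = np.sqrt((key ** 2).sum(axis=2))                 # (batchsize, 1)
cos = dot / ((mem_mag * key_mag)[:, None, :] + SMALL_CONSTANT)
scores = (cos * strength[:, :, None]).reshape(batchsize, M)
e = np.exp(scores - scores.max(axis=1, keepdims=True))
weighting = e / e.sum(axis=1, keepdims=True)              # (batchsize, M)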
Example no. 53
    def __init__(self, x, y, l, window, opt, lr, init_emb, dim_emb, dim_hidden, n_vocab, L2_reg, unit,
                 sim='cos', n_layers=1, activation=tanh):
        self.tr_inputs = [x, y, l]
        self.pr_inputs = [x, y, l]

        self.x = x  # 1D: batch_size * l * 2, 2D: window; elem=word_id
        self.y = y  # 1D: batch_size; elem=label
        self.l = l  # scalar: elem=sentence length

        batch_size = y.shape[0]
        n_cands = x.shape[0] / batch_size / l

        self.pad = build_shared_zeros((1, dim_emb))
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb))
        else:
            self.emb = theano.shared(init_emb)
        self.E = T.concatenate([self.pad, self.emb], 0)
        self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden))
        self.params = [self.emb, self.W_out]

        """ Input Layer """
        e = self.E[x]  # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb
        x_in = e.reshape((batch_size * n_cands, l, -1))

        """ Intermediate Layer """
        # h: 1D: n_batch * n_cands, 2D: dim_emb
        h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers, activation)
        self.params.extend(params)

        """ Output Layer """
        h = h.reshape((batch_size, n_cands, -1))
        h_1 = h[T.arange(batch_size), 0]
        h_2 = h[T.arange(batch_size), 1:]
        if sim == 'cos':
            y_score = cosign_similarity(h_1, h_2)
        else:
            y_score = T.batched_dot(T.dot(h_1, self.W_out), h_2.dimshuffle(0, 2, 1))
        y_score_hat = T.max(y_score, 1)

        """ Objective Function """
        self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size), y])
        self.L2_sqr = regularization(self.params)
        self.cost = self.nll + L2_reg * self.L2_sqr / 2.

        """ Optimization """
        if opt == 'adagrad':
            self.update = ada_grad(cost=self.cost, params=self.params, lr=lr)
        elif opt == 'ada_delta':
            self.update = ada_delta(cost=self.cost, params=self.params)
        elif opt == 'adam':
            self.update = adam(cost=self.cost, params=self.params, lr=lr)
        else:
            self.update = sgd(cost=self.cost, params=self.params, lr=lr)

        """ Predicts """
        y_hat = T.argmax(y_score, 1)

        """ Check Accuracies """
        self.correct = T.eq(y_hat, y)
Example no. 54
 def get_output_for(self, inputs, **kwargs):
     M = inputs[0]
     u = inputs[1]
     output = T.batched_dot(M, u)
     if self.nonlinearity is not None:
         output = self.nonlinearity(output)
     return output
Example no. 55
 def get_output_for(self, inputs, **kwargs):
     #input[0]:(BS,max_senlen,emb_size),input[1]:(BS,1,emb_size),input[2]:(BS,max_sentlen)
     # activation0=(T.dot(inputs[0],self.W_h)).reshape([self.batch_size,self.max_sentlen])+self.b_h.repeat(self.batch_size,0).repeat(self.max_sentlen,1)
     # activation1=T.dot(inputs[1],self.W_q).reshape([self.batch_size]).dimshuffle(0,'x')
     # activation2=T.batched_dot(T.dot(inputs[0],self.W_o),inputs[1].reshape([self.batch_size,self.embedding_size,1])).reshape([self.batch_size,self.max_sentlen])
     activation2=T.batched_dot(inputs[0],inputs[1].reshape([self.batch_size,self.embedding_size,1])).reshape([self.batch_size,self.max_sentlen])
     norm2=T.sqrt(T.sum(T.mul(inputs[0],inputs[0]),axis=2))+0.0000001
     activation2=activation2/norm2
     # activation=(self.nonlinearity(activation0)+self.nonlinearity(activation1)+activation2).reshape([self.batch_size,self.max_sentlen])#.dimshuffle(0,'x',2)#.repeat(self.max_sentlen,axis=1)
     activation2=(activation2).reshape([self.batch_size,self.max_sentlen])#.dimshuffle(0,'x',2)#.repeat(self.max_sentlen,axis=1)
     # final=T.dot(activation,self.W_o) #(BS,max_sentlen)
     activation3=T.batched_dot(inputs[0],inputs[1].reshape([self.batch_size,self.embedding_size,1])).reshape([self.batch_size,self.max_sentlen])
     # if inputs[2] is not None:
     #     final=inputs[2]*final-(1-inputs[2])*1000000
     alpha=lasagne.nonlinearities.softmax(activation2) #(BS,max_sentlen)
     return alpha
Example no. 56
 def forward(self):
     z = self.z0  # sxd
     u = self.u_   # d
     w = self.w_   # d
     b = self.b   # .
     h = self.h   # f
     # h(sxd \dot d + .)  = s
     if not self.batched:
         hwz = h(z.dot(w) + b)  # s
         # sxd + (s \outer d) = sxd
         z1 = z + tt.outer(hwz,  u)  # sxd
         return z1
     else:
         z = z.swapaxes(0, 1)
         # z bxsxd
         # u bxd
         # w bxd
         b = b.dimshuffle(0, 'x')
         # b bx-
         hwz = h(tt.batched_dot(z, w) + b)  # bxs
         # bxsxd + (bxsx- * bx-xd) = bxsxd
         hwz = hwz.dimshuffle(0, 1, 'x')  # bxsx-
         u = u.dimshuffle(0, 'x', 1)  # bx-xd
         z1 = z + hwz * u  # bxsxd
         return z1.swapaxes(0, 1)  # sxbxd
Example no. 57
    def tf_update_state_batch(self, t_state_mat, t_obs_mat, t_act_mat):  
        t_ofeat_mat = self._f_obs(t_obs_mat)
        t_afeat_mat = self._f_act(t_act_mat)
        
        K = self._feat_dim          
        N = t_state_mat.shape[0]
        
        # Obtain extended state
        UU_efa = self._t_UU_efa                  
        C_ex = T.reshape(T.dot(t_state_mat, self._t_W_s2ex),(N, K.exfut_obs, K.exfut_act))
        C_ex.name='tf_update_state::C_ex'                 
        
        # Condition on action
        B = T.reshape(T.dot(t_afeat_mat, UU_efa.T), (N, K.fut_act, K.exfut_act)).transpose(0,2,1)
        B.name = 'tf_update_state::B'          
        #import pdb; pdb.set_trace()
        C_efo_fa = T.batched_dot(C_ex, B)
        C_efo_fa.name='tf_update_state::C_efo_fa'                
        
        # Obtain v = C_oo\o_feat                                
        C_oo_prj = T.batched_dot(T.reshape(T.dot(t_state_mat,self._t_W_s2oo), (N, K.oo, K.act)), t_afeat_mat)
        C_oo_prj.name = 'tf_update_state::Cooprj'
        C_oo = T.reshape(T.dot(C_oo_prj, self._t_U_oo.T), (N, K.obs, K.obs))
        C_oo.name='tf_update_state::C_oo'
                
        v = self._solve_batch(C_oo, t_ofeat_mat, self._lambda['filter'])                        
        v.name = 'tf_update_state::v'

        # Multiply by v to condition on observation
        UU = self._t_UU_efo
        vproj = T.dot(v, UU)
        vproj.name ='tf_update_state::vproj'
        A = T.reshape(vproj,(N, K.exfut_obs, K.fut_obs)).transpose(0,2,1)
        
        A.name = 'tf_update_state::A'  
        ss = T.batched_dot(A, C_efo_fa).reshape([N,-1])        
        ss.name = 'tf_update_state::ss_Cefodot'                                   
        ss = T.dot(ss, self._t_UT_st.T)
        ss.name = 'tf_update_state::Uss_dot'
        ss = self._norm_method(ss)
        ss = self._smooth(ss, t_state_mat)
        
        self._dbg_batch = lambda : None
        self._dbg_batch.out = C_ex, C_oo, B, A, ss

        # Adding the sum of parameters fixes a Theano bug.
        return ss + sum(T.sum(p)*1e-30 for p in self.params)
Example no. 58
 def batched_batched_dot(s):
     """ from (x,y,z)-shaped pair, produce (x,y)-shaped pair that replaces the z-vector pairs by their dot-products """
     import theano
     import theano.tensor as T
     return theano.scan(fn=lambda xm, ym: T.batched_dot(xm, ym),
                        outputs_info=None,
                        sequences=s,
                        non_sequences=None)[0]
Example no. 59
 def _normalize_attention((att, mat)):
     if transpose:
         att = att.dimshuffle((0, 2, 1))
     # 3d softmax
     e = K.exp(att - K.max(att, axis=-1, keepdims=True))
     s = K.sum(e, axis=-1, keepdims=True)
     sm_att = e / s
     return T.batched_dot(sm_att, mat)