Example no. 1
  def step(self, i_t, x_t, z_t, att_p, y_p, c_p, *other_args):
    # See Unit.scan() for seqs.
    # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
    other_outputs = []
    #att_p = theano.printing.Print('att in lstms', attrs=['__str__'])(att_p)
    if self.recurrent_transform:
      state_vars = other_args[:len(self.recurrent_transform.state_vars)]
      self.recurrent_transform.set_sorted_state_vars(state_vars)
      z_r, r_updates = self.recurrent_transform.step(y_p)
      z_t += z_r
      for v in self.recurrent_transform.get_sorted_state_vars():
        other_outputs += [r_updates[v]]
    maxatt = att_p.repeat(z_t.shape[1]).reshape((z_t.shape[0],z_t.shape[1]))#.dimshuffle(1,0)
    #maxatt = theano.printing.Print('maxatt',attrs=['__str__','shape'])(maxatt)
    z_t = T.switch(maxatt>0,z_t,z_t + T.dot(y_p, self.W_re))
    #z_t += T.dot(y_p, self.W_re)
    #z_t = theano.printing.Print('z_t lstms',attrs=['shape'])(z_t)

    partition = z_t.shape[1] // 4
    ingate = T.nnet.sigmoid(z_t[:,:partition])
    forgetgate = ((T.nnet.sigmoid(z_t[:,partition:2*partition])).T * (1.-att_p)).T
    outgate = T.nnet.sigmoid(z_t[:,2*partition:3*partition])
    input = T.tanh(z_t[:,3*partition:4*partition])
    #c_t = ((forgetgate * c_p + ingate * input).T * (1.-T.max(att_p,axis=-1))).T
    c_t = forgetgate * c_p + ingate * input
    y_t = outgate * T.tanh(c_t)
    i_output = T.outer(i_t, self.o_output)
    i_h = T.outer(i_t, self.o_h)
    # return: next outputs (# unit.n_act, y_t, c_t, ...)
    return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
Example no. 2
 def step(self, i_t, x_t, z_t, y_p, c_p, *other_args):
   # See Unit.scan() for seqs.
   # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
   other_outputs = []
   if self.recurrent_transform:
     state_vars = other_args[:len(self.recurrent_transform.state_vars)]
     self.recurrent_transform.set_sorted_state_vars(state_vars)
     z_r, r_updates = self.recurrent_transform.step(y_p)
     z_t += z_r
     for v in self.recurrent_transform.get_sorted_state_vars():
       other_outputs += [r_updates[v]]
   z_t += T.dot(y_p, self.W_re)
   partition = z_t.shape[1] // 4 #number of units
   forgetgate = T.nnet.sigmoid(z_t[:,:partition])
   propgate = T.nnet.sigmoid(z_t[:,partition:2*partition])
   diffgate = T.nnet.sigmoid(z_t[:,2*partition:3*partition])
   input = T.tanh(z_t[:,3*partition:4*partition])
   # c(t) = (1 - FG(t)) * IN(t) + FG(t) * c(t-1)
   c_t = (1-forgetgate) * input + forgetgate * c_p
   # y(t) = tanh( PG(t) * c(t) + DG(t) * ( c(t) - c(t-1)) ) HINT: the additional nonlinearity may not have a significant effect
   y_t = T.tanh(propgate * c_t + diffgate * ( c_t - c_p))
   i_output = T.outer(i_t, self.o_output)
   i_h = T.outer(i_t, self.o_h)
   # return: next outputs (# unit.n_act, y_t, c_t, ...)
   return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
Example no. 3
 def full(self, X, Xs=None):
     X, Xs = self._slice(X, Xs)
     scf_x = self.scaling_func(X, self.args)
     if Xs is None:
         return tt.outer(scf_x, scf_x) * self.cov_func(X)
     else:
         scf_xs = self.scaling_func(Xs, self.args)
         return tt.outer(scf_x, scf_xs) * self.cov_func(X, Xs)
Example no. 4
 def contrastive_divergence_1(self, v1):
     '''Determine the weight updates according to CD-1'''
     h1 = self.sample_h_given_v(v1)
     v2 = self.sample_v_given_h(h1)
     h2p = self.propup(v2)
     return (T.outer(v1, h1) - T.outer(v2, h2p),
             v1 - v2,
             h1 - h2p)
Example no. 5
def image_step_val(Imat, htm1mat, ctm1mat, 
                   Wcnn, Wxi, Whi, bi, Wxf, Whf, bf, 
                   Wxc, Whc, bc, Wxo, Who, bo, Why, by, forbatch):
    xtmat = theano.dot(Imat, Wcnn)
    itmat = sigma(theano.dot(xtmat,Wxi) + theano.dot(htm1mat,Whi) + T.outer(forbatch,bi) )
    ftmat = sigma(theano.dot(xtmat,Wxf) + theano.dot(htm1mat,Whf) + T.outer(forbatch,bf) )
    ctmat = ftmat * ctm1mat + itmat*act(theano.dot(xtmat,Wxc)+theano.dot(htm1mat,Whc)+T.outer(forbatch,bc) )
    otmat = sigma(theano.dot(xtmat,Wxo) + theano.dot(htm1mat,Who) + T.outer(forbatch,bo) )
    htmat = otmat * act(ctmat)
#    yt = T.concatenate([addzero,tempyt],axis=0)
    return htmat, ctmat    
Example no. 6
def psb(inverse_hessian, weight_delta, gradient_delta, **options):
    gradient_delta_t = gradient_delta.T
    param = weight_delta - inverse_hessian.dot(gradient_delta)

    devider = (1. / T.dot(gradient_delta, gradient_delta))
    param1 = T.outer(param, gradient_delta) + T.outer(gradient_delta, param)
    param2 = (
        T.dot(gradient_delta, param) *
        T.outer(gradient_delta, gradient_delta_t)
    )

    return inverse_hessian + param1 * devider - param2 * devider ** 2
Example no. 7
def train():
    train_set, valid_set, test_set = loadData()
    x,y = train_set
    m,n_input = x.shape
    width = 28
    height = 28
    n_hidden = 49
    
    learning_rate = .1
    
    #set up shared variables
    W = theano.shared(numpy.random.uniform(-4 * numpy.sqrt(6. / (n_hidden + n_input)),4 * numpy.sqrt(6. / (n_hidden + n_input)),(n_hidden,width*height)),name="W")
    b_v = theano.shared(numpy.zeros((width*height,)),name="b_v")
    b_h = theano.shared(numpy.zeros((n_hidden,)),name="b_h")
    
    theano_rng = T.shared_randomstreams.RandomStreams(numpy.random.randint(2 ** 30))
    
    v_input = T.fvector("v_input")
    
    #1. sample hidden units
    h_prob = T.nnet.sigmoid(T.dot(v_input,W.T)+b_h)
    h_sample = theano_rng.binomial(size=(n_hidden,), n=1, p=h_prob)
    #2. calculate positive gradient
    g_p = T.outer(v_input,h_sample)
    #3. make reconstruction
    v_prob_reconstruction = T.nnet.sigmoid(T.dot(h_sample,W)+b_v)
    v_reconstruction = theano_rng.binomial(size=(n_input,), n=1, p=v_prob_reconstruction)
    h_prob_reconstruction = T.nnet.sigmoid(T.dot(v_reconstruction,W.T)+b_h)
    h_reconstruction = theano_rng.binomial(size=(n_hidden,), n=1, p=h_prob_reconstruction)
    #4. calculate negative gradient
    g_n = T.outer(v_reconstruction,h_reconstruction)
    #FUNCTIONS FOR TESTING
    #f_h_prob = theano.function(inputs=[v_input,],outputs=[h_prob,])
    #f_h_sample = theano.function(inputs=[v_input,],outputs=[h_sample,])
    #f_g_p = theano.function(inputs=[v_input,],outputs=[g_p,])
    #f_v_prob_reconstruction = theano.function(inputs=[v_input,],outputs=[v_prob_reconstruction,])
    #f_v_reconstruction = theano.function(inputs=[v_input,],outputs=[v_reconstruction,])
    #f_h_prob_reconstruction = theano.function(inputs=[v_input,],outputs=[h_prob_reconstruction,])
    #f_h_reconstruction = theano.function(inputs=[v_input,],outputs=[h_reconstruction,])
    #f_g_n = theano.function(inputs=[v_input,],outputs=[g_n,])
    
    learn = theano.function(inputs=[v_input,],updates=[(W,W+learning_rate*(g_p-g_n).T)])
    
    for i in range(300001):
        if i > 0:
            if i%10000 == 0:
                print "Epcoh: ",i
                display_weights(W,width,height,i)
        learn(x[i%m,:])
    
    with open('weights.pkl', 'wb') as output:
        pickle.dump(W.get_value(), output, pickle.HIGHEST_PROTOCOL)
Example no. 8
        def times_reflection(input, n_hidden, reflection):
            input_re = input[:, :n_hidden]
            input_im = input[:, n_hidden:]
            reflect_re = reflection[n_hidden:]
            reflect_im = reflection[:n_hidden]
            
            vstarv = (reflect_re**2 + reflect_im**2).sum()
            input_re_reflect = input_re - 2 / vstarv * (T.outer(T.dot(input_re, reflect_re), reflect_re) +
                                                        T.outer(T.dot(input_im, reflect_im), reflect_im))
            input_im_reflect = input_im - 2 / vstarv * (-T.outer(T.dot(input_re, reflect_im), reflect_im) +
                                                        T.outer(T.dot(input_im, reflect_re), reflect_re))

            return T.concatenate([input_re_reflect, input_im_reflect], axis=1)      
Example no. 9
 def _step(x_t, i_t, c_tm1, y_tm1):
   #z_t = T.dot(x_t, W) + T.dot(y_tm1, V_h) + b
   z_t = x_t + T.dot(y_tm1, V_h)
   partition = z_t.shape[1] // 4
   ingate = T.nnet.sigmoid(z_t[:,:partition])
   forgetgate = T.nnet.sigmoid(z_t[:,partition:2*partition])
   outgate = T.nnet.sigmoid(z_t[:,2*partition:3*partition])
   input = T.tanh(z_t[:,3*partition:4*partition])
   c_t = forgetgate * c_tm1 + ingate * input
   y_t = outgate * T.tanh(c_t)
   i_output = T.outer(i_t, o_output)
   i_h = T.outer(i_t, o_h)
   return c_t * i_h + c_tm1 * (1 - i_h), y_t * i_output
Example no. 10
    def learningstep_m1(self, Y, L, M, W, epsilon):
        """Perform a single learning step.

        This is a faster learning step for the case of
        mini-batch-size = 1.

        Keyword arguments:
        the keyword arguments must be the same as given in
        self.input_parameters(mode) for mode='train'.
        """
        # Input integration:
        I = T.dot(T.log(W),Y)
        # recurrent term:
        vM = theano.ifelse.ifelse(
            T.eq(L,-1), # if no label is provided
            T.sum(M, axis=0),
            M[L,:]
            )
        # numeric trick to prevent overflow in the exp-function:
        max_exponent = 88. - T.log(I.shape[0]).astype('float32')
        scale = theano.ifelse.ifelse(T.gt(I[T.argmax(I)], max_exponent),
            I[T.argmax(I)] - max_exponent, 0.)
        # activation: recurrent softmax with overflow protection
        s = vM*T.exp(I-scale)/T.sum(vM*T.exp(I-scale))
        s.name = 's_%d.%d[t]'%(self._nmultilayer,self._nlayer)
        # weight update
        W_new = W + epsilon*(T.outer(s,Y) - s[:,np.newaxis]*W)
        W_new.name = 'W_%d.%d[t]'%(self._nmultilayer,self._nlayer)
        return s, W_new
Example no. 11
 def one_iter(W_i, V_i, b_i, a, v_lt_i, p_lt_i, log_likelihood):
     h_i = self.sigmoid(a)
     p_i = self.sigmoid(T.dot(h_i, V_i) + b_i)
     v_i = 1. * (theano_rng.uniform([num_samples]) <= p_i)
     log_likelihood += v_i * T.log(p_i) + (1 - v_i) * T.log(1 - p_i)
     a += T.outer(v_i, W_i)
     return a, v_i, p_i, log_likelihood
Example no. 12
    def free_energy(self, visible):
        """Computes the free energy of the model.

        :type visible: theano.tensor.TensorType
        :param visible: The state of the visible units (either 1/0, or mean -
            not important).
        
        :rtype: theano.tensor.var.TensorVariable
        :returns: The free energy of the model, given the visible activation.
            Computed as 

            .. math::
               :label: free_energy

                \mathcal{F}(x) = - \log \sum_h e^{-E(x,h)}
        """
        print 'Running free energy.'

        D = TT.sum(visible, axis=1)
        exponent_term = TT.dot(visible, self.W) + TT.outer(D, self.b_hidden)
                        # TT.outer(D, self.b_hidden): D is the document length,
                        # by which the hidden bias b_hidden is scaled (see below).

        hidden_term = TT.sum(TT.log(1 + TT.exp(exponent_term)), axis=1)

        # This is the other and more crucial difference between an RBM and a
        #  RSM: multiplying the hidden bias by the "document length".
        b_visible_term = TT.dot(visible, self.b_visible)

        free_energy = - hidden_term - b_visible_term
        return free_energy
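For reference, the quantity the method above computes, written out directly from the code (a sketch, with D_i = \sum_k v_{ik} the length of document i):

    \mathcal{F}(v_i) = - v_i^\top b_{\mathrm{visible}} - \sum_j \log\bigl(1 + e^{(v_i W)_j + D_i\, b_{\mathrm{hidden},j}}\bigr)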
Example no. 13
    def compute_probabilistic_matrix(self,X, y, num_cases, k=5):

        z       = T.dot(X, self.A) #Transform x into z space 
        dists   = T.sqr(dist2hy(z,z))
        dists   = T.extra_ops.fill_diagonal(dists, T.max(dists)+1)
        nv      = T.min(dists,axis=1) # value of nearest neighbour 
        dists   = (dists.T - nv).T
        d       = T.extra_ops.fill_diagonal(dists, 0)
   
        #Take only k nearest 
        num     = T.zeros((num_cases, self.num_classes))
        denom   = T.zeros((num_cases,))
        for c_i in xrange(self.num_classes):

            #Mask for class i
            mask_i = T.eq(T.outer(T.ones_like(y),y),c_i)

            #K nearest neighbour within a class i 
            dim_ci = T.sum(mask_i[0])
            d_c_i = T.reshape(d[mask_i.nonzero()],(num_cases,dim_ci))
            k_indice = T.argsort(d_c_i, axis=1)[:,0:k]
            
            kd = T.zeros((num_cases,k))
            for it in xrange(k):
                kd = T.set_subtensor(kd[:,it], d_c_i[T.arange(num_cases),k_indice[:,it]]) 

            #Numerator
            value   = T.exp(-T.mean(kd,axis=1))
            num     = T.set_subtensor(num[:,c_i], value) 
            denom   += value 
            

        p = num / denom.dimshuffle(0,'x')    #prob that point i will be correctly classified    
        return p
Example no. 14
def kron(a, b):
    """ Kronecker product

    Same as scipy.linalg.kron(a, b).

    :note: numpy.kron(a, b) != scipy.linalg.kron(a, b)!
        They don't have the same shape and order when
        a.ndim != b.ndim != 2.

    :param a: array_like
    :param b: array_like
    :return: array_like with a.ndim + b.ndim - 2 dimensions.

    """
    a = tensor.as_tensor_variable(a)
    b = tensor.as_tensor_variable(b)
    if (a.ndim + b.ndim <= 2):
        raise TypeError('kron: inputs dimensions must sum to 3 or more. '
                        'You passed %d and %d.' % (a.ndim, b.ndim))
    o = tensor.outer(a, b)
    o = o.reshape(tensor.concatenate((a.shape, b.shape)),
                  a.ndim + b.ndim)
    shf = o.dimshuffle(0, 2, 1, * range(3, o.ndim))
    if shf.ndim == 3:
        shf = o.dimshuffle(1, 0, 2)
        o = shf.flatten()
    else:
        o = shf.reshape((o.shape[0] * o.shape[2],
                         o.shape[1] * o.shape[3]) +
                        tuple([o.shape[i] for i in range(4, o.ndim)]))
    return o
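A minimal usage sketch for the kron above (assumptions: Theano and SciPy are installed and this kron is importable); for 2-D inputs it should agree with scipy.linalg.kron:

import numpy as np
import scipy.linalg
import theano
from theano import tensor

a = tensor.matrix('a')
b = tensor.matrix('b')
kron_fn = theano.function([a, b], kron(a, b))

a_val = np.arange(6.).reshape(2, 3).astype(theano.config.floatX)
b_val = np.arange(4.).reshape(2, 2).astype(theano.config.floatX)
# For matrix inputs the symbolic result matches scipy.linalg.kron.
assert np.allclose(kron_fn(a_val, b_val), scipy.linalg.kron(a_val, b_val))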
Example no. 15
 def __init__(self, C, D, use_unlabeled):
     self.W = theano.shared(np.ones((C,D), dtype='float32'))
     t_eps = T.scalar('epsilon', dtype='float32')
     t_Y = T.vector('Y', dtype='float32')
     t_s = T.vector('s', dtype='float32')
     self.activation_unlabeled = theano.function(
         [t_Y],
         T.sum(t_Y*self.W/T.sum(self.W, axis=0), axis=1),
         allow_input_downcast=True
         )
     self.activation_normalization = theano.function(
         [t_s],
         t_s/T.sum(t_s),
         allow_input_downcast=True
         )
     self.weight_update = theano.function(
         [t_Y,t_s,t_eps],
         self.W,
         updates={
             self.W:
             self.W + t_eps*(T.outer(t_s,t_Y) - t_s[:,np.newaxis]*self.W)
             },
         allow_input_downcast=True
         )
     self.epsilon = None
     self._Y = None
     self._s = None
     self._delta = np.eye(C, dtype='float32')
     self._C = C
     self._use_unlabeled = use_unlabeled
     self._skipupdate = False
Example no. 16
def compute_psi1(lls, lsf, xmean, xvar, z):

    if xmean.ndim == 1:
        xmean = xmean[ None, : ]

    ls = T.exp(lls)
    sf = T.exp(lsf)
    lspxvar = ls + xvar
    constterm1 = ls / lspxvar
    constterm2 = T.prod(T.sqrt(constterm1), 1)
    r2_psi1 = T.outer(T.sum(xmean * xmean / lspxvar, 1), T.ones_like(z[ : , 0 : 1 ])) \
        - np.float32(2) * T.dot(xmean / lspxvar, T.transpose(z)) + \
        T.dot(np.float32(1.0) / lspxvar, T.transpose(z)**2)
    psi1 = sf * T.outer(constterm2, T.ones_like(z[ : , 0 : 1 ])) * T.exp(-np.float32(0.5) * r2_psi1)

    return psi1
Example no. 17
 def forward(self):
     z = self.z0  # sxd
     u = self.u_   # d
     w = self.w_   # d
     b = self.b   # .
     h = self.h   # f
     # h(sxd \dot d + .)  = s
     if not self.batched:
         hwz = h(z.dot(w) + b)  # s
         # sxd + (s \outer d) = sxd
         z1 = z + tt.outer(hwz,  u)  # sxd
         return z1
     else:
         z = z.swapaxes(0, 1)
         # z bxsxd
         # u bxd
         # w bxd
         b = b.dimshuffle(0, 'x')
         # b bx-
         hwz = h(tt.batched_dot(z, w) + b)  # bxs
         # bxsxd + (bxsx- * bx-xd) = bxsxd
         hwz = hwz.dimshuffle(0, 1, 'x')  # bxsx-
         u = u.dimshuffle(0, 'x', 1)  # bx-xd
         z1 = z + hwz * u  # bxsxd
         return z1.swapaxes(0, 1)  # sxbxd
Example no. 18
 def __init__(self, C, D):
     self.W = theano.shared(np.ones((C,D), dtype='float32'))
     t_M = T.matrix('M', dtype='float32')
     t_vM = T.vector('M', dtype='float32')
     t_Y = T.vector('Y', dtype='float32')
     t_I = T.vector('I', dtype='float32')
     t_s = T.vector('s', dtype='float32')
     t_eps = T.scalar('epsilon', dtype='float32')
     self.input_integration = theano.function(
         [t_Y],
         T.dot(T.log(self.W),t_Y),
         allow_input_downcast=True
         )
     self.M_summation = theano.function(
         [t_M],
         T.sum(t_M, axis=0),
         allow_input_downcast=True
         )
     self.recurrent_softmax = theano.function(
         [t_I,t_vM],
         t_vM*T.exp(t_I)/T.sum(t_vM*T.exp(t_I)),
         allow_input_downcast=True
         )
     self.weight_update = theano.function(
         [t_Y,t_s,t_eps],
         self.W,
         updates={
             self.W:
             self.W + t_eps*(T.outer(t_s,t_Y) - t_s[:,np.newaxis]*self.W)
             },
         allow_input_downcast=True
         )
     self.epsilon = None
     self._Y = None
     self._s = None
Example no. 19
 def one_iter(Wi, Vi, bi, rand_i, a, vis_i, post):
     hid  = self.sigmoid(a)
     pi   = self.sigmoid(T.dot(hid, Vi) + bi)
     vis_i = T.cast(rand_i <= pi, floatX)
     post  = post + T.log(pi*vis_i + (1-pi)*(1-vis_i))
     a     = a + T.outer(vis_i, Wi)
     return a, vis_i, post
Example no. 20
def get_square_norm_gradients_scan(D_by_layer, cost, accum = 0):

    # This returns a theano variable that will be of shape (minibatch_size, ).
    # It will contain, for each training example, the associated square-norm of the total gradient.
    # If you take the element-wise square-root afterwards, you will get
    # the associated 2-norms, which is what you want for importance sampling.

    for (layer_name, D) in D_by_layer.items():

        backprop_output = tensor.grad(cost, D['output'])

        if D.has_key('weight'):
            A = D['input']
            B = backprop_output
            S, _ =  theano.scan(fn=lambda A, B: tensor.sqr(tensor.outer(A,B)).sum(),
                                        sequences=[A,B])
            accum = accum + S

        if D.has_key('bias'):

            B = backprop_output
            S, _ =  theano.scan(fn=lambda B: tensor.sqr(B).sum(),
                                        sequences=[B])
            accum = accum + S
        
    return accum
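A small usage sketch for the function above (assumptions: Theano is available, the function is importable, and D_by_layer uses the 'input'/'output'/'weight'/'bias' keys the code reads; note the body relies on the Python 2 dict.has_key method):

import numpy as np
import theano
import theano.tensor as tensor

X = tensor.matrix('X')                              # (minibatch_size, n_in)
W = theano.shared(np.random.randn(3, 2).astype(theano.config.floatX), name='W')
b = theano.shared(np.zeros(2, dtype=theano.config.floatX), name='b')
out = tensor.dot(X, W) + b                          # (minibatch_size, n_out)
cost = tensor.sqr(out).sum()

D_by_layer = {'dense0': {'input': X, 'output': out, 'weight': W, 'bias': b}}
sq_norms = get_square_norm_gradients_scan(D_by_layer, cost)
f = theano.function([X], sq_norms)
print(f(np.random.randn(5, 3).astype(theano.config.floatX)))  # one squared gradient norm per example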
Example no. 21
    def grad(self, inputs, output_gradients):
        """
        Reverse-mode gradient updates for matrix solve operation c = A \ b.

        Symbolic expression for updates taken from [1]_.

        References
        ----------
        ..[1] M. B. Giles, "An extended collection of matrix derivative results
          for forward and reverse mode automatic differentiation",
          http://eprints.maths.ox.ac.uk/1079/

        """
        A, b = inputs
        c = self(A, b)
        c_bar = output_gradients[0]
        trans_map = {
            'lower_triangular': 'upper_triangular',
            'upper_triangular': 'lower_triangular'
        }
        trans_solve_op = Solve(
            # update A_structure and lower to account for a transpose operation
            A_structure=trans_map.get(self.A_structure, self.A_structure),
            lower=not self.lower
        )
        b_bar = trans_solve_op(A.T, c_bar)
        # force outer product if vector second input
        A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
        if self.A_structure == 'lower_triangular':
            A_bar = tensor.tril(A_bar)
        elif self.A_structure == 'upper_triangular':
            A_bar = tensor.triu(A_bar)
        return [A_bar, b_bar]
Example no. 22
def bfgs(inverse_hessian, weight_delta, gradient_delta, maxrho=1e4):
    ident_matrix = T.eye(inverse_hessian.shape[0])

    maxrho = asfloat(maxrho)
    rho = asfloat(1.) / gradient_delta.dot(weight_delta)

    rho = ifelse(
        T.isinf(rho),
        maxrho * T.sgn(rho),
        rho,
    )

    param1 = ident_matrix - T.outer(weight_delta, gradient_delta) * rho
    param2 = ident_matrix - T.outer(gradient_delta, weight_delta) * rho
    param3 = rho * T.outer(weight_delta, weight_delta)

    return param1.dot(inverse_hessian).dot(param2) + param3
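A quick sanity check of the update above (assumptions: Theano plus the asfloat/ifelse helpers used inside bfgs are importable from the same module); the returned matrix should satisfy the secant condition H_new.dot(y) == s:

import numpy as np
import theano
import theano.tensor as T

H = T.matrix('H')   # current inverse Hessian approximation
s = T.vector('s')   # weight_delta
y = T.vector('y')   # gradient_delta
update = theano.function([H, s, y], bfgs(H, s, y))

floatX = theano.config.floatX
H_val = np.eye(3, dtype=floatX)
s_val = np.array([0.1, -0.2, 0.3], dtype=floatX)
y_val = np.array([0.4, 0.1, -0.5], dtype=floatX)
assert np.allclose(update(H_val, s_val, y_val).dot(y_val), s_val, atol=1e-5)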
Example no. 23
def dfp(inverse_hessian, weight_delta, gradient_delta, maxnum=1e5):
    maxnum = asfloat(maxnum)
    quasi_dot_gradient = inverse_hessian.dot(gradient_delta)

    param1 = (
        T.outer(weight_delta, weight_delta)
    ) / (
        T.dot(gradient_delta, weight_delta)
    )
    param2_numerator = T.clip(
        T.outer(quasi_dot_gradient, gradient_delta) * inverse_hessian,
        -maxnum, maxnum
    )
    param2_denominator = gradient_delta.dot(quasi_dot_gradient)
    param2 = param2_numerator / param2_denominator

    return inverse_hessian + param1 - param2
Example no. 24
 def added_part_f(sen_part = T.matrix("sen_part")):
     inter_sen_part0 = T.zeros_like(T.outer(sen_part[0], sen_part[1]))
     inter_sen_part, updates = theano.scan(fn = inter_accu, \
                                           sequences = dict(input = sen_part, taps = [-1, 0]), \
                                           outputs_info = dict(initial = inter_sen_part0, taps = [-1]))
     added_part =  T.dot(inter_sen_part[-1], 1.0/(sen_part.shape[0]-1))
     added_part = added_part[0:pca_dim]
     new_sen_part = T.concatenate([sen_part, added_part], axis=0)
     return new_sen_part
Example no. 25
 def GradientForOneObject(sample, dream, h_lids, vBias, hBias):
     #T.sum(T.sqr(hBias - self.bm.computeProbabilityHByV(sample, self.W, hBias))) + \
     #T.sum(T.sqr(vBias - sample)) +
     #                     self.bm.cleverAddingToFreeEnergy(sample, self.W, vBias, hBias) +\
     # self.bm.cleverAddingToFreeEnergy(sample, self.W, vBias, hBias) +
     # self.bm.cleverAddingToFreeEnergy(sample, self.W, vBias, hBias) + \
     energy = regularization * self.bm.cleverAddingToFreeEnergy(sample, self.W, vBias, hBias) + \
              self.bm.freeEnergy(sample, self.W, vBias, hBias) - self.bm.freeEnergy(dream, self.W, vBias, hBias)
     #energy = self.bm.
     #energy = T.sum(energy)
     grad = theano.grad(energy, [self.W, vBias, hBias], consider_constant=[sample, dream])
     gradUByW1 = T.outer(grad[1], h_lids)
     gradUByW2 = T.outer(grad[2], h_lids)
     gradUByhBias = grad[2]
     gradUByvBias = grad[1]
     gradUByW = grad[0]
     gradHLid0 = h_lids
     return [energy, gradHLid0, gradUByW1, gradUByW2, gradUByhBias, gradUByvBias, gradUByW]
Example no. 26
def compute_kernel(lls, lsf, x, z):

    ls = T.exp(lls)
    sf = T.exp(lsf)

    if x.ndim == 1:
        x = x[ None, : ]

    if z.ndim == 1:
        z = z[ None, : ]

    lsre = T.outer(T.ones_like(x[ :, 0 ]), ls)

    r2 = T.outer(T.sum(x * x / lsre, 1), T.ones_like(z[ : , 0 : 1 ])) - np.float32(2) * \
        T.dot(x / lsre, T.transpose(z)) + T.dot(np.float32(1.0) / lsre, T.transpose(z)**2)

    k = sf * T.exp(-np.float32(0.5) * r2)

    return k
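A brief check (assuming Theano is available and compute_kernel is importable) that the outer-product expansion above reproduces the ARD squared-exponential kernel computed directly:

import numpy as np
import theano
import theano.tensor as T

lls, lsf = T.vector('lls'), T.scalar('lsf')
x, z = T.matrix('x'), T.matrix('z')
kern = theano.function([lls, lsf, x, z], compute_kernel(lls, lsf, x, z))

floatX = theano.config.floatX
rng = np.random.RandomState(0)
x_val = rng.randn(4, 3).astype(floatX)
z_val = rng.randn(5, 3).astype(floatX)
lls_val = rng.randn(3).astype(floatX)
lsf_val = np.asarray(0.3, dtype=floatX)
diff = x_val[:, None, :] - z_val[None, :, :]
ref = np.exp(lsf_val) * np.exp(-0.5 * (diff ** 2 / np.exp(lls_val)).sum(-1))
assert np.allclose(kern(lls_val, lsf_val, x_val, z_val), ref, atol=1e-4)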
Example no. 27
def bp_mll(pred, target):
    # From : Multi-Label Neural Networks with Applications to
    # Functional Genomics and Text Categorization
    # https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/tkde06a.pdf
    y_i = pred * target
    not_y_i = pred * (1-target)
    matrices, updates = theano.scan(fn=lambda p, t: T.outer(p, t),
                                    sequences=[y_i, not_y_i])
    cost = matrices.sum(axis=(1,2))
    return cost, updates
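A minimal usage sketch (assuming Theano is available and bp_mll is importable): compile the pairwise cost and evaluate it on a tiny batch:

import numpy as np
import theano
import theano.tensor as T

pred = T.matrix('pred')      # model outputs, shape (batch, n_labels)
target = T.matrix('target')  # binary label matrix, same shape
cost, updates = bp_mll(pred, target)
f = theano.function([pred, target], cost, updates=updates)

p = np.array([[0.9, 0.2, 0.7]], dtype=theano.config.floatX)
t = np.array([[1.0, 0.0, 1.0]], dtype=theano.config.floatX)
print(f(p, t))               # one cost value per example in the batch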
Example no. 28
 def lstm(z, i_t, s_p, h_p):
   z += T.dot(h_p, self.N_re)
   i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out))
   ingate = T.nnet.sigmoid(z[:,n_out: 2 * n_out])
   forgetgate = T.nnet.sigmoid(z[:,2 * n_out:3 * n_out])
   outgate = T.nnet.sigmoid(z[:,3 * n_out:])
   input = T.tanh(z[:,:n_out])
   s_t = input * ingate + s_p * forgetgate
   h_t = T.tanh(s_t) * outgate
   return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i
Example no. 29
    def get_weight(self, encOutputs1, encMask1, encOutput2):
        e = T.alloc(1, encOutputs1.shape[0])
        tiledEncOutput2 = T.outer(e, encOutput2.flatten()).reshape([e.shape[0], encOutput2.shape[0], encOutput2.shape[1]])
        attInput = T.concatenate([encOutputs1, tiledEncOutput2], axis=2)

        A = self.h2.get_output(self.h1.get_output(attInput))[:,:,0]
        maskedExpA = T.exp(A) * encMask1
        weight = maskedExpA / T.sum(maskedExpA, axis=0)
        
        weight = self.sharpen(weight) / T.sum(self.sharpen(weight), axis=0) if self.sharpen is not None else weight
        return weight
Example no. 30
 def L_op(self, inputs, outputs, output_gradients):
     # Modified from theano/tensor/slinalg.py
     A, b = inputs
     c = outputs[0]
     c_bar = output_gradients[0]
     # FIXME: triangular structure would use GpuCublasTriangularsolve?
     # no need to handle A_structure like slinalg.py?
     trans_solve_op = GpuCusolverSolve('general')
     b_bar = trans_solve_op(A.T, c_bar)
     A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
     return [A_bar, b_bar]
Example no. 31
    def compute_output(self):

        # We compute the output mean

        self.Kzz = compute_kernel(
            self.lls, self.lsf, self.z,
            self.z) + T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
        self.KzzInv = T.nlinalg.MatrixInversePSD()(self.Kzz)
        LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
        self.covCavityInv = self.KzzInv + LLt * casting(
            self.n_points - self.set_for_training) / casting(self.n_points)
        self.covCavity = T.nlinalg.MatrixInversePSD()(self.covCavityInv)
        self.meanCavity = T.dot(
            self.covCavity,
            casting(self.n_points - self.set_for_training) /
            casting(self.n_points) * self.mParamPost)
        self.KzzInvcovCavity = T.dot(self.KzzInv, self.covCavity)
        self.KzzInvmeanCavity = T.dot(self.KzzInv, self.meanCavity)
        self.covPosteriorInv = self.KzzInv + LLt
        self.covPosterior = T.nlinalg.MatrixInversePSD()(self.covPosteriorInv)
        self.meanPosterior = T.dot(self.covPosterior, self.mParamPost)
        self.Kxz = compute_kernel(self.lls, self.lsf, self.input_means, self.z)
        self.B = T.dot(self.KzzInvcovCavity, self.KzzInv) - self.KzzInv
        v_out = T.exp(self.lsf) + T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                                        T.ones_like(self.z[:, 0:1]))

        if self.ignore_variances:

            self.output_means = T.dot(self.Kxz, self.KzzInvmeanCavity)
            self.output_vars = abs(v_out) + casting(0) * T.sum(self.input_vars)

        else:

            self.EKxz = compute_psi1(self.lls, self.lsf, self.input_means,
                                     self.input_vars, self.z)
            self.output_means = T.dot(self.EKxz, self.KzzInvmeanCavity)

            # In other layers we have to compute the expected variance

            self.B2 = T.outer(T.dot(self.KzzInv, self.meanCavity),
                              T.dot(self.KzzInv, self.meanCavity))

            exact_output_vars = True

            if exact_output_vars:

                # We compute the exact output variance

                self.psi2 = compute_psi2(self.lls, self.lsf, self.z,
                                         self.input_means, self.input_vars)
                ll = T.transpose(self.EKxz[:, None, :] * self.EKxz[:, :, None],
                                 [1, 2, 0])
                kk = T.transpose(self.Kxz[:, None, :] * self.Kxz[:, :, None],
                                 [1, 2, 0])
                v1 = T.transpose(
                    T.sum(T.sum(
                        T.shape_padaxis(self.B2, 2) * (self.psi2 - ll), 0),
                          0,
                          keepdims=True))
                v2 = T.transpose(
                    T.sum(T.sum(
                        T.shape_padaxis(self.B, 2) * (self.psi2 - kk), 0),
                          0,
                          keepdims=True))

            else:

                # We compute the approximate output variance using the unscented kalman filter

                v1 = 0
                v2 = 0

                n = self.input_d
                for j in range(1, n + 1):
                    mask = T.zeros_like(self.input_vars)
                    mask = T.set_subtensor(mask[:, j - 1], 1)
                    inc = mask * T.sqrt(casting(n) * self.input_vars)
                    self.kplus = T.sqrt(
                        casting(1.0) / casting(2 * n)) * compute_kernel(
                            self.lls, self.lsf, self.input_means + inc, self.z)
                    self.kminus = T.sqrt(
                        casting(1.0) / casting(2 * n)) * compute_kernel(
                            self.lls, self.lsf, self.input_means - inc, self.z)

                    v1 += T.dot(self.kplus * T.dot(self.kplus, self.B2),
                                T.ones_like(self.z[:, 0:1]))
                    v1 += T.dot(self.kminus * T.dot(self.kminus, self.B2),
                                T.ones_like(self.z[:, 0:1]))
                    v2 += T.dot(self.kplus * T.dot(self.kplus, self.B),
                                T.ones_like(self.z[:, 0:1]))
                    v2 += T.dot(self.kminus * T.dot(self.kminus, self.B),
                                T.ones_like(self.z[:, 0:1]))

                v1 -= T.dot(self.EKxz * T.dot(self.EKxz, self.B2),
                            T.ones_like(self.z[:, 0:1]))
                v2 -= T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                            T.ones_like(self.z[:, 0:1]))

            self.output_vars = abs(v_out) + abs(v2) + abs(v1)

        self.output_vars = self.output_vars + T.exp(self.lvar_noise)

        return
Example no. 32
def main():

    #Load mastectomy dataset
    df = datasets.get_rdataset('mastectomy', 'HSAUR', cache=True).data
    #Change event to integer
    df.event = df.event.astype(np.int64)
    #Change metastized to integer (1 for yes, 0 for no)
    df.metastized = (df.metastized == 'yes').astype(np.int64)
    #Count the number of patients
    n_patients = df.shape[0]
    #Create array for each individual patient
    patients = np.arange(n_patients)

    #Censoring - we do not observe the death of every subject; some subjects may still be alive at the end of the observation period
    #1 - observation is not censored (death was observed)
    #0 - observation is censored (death was not observed)
    nonCensored = df.event.mean()

    #Create censoring plot
    fig, ax = plt.subplots(figsize=(8, 6))
    blue, _, red = sns.color_palette()[:3]
    #Create horizontal lines for censored observations
    ax.hlines(patients[df.event.values == 0],
              0,
              df[df.event.values == 0].time,
              color=blue,
              label='Censored')
    #Create horizontal red lines for uncensored observations
    ax.hlines(patients[df.event.values == 1],
              0,
              df[df.event.values == 1].time,
              color=red,
              label='Uncensored')
    #Create scatter points for metastized months
    ax.scatter(df[df.metastized.values == 1].time,
               patients[df.metastized.values == 1],
               color='k',
               zorder=10,
               label='Metastized')
    ax.set_xlim(left=0)
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([])
    ax.set_ylabel('Subject')
    ax.set_ylim(-0.25, n_patients + 0.25)
    ax.legend(loc='center right')

    #To understand the impact of metastization on survival time, we use a risk regression model
    #Cox proportional hazards model
    #Make intervals 3 months long
    interval_length = 3
    interval_bounds = np.arange(0,
                                df.time.max() + interval_length + 1,
                                interval_length)
    n_intervals = interval_bounds.size - 1
    intervals = np.arange(n_intervals)
    #Check how deaths and censored observations are distributed in intervals
    fig, ax = plt.subplots(figsize=(8, 6))
    #Plot histogram of uncensored events
    ax.hist(df[df.event == 1].time.values,
            bins=interval_bounds,
            color=red,
            alpha=0.5,
            lw=0,
            label='Uncensored')
    #Plot histogram of censored events
    ax.hist(df[df.event == 0].time.values,
            bins=interval_bounds,
            color=blue,
            alpha=0.5,
            lw=0,
            label='Censored')
    ax.set_xlim(0, interval_bounds[-1])
    ax.set_xlabel('Months since mastectomy')
    ax.set_yticks([0, 1, 2, 3])
    ax.set_ylabel('Number of observations')
    ax.legend()

    #Calculates the last interval period when a subject was alive
    last_period = np.floor((df.time - 0.01) / interval_length).astype(int)
    #Creates an empty matrix to store deaths
    death = np.zeros((n_patients, n_intervals))
    #For each patient (row), create an event where the last interval period was observed (column)
    death[patients, last_period] = df.event

    #Create matrix of the amount of time a subject (row) was at risk in an interval (column)
    exposure = np.greater_equal.outer(df.time,
                                      interval_bounds[:-1]) * interval_length
    exposure[patients, last_period] = df.time - interval_bounds[last_period]

    #Define parameters for PyMC
    SEED = 5078864
    n_samples = 1000
    n_tune = 1000

    #Create PyMC model -> lambda(t) = lambda0(t) * e ^ (X*beta)
    with pm.Model() as model:
        #Define prior distribution of hazards as vague Gamma distribution
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)

        #Define hazard regression coefficients (beta) for covariates X as a normal distribution
        beta = pm.Normal('beta', 0, sd=1000)

        #Create equation for lambda(t) as a deterministic node - record sampled values as part of output
        #T.outer = symbolic matrix, vector-vector outer product
        lambda_ = pm.Deterministic(
            'lambda_', T.outer(T.exp(beta * df.metastized), lambda0))
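        # Note: T.outer here builds an (n_patients, n_intervals) matrix whose entry
        # [i, j] is exp(beta * metastized_i) * lambda0_j, i.e. patient i's hazard in
        # interval j.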
        #Mu is created from our lambda values (hazard) times patient exposure per interval
        mu = pm.Deterministic('mu', exposure * lambda_)

        #We model the posterior distribution as a Poisson distribution with mean Mu
        obs = pm.Poisson('obs', mu, observed=death)

    with model:
        trace = pm.sample(n_samples, tune=n_tune, random_seed=SEED)

    pm.traceplot(trace)

    #Calculate hazard rate for subjects with metastized cancer (based on regression coefficients)
    hazardRate = np.exp(trace['beta'].mean())
    pm.plot_posterior(trace, varnames=['beta'], color='#87ceeb')
    pm.autocorrplot(trace, varnames=['beta'])

    #Store base hazard as well as metastized hazard for each sample per interval
    #(sample x number of intervals)
    base_hazard = trace['lambda0']
    met_hazard = trace['lambda0'] * np.exp(np.atleast_2d(trace['beta']).T)

    #Calculate cumulative hazard
    def cum_hazard(hazard):
        return (interval_length * hazard).cumsum(axis=-1)

    #Calculative survival as = e^(-cumulative hazard)
    def survival(hazard):
        return np.exp(-cum_hazard(hazard))

    #Plot highest posterior density
    def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05):
        #Use function f on hazard mean
        mean = f(hazard.mean(axis=0))
        #Create confidence percentiles
        percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
        hpd = np.percentile(f(hazard), percentiles, axis=0)

        ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
        ax.step(x, mean, color=color, label=label)

    #Create figure
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2,
                                             sharex=True,
                                             sharey=False,
                                             figsize=(16, 6))
    #Plot Hazard with HPD up until the last interval for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  base_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=blue,
                  label='Had not metastized')
    #Plot Hazard with HPD up until the last interval for metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  met_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=red,
                  label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    #Plot Survival with HPD up until the last interval for non-metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  base_hazard,
                  survival,
                  surv_ax,
                  color=blue)
    #Plot Survival with HPD up until the last interval for metastized cancer
    plot_with_hpd(interval_bounds[:-1],
                  met_hazard,
                  survival,
                  surv_ax,
                  color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model')

    #Consider time varying effects
    with pm.Model() as time_varying_model:
        lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)
        #Beta is now modeled as a normal random walk instead of a normal distribution
        #This is due to the fact that the regression coefficients can vary over time
        beta = GaussianRandomWalk('beta', tau=1., shape=n_intervals)

        lambda_ = pm.Deterministic(
            'h', lambda0 * T.exp(T.outer(T.constant(df.metastized), beta)))
        mu = pm.Deterministic('mu', exposure * lambda_)

        obs = pm.Poisson('obs', mu, observed=death)

    with time_varying_model:
        time_varying_trace = pm.sample(n_samples,
                                       tune=n_tune,
                                       random_seed=SEED)

    pm.traceplot(time_varying_trace)
    pm.plot_posterior(time_varying_trace, varnames=['beta'], color='#87ceeb')
    pm.forestplot(time_varying_trace, varnames=['beta'])

    #Create plot to show the mean trace of beta
    fig, ax = plt.subplots(figsize=(8, 6))
    #Create percentiles of the new trace
    beta_hpd = np.percentile(time_varying_trace['beta'], [2.5, 97.5], axis=0)
    beta_low = beta_hpd[0]
    beta_high = beta_hpd[1]
    #Fill percentile interval
    ax.fill_between(interval_bounds[:-1],
                    beta_low,
                    beta_high,
                    color=blue,
                    alpha=0.25)
    #Create the mean estimate for beta from trace samples
    beta_hat = time_varying_trace['beta'].mean(axis=0)
    #Plot a stepwise line for beta_hat per interval
    ax.step(interval_bounds[:-1], beta_hat, color=blue)
    #Plot points where cancer was metastized, differentiation between death and censorship
    ax.scatter(interval_bounds[last_period[(df.event.values == 1)
                                           & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 1)
                                    & (df.metastized == 1)]],
               c=red,
               zorder=10,
               label='Died, cancer metastized')
    ax.scatter(interval_bounds[last_period[(df.event.values == 0)
                                           & (df.metastized == 1)]],
               beta_hat[last_period[(df.event.values == 0)
                                    & (df.metastized == 1)]],
               c=blue,
               zorder=10,
               label='Censored, cancer metastized')
    ax.set_xlim(0, df.time.max())
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylabel(r'$\beta_j$')
    ax.legend()

    #Store time-varying model
    tv_base_hazard = time_varying_trace['lambda0']
    tv_met_hazard = time_varying_trace['lambda0'] * np.exp(
        np.atleast_2d(time_varying_trace['beta']))

    #Plot cumulative hazard functions with and without time-varying effect
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.step(interval_bounds[:-1],
            cum_hazard(base_hazard.mean(axis=0)),
            color=blue,
            label='Had not metastized')
    ax.step(interval_bounds[:-1],
            cum_hazard(met_hazard.mean(axis=0)),
            color=red,
            label='Metastized')
    ax.step(interval_bounds[:-1],
            cum_hazard(tv_base_hazard.mean(axis=0)),
            color=blue,
            linestyle='--',
            label='Had not metastized (time varying effect)')
    ax.step(interval_bounds[:-1],
            cum_hazard(tv_met_hazard.mean(axis=0)),
            color=red,
            linestyle='--',
            label='Metastized (time varying effect)')
    ax.set_xlim(0, df.time.max() - 4)
    ax.set_xlabel('Months since mastectomy')
    ax.set_ylim(0, 2)
    ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    ax.legend(loc=2)

    #Plot cumulative hazard and survival models with HPD
    fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2,
                                             sharex=True,
                                             sharey=False,
                                             figsize=(16, 6))
    plot_with_hpd(interval_bounds[:-1],
                  tv_base_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=blue,
                  label='Had not metastized')
    plot_with_hpd(interval_bounds[:-1],
                  tv_met_hazard,
                  cum_hazard,
                  hazard_ax,
                  color=red,
                  label='Metastized')
    hazard_ax.set_xlim(0, df.time.max())
    hazard_ax.set_xlabel('Months since mastectomy')
    hazard_ax.set_ylim(0, 2)
    hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$')
    hazard_ax.legend(loc=2)
    plot_with_hpd(interval_bounds[:-1],
                  tv_base_hazard,
                  survival,
                  surv_ax,
                  color=blue)
    plot_with_hpd(interval_bounds[:-1],
                  tv_met_hazard,
                  survival,
                  surv_ax,
                  color=red)
    surv_ax.set_xlim(0, df.time.max())
    surv_ax.set_xlabel('Months since mastectomy')
    surv_ax.set_ylabel('Survival function $S(t)$')
    fig.suptitle('Bayesian survival model with time varying effects')

    plt.show()

    print('x')
Example no. 33
 def test_A_plus_scaled_outer(self):
     f = self.function(
         [self.A, self.x, self.y], self.A + 0.1 * tensor.outer(self.x, self.y)
     )
     self.assertFunctionContains(f, ScipyGer(destructive=False))
     self.run_f(f)  # DebugMode tests correctness
Example no. 34
    def attributes_update(self, attributes, depth, graph, original_graph,
                          bonds):
        '''Given the current attributes, the current depth, and the graph that the attributes
        are based on, this function will update the 2D attributes tensor'''

        ############# GET NEW ATTRIBUTE MATRIX #########################
        # New pre-activated attribute matrix v = M_i,j,: x ones((N_atom, 1)) -> (N_atom, N_features)
        # as long as dimensions are appropriately shuffled
        shuffled_graph = graph.copy().dimshuffle(
            (2, 0, 1))  # (N_feature x N_atom x N_atom)
        shuffled_graph.name = 'shuffled_graph'

        ones_vec = K.ones_like(attributes[:, 0])  # (N_atom x 1)
        ones_vec.name = 'ones_vec'
        (new_preactivated_attributes, updates) = theano.scan(
            lambda x: K.dot(x, ones_vec),
            sequences=shuffled_graph)  # (N_features x N_atom)

        # Need to pass through an activation function still
        # Final attribute = bond flag = is not part of W_inner or b_inner
        (new_attributes,
         updates) = theano.scan(lambda x: self.activation_inner(
             K.dot(x, self.W_inner[depth, :, :]) + self.b_inner[depth, 0, :]),
                                sequences=new_preactivated_attributes[:-1, :].T
                                )  # (N_atom x N_features -1)

        # Append last feature (bond flag) after the loop
        new_attributes = K.concatenate((new_attributes, attributes[:, -1:]),
                                       axis=1)
        new_attributes.name = 'new_attributes'

        ############ UPDATE GRAPH TENSOR WITH NEW ATOM ATTRIBUTES ###################
        ### Node attribute contribution is located in every entry of graph[i,j,:] where
        ### there is a bond @ ij or when i = j (self)
        # Get atoms matrix (identity)
        atoms = T.identity_like(bonds)  # (N_atom x N_atom)
        atoms.name = 'atoms_identity'
        # Combine
        bonds_or_atoms = bonds + atoms  # (N_atom x N_atom)
        bonds_or_atoms.name = 'bonds_or_atoms'

        atom_indeces = T.arange(
            ones_vec.shape[0])  # 0 to N_atoms - 1 (indeces)
        atom_indeces.name = 'atom_indeces vector'
        ### Subtract previous node attribute contribution
        # Multiply each entry in bonds_or_atoms by the previous atom features for that column
        (old_features_to_sub, updates) = theano.scan(
            lambda i: T.outer(bonds_or_atoms[:, i], attributes[i, :]),
            sequences=T.arange(ones_vec.shape[0]))
        old_features_to_sub.name = 'old_features_to_sub'

        ### Add new node attribute contribution
        # Multiply each entry in bonds_or_atoms by the previous atom features for that column
        (new_features_to_add, updates) = theano.scan(
            lambda i: T.outer(bonds_or_atoms[:, i], new_attributes[i, :]),
            sequences=T.arange(ones_vec.shape[0]))
        new_features_to_add.name = 'new_features_to_add'

        # Update new graph
        new_graph = graph - old_features_to_sub + new_features_to_add
        new_graph.name = 'new_graph'

        return (new_attributes, new_graph)
Example no. 35
 def test_int_fails(self):
     self.manual_setup_method("int32")
     f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
     self.assertFunctionContains0(f, CGer(destructive=True))
     self.assertFunctionContains0(f, CGer(destructive=False))
Example no. 36
def flat_outer(a, b):
    return tt.outer(a, b).ravel()
Example no. 37
 def test_scaled_A_plus_scaled_outer(self):
     f = self.function(
         [self.A, self.x, self.y], 0.2 * self.A + 0.1 * tensor.outer(self.x, self.y)
     )
     self.assertFunctionContains(f, gemm_no_inplace)
     self.run_f(f)  # DebugMode tests correctness
Example no. 38
 def test_int_fails(self):
     self.setUp('int32')
     f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
     self.assertFunctionContains0(f, CGer(destructive=True))
     self.assertFunctionContains0(f, CGer(destructive=False))
Example no. 39
 def force_outer(l, r):
     return tensor.outer(l, r) if r.ndim == 1 else l.dot(r.T)
Example no. 40
 def test_optimization_pipeline_float(self):
     skip_if_blas_ldflags_empty()
     self.setUp('float32')
     f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
     self.assertFunctionContains(f, CGer(destructive=True))
     f(self.xval, self.yval)  # DebugMode tests correctness
Example no. 41
 def test_A_plus_scaled_outer(self):
     skip_if_blas_ldflags_empty()
     f = self.function([self.A, self.x, self.y],
                       self.A + 0.1 * tensor.outer(self.x, self.y))
     self.assertFunctionContains(f, CGer(destructive=False))
     self.run_f(f)  # DebugMode tests correctness
Example no. 42
    def test_profiling(self):

        config1 = theano.config.profile
        config2 = theano.config.profile_memory
        config3 = theano.config.profiling.min_peak_memory
        try:
            theano.config.profile = True
            theano.config.profile_memory = True
            theano.config.profiling.min_peak_memory = True

            x = [T.fvector("val%i" % i) for i in range(3)]

            z = []
            z += [
                T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)
            ]
            z += [x[i] + x[i + 1] for i in range(len(x) - 1)]

            p = theano.ProfileStats(False, gpu_checks=False)

            if theano.config.mode in [
                    "DebugMode", "DEBUG_MODE", "FAST_COMPILE"
            ]:
                m = "FAST_RUN"
            else:
                m = None

            f = theano.function(x, z, profile=p, name="test_profiling", mode=m)

            inp = [np.arange(1024, dtype="float32") + 1 for i in range(len(x))]
            f(*inp)

            buf = StringIO()
            f.profile.summary(buf)

            # regression testing for future algo speed up
            the_string = buf.getvalue()
            lines1 = [
                l for l in the_string.split("\n") if "Max if linker" in l
            ]
            lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
            if theano.config.device == "cpu":
                assert "CPU: 4112KB (4104KB)" in the_string, (lines1, lines2)
                assert "CPU: 8204KB (8196KB)" in the_string, (lines1, lines2)
                assert "CPU: 8208KB" in the_string, (lines1, lines2)
                assert (
                    "Minimum peak from all valid apply node order is 4104KB"
                    in the_string), (lines1, lines2)
            else:
                assert "CPU: 16KB (16KB)" in the_string, (lines1, lines2)
                assert "GPU: 8204KB (8204KB)" in the_string, (lines1, lines2)
                assert "GPU: 12300KB (12300KB)" in the_string, (lines1, lines2)
                assert "GPU: 8212KB" in the_string, (lines1, lines2)
                assert (
                    "Minimum peak from all valid apply node order is 4116KB"
                    in the_string), (lines1, lines2)

        finally:
            theano.config.profile = config1
            theano.config.profile_memory = config2
            theano.config.profiling.min_peak_memory = config3
Example no. 43
def cosine_similarity(x, y, eps=1e-6):
    z = T.dot(x, y.T)
    z /= T.sqrt(T.outer(T.sum(x * x, axis=1), T.sum(y * y, axis=1)) + eps)

    return z
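A quick check (assuming Theano is available and cosine_similarity is importable) against a direct per-pair computation; note that eps sits inside the square root, as in the code above:

import numpy as np
import theano
import theano.tensor as T

x, y = T.matrix('x'), T.matrix('y')
cos = theano.function([x, y], cosine_similarity(x, y))

floatX = theano.config.floatX
rng = np.random.RandomState(0)
x_val = rng.randn(3, 4).astype(floatX)
y_val = rng.randn(5, 4).astype(floatX)
ref = np.array([[a.dot(b) / np.sqrt(a.dot(a) * b.dot(b) + 1e-6) for b in y_val]
                for a in x_val])
assert np.allclose(cos(x_val, y_val), ref, atol=1e-5)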
Example no. 44
def OptimalGaussian(x_train,
                    y_train,
                    Regression=True,
                    Classification=False,
                    bias=False,
                    n_iter=5,
                    alpha=0.01,
                    minibatch=False):
    '''
    inputs
        x_train: training features
        y_train: response variable
        n_iter: # of iterations for SGD
        alpha: strength of L2 penalty (default penalty for now)
    outputs
        Gaussian Node: dictionary with Node parameters and a predict method
    '''

    rng = numpy.random

    feats = len(x_train[0, :])
    N = len(x_train[:, 0])
    D = [x_train, y_train]
    training_steps = n_iter
    #print "training steps: ", training_steps
    #print "penalty strength: ", alpha
    #print "Uses bias: ", bias

    # Declare Theano symbolic variables
    x = T.matrix("x")
    y = T.vector("y")
    w = theano.shared(rng.uniform(low=-0.25, high=0.25, size=feats), name="w")
    b = theano.shared(abs(rng.randn(1)[0]), name="b")
    a = theano.shared(abs(rng.randn(1)[0]), name="a")
    rep = theano.shared(numpy.asarray([1] * N), name="rep")
    #print "Initialize node as:"
    #print w.get_value(), b.get_value(), a.get_value()

    # Construct Theano expression graph
    W = T.outer(rep, w)
    if bias:
        p_1 = a * T.exp(-0.5 / (b**2) * T.dot((x - w).T, (x - w)))
    else:
        p_1 = a * T.exp(-0.5 / (1**2) * T.diagonal(T.dot((x - W), (x - W).T)))
    prediction = p_1 > 0.5
    if Regression:
        xent = 0.5 * (y - p_1)**2
    if alpha == 0:
        cost = xent.mean()  # The cost to minimize
    else:
        cost = xent.mean() + alpha * ((w**2).sum())
    if bias:
        gw, gb, ga = T.grad(cost, [w, b, a])
    else:
        gw, ga = T.grad(cost, [w, a])  # Compute the gradient of the cost

    # Compile
    Node = {}
    Node['Path'] = {}
    NodePath = Node['Path']
    if bias:
        train = theano.function(inputs=[x, y],
                                outputs=[prediction, xent],
                                updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb),
                                         (a, a - 0.1 * ga)))
    else:
        train = theano.function(inputs=[x, y],
                                outputs=[prediction, xent],
                                updates=((w, w - 0.1 * gw), (a, a - 0.1 * ga)))

    predict = theano.function(inputs=[x], outputs=p_1)

    # Train
    for i in range(training_steps):
        if minibatch:
            batch_split = train_test_split(x_train, y_train, test_size=0.2)
            _, D[0], _, D[1] = batch_split
            #IPython.embed()
            pred, err = train(D[0], D[1])

        elif not minibatch:
            pred, err = train(D[0], D[1])
        NodePath[str(i)] = {}
        NodePath[str(i)]['w'] = w.get_value()
        NodePath[str(i)]['b'] = b.get_value()
        NodePath[str(i)]['a'] = a.get_value()

    Node['w'] = w.get_value()
    Node['b'] = b.get_value()
    Node['a'] = a.get_value()
    Node['predict'] = predict

    return Node
Example no. 45
def build_model(tparams, options):
    # for training

    # encoder input
    x_node = tensor.tensor4('x_node', dtype=config.floatX)
    x           = tensor.tensor4('x', dtype='int64')
    x_mask_word = tensor.tensor4('x_mask_word', dtype=config.floatX)
    x_mask_sent = tensor.tensor3('x_mask_sent', dtype=config.floatX)
    x_mask_doc  = tensor.matrix('x_mask_doc', dtype=config.floatX)

    # decoder input
    dec_inp      = tensor.matrix('dec_inp', dtype='int64')
    dec_inp_mask = tensor.matrix('dec_inp_mask', dtype=config.floatX)

    # decoder output
    dec_out      = tensor.matrix('dec_out', dtype='int64')
    dec_out_mask = tensor.matrix('dec_out_mask', dtype=config.floatX)

    #TODO
    # for generation
    hidi = tensor.matrix('hidi', dtype=config.floatX)
    celi = tensor.matrix('celi', dtype=config.floatX)
    hids = tensor.tensor4('hids', dtype=config.floatX)
    xi = tensor.vector('xi', dtype='int64')
    xi_mask = tensor.vector('xi_mask', dtype=config.floatX)

    preds, f_encode, f_decode, f_probi = ptr_network(tparams, x_node,x, x_mask_word, x_mask_sent, x_mask_doc,
                                                     dec_inp, dec_inp_mask,
                                                     xi, xi_mask, hidi, celi, hids, options)

    #cost = None
    #return x, x_mask_word, x_mask_sent, x_mask_doc, dec_inp, dec_inp_mask, dec_out, dec_out_mask, preds, cost, f_encode, f_decode, f_probi

    n_steps    = preds.shape[0]
    n_sents    = preds.shape[1]
    n_docs     = preds.shape[2]
    n_clusters = preds.shape[3]

    #preds = preds.reshape([n_steps, n_sents * n_docs, n_clusters])
    preds_contiguous = preds.dimshuffle(0,2,1,3).reshape([n_steps, n_docs * n_sents, n_clusters])

    # pull out the probs of the correct ones
    n_steps = dec_inp.shape[0]
    n_samples = dec_inp.shape[1]
    idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), tensor.ones((n_samples,), dtype='int64'))
    idx_samples = tensor.outer(tensor.ones((n_steps,), dtype='int64'), tensor.arange(n_samples, dtype='int64'))
    # idx_steps, dec_out, idx_samples are all n_steps x n_samples, then probs is also n_steps x n_samples
    #probs = preds[idx_steps, dec_out, idx_samples] # n_steps x n_samples
    probs = preds_contiguous[idx_steps, dec_out, idx_samples] # n_steps x n_samples

    # probs *= y_mask
    off = 1e-8
    if probs.dtype == 'float16':
        off = 1e-6
    # probs += (1 - y_mask)  # change unmasked position to 1, since log(1) = 0
    probs += off
    probs_printed = theano.printing.Print('this is probs')(probs)
    cost = -tensor.log(probs)
    cost *= dec_out_mask
    #TODO: might cause NaN here !
    # This should be okay since in dec_out_mask, we always have at least one 1. for the terminate signal.
    cost = cost.sum(axis=0) / tensor.maximum(1.0, dec_out_mask.sum(axis=0))
    cost = cost.mean()

    return x_node,x, x_mask_word, x_mask_sent, x_mask_doc, dec_inp, dec_inp_mask, dec_out, dec_out_mask, preds, cost, f_encode, f_decode, f_probi
Example no. 46
 def free_energy(self, v_sample):
     D = T.sum(v_sample, axis=1)
     wx_b = T.dot(v_sample, self.W) + T.outer(D, self.hbias)
     vbias_term = T.dot(v_sample, self.vbias)
     hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
     return -hidden_term - vbias_term
Example no. 47
def prepare_model(x_train, y_train, batchsize, params=None):
    input_var = T.matrix('inputs')
    target_var = T.ivector('targets')
    same_cluster_indices_matrix = T.matrix('same_clusters')
    diff_cluster_indices_matrix = T.matrix('diff_clusters')

    # prepare network
    print '\nPreparing the model with primary hidden layer size %d...' % HOURGLASS_LAYER_SIZE
    print 'X-shape = %d, Num_classes = %d, num_samples = %d' % (
        x_train[0].shape[0], max(y_train), len(x_train))
    representation_layer, network = build_args_nn(x_train, y_train, batchsize,
                                                  input_var)

    # loss stuff
    prediction = lasagne.layers.get_output(network)
    get_representations = lasagne.layers.get_output(representation_layer,
                                                    inputs=input_var,
                                                    deterministic=True)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)

    if LAMBDA1 == LAMBDA2 == 0.0:
        loss = loss.mean()
    else:
        representations = get_representations
        dot_prods = T.dot(representations, representations.T)  # X times X.T
        diag = T.sqrt(T.diagonal(dot_prods))  # sqrt(||ri||^2) = ||ri||

        norms = T.outer(diag, diag.T)
        distances = 0.5 * (1 - (dot_prods * (1. / norms))
                           )  # d(a,b) = 1/2 (1 - dot(a,b) / (||a||*||b||))

        # we want the first sum to be as close to zero as possible, so we add it to the loss.
        # we want the second sum to be as close to 1 as possible, so we want LAMBDA2 * (1 - sum2)
        # to be as close to zero as possible, thus adding that difference to the overall loss.
        loss = loss.mean() \
               + (LAMBDA1 * T.sum(same_cluster_indices_matrix * distances)) \
               + (LAMBDA2 * (1.0 - T.sum(diff_cluster_indices_matrix * distances)))

    # for loading/building the parameters
    if not params:
        params = lasagne.layers.get_all_params(network, trainable=True)
    else:
        lasagne.layers.set_all_param_values(network, params)
        params = lasagne.layers.get_all_params(network, trainable=True)

    updates = lasagne.updates.adam(loss, params, learning_rate=LEARNING_RATE)

    # the final keys
    train_function = theano.function([
        input_var, target_var, same_cluster_indices_matrix,
        diff_cluster_indices_matrix
    ],
                                     loss,
                                     updates=updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    convert_to_numpy_function = theano.function([input_var],
                                                get_representations,
                                                allow_input_downcast=True)

    # theano.printing.debugprint(train_function.maker.fgraph.outputs[0])

    return network, train_function, convert_to_numpy_function
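
In the cluster-regularized loss above, T.outer(diag, diag.T) builds the matrix of products ||r_i||*||r_j||, so dot_prods / norms is the cosine-similarity matrix and distances is half the cosine distance. A small NumPy check of that identity on hypothetical data:

import numpy as np

R = np.random.randn(4, 8)                   # 4 representations of dimension 8
dot_prods = R @ R.T
diag = np.sqrt(np.diag(dot_prods))          # row norms ||r_i||
distances = 0.5 * (1 - dot_prods / np.outer(diag, diag))

assert np.allclose(np.diag(distances), 0)   # zero self-distance
assert np.allclose(distances, distances.T)  # symmetric
assert (distances > -1e-9).all() and (distances < 1 + 1e-9).all()  # in [0, 1]
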
Esempio n. 48
0
 def sample_h_given_v(self, v, beta, D):
     pre_sigmoid_activation = beta * (T.dot(v, self.model.W) + T.outer(D, self.model.hbias))
     h_mean = T.nnet.sigmoid(pre_sigmoid_activation)
     h_sample = self.theano_rng.binomial(size=h_mean.shape, n=1, p=h_mean, dtype=theano.config.floatX)
     return h_sample
Esempio n. 49
0
    def __init__(self,
                 X_train,
                 y_train,
                 X_test,
                 y_test,
                 num_class,
                 batch_size=100,
                 max_iter=1000,
                 M=20,
                 n_hidden=50,
                 a0=1,
                 b0=1,
                 master_stepsize=5e-4,
                 auto_corr=0.99):
        self.n_hidden = n_hidden
        self.d = X_train.shape[1]  # number of data, dimension
        self.M = M
        self.num_class = num_class
        self.batch_size = batch_size
        self.stepsize = master_stepsize
        self.epoch = int(max_iter * batch_size / 60000)  # For Mnist
        num_vars = self.d * n_hidden + n_hidden + n_hidden * num_class + num_class + 2 + n_hidden * (
            n_hidden + 1
        ) + 4 * self.n_hidden + self.num_class + self.d  # w1: d*n_hidden; b1: n_hidden; w3 = n_hidden; b3 = 1; 2 variances
        self.theta = np.zeros([self.M, num_vars
                               ])  # particles, will be initialized later
        '''
            The data sets are normalized so that the input features and the targets have zero mean and unit variance
        '''
        self.std_X_train = np.std(X_train, 0)
        self.std_X_train[self.std_X_train == 0] = 1
        self.mean_X_train = np.mean(X_train, 0)

        self.mean_y_train = np.mean(y_train)
        self.std_y_train = np.std(y_train)
        self.history = np.zeros([self.epoch, 2])

        self.learningRateBlock = int(self.epoch * 0.2 * 60000 /
                                     self.batch_size)
        self.learningRateBlockDecay = 0.5
        '''
            Theano symbolic variables
            Define the neural network here
        '''
        X = T.matrix('X')  # Feature matrix
        y = T.matrix('y')  # labels

        w_1 = T.matrix('w_1')  # weights between input layer and hidden layer
        v_11 = T.vector('v_11')
        v_12 = T.vector(
            'v_12')  # Transform Vector between input layer and hidden layer
        b_1 = T.vector('b_1')  # bias vector of hidden layer

        w_2 = T.matrix('w_2')  # weights between hidden layer and hidden layer
        v_21 = T.vector('v_21')
        v_22 = T.vector('v_22')
        b_2 = T.vector('b_2')  # bias of output

        w_3 = T.matrix('w_3')  # weights between hidden layer and output layer
        v_31 = T.vector('v_31')
        v_32 = T.vector(
            'v_32')  # Transform Vector between output layer and hidden layer
        b_3 = T.vector('b_3')  # bias of output

        N = T.scalar('N')  # number of observations

        p_1 = T.eye(self.d) - 2 * T.outer(v_11, v_11) / T.sum(v_11**2)
        q_1 = T.eye(self.n_hidden) - 2 * T.outer(v_12, v_12) / T.sum(v_12**2)

        p_2 = T.eye(self.n_hidden) - 2 * T.outer(v_21, v_21) / T.sum(v_21**2)
        q_2 = T.eye(self.n_hidden) - 2 * T.outer(v_22, v_22) / T.sum(v_22**2)

        p_3 = T.eye(self.n_hidden) - 2 * T.outer(v_31, v_31) / T.sum(v_31**2)
        q_3 = T.eye(self.num_class) - 2 * T.outer(v_32, v_32) / T.sum(v_32**2)

        wf_1 = T.dot(T.dot(p_1, w_1), q_1)
        wf_2 = T.dot(T.dot(p_2, w_2), q_2)
        wf_3 = T.dot(T.dot(p_3, w_3), q_3)

        log_gamma = T.scalar('log_gamma')  # variances related parameters
        log_lambda = T.scalar('log_lambda')

        ###
        #prediction = (T.nnet.nnet.softmax(T.dot( T.nnet.relu(T.dot(T.nnet.relu(T.dot(X, wf_1)+b_1), wf_2) + b_2) , wf_3) + b_3))
        prediction = (T.nnet.nnet.softmax(
            T.dot(
                T.nnet.relu(
                    batchnorm(
                        T.dot(T.nnet.relu(batchnorm(T.dot(X, wf_1) +
                                                    b_1)), wf_2) +
                        b_2)), wf_3) + b_3))
        ''' define the log posterior distribution '''
        priorprec = T.log(b0 / a0)
        log_lik_data = T.sum(T.sum(y * T.log(prediction)))
        log_prior_w = -0.5 * (num_vars - 2) * (
            T.log(2 * np.pi) - priorprec) - (T.exp(priorprec) / 2) * (
                (w_1**2).sum() + (w_2**2).sum() + (w_3**2).sum() +
                (b_1**2).sum() + (b_2**2).sum() +
                (b_3**2).sum()) + 1e-9 * log_gamma + 1e-9 * log_lambda

        # sub-sampling mini-batches of data, where (X, y) is the batch data, and N is the number of whole observations
        log_posterior = (log_lik_data * N / X.shape[0] + log_prior_w)
        dw_1, db_1, dw_2, db_2, dw_3, db_3, dv_11, dv_12, dv_21, dv_22, dv_31, dv_32, d_log_gamma, d_log_lambda = T.grad(
            log_posterior, [
                w_1, b_1, w_2, b_2, w_3, b_3, v_11, v_12, v_21, v_22, v_31,
                v_32, log_gamma, log_lambda
            ])

        # automatic gradient
        logp_gradient = theano.function(inputs=[
            X, y, w_1, b_1, w_2, b_2, w_3, b_3, v_11, v_12, v_21, v_22, v_31,
            v_32, log_gamma, log_lambda, N
        ],
                                        outputs=[
                                            dw_1, db_1, dw_2, db_2, dw_3, db_3,
                                            dv_11, dv_12, dv_21, dv_22, dv_31,
                                            dv_32, d_log_gamma, d_log_lambda
                                        ])

        # prediction function
        self.nn_predict = theano.function(inputs=[
            X, w_1, b_1, w_2, b_2, w_3, b_3, v_11, v_12, v_21, v_22, v_31, v_32
        ],
                                          outputs=prediction)
        '''
            Training with SVGD
        '''
        # normalization
        X_train = self.normalization(X_train)
        N0 = X_train.shape[0]  # number of observations
        ''' initializing all particles '''
        for i in range(self.M):
            w1, b1, w2, b2, w3, b3, v11, v12, v21, v22, v31, v32, loggamma, loglambda = self.init_weights(
                a0, b0)
            # use better initialization for gamma
            ridx = np.random.choice(range(X_train.shape[0]), \
                                           np.min([X_train.shape[0], 1000]), replace = False)
            y_hat = self.nn_predict(X_train[ridx, :], w1, b1, w2, b2, w3, b3,
                                    v11, v12, v21, v22, v31, v32)
            loggamma = -np.log(np.mean(np.power(y_hat - y_train[ridx], 2)))
            self.theta[i, :] = self.pack_weights(w1, b1, w2, b2, w3, b3, v11,
                                                 v12, v21, v22, v31, v32,
                                                 loggamma, loglambda)
            #w1_, b1_, w2_, b2_, w3_, b3_, v11_, v12_, v21_, v22_, v31_, v32_, loggamma_, loglambda_ = self.unpack_weights(self.theta[i,:])
            #print(np.sum((v31_-v31)**2))
            #pdb.set_trace()

        grad_theta = np.zeros([self.M, num_vars])  # gradient
        # adagrad with momentum
        fudge_factor = 1e-5
        historical_grad = 0
        for iter in range(max_iter):
            # sub-sampling
            batch = [
                i % N0
                for i in range(iter * batch_size, (iter + 1) * batch_size)
            ]
            for i in range(self.M):
                w1, b1, w2, b2, w3, b3, v11, v12, v21, v22, v31, v32, loggamma, loglambda = self.unpack_weights(
                    self.theta[i, :])
                dw1, db1, dw2, db2, dw3, db3, dv11, dv12, dv21, dv22, dv31, dv32, dloggamma, dloglambda = logp_gradient(
                    X_train[batch, :], y_train[batch], w1, b1, w2, b2, w3, b3,
                    v11, v12, v21, v22, v31, v32, loggamma, loglambda, N0)
                grad_theta[i, :] = self.pack_weights(dw1, db1, dw2, db2, dw3,
                                                     db3, dv11, dv12, dv21,
                                                     dv22, dv31, dv32,
                                                     dloggamma, dloglambda)

            # calculating the kernel matrix
            if (self.M > 1):
                kxy, dxkxy = self.svgd_kernel(h=-1)
                grad_theta = (np.matmul(kxy, grad_theta) +
                              dxkxy) / self.M  # \Phi(x)

            # adagrad
            if iter == 0:
                historical_grad = historical_grad + np.multiply(
                    grad_theta, grad_theta)
            else:
                historical_grad = auto_corr * historical_grad + (
                    1 - auto_corr) * np.multiply(grad_theta, grad_theta)
            adj_grad = np.divide(grad_theta,
                                 fudge_factor + np.sqrt(historical_grad))

            if ((iter + 1) % self.learningRateBlock == 0):
                master_stepsize = master_stepsize * self.learningRateBlockDecay
                print(master_stepsize)

            self.theta = self.theta + master_stepsize * adj_grad

            if (iter * self.batch_size % (X_train.shape[0]) == 0):
                epoch_index = int(iter * self.batch_size / X_train.shape[0])
                pred = self.predict(X_test)
                self.history[epoch_index,
                             0] = self.evluation(X_train, y_train, iter)
                self.history[epoch_index,
                             1] = sum(pred == y_test) * 1.0 / X_test.shape[0]
                print('Epoch ', iter * self.batch_size / X_train.shape[0],
                      ' Iter:', iter, ' Cost: ', self.history[epoch_index, 0])
                print('Precision: ', self.history[epoch_index, 1])
                if (epoch_index % 10 == 0):
                    np.savez('structure' + np.str(epoch_index) + '.npz',
                             v11=v11,
                             v12=v12,
                             v21=v21,
                             v22=v22,
                             v31=v31,
                             v32=v32)
                    self.savemodel()
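
The p/q matrices in the network above are Householder reflections I - 2*v*v^T/||v||^2, which are symmetric, orthogonal, and involutory, so wrapping w between two of them reparameterizes the weight matrix without changing its shape. A quick NumPy check of those properties with a hypothetical v:

import numpy as np

v = np.random.randn(5)
P = np.eye(5) - 2 * np.outer(v, v) / np.sum(v**2)   # Householder reflection

assert np.allclose(P, P.T)                          # symmetric
assert np.allclose(P @ P, np.eye(5))                # orthogonal and involutory
assert np.allclose(P @ v, -v)                       # reflects v onto -v
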
Esempio n. 50
0
 def test_optimization_pipeline(self):
     f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
     self.assertFunctionContains(f, CGer(destructive=True))
     f(self.xval, self.yval)  # DebugMode tests correctness
Esempio n. 51
0
    def __init__(self, 
        glimpse_shape, glimpse_times, 
        dim_hidden, dim_fc, dim_out, 
        reward_base, 
        rng_std=1.0, activation=T.tanh, bptt_truncate=-1, 
        lmbd=0.1, # gdupdate + lmbd*rlupdate
        DEBUG=False,
        ): 
#       super(AttentionUnit, self).__init__()

        if reward_base is None:
            reward_base = np.zeros((glimpse_times)).astype('float32')
            reward_base[-1] = 1.0
        x = T.ftensor3('x')  # N * W * H 
        y = T.ivector('y')  # label 
        lr = T.fscalar('lr')
        reward_base = theano.shared(name='reward_base', value=np.array(reward_base).astype(theano.config.floatX), borrow=True) # Time (vector)
        reward_bias = T.fvector('reward_bias')
#       rng = T.shared_randomstreams.RandomStreams(123)
        rng = MRG_RandomStreams(np.random.randint(9999999))
    
        i = InputLayer(x)
        au = AttentionUnit(x, glimpse_shape, glimpse_times, dim_hidden, rng, rng_std, activation, bptt_truncate)
#       All hidden states are put into decoder
#       layers = [i, au, InputLayer(au.output[:,:,:].flatten(2))]
#       dim_fc = [glimpse_times*dim_hidden] + dim_fc + [dim_out]
#       Only the last hidden states
        layers = [i, au, InputLayer(au.output[:,-1,:])]
        dim_fc = [dim_hidden] + dim_fc + [dim_out]
        for Idim, Odim in zip(dim_fc[:-1], dim_fc[1:]):
            fc = FullConnectLayer(layers[-1].output, Idim, Odim, activation, 'FC')
            layers.append(fc)
        sm = SoftmaxLayer(layers[-1].output)
        layers.append(sm)

        output = sm.output       # N * classes 
        hidoutput = au.output    # N * dim_output 
        location = au.location   # N * T * dim_hidden
        prediction = output.argmax(1) # N

        # calc
        equalvec = T.eq(prediction, y) # [0, 1, 0, 0, 1 ...]
        correct = T.cast(T.sum(equalvec), 'float32')
#       noequalvec = T.neq(prediction, y)
#       nocorrect = T.cast(T.sum(noequalvec), 'float32')
        logLoss = T.log(output)[T.arange(y.shape[0]), y] # 
#       reward_biased = T.outer(equalvec, reward_base - reward_bias.dimshuffle('x', 0))
        reward_biased = T.outer(equalvec, reward_base) - reward_bias.dimshuffle('x', 0)
            # N * Time
            # (R_t - b_t), where b = E[R]
        
        # gradient descent
        gdobjective = logLoss.sum()/x.shape[0]  # correct * dim_output (only has value on the correctly predicted sample)
        gdparams = reduce(lambda x, y: x+y.params, layers, []) 
        gdupdates = map(lambda x: (x, x+lr*T.grad(gdobjective, x)), gdparams)

        # reinforce learning
        # without maximum, then -log(p) will decrease the p
        rlobjective = (T.maximum(reward_biased.dimshuffle(0, 1, 'x'), 0) * T.log(au.location_p)).sum() / correct 
            # location_p: N * Time * 2
            # location_logp: N * Time
            # reward_biased: N * 2
        rlparams = au.reinforceParams 
        rlupdates = map(lambda x: (x, x+lr*lmbd*T.grad(rlobjective, x)), rlparams)


        # Hidden state keeps unchange in time
        deltas = T.stack(*[((au.output[:,i,:].mean(0)-au.output[:,i+1,:].mean(0))**2).sum()  for i in xrange(glimpse_times-1)])
            # N * Time * dim_hidden
         
        print 'compile step()'
        self.step = theano.function([x, y, lr, reward_bias], [gdobjective, rlobjective, correct, T.outer(equalvec, reward_base)], updates=gdupdates+rlupdates)
    #       print 'compile gdstep()'
    #       self.gdstep = theano.function([x, y, lr], [gdobjective, correct, location], updates=gdupdates)
    #       print 'compile rlstep()'
    #       self.rlstep = theano.function([x, y, lr], [rlobjective], updates=rlupdates)
        print 'compile predict()'
        self.predict = theano.function([x], prediction)
        if DEBUG:
            print 'compile glimpse()'
            self.glimpse = theano.function([x], au.glimpse) #[layers[-3].output, fc.output])
            print 'compile innerstate()'
            self.getinnerstate = theano.function([x], au.innerstate)
            print 'compile forward()'
            self.forward = theano.function([x], map(lambda x: x.output, layers)) #[layers[-3].output, fc.output])
            print 'compile error()'
            self.error = theano.function([x, y, reward_bias], [gdobjective, rlobjective])
        print 'compile locate()'
        self.locate = theano.function([x], [au.location_mean, location]) #[layers[-3].output, fc.output])
        print 'compile debug()'
        self.debug = theano.function([x, y, lr, reward_bias], [deltas, au.location_p], on_unused_input='warn')


        # self.xxx
        self.layers = layers
        self.params = gdparams + rlparams
        self.glimpse_times = glimpse_times
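
In the REINFORCE part above, T.outer(equalvec, reward_base) expands the 0/1 correctness vector into an N x Time reward matrix (reward only for correctly classified samples), and subtracting reward_bias removes a per-time baseline, giving R_t - b_t. A compact NumPy version with hypothetical numbers:

import numpy as np

equalvec = np.array([1, 0, 1])            # per-sample correctness, N = 3
reward_base = np.array([0., 0., 1.])      # reward only at the last glimpse, Time = 3
reward_bias = np.array([0.2, 0.2, 0.5])   # running estimate of E[R_t]

reward_biased = np.outer(equalvec, reward_base) - reward_bias[None, :]   # N x Time
# correct samples end with 1 - 0.5 = 0.5 at the last step, incorrect ones with -0.5
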
Esempio n. 52
0
    def predict(self, mx, Sx, *args, **kwargs):
        if self.N < self.n_inducing:
            # stick with the full GP
            return GP_UI.predict(self, mx, Sx)

        idims = self.D
        odims = self.E

        # centralize inputs
        zeta = self.X_sp - mx

        # initialize some variables
        sf2 = self.hyp[:, idims]**2
        eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
        lscales = self.hyp[:, :idims]
        iL = eyeE/lscales.dimshuffle(0, 1, 'x')

        # predictive mean
        inp = iL.dot(zeta.T).transpose(0, 2, 1)
        iLdotSx = iL.dot(Sx)
        B = (iLdotSx[:, :, None, :]*iL[:, None, :, :]).sum(-1) + tt.eye(idims)
        t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
        c = sf2/tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
        l_ = tt.exp(-0.5*tt.sum(inp*t, 2))
        lb = l_*self.beta_sp
        M = tt.sum(lb, 1)*c

        # input output covariance
        tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
        V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T*c

        # predictive covariance
        logk = (tt.log(sf2))[:, None] - 0.5*tt.sum(inp*inp, 2)
        logk_r = logk.dimshuffle(0, 'x', 1)
        logk_c = logk.dimshuffle(0, 1, 'x')
        Lambda = tt.square(iL)
        LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
        R = tt.dot(LL, Sx.T).transpose(0, 1, 3, 2) + tt.eye(idims)
        z_ = Lambda.dot(zeta.T).transpose(0, 2, 1)

        M2 = tt.zeros((odims, odims))

        # initialize indices
        triu_indices = np.triu_indices(odims)
        indices = [tt.as_index_variable(idx) for idx in triu_indices]

        def second_moments(i, j, M2, beta, iK, sf2, R, logk_c, logk_r, z_, Sx):
            # This comes from Deisenroth's thesis ( Eqs 2.51- 2.54 )
            Rij = R[i, j]
            n2 = logk_c[i] + logk_r[j]
            n2 += utils.maha(z_[i], -z_[j], 0.5*solve(Rij, Sx))

            Q = tt.exp(n2)/tt.sqrt(det(Rij))

            # Eq 2.55
            m2 = matrix_dot(beta[i], Q, beta[j])

            m2 = theano.ifelse.ifelse(
                tt.eq(i, j), m2 - tt.sum(iK[i]*Q) + sf2[i], m2)
            M2 = tt.set_subtensor(M2[i, j], m2)
            M2 = theano.ifelse.ifelse(
                tt.eq(i, j), M2 + 1e-6, tt.set_subtensor(M2[j, i], m2))
            return M2

        nseq = [self.beta_sp, (self.iKmm - self.iBmm), sf2,
                R, logk_c, logk_r, z_, Sx]
        M2_, updts = theano.scan(
            fn=second_moments, sequences=indices, outputs_info=[M2],
            non_sequences=nseq, allow_gc=False)
        M2 = M2_[-1]
        S = M2 - tt.outer(M, M)

        return M, S, V
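
The last step above is the usual identity Cov[y] = E[y y^T] - E[y] E[y]^T: M2 holds the pairwise second moments and tt.outer(M, M) removes the outer product of the predictive means. A tiny NumPy illustration of the same identity on hypothetical samples:

import numpy as np

Y = np.random.randn(1000, 3) @ np.random.randn(3, 3)   # correlated 3-dimensional samples
M = Y.mean(axis=0)                                      # first moment E[y]
M2 = (Y[:, :, None] * Y[:, None, :]).mean(axis=0)      # second moment E[y y^T]
S = M2 - np.outer(M, M)                                 # covariance

assert np.allclose(S, np.cov(Y, rowvar=False, bias=True))
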
Esempio n. 53
0
 def prop_up(self, vis, D=None):
     if D is None:
         D = self.D
     pre_sigmoid_activation = T.dot(vis, self.W) + T.outer(D, self.hbias)
     return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]
Esempio n. 54
0

gama = theano.shared(
    np.float32([[0.2, 0.4, 0.4], [0.7, 0.2, 0.1], [0.2, 0.1, 0.7],
                [0.5, 0.3, 0.2]]))

z_var = theano.shared(
    np.float32([[1, 2, 3, 4, 9], [1, 2, 5, 6, 8], [1, 3, 6, 8, 6],
                [8, 2, 7, 4, 1]]))

mu_N_k_z, updates_dete = theano.scan(lambda z_s, gama_s: T.outer(z_s, gama_s),
                                     sequences=[z_var, gama])

phi_k = T.mean(gama, axis=0)

mu_z_k = T.sum(mu_N_k_z, axis=0) / T.sum(gama, axis=0)

mu_k_z = T.transpose(mu_z_k, (1, 0))

sigma_N_k_z_z, _  = theano.scan(lambda z_v, gama_v: theano.scan(lambda mu_v, gama_v_v:gama_v_v*T.outer(z_v-mu_v,z_v-mu_v) ,\
                                                          sequences= [mu_k_z, gama_v]), sequences=[z_var, gama])

sigma_k_z_z = T.sum(sigma_N_k_z_z, axis=0)
sigma_k_z_z = T.transpose(
    T.transpose(sigma_k_z_z, (1, 2, 0)) / T.sum(gama, axis=0), (2, 0, 1))
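
The two scans above accumulate the standard GMM M-step statistics: sum_n gama[n,k] * z[n] for the means and sum_n gama[n,k] * (z[n] - mu_k)(z[n] - mu_k)^T for the covariances, each divided by sum_n gama[n,k]. The same sums can be written without scan using einsum; a NumPy sketch reusing the toy gama and z_var values above:

import numpy as np

gama = np.array([[0.2, 0.4, 0.4], [0.7, 0.2, 0.1], [0.2, 0.1, 0.7],
                 [0.5, 0.3, 0.2]], dtype=np.float32)    # responsibilities, N x K
z_var = np.array([[1, 2, 3, 4, 9], [1, 2, 5, 6, 8], [1, 3, 6, 8, 6],
                  [8, 2, 7, 4, 1]], dtype=np.float32)   # latent samples, N x D

Nk = gama.sum(axis=0)                                           # K
mu_k_z = np.einsum('nk,nd->kd', gama, z_var) / Nk[:, None]      # K x D component means

diff = z_var[:, None, :] - mu_k_z[None, :, :]                   # N x K x D
sigma_k_z_z = np.einsum('nk,nkd,nke->kde', gama, diff, diff) / Nk[:, None, None]   # K x D x D
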
Esempio n. 55
0
 def free_energy(self, v, beta, D):
     return -beta * T.dot(v, self.model.vbias.T) - T.sum(T.log(1+T.exp( beta * (T.dot(v,self.model.W) + T.outer(D,self.model.hbias)) )),axis=1)
Esempio n. 56
0
    def attributes_update(self, attributes, depth, graph, original_graph,
                          bonds):
        '''Given the current attributes, the current depth, and the graph that the attributes
		are based on, this function will update the 2D attributes tensor'''

        ############# GET NEW ATTRIBUTE MATRIX #########################
        # New pre-activated attribute matrix v = M_i,j,: x ones((N_atom, 1)) -> (N_atom, N_features)
        # as long as dimensions are appropriately shuffled
        shuffled_graph = graph.copy().dimshuffle(
            (2, 0, 1))  # (N_feature x N_atom x N_atom)
        shuffled_graph.name = 'shuffled_graph'

        ones_vec = K.ones_like(attributes[:, 0])  # (N_atom x 1)
        ones_vec.name = 'ones_vec'

        # Embed individually
        # (scan sequences iterates over the FIRST dimension)
        # (flatten(ndim) keeps the first ndim-1 dimensions the same, then expands the rest to fill)
        flattened_graph = shuffled_graph.flatten(
            ndim=2).T  # (N_atom^2 x N_feature)
        # Embed each possible atom-atom interaction
        (new_presummed_attributes_flat, updates) = theano.scan(
            lambda x: self.activation_inner(
                K.dot(x[:-1], self.W_inner[depth, :, :]) + self.b_inner[depth,
                                                                        0, :]),
            sequences=flattened_graph)  # still (N_atom^2 x N_feature)
        # Reshape into #(N_feature-1 x N_atom x N_atom)
        new_presummed_attributes = new_presummed_attributes_flat.T.reshape(
            shuffled_graph[:-1, :, :].shape)

        # Now sum activated self+neighbors
        (new_attributes, updates) = theano.scan(
            lambda x: K.dot(x, ones_vec),
            sequences=new_presummed_attributes)  # (N_features x N_atom)

        # Append last feature (bond flag) after the loop
        new_attributes = K.concatenate((new_attributes.T, attributes[:, -1:]),
                                       axis=1)
        new_attributes.name = 'new_attributes'

        ############ UPDATE GRAPH TENSOR WITH NEW ATOM ATTRIBUTES ###################
        ### Node attribute contribution is located in every entry of graph[i,j,:] where
        ### there is a bond @ ij or when i = j (self)
        # Get atoms matrix (identity)
        atoms = T.identity_like(bonds)  # (N_atom x N_atom)
        atoms.name = 'atoms_identity'
        # Combine
        bonds_or_atoms = bonds + atoms  # (N_atom x N_atom)
        bonds_or_atoms.name = 'bonds_or_atoms'

        atom_indeces = T.arange(
            ones_vec.shape[0])  # 0 to N_atoms - 1 (indeces)
        atom_indeces.name = 'atom_indeces vector'
        ### Subtract previous node attribute contribution
        # Multiply each entry in bonds_or_atoms by the previous atom features for that column
        (old_features_to_sub, updates) = theano.scan(
            lambda i: T.outer(bonds_or_atoms[:, i], attributes[i, :]),
            sequences=T.arange(ones_vec.shape[0]))
        old_features_to_sub.name = 'old_features_to_sub'

        ### Add new node attribute contribution
        # Multiply each entry in bonds_or_atoms by the previous atom features for that column
        (new_features_to_add, updates) = theano.scan(
            lambda i: T.outer(bonds_or_atoms[:, i], new_attributes[i, :]),
            sequences=T.arange(ones_vec.shape[0]))
        new_features_to_add.name = 'new_features_to_add'

        # Update new graph
        new_graph = graph - old_features_to_sub + new_features_to_add
        new_graph.name = 'new_graph'

        return (new_attributes, new_graph)
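
Each scan step above forms the outer product of one column of bonds_or_atoms with one atom's feature row, so the stacked result satisfies old_features_to_sub[i, j, :] = bonds_or_atoms[j, i] * attributes[i, :]; the same tensor can also be produced by a single broadcasted multiply. A NumPy sketch with hypothetical shapes:

import numpy as np

N_atom, N_feature = 4, 6
bonds_or_atoms = np.random.randint(0, 2, (N_atom, N_atom)).astype(float)
attributes = np.random.randn(N_atom, N_feature)

# scan-of-outer version: one outer product per atom index i
stacked = np.stack([np.outer(bonds_or_atoms[:, i], attributes[i, :])
                    for i in range(N_atom)])                    # (N_atom, N_atom, N_feature)

# equivalent single broadcasted multiply
broadcast = bonds_or_atoms.T[:, :, None] * attributes[:, None, :]

assert np.allclose(stacked, broadcast)
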
Esempio n. 57
0
    def __init__(self,
                 rng,
                 size,
                 N_word,
                 max_length,
                 Wf_values=None,
                 Wp_values=None,
                 L_values=None,
                 activation=T.tanh):
        self.size = size
        self.max_length = max_length

        #initial Wf, bf
        if Wf_values is None:
            Wf_values = np.asarray(rng.uniform(
                low=-np.sqrt(6. / (size + size * 2)),
                high=np.sqrt(6. / (size + size * 2)),
                size=(size, size * 2 + 1)),
                                   dtype=theano.config.floatX)
            if activation == T.nnet.sigmoid:
                Wf_values *= 4

        Wf = theano.shared(value=Wf_values, name='Wf', borrow=True)

        self.Wf = Wf

        #initial Wp, bp
        if Wp_values is None:
            Wp_values = np.asarray(rng.uniform(low=-np.sqrt(6. / (size * 2)),
                                               high=np.sqrt(6. / (size * 2)),
                                               size=(2 * size + 1, )),
                                   dtype=theano.config.floatX)
        Wp = theano.shared(value=Wp_values, name='Wp', borrow=True)
        self.Wp = Wp
        if L_values is None:
            L_values = np.asarray(rng.uniform(low=-np.sqrt(6. / (N_word)),
                                              high=np.sqrt(6. / (N_word)),
                                              size=(N_word, size)),
                                  dtype=theano.config.floatX)

            self.L = theano.shared(value=L_values, name='L', borrow=True)

            self.params = [self.Wf, self.Wp, self.L]
        else:
            self.L = theano.shared(value=L_values, name='L', borrow=True)

            self.params = [
                self.Wf,
                self.Wp,
                #self.L
            ]

        self.L1 = (abs(self.Wf).sum() + abs(self.Wp).sum())

        self.L2_sqr = ((self.Wf**2).sum() + (self.Wp**2).sum())

        v1 = T.fvector('v1')
        v2 = T.fvector('v2')
        dv = T.fvector('dv')
        v = T.fvector('v')
        p = T.fscalar('p')
        p1 = T.fscalar('p1')
        p2 = T.fscalar('p2')
        dp = T.fscalar('dp')
        i = T.iscalar('i')

        f_function = self.f_function(v1, v2)
        p_function = self.g_function(v1, v2) * p1 * p2

        self.f = theano.function(inputs=[v1, v2], outputs=f_function)
        self.p = theano.function(inputs=[v1, v2, p1, p2], outputs=p_function)
        self.L_i = theano.function(inputs=[
            i,
        ], outputs=self.L[i])
        da = (1 - v**2) * dv
        dWf = T.outer(da, T.concatenate([v1, v2, [np.float32(1.0)]]))
        g_f = [
            dWf,
            T.dot(da, self.Wf[:, 0:self.size]),
            T.dot(da, self.Wf[:, self.size:self.size * 2])
        ]
        #g_p = [
        #	T.grad(p_function, element) * dp
        #	for element in [self.Wp, v1, v2, p1, p2]
        #	]
        b = p / p1 / p2
        db = b * (1 - b)
        temp = dp * p1 * p2 * db
        g_p = [
            temp * T.concatenate([v1, v2, [np.float32(1.0)]]),
            temp * self.Wp[0:self.size],
            temp * self.Wp[self.size:self.size * 2], dp * p / p1, dp * p / p2
        ]

        self.g_p = theano.function(inputs=[v1, v2, p1, p2, p, dp], outputs=g_p)

        self.g_f = theano.function(inputs=[v1, v2, v, dv], outputs=g_f)
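
The hand-written gradient above follows the standard rule for a tanh layer v = tanh(Wf . [v1; v2; 1]): the local delta is (1 - v**2) * dv and the weight gradient is its outer product with the concatenated input. A small NumPy check against a finite-difference gradient, with hypothetical sizes:

import numpy as np

size = 3
rng = np.random.RandomState(0)
Wf = rng.randn(size, 2 * size + 1)
v1, v2 = rng.randn(size), rng.randn(size)
upstream = rng.randn(size)                        # dL/dv arriving from above

inp = np.concatenate([v1, v2, [1.0]])
v = np.tanh(Wf @ inp)
da = (1 - v**2) * upstream                        # delta at the pre-activation
dWf = np.outer(da, inp)                           # analytic gradient, as in the code above

# finite-difference check on one entry of Wf
eps = 1e-6
Wf_pert = Wf.copy(); Wf_pert[1, 2] += eps
num = (upstream @ np.tanh(Wf_pert @ inp) - upstream @ v) / eps
assert abs(num - dWf[1, 2]) < 1e-4
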
Esempio n. 58
0
 def test_outer(self):
     f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
     self.assertFunctionContains(f, ScipyGer(destructive=True))
Esempio n. 59
0
import theano
import numpy
import theano.tensor as T

bn = numpy.array([5,6,100,200])
bn = bn.reshape(2,2)
print "Bn shape",bn.shape
x = T.matrix('x')
b = theano.shared(bn)
y = T.outer(x,b)
f = theano.function(inputs=[x],outputs=[y,])
xn = numpy.array([1,2,3,4])
xn = xn.reshape(2,2)
print "Xn shape",xn.shape
print f(xn)
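
Note that both numpy.outer and T.outer flatten non-vector inputs before taking the product, so the result of the snippet above is a 4x4 matrix, not anything block-structured. A quick NumPy confirmation of the same computation:

import numpy as np

xn = np.array([1, 2, 3, 4]).reshape(2, 2)
bn = np.array([5, 6, 100, 200]).reshape(2, 2)

out = np.outer(xn, bn)        # inputs are flattened, result is 4 x 4
assert out.shape == (4, 4)
assert out[2, 3] == 3 * 200   # out[i, j] = xn.ravel()[i] * bn.ravel()[j]
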
Esempio n. 60
0
File: gmmil.py Progetto: xairc/gmmil
    def __init__(self,
                 obsfeat_space, action_space,
                 enable_inputnorm, favor_zero_expert_reward,
                 include_time,
                 time_scale,
                 exobs_Bex_Do, exa_Bex_Da, ext_Bex,
                 kernel_bandwidth_params,
                 kernel_batchsize,
                 kernel_reg_weight,
                 use_median_heuristic,
                 use_logscale_reward,
                 save_reward,
                 epsilon
                 ):

        self.obsfeat_space, self.action_space = obsfeat_space, action_space
        self.favor_zero_expert_reward = favor_zero_expert_reward
        self.include_time = include_time
        self.time_scale = time_scale
        self.exobs_Bex_Do, self.exa_Bex_Da, self.ext_Bex = exobs_Bex_Do, exa_Bex_Da, ext_Bex
        self.use_logscale_reward = use_logscale_reward
        self.save_reward = save_reward
        self.epsilon = epsilon

        with nn.variable_scope('inputnorm'):
            # Standardize both observations and actions if actions are continuous
            # otherwise standardize observations only.
            self.inputnorm = (nn.Standardizer if enable_inputnorm else nn.NoOpStandardizer)(
                (obsfeat_space.dim + action_space.dim) if isinstance(action_space, ContinuousSpace)
                    else obsfeat_space.dim)
            self.inputnorm_updated = False
        self.update_inputnorm(self.exobs_Bex_Do, self.exa_Bex_Da) # pre-standardize with expert data

        # Expert feature expectations
        #self.expert_feat_Df = self._compute_featexp(self.exobs_Bex_Do, self.exa_Bex_Da, self.ext_Bex)
        self.expert_feat_B_Df = self._featurize(self.exobs_Bex_Do, self.exa_Bex_Da, self.ext_Bex)

        # Arguments for MMD Reward
        self.kernel_bandwidth_params = kernel_bandwidth_params
        self.kernel_batchsize = kernel_batchsize
        self.kernel_reg_weight = kernel_reg_weight
        self.use_median_heuristic = use_median_heuristic
        self.mmd_square = 1.

        self.expert_sigmas = []
        self.iteration = 0
        self.YY = None

        self.min_param = 100.0
        self.max_param = 300.0

        # MMD reward function
        # - Use Radial Basis Function Kernel
        #   : k(x,y) = \sum exp(- sigma(i) * ||x-y||^2 )
        # - sigmas : Bandwidth parameters
        x = T.matrix('x')
        y = T.matrix('y')
        sigmas = T.vector('sigmas')
        feat_dim = self.expert_feat_B_Df.shape[1]

        # - dist_B[i, j]: ||x[i] - y[j]||^2
        # We normalize x and y by the feature dimension, since in high
        # dimensions a small per-feature difference between x and y would
        # otherwise dominate the total kernel value.
        normalized_x = x / feat_dim
        normalized_y = y / feat_dim
        dist_B = ((normalized_x)**2).sum(1).reshape((normalized_x.shape[0], 1)) \
               + ((normalized_y)**2).sum(1).reshape((1, normalized_y.shape[0])) \
               - 2*(normalized_x).dot((normalized_y).T)

        rbf_kernel_sum, _ = theano.scan(fn=lambda sigma, distance: T.exp(-sigma*distance),
                                    outputs_info=None,
                                    sequences=sigmas, non_sequences=dist_B)

        rbf_kernel = rbf_kernel_sum.mean(axis=0)

        if self.kernel_reg_weight > 0.0:
            xynorm = T.outer(normalized_x.norm(2, axis=1), normalized_y.norm(2, axis=1))
            rbf_kernel += self.kernel_reg_weight*((normalized_x).dot(normalized_y.T)) / xynorm

        self.kernel_function = theano.function([x, y, sigmas],
                                               [rbf_kernel],
                                               allow_input_downcast=True)

        # Evaluate k( expert, expert )
        if not (self.use_median_heuristic > 0):
            self.kernel_exex_total = self.kernel_function(self.expert_feat_B_Df,
                                                          self.expert_feat_B_Df,
                                                          self.kernel_bandwidth_params)
            self.kernel_exex_total = np.mean(self.kernel_exex_total)
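
The dist_B computation above uses the expansion ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 x_i . y_j to obtain all pairwise squared distances without an explicit loop, and the scan then averages an RBF kernel over the list of bandwidths. A NumPy sketch of those two steps with hypothetical shapes and bandwidths:

import numpy as np

x = np.random.randn(6, 4)     # 6 feature vectors of dimension 4
y = np.random.randn(5, 4)
sigmas = np.array([1.0, 10.0, 100.0])

dist = (x**2).sum(1)[:, None] + (y**2).sum(1)[None, :] - 2 * x @ y.T    # 6 x 5
rbf_kernel = np.mean([np.exp(-s * dist) for s in sigmas], axis=0)       # averaged over bandwidths

i, j = 2, 3
assert np.allclose(dist[i, j], np.sum((x[i] - y[j])**2))
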