Example #1
    def __init__(self, rng, input, nhistory, feature,n_feat, n_in, n_out, N=4096, W=None,
                 sparse=None,activation=None):

        self.input = input
        if W is None:
            W_values = numpy.asarray(rng.uniform(
                    low=-numpy.sqrt(6. / (N*n_in + n_out)),
                    high=numpy.sqrt(6. / (N*n_in + n_out)),
                    size=(N*n_in, n_out)), dtype=theano.config.floatX)
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)
        else:
            W = theano.shared(value=W, name='W', borrow=True)

        self.W = W
        lin_output = T.dot(self.input, self.W)
        for history in nhistory:
            lin_output = T.concatenate((lin_output,T.dot(history,self.W)),axis=1)
         
        if n_feat==0:
            self.output = lin_output
        else:
            self.output = T.concatenate((lin_output,feature),axis=1)
        # parameters of the model
        self.params = [self.W]
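A minimal, hypothetical-shape sketch of the pattern above: the same shared weight matrix projects the current input and each history slice, and the projections are joined along axis 1.

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(0)
W = theano.shared(rng.uniform(size=(5, 3)).astype(theano.config.floatX), name='W')

x = T.matrix('x')    # (batch, 5) current input
h1 = T.matrix('h1')  # (batch, 5) one history slice
out = T.concatenate([T.dot(x, W), T.dot(h1, W)], axis=1)  # (batch, 6)

f = theano.function([x, h1], out)
ones = np.ones((2, 5), dtype=theano.config.floatX)
print(f(ones, ones).shape)  # (2, 6)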
Example #2
 def create_prediction(self):  # runs one prediction pass
     gfs=self.gfs
     pm25in=self.pm25in
     # initial forward pass
     x=T.concatenate([gfs[:,0],gfs[:,1],gfs[:,2],pm25in[:,0],pm25in[:,1],self.cnt[:,:,0]],axis=1)
     if self.celltype==RNN:
         init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size, name="RNN.initial_hidden_state")),
                                   x.shape[0], axis=0)
                          if x.ndim > 1 else create_shared(layer.hidden_size, name="RNN.initial_hidden_state"))
                         if hasattr(layer, 'initial_hidden_state') else None
                         for layer in self.model.layers]
     if self.celltype==LSTM:
         init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size * 2, name="LSTM.initial_hidden_state")),
                                   x.shape[0], axis=0)
                          if x.ndim > 1 else create_shared(layer.hidden_size * 2, name="LSTM.initial_hidden_state"))
                         if hasattr(layer, 'initial_hidden_state') else None
                         for layer in self.model.layers]
     self.layerstatus=self.model.forward(x,init_hiddens)
     #results.shape?40*1
     self.results=self.layerstatus[-1]
     if self.steps > 1:
         self.layerstatus=self.model.forward(T.concatenate([gfs[:,1],gfs[:,2],gfs[:,3],pm25in[:,1],self.results,self.cnt[:,:,1]],axis=1),self.layerstatus)
         self.results=T.concatenate([self.results,self.layerstatus[-1]],axis=1)      
         # remaining steps-2 forward passes
         for i in xrange(2,self.steps):
             self.layerstatus=self.model.forward(T.concatenate([gfs[:,i],gfs[:,i+1],gfs[:,i+2],T.shape_padright(self.results[:,i-2]),T.shape_padright(self.results[:,i-1]),self.cnt[:,:,i]],axis=1),self.layerstatus)
             #need T.shape_padright???
             self.results=T.concatenate([self.results,self.layerstatus[-1]],axis=1)
     return self.results
Example #3
    def transform(self, inputs):
        '''Transform the inputs for this layer into an output for the layer.

        Parameters
        ----------
        inputs : dict of theano expressions
            Symbolic inputs to this layer, given as a dictionary mapping string
            names to Theano expressions. See :func:`base.Layer.connect`.

        Returns
        -------
        outputs : dict of theano expressions
            Theano expressions representing the output from the layer. This
            layer type produces an "out" output that concatenates the outputs
            from its underlying workers. If present, it also concatenates the
            "pre" and "cell" outputs from the underlying workers. Finally, it
            passes along the individual outputs from its workers using "fw" and
            "bw" prefixes for forward and backward directions.
        updates : list of update pairs
            A list of state updates to apply inside a theano function.
        '''
        fout, fupd = self.forward.transform(inputs)
        bout, bupd = self.backward.transform(inputs)
        outputs = dict(out=TT.concatenate([fout['out'], bout['out']], axis=2))
        if 'pre' in fout:
            outputs['pre'] = TT.concatenate([fout['pre'], bout['pre']], axis=2)
        if 'cell' in fout:
            outputs['cell'] = TT.concatenate([fout['cell'], bout['cell']], axis=2)
        for k, v in fout.items():
            outputs['fw_{}'.format(k)] = v
        for k, v in bout.items():
            outputs['bw_{}'.format(k)] = v
        return outputs, fupd + bupd
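A short self-contained sketch of the core join above, with hypothetical shapes: forward and backward outputs of shape (time, batch, hidden) are concatenated along the last axis.

import numpy as np
import theano
import theano.tensor as TT

fout = TT.tensor3('fout')  # (time, batch, hidden)
bout = TT.tensor3('bout')  # (time, batch, hidden)
out = TT.concatenate([fout, bout], axis=2)  # (time, batch, 2 * hidden)

f = theano.function([fout, bout], out)
a = np.zeros((7, 4, 8), dtype=theano.config.floatX)
print(f(a, a).shape)  # (7, 4, 16)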
Example #4
    def recurrence( sample_z_t, sample_x_t, h_tm1_enc, h_tm1_dec, c_tm1_enc, c_tm1_dec,  mu_z_t,  mu_x_tm1, coeff_x_tm1,  v):
        v_hat = v - T.sum(( coeff_x_tm1.dimshuffle(0,'x',1) *  ( mu_x_tm1 + (T.exp(b_sig_x) * sample_x_t).reshape((batch_size, n_visible*n_gmm)) ).reshape((batch_size, n_visible, n_gmm)) ), axis = -1 ) #error input
        r_t = T.concatenate( [v , v_hat], axis = 1 ) 
        
        # v_enc = [r_t, h_tm1_dec]
        v_enc = T.concatenate( [r_t, h_tm1_dec] , axis = 1)
        
        #Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
        i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) + T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
        f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) + T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
        c_t_enc = (f_t_enc * c_tm1_enc) + ( i_t_enc * T.tanh( T.dot(v_enc, Wvc_enc) + T.dot( h_tm1_enc, Whc_enc) + bc_enc ))
        o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) + T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
        h_t_enc = o_t_enc * T.tanh( c_t_enc )
        
        # Get z_t
        mu_z_t = T.dot(h_t_enc, Wh_enc_mu_z ) + b_mu_z
        #sigma_z_t = T.dot(h_t_enc, Wh_enc_sig_z ) + b_sig_z
        #sample =  theano_rng.normal(size=mew_t.shape, avg = 0, std = 1, dtype=theano.config.floatX)
        z_t = mu_z_t + (T.exp(b_sig_z) * sample_z_t).reshape((batch_size,n_z)) 
        # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t) 
        i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) + T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
        f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) + T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t , Wzf_dec))
        c_t_dec = (f_t_dec * c_tm1_dec) + ( i_t_dec * T.tanh( T.dot(z_t, Wzc_dec) + T.dot( h_tm1_dec, Whc_dec) + bc_dec ))
        o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) + T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
        h_t_dec = o_t_dec * T.tanh( c_t_dec )

        # Get w_t
        mu_x_t = mu_x_tm1 + T.dot(h_t_dec, Wh_dec_mu_x) + b_mu_x
        coeff_x_t = T.nnet.softmax( T.dot(h_t_dec, Wh_dec_coeff_x) + b_coeff_x)
        #sigma_x_t = sigma_x_tm1 + T.dot(h_t_dec, Wh_dec_sigma_x) + b_sig_x

        return [ h_t_enc, h_t_dec, c_t_enc, c_t_dec,  mu_z_t,  mu_x_t , coeff_x_t]
Example #5
def get_uhs_operator(uhs, depth, n_hidden, rhos):
    """

    :param uhs:
    :param depth:
    :param n_hidden:
    :param rhos: can be shared variable or constant of shape (depth, )!!
    :return:
    """
    # Will use a Fourier matrix (will be O(n^2)...)
    # Doesn't seem to slow things down much though!
    exp_phases = [T.cos(uhs), T.sin(uhs)]
    neg_exp_phases = [T.cos(uhs[:, ::-1]), -T.sin(uhs[:, ::-1])]
    ones_ = [T.ones((depth, 1), dtype=theano.config.floatX), T.zeros((depth, 1), dtype=theano.config.floatX)]

    rhos_reshaped = T.reshape(rhos, (depth, 1), ndim=2)
    rhos_reshaped = T.addbroadcast(rhos_reshaped, 1)

    eigvals_re = rhos_reshaped * T.concatenate((ones_[0], exp_phases[0], -ones_[0], neg_exp_phases[0]), axis=1)
    eigvals_im = rhos_reshaped * T.concatenate((ones_[1], exp_phases[1], -ones_[1], neg_exp_phases[1]), axis=1)
    phase_array = -2 * np.pi * np.outer(np.arange(n_hidden), np.arange(n_hidden)) / n_hidden
    f_array_re_val = np.cos(phase_array) / n_hidden
    f_array_im_val = np.sin(phase_array) / n_hidden
    f_array_re = theano.shared(f_array_re_val.astype(theano.config.floatX), name="f_arr_re")
    f_array_im = theano.shared(f_array_im_val.astype(theano.config.floatX), name="f_arr_im")

    a_k = T.dot(eigvals_re, f_array_re) + T.dot(eigvals_im, f_array_im)
    uhs_op = rep_vec(a_k, n_hidden, n_hidden)  # shape (depth, 2 * n_hidden - 1)

    return uhs_op
Example #6
 def predict(self, new_data, batch_size, pool_size):
     """
     predict for new data
     """
     img_shape = (batch_size, 1, self.image_shape[2], self.image_shape[3])
     conv_out = conv.conv2d(input=new_data, filters=self.W, filter_shape=self.filter_shape, image_shape=img_shape)
     pool_list = []
     if self.non_linear == "tanh":
         conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle("x", 0, "x", "x"))
         # pad_len = int(self.max_window_len/2)
         # right_pad_len = int(self.filter_shape[2]/2)
         # index_shift = pad_len-right_pad_len
         index_shift = int(self.filter_shape[2] / 2)
         for i in xrange(batch_size):
             # partition sentence via pool size
             e1pos = pool_size[i, 0] + index_shift
             e2pos = pool_size[i, 1] + index_shift
             # if T.gt(e1pos, 0):
             #     p1 = conv_out_tanh[i, :, :e1pos, :]
             # else:
             #     p1 = conv_out_tanh[i, :, 0, :]
             p1 = conv_out_tanh[i, :, :e1pos, :]
             p2 = conv_out_tanh[i, :, e1pos:e2pos, :]
             p3 = conv_out_tanh[i, :, e2pos:, :]
             p1_pool_out = T.max(p1, axis=1)
             p2_pool_out = T.max(p2, axis=1)
             p3_pool_out = T.max(p3, axis=1)
             temp = T.concatenate([p1_pool_out, p2_pool_out, p3_pool_out], axis=1)
             pool_list.append(temp.dimshuffle("x", 0, 1))
     else:
         pass
     output = T.concatenate(pool_list, axis=0)
     return output
Example #7
    def output_probabilistic(self, m_w_previous, v_w_previous):
        if (self.non_linear):
            m_in = self.m_w - m_w_previous
            v_in = self.v_w
            # We compute the mean and variance after the ReLU activation
            lam = self.lam
            v_1 = 1 + 2*lam*v_in
            v_1_inv = v_1**-1

            s_1 = T.prod(v_1,axis=1)**-0.5
            v_2 = 1 + 4*lam*v_in
            v_2_inv = v_2**-1
            s_2 = T.prod(v_2,axis=1)**-0.5
            v_inv = v_in**-1
            exponent1 = m_in**2*(1 - v_1_inv)*v_inv
            exponent1 = T.sum(exponent1,axis=1)
            exponent2 = m_in**2*(1 - v_2_inv)*v_inv
            exponent2 = T.sum(exponent2,axis=1)
            m_a = s_1*T.exp(-0.5*exponent1)
            v_a = s_2*T.exp(-0.5*exponent2) - m_a**2

            return (m_a, v_a)

        else:
            m_w_previous_with_bias = \
            T.concatenate([ m_w_previous, T.alloc(1, 1) ], 0)
            v_w_previous_with_bias = \
            T.concatenate([ v_w_previous, T.alloc(0, 1) ], 0)

            m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs)
            v_linear = (T.dot(self.v_w, v_w_previous_with_bias) + \
                T.dot(self.m_w**2, v_w_previous_with_bias) + \
                T.dot(self.v_w, m_w_previous_with_bias**2)) / self.n_inputs
            return (m_linear, v_linear)
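A hedged stand-alone illustration of the bias trick in the linear branch above: a constant entry is appended to the previous layer's output before the dot products (T.ones is used here in place of T.alloc to keep dtypes aligned).

import numpy as np
import theano
import theano.tensor as T

m_w_previous = T.vector('m_w_previous')                       # (n,)
m_with_bias = T.concatenate([m_w_previous, T.ones((1,))], 0)  # (n + 1,), bias entry appended

f = theano.function([m_w_previous], m_with_bias)
print(f(np.array([0.2, -0.3], dtype=theano.config.floatX)))   # [ 0.2 -0.3  1. ]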
Example #8
def _best_path_decode(activations):
    """Calculate the CTC best-path decoding for a given activation sequence.
       In the returned matrix, shorter sequences are padded with -1s."""

    # For each timestep, get the highest output
    decoding = T.argmax(activations, axis=2)

    # prev_outputs[time][example] == decoding[time - 1][example]
    prev_outputs = T.concatenate([T.alloc(_BLANK, 1, decoding.shape[1]), decoding], axis=0)[:-1]

    # Filter all repetitions to zero (blanks are already zero)
    decoding = decoding * T.neq(decoding, prev_outputs)

    # Calculate how many blanks each sequence has relative to longest sequence
    blank_counts = T.eq(decoding, 0).sum(axis=0)
    min_blank_count = T.min(blank_counts, axis=0)
    max_seq_length = decoding.shape[0] - min_blank_count # used later
    padding_needed = blank_counts - min_blank_count

    # Generate the padding matrix by ... doing tricky things
    max_padding_needed = T.max(padding_needed, axis=0)
    padding_needed = padding_needed.dimshuffle('x',0).repeat(max_padding_needed, axis=0)
    padding = T.arange(max_padding_needed).dimshuffle(0,'x').repeat(decoding.shape[1],axis=1)
    padding = PADDING * T.lt(padding, padding_needed)

    # Apply the padding
    decoding = T.concatenate([decoding, padding], axis=0)

    # Remove zero values
    nonzero_vals = decoding.T.nonzero_values()
    decoding = T.reshape(nonzero_vals, (decoding.shape[1], max_seq_length)).T

    return decoding
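A hedged miniature of the first two steps above, assuming the blank symbol is 0: the decoding is shifted down one timestep by prepending a row of blanks, and repeated labels are then zeroed out.

import numpy as np
import theano
import theano.tensor as T

BLANK = 0
decoding = T.imatrix('decoding')  # (time, examples)

prev_outputs = T.concatenate([T.alloc(np.int32(BLANK), 1, decoding.shape[1]), decoding], axis=0)[:-1]
collapsed = decoding * T.neq(decoding, prev_outputs)  # repeated labels become blank (0)

f = theano.function([decoding], collapsed)
d = np.array([[1], [1], [2], [0], [2]], dtype=np.int32)
print(f(d).ravel())  # [1 0 2 0 2]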
Example #9
def _create_maximum_activation_update(output, record, streamindex, topn):
    """
    Calculates update of the topn maximums for one batch of outputs.
    """
    dims, maximums, indices, snapshot = record
    counters = tensor.tile(tensor.shape_padright(
        tensor.arange(output.shape[0]) + streamindex), (1, output.shape[1]))
    if len(dims) == 1:
        # output is a 2d tensor, (cases, units) -> activation
        tmax = output
        # counters is a 2d tensor broadcastable (cases, units) -> case_index
        tind = counters
    else:
        # output is a 4d tensor: fmax flattens it to 3d
        fmax = output.flatten(ndim=3)
        # fargmax is a 2d tensor containing rolled maximum locations
        fargmax = fmax.argmax(axis=2)
        # fetch the maximum. tmax is 2d, (cases, units) -> activation
        tmax = _apply_index(fmax, fargmax, axis=2)
        # targmax is a tuple that separates rolled-up location into (x, y)
        targmax = divmod(fargmax, dims[2])
        # tind is a 3d tensor (cases, units, 3) -> case_index, maxloc
        # this will match indices which is a 3d tensor also
        tind = tensor.stack((counters, ) + targmax, axis=2)
    cmax = tensor.concatenate((maximums, tmax), axis=0)
    cind = tensor.concatenate((indices, tind), axis=0)
    cargsort = (-cmax).argsort(axis=0)[:topn]
    newmax = _apply_perm(cmax, cargsort, axis=0)
    newind = _apply_perm(cind, cargsort, axis=0)
    updates = [(maximums, newmax), (indices, newind)]
    if snapshot:
        csnap = tensor.concatenate((snapshot, output), axis=0)
        newsnap = _apply_perm(csnap, cargsort, axis=0)
        updates.append((snapshot, newsnap))
    return updates
Example #10
def create_TrainFunc_tranPES(simfn, embeddings,  marge=0.5, alpha=1., beta=1.):

    # parse the embedding data
    embedding = embeddings[0] # D x N matrix
    lembedding = embeddings[1]

    # declare the symbolic variables for training triples
    hp = S.csr_matrix('head positive') # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')

    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')

    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # Generate the training positive and negative triples
    hpmat = S.dot(embedding.E, hp).T #  batchsize x D dense matrix
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T

    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    # calculate the score
    pos = tranPES3(simfn, T.concatenate([hpmat, tpmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tpmat)


    negh = tranPES3(simfn, T.concatenate([hnmat, tpmat], axis=1).reshape((hnmat.shape[0], 2, hnmat.shape[1])).dimshuffle(0, 2, 1), hnmat, rpmat, tpmat)
    negt = tranPES3(simfn, T.concatenate([hpmat, tnmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)
    

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # updating the embeddings using gradient descent
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb*emb_grad

    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad

    updates = OrderedDict({embedding.E: New_embedding, lembedding.E: New_rembedding})

    return theano.function(list_in, [cost, T.mean(out), T.mean(outc), embreg[0], lembreg],
                          updates=updates, on_unused_input='ignore')
Example #11
def get_bivariate_normal_spec():
    X1,X2,mu,sigma = [T.scalar('X1'),T.scalar('X2'), T.vector('mu'), T.matrix('sigma')]
    GaussianDensitySpec = FunctionSpec(variables=[X1, X2, mu, sigma],
                                       output_expression = -0.5*T.dot(T.dot((T.concatenate([X1.dimshuffle('x'),X2.dimshuffle('x')])-mu).T,
                                                                            nlinalg.matrix_inverse(sigma)),
                                                                      (T.concatenate([X1.dimshuffle('x'),X2.dimshuffle('x')])-mu)))
    return GaussianDensitySpec
Example #12
    def forward(self, x, hc):
        """
        :param x: 1D: batch, 2D: self.n_in
        :param hc: 1D: batch, 2D: self.n_out * (self.order+1)
        :return:
        """
        order, n_in, n_out, activation = self.order, self.n_in, self.n_out, self.activation
        layers = self.internal_layers
        if hc.ndim > 1:
            h_tm1 = hc[:, n_out*order:]
        else:
            h_tm1 = hc[n_out*order:]

        lst = []
        for i in range(order):
            if hc.ndim > 1:
                c_i_tm1 = hc[:, n_out * i: n_out * i + n_out]
            else:
                c_i_tm1 = hc[n_out * i: n_out * i + n_out]
            if i == 0:
                c_i_t = layers[i].forward(x)
            else:
                c_i_t = c_im1_tm1 + layers[i].forward(x)
            lst.append(c_i_t)
            c_im1_tm1 = c_i_tm1

        h_t = activation(c_i_t + self.bias)
        lst.append(h_t)

        if hc.ndim > 1:
            return T.concatenate(lst, axis=1)
        else:
            return T.concatenate(lst)
Example #13
        def recurrence( sample_z_t, sample_x_t, h_tm1_enc, h_tm1_dec, c_tm1_enc, c_tm1_dec,  mu_z_t,  sigma_z_t, mu_x_tm1, sigma_x_tm1,  v):
            if v is not None:
                v_hat = v -  ( mu_x_tm1 + (sigma_x_tm1 * sample_x_t.reshape((batch_size, n_visible)) ) )#error input
                r_t = T.concatenate( [v , v_hat], axis = 1 ) 
            else:
                v_hat = mu_x_tm1 -  ( mu_x_tm1 + (sigma_x_tm1 * sample_x_t.reshape((batch_size, n_visible)) ) )#error input
                r_t = T.concatenate( [mu_x_tm1 , v_hat], axis = 1 ) 
            # v_enc = [r_t, h_tm1_dec]
            v_enc = T.concatenate( [r_t, h_tm1_dec] , axis = 1)
        
            #Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
            i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) + T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
            f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) + T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
            c_t_enc = (f_t_enc * c_tm1_enc) + ( i_t_enc * T.tanh( T.dot(v_enc, Wvc_enc) + T.dot( h_tm1_enc, Whc_enc) + bc_enc ))
            o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) + T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
            h_t_enc = o_t_enc * T.tanh( c_t_enc )
        
            # Get z_t
            mu_z_t = T.dot(h_t_enc, Wh_enc_mu_z ) + b_mu_z
            sigma_z_t = sigma_b + T.nnet.softplus(T.dot(h_t_enc, Wh_enc_sig_z ) + b_sig_z)
            #sample =  theano_rng.normal(size=mew_t.shape, avg = 0, std = 1, dtype=theano.config.floatX)
            z_t = mu_z_t + (sigma_z_t * (sample_z_t.reshape((batch_size,n_z))) ) 
            # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t) 
            i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) + T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
            f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) + T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t , Wzf_dec))
            c_t_dec = (f_t_dec * c_tm1_dec) + ( i_t_dec * T.tanh( T.dot(z_t, Wzc_dec) + T.dot( h_tm1_dec, Whc_dec) + bc_dec ))
            o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) + T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
            h_t_dec = o_t_dec * T.tanh( c_t_dec )

            # Get w_t
            mu_x_t = mu_x_tm1 + T.dot(h_t_dec, Wh_dec_mu_x) + b_mu_x
            sigma_x_t = sigma_b +  T.nnet.softplus(T.dot(h_t_dec, Wh_dec_sig_x) + b_sig_x)

            return [ h_t_enc, h_t_dec, c_t_enc, c_t_dec,  mu_z_t, sigma_z_t,  mu_x_t, sigma_x_t]
Example #14
    def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
        weightedC1= T.dot(relationProbs, self.C1.dimshuffle(1, 0))
        weightedC2= T.dot(relationProbs, self.C2.dimshuffle(1, 0))

        left1 = self.leftMostFactorization(batchSize=l, args=args1, wC1=weightedC1)
        right1 = self.rightMostFactorization(batchSize=l, args=args2, wC2=weightedC2)
        one = left1 + right1

        u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])
        logScoresP = T.log(T.nnet.sigmoid(u))
        allScores = logScoresP
        allScores = T.concatenate([allScores, entropy, entropy])

        negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
        negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
        negative1 = self.negLeftMostFactorization(batchSize=l,
                                                  negEmbed=negembed1,
                                                  wC1=weightedC1)
        negative2 = self.negRightMostFactorization(batchSize=l,
                                                  negEmbed=negembed2,
                                                  wC2=weightedC2)

        negOne = negative1.dimshuffle(1, 0) + right1
        negTwo = negative2.dimshuffle(1, 0) + left1
        g = T.concatenate([negOne + self.Ab[neg1], negTwo + self.Ab[neg2]])
        logScores = T.log(T.nnet.sigmoid(-g))
        allScores = T.concatenate([allScores, logScores.flatten()])

        return allScores
Example #15
    def forward(self, x, hc):
        """
        :param x: the input vector or matrix
        :param hc: the vector/matrix of [ c_tm1, h_tm1 ], i.e. hidden state and visible state concatenated together
        :return: [ c_t, h_t ] as a single concatenated vector/matrix
        """
        n_in, n_out, activation = self.n_in, self.n_out, self.activation

        if hc.ndim > 1:
            c_tm1 = hc[:, :n_out]
            h_tm1 = hc[:, n_out:]
        else:
            c_tm1 = hc[:n_out]
            h_tm1 = hc[n_out:]

        in_t = self.in_gate.forward(x, h_tm1)
        forget_t = self.forget_gate.forward(x, h_tm1)
        out_t = self.out_gate.forward(x, h_tm1)

        c_t = forget_t * c_tm1 + in_t * self.input_layer.forward(x, h_tm1)
        h_t = out_t * T.tanh(c_t)

        if hc.ndim > 1:
            return T.concatenate([c_t, h_t], axis=1)
        else:
            return T.concatenate([c_t, h_t])
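A hedged sketch of the [c, h] packing convention used by this forward pass (hypothetical n_out=3): cell and hidden state travel as one concatenated matrix and are sliced apart again on the next step.

import numpy as np
import theano
import theano.tensor as T

n_out = 3
c_t = T.matrix('c_t')  # (batch, n_out)
h_t = T.matrix('h_t')  # (batch, n_out)

hc = T.concatenate([c_t, h_t], axis=1)       # (batch, 2 * n_out)
c_tm1, h_tm1 = hc[:, :n_out], hc[:, n_out:]  # recovered slices

f = theano.function([c_t, h_t], [hc, c_tm1, h_tm1])
c = np.zeros((2, n_out), dtype=theano.config.floatX)
h = np.ones((2, n_out), dtype=theano.config.floatX)
print([a.shape for a in f(c, h)])  # [(2, 6), (2, 3), (2, 3)]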
Example #16
    def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
        argembed1 = self.A[args1]
        argembed2 = self.A[args2]

        weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]])
        one = self.factorization(batchSize=l,
                                 argsEmbA=argembed1,
                                 argsEmbB=argembed2,
                                 wC=weightedC)  # [l,n]

        u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])

        logScoresP = T.log(T.nnet.sigmoid(u))

        allScores = logScoresP
        allScores = T.concatenate([allScores, entropy, entropy])


        negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
        negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
        negOne = self.negFactorization1(batchSize=l,
                                        negEmbA=negembed1,
                                        argsEmbB=argembed2,
                                        wC=weightedC)

        negTwo = self.negFactorization2(batchSize=l,
                                        argsEmbA=argembed1,
                                        negEmbB=negembed2,
                                        wC=weightedC)

        g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0),
                           negTwo + self.Ab[neg2].dimshuffle(1, 0)])
        logScores = T.log(T.nnet.sigmoid(-g))
        allScores = T.concatenate([allScores, logScores.flatten()])
        return allScores
Example #17
File: lds.py Project: ddofer/breze
def filter_and_prob(inpt, transition, emission,
           visible_noise_mean, visible_noise_cov,
           hidden_noise_mean, hidden_noise_cov,
           initial_hidden, initial_hidden_cov):
    step = forward_step(
        transition, emission,
        visible_noise_mean, visible_noise_cov,
        hidden_noise_mean, hidden_noise_cov)

    hidden_mean_0 = T.zeros_like(hidden_noise_mean).dimshuffle('x', 0)
    hidden_cov_0 = T.zeros_like(hidden_noise_cov).dimshuffle('x', 0, 1)
    f0, F0, ll0 = step(inpt[0], hidden_mean_0, hidden_cov_0)
    replace = {hidden_noise_mean: initial_hidden, 
               hidden_noise_cov: initial_hidden_cov}
    f0 = theano.clone(f0, replace)
    F0 = theano.clone(F0, replace)
    ll0 = theano.clone(ll0, replace)

    (f, F, ll), _ = theano.scan(
        step,
        sequences=inpt[1:],
        outputs_info=[f0, F0, None])

    ll = ll.sum(axis=0)

    f = T.concatenate([T.shape_padleft(f0), f])
    F = T.concatenate([T.shape_padleft(F0), F])
    ll += ll0

    return f, F, ll
Example #18
def gru_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = GRU(n_i=n_fin, n_h=n_h)
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))
            # h0: 1D: Batch, 2D: n_h
            h0 = T.zeros((batch, n_h), dtype=theano.config.floatX)
        else:
            layer = GRU(n_i=n_h * 2, n_h=n_h)
            # h: 1D: n_words, 2D: Batch, 3D n_h
            layer_input = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]
            h0 = layer_input[0]

        xr = T.dot(layer_input, layer.W_xr)
        xz = T.dot(layer_input, layer.W_xz)
        xh = T.dot(layer_input, layer.W_xh)

        h, _ = theano.scan(fn=layer.forward, sequences=[xr, xz, xh], outputs_info=[h0])
        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit
Example #19
    def get_output_for(self, inputs, **kwargs):
        """
            Updates stack given input, stack controls and output in the inputs array
        """

        # unpack inputs
        input_val, prev_stack, controls = inputs
        assert input_val.ndim == 2

        # cast shapes
        controls = controls.reshape([-1, 3, 1, 1])
        input_val = insert_dim(input_val, 1)
        zeros_at_the_top = insert_dim(T.zeros_like(prev_stack[:, 0]), 1)

        # unpack controls
        a_push, a_pop, a_no_op = controls[:, 0], controls[:, 1], controls[:, 2]

        # a version of the stack with the top removed and zeros appended at the bottom (pop)
        stack_down = T.concatenate([prev_stack[:, 1:], zeros_at_the_top], axis=1)

        # a version of the stack with the new input on top and the bottom dropped (push)
        stack_up = T.concatenate([input_val, prev_stack[:, :-1]], axis=1)

        # new stack
        new_stack = a_no_op * prev_stack + a_push * stack_up + a_pop * stack_down

        return new_stack
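A small self-contained sketch of how the push and pop views above are built with concatenate (hypothetical stack of depth 3 and width 2, batch of 1); insert_dim is replaced here by plain slicing to keep the snippet dependency-free.

import numpy as np
import theano
import theano.tensor as T

prev_stack = T.tensor3('prev_stack')  # (batch, depth, width)
input_val = T.tensor3('input_val')    # (batch, 1, width), the element being pushed

zeros_at_the_bottom = T.zeros_like(prev_stack[:, :1])                         # (batch, 1, width)
stack_down = T.concatenate([prev_stack[:, 1:], zeros_at_the_bottom], axis=1)  # pop: top removed
stack_up = T.concatenate([input_val, prev_stack[:, :-1]], axis=1)             # push: input on top

f = theano.function([prev_stack, input_val], [stack_down, stack_up])
s = np.arange(6, dtype=theano.config.floatX).reshape((1, 3, 2))
x = np.full((1, 1, 2), 9, dtype=theano.config.floatX)
down, up = f(s, x)
print(down[0].tolist())  # [[2.0, 3.0], [4.0, 5.0], [0.0, 0.0]]
print(up[0].tolist())    # [[9.0, 9.0], [0.0, 1.0], [2.0, 3.0]]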
Example #20
 def _pad_blanks(queryseq, blank_symbol, queryseq_mask=None):
     """
     Pad queryseq and corresponding queryseq_mask with blank symbol
     :param queryseq  (L, B)
     :param queryseq_mask (L, B)
     :param blank_symbol  scalar
     :return queryseq_padded, queryseq_mask_padded, both with shape (2L+1, B)
     """
     # for queryseq
     queryseq_extended = queryseq.dimshuffle(1, 0, 'x')                              # (L, B) -> (B, L, 1)
     blanks = tensor.zeros_like(queryseq_extended) + blank_symbol                    # (B, L, 1)
     concat = tensor.concatenate([queryseq_extended, blanks], axis=2)                # concat.shape = (B, L, 2)
     res = concat.reshape((concat.shape[0], concat.shape[1] * concat.shape[2])).T    # res.shape = (2L, B); the reshape interleaves the last two dimensions
     begining_blanks = tensor.zeros((1, res.shape[1])) + blank_symbol                # (1, B)
     queryseq_padded = tensor.concatenate([begining_blanks, res], axis=0)            # (1+2L, B)
     # for queryseq_mask
     if queryseq_mask is not None:
         queryseq_mask_extended = queryseq_mask.dimshuffle(1, 0, 'x')                          # (L, B) -> (B, L, 1)
         concat = tensor.concatenate([queryseq_mask_extended, queryseq_mask_extended], axis=2) # concat.shape = (B, L, 2)
         res = concat.reshape((concat.shape[0], concat.shape[1] * concat.shape[2])).T
         begining_blanks = tensor.ones((1, res.shape[1]), dtype=floatX)
         queryseq_mask_padded = tensor.concatenate([begining_blanks, res], axis=0)
     else:
         queryseq_mask_padded = None
     return queryseq_padded, queryseq_mask_padded
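A hedged numeric illustration of the dimshuffle/concatenate/reshape trick above, assuming blank_symbol = 0: every label gets a trailing blank and one extra blank is prepended, producing the (2L+1, B) layout.

import numpy as np
import theano
import theano.tensor as tensor

blank_symbol = 0
queryseq = tensor.imatrix('queryseq')  # (L, B)

extended = queryseq.dimshuffle(1, 0, 'x')                     # (B, L, 1)
blanks = tensor.zeros_like(extended) + blank_symbol           # (B, L, 1)
interleaved = tensor.concatenate([extended, blanks], axis=2)  # (B, L, 2)
res = interleaved.reshape((interleaved.shape[0],
                           interleaved.shape[1] * interleaved.shape[2])).T  # (2L, B)
begin = tensor.zeros((1, res.shape[1]), dtype='int32') + blank_symbol
padded = tensor.concatenate([begin, res], axis=0)             # (2L+1, B)

f = theano.function([queryseq], padded)
labels = np.array([[1], [2], [3]], dtype=np.int32)            # L=3, B=1
print(f(labels).ravel())  # [0 1 0 2 0 3 0]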
Example #21
    def recur(self, ms_j, mt_jm1, mscut_j, mtcut_jm1,
            ssrcpos_js, vsrcpos_js, starpos_js, vtarpos_js ):
        
         # cnn encoding
        ngms_j,  uttms_j   = self.sCNN.encode(ms_j,  mscut_j)
        ngmt_jm1,uttmt_jm1 = self.tCNN.encode(mt_jm1,mtcut_jm1)
        
        # padding dummy vector
        ngms_j   = T.concatenate([ngms_j,T.zeros_like(ngms_j[-1:,:])],axis=0)
        ngmt_jm1 = T.concatenate([ngmt_jm1,T.zeros_like(ngmt_jm1[-1:,:])],axis=0)

        # source features
        ssrcemb_js = T.sum(ngms_j[ssrcpos_js,:],axis=0)
        vsrcemb_js = T.sum(ngms_j[vsrcpos_js,:],axis=0)
        src_js = T.concatenate([ssrcemb_js,vsrcemb_js,uttms_j],axis=0)
        
        # target features
        staremb_js = T.sum(ngmt_jm1[starpos_js,:],axis=0)
        vtaremb_js = T.sum(ngmt_jm1[vtarpos_js,:],axis=0)
        tar_js = T.concatenate([staremb_js,vtaremb_js,uttmt_jm1],axis=0)
       
        # update g_j
        g_j   = T.dot( self.Whb, T.nnet.sigmoid( 
                T.dot(src_js,self.Wfbs) + 
                T.dot(tar_js,self.Wfbt) +
                self.B0)).dimshuffle('x')
        # update b_j
        g_j = T.concatenate([g_j,self.B],axis=0)
        b_j = T.nnet.softmax( g_j )[0,:]
        
        return b_j
Example #22
 def apply(self, source_sentence, source_sentence_mask):
     """Creates the final list of annotations.
     
     Args:
         source_sentence (Variable): Source sentence with words in
                                     vector representation.
         source_sentence_mask (Variable): Source mask
     
     Returns:
         Variable. source annotations
     """
     # Time as first dimension
     base_representations,base_mask = self.base_encoder.apply(
                                                       source_sentence,
                                                       source_sentence_mask)
     annotations = []
     masks = []
     if self.add_direct:
         annotations.append(base_representations)
         masks.append(base_mask)
     for annotator in self.annotators:
         ann,mask = annotator.apply(base_representations,
                                    base_mask)
         annotations.append(ann)
         masks.append(mask)
     return tensor.concatenate(annotations), tensor.concatenate(masks)
Example #23
    def __init__(self, input_ngram, input_sm, vocab_size, emb_dim, num_section, linear_W_emb=None, fix_emb=False, nonlinear=None, activation=None):
        
        global rng
        global init_range
        if linear_W_emb is None:
            # random initialize
            linear_W_emb = np.asarray(rng.uniform(
                low=-init_range, high=init_range, size=(vocab_size, emb_dim)), dtype=theano.config.floatX)
        else:
            # use the given model parameter
            given_vocab_size, given_emb_dim = linear_W_emb.shape
            assert(given_vocab_size == vocab_size and given_emb_dim == emb_dim)

        # shared variables
        self.W_emb = theano.shared(value=linear_W_emb, name='W_emb')

        # stack vectors
        input_ngram = T.cast(input_ngram, 'int32')
        input_sm = T.cast(input_sm, 'int32')

        # output is a matrix where each row corresponds to a context_size embedding vector, and the number of rows equals the batch size
        # output dimensions: batch_size * ((context_size + 1) * emb_dim)
        output_local = self.W_emb[input_ngram[:, :-1].flatten()].reshape(
            (input_ngram.shape[0], emb_dim * (input_ngram.shape[1] - 1)))  # self.W_emb.shape[1]
        
        sentence_lengths = input_sm[:,0]
        sentence_matrix = input_sm[:,1:]

        sentence_num = sentence_matrix.shape[0]
        global_length = sentence_matrix.shape[1]
        section_length = T.cast(T.ceil(global_length / float(num_section)), 'int32')

        # For the first section
        sentence_embeddings = T.mean(self.W_emb[sentence_matrix[:, :section_length].flatten()].reshape(
            (sentence_num, section_length, emb_dim)), axis=1)

        # For the remaining sections
        for i in xrange(1, num_section):
            current_section = T.mean(self.W_emb[sentence_matrix[:, i*section_length:(i+1)*section_length].flatten()].reshape(
                (sentence_num, section_length, emb_dim)), axis=1)
            sentence_embeddings = T.concatenate([sentence_embeddings, current_section], axis=1)

        # get the sentence index for each ngram vector, and transform it to 0-based
        sentence_indeces = input_ngram[:,-1]
        base_index = sentence_indeces[0]
        sentence_indeces = sentence_indeces - base_index

        # the last column of output should be a weighted sum of the sentence
        # vectors
        output_global = sentence_embeddings[sentence_indeces.flatten()].reshape((sentence_indeces.shape[0], emb_dim * num_section))

        # handle non-linear layer
        if nonlinear is None or activation is None:
            self.output = T.concatenate([output_local, output_global], axis=1)
            # params is the word embedding matrix
            self.params = [self.W_emb] if not fix_emb else []
        else:
            self.non_linear_params, non_linear_output_global = addNonlinearLayer(output_global, emb_dim * num_section, nonlinear, activation)
            self.output = T.concatenate([output_local, non_linear_output_global], axis=1)
            self.params = [self.W_emb] + self.non_linear_params if not fix_emb else self.non_linear_params
Example #24
def _join_global_RVs(global_RVs, global_order):
    if len(global_RVs) == 0:
        inarray_global = None
        uw_global = None
        replace_global = {}
        c_g = 0
    else:
        joined_global = tt.concatenate([v.ravel() for v in global_RVs])
        uw_global = tt.vector('uw_global')
        uw_global.tag.test_value = np.concatenate(
            [joined_global.tag.test_value, joined_global.tag.test_value]
        )

        inarray_global = joined_global.type('inarray_global')
        inarray_global.tag.test_value = joined_global.tag.test_value

        # Replace RVs with reshaped subvectors of the joined vector
        # The order of global_order is the same with that of global_RVs
        subvecs = [reshape_t(inarray_global[slc], shp).astype(dtyp)
                   for _, slc, shp, dtyp in global_order.vmap]
        replace_global = {v: subvec for v, subvec in zip(global_RVs, subvecs)}

        # Weight vector
        cs = [c for _, c in global_RVs.items()]
        oness = [tt.ones(v.ravel().tag.test_value.shape) for v in global_RVs]
        c_g = tt.concatenate([c * ones for c, ones in zip(cs, oness)])

    return inarray_global, uw_global, replace_global, c_g
Example #25
def _join_local_RVs(local_RVs, local_order):
    if len(local_RVs) == 0:
        inarray_local = None
        uw_local = None
        replace_local = {}
        c_l = 0
    else:
        joined_local = tt.concatenate([v.ravel() for v in local_RVs])
        uw_local = tt.vector('uw_local')
        uw_local.tag.test_value = np.concatenate([joined_local.tag.test_value,
                                                  joined_local.tag.test_value])

        inarray_local = joined_local.type('inarray_local')
        inarray_local.tag.test_value = joined_local.tag.test_value

        get_var = {var.name: var for var in local_RVs}
        replace_local = {
            get_var[var]: reshape_t(inarray_local[slc], shp).astype(dtyp)
            for var, slc, shp, dtyp in local_order.vmap
        }

        # Weight vector
        cs = [c for _, (_, c) in local_RVs.items()]
        oness = [tt.ones(v.ravel().tag.test_value.shape) for v in local_RVs]
        c_l = tt.concatenate([c * ones for c, ones in zip(cs, oness)])

    return inarray_local, uw_local, replace_local, c_l
Example #26
    def get_unfolding_cost(self):
        ''' computes the unfolding reconstruction cost (more than 2 inputs) '''
        x  = T.reshape(self.x, (-1, self.n_vector)) 
        yi = x[0];i=1
        for i in range(1, self.num):
        #while T.lt(i, self.num):
            xi = T.concatenate((yi, x[i]))
            yi = self.get_hidden_values(xi)
            i += 1
        # Save the deepest hidden value as the output vector
        self.vector = copy.deepcopy(yi)

        tmp = []
        i = 1
        for i in range(1, self.num):
        #while T.lt(i, self.num):
            zi = self.get_reconstructed(yi)
            t  = T.reshape(zi, (2, self.n_vector))
            tmp.append(t[1])
            yi = t[0]
            i += 1
        tmp.append(yi)
        tmp.reverse()
    
        x = self.x
        z = T.concatenate(tmp)
        
        # cross-entropy cost should be modified here.
        L = -T.sum( (0.5*x+0.5)*T.log(0.5*z+0.5) + (-0.5*x+0.5)*T.log(-0.5*z+0.5) )
        # squared cost.
        #L = -T.sum( (x-z)**2 )
        
        cost = T.mean(L) + 0.01*(self.W**2).sum()   # cost for a minibatch
        return cost 
Example #27
def diag_gauss(inpt):
    """Transfer function to turn an arary into sufficient statistics of a
    diagonal Gaussian.

    The first half of the input will be left unchanged, the second will be
    squared. the "split" into halves is performed along the second axis.

    Parameters
    ----------

    inpt : Theano tensor
        Array of shape ``(n, d)`` or ``(t, n, d)``.

    Returns
    -------

    output : Theano variable.
        Transformed input. Same shape as ``inpt``.
    """
    half = inpt.shape[-1] // 2
    if inpt.ndim == 3:
        mean, var = inpt[:, :, :half], inpt[:, :, half:]
        res = T.concatenate([mean, var ** 2 + 1e-8], axis=2)
    else:
        mean, var = inpt[:, :half], inpt[:, half:]
        res = T.concatenate([mean, var ** 2 + 1e-8], axis=1)
    return res
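A hedged usage sketch of the 2-D branch above: the input is split in half along the last axis, the second half is squared (plus a small offset), and the halves are re-joined, so the output shape matches the input shape.

import numpy as np
import theano
import theano.tensor as T

inpt = T.matrix('inpt')  # (n, d) with d even
half = inpt.shape[-1] // 2
mean, var = inpt[:, :half], inpt[:, half:]
res = T.concatenate([mean, var ** 2 + 1e-8], axis=1)

f = theano.function([inpt], res)
x = np.array([[1., 2., 3., -2.]], dtype=theano.config.floatX)
print(f(x))  # [[ 1.  2.  9.  4.]] (up to the 1e-8 offset)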
Example #28
    def recurrence( sample_t, h_tm1_enc, h_tm1_dec, c_tm1_enc, c_tm1_dec, w_tm1, mew_t, sigma_t, v):
        v_hat = v - T.nnet.sigmoid(w_tm1) #error input
        r_t = T.concatenate( [v , v_hat], axis = 1 ) 
        
        # v_enc = [r_t, h_tm1_dec]
        v_enc = T.concatenate( [r_t, h_tm1_dec] , axis = 1)
        
        #Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
        i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) + T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
        f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) + T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
        c_t_enc = (f_t_enc * c_tm1_enc) + ( i_t_enc * T.tanh( T.dot(v_enc, Wvc_enc) + T.dot( h_tm1_enc, Whc_enc) + bc_enc ))
        o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) + T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
        h_t_enc = o_t_enc * T.tanh( c_t_enc )
        
        # Get z_t
        mew_t = T.dot(h_t_enc, Wh_enc_mew )
        sigma_t = T.dot(h_t_enc, Wh_enc_sig )
        #sample =  theano_rng.normal(size=mew_t.shape, avg = 0, std = 1, dtype=theano.config.floatX)
        z_t = mew_t + (T.exp(sigma_t) * sample_t )
        # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t) 
        i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) + T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
        f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) + T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t , Wzf_dec))
        c_t_dec = (f_t_dec * c_tm1_dec) + ( i_t_dec * T.tanh( T.dot(z_t, Wzc_dec) + T.dot( h_tm1_dec, Whc_dec) + bc_dec ))
        o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) + T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
        h_t_dec = o_t_dec * T.tanh( c_t_dec )

        # Get w_t
        w_t = w_tm1 + T.dot(h_t_dec, Wh_dec_w)
        return [ h_t_enc, h_t_dec, c_t_enc, c_t_dec, w_t, mew_t, sigma_t]
Example #29
        def _build(det_dropout):
            all_out_probs = []
            for encoding, lstmstack, encoded_melody, relative_pos in zip(self.encodings, self.lstmstacks, encoded_melodies, relative_posns):
                activations = lstmstack.do_preprocess_scan( timestep=T.tile(T.arange(n_time), (n_batch,1)) ,
                                                            relative_position=relative_pos,
                                                            cur_chord_type=chord_types,
                                                            cur_chord_root=chord_roots,
                                                            last_output=T.concatenate([T.tile(encoding.initial_encoded_form(), (n_batch,1,1)),
                                                                                encoded_melody[:,:-1,:] ], 1),
                                                            deterministic_dropout=det_dropout)

                out_probs = encoding.decode_to_probs(activations, relative_pos, self.bounds.lowbound, self.bounds.highbound)
                all_out_probs.append(out_probs)
            reduced_out_probs = functools.reduce((lambda x,y: x*y), all_out_probs)
            if self.normalize_artic_only:
                non_artic_probs = reduced_out_probs[:,:,:2]
                artic_probs = reduced_out_probs[:,:,2:]
                non_artic_sum = T.sum(non_artic_probs, 2, keepdims=True)
                artic_sum = T.sum(artic_probs, 2, keepdims=True)
                norm_artic_probs = artic_probs*(1-non_artic_sum)/artic_sum
                norm_out_probs = T.concatenate([non_artic_probs, norm_artic_probs], 2)
            else:
                normsum = T.sum(reduced_out_probs, 2, keepdims=True)
                normsum = T.maximum(normsum, constants.EPSILON)
                norm_out_probs = reduced_out_probs/normsum
            return Encoding.compute_loss(norm_out_probs, correct_notes, True)
Example #30
	def _setOutputs(self) :
		outs = []
		for l in self.network.inConnections[self] :
			outs.append(l.outputs)
	
		self.outputs = tt.concatenate( outs, axis = 1 )
		self.testOutputs = tt.concatenate( outs, axis = 1 )
Example #31
def best_right_path_cost(pred, mask, token, blank):
    '''
    best right path cost of multi sentences
    :param pred: (T, nb, voca_size+1)                    (4,1,3)
    :param mask: (nb, T)
    # :param pred_len: (nb,)    pred_len of prediction        (1)
    :param token: (nb, U)    -1 for NIL                    (1,2)
    :param blank: (1)

    :return: best_right_path_cost (nb,)
    :return: argmin_token (nb, T) best path, -1 for null
    '''

    pred_len = mask.sum(axis=-1).astype('int32')
    eps = theano.shared(np.float32(1e-35))
    EPS = theano.shared(np.float32(35))

    t = pred.shape[0]
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate(
        (T.ones_like(token, dtype=intX) * blank, token), axis=2).reshape(
            (nb, 2 * U))
    token_with_blank = T.concatenate(
        (token_with_blank, T.ones(
            (nb, 1), dtype=intX) * blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.tile(T.arange(nb), (length, 1)).T,
                token_with_blank]  # (T, nb, 2U+1)
    pred = -T.log(pred + eps)

    # recurrence relation
    sec_diag = T.concatenate(
        (T.zeros((nb, 2), dtype=intX),
         T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
        axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile(
        (T.eye(length) + T.eye(length, k=1)),
        (nb, 1,
         1)) + T.tile(T.eye(length, k=2),
                      (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = -T.log(recurrence_relation + eps).astype(floatX)

    # alpha
    alpha = T.ones_like(token_with_blank, dtype=floatX) * EPS
    alpha = T.set_subtensor(alpha[:, :2],
                            pred[0, :, :2])  ################(nb, 2U+1)

    # dynamic programming
    # (T, nb, 2U+1)
    [log_probability,
     argmin_pos_1], _ = theano.scan(lambda curr, accum: (
         (accum[:, :, None] + recurrence_relation).min(axis=1) + curr,
         (accum[:, :, None] + recurrence_relation).argmin(axis=1)),
                                    sequences=[pred[1:]],
                                    outputs_info=[alpha, None])

    # why pred_len-2?
    labels_1 = log_probability[pred_len - 2,
                               T.arange(nb), 2 * token_len - 1]  # (nb,)
    labels_2 = log_probability[pred_len - 2,
                               T.arange(nb), 2 * token_len]  # (nb,)
    concat_labels = T.concatenate([labels_1[:, None], labels_2[:, None]],
                                  axis=-1)
    argmin_labels = concat_labels.argmin(axis=-1)

    cost = concat_labels.min(axis=-1)

    min_path = T.ones((t - 1, nb), dtype=intX) * -1  # -1 for null
    min_path = T.set_subtensor(min_path[pred_len - 2,
                                        T.arange(nb)],
                               2 * token_len - 1 + argmin_labels)

    # (T-1, nb)
    min_full_path, _ = theano.scan(
        lambda m_path, argm_pos, m_full_path: argm_pos[
            T.arange(nb),
            T.maximum(m_path, m_full_path).astype('int32')].astype('int32'),
        sequences=[min_path[::-1], argmin_pos_1[::-1]],
        outputs_info=[min_path[-1]])
    argmin_pos = T.concatenate((min_full_path[::-1], min_path[-1][None, :]),
                               axis=0)  # (T, nb)
    argmin_token = token_with_blank[T.arange(nb)[None, :], argmin_pos]

    return cost, (argmin_token.transpose((1, 0)) * mask + mask - 1).astype(
        'int32'
    )  # alpha, log_probability, argmin_pos_1, argmin_labels, min_path, min_full_path, argmin_pos, token_with_blank, argmin_token
Example #32
def bayes_estimate_cell(k, adm, eadm, coh, ecoh, alph=False, atype='joint'):
    """
    Function to estimate the parameters of the flexural model at a single cell location
    of the input grids. 

    :type k: :class:`~numpy.ndarray`
    :param k: 1D array of wavenumbers
    :type adm: :class:`~numpy.ndarray`
    :param adm: 1D array of wavelet admittance
    :type eadm: :class:`~numpy.ndarray`
    :param eadm: 1D array of error on wavelet admittance
    :type coh: :class:`~numpy.ndarray`
    :param coh: 1D array of wavelet coherence
    :type ecoh: :class:`~numpy.ndarray`
    :param ecoh: 1D array of error on wavelet coherence
    :type alph: bool, optional
    :param alph: Whether or not to estimate parameter ``alpha``
    :type atype: str, optional
    :param atype: Whether to use the admittance (`'admit'`), coherence (`'coh'`) or both (`'joint'`)

    :return:
        (tuple): Tuple containing:
            * ``trace`` : :class:`~pymc3.backends.base.MultiTrace`
                Posterior samples from the MCMC chains
            * ``summary`` : :class:`~pandas.core.frame.DataFrame`
                Summary statistics from Posterior distributions
            * ``map_estimate`` : dict
                Container for Maximum a Posteriori (MAP) estimates

    """

    with pm.Model() as model:

        # k is an array - needs to be passed as distribution
        k_obs = pm.Normal('k', mu=k, sigma=1., observed=k)

        # Prior distributions
        Te = pm.Uniform('Te', lower=1., upper=250.)
        F = pm.Uniform('F', lower=0., upper=0.9999)

        if alph:

            # Prior distribution of `alpha`
            alpha = pm.Uniform('alpha', lower=0., upper=np.pi)
            admit_exp, coh_exp = real_xspec_functions_alpha(
                k_obs, Te, F, alpha)

        else:
            admit_exp, coh_exp = real_xspec_functions_noalpha(k_obs, Te, F)

        # Select type of analysis to perform
        if atype == 'admit':

            # Uncertainty as observed distribution
            sigma = pm.Normal('sigma', mu=eadm, sigma=1., observed=eadm)

            # Likelihood of observations
            admit_obs = pm.Normal('admit_obs',
                                  mu=admit_exp,
                                  sigma=sigma,
                                  observed=adm)

        elif atype == 'coh':

            # Uncertainty as observed distribution
            sigma = pm.Normal('sigma', mu=ecoh, sigma=1., observed=ecoh)

            # Likelihood of observations
            coh_obs = pm.Normal('coh_obs',
                                mu=coh_exp,
                                sigma=sigma,
                                observed=coh)

        elif atype == 'joint':

            # Define uncertainty as concatenated arrays
            ejoint = np.array([eadm, ecoh]).flatten()

            # Define array of observations and expected values as
            # concatenated arrays
            joint = np.array([adm, coh]).flatten()
            joint_exp = tt.flatten(tt.concatenate([admit_exp, coh_exp]))

            # Uncertainty as observed distribution
            sigma = pm.Normal('sigma', mu=ejoint, sigma=1., observed=ejoint)

            # Likelihood of observations
            joint_obs = pm.Normal('admit_coh_obs',
                                  mu=joint_exp,
                                  sigma=sigma,
                                  observed=joint)

        # Sample the Posterior distribution
        trace = pm.sample(cf.draws, tune=cf.tunes, cores=cf.cores)

        # Get maximum a posteriori (MAP) estimate
        map_estimate = pm.find_MAP()

        # Get Summary
        summary = pm.summary(trace)

    return trace, summary, map_estimate
Example #33
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
        
        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units
        
        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')
        
        print "==> building network"
        example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) #########
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) #########
       
        network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var)
        print layers.get_output(network).eval({self.input_var:example}).shape
        
        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        
        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        
        self.params = layers.get_all_params(network, trainable=True)
        
        output = layers.get_output(network)
        num_channels  = 32 
        filter_W = 104
        filter_H = 13
        # NOTE: these constants are shapes of last pool layer, it can be symbolic 
        # explicit values are better for optimizations
        
        channels = []
        for channel_index in range(num_channels):
            channels.append(output[:, channel_index, :, :].transpose((0, 2, 1)))
        
        rnn_network_outputs = []
        for channel_index in range(num_channels):
            rnn_input_var = channels[channel_index]
            
            # InputLayer       
            network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var)

            # GRULayer
            network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
            
            # BatchNormalization Layer
            if (self.batch_norm):
                network = layers.BatchNormLayer(incoming=network)
              
            # add params 
            self.params += layers.get_all_params(network, trainable=True)
            
            rnn_network_outputs.append(layers.get_output(network))
        
        all_output_var = T.concatenate(rnn_network_outputs, axis=1)
        print all_output_var.eval({self.input_var:example}).shape
        
        # InputLayer
        network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var)
        
        # DENSE 1
        network = layers.DenseLayer(incoming=network, num_units=512, nonlinearity=rectify)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)
        print layers.get_output(network).eval({self.input_var:example}).shape
        
        
        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var:example}).shape
        
    
        self.params += layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)
    
        #print "==> param shapes", [x.eval().shape for x in self.params]
        
        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params, 
                                                                          lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2
        
        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)
        
        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])
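A minimal usage sketch (not part of the original example; `model`, `X`, and `y` are assumed stand-ins): once the class above has been built with mode='train' and has compiled `train_fn`, an epoch could be driven like this.

import numpy as np

def run_epoch(model, X, y, batch_size=32):
    # iterate over contiguous minibatches and call the compiled Theano function
    losses = []
    for start in range(0, len(X) - batch_size + 1, batch_size):
        xb = X[start:start + batch_size].astype(np.float32)
        yb = y[start:start + batch_size].astype(np.int32)
        _, loss = model.train_fn(xb, yb)
        losses.append(loss)
    return float(np.mean(losses))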
Example #34
0
    def make_backprop_scan(self, error_signal,
                           extra_cost_inputs=None,
                           compute_embedding_gradients=True):
        """
        Args:
            error_signal: The external gradient d(cost)/d(stack top). A Theano
                batch of size `batch_size * model_dim`.
        """

        assert hasattr(self, "stack_2_ptrs"), \
            ("self._make_scan (forward pass) must be defined before "
             "self.make_backprop_scan is called")

        # We need to add extra updates to the `_zero_updates` member, so this
        # method must be called before `_zero_updates` is read.
        assert self._zero is None, \
            ("Can only install backprop on a fresh ThinStack. Don't call "
             "ThinStack.zero before setting up backprop.")

        if (compute_embedding_gradients
            and self._embedding_projection_network not in [None, util.IdentityLayer]):
            raise ValueError(
                "Do not support backprop for both an embedding projection "
                "layer and individual embeddings.")

        if self.use_input_batch_norm:
            raise ValueError(
                "Thin-stack backprop not supported with input batch-norm. Jon "
                "worked on BN gradients for 3 days without success, and then "
                "dropped it.")

        if extra_cost_inputs is None:
            extra_cost_inputs = []

        wrt, f_proj_delta, f_push_delta, f_merge_delta = \
            self._make_backward_graphs(extra_cost_inputs)
        wrt_shapes = [wrt_i.get_value().shape for wrt_i in wrt]

        # Build shared variables for accumulating wrt deltas.
        wrt_vars = [theano.shared(np.zeros(wrt_shape, dtype=np.float32),
                                  name=self._prefix + "bwd/wrt/%s" % wrt_i)
                    for wrt_i, wrt_shape in zip(wrt, wrt_shapes)]
        # All of these need to be zeroed out in between batches.
        self._zero_updates += wrt_vars

        # Also accumulate embedding gradients separately
        if compute_embedding_gradients:
            dE = theano.shared(np.zeros(self.embeddings.get_value().shape,
                                        dtype=np.float32),
                               name=self._prefix + "bwd/wrt/embeddings")
            self._zero_updates.append(dE)
        else:
            # Make dE a dummy variable.
            dE = T.zeros((1,))

        # Useful batch zero-constants.
        zero_stack = T.zeros((self.batch_size, self.model_dim))
        zero_extra_inps = [T.zeros((self.batch_size, extra_shape[-1]))
                           for extra_shape in self.recurrence.extra_outputs]
        # Zero Jacobian matrices for masked reductions during backprop. May not
        # be used.
        zero_jac_wrts = [T.zeros((self.batch_size,) + wrt_shape)
                         for wrt_shape in wrt_shapes]

        DUMMY = util.zeros_nobroadcast((1,))

        batch_size = self.batch_size
        batch_range = T.arange(batch_size)
        stack_shift = T.cast(batch_range, theano.config.floatX)
        buffer_shift = T.cast(batch_range * self.seq_length, theano.config.floatX)

        def lookup(t_f, stack_fwd, stack_2_ptrs_t, buffer_cur_t,
                  stack_bwd_t, extra_bwd):
            """Retrieve all relevant bwd inputs/outputs at time `t`."""

            grad_cursor = t_f * batch_size + stack_shift
            main_grad = cuda_util.AdvancedSubtensor1Floats("B_maingrad")(
                stack_bwd_t, grad_cursor)
            extra_grads = tuple([
                cuda_util.AdvancedSubtensor1Floats("B_extragrad_%i" % i)(
                    extra_bwd_i, grad_cursor)
                for i, extra_bwd_i in enumerate(extra_bwd)])

            # Find the timesteps of the two elements involved in the potential
            # merge at this timestep.
            t_c1 = (t_f - 1.0) * batch_size + stack_shift
            t_c2 = stack_2_ptrs_t

            # Find the two elements involved in the potential merge.
            c1 = cuda_util.AdvancedSubtensor1Floats("B_stack1")(stack_fwd, t_c1)
            c2 = cuda_util.AdvancedSubtensor1Floats("B_stack2")(stack_fwd, t_c2)

            buffer_top_t = cuda_util.AdvancedSubtensor1Floats("B_buffer_top")(
                self.buffer_t, buffer_cur_t + buffer_shift)

            # Retrieve extra inputs from auxiliary stack(s).
            extra_inps_t = tuple([
                cuda_util.AdvancedSubtensor1Floats("B_extra_inp_%i" % i)(
                    extra_inp_i, t_c1)
                for extra_inp_i in self.final_aux_stacks])

            inputs = (c1, c2, buffer_top_t) + extra_inps_t
            grads = (main_grad,) + extra_grads
            return t_c1, t_c2, inputs, grads

        def step_b(# sequences
                   t_f, transitions_t_f, stack_2_ptrs_t, buffer_cur_t,
                   # accumulators
                   dE,
                   # rest (incl. outputs_info, non_sequences)
                   *rest):

            # Separate the accum arguments from the non-sequence arguments.
            n_wrt = len(wrt_shapes)
            n_extra_bwd = len(self.recurrence.extra_outputs)
            wrt_deltas = rest[:n_wrt]
            stack_bwd_t = rest[n_wrt]
            extra_bwd = rest[n_wrt + 1:n_wrt + 1 + n_extra_bwd]
            id_buffer, stack_final = \
                rest[n_wrt + 1 + n_extra_bwd:n_wrt + 1 + n_extra_bwd + 2]

            # At first iteration, drop the external error signal into the main
            # backward stack.
            stack_bwd_next = ifelse(T.eq(t_f, self.seq_length),
                                    T.set_subtensor(stack_bwd_t[-self.batch_size:], error_signal),
                                    stack_bwd_t)


            # Retrieve all relevant inputs/outputs at this timestep.
            t_c1, t_c2, inputs, grads = \
                lookup(t_f, stack_final, stack_2_ptrs_t, buffer_cur_t,
                       stack_bwd_next, extra_bwd)
            main_grad = grads[0]

            # Calculate deltas for this timestep.
            m_delta_inp, m_delta_wrt = f_merge_delta(inputs, grads)
            # NB: main_grad is not passed to push function.
            p_delta_inp, p_delta_wrt = f_push_delta(inputs, grads[1:])

            # Check that delta function outputs match (at least in number).
            assert len(m_delta_inp) == len(p_delta_inp), \
                "%i %i" % (len(m_delta_inp), len(p_delta_inp))
            assert len(m_delta_wrt) == len(p_delta_wrt), \
                "%i %i" % (len(m_delta_wrt), len(p_delta_wrt))
            assert len(m_delta_inp) == 3 + len(self.aux_stacks), \
                "%i %i" % (len(m_delta_inp), 3 + len(self.aux_stacks))
            assert len(m_delta_wrt) == len(wrt)

            # Retrieve embedding indices on buffer at this timestep.
            # (Necessary for sending embedding gradients.)
            buffer_ids_t = cuda_util.AdvancedSubtensor1Floats("B_buffer_ids")(
                    id_buffer, buffer_cur_t + buffer_shift)

            # Prepare masks for op-wise gradient accumulation.
            # TODO: Record actual transitions (e.g. for model 1S and higher)
            # and repeat those here
            mask = transitions_t_f
            masks = [mask, mask.dimshuffle(0, "x"),
                     mask.dimshuffle(0, "x", "x")]

            # Insert gradients for the embedding projection network as well.
            if f_proj_delta is not None:
                # Look up raw buffer top for this timestep -- i.e., buffer top
                # *before* the op at this timestep was performed. This was the
                # input to the projection network at this timestep.
                proj_input = cuda_util.AdvancedSubtensor1Floats("B_raw_buffer_top")(
                    self._raw_buffer_t, buffer_cur_t + buffer_shift)

                proj_inputs = (proj_input,)
                if self.use_input_dropout:
                    embedding_dropout_mask = cuda_util.AdvancedSubtensor1Floats("B_buffer_dropout")(
                        self._embedding_dropout_masks, buffer_cur_t + buffer_shift)
                    proj_inputs = (proj_input, embedding_dropout_mask)

                # Compute separate graphs based on gradient from above.
                # NB: We discard the delta_inp return here. The delta_inp
                # should actually be passed back to the raw embedding
                # parameters, but we don't have any reason to support this in
                # practice. (Either we backprop to embeddings or project them
                # and learn the projection -- not both.)
                if m_delta_inp[2] is not None:
                    _, m_proj_delta_wrt = f_proj_delta(proj_inputs,
                                                       (m_delta_inp[2],))
                    m_delta_wrt = util.merge_update_lists(m_delta_wrt, m_proj_delta_wrt)

                # If we pushed (moved the buffer top onto the stack), the
                # gradient from above is a combination of the accumulated stack
                # gradient (main_grad) and any buffer top deltas from the push
                # function (e.g. tracking LSTM gradient).
                embedding_grad = main_grad
                if p_delta_inp[2] is not None:
                    embedding_grad += p_delta_inp[2]
                _, p_proj_delta_wrt = f_proj_delta(proj_inputs,
                                                   (embedding_grad,))
                p_delta_wrt = util.merge_update_lists(p_delta_wrt, p_proj_delta_wrt)

            # Accumulate inp deltas, switching over push/merge decision.
            stacks = (stack_bwd_next, stack_bwd_next,
                      (compute_embedding_gradients and dE) or None)
            cursors = (t_c1, t_c2,
                       (compute_embedding_gradients and buffer_ids_t) or None)
            # Handle potential aux bwd stacks.
            stacks += extra_bwd
            cursors += (t_c1,) * len(extra_bwd)
            new_stacks = {}
            for stack, cursor, m_delta, p_delta in zip(stacks, cursors, m_delta_inp, p_delta_inp):
                if stack is None or cursor is None:
                    continue
                elif m_delta is None and p_delta is None:
                    # Disconnected gradient.
                    continue

                base = new_stacks.get(stack, stack)
                mask_i = masks[(m_delta or p_delta).ndim - 1]
                if m_delta is None:
                    delta = (1. - mask_i) * p_delta
                elif p_delta is None:
                    delta = mask_i * m_delta
                else:
                    delta = mask_i * m_delta + (1. - mask_i) * p_delta

                # Run subtensor update on associated structure using the
                # current cursor.
                new_stack = cuda_util.AdvancedIncSubtensor1Floats(inplace=True)(
                    base, delta, cursor)
                new_stacks[stack] = new_stack

            # Accumulate wrt deltas, switching over push/merge decision.
            new_wrt_deltas = {}
            wrt_data = enumerate(zip(wrt, zero_jac_wrts, wrt_deltas,
                                     m_delta_wrt, p_delta_wrt))
            for i, (wrt_var, wrt_zero, accum_delta, m_delta, p_delta) in wrt_data:
                if m_delta is None and p_delta is None:
                    # Disconnected gradient.
                    continue

                # Check that tensors returned by delta functions match shape
                # expectations.
                assert m_delta is None or accum_delta.ndim == m_delta.ndim - 1, \
                    "%s %i %i" % (wrt_var.name, accum_delta.ndim, m_delta.ndim)
                assert p_delta is None or accum_delta.ndim == p_delta.ndim - 1, \
                    "%s %i %i" % (wrt_var.name, accum_delta.ndim, p_delta.ndim)

                mask_i = masks[(m_delta or p_delta).ndim - 1]
                if m_delta is None:
                    delta = T.switch(mask_i, wrt_zero, p_delta)
                elif p_delta is None:
                    delta = T.switch(mask_i, m_delta, wrt_zero)
                else:
                    delta = T.switch(mask_i, m_delta, p_delta)
                # TODO: Is this at all efficient? (Bring back GPURowSwitch?)
                delta = delta.sum(axis=0)
                # TODO: we want this to be inplace
                new_wrt_deltas[accum_delta] = accum_delta + delta

            # On push ops, backprop the stack_bwd error onto the embedding
            # projection network / embedding parameters.
            # TODO make sparse?
            if compute_embedding_gradients:
                new_stacks[dE] = cuda_util.AdvancedIncSubtensor1Floats(inplace=True)(
                    new_stacks.get(dE, dE), (1. - masks[1]) * main_grad, buffer_ids_t)

            updates = dict(new_wrt_deltas.items() + new_stacks.items())
            updates = util.prepare_updates_dict(updates)

            return updates

        # TODO: These should come from forward pass -- not fixed -- in model
        # 1S, etc.
        transitions_f = T.cast(self.transitions.dimshuffle(1, 0),
                               dtype=theano.config.floatX)

        ts_f = T.cast(T.arange(1, self.seq_length + 1), dtype=theano.config.floatX)

        # Representation of buffer using embedding indices rather than values
        id_buffer = T.cast(self.X.flatten(), theano.config.floatX)
        # Build sequence of buffer pointers, where buf_ptrs[i] indicates the
        # buffer pointer values *before* computation at timestep *i* proceeds.
        # (This means we need to slice off the last actual buf_ptr output and
        # prepend a dummy.)
        buf_ptrs = T.concatenate([T.zeros((1, batch_size,)),
                                  self.buf_ptrs[:-1]], axis=0)

        sequences = [ts_f, transitions_f, self.stack_2_ptrs, buf_ptrs]
        outputs_info = []

        # Shared variables: Accumulated wrt deltas and bwd stacks.
        non_sequences = [dE] + wrt_vars
        non_sequences += [self.stack_bwd] + self.aux_bwd_stacks
        # More auxiliary data
        non_sequences += [id_buffer, self.final_stack]

        # More helpers (not referenced directly in code, but we need to include
        # them as non-sequences to satisfy scan strict mode)
        aux_data = [self.stack, self.buffer_t] + self.aux_stacks + self.final_aux_stacks
        aux_data += [self.X, self.transitions, self._raw_buffer_t]
        if self.use_input_dropout:
            aux_data.append(self._embedding_dropout_masks)
        aux_data += self._vs.vars.values() + extra_cost_inputs
        if self.premise_stack_tops:
            aux_data.append(self.premise_stack_tops)
        non_sequences += list(set(aux_data))

        bscan_ret, self.bscan_updates = theano.scan(
                step_b, sequences, outputs_info, non_sequences,
                go_backwards=True,
                n_steps=self.seq_length,
#                strict=True,
                name=self._prefix + "stack_bwd")

        self.gradients = {wrt_i: self.bscan_updates.get(wrt_var)
                          for wrt_i, wrt_var in zip(wrt, wrt_vars)}
        if compute_embedding_gradients:
            self.embedding_gradients = self.bscan_updates[dE]
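A hedged sketch of how the accumulated gradients might be consumed (an assumption, not from the source; `stack`, `params`, and `lr` are illustrative names): after `make_backprop_scan` fills `stack.gradients`, a plain SGD update list could be built from it.

def sgd_updates(stack, params, lr=0.01):
    # map each parameter to its accumulated delta from the backward scan
    updates = []
    for param in params:
        grad = stack.gradients.get(param)
        if grad is None:
            continue  # parameter not reached by the backward pass
        updates.append((param, param - lr * grad))
    return updates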
Example #35
0
def get_output_for(self, input, **kwargs):
    x, y = input
    if y.ndim == 1:
        y = T.extra_ops.to_one_hot(y, self.num_cls)
    assert y.ndim == 2
    return T.concatenate([x, y], axis=1)
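A small standalone sketch of the one-hot expansion used above (illustrative only; `num_cls=4` is an arbitrary assumption): integer labels are widened with `T.extra_ops.to_one_hot` so they can be concatenated with the feature matrix along axis 1.

import numpy as np
import theano
import theano.tensor as T

y = T.ivector('y')
onehot = T.extra_ops.to_one_hot(y, 4)      # (batch, num_cls) one-hot matrix
f = theano.function([y], onehot)
print(f(np.array([0, 2, 3], dtype='int32')))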
def c_6layer_mnist_imputation(seed=0,
                              ctype='cva',
                              pertub_type=3,
                              pertub_prob=6,
                              pertub_prob1=14,
                              visualization_times=20,
                              denoise_times=200,
                              predir=None,
                              n_batch=144,
                              dataset='mnist.pkl.gz',
                              batch_size=500):
    """
    Missing data imputation
    """
    #cp->cd->cpd->cd->c
    nkerns = [32, 32, 64, 64, 64]
    drops = [0, 0, 0, 0, 0, 1]
    #skerns=[5, 3, 3, 3, 3]
    #pools=[2, 1, 1, 2, 1]
    #modes=['same']*5
    n_hidden = [500, 50]
    drop_inverses = [
        1,
    ]
    # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28

    if dataset == 'mnist.pkl.gz':
        dim_input = (28, 28)
        colorImg = False

    logdir = 'results/imputation/' + ctype + '/mnist/' + ctype + '_6layer_mnist_' + str(
        pertub_type) + '_' + str(pertub_prob) + '_' + str(
            pertub_prob1) + '_' + str(denoise_times) + '_'
    logdir += str(int(time.time())) + '/'

    if not os.path.exists(logdir): os.makedirs(logdir)

    print predir
    with open(logdir + 'hook.txt', 'a') as f:
        print >> f, predir

    train_set_x, test_set_x, test_set_x_pertub, pertub_label, pertub_number = datapy.load_pertub_data(
        dirs='data_imputation/',
        pertub_type=pertub_type,
        pertub_prob=pertub_prob,
        pertub_prob1=pertub_prob1)

    datasets = datapy.load_data_gpu(dataset, have_matrix=True)

    _, _, _ = datasets[0]
    valid_set_x, _, _ = datasets[1]
    _, _, _ = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')
    x_pertub = T.matrix(
        'x_pertub')  # the data is presented as rasterized images
    p_label = T.matrix('p_label')

    random_z = T.matrix('random_z')

    drop = T.iscalar('drop')
    drop_inverse = T.iscalar('drop_inverse')

    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    input_x = x_pertub.reshape((batch_size, 1, 28, 28))

    recg_layer = []
    cnn_output = []

    #1
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2),
                                border_mode='valid',
                                activation=activation))
    if drops[0] == 1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x,
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))

    #2
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[1] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #3
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[1], 12, 12),
                                filter_shape=(nkerns[2], nkerns[1], 3, 3),
                                poolsize=(2, 2),
                                border_mode='valid',
                                activation=activation))
    if drops[2] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #4
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[2], 5, 5),
                                filter_shape=(nkerns[3], nkerns[2], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[3] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    #5
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[3], 5, 5),
                                filter_shape=(nkerns[4], nkerns[3], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[4] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []

    #1
    recg_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=5 * 5 * nkerns[-1],
                                      n_out=n_hidden[0],
                                      activation=activation))
    if drops[-1] == 1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x,
                                                      drop=drop,
                                                      rng=rng_share))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))

    #stochastic layer
    recg_layer.append(
        GaussianHidden.GaussianHidden(rng=rng,
                                      input=activations[-1],
                                      n_in=n_hidden[0],
                                      n_out=n_hidden[1],
                                      activation=None))

    z = recg_layer[-1].sample_z(rng_share)

    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[1],
                                      n_out=n_hidden[0],
                                      activation=activation))

    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))

    #2
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[0],
                                      n_out=5 * 5 * nkerns[-1],
                                      activation=activation))

    if drop_inverses[0] == 1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1],
                                                   drop=drop_inverse,
                                                   rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(
            input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(
            gene_layer[-1].output(input=random_z_output[-1]))

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 5, 5))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 5, 5))

    #1
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-1], 5, 5),
                                    filter_shape=(nkerns[-2], nkerns[-1], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=input_random_z, n_batch=n_batch))

    #2
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-2], 5, 5),
                                    filter_shape=(nkerns[-3], nkerns[-2], 3,
                                                  3),
                                    poolsize=(2, 2),
                                    border_mode='full',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-3], 12,
                                                 12),
                                    filter_shape=(nkerns[-4], nkerns[-3], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-4], 12,
                                                 12),
                                    filter_shape=(nkerns[-5], nkerns[-4], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #5 stochastic layer
    # for the last layer, the nonlinearity should be sigmoid to yield the mean of a Bernoulli
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-5], 12,
                                                 12),
                                    filter_shape=(1, nkerns[-5], 5, 5),
                                    poolsize=(2, 2),
                                    border_mode='full',
                                    activation=nonlinearity.sigmoid))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    gene_layer.append(
        NoParamsBernoulliVisiable.NoParamsBernoulliVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=z_output[-1], data=input_x)

    # 4-D tensor of random generation
    random_x_mean = random_z_output[-1]
    random_x = gene_layer[-1].sample_x(rng_share, random_x_mean)

    x_denoised = z_output[-1].flatten(2)
    x_denoised = p_label * x + (1 - p_label) * x_denoised

    mse = ((x - x_denoised)**2).sum() / pertub_number

    params = []
    for g in gene_layer:
        params += g.params
    for r in recg_layer:
        params += r.params

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x_pertub: train_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0)
        })

    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x_pertub: valid_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0)
        })

    test_activations = theano.function(inputs=[x_pertub],
                                       outputs=T.concatenate(activations,
                                                             axis=1),
                                       givens={drop: np.cast['int32'](0)})

    imputation_model = theano.function(
        inputs=[index, x_pertub],
        outputs=[x_denoised, mse],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            p_label: pertub_label[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })

    ##################
    # Pretrain MODEL #
    ##################

    model_epoch = 600
    if os.environ.has_key('model_epoch'):
        model_epoch = int(os.environ['model_epoch'])
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        if model_epoch == -1:
            pre_train = np.load(predir + 'best-model.npz')
        else:
            pre_train = np.load(predir + 'model-' + str(model_epoch) + '.npz')
        pre_train = pre_train['model']
        if ctype == 'cva':
            for (para, pre) in zip(params, pre_train):
                para.set_value(pre)
        elif ctype == 'cmmva':
            for (para, pre) in zip(params, pre_train[:-2]):
                para.set_value(pre)
        else:
            exit()
    else:
        exit()

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    epoch = 0
    n_visualization = 100
    output = np.ones((n_visualization, visualization_times + 2, 784))
    output[:, 0, :] = test_set_x.get_value()[:n_visualization, :]
    output[:, 1, :] = test_set_x_pertub.get_value()[:n_visualization, :]

    image = paramgraphics.mat_to_img(output[:, 0, :].T,
                                     dim_input,
                                     colorImg=colorImg)
    image.save(logdir + 'data.png', 'PNG')
    image = paramgraphics.mat_to_img(output[:, 1, :].T,
                                     dim_input,
                                     colorImg=colorImg)
    image.save(logdir + 'data_pertub.png', 'PNG')

    tmp = test_set_x_pertub.get_value()

    while epoch < denoise_times:
        epoch = epoch + 1
        this_mse = 0
        for i in xrange(n_test_batches):
            d, m = imputation_model(i,
                                    tmp[i * batch_size:(i + 1) * batch_size])
            tmp[i * batch_size:(i + 1) * batch_size] = np.asarray(d)
            this_mse += m
        if epoch <= visualization_times:
            output[:, epoch + 1, :] = tmp[:n_visualization, :]

        print epoch, this_mse
        with open(logdir + 'hook.txt', 'a') as f:
            print >> f, epoch, this_mse

        image = paramgraphics.mat_to_img(tmp[:n_visualization, :].T,
                                         dim_input,
                                         colorImg=colorImg)
        image.save(logdir + 'procedure-' + str(epoch) + '.png', 'PNG')
        np.savez(logdir + 'procedure-' + str(epoch), tmp=tmp)

    image = paramgraphics.mat_to_img((output.reshape(-1, 784)).T,
                                     dim_input,
                                     colorImg=colorImg,
                                     tile_shape=(n_visualization, 22))
    image.save(logdir + 'output.png', 'PNG')
    np.savez(logdir + 'output', output=output)

    # save original train features and denoise test features
    for i in xrange(n_train_batches):
        if i == 0:
            train_features = np.asarray(train_activations(i))
        else:
            train_features = np.vstack(
                (train_features, np.asarray(train_activations(i))))

    for i in xrange(n_valid_batches):
        if i == 0:
            valid_features = np.asarray(valid_activations(i))
        else:
            valid_features = np.vstack(
                (valid_features, np.asarray(valid_activations(i))))

    for i in xrange(n_test_batches):
        if i == 0:
            test_features = np.asarray(
                test_activations(tmp[i * batch_size:(i + 1) * batch_size]))
        else:
            test_features = np.vstack(
                (test_features,
                 np.asarray(
                     test_activations(tmp[i * batch_size:(i + 1) *
                                          batch_size]))))

    np.save(logdir + 'train_features', train_features)
    np.save(logdir + 'valid_features', valid_features)
    np.save(logdir + 'test_features', test_features)
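A worked NumPy sketch of the masked update `x_denoised = p_label * x + (1 - p_label) * x_denoised` used above (toy values, illustrative only): where `p_label` is 1 the original value is kept, where it is 0 the reconstruction is used.

import numpy as np

x = np.array([0.9, 0.2, 0.7, 0.1])        # original values
x_recon = np.array([0.5, 0.5, 0.5, 0.5])  # model reconstruction
p_label = np.array([1.0, 0.0, 1.0, 0.0])  # 1 = keep original, 0 = use reconstruction

x_denoised = p_label * x + (1 - p_label) * x_recon
print(x_denoised)  # [0.9 0.5 0.7 0.5]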
Example #37
0
File: rnn.py Project: osdf/breze
def exprs(inpt_mean,
          inpt_var,
          in_to_hidden,
          hidden_to_hiddens,
          hidden_to_out,
          hidden_biases,
          hidden_var_scales_sqrt,
          initial_hiddens,
          recurrents,
          out_bias,
          out_var_scale_sqrt,
          hidden_transfers,
          out_transfer,
          in_to_out=None,
          skip_to_outs=None,
          p_dropouts=None,
          hotk_inpt=False):
    """Return a dictionary containing Theano expressions for various components
    of a recurrent network with variance propagation.

    Parameters
    ----------

    inpt_mean : Theano variable
        Represents the mean of the input sequences as a sequence tensor.

    inpt_var : Theano variable
        Represents the variance of the input sequences as a sequence tensor.
        Can also be a scalar (e.g. 1e-8 if no variance is desired at this
        point).

    in_to_hidden : Theano variable
        Matrix representing the map from input to the first hidden layer.

    hidden_to_hiddens : list of Theano variables
        List of matrices representing the maps between the hiddens.

    hidden_to_out : Theano variable
        Matrix representing the map from the last hidden layer to the output
        layer.

    hidden_biases : list of Theano variables
        Biases for the hidden layers.

    hidden_var_scales_sqrt : Theano variable
        Biases for the variances. See ``forward_layer`` for an exact description
        of what it does.

    initial_hiddens : list of Theano variables
        List of vectors representing the initial hidden states.

    recurrents : list of Theano variables
        List of matrices representing the recurrent weight matrices.

    out_bias : Theano variable
        Bias vector of the output layer.

    hidden_transfers : list of functions or strings
        List of transfer functions for the layers. Each element is either a
        function that given a mean and a variance sequence tensor produces
        equally shaped mean and variance tensors or a string pointing to a
        function in ``breze.arch.component.varprop.transfer``.

    out_transfer : function or string
        Function or string of the form described for ``hidden_transfers``.

    p_dropouts : List of scalars
        Each element in this list represents the probability to drop out an
        individual unit in the corresponding layer.
        The list should contain N+1 items, where N is the number of hidden
        layers. If N+2 items are contained, the last element is used to
        drop out units from hidden to out, while the one before is used to drop
        out units from hidden to hidden.

    Returns
    -------

    exprs : dictionary
       Map of strings to Theano expressions.

       Keys are:

         - ``hidden_in_mean_*``: pre-synaptic mean of layer,
         - ``hidden_in_var_*``: pre-synaptic variance of layer,
         - ``hidden_mean_*``: post-synaptic mean of layer,
         - ``hidden_var_*``: post-synaptic variance of layer,
         - ``inpt_mean``: mean of the input,
         - ``inpt_var``: variance of the input
         - ``output_in_mean``: pre-synaptic mean of output,
         - ``output_in_var``: pre-synaptic variance of output,
         - ``output_mean``: post-synaptic mean of output,
         - ``output_var``: post-synaptic variance of output,
         - ``output``: concatenation of mean and variance of output
    """
    # TODO add skip to outs docs
    # TODO: add pooling
    # TODO: add leaky integration
    exprs = {}

    f_hiddens = [lookup(i, transfer) for i in hidden_transfers]
    f_output = lookup(out_transfer, transfer)

    if inpt_var.ndim != 3:
        # Scalar
        inpt_var = T.ones_like(inpt_mean) * inpt_var

    if hotk_inpt:
        hmi, hvi, hmo, hvo = int_forward_layer(inpt_mean, inpt_var,
                                               in_to_hidden, hidden_biases[0],
                                               hidden_var_scales_sqrt[0],
                                               f_hiddens[0], p_dropouts[0])
    else:
        hmi, hvi, hmo, hvo = forward_layer(inpt_mean, inpt_var, in_to_hidden,
                                           hidden_biases[0],
                                           hidden_var_scales_sqrt[0],
                                           f_hiddens[0], p_dropouts[0])

    hmi_rec, hvi_rec, hmo_rec, hvo_rec = recurrent_layer(
        hmi, hvi, recurrents[0], f_hiddens[0], initial_hiddens[0],
        p_dropouts[1])

    exprs.update({
        'hidden_in_mean_0': hmi_rec,
        'hidden_in_var_0': hvi_rec,
        'hidden_mean_0': hmo_rec,
        'hidden_var_0': hvo_rec
    })

    zipped = zip(hidden_to_hiddens, hidden_biases[1:],
                 hidden_var_scales_sqrt[1:], recurrents[1:], f_hiddens[1:],
                 initial_hiddens[1:], p_dropouts[1:])

    for i, (w, b, vb, r, t, j, d) in enumerate(zipped):
        hmo_rec_m1, hvo_rec_m1 = hmo_rec, hvo_rec

        hmi, hvi, hmo, hvo = forward_layer(hmo_rec_m1, hvo_rec_m1, w, b, vb, t,
                                           d)

        hmi_rec, hvi_rec, hmo_rec, hvo_rec = recurrent_layer(
            hmi, hvi, r, t, j, d)

        exprs.update({
            'hidden_in_mean_%i' % (i + 1): hmi,
            'hidden_in_var_%i' % (i + 1): hvi,
            'hidden_mean_%i' % (i + 1): hmo,
            'hidden_var_%i' % (i + 1): hvo
        })

    output_in_mean, output_in_var, _, _ = forward_layer(
        hmo_rec, hvo_rec, hidden_to_out, out_bias, hidden_var_scales_sqrt[-1],
        lambda x, y: (x, y), p_dropouts[-1])

    if in_to_out is not None:
        output_mean_inc, output_var_inc, _, _ = forward_layer(
            inpt_mean, inpt_var, in_to_out, T.zeros_like(out_bias),
            T.ones_like(out_bias), lambda x, y: (x, y), p_dropouts[0])
        output_in_mean += output_mean_inc
        output_in_var += output_var_inc
    if skip_to_outs is not None:
        for i, s in enumerate(skip_to_outs):
            output_mean_inc, output_var_inc, _, _ = forward_layer(
                exprs['hidden_mean_%i' % i], exprs['hidden_var_%i' % i], s,
                T.zeros_like(out_bias), T.ones_like(out_bias), lambda x, y:
                (x, y), p_dropouts[i + 1])
            output_in_mean += output_mean_inc
            output_in_var += output_var_inc

    output_mean, output_var = f_output(output_in_mean, output_in_var)

    # TODO: raise not implemented for out scale

    exprs.update({
        'inpt_mean': inpt_mean,
        'inpt_var': inpt_var,
        'output_in_mean': output_in_mean,
        'output_in_var': output_in_var,
        'output_mean': output_mean,
        'output_var': output_var,
        'output': T.concatenate([output_mean, output_var], axis=2),
    })

    return exprs
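A minimal shape check (a sketch under assumed shapes, not from the source): the ``output`` key concatenates mean and variance along the last axis, so a D-dimensional output yields 2*D features per time step.

import numpy as np
import theano
import theano.tensor as T

mean = T.tensor3('mean')   # (time, batch, D)
var = T.tensor3('var')     # (time, batch, D)
out = T.concatenate([mean, var], axis=2)

f = theano.function([mean, var], out)
m = np.ones((5, 2, 3), dtype=theano.config.floatX)
v = np.full((5, 2, 3), 0.1, dtype=theano.config.floatX)
print(f(m, v).shape)       # (5, 2, 6)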
Example #38
0
def lstm_decoder_layer(tparams_all, input_state, options, maxlen, dp, prefix="lstm_decoder_layer"):

    tparams_d = tparams_all[0]
    tparams_g = tparams_all[1]

    #rng = numpy.random.RandomState(4567)
    trng = RandomStreams(SEED)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(x_, m_, h_, c_):

        preact = tensor.dot(x_, tparams_g[_p(prefix, 'W')]) + tparams_g[_p(prefix, 'b')] + \
                 tensor.dot(h_, tparams_g[_p(prefix, 'U')])
        
        i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')]))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')]))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')]))
        c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')]))

        c = f * c_ + i * c
        
        h = o * tensor.tanh(c)

        s = tensor.nnet.softmax(tensor.dot(h, tparams_g['to_idx_emb']))

        #x_t = tensor.dot((s / s.max(axis=1)[:,None]).astype('int32').astype(theano.config.floatX), tparams_d['Wemb'])
        x_t = tensor.dot(tensor.switch(s < s.max(axis=1)[:,None], 0.0, 1.0).astype(theano.config.floatX), 
                         tparams_d['Wemb'])

        x_out = s.argmax(axis=1)

        m = tensor.switch(tensor.eq(x_out, 10), 0.0, 1.0).astype(theano.config.floatX) * m_
        
        #x_t = tensor.dot(h_, tparams[_p(prefix, 'W_x')]) + tparams[_p(prefix, 'b_x')]

        return x_out, x_t, m, h, c


    ##############################################################################################
    rval, updates = theano.scan(_step,
                                outputs_info=[None,
                                              input_state,
                                              tensor.alloc(numpy_floatX(1.), input_state.shape[0]),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n']),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n'])],
                                name=_p(prefix, '_layers'),
                                n_steps=maxlen)


    #proj_0 = rval[1]#tensor.tanh(rval[0])

    m22 = trng.binomial(size=(input_state.shape[0],), p=dp, n=1, dtype=theano.config.floatX)
    
    #return rval[0]*m2, rval[1]*m2[:,None], rval[2]*m2

    if(tensor.gt(maxlen, 4) == 1):
        x2 = tensor.alloc(numpy.asarray(0, dtype='int32'), maxlen - 4, input_state.shape[0])
        x2 = tensor.concatenate((tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(7, dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(10, dtype='int32'), input_state.shape[0])[None, :],
                                 x2),
                                 axis=0)


        m2 = tensor.alloc(numpy_floatX(0.), maxlen - 3, input_state.shape[0])
        m2 = tensor.concatenate((tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 m2), 
                                 axis=0)
    
        xt2 = tparams_d['Wemb'][x2]

        return rval[0]*m22+x2*(1-m22), rval[1]*m22[:,None]+xt2*(1-m22[:,None]), rval[2]*m22+m2*(1-m22)

    else:
        return rval[0]*m22, rval[1]*m22[:,None], rval[2]*m22
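A NumPy sketch of the `_slice` convention above (assuming the gate order i, f, o, c used in `_step`): a fused preactivation of width 4*n is cut into four equal gate blocks.

import numpy as np

n = 3
preact = np.arange(4 * n, dtype=np.float32).reshape(1, 4 * n)

def _slice(_x, k, dim):
    return _x[:, k * dim:(k + 1) * dim]

i_blk, f_blk, o_blk, c_blk = [_slice(preact, k, n) for k in range(4)]
print(i_blk)  # columns 0..2
print(c_blk)  # columns 9..11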
Example #39
0
    def __init__(self, We, params):

        lstm_layers_num = 1
        en_hidden_size = We.shape[1]
        self.eta = params.eta
        self.num_labels = params.num_labels
        self.en_hidden_size = en_hidden_size
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = params.lstm_layers_num
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs = tensor.imatrix()
        decoderInputs, decoderTarget = tensor.imatrices(2)
        encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

        self.lookuptable = theano.shared(We)

        #### the last row is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(self.de_hidden_size + 2 * en_hidden_size,
                                      self.num_labels),
            borrow=True)

        self.hidden_decode = theano.shared(name="Hidden to Decode",
                                           value=init_xavier_uniform(
                                               2 * en_hidden_size,
                                               self.de_hidden_size),
                                           borrow=True)

        self.hidden_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)

        #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]    #concatenate
        self.params += [self.linear, self.de_lookuptable
                        ]  #the initial hidden state of decoder lstm is zeros
        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1],
             self.en_hidden_size))
        for _ in range(self.lstm_layers_num):

            enclstm_f = LSTM(self.en_hidden_size)
            enclstm_b = LSTM(self.en_hidden_size, True)
            self.encoder_lstm_layers.append(enclstm_f)  #append
            self.encoder_lstm_layers.append(enclstm_b)  #append
            self.params += enclstm_f.params + enclstm_b.params  #concatenate

            hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
            hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

            hs = tensor.concatenate([hs_f, hs_b], axis=2)
            Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
            hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
            Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
            #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
            #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
            self.hos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            self.Cos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            state_below = hs

        Encoder = state_below

        state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

        ##### Here we include the representation from the encoder
        decoder_lstm_outputs = tensor.concatenate([state_below, Encoder],
                                                  axis=2)

        ei, di, dt = tensor.imatrices(3)  #place holders
        em, dm, tf, di0 = tensor.fmatrices(4)
        #####################################################
        #####################################################
        linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
        softmax_outputs, updates = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]),
                                        y])

        costs, _ = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        updates = lasagne.updates.adam(loss, self.params, self.eta)
        #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

        ###################################################
        #### using the ground truth when training
        ##################################################
        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, softmax_outputs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })

        #########################################################################
        ### For schedule sampling
        #########################################################################

        ###### always use the previous prediction as the next input
        def _step2(ctx_, state_, hs_, Cs_):

            hs, Cs = [], []
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                (tensor.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, encoderInputs.shape[1], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)
            state_below0 = state_below0.reshape(
                (encoderInputs.shape[1], self.de_hidden_size))
            state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
            newpred = tensor.dot(state_below0, self.linear)
            state_below = tensor.nnet.softmax(newpred)

            return state_below, hs, Cs

        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[Encoder],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=encoderInputs.shape[0])

        train_predict = train_outputs[0]
        train_costs, _ = theano.scan(
            fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

        train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
        train_updates = lasagne.updates.apply_momentum(train_updates,
                                                       self.params,
                                                       momentum=0.9)

        self._train2 = theano.function(
            inputs=[ei, em, di0, dm, dt],
            outputs=[train_loss, train_predict],
            updates=train_updates,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs0: di0,
                decoderMask: dm,
                decoderTarget: dt
            }
            #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
        )

        listof_token_idx = train_predict.argmax(axis=-1)
        self._utter = theano.function(inputs=[ei, em, di0],
                                      outputs=listof_token_idx,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs0: di0
                                      })
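A worked NumPy version of the `_NLL` step used for both losses above (toy numbers, purely illustrative): the mask zeroes out padded positions and the summed loss is normalised by the mask total.

import numpy as np

pred = np.array([[0.7, 0.2, 0.1],
                 [0.1, 0.8, 0.1]])   # per-token softmax outputs
y = np.array([0, 1])                 # gold labels
m = np.array([1.0, 1.0])             # mask (1 = real token, 0 = padding)

nll = -m * np.log(pred[np.arange(len(y)), y])
loss = nll.sum() / m.sum()
print(loss)  # ~0.29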
Example #40
0
def Build_Model(tparams_all, options):

    trng = RandomStreams(SEED)

    # Discriminator
    x0 = tensor.matrix('x0', dtype='int32') #SIP Path
    x1 = tensor.matrix('x1', dtype='int32') #RelSrc Path
    x3 = tensor.matrix('x3', dtype='int32') #Cue Path

    mask0 = tensor.matrix('mask0', dtype=config.floatX)
    mask1 = tensor.matrix('mask1', dtype=config.floatX)
    mask3 = tensor.matrix('mask3', dtype=config.floatX)

    x0_d_y_fake = tensor.vector('x0_d_y_fake', dtype='int32')
    x1_d_y_fake = tensor.vector('x1_d_y_fake', dtype='int32')
    x3_d_y_fake = tensor.vector('x3_d_y_fake', dtype='int32')

    y0 = tensor.vector('y0', dtype='int32')
    y1 = tensor.vector('y1', dtype='int32')

    # Generator
    x_noise_0 = tensor.matrix('x_noise_0', dtype=config.floatX)
    x_noise_1 = tensor.matrix('x_noise_1', dtype=config.floatX)
    x_noise_3 = tensor.matrix('x_noise_3', dtype=config.floatX)

    #x0_g_y_fake = tensor.vector('x0_g_fake', dtype='int32')
    #x1_g_y_fake = tensor.vector('x1_g_fake', dtype='int32')
    #x3_g_y_fake = tensor.vector('x3_g_fake', dtype='int32')

    maxlen_0 = tensor.scalar(name='maxlen_0', dtype='int32')
    maxlen_1 = tensor.scalar(name='maxlen_1', dtype='int32')
    maxlen_3 = tensor.scalar(name='maxlen_3', dtype='int32')

    ###################

    dropout_ratio = tensor.scalar(name='dropout_ratio')
    dropout_decay_ratio = tensor.scalar(name='dropout_decay_ratio')

    tparams_d = tparams_all[0]
    tparams_g = tparams_all[1]

    #####################################
    # Discriminator
    p_0 = lstm_layer(tparams_d, input_state=tparams_d['Wemb'][x0], mask=mask0, options=options)
    p_1 = lstm_layer(tparams_d, input_state=tparams_d['Wemb'][x1], mask=mask1, options=options)
    p_3 = lstm_layer(tparams_d, input_state=tparams_d['Wemb'][x3[2:,:]], mask=mask3, options=options)

    proj_0 = tensor.concatenate((p_0, p_1), axis=1)
    proj_1 = tensor.concatenate((tparams_d['CueTemb'][x3[0, :]], tparams_d['Lemb'][x3[1, :]], p_3), axis=1)

    proj_0 = proj_0 * dropout_mask_1D(proj_0, 1, dropout_ratio, trng) * dropout_decay_ratio
    proj_1 = proj_1 * dropout_mask_1D(proj_1, 1, dropout_ratio, trng) * dropout_decay_ratio

    pred_0 = tensor.nnet.softmax(tensor.dot(proj_0, tparams_d['Ws0']) + tparams_d['bs0'])
    pred_1 = tensor.nnet.softmax(tensor.dot(proj_1, tparams_d['Ws1']) + tparams_d['bs1'])

    x0_d_fake_pred = tensor.nnet.softmax(tensor.dot(p_0, tparams_d['Ws_fake']))
    x1_d_fake_pred = tensor.nnet.softmax(tensor.dot(p_1, tparams_d['Ws_fake']))
    x3_d_fake_pred = tensor.nnet.softmax(tensor.dot(p_3, tparams_d['Ws_fake']))


    f_D_pred_prob = theano.function(inputs=[x0, x1, x3, mask0, mask1, mask3, dropout_ratio, dropout_decay_ratio], 
                                    outputs=[pred_0.max(axis=1), pred_1.max(axis=1)], 
                                    name='f_D_pred_prob')

    f_D_pred = theano.function(inputs=[x0, x1, x3, mask0, mask1, mask3, dropout_ratio, dropout_decay_ratio],
                               outputs=[pred_0.argmax(axis=1), pred_1.argmax(axis=1)], 
                               name='f_D_pred')

    off = 1e-8


    d_cost = - 1./3.*tensor.mean(tensor.log(x0_d_fake_pred[tensor.arange(x0_d_y_fake.shape[0]), x0_d_y_fake] + off)) + \
             - 1./3.*tensor.mean(tensor.log(x1_d_fake_pred[tensor.arange(x1_d_y_fake.shape[0]), x1_d_y_fake] + off)) + \
             - 1./3.*tensor.mean(tensor.log(x3_d_fake_pred[tensor.arange(x3_d_y_fake.shape[0]), x3_d_y_fake] + off)) + \
             - 1./2.*tensor.mean(tensor.log(pred_0[tensor.arange(y0.shape[0]), y0] + off)) + \
             - 1./2.*tensor.mean(tensor.log(pred_1[tensor.arange(y1.shape[0]), y1] + off))
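    # d_cost averages three fake-detection terms (the Ws_fake softmax heads over the
    # SIP, RelSrc and Cue path encodings) and two supervised classification terms
    # (pred_0 / pred_1 against the gold labels y0 / y1); each term is a negative
    # log-likelihood with a small offset 'off' for numerical stability.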

    ##############################################################
    # Generator
    xn_0 = x_noise_0 * tparams_g['label_emb_0'][y0] * tparams_g['label_emb_1'][y1]
    xn_1 = x_noise_1 * tparams_g['label_emb_0'][y0] * tparams_g['label_emb_1'][y1]
    xn_3 = x_noise_3 * tparams_g['label_emb_0'][y0] * tparams_g['label_emb_1'][y1]

    x0_g, x0_g_emb, x0_g_mask = lstm_decoder_layer(tparams_all, xn_0, options, maxlen_0, 0.9)
    x1_g, x1_g_emb, x1_g_mask = lstm_decoder_layer(tparams_all, xn_1, options, maxlen_1, 0.9)
    x3_g, x3_g_emb, x3_g_mask = lstm_decoder_layer(tparams_all, xn_3, options, maxlen_3, 0.7)

    p_g0 = lstm_layer(tparams_d, input_state=x0_g_emb, mask=x0_g_mask, options=options)
    p_g1 = lstm_layer(tparams_d, input_state=x1_g_emb, mask=x1_g_mask, options=options)
    p_g3 = lstm_layer(tparams_d, input_state=x3_g_emb, mask=x3_g_mask, options=options)

    f_G_produce = theano.function(inputs=[x_noise_0, x_noise_1, x_noise_3, 
                                          maxlen_0, maxlen_1, maxlen_3,
                                          y0, y1],
                                  outputs=[x0_g.astype('int32'), x1_g.astype('int32'), x3_g.astype('int32'),
                                           x0_g_mask, x1_g_mask, x3_g_mask],
                                  name='f_G_produce')

    g_cost = (((p_0 - p_g0)**2).sum(axis=1).mean() + 
              ((p_1 - p_g1)**2).sum(axis=1).mean() + 
              ((p_3 - p_g3)**2).sum(axis=1).mean()) / 3.


    return [x0,x1,x3],[mask0, mask1, mask3],[x0_d_y_fake, x1_d_y_fake, x3_d_y_fake], [y0, y1], \
           [x_noise_0, x_noise_1, x_noise_3], \
           [maxlen_0, maxlen_1, maxlen_3], \
           f_D_pred_prob, f_D_pred, f_G_produce, \
           [dropout_ratio, dropout_decay_ratio], \
           d_cost, g_cost
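
A minimal, self-contained numpy sketch of the two cost ingredients used above: the discriminator terms are negative log-likelihoods read off with arange indexing, and the generator is trained by feature matching against the discriminator's LSTM features. Shapes and variable names below are illustrative, not taken from the original code.

import numpy as np

def nll(pred, y, off=1e-8):
    # mean negative log-likelihood of the gold classes, same indexing as d_cost above
    return -np.mean(np.log(pred[np.arange(y.shape[0]), y] + off))

def feature_matching(p_real, p_fake):
    # generator cost: squared distance between discriminator features of real and generated paths
    return ((p_real - p_fake) ** 2).sum(axis=1).mean()

rng = np.random.RandomState(0)
pred = rng.dirichlet(np.ones(3), size=5)   # softmax-like class probabilities
y = rng.randint(0, 3, size=5)              # gold labels
p_real, p_fake = rng.randn(5, 8), rng.randn(5, 8)
print(nll(pred, y), feature_matching(p_real, p_fake))
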
Example #41
0
    def build_and_train_rbf(self, X, Y):

        y_onehot = self.class_to_onehot(Y)
        n_dims = y_onehot.shape[1]
        centers = self.compute_centers(X)

        x = T.dmatrix()
        y = T.imatrix()

        #bias, centers, sigmas, weights
        template = [
            n_dims, centers.shape, self.l1_size, (self.l1_size, n_dims)
        ]

        #initialize and train RBF network
        model = theano_rbfnet(input=x,
                              n_cents=self.l1_size,
                              centers=centers,
                              n_dims=n_dims,
                              reg=self.penalty)

        cost = model.neg_log_likelihood(y)

        g_b = T.grad(cost, model.b)
        g_c = T.grad(cost, model.c)
        g_s = T.grad(cost, model.s)
        g_w = T.grad(cost, model.w)

        g_params = T.concatenate(
            [g_b.flatten(),
             g_c.flatten(),
             g_s.flatten(),
             g_w.flatten()])

        getcost = theano.function([x, y], outputs=cost)
        getdcost = theano.function([x, y], outputs=g_params)

        def cost_fcn(params, inputs, targets):
            model.set_params(params, template)
            x = inputs
            y = targets
            return getcost(x, y)

        def cost_grad(params, inputs, targets):
            model.set_params(params, template)
            x = inputs
            y = targets
            return getdcost(x, y)

        args = climin.util.iter_minibatches([X, y_onehot], self.batch_size,
                                            [0, 0])
        batch_args = itertools.repeat(([X, y_onehot], {}))
        args = ((i, {}) for i in args)
        init_params = model.get_params(template)

        opt_sgd = climin.GradientDescent(init_params,
                                         cost_fcn,
                                         cost_grad,
                                         steprate=0.1,
                                         momentum=0.99,
                                         args=args,
                                         momentum_type="nesterov")

        opt_ncg = climin.NonlinearConjugateGradient(init_params,
                                                    cost_fcn,
                                                    cost_grad,
                                                    args=batch_args)

        opt_lbfgs = climin.Lbfgs(init_params,
                                 cost_fcn,
                                 cost_grad,
                                 args=batch_args)
        #choose the optimizer
        if self.optimizer == 'sgd':
            optimizer = opt_sgd
        elif self.optimizer == 'ncg':
            optimizer = opt_ncg
        else:
            optimizer = opt_lbfgs

        #do the actual training.
        costs = []
        for itr_info in optimizer:
            if itr_info['n_iter'] > self.max_iters: break
            costs.append(itr_info['loss'])

        model.set_params(init_params, template)
        return model, costs
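
climin optimizers operate on a single flat parameter vector, which is why the gradients above are flattened into one g_params vector and a shape template is used to pack and unpack the model parameters. A small self-contained sketch of that pack/unpack pattern (pack and unpack are hypothetical stand-ins for the model's get_params / set_params helpers):

import numpy as np

def pack(arrays):
    # flatten a list of parameter arrays into one vector, climin-style
    return np.concatenate([a.ravel() for a in arrays])

def unpack(flat, template):
    # cut the flat vector back into arrays with the shapes listed in the template
    arrays, i = [], 0
    for shape in template:
        size = int(np.prod(shape))
        arrays.append(flat[i:i + size].reshape(shape))
        i += size
    return arrays

template = [(4,), (3, 4)]                  # e.g. biases and weights
params = [np.zeros(s) for s in template]
flat = pack(params)                        # one vector handed to the optimizer
restored = unpack(flat, template)          # back to per-parameter arrays
assert all((a == b).all() for a, b in zip(params, restored))
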
Example #42
0
    def __init__(self, We_initial, words, memsize, rel, relsize, Rel_init, LC,
                 LW, eta, margin, usepeep, acti):

        self.LC = LC
        self.LW = LW
        self.margin = margin
        self.memsize = memsize
        self.usepeep = usepeep
        self.relsize = relsize
        self.words = words
        self.rel = rel

        self.a1 = np.tile(np.eye(relsize), (35, 1, 1))

        self.Rel = theano.shared(Rel_init.astype(theano.config.floatX))
        self.iden = theano.shared(self.a1)
        self.we = theano.shared(We_initial.astype(theano.config.floatX))

        g1batchindices = T.imatrix()
        g2batchindices = T.imatrix()
        p1batchindices = T.imatrix()
        p2batchindices = T.imatrix()
        g1mask = T.tensor3()
        g2mask = T.tensor3()
        p1mask = T.tensor3()
        p2mask = T.tensor3()
        g1length = T.imatrix()
        g2length = T.imatrix()
        p1length = T.imatrix()
        p2length = T.imatrix()
        target = T.dmatrix()

        g1mask = T.patternbroadcast(g1mask, broadcastable=[False, False, True])
        g2mask = T.patternbroadcast(g2mask, broadcastable=[False, False, True])
        p1mask = T.patternbroadcast(p1mask, broadcastable=[False, False, True])
        p2mask = T.patternbroadcast(p2mask, broadcastable=[False, False, True])

        We0 = T.dmatrix()
        l_in = lasagne.layers.InputLayer((None, None))
        l_mask = lasagne.layers.InputLayer(shape=(None, None))

        l_emb = lasagne.layers.EmbeddingLayer(
            l_in,
            input_size=self.we.get_value().shape[0],
            output_size=self.we.get_value().shape[1],
            W=self.we)
        l_out = l_emb
        embg1 = lasagne.layers.get_output(l_emb, {
            l_in: g1batchindices,
            l_mask: g1mask
        })
        embg2 = lasagne.layers.get_output(l_emb, {
            l_in: g2batchindices,
            l_mask: g2mask
        })
        embp1 = lasagne.layers.get_output(l_emb, {
            l_in: p1batchindices,
            l_mask: p1mask
        })
        embp2 = lasagne.layers.get_output(l_emb, {
            l_in: p2batchindices,
            l_mask: p2mask
        })

        embg1 = embg1 * g1mask
        embg1_sum = T.sum(embg1, axis=1)
        embg1_len = T.patternbroadcast(g1length, broadcastable=[False, True])
        embg1_mean = embg1_sum / embg1_len
        embg1_mean = embg1_mean.reshape([-1, self.memsize])

        embg2 = embg2 * g2mask
        embg2_sum = T.sum(embg2, axis=1)
        embg2_len = T.patternbroadcast(g2length, broadcastable=[False, True])
        embg2_mean = embg2_sum / embg2_len
        embg2_mean = embg2_mean.reshape([-1, self.memsize])

        embp1 = embp1 * p1mask
        embp1_sum = T.sum(embp1, axis=1)
        embp1_len = T.patternbroadcast(p1length, broadcastable=[False, True])
        embp1_mean = embp1_sum / embp1_len
        embp1_mean = embp1_mean.reshape([-1, self.memsize])

        embp2 = embp2 * p2mask
        embp2_sum = T.sum(embp2, axis=1)
        embp2_len = T.patternbroadcast(p2length, broadcastable=[False, True])
        embp2_mean = embp2_sum / embp2_len
        embp2_mean = embp2_mean.reshape([-1, self.memsize])

        #############################################################
        r = T.ivector()
        p3 = T.ivector()
        r0 = self.Rel[r]
        r1 = self.Rel[p3]

        self.a2 = np.random.uniform(low=-0.2,
                                    high=0.2,
                                    size=[memsize, relsize])
        self.a3 = np.random.uniform(low=-0.2, high=0.2, size=[
            relsize,
        ])
        self.w = theano.shared(self.a2)
        self.b = theano.shared(self.a3)

        embg1_rel = T.tanh(
            T.dot(embg1_mean, self.w) + self.b.dimshuffle('x', 0))
        embg2_rel = T.tanh(
            T.dot(embg2_mean, self.w) + self.b.dimshuffle('x', 0))
        embp1_rel = T.tanh(
            T.dot(embp1_mean, self.w) + self.b.dimshuffle('x', 0))
        embp2_rel = T.tanh(
            T.dot(embp2_mean, self.w) + self.b.dimshuffle('x', 0))

        g1g2 = T.batched_dot(embg1_rel, r0)
        g1g2 = T.batched_dot(g1g2, embg2_rel)
        p1g1_neg = T.batched_dot(embp1_rel, r0)
        p1g1_neg = T.batched_dot(p1g1_neg, embg2_rel)
        p2g2_neg = T.batched_dot(embg1_rel, r0)
        p2g2_neg = T.batched_dot(p2g2_neg, embp2_rel)
        g1g2_neg = T.batched_dot(embg1_rel, r1)
        g1g2_neg = T.batched_dot(g1g2_neg, embg2_rel)

        g1g2 = T.nnet.sigmoid(g1g2).reshape([-1, 1])
        p1g1_neg = T.nnet.sigmoid(p1g1_neg).reshape([-1, 1])
        p2g2_neg = T.nnet.sigmoid(p2g2_neg).reshape([-1, 1])
        g1g2_neg = T.nnet.sigmoid(g1g2_neg).reshape([-1, 1])

        lsm = T.concatenate([g1g2, p1g1_neg, p2g2_neg, g1g2_neg], axis=0)

        #updates
        network_params = lasagne.layers.get_all_params(l_out, trainable=True)
        network_params.append(self.w)
        network_params.append(self.b)
        self.all_params = network_params
        self.all_params.append(self.Rel)
        self.all_params.append(self.we)

        #feedforward
        self.feedforward_function = theano.function(
            [g1batchindices, g1mask, g1length],
            embg1_rel,
            on_unused_input='warn')
        # self.softMax=theano.function([ar],outputs=softmaxOutput2)
        #cost_function
        softmaxOutput = lsm.clip(1e-7, 1.0 - 1e-7)
        # softmaxOutput2=lsm2.clip(1e-7,1.0 - 1e-7)

        loss = lasagne.objectives.binary_crossentropy(softmaxOutput, target)
        loss = lasagne.objectives.aggregate(loss, mode='mean')

        l2_penalty1 = lasagne.regularization.apply_penalty(network_params, l2)

        cost_new = (1000 * loss) + (self.LC * l2_penalty1) + (
            self.LW * lasagne.regularization.l2(r0 - self.iden[r]))

        #train_function
        updates = lasagne.updates.adagrad(cost_new,
                                          self.all_params,
                                          learning_rate=eta)
        self.train_function = theano.function([
            g1batchindices, g2batchindices, p1batchindices, p2batchindices,
            g1mask, g2mask, p1mask, p2mask, g1length, g2length, p1length,
            p2length, r, We0, p3, target
        ], [cost_new, loss],
                                              updates=updates,
                                              on_unused_input='warn')
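
The score used above is bilinear: each pair of relation-space embeddings is scored as sigmoid(e1^T R e2) with a per-example relation matrix R, computed via two batched_dot calls, and negative pairs are pushed down by the binary cross-entropy loss. A minimal numpy sketch of that bilinear score; shapes are illustrative:

import numpy as np

def bilinear_score(e1, R, e2):
    # e1, e2: (batch, d) embeddings; R: (batch, d, d) relation matrices
    # equivalent to the two T.batched_dot calls above, followed by a sigmoid
    s = np.einsum('bi,bij,bj->b', e1, R, e2)
    return 1.0 / (1.0 + np.exp(-s))

rng = np.random.RandomState(0)
batch, d = 4, 5
e1, e2 = rng.randn(batch, d), rng.randn(batch, d)
R = rng.randn(batch, d, d)
print(bilinear_score(e1, R, e2).shape)     # (4,)
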
Example #43
0
def edhmm_fit(inp, nans, n_subs, last):
    # inp - array containing responses, outcomes, and a switch variable which turns off updates in the presence of nans
    # nans - bool array pointing to the locations of nan responses and outcomes
    # n_subs - int value, total number of subjects (each subject is fitted with its own parameter values)
    # last - int value, negative value denoting the number of last trials to exclude from parameter estimation,
    #        e.g. setting last = -35 excludes the last 35 trials from parameter estimation.

    #define the hierarchical parametric model for ED-HMM
    d_max = 200  #maximal value for state duration
    with Model() as edhmm:
        d = tt.arange(
            d_max)  #vector of possible duration values from 0 to d_max - 1
        d = tt.tile(d, (n_subs, 1))
        P = tt.ones((2, 2)) - tt.eye(2)  #permutation matrix

        #set prior state probability
        theta0 = tt.ones(n_subs) / 2

        #set hierarchical prior for delta parameter of prior beliefs p_0(d)
        dtau = HalfCauchy('dtau', beta=1)
        dloc = HalfCauchy('dloc', beta=dtau, shape=(n_subs, ))
        delta = Deterministic('delta', dloc / (1 + dloc))

        #set hierarchical prior for r parameter of prior beliefs p_0(d)
        rtau = HalfCauchy('rtau', beta=1)
        rloc = HalfCauchy('rloc', beta=rtau, shape=(n_subs, ))
        r = Deterministic('r', 1 + rloc)

        #compute prior beliefs over state durations for the given r and delta parameters
        binomln = tt.gammaln(d + r[:, None]) - tt.gammaln(d + 1) - tt.gammaln(
            r[:, None])
        pd0 = tt.nnet.softmax(binomln + d * log(1 - delta[:, None]) +
                              r[:, None] * log(delta[:, None]))

        #set joint probability distribution
        joint0 = tt.stack([theta0[:, None] * pd0,
                           (1 - theta0)[:, None] * pd0]).dimshuffle(1, 0, 2)

        #set hierarchical priors for response noises
        btau = HalfCauchy('btau', beta=1)
        bloc = HalfCauchy('bloc', beta=btau, shape=(n_subs, ))
        beta = Deterministic('beta', 1 / bloc)

        #set hierarchical priors for initial beliefs about reward probability
        mtau = HalfCauchy('mtau', beta=4)
        mloc = HalfCauchy('mloc', beta=mtau, shape=(n_subs, 2))
        muA = Deterministic('muA', mloc[:, 0] / (1 + mloc[:, 0]))
        muB = Deterministic('muB', 1 / (1 + mloc[:, 1]))
        init = tt.stacklists([[10*muA, 10*(1-muA)], \
                              [10*muB, 10*(1-muB)]]).dimshuffle(2,0,1)

        #compute the posterior beliefs over states, durations, and reward probabilities
        (post, _) = scan(edhmm_model,
                         sequences=[inp],
                         outputs_info=[init, joint0],
                         non_sequences=[pd0, P, range(n_subs)],
                         name='edhmm')

        #get posterior reward probability and state probability
        a0 = init[None, :, :, 0]
        b0 = init[None, :, :, 1]
        a = tt.concatenate([a0, post[0][:-1, :, :, 0]])
        b = tt.concatenate([b0, post[0][:-1, :, :, 1]])
        mu = Deterministic('mu', a / (a + b))
        theta = Deterministic('theta', tt.concatenate([theta0[None, :], \
                              post[1][:-1].sum(axis=-1)[:,:,0]])[:,:,None])

        #compute choice-dependent expected reward probability
        mean = (theta * mu + (1 - theta) * mu.dot(P))

        #compute expected utility
        U = Deterministic('U', 2 * mean - 1)

        #set hierarchical prior for response biases
        ctau = HalfCauchy('ctau', beta=1)
        cloc = HalfCauchy('cloc', beta=ctau, shape=(n_subs, ))
        c0 = Deterministic('c0', cloc / (1 + cloc))

        #compute response noise and response bias modulated expected free energy
        G = Deterministic(
            'G', beta[None, :, None] * U + log([c0, 1 - c0]).T[None, :, :])

        #compute response probability for the prereversal and the reversal phase of the experiment
        nzero = tt.nonzero(~nans[:last])
        p = Deterministic('p', tt.nnet.softmax(G[:last][nzero]))

        #set observation likelihood of responses
        responses = inp[:last, :, 0][~nans[:last]]
        Categorical('obs', p=p, observed=responses)

    #fit the model
    with edhmm:
        approx = fit(method='advi', n=50000, progressbar=True)

    return approx
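
The duration prior pd0 above is a renormalised negative-binomial over d = 0..d_max-1, built from a gammaln-based binomial coefficient and normalised with a softmax in log space. A small single-subject numpy/scipy sketch of that expression; the r and delta values are illustrative:

import numpy as np
from scipy.special import gammaln, softmax

def duration_prior(r, delta, d_max=200):
    # p_0(d) ~ NegativeBinomial(r, delta), renormalised via softmax of the log pmf
    d = np.arange(d_max)
    binomln = gammaln(d + r) - gammaln(d + 1) - gammaln(r)
    log_pmf = binomln + d * np.log(1 - delta) + r * np.log(delta)
    return softmax(log_pmf)

pd0 = duration_prior(r=2.5, delta=0.9)
print(pd0.sum(), pd0.argmax())             # sums to 1; index of the most likely duration
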
Example #44
0
    def down_q(input, train, w):
        
        #if name == '1':
        #    print input.tag.test_value
        
        # prior
        h = down_nl1(input, w)
        #h = T.printing.Print('h1'+name)(h)
        h = down_conv1(h, w)
        #h = T.printing.Print('h2'+name)(h)
        
        logqs = 0
        
        # posterior
        if posterior in ['up_diag','up_iaf1','up_iaf2','up_iaf1_nl','up_iaf2_nl']:
            z = qz[0].sample
            logqs = qz[0].logps
        elif posterior == 'down_diag':
            rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:]
            rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:]
            _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd)
            z = _qz.sample
            logqs = _qz.logps                
        elif posterior == 'down_tim':
            assert prior == 'diag'
            pz_mean = h[:,n_h2:n_h2+n_z,:,:]
            pz_logsd = h[:,n_h2+n_z:n_h2+2*n_z,:,:]
            
            qz_prec = 1./T.exp(qz[0].logvar)
            pz_prec = 1./T.exp(2*pz_logsd)
            rz_prec = qz_prec + pz_prec
            rz_mean = (pz_prec/rz_prec) * pz_mean + (qz_prec/rz_prec) * qz[0].mean
            _qz = N.rand.gaussian_diag(rz_mean, -T.log(rz_prec))
            z = _qz.sample
            logqs = _qz.logps
        elif posterior == 'down_iaf1':
            rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:]
            rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:]
            _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd)
            z = _qz.sample
            logqs = _qz.logps
            # ARW transform
            arw_mean = posterior_conv1(z, w)
            arw_mean *= .1
            z = (z - arw_mean)
        elif posterior == 'down_iaf2':
            rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:]
            rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:]
            _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd)
            z = _qz.sample
            logqs = _qz.logps
            # ARW transform
            arw_mean_logsd = posterior_conv1(z, w)
            arw_mean = arw_mean_logsd[:,::2,:,:]
            arw_logsd = arw_mean_logsd[:,1::2,:,:]
            arw_mean *= .1
            arw_logsd *= .1
            z = (z - arw_mean) / T.exp(arw_logsd)
            logqs += arw_logsd
        elif posterior in ['down_iaf1_nl','down_iaf1_deep']:
            rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:]
            rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:]
            _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd)
            z = _qz.sample
            logqs = _qz.logps
            # ARW transform
            down_context = h[:,n_conv_down_prior+2*n_z:n_conv_down_prior+2*n_z+n_h2,:,:]
            context = up_context[0] + down_context
            arw_mean = posterior_conv1(z, context, w)
            arw_mean *= .1
            z = (z - arw_mean)
        elif posterior in ['down_iaf2_nl','down_iaf2_nl2','down_iaf2_deep']:
            rz_mean = h[:,n_conv_down_prior:n_conv_down_prior+n_z,:,:]
            rz_logsd = h[:,n_conv_down_prior+n_z:n_conv_down_prior+2*n_z,:,:]
            _qz = N.rand.gaussian_diag(qz[0].mean + rz_mean, qz[0].logvar + 2*rz_logsd)
            z = _qz.sample
#            logqs = _qz.logps # specifically we block the gradient here 
            logqs = _qz.logps_pd
            # ARW transform
            down_context = h[:,n_conv_down_prior+2*n_z:n_conv_down_prior+2*n_z+n_h2,:,:]
            context = up_context[0] + down_context
            arw_mean, arw_logsd = posterior_conv1(z, context, w)
            arw_mean *= .1
            arw_logsd *= .1
            z = (z - arw_mean) / T.exp(arw_logsd)
            logqs += arw_logsd
            if posterior == 'down_iaf2_nl2':
                arw_mean, arw_logsd = posterior_conv2(z, context, w)
                arw_mean *= .1
                arw_logsd *= .1
                z = (z - arw_mean) / T.exp(arw_logsd)
                logqs += arw_logsd
            
        
        # Prior
        if prior == 'diag':
            pz_mean = h[:,n_h2:n_h2+n_z,:,:]
            pz_logsd = h[:,n_h2+n_z:n_h2+2*n_z,:,:]
            logps = N.rand.gaussian_diag(pz_mean, 2*pz_logsd, z).logps
        elif prior == 'diag2':
            logps = N.rand.gaussian_diag(0*z, 0*z, z).logps
            pz_mean = h[:,n_h2:n_h2+n_z,:,:]
            pz_logsd = h[:,n_h2+n_z:n_h2+2*n_z,:,:]
            z = pz_mean + z * T.exp(pz_logsd)
        elif prior == 'made':
            made_context = h[:,n_h2:2*n_h2,:,:]
            made_mean, made_logsd = prior_conv1(z, made_context, w)
            made_mean *= .1
            made_logsd *= .1
            logps = N.rand.gaussian_diag(made_mean, 2*made_logsd, z).logps
        elif prior == 'bernoulli':
            assert posterior == 'down_bernoulli'
            pz_p = bernoulli_p(h[:,n_h2:n_h2+n_z,:,:])
            logps = z01 * T.log(pz_p) + (1.-z01) * T.log(1.-pz_p)
        else:
            raise Exception()
        
        h_det = h[:,:n_h2,:,:]
        h = T.concatenate([h_det, z], axis=1)
        if downsample:
            if downsample_type == 'nn':
                input = N.conv.upsample2d_nearest_neighbour(input)
            elif downsample_type == 'conv':
                input = down_conv3(input, w)
        
        output = input + .1 * down_conv2(down_nl2(h, w), w)
        
        return output, logqs - logps
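
In the 'down_iaf*' branches above, the diagonal-Gaussian sample is passed through an inverse autoregressive flow step: z is shifted and rescaled by the context-dependent arw_mean / arw_logsd, and the change-of-variables correction is the `logqs += arw_logsd` line (the Jacobian of z <- (z - mu) / exp(logsd) contributes +logsd per dimension to log q of the transformed sample). A tiny numpy sketch of that bookkeeping, with fixed shift/scale values standing in for the masked convolutions:

import numpy as np

rng = np.random.RandomState(0)
n = 6

# sample z from a diagonal Gaussian q(z) = N(mu, sigma^2)
mu, logsd = rng.randn(n), 0.1 * rng.randn(n)
eps = rng.randn(n)
z = mu + np.exp(logsd) * eps
logq = -0.5 * np.log(2 * np.pi) - logsd - 0.5 * eps ** 2    # per-dimension log q(z)

# one IAF step: shift and rescale, then update the log-density
arw_mean, arw_logsd = 0.1 * rng.randn(n), 0.1 * rng.randn(n)
z_new = (z - arw_mean) / np.exp(arw_logsd)
logq = logq + arw_logsd          # change-of-variables term, as in `logqs += arw_logsd` above

print(z_new.shape, logq.sum())
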
Example #45
0
 def apply_detector(W, jet, n_jets):
     map_i = []
     for start, end in zip(range(0, 16, n_jets), range(4, 16+4, n_jets)):
         map_i.append(rectify(T.dot(W, jet[start:end])))
     return T.concatenate(map_i, axis=0)
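
A numpy usage sketch of the same sliding-window detector: length-4 windows are taken with stride n_jets over the first 16 entries of a flattened jet vector, each window response is rectified, and the maps are concatenated. Shapes are illustrative, and rectify here is simply max(0, x):

import numpy as np

def rectify(x):
    return np.maximum(0.0, x)

def apply_detector_np(W, jet, n_jets):
    maps = [rectify(W.dot(jet[start:end]))
            for start, end in zip(range(0, 16, n_jets), range(4, 16 + 4, n_jets))]
    return np.concatenate(maps, axis=0)

rng = np.random.RandomState(0)
W = rng.randn(8, 4)          # 8 detector filters over a length-4 window
jet = rng.randn(16)          # flattened jet features
print(apply_detector_np(W, jet, n_jets=4).shape)    # (32,) = 4 windows x 8 filters
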
Example #46
0
    def build_and_train_nnet(self, X, Y):

        y_onehot = self.class_to_onehot(Y)
        n_in = X.shape[1]
        n_nodes = self.l1_size
        n_out = y_onehot.shape[1]

        x = T.dmatrix()
        y = T.imatrix()

        #bias1, bias2, weights1, weights2
        template = [(n_nodes, ), (n_out, ), (n_in, n_nodes), (n_nodes, n_out)]

        #initialize nnet
        model = nnet(input=x, n_in=n_in, n_nodes=n_nodes, n_out=n_out)
        cost = model.neg_log_likelihood(y)

        g_b1 = T.grad(cost, model.b1)
        g_b2 = T.grad(cost, model.b2)
        g_w1 = T.grad(cost, model.w1)
        g_w2 = T.grad(cost, model.w2)

        g_params = T.concatenate(
            [g_b1.flatten(),
             g_b2.flatten(),
             g_w1.flatten(),
             g_w2.flatten()])

        getcost = theano.function([x, y], outputs=cost)
        getdcost = theano.function([x, y], outputs=g_params)

        def cost_fcn(params, inputs, targets):
            model.set_params(params, template)
            x = inputs
            y = targets
            return getcost(x, y)

        def cost_grad(params, inputs, targets):
            model.set_params(params, template)
            x = inputs
            y = targets
            return getdcost(x, y)

        args = climin.util.iter_minibatches([X, y_onehot], self.batch_size,
                                            [0, 0])
        batch_args = itertools.repeat(([X, y_onehot], {}))
        args = ((i, {}) for i in args)
        init_params = model.get_params(template)

        opt_sgd = climin.GradientDescent(init_params,
                                         cost_fcn,
                                         cost_grad,
                                         steprate=0.01,
                                         momentum=0.99,
                                         args=args,
                                         momentum_type="nesterov")

        opt_ncg = climin.NonlinearConjugateGradient(init_params,
                                                    cost_fcn,
                                                    cost_grad,
                                                    args=batch_args)

        opt_lbfgs = climin.Lbfgs(init_params,
                                 cost_fcn,
                                 cost_grad,
                                 args=batch_args)
        #choose the optimizer
        if self.optimizer == 'sgd':
            optimizer = opt_sgd
        elif self.optimizer == 'ncg':
            optimizer = opt_ncg
        else:
            optimizer = opt_lbfgs

        #do the actual training.
        costs = []
        for itr_info in optimizer:
            if itr_info['n_iter'] > self.max_iters: break
            costs.append(itr_info['loss'])

        model.set_params(init_params, template)
        return model, costs
Example #47
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              crowd_dim,
              n_crowds,
              training=True,
              crowd_reg=1.0,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        #crowd embed dim = number of tags
        #if crowd_dim:
        #    crowd_dim = n_tags

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        crowd_ids = T.ivector(name='crowd_ids')

        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score

        if crowd_dim:
            crowd_layer = EmbeddingLayer(n_crowds,
                                         crowd_dim,
                                         name='crowd_layer')
            crowd_scores = T.switch(T.neq(is_train, 0),
                                    crowd_layer.link(crowd_ids),
                                    0 * crowd_layer.link(crowd_ids))
            #final_output = T.switch(T.neq(is_train, 0), T.concatenate([final_output, crowd_scores], axis = 1), final_output)
            final_output = T.concatenate([final_output, crowd_scores], axis=1)

        #final_layer_test = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
        #                          activation=(None if crf else 'softmax'))

        final_layer = HiddenLayer(word_lstm_dim + crowd_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))

        #tags_scores = T.switch(T.neq(is_train, 0), final_layer_train.link(final_output), final_layer_test.link(final_output))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)
        if crowd_dim:
            self.add_component(crowd_layer)
            params.extend(crowd_layer.params)
            cost = cost + (crowd_reg * crowd_layer.params[0] *
                           crowd_layer.params[0]).mean()

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]
        if crowd_dim:
            eval_inputs.append(crowd_ids)
            train_inputs.append(crowd_ids)

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        #return immediate function
        #return theano.function(train_inputs, tags_scores, on_unused_input='ignore', \
        #                       givens=({is_train: np.cast['int32'](1)} if dropout else {}))

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
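
In the CRF branch above, the numerator of the CRF likelihood (real_path_score) is the sum of the emission scores of the gold tags plus the transition scores along the gold tag sequence padded with virtual begin (n_tags) and end (n_tags + 1) tags. A small numpy sketch of just that numerator, mirroring the indexing above; the forward() partition function is omitted:

import numpy as np

def real_path_score(tags_scores, transitions, tag_ids, n_tags):
    s_len = tags_scores.shape[0]
    # emission part: score of the gold tag at every position
    score = tags_scores[np.arange(s_len), tag_ids].sum()
    # transition part: consecutive pairs along the padded gold tag sequence
    padded = np.concatenate([[n_tags], tag_ids, [n_tags + 1]])
    score += transitions[padded[np.arange(s_len + 1)],
                         padded[np.arange(s_len + 1) + 1]].sum()
    return score

rng = np.random.RandomState(0)
n_tags, s_len = 5, 7
tags_scores = rng.randn(s_len, n_tags)
transitions = rng.randn(n_tags + 2, n_tags + 2)
tag_ids = rng.randint(0, n_tags, size=s_len)
print(real_path_score(tags_scores, transitions, tag_ids, n_tags))
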
Example #48
0
def durw_fit(inp, nans, n_subs, last):
    # inp - array containing responses, outcomes, and a switch variable which turns off updates in the presence of nans
    # nans - bool array pointing to the locations of nan responses and outcomes
    # n_subs - int value, total number of subjects (each subject is fitted with its own parameter values)
    # last - int value, negative value denoting the number of last trials to exclude from parameter estimation,
    #        e.g. setting last = -35 excludes the last 35 trials from parameter estimation.

    #define the hierarchical parametric model for DU-RW
    with Model() as durw:

        #set hierarchical priors for learning rates
        atau = HalfCauchy('atau', beta=1)
        aloc = HalfCauchy('aloc', beta=atau, shape=(n_subs, ))
        alpha = Deterministic('alpha', aloc / (1 + aloc))

        #set hierarchical priors for coupling strengths
        ktau = HalfCauchy('ktau', beta=1)
        kloc = HalfCauchy('kloc', beta=ktau, shape=(n_subs, ))
        kappa = Deterministic('kappa', kloc / (1 + kloc))

        #set hierarchical priors for response noises
        btau = HalfCauchy('btau', beta=1)
        bloc = HalfCauchy('bloc', beta=btau, shape=(n_subs, ))
        beta = Deterministic('beta', 1 / bloc)

        #set hierarchical priors for initial choice value
        mtau = HalfCauchy('mtau', beta=1)
        mlocA = HalfCauchy('mlocA', beta=mtau, shape=(n_subs, ))
        mlocB = HalfCauchy('mlocB', beta=mtau, shape=(n_subs, ))
        muA = Deterministic('muA', mlocA / (1 + mlocA))
        muB = Deterministic('muB', 1 / (1 + mlocB))
        V0 = tt.stacklists([2 * muA - 1, 2 * muB - 1]).T

        #compute the choice values
        (Q, _) = scan(durw_model,
                      sequences=[inp],
                      outputs_info=V0,
                      non_sequences=[alpha, kappa, range(n_subs)],
                      name='rw')

        V0 = Deterministic('V0', V0[None, :, :])
        V = Deterministic('V', tt.concatenate([V0, Q[:-1]]))

        #set hierarchical prior for response biases
        ctau = HalfCauchy('ctau', beta=1)
        cloc = HalfCauchy('cloc', beta=ctau, shape=(n_subs, ))
        c0 = Deterministic('c0', cloc / (1 + cloc))

        #compute response noise and response bias modulated response values
        G = Deterministic(
            'G', beta[None, :, None] * V + log([c0, 1 - c0]).T[None, :, :])

        #compute response probability for the prereversal and the reversal phase of the experiment
        nzero = tt.nonzero(~nans[:last])
        p = Deterministic('p', tt.nnet.softmax(G[:last][nzero]))

        #set observation likelihood of responses
        Categorical('obs', p=p, observed=inp[:last, :, 0][~nans[:last]])

    #fit the model
    with durw:
        approx = fit(method='advi', n=50000, progressbar=True)

    return approx
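
Both hierarchical models (Examples #43 and #48) map values to choice probabilities the same way: the value signal is scaled by a per-subject inverse temperature beta, a per-subject bias c0 enters as log([c0, 1 - c0]), and a softmax over the two options gives the response probabilities. A single-subject numpy sketch of that response model with illustrative parameter values:

import numpy as np

def choice_probs(V, beta, c0):
    # V: (trials, 2) option values; beta: inverse temperature; c0: bias towards the first option
    G = beta * V + np.log([c0, 1.0 - c0])          # as G = beta * V + log([c0, 1 - c0]) above
    G = G - G.max(axis=1, keepdims=True)           # numerically stable softmax
    e = np.exp(G)
    return e / e.sum(axis=1, keepdims=True)

V = np.array([[0.2, -0.1], [0.8, 0.3], [-0.4, 0.5]])
print(choice_probs(V, beta=3.0, c0=0.6))
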
Example #49
0
    def __init__(self,
                 rng,
                 input_source,
                 input_target,
                 label_source,
                 batch_size,
                 struct,
                 coef,
                 train=False,
                 init_params=None):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input_source: theano.tensor.TensorType
        :param input_source: symbolic variable that describes the "Source Domain" input of the architecture (one minibatch)

        :type input_target: theano.tensor.TensorType
        :param input_target: symbolic variable that describes the "Target Domain" input of the architecture (one minibatch)

        :type xxx_struct: class NN_struct
        :param xxx_struct: defines the structure of each NN
        """

        if train == True:
            batch_size[0] = batch_size[0] * coef.L
            batch_size[1] = batch_size[1] * coef.L
            tmp_S = input_source
            tmp_T = input_target
            tmp_l = label_source
            for i in range(coef.L - 1):
                tmp_S = T.concatenate([tmp_S, input_source], axis=0)
                tmp_T = T.concatenate([tmp_T, input_target], axis=0)
                tmp_l = T.concatenate([tmp_l, label_source], axis=0)
            input_source = tmp_S
            input_target = tmp_T
            label_source = tmp_l
            L = coef.L
        else:
            L = 1

        self.L = L

        self.struct = struct
        encoder1_struct = struct.encoder1
        encoder2_struct = struct.encoder2
        encoder3_struct = struct.encoder3
        encoder4_struct = struct.encoder4
        encoder5_struct = struct.encoder5
        decoder1_struct = struct.decoder1
        decoder2_struct = struct.decoder2
        decoder3_struct = struct.decoder3
        DoC_struct = struct.DomainClassifier

        alpha = coef.alpha
        beta = coef.beta
        optimize = coef.optimize

        if init_params == None:
            init_params = VLDF_ANN_params()

        #------------------------------------------------------------------------
        #Encoder 1 Neural Network: present q_\phi({z_y}_n | x_n, d_n)
        zero_v_S = T.zeros([batch_size[0], 1], dtype=theano.config.floatX)
        zero_v_T = T.zeros([batch_size[1], 1], dtype=theano.config.floatX)
        one_v_S = T.ones([batch_size[0], 1], dtype=theano.config.floatX)
        one_v_T = T.ones([batch_size[1], 1], dtype=theano.config.floatX)

        d_source = T.concatenate([zero_v_S, one_v_S], axis=1)
        xd_source = T.concatenate([input_source, d_source], axis=1)
        d_target = T.concatenate([one_v_T, zero_v_T], axis=1)
        xd_target = T.concatenate([input_target, d_target], axis=1)

        self.Encoder1 = nn.Gaussian_MLP(rng=rng,
                                        input_source=xd_source,
                                        input_target=xd_target,
                                        struct=encoder1_struct,
                                        batch_size=batch_size,
                                        params=init_params.EC1_params,
                                        name='Encoder1')

        zy_dim = encoder1_struct.mu.layer_dim[-1]
        self.EC_zy_S_mu = self.Encoder1.S_mu
        self.EC_zy_S_log_sigma = self.Encoder1.S_log_sigma
        self.EC_zy_S_sigma = T.exp(self.EC_zy_S_log_sigma)
        self.EC_zy_T_mu = self.Encoder1.T_mu
        self.EC_zy_T_log_sigma = self.Encoder1.T_log_sigma
        self.EC_zy_T_sigma = T.exp(self.EC_zy_T_log_sigma)

        self.zy_S = self.Encoder1.S_output
        self.zy_T = self.Encoder1.T_output

        self.Encoder1_params = self.Encoder1.params
        self.Encoder1_learning_rate = self.Encoder1.learning_rate
        self.Encoder1_decay = self.Encoder1.decay

        #------------------------------------------------------------------------
        #Encoder 5 Neural Network: present q_\phi(y_n | {z_y}_n)
        self.Encoder5_pi = nn.NN_Block(rng=rng,
                                       input_source=self.zy_S,
                                       input_target=self.zy_T,
                                       struct=encoder5_struct,
                                       params=init_params.EC5_params,
                                       name='Encoder5_pi')

        #Sample layer
        self.EC_5_CSL_target = nn.CatSampleLayer(
            pi=self.Encoder5_pi.output_target,
            n_in=encoder5_struct.layer_dim[-1],
            batch_size=batch_size[1])
        y_dim = encoder5_struct.layer_dim[-1]
        self.EC_y_S_pi = self.Encoder5_pi.output_source
        self.EC_y_T_pi = self.Encoder5_pi.output_target

        self.y_T = self.EC_5_CSL_target.output

        self.Encoder5_params = self.Encoder5_pi.params
        self.Encoder5_learning_rate = self.Encoder5_pi.learning_rate
        self.Encoder5_decay = self.Encoder5_pi.decay

        #------------------------------------------------------------------------
        #Encoder 3 Neural Network: present q_\phi({a_y}_n | {z_y}_n, y_n)
        #Input Append
        zyy_source = T.concatenate([self.zy_S, label_source], axis=1)
        zyy_target = T.concatenate([self.zy_T, self.y_T], axis=1)

        self.Encoder3 = nn.Gaussian_MLP(rng=rng,
                                        input_source=zyy_source,
                                        input_target=zyy_target,
                                        struct=encoder3_struct,
                                        batch_size=batch_size,
                                        params=init_params.EC3_params,
                                        name='Encoder3')

        ay_dim = encoder3_struct.mu.layer_dim[-1]
        self.EC_ay_S_mu = self.Encoder3.S_mu
        self.EC_ay_S_log_sigma = self.Encoder3.S_log_sigma
        self.EC_ay_S_sigma = T.exp(self.EC_ay_S_log_sigma)
        self.EC_ay_T_mu = self.Encoder3.T_mu
        self.EC_ay_T_log_sigma = self.Encoder3.T_log_sigma
        self.EC_ay_T_sigma = T.exp(self.EC_ay_T_log_sigma)

        self.ay_S = self.Encoder3.S_output
        self.ay_T = self.Encoder3.T_output

        self.Encoder3_params = self.Encoder3.params
        self.Encoder3_learning_rate = self.Encoder3.learning_rate
        self.Encoder3_decay = self.Encoder3.decay

        #------------------------------------------------------------------------
        #Encoder 2 Neural Network: present q_\phi({z_d}_n | x_n, d_n)
        self.Encoder2 = nn.Gaussian_MLP(rng=rng,
                                        input_source=xd_source,
                                        input_target=xd_target,
                                        struct=encoder2_struct,
                                        batch_size=batch_size,
                                        params=init_params.EC2_params,
                                        name='Encoder2')

        zd_dim = encoder2_struct.mu.layer_dim[-1]
        self.EC_zd_S_mu = self.Encoder2.S_mu
        self.EC_zd_S_log_sigma = self.Encoder2.S_log_sigma
        self.EC_zd_S_sigma = T.exp(self.EC_zd_S_log_sigma)
        self.EC_zd_T_mu = self.Encoder2.T_mu
        self.EC_zd_T_log_sigma = self.Encoder2.T_log_sigma
        self.EC_zd_T_sigma = T.exp(self.EC_zd_T_log_sigma)

        self.zd_S = self.Encoder2.S_output
        self.zd_T = self.Encoder2.T_output

        self.Encoder2_params = self.Encoder2.params
        self.Encoder2_learning_rate = self.Encoder2.learning_rate
        self.Encoder2_decay = self.Encoder2.decay

        #------------------------------------------------------------------------
        #Encoder 4 Neural Network: present q_\phi({a_d}_n | {z_d}_n, d_n)
        #Input Append
        zdd_source = T.concatenate([self.zd_S, d_source], axis=1)
        zdd_target = T.concatenate([self.zd_T, d_target], axis=1)

        self.Encoder4 = nn.Gaussian_MLP(rng=rng,
                                        input_source=zdd_source,
                                        input_target=zdd_target,
                                        struct=encoder4_struct,
                                        batch_size=batch_size,
                                        params=init_params.EC4_params,
                                        name='Encoder4')

        ad_dim = encoder4_struct.mu.layer_dim[-1]
        self.EC_ad_S_mu = self.Encoder4.S_mu
        self.EC_ad_S_log_sigma = self.Encoder4.S_log_sigma
        self.EC_ad_S_sigma = T.exp(self.EC_ad_S_log_sigma)
        self.EC_ad_T_mu = self.Encoder4.T_mu
        self.EC_ad_T_log_sigma = self.Encoder4.T_log_sigma
        self.EC_ad_T_sigma = T.exp(self.EC_ad_T_log_sigma)

        self.ad_S = self.Encoder4.S_output
        self.ad_T = self.Encoder4.T_output

        self.Encoder4_params = self.Encoder4.params
        self.Encoder4_learning_rate = self.Encoder4.learning_rate
        self.Encoder4_decay = self.Encoder4.decay

        #------------------------------------------------------------------------
        #Decoder 1 Neural Network: present p_\theta(x_n | {z_y}_n, {z_d}_n)
        zyzd_source = T.concatenate([self.zy_S, self.zd_S], axis=1)
        zyzd_target = T.concatenate([self.zy_T, self.zd_T], axis=1)

        self.Decoder1 = nn.Gaussian_MLP(rng=rng,
                                        input_source=zyzd_source,
                                        input_target=zyzd_target,
                                        struct=decoder1_struct,
                                        batch_size=batch_size,
                                        params=init_params.DC1_params,
                                        name='Decoder1')

        x_dim = decoder1_struct.mu.layer_dim[-1]
        self.DC_x_S_mu = self.Decoder1.S_mu
        self.DC_x_S_log_sigma = self.Decoder1.S_log_sigma
        self.DC_x_S_sigma = T.exp(self.DC_x_S_log_sigma)
        self.DC_x_T_mu = self.Decoder1.T_mu
        self.DC_x_T_log_sigma = self.Decoder1.T_log_sigma
        self.DC_x_T_sigma = T.exp(self.DC_x_T_log_sigma)

        self.Decoder1_params = self.Decoder1.params
        self.Decoder1_learning_rate = self.Decoder1.learning_rate
        self.Decoder1_decay = self.Decoder1.decay

        #------------------------------------------------------------------------
        #Decoder 2 Neural Network: present p_\theta({z_y}_n | {a_y}_n, y_n)
        ayy_source = T.concatenate([self.ay_S, label_source], axis=1)
        ayy_target = T.concatenate([self.ay_T, self.y_T], axis=1)

        self.Decoder2 = nn.Gaussian_MLP(rng=rng,
                                        input_source=ayy_source,
                                        input_target=ayy_target,
                                        struct=decoder2_struct,
                                        batch_size=batch_size,
                                        params=init_params.DC2_params,
                                        name='Decoder2')

        self.DC_zy_S_mu = self.Decoder2.S_mu
        self.DC_zy_S_log_sigma = self.Decoder2.S_log_sigma
        self.DC_zy_S_sigma = T.exp(self.DC_zy_S_log_sigma)
        self.DC_zy_T_mu = self.Decoder2.T_mu
        self.DC_zy_T_log_sigma = self.Decoder2.T_log_sigma
        self.DC_zy_T_sigma = T.exp(self.DC_zy_T_log_sigma)

        self.Decoder2_params = self.Decoder2.params
        self.Decoder2_learning_rate = self.Decoder2.learning_rate
        self.Decoder2_decay = self.Decoder2.decay

        #------------------------------------------------------------------------
        #Decoder 3 Neural Network: present p_\theta({z_d}_n | {a_d}_n, d_n)
        add_source = T.concatenate([self.ad_S, d_source], axis=1)
        add_target = T.concatenate([self.ad_T, d_target], axis=1)

        self.Decoder3 = nn.Gaussian_MLP(rng=rng,
                                        input_source=add_source,
                                        input_target=add_target,
                                        struct=decoder3_struct,
                                        batch_size=batch_size,
                                        params=init_params.DC3_params,
                                        name='Decoder3')

        self.DC_zd_S_mu = self.Decoder3.S_mu
        self.DC_zd_S_log_sigma = self.Decoder3.S_log_sigma
        self.DC_zd_S_sigma = T.exp(self.DC_zd_S_log_sigma)
        self.DC_zd_T_mu = self.Decoder3.T_mu
        self.DC_zd_T_log_sigma = self.Decoder3.T_log_sigma
        self.DC_zd_T_sigma = T.exp(self.DC_zd_T_log_sigma)

        self.Decoder3_params = self.Decoder3.params
        self.Decoder3_learning_rate = self.Decoder3.learning_rate
        self.Decoder3_decay = self.Decoder3.decay

        #------------------------------------------------------------------------
        #Domain Classifier Neural Network: present p_\varphi(d=0|z_y)
        self.DomainClassifier = nn.NN_Block(rng=rng,
                                            input_source=self.zy_S,
                                            input_target=self.zy_T,
                                            struct=DoC_struct,
                                            params=init_params.DoC_params,
                                            name='DomainClassifier')

        self.DoC_output_S = self.DomainClassifier.output_source
        self.DoC_output_T = self.DomainClassifier.output_target

        self.DoC_params = self.DomainClassifier.params
        self.DoC_learning_rate = self.DomainClassifier.learning_rate
        self.DoC_decay = self.DomainClassifier.decay

        #------------------------------------------------------------------------
        # Error Function Set
        # KL(q(zy)||p(zy)) -----------
        self.KL_zy_source = er.KLGaussianGaussian(
            self.EC_zy_S_mu, self.EC_zy_S_log_sigma, self.DC_zy_S_mu,
            self.DC_zy_S_log_sigma).sum()
        self.KL_zy_target = er.KLGaussianGaussian(
            self.EC_zy_T_mu, self.EC_zy_T_log_sigma, self.DC_zy_T_mu,
            self.DC_zy_T_log_sigma).sum()

        # KL(q(zd)||p(zd)) -----------
        self.KL_zd_source = er.KLGaussianGaussian(
            self.EC_zd_S_mu, self.EC_zd_S_log_sigma, self.DC_zd_S_mu,
            self.DC_zd_S_log_sigma).sum()
        self.KL_zd_target = er.KLGaussianGaussian(
            self.EC_zd_T_mu, self.EC_zd_T_log_sigma, self.DC_zd_T_mu,
            self.DC_zd_T_log_sigma).sum()

        # KL(q(ay)||p(ay)) -----------
        self.KL_ay_source = er.KLGaussianStdGaussian(
            self.EC_ay_S_mu, self.EC_ay_S_log_sigma).sum()
        self.KL_ay_target = er.KLGaussianStdGaussian(
            self.EC_ay_T_mu, self.EC_ay_T_log_sigma).sum()

        # KL(q(ad)||p(ad)) -----------
        self.KL_ad_source = er.KLGaussianStdGaussian(
            self.EC_ad_S_mu, self.EC_ad_S_log_sigma).sum()
        self.KL_ad_target = er.KLGaussianStdGaussian(
            self.EC_ad_T_mu, self.EC_ad_T_log_sigma).sum()

        # KL(q(y)||p(y)) only target data-----------
        # prior of y is set to 1/K, K is category number
        threshold = 0.0000001
        pi_0 = T.ones([batch_size[1], y_dim],
                      dtype=theano.config.floatX) / y_dim
        self.KL_y_target = T.sum(
            -self.EC_y_T_pi *
            T.log(T.maximum(self.EC_y_T_pi / pi_0, threshold)),
            axis=1).sum()

        # Likelihood q(y) only source data-----------
        self.LH_y_source = -T.sum(
            -label_source * T.log(T.maximum(self.EC_y_S_pi, threshold)),
            axis=1).sum()
        #self.LH_y_source = T.nnet.nnet.categorical_crossentropy(self.EC_y_S_pi, label_source)

        # Likelihood p(x) ----------- if gaussian
        self.LH_x_source = er.LogGaussianPDF(input_source, self.DC_x_S_mu,
                                             self.DC_x_S_log_sigma).sum()
        self.LH_x_target = er.LogGaussianPDF(input_target, self.DC_x_T_mu,
                                             self.DC_x_T_log_sigma).sum()

        # Domain classification error  smaller, better
        self.DoC_error_S = T.sum(
            -d_source * T.log(T.maximum(self.DoC_output_S, threshold)),
            axis=1).sum()
        self.DoC_error_T = T.sum(
            -d_target * T.log(T.maximum(self.DoC_output_T, threshold)),
            axis=1).sum()
        self.DoC_error = self.DoC_error_S + self.DoC_error_T

        #Cost function
        LM_tmp = self.KL_zy_source + self.KL_zy_target + self.KL_ay_source + self.KL_ay_target \
            + self.KL_zd_source + self.KL_zd_target + self.KL_ad_source + self.KL_ad_target \
            + self.LH_x_source + self.LH_x_target+ self.KL_y_target + self.LH_y_source * alpha
        self.LM_cost = -LM_tmp / (batch_size[0] + batch_size[1])

        DoC_tmp = self.DoC_error
        self.DoC_cost = -DoC_tmp.mean() / (batch_size[0] +
                                           batch_size[1]) * beta

        self.cost = self.LM_cost + self.DoC_cost

        # the parameters of the model
        self.LM_params = self.Encoder1_params + self.Encoder2_params + self.Encoder3_params \
                    + self.Encoder4_params + self.Encoder5_params \
                    + self.Decoder1_params + self.Decoder2_params + self.Decoder3_params
        self.params = self.LM_params + self.DoC_params

        self.LM_learning_rate = self.Encoder1_learning_rate + self.Encoder2_learning_rate + self.Encoder3_learning_rate \
                    + self.Encoder4_learning_rate + self.Encoder5_learning_rate \
                    + self.Decoder1_learning_rate + self.Decoder2_learning_rate + self.Decoder3_learning_rate
        self.learning_rate = self.LM_learning_rate + self.DoC_learning_rate

        self.LM_decay = self.Encoder1_decay + self.Encoder2_decay + self.Encoder3_decay \
                    + self.Encoder4_decay + self.Encoder5_decay \
                    + self.Decoder1_decay + self.Decoder2_decay + self.Decoder3_decay
        self.decay = self.LM_decay + self.DoC_decay

        if optimize == 'Adam_update':
            #Adam update function
            self.LM_updates = nn.adam(loss=self.cost,
                                      all_params=self.LM_params,
                                      all_learning_rate=self.LM_learning_rate)

            self.DoC_updates = nn.adam(
                loss=-self.DoC_cost,
                all_params=self.DoC_params,
                all_learning_rate=self.DoC_learning_rate)
        elif optimize == 'SGD':
            #Standard update function
            LM_gparams = [T.grad(self.cost, param) for param in self.LM_params]

            self.LM_params_updates = [
                (LM_param, LM_param - learning_rate * LM_gparam)
                for LM_param, LM_gparam, learning_rate in zip(
                    self.LM_params, LM_gparams, self.LM_learning_rate)
            ]

            self.LM_learning_rate_update = [
                (learning_rate, learning_rate * decay)
                for learning_rate, decay in zip(self.LM_learning_rate,
                                                self.LM_decay)
            ]

            self.LM_updates = self.LM_params_updates + self.LM_learning_rate_update

            DoC_gparams = [
                T.grad(-self.DoC_cost, param) for param in self.DoC_params
            ]

            self.DoC_params_updates = [
                (DoC_param, DoC_param - learning_rate * DoC_gparam)
                for DoC_param, DoC_gparam, learning_rate in zip(
                    self.DoC_params, DoC_gparams, self.DoC_learning_rate)
            ]

            self.DoC_learning_rate_update = [
                (learning_rate, learning_rate * decay)
                for learning_rate, decay in zip(self.DoC_learning_rate,
                                                self.DoC_decay)
            ]

            self.DoC_updates = self.DoC_params_updates + self.DoC_learning_rate_update
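            # Note: every parameter carries its own shared learning rate; each SGD update
            # also multiplies that rate by its matching decay factor, so the schedule is a
            # per-parameter geometric decay.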

        # keep track of model input
        self.input_source = input_source
        self.input_target = input_target

        #Predict Label
        self.y_pred_source = T.argmax(self.EC_y_S_pi, axis=1)
        self.y_pred_target = T.argmax(self.EC_y_T_pi, axis=1)
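The KL terms in the cost above are computed by helpers such as er.KLGaussianGaussian and er.KLGaussianStdGaussian. A minimal NumPy sketch of the closed-form KL divergence between diagonal Gaussians they presumably implement (hypothetical helper names, log-sigma parameterisation as in the snippet):

import numpy as np

def kl_gaussian_gaussian(mu1, log_sigma1, mu2, log_sigma2):
    # Elementwise KL(N(mu1, sigma1^2) || N(mu2, sigma2^2)) for diagonal Gaussians.
    return (log_sigma2 - log_sigma1
            + (np.exp(2 * log_sigma1) + (mu1 - mu2) ** 2) / (2 * np.exp(2 * log_sigma2))
            - 0.5)

def kl_gaussian_std_gaussian(mu, log_sigma):
    # Special case against a standard-normal prior N(0, I).
    return kl_gaussian_gaussian(mu, log_sigma, np.zeros_like(mu), np.zeros_like(log_sigma))

mu_q, ls_q = np.array([0.5, -0.2]), np.array([0.1, -0.3])
mu_p, ls_p = np.array([0.0, 0.0]), np.array([0.0, 0.0])
print(kl_gaussian_gaussian(mu_q, ls_q, mu_p, ls_p).sum())  # summed over dimensions, as with .sum() above
print(kl_gaussian_std_gaussian(mu_q, ls_q).sum())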
Example #50
0
def build_model(tparams, options):
    x = T.matrix('x', dtype=config.floatX)
    d = T.matrix('d', dtype=config.floatX)
    y = T.matrix('y', dtype=config.floatX)
    mask = T.vector('mask', dtype=config.floatX)

    logEps = options['logEps']

    emb = T.maximum(T.dot(x, tparams['W_emb']) + tparams['b_emb'], 0)
    if options['demoSize'] > 0: emb = T.concatenate((emb, d), axis=1)
    visit = T.maximum(T.dot(emb, tparams['W_hidden']) + tparams['b_hidden'], 0)
    results = T.nnet.softmax(
        T.dot(visit, tparams['W_output']) + tparams['b_output'])

    mask1 = (mask[:-1] * mask[1:])[:, None]
    mask2 = (mask[:-2] * mask[1:-1] * mask[2:])[:, None]
    mask3 = (mask[:-3] * mask[1:-2] * mask[2:-1] * mask[3:])[:, None]
    mask4 = (mask[:-4] * mask[1:-3] * mask[2:-2] * mask[3:-1] * mask[4:])[:,
                                                                          None]
    mask5 = (mask[:-5] * mask[1:-4] * mask[2:-3] * mask[3:-2] * mask[4:-1] *
             mask[5:])[:, None]

    t = None
    if options['numYcodes'] > 0: t = y
    else: t = x

    forward_results = results[:-1] * mask1
    forward_cross_entropy = -(
        t[1:] * T.log(forward_results + logEps) +
        (1. - t[1:]) * T.log(1. - forward_results + logEps))

    forward_results2 = results[:-2] * mask2
    forward_cross_entropy2 = -(
        t[2:] * T.log(forward_results2 + logEps) +
        (1. - t[2:]) * T.log(1. - forward_results2 + logEps))

    forward_results3 = results[:-3] * mask3
    forward_cross_entropy3 = -(
        t[3:] * T.log(forward_results3 + logEps) +
        (1. - t[3:]) * T.log(1. - forward_results3 + logEps))

    forward_results4 = results[:-4] * mask4
    forward_cross_entropy4 = -(
        t[4:] * T.log(forward_results4 + logEps) +
        (1. - t[4:]) * T.log(1. - forward_results4 + logEps))

    forward_results5 = results[:-5] * mask5
    forward_cross_entropy5 = -(
        t[5:] * T.log(forward_results5 + logEps) +
        (1. - t[5:]) * T.log(1. - forward_results5 + logEps))

    backward_results = results[1:] * mask1
    backward_cross_entropy = -(
        t[:-1] * T.log(backward_results + logEps) +
        (1. - t[:-1]) * T.log(1. - backward_results + logEps))

    backward_results2 = results[2:] * mask2
    backward_cross_entropy2 = -(
        t[:-2] * T.log(backward_results2 + logEps) +
        (1. - t[:-2]) * T.log(1. - backward_results2 + logEps))

    backward_results3 = results[3:] * mask3
    backward_cross_entropy3 = -(
        t[:-3] * T.log(backward_results3 + logEps) +
        (1. - t[:-3]) * T.log(1. - backward_results3 + logEps))

    backward_results4 = results[4:] * mask4
    backward_cross_entropy4 = -(
        t[:-4] * T.log(backward_results4 + logEps) +
        (1. - t[:-4]) * T.log(1. - backward_results4 + logEps))

    backward_results5 = results[5:] * mask5
    backward_cross_entropy5 = -(
        t[:-5] * T.log(backward_results5 + logEps) +
        (1. - t[:-5]) * T.log(1. - backward_results5 + logEps))

    visit_cost1 = (forward_cross_entropy.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy.sum(axis=1).sum(axis=0)) / (
                       mask1.sum() + logEps)
    visit_cost2 = (forward_cross_entropy2.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy2.sum(axis=1).sum(axis=0)) / (
                       mask2.sum() + logEps)
    visit_cost3 = (forward_cross_entropy3.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy3.sum(axis=1).sum(axis=0)) / (
                       mask3.sum() + logEps)
    visit_cost4 = (forward_cross_entropy4.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy4.sum(axis=1).sum(axis=0)) / (
                       mask4.sum() + logEps)
    visit_cost5 = (forward_cross_entropy5.sum(axis=1).sum(axis=0) +
                   backward_cross_entropy5.sum(axis=1).sum(axis=0)) / (
                       mask5.sum() + logEps)

    windowSize = options['windowSize']
    visit_cost = visit_cost1
    if windowSize == 2:
        visit_cost = visit_cost1 + visit_cost2
    elif windowSize == 3:
        visit_cost = visit_cost1 + visit_cost2 + visit_cost3
    elif windowSize == 4:
        visit_cost = visit_cost1 + visit_cost2 + visit_cost3 + visit_cost4
    elif windowSize == 5:
        visit_cost = visit_cost1 + visit_cost2 + visit_cost3 + visit_cost4 + visit_cost5

    iVector = T.vector('iVector', dtype='int32')
    jVector = T.vector('jVector', dtype='int32')
    preVec = T.maximum(tparams['W_emb'], 0)
    norms = (T.exp(T.dot(preVec, preVec.T))).sum(axis=1)
    emb_cost = -T.log((T.exp((preVec[iVector] * preVec[jVector]).sum(axis=1)) /
                       norms[iVector]) + logEps)

    total_cost = visit_cost + T.mean(
        emb_cost) + options['L2_reg'] * (tparams['W_emb']**2).sum()
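    # emb_cost is a softmax-style co-occurrence loss over the (non-negative) code embeddings:
    # norms[i] is the partition term sum_j exp(w_i . w_j), so each (iVector, jVector) pair
    # contributes -log p(code j | code i); logEps guards against log(0).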

    if options['demoSize'] > 0 and options['numYcodes'] > 0:
        return x, d, y, mask, iVector, jVector, total_cost
    elif options['demoSize'] == 0 and options['numYcodes'] > 0:
        return x, y, mask, iVector, jVector, total_cost
    elif options['demoSize'] > 0 and options['numYcodes'] == 0:
        return x, d, mask, iVector, jVector, total_cost
    else:
        return x, mask, iVector, jVector, total_cost
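The sliding masks in build_model flag positions where an entire 1- to 5-step visit pair lies inside the unpadded part of the sequence. A small NumPy illustration with hypothetical mask values:

import numpy as np

# mask[t] = 1 for a real visit, 0 for padding, as in build_model above.
mask = np.array([1., 1., 1., 1., 0., 0.])

mask1 = (mask[:-1] * mask[1:])[:, None]               # pairs (t, t+1)
mask2 = (mask[:-2] * mask[1:-1] * mask[2:])[:, None]  # pairs (t, t+2) with a valid middle visit

print(mask1.ravel())  # [1. 1. 1. 0. 0.] -> only the first three adjacent pairs count
print(mask2.ravel())  # [1. 1. 0. 0.]    -> only pairs fully inside the unpadded prefix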
Example #51
0
    def __theano_build__(self):
        E, V, U, W, b, c, W_att, b_att = self.E, self.V, self.U, self.W, self.b, self.c, self.W_att, self.b_att

        x_a = T.ivector('x_a')
        x_b = T.ivector('x_b')
        y = T.lvector('y')

        def forward_direction_step(x_t, s_t_prev):
            # Word embedding layer
            x_e = E[:, x_t]
            # GRU layer 1
            # gate biases moved inside the nonlinearity so z_t, r_t stay in [0, 1]
            z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) +
                                      W[0].dot(s_t_prev) + b[0])
            r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) +
                                      W[1].dot(s_t_prev) + b[1])
            c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t) + b[2])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            # directly return the hidden state as intermediate output
            return [s_t]

        def backward_direction_step(x_t, s_t_prev):
            # Word embedding layer
            x_e = E[:, x_t]
            # GRU layer 2
            # gate biases moved inside the nonlinearity so z_t, r_t stay in [0, 1]
            z_t = T.nnet.hard_sigmoid(U[3].dot(x_e) +
                                      W[3].dot(s_t_prev) + b[3])
            r_t = T.nnet.hard_sigmoid(U[4].dot(x_e) +
                                      W[4].dot(s_t_prev) + b[4])
            c_t = T.tanh(U[5].dot(x_e) + W[5].dot(s_t_prev * r_t) + b[5])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            # directly return the hidden state as intermediate output
            return [s_t]

        # sentence a vector (states) forward direction
        a_s_f, updates = theano.scan(forward_direction_step,
                                     sequences=x_a,
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states) backward direction
        a_s_b, updates = theano.scan(backward_direction_step,
                                     sequences=x_a[::-1],
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states) forward direction
        b_s_f, updates = theano.scan(forward_direction_step,
                                     sequences=x_b,
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states) backward direction
        b_s_b, updates = theano.scan(backward_direction_step,
                                     sequences=x_b[::-1],
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # combine forward and reversed backward states into a bidirectional representation for each sentence
        a_s = T.concatenate([a_s_f, a_s_b[::-1]], axis=1)
        b_s = T.concatenate([b_s_f, b_s_b[::-1]], axis=1)

        def soft_attention(h_i):
            return T.tanh(W_att.dot(h_i) + b_att)

        def weight_attention(h_i, a_j):
            return h_i * a_j

        a_att, updates = theano.scan(soft_attention, sequences=a_s)
        b_att, updates = theano.scan(soft_attention, sequences=b_s)

        # softmax
        # a_att = (59,1)
        # b_att = (58,1)
        a_att = T.exp(a_att)
        a_att = a_att.flatten()
        a_att = a_att / a_att.sum()

        b_att = T.exp(b_att)
        b_att = b_att.flatten()
        b_att = b_att / b_att.sum()

        a_s_att, updates = theano.scan(weight_attention,
                                       sequences=[a_s, a_att])
        b_s_att, updates = theano.scan(weight_attention,
                                       sequences=[b_s, b_att])
        # eps = np.asarray([1.0e-10]*self.label_dim,dtype=theano.config.floatX)

        # semantic similarity
        # s_sim = manhattan_distance(a_s[-1],b_s[-1])

        # For classification we use a simple strategy: score every word state with a
        # one-layer attention, a(w_i) = tanh(W_att.dot(w_i) + b_att) (computed with
        # theano.scan above), softmax the exponentiated scores over the sentence, and
        # use the attention-weighted sum of the states as the sentence vector.
        sena = a_s_att.sum(axis=0)
        senb = b_s_att.sum(axis=0)

        combined_s = T.concatenate([sena, senb], axis=0)

        # softmax class
        o = T.nnet.softmax(V.dot(combined_s) + c)[0]

        # in case the o contains 0 which cause inf and nan
        eps = np.asarray([1.0e-10] * self.label_dim,
                         dtype=theano.config.floatX)
        o = o + eps
        om = o.reshape((1, o.shape[0]))
        prediction = T.argmax(om, axis=1)
        o_error = T.nnet.categorical_crossentropy(om, y)

        # cost
        cost = T.sum(o_error)

        # updates
        updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost)

        # monitor parameter
        mV = V * T.ones_like(V)
        mc = c * T.ones_like(c)
        mU = U * T.ones_like(U)
        mW = W * T.ones_like(W)

        gV = T.grad(cost, V)
        gc = T.grad(cost, c)
        gU = T.grad(cost, U)
        gW = T.grad(cost, W)

        mgV = gV * T.ones_like(gV)
        mgc = gc * T.ones_like(gc)
        mgU = gU * T.ones_like(gU)
        mgW = gW * T.ones_like(gW)

        # Assign functions
        self.comsen = theano.function([x_a, x_b], [a_att, b_att])
        self.monitor = theano.function([x_a, x_b],
                                       [sena, senb, mV, mc, mU, mW])
        self.monitor_grad = theano.function([x_a, x_b, y],
                                            [mgV, mgc, mgU, mgW])
        self.predict = theano.function([x_a, x_b], om)
        self.predict_class = theano.function([x_a, x_b], prediction)
        self.ce_error = theano.function([x_a, x_b, y], cost)
        # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        # find the nan
        self.sgd_step = theano.function(
            [x_a, x_b, y], [],
            updates=updates
            # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        )
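The attention in this example scores each bidirectional hidden state with tanh(W_att.dot(h) + b_att), exponentiates, renormalises over the time steps, and takes the weighted sum as the sentence vector. A compact NumPy sketch of that normalisation (shapes and weights are purely illustrative):

import numpy as np

rng = np.random.RandomState(0)
a_s = rng.randn(7, 16)                        # 7 time steps, 16-dim bidirectional states
W_att, b_att = rng.randn(1, 16), np.zeros(1)

scores = np.tanh(a_s.dot(W_att.T) + b_att)    # (7, 1) per-word attention scores
a_att = np.exp(scores).ravel()
a_att = a_att / a_att.sum()                   # softmax over time steps

sena = (a_s * a_att[:, None]).sum(axis=0)     # attention-weighted sentence vector
print(a_att.sum())                            # 1.0 (normalised)
print(sena.shape)                             # (16,)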
Example #52
0
 def fprop(self, *args):
     self.out = TT.concatenate(args, axis=self.axis)
     return self.out
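This layer simply forwards its inputs to TT.concatenate along a fixed axis. A hedged usage sketch of how the axis argument behaves, assuming a standard Theano install:

import numpy as np
import theano
import theano.tensor as TT

a = TT.matrix('a')
b = TT.matrix('b')
cat_rows = theano.function([a, b], TT.concatenate([a, b], axis=0))
cat_cols = theano.function([a, b], TT.concatenate([a, b], axis=1))

x = np.ones((2, 3), dtype=theano.config.floatX)
y = np.zeros((2, 3), dtype=theano.config.floatX)
print(cat_rows(x, y).shape)   # (4, 3): stacked along rows
print(cat_cols(x, y).shape)   # (2, 6): stacked along columns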
Example #53
0
    def __theano_build__(self):
        E = self.E
        W = self.W
        U = self.U
        V = self.V
        b = self.b
        c = self.c

        x = T.lvector('x') #
        y = T.lvector('y') #

        def forward_prop_step(x_t, h_t_prev, c_t_prev):

            # Word embedding layer
            x_e = E[:, x_t]

            i_t = T.nnet.sigmoid(W[0].dot(x_e) + U[0].dot(h_t_prev) + b[0])
            f_t = T.nnet.sigmoid(W[1].dot(x_e) + U[1].dot(h_t_prev) + b[1])
            o_t = T.nnet.sigmoid(W[2].dot(x_e) + U[2].dot(h_t_prev) + b[2])
            u_t = T.tanh(W[3].dot(x_e) + U[3].dot(h_t_prev) + b[3])

            c_t = i_t*u_t + f_t * c_t_prev
            h_t = o_t * T.tanh(c_t)

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            # o = T.nnet.softmax(V.dot(h_t) + c)[0]
            # o = T.nnet.softmax(V[0].dot(h_t) + c)
            return [h_t, c_t]

        [h_t, c_t], updates = theano.scan(fn=forward_prop_step,
                                             sequences=x,
                                             truncate_gradient=self.bptt_truncate,
                                             outputs_info=[
                                                           dict(initial=T.zeros(self.hidden_dim)),
                                                           dict(initial=T.zeros(self.hidden_dim))
                                                           ])
        # h_t is a sequence whose element h_t[t] is the hidden state at time step t;
        # we only care about the state at the final time step.

        def forward_prop_step_b(x_t, h_t_prev_b, c_t_prev_b):
            # the backward

            # Word embedding layer
            x_e_b = E[:, x_t]

            i_t_b = T.nnet.sigmoid(W[4].dot(x_e_b) + U[4].dot(h_t_prev_b) + b[4])
            f_t_b = T.nnet.sigmoid(W[5].dot(x_e_b) + U[5].dot(h_t_prev_b) + b[5])
            o_t_b = T.nnet.sigmoid(W[6].dot(x_e_b) + U[6].dot(h_t_prev_b) + b[6])
            u_t_b = T.tanh(W[7].dot(x_e_b) + U[7].dot(h_t_prev_b) + b[7])

            c_t_b = i_t_b * u_t_b + f_t_b * c_t_prev_b
            h_t_b = o_t_b * T.tanh(c_t_b)

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            # o = T.nnet.softmax(V.dot(h_t) + c)[0]
            # o_b = T.nnet.softmax(V[1].dot(h_t) + c)
            return [h_t_b, c_t_b]

        [h_t_b, c_t_b], updates = theano.scan(fn=forward_prop_step_b,
                                                   sequences=x[::-1],
                                                   truncate_gradient=self.bptt_truncate,
                                                   outputs_info=[dict(initial=T.zeros(self.hidden_dim)),
                                                                 dict(initial=T.zeros(self.hidden_dim))])


        final_h = h_t[-1]
        final_h_b = h_t_b[-1]
        final_h_concat = T.concatenate([final_h,final_h_b], axis=0)
        final_o = T.nnet.softmax(V[0].dot(final_h_concat) + c) # an array with one row


        prediction = T.argmax(final_o[0], axis=0)
        print('final_o', final_o.ndim)
        print('y ', y.ndim)
        final_o_error = T.sum(T.nnet.categorical_crossentropy(final_o, y))

        cost = final_o_error

        # gradient
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # function
        self.predict = theano.function([x], final_o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x,y], cost)

        # SGD parameters
        learning_rate = T.scalar('learning_rate')

        self.sgd_step = theano.function([x,y,learning_rate],[],
                                        updates=[(self.U, self.U - learning_rate * dU),
                                                 (self.V, self.V - learning_rate * dV),
                                                 (self.W, self.W - learning_rate * dW),
                                                 (self.E, self.E - learning_rate * dE),
                                                 (self.b, self.b - learning_rate * db),
                                                 (self.c, self.c - learning_rate * dc)])
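forward_prop_step above is a standard LSTM cell. A self-contained NumPy version of a single step (random weights, purely illustrative) to make the gate equations explicit:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_e, h_prev, c_prev, W, U, b):
    # W, U, b hold the input/forget/output/candidate parameters, as in the scan step above.
    i_t = sigmoid(W[0].dot(x_e) + U[0].dot(h_prev) + b[0])
    f_t = sigmoid(W[1].dot(x_e) + U[1].dot(h_prev) + b[1])
    o_t = sigmoid(W[2].dot(x_e) + U[2].dot(h_prev) + b[2])
    u_t = np.tanh(W[3].dot(x_e) + U[3].dot(h_prev) + b[3])
    c_t = i_t * u_t + f_t * c_prev
    h_t = o_t * np.tanh(c_t)
    return h_t, c_t

rng = np.random.RandomState(0)
emb_dim, hid = 8, 4
W = rng.randn(4, hid, emb_dim) * 0.1
U = rng.randn(4, hid, hid) * 0.1
b = np.zeros((4, hid))
h, c = lstm_step(rng.randn(emb_dim), np.zeros(hid), np.zeros(hid), W, U, b)
print(h.shape)   # (4,)
print(c.shape)   # (4,)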
def main(args):
    
    #theano.optimizer='fast_compile'
    #theano.config.exception_verbosity='high'
    

    trial = int(args['trial'])
    pkl_name = 'vrnn_gmm_%d' % trial
    channel_name = 'mse'

    data_path = args['data_path']
    save_path = args['save_path']#+'/aggVSdisag_distrib/'+datetime.datetime.now().strftime("%y-%m-%d_%H-%M")
    period = int(args['period'])
    n_steps = int(args['n_steps'])
    stride_train = int(args['stride_train'])
    stride_test = n_steps
    typeLoad = int(args['typeLoad'])

    flgMSE = int(args['flgMSE'])
    monitoring_freq = int(args['monitoring_freq'])
    epoch = int(args['epoch'])
    batch_size = int(args['batch_size'])
    x_dim = int(args['x_dim'])
    y_dim = int(args['y_dim'])
    z_dim = int(args['z_dim'])
    rnn_dim = int(args['rnn_dim'])
    k = int(args['num_k']) #a mixture of K Gaussian functions
    lr = float(args['lr'])
    origLR = lr
    debug = int(args['debug'])
    kSchedSamp = int(args['kSchedSamp'])

    print "trial no. %d" % trial
    print "batch size %d" % batch_size
    print "learning rate %f" % lr
    print "saving pkl file '%s'" % pkl_name
    print "to the save path '%s'" % save_path

    q_z_dim = 350
    p_z_dim = 400
    p_x_dim = 450
    x2s_dim = 400
    y2s_dim = 200
    z2s_dim = 350
    target_dim = k  # As the appliances are separated into theta_mu1, theta_mu2, etc., each one is modelled with a mixture of k Gaussians

    Xtrain, ytrain, Xval, yval, Xtest, ytest, reader = fetch_ukdale(data_path, windows, appliances,numApps=-1, period=period,n_steps= n_steps, 
                                              stride_train = stride_train, stride_test = stride_test,
                                              flgAggSumScaled = 1, flgFilterZeros = 1, typeLoad = typeLoad)

    instancesPlot = {0:[10]}
    #instancesPlot = reader.build_dict_instances_plot(listDates, batch_size, Xval.shape[0])
    
    train_data = UKdale(name='train',
                         prep='normalize',
                         cond=True,# False
                         #path=data_path,
                         inputX=Xtrain,
                         labels=ytrain)

    X_mean = train_data.X_mean
    X_std = train_data.X_std

    valid_data = UKdale(name='valid',
                         prep='normalize',
                         cond=True,# False
                         #path=data_path,
                         X_mean=X_mean,
                         X_std=X_std,
                         inputX=Xval,
                         labels = yval)

    test_data = UKdale(name='valid',
                         prep='normalize',
                         cond=True,# False
                         #path=data_path,
                         X_mean=X_mean,
                         X_std=X_std,
                         inputX=Xtest,
                         labels = ytest)

    init_W = InitCell('rand')
    init_U = InitCell('ortho')
    init_b = InitCell('zeros')
    init_b_sig = InitCell('const', mean=0.6)

    x, mask, y , y_mask = train_data.theano_vars()
    scheduleSamplingMask = T.fvector('schedMask')

    x.name = 'x_original'

    if debug:
        x.tag.test_value = np.zeros((15, batch_size, x_dim), dtype=np.float32)
        temp = np.ones((15, batch_size), dtype=np.float32)
        temp[:, -2:] = 0.
        mask.tag.test_value = temp

    """x_1 = FullyConnectedLayer(name='x_1',
                              parent=['x_t'],
                              parent_dim=[x_dim],
                              nout=x2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    y_1 = FullyConnectedLayer(name='y_1',
                              parent=['y_t'],
                              parent_dim=[y_dim],
                              nout=y2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    z_1 = FullyConnectedLayer(name='z_1',
                              parent=['z_t'],
                              parent_dim=[z_dim],
                              nout=z2s_dim,
                              unit='relu',
                              init_W=init_W,
                              init_b=init_b)

    rnn = LSTM(name='rnn',
               parent=['x_1', 'z_1', 'y_1'],
               parent_dim=[x2s_dim, z2s_dim, y2s_dim],
               nout=rnn_dim,
               unit='tanh',
               init_W=init_W,
               init_U=init_U,
               init_b=init_b)

    phi_1 = FullyConnectedLayer(name='phi_1',
                                parent=['x_1', 's_tm1','y_1'],
                                parent_dim=[x2s_dim, rnn_dim, y2s_dim],
                                nout=q_z_dim,
                                unit='relu',
                                init_W=init_W,
                                init_b=init_b)

    phi_mu = FullyConnectedLayer(name='phi_mu',
                                 parent=['phi_1'],
                                 parent_dim=[q_z_dim],
                                 nout=z_dim,
                                 unit='linear',
                                 init_W=init_W,
                                 init_b=init_b)

    phi_sig = FullyConnectedLayer(name='phi_sig',
                                  parent=['phi_1'],
                                  parent_dim=[q_z_dim],
                                  nout=z_dim,
                                  unit='softplus',
                                  cons=1e-4,
                                  init_W=init_W,
                                  init_b=init_b_sig)

    prior_1 = FullyConnectedLayer(name='prior_1',
                                  parent=['x_1','s_tm1'],
                                  parent_dim=[x2s_dim,rnn_dim],
                                  nout=p_z_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    prior_mu = FullyConnectedLayer(name='prior_mu',
                                   parent=['prior_1'],
                                   parent_dim=[p_z_dim],
                                   nout=z_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    prior_sig = FullyConnectedLayer(name='prior_sig',
                                    parent=['prior_1'],
                                    parent_dim=[p_z_dim],
                                    nout=z_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    theta_1 = FullyConnectedLayer(name='theta_1',
                                  parent=['z_1', 's_tm1'],
                                  parent_dim=[z2s_dim, rnn_dim],
                                  nout=p_x_dim,
                                  unit='relu',
                                  init_W=init_W,
                                  init_b=init_b)

    theta_mu1 = FullyConnectedLayer(name='theta_mu1',
                                   parent=['theta_1'],
                                   parent_dim=[p_x_dim],
                                   nout=target_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    theta_mu2 = FullyConnectedLayer(name='theta_mu2',
                                   parent=['theta_1'],
                                   parent_dim=[p_x_dim],
                                   nout=target_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    theta_mu3 = FullyConnectedLayer(name='theta_mu3',
                                   parent=['theta_1'],
                                   parent_dim=[p_x_dim],
                                   nout=target_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    theta_mu4 = FullyConnectedLayer(name='theta_mu4',
                                   parent=['theta_1'],
                                   parent_dim=[p_x_dim],
                                   nout=target_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    theta_mu5 = FullyConnectedLayer(name='theta_mu5',
                                   parent=['theta_1'],
                                   parent_dim=[p_x_dim],
                                   nout=target_dim,
                                   unit='linear',
                                   init_W=init_W,
                                   init_b=init_b)

    theta_sig1 = FullyConnectedLayer(name='theta_sig1',
                                    parent=['theta_1'],
                                    parent_dim=[p_x_dim],
                                    nout=target_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    theta_sig2 = FullyConnectedLayer(name='theta_sig2',
                                    parent=['theta_1'],
                                    parent_dim=[p_x_dim],
                                    nout=target_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    theta_sig3 = FullyConnectedLayer(name='theta_sig3',
                                    parent=['theta_1'],
                                    parent_dim=[p_x_dim],
                                    nout=target_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    theta_sig4 = FullyConnectedLayer(name='theta_sig4',
                                    parent=['theta_1'],
                                    parent_dim=[p_x_dim],
                                    nout=target_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    theta_sig5 = FullyConnectedLayer(name='theta_sig5',
                                    parent=['theta_1'],
                                    parent_dim=[p_x_dim],
                                    nout=target_dim,
                                    unit='softplus',
                                    cons=1e-4,
                                    init_W=init_W,
                                    init_b=init_b_sig)

    coeff1 = FullyConnectedLayer(name='coeff1',
                                parent=['theta_1'],
                                parent_dim=[p_x_dim],
                                nout=k,
                                unit='softmax',
                                init_W=init_W,
                                init_b=init_b)

    coeff2 = FullyConnectedLayer(name='coeff2',
                                parent=['theta_1'],
                                parent_dim=[p_x_dim],
                                nout=k,
                                unit='softmax',
                                init_W=init_W,
                                init_b=init_b)

    coeff3 = FullyConnectedLayer(name='coeff3',
                                parent=['theta_1'],
                                parent_dim=[p_x_dim],
                                nout=k,
                                unit='softmax',
                                init_W=init_W,
                                init_b=init_b)
 
    coeff4 = FullyConnectedLayer(name='coeff4',
                                parent=['theta_1'],
                                parent_dim=[p_x_dim],
                                nout=k,
                                unit='softmax',
                                init_W=init_W,
                                init_b=init_b)

    coeff5 = FullyConnectedLayer(name='coeff5',
                                parent=['theta_1'],
                                parent_dim=[p_x_dim],
                                nout=k,
                                unit='softmax',
                                init_W=init_W,
                                init_b=init_b)"""

    fmodel = open('vrnn_gmm_disall_best.pkl', 'rb')
    mainloop = cPickle.load(fmodel)
    fmodel.close()

    #for node in mainloop.model.nodes:
    #  print(node.name)

    #define layers
    rnn = mainloop.model.nodes[0]
    x_1 = mainloop.model.nodes[1]
    y_1 = mainloop.model.nodes[2]
    z_1 = mainloop.model.nodes[3]
    phi_1 = mainloop.model.nodes[4]
    phi_mu = mainloop.model.nodes[5]
    phi_sig = mainloop.model.nodes[6]
    prior_1 = mainloop.model.nodes[7]
    prior_mu = mainloop.model.nodes[8]
    prior_sig = mainloop.model.nodes[9]
    theta_1 = mainloop.model.nodes[10]
    theta_mu1 = mainloop.model.nodes[11]
    theta_sig1 = mainloop.model.nodes[12]
    coeff1 = mainloop.model.nodes[13]

    nodes = [rnn,
             x_1, y_1,z_1, #dissag_pred,
             phi_1, phi_mu, phi_sig,
             prior_1, prior_mu, prior_sig,
             theta_1, theta_mu1, theta_sig1, coeff1]

    params = mainloop.model.params

    dynamicOutput = [None, None, None, None, None, None, None, None]
    #dynamicOutput_val = [None, None, None, None, None, None,None,  None, None]
    if (y_dim>1):
      theta_mu2 = mainloop.model.nodes[14]
      theta_sig2 = mainloop.model.nodes[15]
      coeff2 = mainloop.model.nodes[16]
      nodes = nodes + [theta_mu2, theta_sig2, coeff2]
      dynamicOutput = dynamicOutput+[None, None, None, None] #mu, sig, coef and pred
    if (y_dim>2):
      theta_mu3 = mainloop.model.nodes[17]
      theta_sig3 = mainloop.model.nodes[18]
      coeff3 = mainloop.model.nodes[19]
      nodes = nodes + [theta_mu3, theta_sig3, coeff3]
      dynamicOutput = dynamicOutput +[None, None, None, None]
    if (y_dim>3):
      theta_mu4 = mainloop.model.nodes[20]
      theta_sig4 = mainloop.model.nodes[21]
      coeff4 = mainloop.model.nodes[22]
      nodes = nodes + [theta_mu4, theta_sig4, coeff4]
      dynamicOutput = dynamicOutput + [None, None, None, None]
    if (y_dim>4):
      theta_mu5 = mainloop.model.nodes[23]
      theta_sig5 = mainloop.model.nodes[24]
      coeff5 = mainloop.model.nodes[25]
      nodes = nodes + [theta_mu5, theta_sig5, coeff5]
      dynamicOutput = dynamicOutput + [None, None, None, None]

    s_0 = rnn.get_init_state(batch_size)

    x_1_temp = x_1.fprop([x], params)
    y_1_temp = y_1.fprop([y], params)

    output_fn = [s_0] + dynamicOutput
    output_fn_val = [s_0] + dynamicOutput[2:]
    print(len(output_fn), len(output_fn_val))

    def inner_fn_test(x_t, s_tm1):

        prior_1_t = prior_1.fprop([x_t,s_tm1], params)
        prior_mu_t = prior_mu.fprop([prior_1_t], params)
        prior_sig_t = prior_sig.fprop([prior_1_t], params)

        z_t = Gaussian_sample(prior_mu_t, prior_sig_t)  # as in the original code, the latent z is Gaussian; the GMM is only used for output generation
        z_1_t = z_1.fprop([z_t], params)

        theta_1_t = theta_1.fprop([z_1_t, s_tm1], params)
        theta_mu1_t = theta_mu1.fprop([theta_1_t], params)
        theta_sig1_t = theta_sig1.fprop([theta_1_t], params)
        coeff1_t = coeff1.fprop([theta_1_t], params)

        y_pred1 = GMM_sampleY(theta_mu1_t, theta_sig1_t, coeff1_t) #Gaussian_sample(theta_mu_t, theta_sig_t)

        tupleMulti = prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred1

        if (y_dim>1):
          theta_mu2_t = theta_mu2.fprop([theta_1_t], params)
          theta_sig2_t = theta_sig2.fprop([theta_1_t], params)
          coeff2_t = coeff2.fprop([theta_1_t], params)
          y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t)
          y_pred1 = T.concatenate([y_pred1, y_pred2],axis=1)
          tupleMulti = tupleMulti + (theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2)

        if (y_dim>2):
          theta_mu3_t = theta_mu3.fprop([theta_1_t], params)
          theta_sig3_t = theta_sig3.fprop([theta_1_t], params)
          coeff3_t = coeff3.fprop([theta_1_t], params)
          y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t)
          y_pred1 = T.concatenate([y_pred1, y_pred3],axis=1)
          tupleMulti = tupleMulti + (theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3)

        if (y_dim>3):
          theta_mu4_t = theta_mu4.fprop([theta_1_t], params)
          theta_sig4_t = theta_sig4.fprop([theta_1_t], params)
          coeff4_t = coeff4.fprop([theta_1_t], params)
          y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t)
          y_pred1 = T.concatenate([y_pred1, y_pred4],axis=1)
          tupleMulti = tupleMulti + (theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4)

        if (y_dim>4):
          theta_mu5_t = theta_mu5.fprop([theta_1_t], params)
          theta_sig5_t = theta_sig5.fprop([theta_1_t], params)
          coeff5_t = coeff5.fprop([theta_1_t], params)
          y_pred5 = GMM_sampleY(theta_mu5_t, theta_sig5_t, coeff5_t)
          y_pred1 = T.concatenate([y_pred1, y_pred5],axis=1)
          tupleMulti = tupleMulti + (theta_mu5_t, theta_sig5_t, coeff5_t, y_pred5)
        
        pred_1_t=y_1.fprop([y_pred1], params)
        #y_pred = [GMM_sampleY(theta_mu_t[i], theta_sig_t[i], coeff_t[i]) for i in range(y_dim)]#T.stack([y_pred1,y_pred2],axis = 0 )
        s_t = rnn.fprop([[x_t, z_1_t, pred_1_t], [s_tm1]], params)
        #y_pred = dissag_pred.fprop([s_t], params)

        return (s_t,)+tupleMulti
        #corr_temp, binary_temp
    (restResults_val, updates_val) = theano.scan(fn=inner_fn_test, sequences=[x_1_temp],
                            outputs_info=output_fn_val )

    for k, v in updates_val.iteritems():
        k.default_update = v
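    # The scan over inner_fn_test draws samples (Gaussian_sample / GMM_sampleY), so
    # updates_val carries random-stream state updates. Writing each one into the shared
    # variable's default_update lets any theano.function built later apply the update
    # automatically, without passing updates=updates_val explicitly.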

    """def inner_fn(x_t, y_t, scheduleSamplingMask, s_tm1):

        phi_1_t = phi_1.fprop([x_t, s_tm1, y_t], params)
        phi_mu_t = phi_mu.fprop([phi_1_t], params)
        phi_sig_t = phi_sig.fprop([phi_1_t], params)

        prior_1_t = prior_1.fprop([x_t,s_tm1], params)
        prior_mu_t = prior_mu.fprop([prior_1_t], params)
        prior_sig_t = prior_sig.fprop([prior_1_t], params)

        z_t = Gaussian_sample(phi_mu_t, phi_sig_t)#in the original code it is gaussian. GMM is for the generation
        z_1_t = z_1.fprop([z_t], params)

        theta_1_t = theta_1.fprop([z_1_t, s_tm1], params)
        theta_mu1_t = theta_mu1.fprop([theta_1_t], params)
        theta_sig1_t = theta_sig1.fprop([theta_1_t], params)
        coeff1_t = coeff1.fprop([theta_1_t], params)

        y_pred1 = GMM_sampleY(theta_mu1_t, theta_sig1_t, coeff1_t) #Gaussian_sample(theta_mu_t, theta_sig_t)
        y_pred = y_pred1

        tupleMulti = phi_mu_t, phi_sig_t, prior_mu_t, prior_sig_t, theta_mu1_t, theta_sig1_t, coeff1_t, y_pred1

        if (y_dim>1):
          theta_mu2_t = theta_mu2.fprop([theta_1_t], params)
          theta_sig2_t = theta_sig2.fprop([theta_1_t], params)
          coeff2_t = coeff2.fprop([theta_1_t], params)
          y_pred2 = GMM_sampleY(theta_mu2_t, theta_sig2_t, coeff2_t)
          y_pred = T.concatenate([y_pred, y_pred2],axis=1)
          tupleMulti = tupleMulti + (theta_mu2_t, theta_sig2_t, coeff2_t, y_pred2)

        if (y_dim>2):
          theta_mu3_t = theta_mu3.fprop([theta_1_t], params)
          theta_sig3_t = theta_sig3.fprop([theta_1_t], params)
          coeff3_t = coeff3.fprop([theta_1_t], params)
          y_pred3 = GMM_sampleY(theta_mu3_t, theta_sig3_t, coeff3_t)
          y_pred = T.concatenate([y_pred, y_pred3],axis=1)
          tupleMulti = tupleMulti + (theta_mu3_t, theta_sig3_t, coeff3_t, y_pred3)

        if (y_dim>3):
          theta_mu4_t = theta_mu4.fprop([theta_1_t], params)
          theta_sig4_t = theta_sig4.fprop([theta_1_t], params)
          coeff4_t = coeff4.fprop([theta_1_t], params)
          y_pred4 = GMM_sampleY(theta_mu4_t, theta_sig4_t, coeff4_t)
          y_pred = T.concatenate([y_pred, y_pred4],axis=1)
          tupleMulti = tupleMulti + (theta_mu4_t, theta_sig4_t, coeff4_t, y_pred4)

        if (y_dim>4):
          theta_mu5_t = theta_mu5.fprop([theta_1_t], params)
          theta_sig5_t = theta_sig5.fprop([theta_1_t], params)
          coeff5_t = coeff5.fprop([theta_1_t], params)
          y_pred5 = GMM_sampleY(theta_mu5_t, theta_sig5_t, coeff5_t)
          y_pred = T.concatenate([y_pred, y_pred5],axis=1)
          tupleMulti = tupleMulti + (theta_mu5_t, theta_sig5_t, coeff5_t, y_pred5)
        
        if (scheduleSamplingMask==1):
          s_t = rnn.fprop([[x_t, z_1_t, y_t], [s_tm1]], params)
        else:
          y_t_aux = y_1.fprop([y_pred], params)
          s_t = rnn.fprop([[x_t, z_1_t, y_t_aux], [s_tm1]], params)

        return (s_t,)+tupleMulti
        #corr_temp, binary_temp
    (restResults, updates) = theano.scan(fn=inner_fn, sequences=[x_1_temp, y_1_temp, scheduleSamplingMask],
                            outputs_info=output_fn )
    '''
    ((s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp,z_t_temp, z_1_temp, theta_1_temp, 
      theta_mu1_temp, theta_sig1_temp, coeff1_temp, theta_mu2_temp, theta_sig2_temp, coeff2_temp, 
      theta_mu3_temp, theta_sig3_temp, coeff3_temp, theta_mu4_temp, theta_sig4_temp, coeff4_temp,
      theta_mu5_temp, theta_sig5_temp, coeff5_temp, 
      y_pred1_temp, y_pred2_temp, y_pred3_temp, y_pred4_temp, y_pred5_temp), updates) =\
        theano.scan(fn=inner_fn,
                    sequences=[x_1_temp, y_1_temp],
                    outputs_info=[s_0,  None, None, None, None, None, None, None, None,None,  None, None, 
                                  None, None, None, None, None, None, None, None,
                                  None, None, None, None, None, None, None, None])
    '''
    s_temp, phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp,\
      theta_mu1_temp, theta_sig1_temp, coeff1_temp, y_pred1_temp = restResults[:9]
    restResults = restResults[9:]

    for k, v in updates.iteritems():
        k.default_update = v

    #s_temp = concatenate([s_0[None, :, :], s_temp[:-1]], axis=0)# seems like this is for creating an additional dimension to s_0


    theta_mu1_temp.name = 'theta_mu1'
    theta_sig1_temp.name = 'theta_sig1'
    coeff1_temp.name = 'coeff1'
    y_pred1_temp.name = 'disaggregation1'

    #[:,:,flgAgg].reshape((y.shape[0],y.shape[1],1)
    mse1 = T.mean((y_pred1_temp - y[:,:,0].reshape((y.shape[0],y.shape[1],1)))**2)
    mae1 = T.mean( T.abs_(y_pred1_temp - y[:,:,0].reshape((y.shape[0],y.shape[1],1))) )
    mse1.name = 'mse1'
    mae1.name = 'mae1'

    kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp)
  
    x_shape = x.shape
    y_shape = y.shape
    x_in = x.reshape((x_shape[0]*x_shape[1], -1))
    y_in = y.reshape((y_shape[0]*y_shape[1], -1))
    
    theta_mu1_in = theta_mu1_temp.reshape((x_shape[0]*x_shape[1], -1))
    theta_sig1_in = theta_sig1_temp.reshape((x_shape[0]*x_shape[1], -1))
    coeff1_in = coeff1_temp.reshape((x_shape[0]*x_shape[1], -1))


    ddoutMSEA = []
    ddoutYpreds = [y_pred1_temp]
    indexSepDynamic = 6 # plus one for totaMSE

    totaMSE = T.copy(mse1)
    mse2 = T.zeros((1,))
    mae2 = T.zeros((1,))
    mse3 = T.zeros((1,))
    mae3 = T.zeros((1,))
    mse4 = T.zeros((1,))
    mae4 = T.zeros((1,))
    mse5 = T.zeros((1,))
    mae5 = T.zeros((1,))

    if (y_dim>1):
      theta_mu2_temp, theta_sig2_temp, coeff2_temp, y_pred2_temp = restResults[:4]
      restResults = restResults[4:]
      theta_mu2_temp.name = 'theta_mu2'
      theta_sig2_temp.name = 'theta_sig2'
      coeff2_temp.name = 'coeff2'
      y_pred2_temp.name = 'disaggregation2'
      mse2 = T.mean((y_pred2_temp - y[:,:,1].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
      mae2 = T.mean( T.abs_(y_pred2_temp - y[:,:,1].reshape((y.shape[0],y.shape[1],1))) )
      mse2.name = 'mse2'
      mae2.name = 'mae2'

      theta_mu2_in = theta_mu2_temp.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig2_in = theta_sig2_temp.reshape((x_shape[0]*x_shape[1], -1))
      coeff2_in = coeff2_temp.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM = theta_mu2_in, theta_sig2_in, coeff2_in

      ddoutMSEA = ddoutMSEA + [mse2, mae2]
      ddoutYpreds = ddoutYpreds + [y_pred2_temp]
      #totaMSE+=mse2
      indexSepDynamic +=2

    if (y_dim>2):
      theta_mu3_temp, theta_sig3_temp, coeff3_temp, y_pred3_temp = restResults[:4]
      restResults = restResults[4:]
      theta_mu3_temp.name = 'theta_mu3'
      theta_sig3_temp.name = 'theta_sig3'
      coeff3_temp.name = 'coeff3'
      y_pred3_temp.name = 'disaggregation3'
      mse3 = T.mean((y_pred3_temp - y[:,:,2].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
      mae3 = T.mean( T.abs_(y_pred3_temp - y[:,:,2].reshape((y.shape[0],y.shape[1],1))) )
      mse3.name = 'mse3'
      mae3.name = 'mae3'

      theta_mu3_in = theta_mu3_temp.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig3_in = theta_sig3_temp.reshape((x_shape[0]*x_shape[1], -1))
      coeff3_in = coeff3_temp.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM = argsGMM + (theta_mu3_in, theta_sig3_in, coeff3_in)
      ddoutMSEA = ddoutMSEA + [mse3, mae3]
      ddoutYpreds = ddoutYpreds + [y_pred3_temp]
      #totaMSE+=mse3
      indexSepDynamic +=2

    if (y_dim>3):
      theta_mu4_temp, theta_sig4_temp, coeff4_temp, y_pred4_temp = restResults[:4]
      restResults = restResults[4:]
      theta_mu4_temp.name = 'theta_mu4'
      theta_sig4_temp.name = 'theta_sig4'
      coeff4_temp.name = 'coeff4'
      y_pred4_temp.name = 'disaggregation4'
      mse4 = T.mean((y_pred4_temp - y[:,:,3].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
      mae4 = T.mean( T.abs_(y_pred4_temp - y[:,:,3].reshape((y.shape[0],y.shape[1],1))) )
      mse4.name = 'mse4'
      mae4.name = 'mae4'

      theta_mu4_in = theta_mu4_temp.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig4_in = theta_sig4_temp.reshape((x_shape[0]*x_shape[1], -1))
      coeff4_in = coeff4_temp.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM = argsGMM + (theta_mu4_in, theta_sig4_in, coeff4_in)
      ddoutMSEA = ddoutMSEA + [mse4, mae4]
      ddoutYpreds = ddoutYpreds + [y_pred4_temp]
      #totaMSE+=mse4
      indexSepDynamic +=2

    if (y_dim>4):
      theta_mu5_temp, theta_sig5_temp, coeff5_temp, y_pred5_temp = restResults[:4]
      restResults = restResults[4:]
      theta_mu5_temp.name = 'theta_mu5'
      theta_sig5_temp.name = 'theta_sig5'
      coeff5_temp.name = 'coeff5'
      y_pred5_temp.name = 'disaggregation5'
      mse5 = T.mean((y_pred5_temp - y[:,:,4].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
      mae5 = T.mean( T.abs_(y_pred5_temp - y[:,:,4].reshape((y.shape[0],y.shape[1],1))) )
      mse5.name = 'mse5'
      mae5.name = 'mae5'

      theta_mu5_in = theta_mu5_temp.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig5_in = theta_sig5_temp.reshape((x_shape[0]*x_shape[1], -1))
      coeff5_in = coeff5_temp.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM = argsGMM + (theta_mu5_in, theta_sig5_in, coeff5_in)
      ddoutMSEA = ddoutMSEA + [mse5, mae5]
      ddoutYpreds = ddoutYpreds + [y_pred5_temp]
      #totaMSE+=mse5
      indexSepDynamic +=2

    totaMSE = (mse1+mse2+mse3+mse4+mse5)/y_dim
    totaMSE.name = 'mse'

    kl_temp = KLGaussianGaussian(phi_mu_temp, phi_sig_temp, prior_mu_temp, prior_sig_temp)
    """
    x_shape = x.shape
    y_shape = y.shape
    x_in = x.reshape((x_shape[0]*x_shape[1], -1))
    y_in = y.reshape((y_shape[0]*y_shape[1], -1))
    """

    recon = GMMdisagMulti(y_dim, y_in, theta_mu1_in, theta_sig1_in, coeff1_in, *argsGMM)# BiGMM(x_in, theta_mu_in, theta_sig_in, coeff_in, corr_in, binary_in)
    recon = recon.reshape((x_shape[0], x_shape[1]))
    recon.name = 'gmm_out'

    '''
    recon5 = GMM(y_in[:,4, None], theta_mu5_in, theta_sig5_in, coeff5_in)
    recon5 = recon.reshape((x_shape[0], x_shape[1]))    
    '''
    recon_term = recon.sum(axis=0).mean()
    recon_term.name = 'recon_term'

    #kl_temp = kl_temp * mask
    
    kl_term = kl_temp.sum(axis=0).mean()
    kl_term.name = 'kl_term'

    #nll_upper_bound_0 = recon_term + kl_term
    #nll_upper_bound_0.name = 'nll_upper_bound_0'
    if (flgMSE==1):
      nll_upper_bound =  recon_term + kl_term + totaMSE
    else:
      nll_upper_bound =  recon_term + kl_term
    nll_upper_bound.name = 'nll_upper_bound'"""

    ######################## TEST (GENERATION) TIME
    s_temp_val, prior_mu_temp_val, prior_sig_temp_val,  \
      theta_mu1_temp_val, theta_sig1_temp_val, coeff1_temp_val, y_pred1_temp_val = restResults_val[:7]
    restResults_val = restResults_val[7:]

    #s_temp_val = concatenate([s_0[None, :, :], s_temp_val[:-1]], axis=0)# seems like this is for creating an additional dimension to s_0


    theta_mu1_temp_val.name = 'theta_mu1_val'
    theta_sig1_temp_val.name = 'theta_sig1_val'
    coeff1_temp_val.name = 'coeff1_val'
    y_pred1_temp_val.name = 'disaggregation1_val'

    #[:,:,flgAgg].reshape((y.shape[0],y.shape[1],1)
    mse1_val = T.mean((y_pred1_temp_val - y[:,:,0].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
    mae1_val = T.mean( T.abs_(y_pred1_temp_val - y[:,:,0].reshape((y.shape[0],y.shape[1],1))) )

    # NeuralNILM relative error: (sum_output - sum_target) / max(sum_output, sum_target)
    totPred = T.sum(y_pred1_temp_val)
    totReal = T.sum(y[:,:,0])
    relErr1_val =( totPred -  totReal)/ T.maximum(totPred,totReal)
    propAssigned1_val = 1 - T.sum(T.abs_(y_pred1_temp_val - y[:,:,0].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x))
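    # propAssigned follows the NeuralNILM-style "proportion of total energy correctly assigned":
    # 1 - (total absolute disaggregation error) / (2 * total aggregate energy sum(x)).
    # Toy illustration of relErr (hypothetical numbers): with totPred = 80 and totReal = 100,
    # relErr = (80 - 100) / max(80, 100) = -0.2.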

    #y_unNormalize = (y[:,:,0] * reader.stdTraining[0]) + reader.meanTraining[0]
    #y_pred1_temp_val = (y_pred1_temp_val * reader.stdTraining[0]) + reader.meanTraining[0]

    #mse1_valUnNorm = T.mean((y_pred1_temp_val - y_unNormalize.reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
    #mae1_valUnNorm = T.mean( T.abs_(y_pred1_temp_val - y_unNormalize.reshape((y.shape[0],y.shape[1],1))))
    mse1_val.name = 'mse1_val'
    mae1_val.name = 'mae1_val'



    theta_mu1_in_val = theta_mu1_temp_val.reshape((x_shape[0]*x_shape[1], -1))
    theta_sig1_in_val = theta_sig1_temp_val.reshape((x_shape[0]*x_shape[1], -1))
    coeff1_in_val = coeff1_temp_val.reshape((x_shape[0]*x_shape[1], -1))


    ddoutMSEA_val = []
    ddoutYpreds_val = [y_pred1_temp_val]
    totaMSE_val = mse1_val
    totaMAE_val =mae1_val
    indexSepDynamic_val = 5
    prediction_val = y_pred1_temp_val

    #Initializing values of mse and mae
    mse2_val = T.zeros((1,))
    mae2_val = T.zeros((1,))
    mse3_val = T.zeros((1,))
    mae3_val = T.zeros((1,))
    mse4_val = T.zeros((1,))
    mae4_val = T.zeros((1,))
    mse5_val = T.zeros((1,))
    mae5_val = T.zeros((1,))

    relErr2_val = T.zeros((1,))
    relErr3_val = T.zeros((1,))
    relErr4_val = T.zeros((1,))
    relErr5_val = T.zeros((1,))
    propAssigned2_val = T.zeros((1,))
    propAssigned3_val = T.zeros((1,))
    propAssigned4_val = T.zeros((1,))
    propAssigned5_val = T.zeros((1,))

    if (y_dim>1):
      theta_mu2_temp_val, theta_sig2_temp_val, coeff2_temp_val, y_pred2_temp_val = restResults_val[:4]
      restResults_val = restResults_val[4:]
      theta_mu2_temp_val.name = 'theta_mu2_val'
      theta_sig2_temp_val.name = 'theta_sig2_val'
      coeff2_temp_val.name = 'coeff2_val'
      y_pred2_temp_val.name = 'disaggregation2_val'
      mse2_val = T.mean((y_pred2_temp_val - y[:,:,1].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
      mae2_val = T.mean( T.abs_(y_pred2_temp_val - y[:,:,1].reshape((y.shape[0],y.shape[1],1))) )

      totPred = T.sum(y_pred2_temp_val)
      totReal = T.sum(y[:,:,1])
      relErr2_val =( totPred -  totReal)/ T.maximum(totPred,totReal)
      propAssigned2_val = 1 - T.sum(T.abs_(y_pred2_temp_val - y[:,:,1].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x))

      mse2_val.name = 'mse2_val'
      mae2_val.name = 'mae2_val'

      theta_mu2_in_val = theta_mu2_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig2_in_val = theta_sig2_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      coeff2_in_val = coeff2_temp_val.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM_val = theta_mu2_in_val, theta_sig2_in_val, coeff2_in_val

      ddoutMSEA_val = ddoutMSEA_val + [mse2_val, mae2_val]
      ddoutYpreds_val = ddoutYpreds_val + [y_pred2_temp_val]
      totaMSE_val+=mse2_val
      totaMAE_val+=mae2_val
      indexSepDynamic_val +=2

      prediction_val = T.concatenate([prediction_val, y_pred2_temp_val], axis=2)

    if (y_dim>2):
      theta_mu3_temp_val, theta_sig3_temp_val, coeff3_temp_val, y_pred3_temp_val = restResults_val[:4]
      restResults_val = restResults_val[4:]
      theta_mu3_temp_val.name = 'theta_mu3_val'
      theta_sig3_temp_val.name = 'theta_sig3_val'
      coeff3_temp_val.name = 'coeff3_val'
      y_pred3_temp_val.name = 'disaggregation3_val'
      mse3_val = T.mean((y_pred3_temp_val - y[:,:,2].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
      mae3_val = T.mean( T.abs_(y_pred3_temp_val - y[:,:,2].reshape((y.shape[0],y.shape[1],1))) )

      totPred = T.sum(y_pred3_temp_val)
      totReal = T.sum(y[:,:,2])
      relErr3_val =( totPred -  totReal)/ T.maximum(totPred,totReal)
      propAssigned3_val = 1 - T.sum(T.abs_(y_pred3_temp_val - y[:,:,2].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x))

      mse3_val.name = 'mse3_val'
      mae3_val.name = 'mae3_val'

      theta_mu3_in_val = theta_mu3_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig3_in_val = theta_sig3_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      coeff3_in_val = coeff3_temp_val.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM_val = argsGMM_val + (theta_mu3_in_val, theta_sig3_in_val, coeff3_in_val)
      ddoutMSEA_val = ddoutMSEA_val + [mse3_val, mae3_val]
      ddoutYpreds_val = ddoutYpreds_val + [y_pred3_temp_val]
      totaMSE_val+=mse3_val
      totaMAE_val+=mae3_val
      indexSepDynamic_val +=2

      prediction_val = T.concatenate([prediction_val, y_pred3_temp_val], axis=2)

    if (y_dim>3):
      theta_mu4_temp_val, theta_sig4_temp_val, coeff4_temp_val, y_pred4_temp_val = restResults_val[:4]
      restResults_val = restResults_val[4:]
      theta_mu4_temp_val.name = 'theta_mu4_val'
      theta_sig4_temp_val.name = 'theta_sig4_val'
      coeff4_temp_val.name = 'coeff4_val'
      y_pred4_temp_val.name = 'disaggregation4_val'
      mse4_val = T.mean((y_pred4_temp_val - y[:,:,3].reshape((y.shape[0],y.shape[1],1)))**2) # As axis = None is calculated for all
      mae4_val = T.mean( T.abs_(y_pred4_temp_val - y[:,:,3].reshape((y.shape[0],y.shape[1],1))) )

      totPred = T.sum(y_pred4_temp_val)
      totReal = T.sum(y[:,:,3])
      relErr4_val =( totPred -  totReal)/ T.maximum(totPred,totReal)
      propAssigned4_val = 1 - T.sum(T.abs_(y_pred4_temp_val - y[:,:,3].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x))

      mse4_val.name = 'mse4_val'
      mae4_val.name = 'mae4_val'

      theta_mu4_in_val = theta_mu4_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig4_in_val = theta_sig4_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      coeff4_in_val = coeff4_temp_val.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM_val = argsGMM_val + (theta_mu4_in_val, theta_sig4_in_val, coeff4_in_val)
      ddoutMSEA_val = ddoutMSEA_val + [mse4_val, mae4_val]
      ddoutYpreds_val = ddoutYpreds_val + [y_pred4_temp_val]
      totaMSE_val+=mse4_val
      totaMAE_val+=mae4_val
      indexSepDynamic_val +=2
      prediction_val = T.concatenate([prediction_val, y_pred4_temp_val], axis=2)

    if (y_dim>4):
      theta_mu5_temp_val, theta_sig5_temp_val, coeff5_temp_val, y_pred5_temp_val = restResults_val[:4]
      restResults_val = restResults_val[4:]
      theta_mu5_temp_val.name = 'theta_mu5_val'
      theta_sig5_temp_val.name = 'theta_sig5_val'
      coeff5_temp_val.name = 'coeff5_val'
      y_pred5_temp_val.name = 'disaggregation5_val'
      mse5_val = T.mean((y_pred5_temp_val - y[:,:,4].reshape((y.shape[0],y.shape[1],1)))**2)  # axis=None: mean over every element
      mae5_val = T.mean(T.abs_(y_pred5_temp_val - y[:,:,4].reshape((y.shape[0],y.shape[1],1))))

      totPred = T.sum(y_pred5_temp_val)
      totReal = T.sum(y[:,:,4])
      relErr5_val = (totPred - totReal) / T.maximum(totPred, totReal)
      propAssigned5_val = 1 - T.sum(T.abs_(y_pred5_temp_val - y[:,:,4].reshape((y.shape[0],y.shape[1],1))))/(2*T.sum(x))

      mse5_val.name = 'mse5_val'
      mae5_val.name = 'mae5_val'

      theta_mu5_in_val = theta_mu5_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      theta_sig5_in_val = theta_sig5_temp_val.reshape((x_shape[0]*x_shape[1], -1))
      coeff5_in_val = coeff5_temp_val.reshape((x_shape[0]*x_shape[1], -1))

      argsGMM_val = argsGMM_val + (theta_mu5_in_val, theta_sig5_in_val, coeff5_in_val)
      ddoutMSEA_val = ddoutMSEA_val + [mse5_val, mae5_val]
      ddoutYpreds_val = ddoutYpreds_val + [y_pred5_temp_val]
      totaMSE_val+=mse5_val
      totaMAE_val+=mae5_val
      indexSepDynamic_val +=2
      prediction_val = T.concatenate([prediction_val, y_pred5_temp_val], axis=2)

    recon_val = GMMdisagMulti(y_dim, y_in, theta_mu1_in_val, theta_sig1_in_val, coeff1_in_val, *argsGMM_val)# BiGMM(x_in, theta_mu_in, theta_sig_in, coeff_in, corr_in, binary_in)
    recon_val = recon_val.reshape((x_shape[0], x_shape[1]))
    recon_val.name = 'gmm_out'
    totaMSE_val = totaMSE_val / y_dim
    totaMAE_val = totaMAE_val / y_dim
    
    '''
    recon5 = GMM(y_in[:,4, None], theta_mu5_in, theta_sig5_in, coeff5_in)
    recon5 = recon.reshape((x_shape[0], x_shape[1]))    
    '''
    recon_term_val = recon_val.sum(axis=0).mean()
    recon_term_val.name = 'recon_term'
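    # recon_term_val: per-sequence total of the reconstruction cost (sum over
    # the first axis) averaged over the second axis, assuming the usual
    # (time, batch) layout used elsewhere in this script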

    ######################
    '''
    model.inputs = [x, mask, y, y_mask, scheduleSamplingMask]
    model.params = params
    model.nodes = nodes
    '''
    optimizer = Adam(
        lr=lr
    )
    header = "epoch,log,kl,nll_upper_bound,mse,mae\n"
    extension = [
        GradientClipping(batch_size=batch_size),
        EpochCount(epoch, save_path, header),
        Monitoring(freq=monitoring_freq,
                   #ddout=[nll_upper_bound, recon_term, kl_term, totaMSE, mse1, mae1]+ddoutMSEA+ddoutYpreds ,
                   #indexSep=indexSepDynamic,
                   indexDDoutPlot = [13],  # indexes of ddout entries to plot
                   #, (6,y_pred_temp)
                   instancesPlot = instancesPlot,#0-150
                   data=[Iterator(valid_data, batch_size)],
                   savedFolder = save_path),
        Picklize(freq=monitoring_freq, path=save_path),
        EarlyStopping(freq=monitoring_freq, path=save_path, channel=channel_name),
        WeightNorm()
    ]

    lr_iterations = {0:lr}

    """mainloop = Training(
        name=pkl_name,
        data=Iterator(train_data, batch_size),
        model=model,
        optimizer=optimizer,
        cost=nll_upper_bound,
        outputs=[nll_upper_bound],
        n_steps = n_steps,
        extension=extension,
        lr_iterations=lr_iterations,
        k_speedOfconvergence=kSchedSamp
    )
    mainloop.run()"""

    
    data=Iterator(test_data, batch_size)
    test_fn = theano.function(inputs=[x, y],#[x, y],
                              #givens={x:Xtest},
                              #on_unused_input='ignore',
                              #z=( ,200,1)
                              allow_input_downcast=True,
                              outputs=[prediction_val, recon_term_val, totaMSE_val, totaMAE_val,
                                        mse1_val,mse2_val,mse3_val,mse4_val,mse5_val,
                                        mae1_val,mae2_val,mae3_val,mae4_val,mae5_val, 
                                        relErr1_val,relErr2_val,relErr3_val,relErr4_val,relErr5_val,
                                        propAssigned1_val, propAssigned2_val,propAssigned3_val,propAssigned4_val,propAssigned5_val]#prediction_val, mse_val, mae_val
                              ,updates=updates_val
                              )
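    # test_fn maps an (x, y) batch to the stacked predictions plus every scalar
    # metric defined above; updates_val presumably carries the shared-variable
    # updates collected while building the validation graph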
    testOutput = []
    testMetrics2 = []
    numBatchTest = 0
    for batch in data:
      outputGeneration = test_fn(batch[0], batch[2])  # batch[0]: aggregate input, batch[2]: real disaggregation targets
      testOutput.append(outputGeneration[1:14])
      testMetrics2.append(outputGeneration[14:])
      #{0:[4,20], 2:[5,10]} 
      #if (numBatchTest==0):

      plt.figure(1)
      plt.plot(np.transpose(outputGeneration[0], [1, 0, 2])[4])  # predicted disaggregation for one sequence of the batch
      plt.legend(appliances)
      plt.savefig(save_path+"/vrnn_dis_generated{}_Pred_0-4".format(numBatchTest))

      plt.clf()

      plt.figure(2)
      plt.plot(np.transpose(batch[2],[1,0,2])[4])
      plt.legend(appliances)
      plt.savefig(save_path+"/vrnn_dis_generated{}_RealDisag_0-4".format(numBatchTest))
      plt.clf()

      plt.figure(3)
      plt.plot(np.transpose(batch[0], [1, 0, 2])[4])  # real aggregate signal for the same sequence
      plt.savefig(save_path+"/vrnn_dis_generated{}_Realagg_0-4".format(numBatchTest))
      plt.clf()
      numBatchTest+=1

    
    testOutput = np.asarray(testOutput)
    testMetrics2 = np.asarray(testMetrics2)
    print(testOutput.shape)
    print(testMetrics2.shape)
    recon_test =  testOutput[:, 0].mean()
    mse_test =  testOutput[:, 1].mean()
    mae_test =  testOutput[:, 2].mean()
    mse1_test =  testOutput[:, 3].mean()
    mae1_test =  testOutput[:, 8].mean()
    mse2_test =  testOutput[:, 4].mean()
    mae2_test =  testOutput[:, 9].mean()
    mse3_test =  testOutput[:, 5].mean()
    mae3_test =  testOutput[:, 10].mean()
    mse4_test =  testOutput[:, 6].mean()
    mae4_test =  testOutput[:, 11].mean()
    mse5_test =  testOutput[:, 7].mean()
    mae5_test =  testOutput[:, 12].mean()

    relErr1_test = testMetrics2[:,0].mean()
    relErr2_test = testMetrics2[:,1].mean()
    relErr3_test = testMetrics2[:,2].mean()
    relErr4_test = testMetrics2[:,3].mean()
    relErr5_test = testMetrics2[:,4].mean()

    propAssigned1_test = testMetrics2[:, 5].mean()
    propAssigned2_test = testMetrics2[:, 6].mean()
    propAssigned3_test = testMetrics2[:, 7].mean()
    propAssigned4_test = testMetrics2[:, 8].mean()
    propAssigned5_test = testMetrics2[:, 9].mean()

    fLog = open(save_path+'/output.csv', 'w')
    fLog.write(str(lr_iterations)+"\n")
    fLog.write(str(appliances)+"\n")
    fLog.write(str(windows)+"\n")
    fLog.write("logTest,mse1_test,mse2_test,mse3_test,mse4_test,mse5_test,mae1_test,mae2_test,mae3_test,mae4_test,mae5_test,mseTest,maeTest\n")
    fLog.write("{},{},{},{},{},{},{},{},{},{},{},{},{}\n\n".format(recon_test,mse1_test,mse2_test,mse3_test, mse4_test,mse5_test,mae1_test,mae2_test,mae3_test, mae4_test,mae5_test,mse_test,mae_test))
    fLog.write("relErr1,relErr2,relErr3,relErr4,relErr5,propAssigned1,propAssigned2,propAssigned3,propAssigned4,propAssigned5\n")
    fLog.write("{},{},{},{},{},{},{},{},{},{}\n".format(relErr1_test,relErr2_test,relErr3_test,relErr4_test, relErr5_test,propAssigned1_test,propAssigned2_test,propAssigned3_test, propAssigned4_test,propAssigned5_test))

    fLog.write("q_z_dim,p_z_dim,p_x_dim,x2s_dim,y2s_dim,z2s_dim\n")
    fLog.write("{},{},{},{},{},{}\n".format(q_z_dim,p_z_dim,p_x_dim,x2s_dim,y2s_dim,z2s_dim))
    fLog.write("epoch,log,kl,mse1,mse2,mse3,mse4,mse5,mae1,mae2,mae3,mae4,mae5\n")
    # NOTE: the Training mainloop above is commented out, so this training-log
    # dump only runs if training has actually been executed in this session.
    for i, item in enumerate(mainloop.trainlog.monitor['nll_upper_bound']):
      d, e, f, g, j, k, l, m = 0, 0, 0, 0, 0, 0, 0, 0
      ep = mainloop.trainlog.monitor['epoch'][i]
      a = mainloop.trainlog.monitor['recon_term'][i]
      b = mainloop.trainlog.monitor['kl_term'][i]
      c = mainloop.trainlog.monitor['mse1'][i]
      h = mainloop.trainlog.monitor['mae1'][i]
      
      if (y_dim>1):
        d = mainloop.trainlog.monitor['mse2'][i]
        j = mainloop.trainlog.monitor['mae2'][i]
      if (y_dim>2):
        e = mainloop.trainlog.monitor['mse3'][i]
        k = mainloop.trainlog.monitor['mae3'][i]
      if (y_dim>3):
        f = mainloop.trainlog.monitor['mse4'][i]
        l = mainloop.trainlog.monitor['mae4'][i]
      if (y_dim>4):
        g = mainloop.trainlog.monitor['mse5'][i]
        m = mainloop.trainlog.monitor['mae5'][i]
      fLog.write("{:d},{:.2f},{:.2f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}\n".format(ep,a,b,c,d,e,f,g,h,j,k,l,m))

    f = open(save_path+'/outputRealGeneration.pkl', 'wb')
    pickle.dump(outputGeneration, f, -1)
    f.close()
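# ---------------------------------------------------------------------------
# A minimal NumPy sketch of the two disaggregation metrics logged above: the
# signed relative error in total energy and the proportion of total energy
# correctly assigned.  It mirrors the Theano expressions used for
# relErr*_val and propAssigned*_val; the function name, argument names and
# shapes are illustrative only and do not come from the original code.
# ---------------------------------------------------------------------------
import numpy as np

def disaggregation_metrics(y_pred, y_true, x_agg):
    """y_pred, y_true: arrays for one appliance; x_agg: the aggregate signal."""
    tot_pred, tot_real = y_pred.sum(), y_true.sum()
    # signed relative error in total energy, normalised by the larger total
    rel_err = (tot_pred - tot_real) / max(tot_pred, tot_real)
    # proportion of total energy correctly assigned (1.0 means perfect)
    prop_assigned = 1.0 - np.abs(y_pred - y_true).sum() / (2.0 * x_agg.sum())
    return rel_err, prop_assigned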
Example #55
0
	def _get_cost2(
			self,
			output,
			truth,
			rescore=True
		):

		if not hasattr(self, '_lambda_obj'):
			lambda_obj, lambda_noobj, thresh = T.scalar('lambda_obj'), T.scalar('lambda_noobj'), T.scalar('thresh')
			self._lambda_obj, self._lambda_noobj, self._thresh = lambda_obj, lambda_noobj, thresh
		else:
			lambda_obj, lambda_noobj, thresh = self._lambda_obj, self._lambda_noobj, self._thresh
		
		cost = 0.
		# create grid for cells
		w_cell, h_cell =  1. / self.output_shape[1], 1. / self.output_shape[0]
		x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
		y, x = meshgrid(x, y)
		
		# reshape truth to match with cell
		truth_cell = truth.dimshuffle(0, 1, 2, 'x','x')
		x, y = x.dimshuffle('x','x',0,1), y.dimshuffle('x','x',0,1)
		
		# calculate overlap between cell and ground truth boxes
		xi, yi = T.maximum(truth_cell[:,:,0], x - w_cell/2), T.maximum(truth_cell[:,:,1], y - h_cell/2)
		xf = T.minimum(truth_cell[:,:,[0,2]].sum(axis=2), x + w_cell/2)
		yf = T.minimum(truth_cell[:,:,[1,3]].sum(axis=2), y + h_cell/2)
		w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
		
		# overlap between cell and ground-truth box, as a fraction of the cell area
		overlap = (w * h) / (w_cell * h_cell)
		
		# repeat truth boxes
		truth_boxes = truth.dimshuffle(0, 1, 'x', 2, 'x', 'x')
		
		# create grid for anchor boxes
		anchors = T.concatenate((x.dimshuffle(0,1,'x','x',2,3) - w_cell/2, y.dimshuffle(0,1,'x','x',2,3) - h_cell/2), axis=3)
		anchors = T.concatenate((anchors, T.ones_like(anchors)), axis=3)
		anchors = T.repeat(anchors, self.boxes.__len__(), axis=2)
		
		w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr', borrow=True).dimshuffle('x','x',0,'x','x')
		h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr', borrow=True).dimshuffle('x','x',0,'x','x')
		
		anchors = T.set_subtensor(anchors[:,:,:,2], anchors[:,:,:,2] * w_acr)
		anchors = T.set_subtensor(anchors[:,:,:,3], anchors[:,:,:,3] * h_acr)
		
		# find iou between anchors and ground truths
		xi, yi = T.maximum(truth_boxes[:,:,:,0], anchors[:,:,:,0]), T.maximum(truth_boxes[:,:,:,1], anchors[:,:,:,1])
		xf = T.minimum(truth_boxes[:,:,:,[0,2]].sum(axis=3), anchors[:,:,:,[0,2]].sum(axis=3))
		yf = T.minimum(truth_boxes[:,:,:,[1,3]].sum(axis=3), anchors[:,:,:,[1,3]].sum(axis=3))
		w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
		
		isec = w * h
		iou = isec / (T.prod(truth_boxes[:,:,:,[2,3]], axis=3) + T.prod(anchors[:,:,:,[2,3]], axis=3) - isec)
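		# IoU = intersection area / (truth-box area + anchor area - intersection)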
		
		overlap = overlap.dimshuffle(0,1,'x',2,3)
		
		best_iou_obj_idx = T.argmax(iou, axis=1).dimshuffle(0,'x',1,2,3)
		best_iou_box_idx = T.argmax(iou, axis=2).dimshuffle(0,1,'x',2,3)
		
		_,obj_idx,box_idx,_,_ = meshgrid(
			T.arange(truth.shape[0]),
			T.arange(truth.shape[1]),
			T.arange(self.boxes.__len__()),
			T.arange(self.output_shape[0]),
			T.arange(self.output_shape[1])
		)
		
		# define logical matrix assigning object to correct anchor box and cell.
		best_iou_idx = T.bitwise_and(
			T.bitwise_and(
				T.eq(best_iou_box_idx, box_idx),
				T.eq(best_iou_obj_idx, obj_idx)
			),
			overlap >= thresh
		)
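		# best_iou_idx marks, for each ground-truth object, the single
		# (anchor, cell) pair responsible for it: best IoU in both directions
		# and at least `thresh` overlap between cell and object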
		
		constants = []
		if rescore: 
			# scale predictions correctly
			pred = output.dimshuffle(0,'x',1,2,3,4)
			pred = T.set_subtensor(pred[:,:,:,0], pred[:,:,:,0] + x.dimshuffle(0,1,'x',2,3))
			pred = T.set_subtensor(pred[:,:,:,1], pred[:,:,:,1] + y.dimshuffle(0,1,'x',2,3))
			pred = T.set_subtensor(pred[:,:,:,2], w_acr * T.exp(pred[:,:,:,2]))
			pred = T.set_subtensor(pred[:,:,:,3], h_acr * T.exp(pred[:,:,:,3]))
			
			xi, yi = T.maximum(pred[:,:,:,0], truth_boxes[:,:,:,0]), T.maximum(pred[:,:,:,1], truth_boxes[:,:,:,1])
			xf = T.minimum(pred[:,:,:,[0,2]].sum(axis=3), truth_boxes[:,:,:,[0,2]].sum(axis=3))
			yf = T.minimum(pred[:,:,:,[1,3]].sum(axis=3), truth_boxes[:,:,:,[1,3]].sum(axis=3))
			w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)
			
			isec = w * h
			iou = isec / (pred[:,:,:,[2,3]].prod(axis=3) + truth_boxes[:,:,:,[2,3]].prod(axis=3) - isec)

			# make sure iou is considered constant when taking gradient
			constants.append(iou)
	
		# format ground truths correctly
		truth_boxes = T.repeat(
			T.repeat(
				T.repeat(truth_boxes, self.boxes.__len__(), axis=2),
				self.output_shape[0], axis=4
			),
			self.output_shape[1], axis=5
		)
		
		truth_boxes = T.set_subtensor(truth_boxes[:,:,:,0], truth_boxes[:,:,:,0] - anchors[:,:,:,0])
		truth_boxes = T.set_subtensor(truth_boxes[:,:,:,1], truth_boxes[:,:,:,1] - anchors[:,:,:,1])
		truth_boxes = T.set_subtensor(truth_boxes[:,:,:,2], T.log(truth_boxes[:,:,:,2] / anchors[:,:,:,2]))
		truth_boxes = T.set_subtensor(truth_boxes[:,:,:,3], T.log(truth_boxes[:,:,:,3] / anchors[:,:,:,3]))
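		# YOLO-style regression targets: x/y as offsets from the anchor position,
		# w/h as log-ratios to the anchor size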
		
		# add dimension for objects per image
		pred = T.repeat(output.dimshuffle(0,'x',1,2,3,4), truth.shape[1], axis=1)
				
		# penalize coordinates
		cost += lambda_obj * T.mean(((pred[:,:,:,:4] - truth_boxes[:,:,:,:4])**2).sum(axis=3)[best_iou_idx.nonzero()])
				
		# penalize class scores
		cost += lambda_obj * T.mean((-truth_boxes[:,:,:,-self.num_classes:] * T.log(pred[:,:,:,-self.num_classes:])).sum(axis=3)[best_iou_idx.nonzero()])
		
		# penalize objectness score
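		# (with rescoring the target is the matched prediction's IoU; otherwise 1)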
		if rescore:
			cost += lambda_obj * T.mean(((pred[:,:,:,4] - iou)**2)[best_iou_idx.nonzero()])
		else:
			cost += lambda_obj * T.mean(((pred[:,:,:,4] - 1.)**2)[best_iou_idx.nonzero()])
		
		# invert the matched mask to select anchor boxes with no assigned object
		not_matched_idx = best_iou_idx.sum(axis=1) > 0
		not_matched_idx = bitwise_not(not_matched_idx)

		# penalize objectness score for non-matched boxes
		cost += lambda_noobj * T.mean((pred[:,0,:,4]**2)[not_matched_idx.nonzero()])
		
		return cost, constants
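# ---------------------------------------------------------------------------
# A minimal, plain-Python sketch of the intersection-over-union computation
# that _get_cost2 expresses symbolically above, for boxes given as
# (x_topleft, y_topleft, w, h).  Illustrative only; not part of the class.
# ---------------------------------------------------------------------------
def iou_xywh(box_a, box_b):
    xa, ya, wa, ha = box_a
    xb, yb, wb, hb = box_b
    # intersection rectangle
    xi, yi = max(xa, xb), max(ya, yb)
    xf, yf = min(xa + wa, xb + wb), min(ya + ha, yb + hb)
    w, h = max(xf - xi, 0.0), max(yf - yi, 0.0)
    isec = w * h
    # union = sum of the two areas minus the intersection
    return isec / (wa * ha + wb * hb - isec)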
Example #56
0
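 # maps y to [y, 1 - sum(y)] along the last axis, so the returned vector sums to one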
 def backward(self, y):
     remaining = 1 - tt.sum(y[..., :], axis=-1, keepdims=True)
     return tt.concatenate([y[..., :], remaining], axis=-1)
def concatenate(tensors, axis=-1):
    return T.concatenate(tensors, axis=axis)
Example #58
0
	def _get_cost(
			self,
			output,
			truth,
			rescore=True
		):
		if not hasattr(self, '_lambda_obj'):
			lambda_obj, lambda_noobj, lambda_anchor = T.scalar('lambda_obj'), T.scalar('lambda_noobj'), T.scalar('lambda_anchor')
			self._lambda_obj, self._lambda_noobj, self._lambda_anchor = lambda_obj, lambda_noobj, lambda_anchor
		else:
			lambda_obj, lambda_noobj, lambda_anchor = self._lambda_obj, self._lambda_noobj, self._lambda_anchor
			
		# lambda_obj, lambda_noobj, lambda_anchor = 1., 5., 0.1

		w_cell, h_cell = 1./self.output_shape[1], 1./self.output_shape[0]
		x, y = T.arange(w_cell/2, 1., w_cell), T.arange(h_cell/2, 1., h_cell)
		y, x = meshgrid(x, y)
		x, y = x.dimshuffle('x','x','x',0,1), y.dimshuffle('x','x','x',0,1)

		# create anchors for later
		w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr').dimshuffle('x',0,'x','x','x') * T.ones_like(x)
		h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr').dimshuffle('x',0,'x','x','x') * T.ones_like(y)
		anchors = T.concatenate((x * T.ones_like(w_acr), y * T.ones_like(h_acr), w_acr, h_acr), axis=2)
		anchors = T.repeat(anchors, truth.shape[0], axis=0)

		cell_coord = T.concatenate((x,y), axis=2)
		gt_coord = (truth[:,:,:2] + truth[:,:,2:4]/2).dimshuffle(0,1,2,'x','x')
		
		gt_dist = T.sum((gt_coord - cell_coord)**2, axis=2).reshape((truth.shape[0],truth.shape[1],-1))
		
		cell_idx = argmin_unique(gt_dist, 1, 2).reshape((-1,)) # assign unique cell to each obj per example
		row_idx = T.cast(cell_idx // self.output_shape[1], 'int64')
		col_idx = cell_idx - row_idx * self.output_shape[1]
		num_idx = T.repeat(T.arange(truth.shape[0]).reshape((-1,1)), truth.shape[1], axis=1).reshape((-1,))
		obj_idx = T.repeat(T.arange(truth.shape[1]).reshape((1,-1)), truth.shape[0], axis=0).reshape((-1,))
		
		valid_example = gt_dist[num_idx, obj_idx, cell_idx] < 1 # if example further than 1 away from cell it's a garbage example
		
		num_idx, obj_idx = num_idx[valid_example.nonzero()], obj_idx[valid_example.nonzero()]
		row_idx, col_idx = row_idx[valid_example.nonzero()], col_idx[valid_example.nonzero()]
		
		truth_flat = truth[num_idx, obj_idx, :].dimshuffle(0,'x',1)
		
		pred_matched = output[num_idx,:,:,row_idx, col_idx]
		x, y = x[:,0,0,row_idx, col_idx].dimshuffle(1,0), y[:,0,0,row_idx, col_idx].dimshuffle(1,0)
		w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr').dimshuffle('x',0)
		h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr').dimshuffle('x',0)
	
		# reformat prediction
		pred_shift = pred_matched
		pred_shift = T.set_subtensor(pred_shift[:,:,2], w_acr * T.exp(pred_shift[:,:,2]))
		pred_shift = T.set_subtensor(pred_shift[:,:,3], h_acr * T.exp(pred_shift[:,:,3]))
		pred_shift = T.set_subtensor(pred_shift[:,:,0], pred_shift[:,:,0] + T.repeat(x, pred_shift.shape[1], axis=1) - pred_shift[:,:,2]/2)
		pred_shift = T.set_subtensor(pred_shift[:,:,1], pred_shift[:,:,1] + T.repeat(y, pred_shift.shape[1], axis=1) - pred_shift[:,:,3]/2)
		
		# calculate iou
		xi = T.maximum(pred_shift[:,:,0], truth_flat[:,:,0])
		yi = T.maximum(pred_shift[:,:,1], truth_flat[:,:,1])
		xf = T.minimum(pred_shift[:,:,[0,2]].sum(axis=2), truth_flat[:,:,[0,2]].sum(axis=2))
		yf = T.minimum(pred_shift[:,:,[1,3]].sum(axis=2), truth_flat[:,:,[1,3]].sum(axis=2))
		w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
		
		isec = w * h
		union = T.prod(pred_shift[:,:,[2,3]], axis=2) + T.prod(truth_flat[:,:,[2,3]], axis=2) - isec
		iou = isec / union

		# calculate iou for anchor
		anchors_matched = anchors[num_idx,:,:,row_idx,col_idx]
		xi = T.maximum(anchors_matched[:,:,0], truth_flat[:,:,0])
		yi = T.maximum(anchors_matched[:,:,1], truth_flat[:,:,1])
		xf = T.minimum(anchors_matched[:,:,[0,2]].sum(axis=2), truth_flat[:,:,[0,2]].sum(axis=2))
		yf = T.minimum(anchors_matched[:,:,[1,3]].sum(axis=2), truth_flat[:,:,[1,3]].sum(axis=2))
		w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
		
		isec = w * h
		union = T.prod(anchors_matched[:,:,[2,3]], axis=2) + T.prod(truth_flat[:,:,[2,3]], axis=2) - isec
		iou_acr = isec / union
		
		# match each ground truth to the anchor shape with the highest IoU
		acr_idx = T.argmax(iou_acr, axis=1)
		
		# reformat truth
		truth_formatted = truth_flat
		truth_formatted = T.repeat(truth_formatted, self.boxes.__len__(), axis=1)
		truth_formatted = T.set_subtensor(truth_formatted[:,:,0], truth_formatted[:,:,0] + truth_formatted[:,:,2]/2 - T.repeat(x, truth_formatted.shape[1], axis=1))
		truth_formatted = T.set_subtensor(truth_formatted[:,:,1], truth_formatted[:,:,1] + truth_formatted[:,:,3]/2 - T.repeat(y, truth_formatted.shape[1], axis=1))
		truth_formatted = T.set_subtensor(truth_formatted[:,:,2], T.log(truth_formatted[:,:,2] / w_acr))
		truth_formatted = T.set_subtensor(truth_formatted[:,:,3], T.log(truth_formatted[:,:,3] / h_acr))
		truth_formatted = truth_formatted[T.arange(truth_formatted.shape[0]),acr_idx,:]
		
			
		#
		# calculate cost
		#
		item_idx = T.arange(pred_matched.shape[0])
		anchors = T.set_subtensor(anchors[:,:,:2], 0.)

		cost = 0.

		cost_noobject = lambda_noobj * (T.mean(output[:,:,4]**2) - T.sum(pred_matched[item_idx, acr_idx,4]**2) / output[:,:,4].size)
		cost_anchor = lambda_anchor * (T.mean(T.sum(output[:,:,:4]**2, axis=2)) - T.sum(T.sum(pred_matched[item_idx,acr_idx,:4]**2, axis=1)) / output[:,:,0].size)
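		# cost_noobject / cost_anchor: global means with the matched boxes'
		# contribution subtracted out, so only unmatched predictions are pushed
		# toward zero objectness and toward the anchor prior (zero offsets)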
		cost_coord = lambda_obj * T.mean(T.sum((pred_matched[item_idx,acr_idx,:4] - truth_formatted[:,:4])**2, axis=1))
		cost_class = lambda_obj * T.mean(T.sum(-truth_formatted[:,-self.num_classes:] * T.log(pred_matched[item_idx, acr_idx, -self.num_classes:]), axis=1))

		if rescore:
			cost_obj = lambda_obj * T.mean((pred_matched[item_idx, acr_idx,4] - iou[item_idx, acr_idx])**2)
		else:
			cost_obj = lambda_obj * T.mean((pred_matched[item_idx, acr_idx,4] - 1)**2)
		
		cost = cost_noobject + cost_obj + cost_anchor + cost_coord + cost_class

		return cost, [iou], [row_idx, col_idx, acr_idx, cost_noobject, cost_anchor, cost_coord, cost_class, cost_obj]
Example #59
0
def concat(tensor_list, axis):
    return T.concatenate(tensor_list=tensor_list, axis=axis)
Example #60
0
	def detect(self, im, thresh=0.75, overlap=0.5, num_to_label=None, return_iou=False):
		im = format_image(im, dtype=theano.config.floatX)

		old_size = im.shape[:2]
		im = cv2.resize(im, self.input_shape[::-1], interpolation=cv2.INTER_LINEAR).swapaxes(2,1).swapaxes(1,0).reshape((1,3) + self.input_shape)

		if not hasattr(self, '_detect_fn'):
			'''
			Make Theano do all the heavy lifting for detection; this should speed up the process marginally.
			'''

			output = self.output_test

			if self.use_custom_cost:
				new_output = None
				for i in range(len(self.boxes)):
					cls_idx = T.arange(i * (5 + self.num_classes), (i+1) * (5 + self.num_classes))
					if new_output is None:
						new_output = output[:,cls_idx,:,:].dimshuffle(0,'x',1,2,3)
					else:
						new_output = T.concatenate((new_output, output[:,cls_idx,:,:].dimshuffle(0,'x',1,2,3)), axis=1)
				output = new_output

			thresh_var = T.scalar(name='thresh')

			# define offsets to predictions
			w_cell, h_cell =  1. / self.output_shape[1], 1. / self.output_shape[0]
			x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
			y, x = meshgrid(x, y)

			x, y = x.dimshuffle('x','x',0,1), y.dimshuffle('x','x',0,1)
			
			# define scale
			w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr', borrow=True).dimshuffle('x',0,'x','x')
			h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr', borrow=True).dimshuffle('x',0,'x','x')

			# rescale output
			output = T.set_subtensor(output[:,:,2], w_acr * T.exp(output[:,:,2]))
			output = T.set_subtensor(output[:,:,3], h_acr * T.exp(output[:,:,3]))
			output = T.set_subtensor(output[:,:,0], output[:,:,0] + x - output[:,:,2] / 2)
			output = T.set_subtensor(output[:,:,1], output[:,:,1] + y - output[:,:,3] / 2)
			output = T.set_subtensor(output[:,:,2:4], output[:,:,2:4] + output[:,:,:2])
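			# the outputs are now absolute boxes: x/y shifted by the cell offsets
			# and converted to the top-left corner, w/h scaled as anchor * exp(pred),
			# then columns 2:4 turned into the bottom-right corner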

			# define confidence in prediction
			conf = output[:,:,4] * T.max(output[:,:,-self.num_classes:], axis=2)
			cls = T.argmax(output[:,:,-self.num_classes:], axis=2)

			# filter out all below thresh
			above_thresh_idx = conf > thresh_var			
			pred = T.concatenate(
				(
					output[:,:,0][above_thresh_idx.nonzero()].dimshuffle(0,'x'),
					output[:,:,1][above_thresh_idx.nonzero()].dimshuffle(0,'x'),
					output[:,:,2][above_thresh_idx.nonzero()].dimshuffle(0,'x'),
					output[:,:,3][above_thresh_idx.nonzero()].dimshuffle(0,'x'),
					conf[above_thresh_idx.nonzero()].dimshuffle(0,'x'),
					cls[above_thresh_idx.nonzero()].dimshuffle(0,'x')
				),
				axis=1
			)
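			# keep only detections whose confidence (objectness * best class
			# probability) exceeds the threshold, flattened across batch, anchors
			# and cells into one row per detection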

			iou_matrix = utils.iou_matrix(pred)
			
			self._detect_fn = theano.function([self.input, thresh_var], [pred, iou_matrix])

		output, iou_matrix = self._detect_fn(im, thresh)

		boxes = []
		for i in range(output.shape[0]):
			coord, conf, cls = output[i,:4], output[i,4], output[i,5]
			coord[2:] += coord[:2]
			if num_to_label is not None:
				cls = num_to_label[cls]
			box = utils.BoundingBox(*coord.tolist(), confidence=conf, cls=cls)
			boxes.append(box)

		boxes = [b * old_size for b in boxes]

		if return_iou:
			return boxes, iou_matrix
		else:
			return boxes