Example no. 1
def convolve1d_4D_scan(input, W, mode='full'):
  batch_size, nchannels, nwords, ndim = input.shape
  nkernels_out, nkernels_in, filter_width, ndim = W.shape

  # Unroll filter along columns
  W_unrolled = W.dimshuffle(0, 2, 1, 3).flatten(ndim=3)
  # Replicate input filters 'batch_size' times and squash out_filters along column axis.
  # W_tiled = T.tile(W_unrolled, (1, 1, batch_size)).dimshuffle(1, 0, 2).flatten(ndim=2)  # doesn't give a gradient
  W_tiled = T.alloc(W_unrolled, batch_size, W_unrolled.shape[0], W_unrolled.shape[1], W_unrolled.shape[2]).dimshuffle(1, 2, 0, 3).flatten(ndim=3).dimshuffle(1, 0, 2).flatten(ndim=2)
  W_tiled = W_tiled[::-1]
  # reverse_slicing = [slice(None, None, None)] * W_tiled.ndim
  # reverse_slicing[0] = slice(None, None, -1)
  # reverse_slicing = tuple(reverse_slicing)
  # W_tiled = W_tiled[reverse_slicing]  # flip the kernel

  # Unroll input and pad to fit the output filters.
  input_reshaped = input.dimshuffle(0, 2, 1, 3).flatten(ndim=3).dimshuffle(1,0,2).flatten(ndim=2)
  # input_tiled = T.tile(input_reshaped, (1, nkernels_out))
  input_tiled = T.alloc(input_reshaped, nkernels_out, input_reshaped.shape[0], input_reshaped.shape[1]).dimshuffle(1, 0, 2).flatten(ndim=2)

  if mode == 'full':
    pad = T.zeros((filter_width-1, nkernels_out*batch_size*nchannels*ndim))
    input_padded = T.concatenate([pad, input_tiled, pad])
    conv_out, _ = theano.scan(fn=lambda i: (W_tiled * input_padded[i:i+filter_width]).sum(axis=0),
                              outputs_info=None,
                              sequences=[T.arange(0, nwords+filter_width-1)])
    new_shape = (nwords+filter_width-1, nkernels_out, batch_size, nkernels_in, ndim)
  elif mode == 'valid':
    conv_out, _ = theano.scan(fn=lambda i: (W_tiled * input_tiled[i:i+filter_width]).sum(axis=0),
                              outputs_info=None,
                              sequences=[T.arange(0, nwords-filter_width+1)])
    new_shape = (nwords-filter_width+1, nkernels_out, batch_size, nkernels_in, ndim)

  conv_reshaped = conv_out.reshape(new_shape).dimshuffle(2, 1, 0, 3, 4).sum(axis=3)
  return conv_reshaped
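A minimal usage sketch for the function above (not part of the original example; the imports, shapes, and names below are illustrative). Note that nchannels must equal nkernels_in for the elementwise product inside the scan to line up:

import numpy as np
import theano
import theano.tensor as T

x = T.tensor4('x')   # (batch_size, nchannels, nwords, ndim)
W = T.tensor4('W')   # (nkernels_out, nkernels_in, filter_width, ndim)
out = convolve1d_4D_scan(x, W, mode='valid')
f = theano.function([x, W], out)

x_val = np.random.randn(2, 3, 7, 5).astype(theano.config.floatX)
W_val = np.random.randn(4, 3, 3, 5).astype(theano.config.floatX)
print(f(x_val, W_val).shape)  # (2, 4, 5, 5): (batch, nkernels_out, nwords - filter_width + 1, ndim)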
Example no. 2
    def apply(self, src, mask_length, tgt):
        """
        viterbi algorithm
        """
        result, updates = theano.scan(
            fn=self.train_step,
            sequences=src,
            outputs_info=[self.A_start, None],
            non_sequences=self.A,
            n_steps=mask_length
        )
        # the score of the best path
        best_path_score = result[0][-1].max()
        idx = T.argmax(result[0][-1])
        # backtracking
        res2, _ = theano.scan(
            fn=lambda dps, idx, idx2: [dps[idx], idx],
            sequences=result[1][::-1],
            outputs_info=[idx, idx],
            n_steps=mask_length
        )
        # the path with the best score
        best_path = res2[1]
        #if len(best_path) < seq_len:
        #    best_path.extend((seq_len - len(best_path)) * [2])
        # the score of the tgt path
        tgt_score = self.decode(src, mask_length, tgt)
        # max_margin
        max_margin = T.sum(T.neq(tgt[:mask_length], best_path))
        cost = best_path_score + max_margin - tgt_score
        return T.switch(T.lt(cost, T.alloc(numpy.float32(0.))),
                        T.alloc(numpy.float32(0.)),
                        cost), best_path
Example no. 3
    def define_complete_network(self):
        """Sets connections for predicting all values given all inputs"""

        def step(htm1_f, htm1_b):
            y_t = self.activation[-1](T.dot(htm1_f, self.W_out_f) + T.dot(htm1_b, self.W_out_b) +
                                                                                    self.b)
            return y_t


        padding_f = T.alloc(0, 1, self.forward_rnn.h.shape[1], self.forward_rnn.h.shape[2])
        padding_b = T.alloc(0, 1, self.backward_rnn.h.shape[1], self.backward_rnn.h.shape[2])

        self.y_t, _ = theano.scan(step,
                    sequences=[T.concatenate([padding_f, self.forward_rnn.h[:-1]], axis=0), T.concatenate([self.backward_rnn.h[-2::-1], padding_b], axis=0)],
                    outputs_info=None)


        self.L1 = abs(self.W_out_f.sum()) + abs(self.W_out_b.sum()) + \
                                                        self.forward_rnn.L1 + self.backward_rnn.L1

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.W_out_f ** 2).sum() + (self.W_out_b ** 2).sum() +  \
                                                self.forward_rnn.L2_sqr + self.backward_rnn.L2_sqr

        self.predict = theano.function(
                inputs=[self.x], outputs=self.y_t)

        self.complete_defined = True
Example no. 4
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    '''
    Core of the LSTM computation. Note that state_below is a 3D tensor,
    [n_steps, batch_size, emb_dim], i.e. [time steps, [samples in the word batch, word-embedding dimension]].
    '''
    nsteps = state_below.shape[0]  # size of the leading dimension
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]  # number of samples in the word batch
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):

        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        # x_ is the formal parameter; state_below below is the actual argument
        # the four parameters of _step: x_ is state_below reduced to a sequence of 2D matrices,
        # m_ is mask reduced to a sequence of 1D row vectors
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])  # each new h_ is multiplied by the lstm U matrix, so that f, o, c are no longer zero; f and o are intermediate values
        preact += x_  # sequence of 2D matrices (x_) + 2D matrix = sequence of 2D matrices

        # slice the four stacked sub-blocks of each preact matrix and process them separately
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))  # input-gate slice of preact, with sigmoid activation
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))  # forget gate
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))  # output gate
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))  # cell candidate

        c = f * c_ + i * c
        # [:, None] adds one axis, turning a row vector into a column vector
        c = m_[:, None] * c + (1. - m_)[:, None] * c_  # c_ is the value at the initial or previous time step

        # within each step, h is a 2D matrix, [batch_size, emb_dim]
        h = o * tensor.tanh(c)  # elementwise product (like .* in Octave)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_  # h_ is h at the previous time step, or the initial value from outputs_info

        return h, c  # the returned values correspond to the entries of outputs_info
        # when scan finally returns, the sequence of 2D matrices becomes a 3D tensor and the sequence of vectors becomes a 2D matrix,
        # i.e. scan adds one leading dimension to its outputs

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])  # a 3D tensor times a 2D matrix is still 3D; instead of computing Wx+b at every step, all Wx are precomputed at once

    dim_proj = options['dim_proj']
    # as soon as sequences is non-empty, scan runs in sequence-iteration mode
    # each tensor in theano.scan's sequences loses its leading dimension inside the step function,
    # and the size of that leading dimension determines the number of iterations
    # at every loop over the sequences, one slice along n_steps is taken and passed in as x_
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],  # mask maps to m_, state_below maps to x_; the number of iterations is given by the sequences
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples, # n_samples is the number of samples per word batch (the number of inputs along the LSTM time axis)
                                                           dim_proj), # this tensor initializes the output at step -1; after the first loop it is overwritten and then corresponds to h_ (the previous h)
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj) # overwritten after the first loop; corresponds to c_ (the previous c)
                                              ],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]  # rval[0] is h, rval[1] is c; both are symbolic scan outputs
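A quick numpy illustration (not from the repository above) of the [:, None] masking idiom used in _step, showing how a (batch,) mask broadcasts against a (batch, dim) state:

import numpy as np

m = np.array([1., 0., 1.], dtype='float32')   # mask for 3 samples
h = np.ones((3, 4), dtype='float32')          # new hidden state, (batch, dim)
h_prev = np.zeros((3, 4), dtype='float32')    # previous hidden state
h_new = m[:, None] * h + (1. - m)[:, None] * h_prev
print(h_new)  # the second row keeps h_prev because its mask entry is 0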
Example no. 5
    def apply(self, x):
        W, U, b = self.params
        ndim = self.ndim

        def _slice(x, n, dim):
            return x[:, n * dim:(n + 1) * dim]

        def _step(x_t, h_t, c_t):
            preact = T.dot(h_t, U) + x_t

            i = T.nnet.sigmoid(_slice(preact, 0, self.ndim))
            f = T.nnet.sigmoid(_slice(preact, 1, self.ndim))
            o = T.nnet.sigmoid(_slice(preact, 2, self.ndim))
            c = T.tanh(_slice(preact, 3, self.ndim))

            c = f * c_t + i * c
            h = o * T.tanh(c)

            return h, c

        state_below = T.dot(x, W) + b

        rval, _ = theano.scan(
                _step, [state_below],
                outputs_info = [T.alloc(numpy.float32(0.), x.shape[1], ndim),
                                T.alloc(numpy.float32(0.), x.shape[1], ndim)],
                profile = _doProfile)

        return rval[0]
Example no. 6
    def init(sequence_length):
        initial_V = T.alloc(np.float32(0), sequence_length, size)
        initial_s = T.alloc(np.float32(0), sequence_length)

        def step(t, v, d, u, prev_V, prev_s):
            prev_V_to_t = prev_V[:t]
            prev_s_to_t = prev_s[:t]
            V = T.concatenate([
                prev_V_to_t,
                v.dimshuffle('x', 0),
                initial_V[t + 1:]
            ])

            to_flip = rectify(u - rev_cumsum(prev_s[1:t+1]))
            new_s = rectify(prev_s_to_t - to_flip)

            s = T.concatenate([
                new_s,
                d.dimshuffle('x'),
                initial_s[t + 1:]
            ])

            flip_score = rectify(1 - rev_cumsum(s[1:t+1]))
            score = T.min([new_s, flip_score], axis=0)

            r = T.dot(score, prev_V_to_t) + d * v

            return V, s, r
        return initial_V, initial_s, step
Example no. 7
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

        if has_input_gate:
            if has_forget_gate:
                c = f * c_ + i * c
            else:
                c = c_ + i*c
        else:
            if has_forget_gate:
                c = f*c_ + c
            else:
                c = c_ + c

        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        if has_output_gate:
            h = o * tensor.tanh(c)
        else:
            h = tensor.tanh(c)

        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
Example no. 8
File: lstm.py Project: Yeahhhh/grnn
    def calc_lstm(self, input, mask):

        def _slice(_x, n, dim):
            return _x[:, n * dim:(n + 1) * dim]

        def _step(m_, x_, h_, c_):
            preact = T.dot(h_, self.U)
            preact += x_

            i = T.nnet.sigmoid(_slice(preact, 0, self.n_hidden))
            f = T.nnet.sigmoid(_slice(preact, 1, self.n_hidden))
            o = T.nnet.sigmoid(_slice(preact, 2, self.n_hidden))
            c = T.tanh(_slice(preact, 3, self.n_hidden))

            c = f * c_ + i * c
            c = m_[:, None] * c + (1. - m_)[:, None] * c_

            h = o * T.tanh(c)
            h = m_[:, None] * h + (1. - m_)[:, None] * h_

            return h, c

        n_samples = input.shape[1]

        wx = T.dot(input, self.W) + self.b
        rval, updates = theano.scan(_step,
                                sequences=[mask, wx],
                                outputs_info=[T.alloc(numpy.asarray(0., dtype=numpy.float64),
                                                      n_samples, self.n_hidden),
                                              T.alloc(numpy.asarray(0., dtype=numpy.float64),
                                                      n_samples, self.n_hidden)])

        return rval[0]
Example no. 9
    def output_probabilistic(self, m_w_previous, v_w_previous):
        if (self.non_linear):
            m_in = self.m_w - m_w_previous
            v_in = self.v_w
            # We compute the mean and variance after the ReLU activation
            lam = self.lam
            v_1 = 1 + 2*lam*v_in
            v_1_inv = v_1**-1

            s_1 = T.prod(v_1,axis=1)**-0.5
            v_2 = 1 + 4*lam*v_in
            v_2_inv = v_2**-1
            s_2 = T.prod(v_2,axis=1)**-0.5
            v_inv = v_in**-1
            exponent1 = m_in**2*(1 - v_1_inv)*v_inv
            exponent1 = T.sum(exponent1,axis=1)
            exponent2 = m_in**2*(1 - v_2_inv)*v_inv
            exponent2 = T.sum(exponent2,axis=1)
            m_a = s_1*T.exp(-0.5*exponent1)
            v_a = s_2*T.exp(-0.5*exponent2) - m_a**2

            return (m_a, v_a)

        else:
            m_w_previous_with_bias = \
            T.concatenate([ m_w_previous, T.alloc(1, 1) ], 0)
            v_w_previous_with_bias = \
            T.concatenate([ v_w_previous, T.alloc(0, 1) ], 0)

            m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs)
            v_linear = (T.dot(self.v_w, v_w_previous_with_bias) + \
                T.dot(self.m_w**2, v_w_previous_with_bias) + \
                T.dot(self.v_w, m_w_previous_with_bias**2)) / self.n_inputs
            return (m_linear, v_linear)
Example no. 10
    def mf(self, V, Y = None, return_history = False, niter = None, block_grad = None):

        drop_mask = T.zeros_like(V)

        if Y is not None:
            drop_mask_Y = T.zeros_like(Y)
        else:
            batch_size = V.shape[0]
            num_classes = self.dbm.hidden_layers[-1].n_classes
            assert isinstance(num_classes, int)
            Y = T.alloc(1., V.shape[0], num_classes)
            drop_mask_Y = T.alloc(1., V.shape[0])

        history = self.do_inpainting(X=V,
            Y=Y,
            return_history=True,
            drop_mask=drop_mask,
            drop_mask_Y=drop_mask_Y,
            noise=False,
            niter=niter,
            block_grad=block_grad)

        if return_history:
            return [elem['H_hat'] for elem in history]

        return history[-1]['H_hat']
Example no. 11
def crop_images(data, image_shape, border_width=8, mode=0):
    """ Function used to crop the images by a certain border width.
        data         : input data, theano 4D tensor
        image_shape  : 4-tuple, (batch_size, num_channels, image_rows, image_cols)
        border_width : border width to be cropped, default value 8
        mode         : binary, 0 for random, 1 for centered crop.
    """
    if (mode == 0):
        row_step = image_shape[2] - border_width
        col_step = image_shape[3] - border_width
        output = T.alloc(0., image_shape[0], image_shape[1], row_step, col_step)
        for i in range(image_shape[0]):           
            begin_idx = numpy.random.randint(border_width)
            output = T.set_subtensor(output[i,:,:,:], 
                data[i,:,begin_idx:(begin_idx+row_step),begin_idx:(begin_idx+col_step)])
        return output
    else: 
        row_step = image_shape[2] - border_width
        col_step = image_shape[3] - border_width
        output = T.alloc(0., image_shape[0], image_shape[1], row_step, col_step)
        for i in range(image_shape[0]):           
            begin_idx = border_width // 2  # centered crop offset (integer division)
            output = T.set_subtensor(output[i,:,:,:], 
                data[i,:,begin_idx:(begin_idx+row_step),begin_idx:(begin_idx+col_step)])
        return output
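A hypothetical usage sketch for crop_images (shapes and names below are made up for illustration):

import numpy as np
import theano
import theano.tensor as T

image_shape = (4, 3, 32, 32)            # (batch_size, num_channels, rows, cols)
data = T.tensor4('data')
cropped = crop_images(data, image_shape, border_width=8, mode=1)  # centered crop
f = theano.function([data], cropped)
batch = np.random.rand(*image_shape).astype(theano.config.floatX)
print(f(batch).shape)                   # (4, 3, 24, 24)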
Example no. 12
    def build(self, antialias_samples=4):

        # returns top-level render function and associated variables
        image = T.alloc(0., self.camera.x_dims, self.camera.y_dims, 3)

        #Anti-Aliasing
        sampleDist_x = np.asarray(np.random.random((self.camera.x_dims, self.camera.y_dims,antialias_samples)),dtype=theano.config.floatX)
        sampleDist_y = np.asarray(np.random.random((self.camera.x_dims, self.camera.y_dims,antialias_samples)),dtype=theano.config.floatX)

        for sample in xrange(antialias_samples): #TODO USE SCAN

            #Make Rays
            self.camera.rays = self.camera.make_rays(self.camera.x_dims, self.camera.y_dims,\
                            sampleDist_x=(sampleDist_x[:,:,sample] + sample)/antialias_samples,
                            sampleDist_y=(sampleDist_y[:,:,sample] + sample)/antialias_samples)
            #self.camera.variables.add_child(self.camera.rays.variables)
            image_per_sample = T.alloc(0.0, self.camera.x_dims, self.camera.y_dims, 3)
            min_dists = T.alloc(float('inf'), self.camera.x_dims, self.camera.y_dims)

            # for each shape find its shadings and draw closer shapes on top
            for shape in self.shapes:
                dists = shape.distance(self.camera.rays)
                shadings = self.shader.shade(shape, self.lights, self.camera)
                #for each shape != obj, draw shadow of shape on obj
                #for obj2 in self.shapes:
                #    if obj == obj2: continue
                #    shadings = broadcasted_switch(obj2.shadow(
                #        obj.surface_pts(self.camera.rays), self.lights) < 0, shadings, [0., 0., 0.])
                image_per_sample = broadcasted_switch(dists < min_dists, shadings, image_per_sample)
                min_dists = T.switch(dists < min_dists, dists, min_dists)

            image = image + image_per_sample
        image = image / antialias_samples

        return image
Example no. 13
def RNN_layer(tparams,inputs,mask=None,init_h=None,prefix=None,name='rnn',std=True):
	"""
	inputs: n_steps*n_samples*x_size
	return h
	"""
	prefix=GetPrefix(prefix,name);
	# if length!=None: inputs=inputs[index:index+length,:,:];
	n_steps=inputs.shape[0];
	n_samples=inputs.shape[1];
	x_size=inputs.shape[2];

	hdim=tparams[_p(prefix,'wh')].shape[0];

	if mask is None:
		mask = T.alloc(1., n_steps, n_samples);
	if init_h is None:
		init_h = T.alloc(0., n_samples, hdim);


	def _step(m,x,h):
		inputs_h=( T.dot(x,tparams[_p(prefix,'wx')])+T.dot(h,tparams[_p(prefix,'wh')]) )/2+tparams[_p(prefix,'b')];
		_h=tanh(inputs_h);
		return _h;

	if std:	inputs=standardize(inputs);
	out,updates=theano.scan(lambda m,x,h:_step(m,x,h), 
							sequences=[mask,inputs], 
							outputs_info=[init_h],
							name=_p(prefix,'scan'),
							n_steps=n_steps,
							# truncate_gradient=10,
							profile=False);
	return out
Example no. 14
    def __init__(self, cell, rng, layer_id, shape, X, mask, is_train = 1, batch_size = 1, p = 0.5):
        prefix = "SentDecoderLayer_"
        layer_id = "_" + layer_id
        self.in_size, self.out_size = shape
        self.X = X
        self.summs = batch_size
        
        self.W_hy = init_weights((self.in_size, self.out_size), prefix + "W_hy" + layer_id)
        self.b_y = init_bias(self.out_size, prefix + "b_y" + layer_id)

        if cell == "gru":
            self.decoder = GRULayer(rng, prefix + layer_id, shape, self.X, mask, is_train, 1, p)
            def _active(pre_h, x):
                h = self.decoder._active(x, pre_h)
                y = T.tanh(T.dot(h, self.W_hy) + self.b_y)
                return h, y
            [h, y], updates = theano.scan(_active, n_steps = self.summs, sequences = [],
                                      outputs_info = [{'initial':self.X, 'taps':[-1]},
                                                      T.alloc(floatX(0.), 1, self.out_size)])
        elif cell == "lstm":
            self.decoder = LSTMLayer(rng, prefix + layer_id, shape, self.X, mask, is_train, 1, p)
            def _active(pre_h, pre_c, x):
                h, c = self.decoder._active(x, pre_h, pre_c)
                y = T.tanh(T.dot(h, self.W_hy) + self.b_y)
                return h, c, y
            [h, c, y], updates = theano.scan(_active, n_steps = self.summs, sequences = [],
                                             outputs_info = [{'initial':self.X, 'taps':[-1]},
                                                             {'initial':self.X, 'taps':[-1]},
                                                             T.alloc(floatX(0.), 1, self.out_size)])
       
        y = T.reshape(y, (self.summs, self.out_size))
        self.activation = y

        self.params = self.decoder.params + [self.W_hy, self.b_y]
Example no. 15
def gru_layer(tparams, state_below, init_state, options, prefix='gru', mask=None, **kwargs):
    """
    Feedforward pass through GRU
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[_p(prefix,'Ux')].shape[1]

    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
    U = tparams[_p(prefix, 'U')]
    Ux = tparams[_p(prefix, 'Ux')]

    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info = [init_state],
                                non_sequences = [tparams[_p(prefix, 'U')],
                                                 tparams[_p(prefix, 'Ux')]],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                profile=False,
                                strict=True)
    rval = [rval]
    return rval
Example no. 16
def symbolic_lstm(input, W, b, n_hidden, input_layer, init_hidden=None, prefix="lstm"):
    def _slice(_x, n, dim):
        return _x[n*dim:(n+1) * dim]

    def _step(x_, h_, c_):
        preact = tensor.dot(tensor.concatenate((h_, input_layer(x_, h_))), W)
        preact += b

        i = nnet.sigmoid(_slice(preact, 0, n_hidden))
        f = nnet.sigmoid(_slice(preact, 1, n_hidden))
        o = nnet.sigmoid(_slice(preact, 2, n_hidden))
        c = nnet.sigmoid(_slice(preact, 3, n_hidden))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)
        return h, c

    if init_hidden is None:
        init_hidden = tensor.alloc(numpy_floatX(0.), n_hidden)

    rval, updates = theano.scan(_step,
                                sequences=[input],
                                outputs_info=[init_hidden,
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_hidden)],
                                name=_p(prefix, '_layers'))
    return rval[0]
Example no. 17
	def encode(self, state_below):
		"""
		:development:
			(1) may need to prepend encoding_length * padding array to the state_below to produce the same length sequence as state_below
			(2) can return an offset encoding by only returning certain indices of the encoding (though this is pretty wasteful)

		:type state_below: 2d tensor
		:param state_below: the entire sequence of states from the layer below the current one

		:type rval: 2d tensor
		:param rval: an encoding of the state_below (the entire sequence of state) to be passed to the above layer
		"""

		total_sequence_length = T.cast(state_below.shape[0], theano.config.floatX)
		self.n_encodings = T.cast(T.ceil(total_sequence_length / self.encoding_length), 'int32')
		self.n_padding_timesteps = T.cast(self.n_encodings * self.encoding_length - total_sequence_length, 'int32')
		zeros = T.alloc(np.cast[theano.config.floatX](0), self.n_padding_timesteps, self.n_vis)
		state_below = T.concatenate((zeros, state_below))

		Wxh = self.Wxh
		bxh = self.bxh
		Whhe = self.Whhe

		state_below = state_below.reshape((self.encoding_length, self.n_encodings, self.n_vis))
		state_below = T.dot(state_below, Wxh) + bxh
		
		# a single output will be n_encoding rows with n_hid features each
		encoding_0 = T.alloc(np.cast[theano.config.floatX](0), self.n_encodings, self.n_hid)

		encodings, updates = scan(fn=self.encode_step, sequences=[state_below], outputs_info=[encoding_0], non_sequences=[Whhe])
		# encodings is a 3d vector (encoding_length, n_encodings, n_hid)
		# returns encodings[-1] in 2d vector shape = (n_encodings, n_hid)
		return encodings[-1]
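The padding/reshaping bookkeeping in encode() is easier to see with concrete numbers; a small numpy sketch with made-up sizes (7 time steps, encoding_length=3, n_vis=2):

import numpy as np

encoding_length, n_vis = 3, 2
state_below = np.arange(14, dtype='float32').reshape(7, n_vis)
n_encodings = int(np.ceil(state_below.shape[0] / float(encoding_length)))   # 3
n_padding_timesteps = n_encodings * encoding_length - state_below.shape[0]  # 2
padded = np.concatenate([np.zeros((n_padding_timesteps, n_vis), dtype='float32'), state_below])
chunks = padded.reshape(encoding_length, n_encodings, n_vis)
print(chunks.shape)  # (3, 3, 2): scan then iterates encoding_length times over (n_encodings, n_vis) matrices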
Example no. 18
    def layer_output(self, state_blow, tag_blow, mask=None):
        """
        :type tag_blow: object
        """
        nsteps = state_blow.shape[0]
        if state_blow.ndim == 3:
            nsamples = state_blow.shape[1]
        else:
            nsamples = 1
        # assert
        assert mask is not None

        state_blow = tensor.dot(state_blow, self.w) + tensor.dot(tag_blow, self.v)+ self.b

        results, updates = theano.scan(
            fn=self._step,
            sequences=[mask, state_blow],
            outputs_info=[tensor.alloc(numpy_floatX(0.),
                                       nsamples,
                                       self.mem_dim),
                          tensor.alloc(numpy_floatX(0.),
                                       nsamples,
                                       self.mem_dim)],
            n_steps=nsteps,
            name=self.name + '_layer'
        )

        return results[0]
Example no. 19
def lstm_function(state_below, n_hidden, W, U, b, prefix="lstm", truncate_gradient=-1):
    def _slice(_x, n, dim):
        return _x[n*dim:(n+1) * dim]

    def _step(x_, h_, c_):
        preact = tensor.dot(h_, U)
        preact += x_

        i = nnet.sigmoid(_slice(preact, 0, n_hidden))
        f = nnet.sigmoid(_slice(preact, 1, n_hidden))
        o = nnet.sigmoid(_slice(preact, 2, n_hidden))
        c = tensor.tanh(_slice(preact, 3, n_hidden))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)
        return h, c

    init_hidden = tensor.alloc(numpy_floatX(0.), n_hidden)
    state_below = tensor.dot(state_below, W) + b
    rval, updates = theano.scan(_step,
                                sequences=[state_below],
                                outputs_info=[init_hidden,
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_hidden)],
                                name=_p(prefix, '_layers'),
                                truncate_gradient=truncate_gradient)
    return rval[0]
Example no. 20
    def fprop(self, data):
        if self.use_ground_truth:
            self.input_space.validate(data)
            features, phones = data

            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda f, p, h, o: self.fprop_step(f, p, h, o)

            ((h, out), updates) = theano.scan(fn=fn,
                                              sequences=[features, phones],
                                              outputs_info=[dict(initial=init_h,
                                                                 taps=[-1]),
                                                            init_out])
            return out
        else:
            self.input_space.validate(data)
            features, phones = data

            init_in = features[0]
            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda t, p, f, h, o: self.fprop_step_prime(t, p, f, h, o)

            ((f, h, out), updates) = theano.scan(fn=fn,
                                                 sequences=[features, phones],
                                                 outputs_info=[init_in,
                                                               dict(initial=init_h,
                                                                    taps=[-1]),
                                                               init_out])
            return out
Example no. 21
    def outputs_info(self, n_samples):
        # initialize hidden states: c, h
        shape = (n_samples,) + self.output_shape
        return [
            T.unbroadcast(T.alloc(numpy.asarray(0., dtype=theano.config.floatX), *shape), *range(len(shape))),  # c
            T.unbroadcast(T.alloc(numpy.asarray(0., dtype=theano.config.floatX), *shape), *range(len(shape)))   # h
        ]
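The T.unbroadcast above is there because T.alloc marks any dimension given as a constant 1 as broadcastable, and scan rejects an initial state whose broadcast pattern differs from the step output. A minimal sketch (shapes made up):

import theano.tensor as T

a = T.alloc(0., 1, 5)      # constant leading 1 -> broadcastable first dimension
print(a.broadcastable)     # (True, False)
b = T.unbroadcast(a, 0)    # strip the broadcastable flag before passing to outputs_info
print(b.broadcastable)     # (False, False)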
Example no. 22
    def encode_lstm(self, x, mask):
        def _step(m_tm_1, x_t, h_tm_1, c_tm_1):
            lstm_preactive = T.dot(h_tm_1, self.encode_U) + \
                             T.dot(x_t, self.encode_W) + \
                             self.encode_b

            i = T.nnet.sigmoid(lstm_preactive[:,0:self.hidden_dim])
            f = T.nnet.sigmoid(lstm_preactive[:,self.hidden_dim:self.hidden_dim*2])
            o = T.nnet.sigmoid(lstm_preactive[:,self.hidden_dim*2:self.hidden_dim*3])
            c = T.tanh(lstm_preactive[:,self.hidden_dim*3:self.hidden_dim*4])

            c = f*c_tm_1 + i*c
            c = m_tm_1[:,None]*c + (1.-m_tm_1)[:,None]*c_tm_1

            h = o*T.tanh(c)
            h = m_tm_1[:,None]*h + (1.-m_tm_1)[:,None]*h_tm_1

            return [h,c]

        h0 = T.alloc(0., x.shape[1], self.hidden_dim)
        c0 = T.alloc(0., x.shape[1], self.hidden_dim)

        rval, updates = theano.scan(
                fn=_step,
                sequences=[mask,x],
                outputs_info=[h0,c0]
                )

        h_list, c_list = rval
        return h_list
Example no. 23
    def encode(self, state_below):
        """
		:development:
			(1) may need to prepend encoding_length * padding array to the state_below to produce the same length sequence as state_below
			(2) can return an offset encoding by only returning certain indices of the encoding (though this is pretty wasteful)

		:type state_below: 2d tensor
		:param state_below: the entire sequence of states from the layer below the current one

		:type rval: 2d tensor
		:param rval: an encoding of the state_below (the entire sequence of state) to be passed to the above layer
		"""
        # to make the encodings start with the first state in state_below, prepend encoding_length - 1 zero vectors
        zeros = T.alloc(np.cast[theano.config.floatX](0), self.encoding_length - 1, self.n_hid)
        state_below = T.concatenate((zeros, state_below))

        encoding_0 = T.alloc(np.cast[theano.config.floatX](0), self.n_hid)
        # negative, reverse indicies for the taps
        # e.g., [-4, -3, -2, -1, -0] would pass those indicies from state_below to the encode_step
        taps = [-1 * tap for tap in range(self.encoding_length)[::-1]]
        encodings, updates = scan(
            fn=self.encode_subsequence, sequences=dict(input=state_below, taps=taps), outputs_info=[encoding_0]
        )

        return encodings
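A toy sketch (not from the repository above) of how scan's sequence taps behave, which is what the taps list in encode() relies on: at step t the inner function receives x[t + tap] for every tap, and scan only iterates over the positions where all taps are valid:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
avg, _ = theano.scan(fn=lambda xm2, xm1, x0: (xm2 + xm1 + x0) / 3.,
                     sequences=dict(input=x, taps=[-2, -1, 0]))
f = theano.function([x], avg)
print(f(np.arange(6, dtype=theano.config.floatX)))  # expected: [1. 2. 3. 4.], one average per valid 3-element window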
Example no. 24
def arc_distance_theano_alloc_prepare(dtype='float64'):
    """
    Calculates the pairwise arc distance between all points in vector a and b.
    """
    a = tensor.matrix(dtype=str(dtype))
    b = tensor.matrix(dtype=str(dtype))
    # Theano don't implement all case of tile, so we do the equivalent with alloc.
    #theta_1 = tensor.tile(a[:, 0], (b.shape[0], 1)).T
    theta_1 = tensor.alloc(a[:, 0], b.shape[0], b.shape[0]).T
    phi_1 = tensor.alloc(a[:, 1], b.shape[0], b.shape[0]).T

    theta_2 = tensor.alloc(b[:, 0], a.shape[0], a.shape[0])
    phi_2 = tensor.alloc(b[:, 1], a.shape[0], a.shape[0])

    temp = (tensor.sin((theta_2 - theta_1) / 2)**2
            +
            tensor.cos(theta_1) * tensor.cos(theta_2)
            * tensor.sin((phi_2 - phi_1) / 2)**2)
    distance_matrix = 2 * (tensor.arctan2(tensor.sqrt(temp),
                                          tensor.sqrt(1 - temp)))
    name = "arc_distance_theano_alloc"
    rval = theano.function([a, b],
                           distance_matrix,
                           name=name)
    rval.__name__ = name

    return rval
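A small usage example for the function above; note that the alloc-based tiling assumes a and b contain the same number of points:

import numpy as np

arc_distance = arc_distance_theano_alloc_prepare('float64')
a = np.random.uniform(low=0., high=np.pi, size=(5, 2))  # (theta, phi) pairs in radians
b = np.random.uniform(low=0., high=np.pi, size=(5, 2))
print(arc_distance(a, b).shape)  # (5, 5) matrix of pairwise arc distances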
Example no. 25
    def sample(self, x0=None, h0=None, c0=None, n_samples=10, n_steps=10,
               condition_on=None, debug=False):
        if x0 is None:
            x0, _ = self.output_net.sample(
                p=T.constant(0.5).astype(floatX),
                size=(n_samples, self.output_net.dim_out)).astype(floatX)

        if h0 is None:
            h0 = T.alloc(0., x0.shape[0], self.dim_h).astype(floatX)
        if c0 is None:
            c0 = T.alloc(0., x0.shape[0], self.dim_h).astype(floatX)
        z0 = self.output_net.preact(h0)

        seqs = []
        outputs_info = [h0, c0, x0, None]
        non_seqs = []
        step = self.step_sample
        p0 = self.output_net.distribution(z0)

        non_seqs += self.get_sample_params()
        if debug:
            return self.step_sample(h0, x0, *self.get_sample_params())

        outs = scan(step, seqs, outputs_info, non_seqs, n_steps,
                    name=self.name+'_sampling', strict=False)
        (h, c, x, p), updates = outs

        x = concatenate([x0[None, :, :], x])
        h = concatenate([h0[None, :, :], h])
        p = concatenate([p0[None, :, :], p])

        return OrderedDict(x=x, p=p, h=h, x0=x0, p0=p0, h0=h0), updates
Example no. 26
File: nn.py Project: dongx-duan/crf
  def __call__(self, input):
    nh = self.hidden_size
    # _in: input of t
    # _m : output of t - 1
    # _c : memory of t - 1 
    def _step(_in, _m, _c, nh):
      _x = tensor.concatenate([numpy.asarray([1.], dtype=numpy.float32), _in, _m])
      ifog = tensor.dot(_x, self.W)

      i = tensor.nnet.sigmoid(ifog[ : nh])
      f = tensor.nnet.sigmoid(ifog[nh : 2*nh])
      o = tensor.nnet.sigmoid(ifog[2*nh : 3*nh])
      g = tensor.tanh(ifog[3*nh : ])

      _c = f * _c + i * g
      _m = o * _c
      return _m, _c
    self._step = _step

    results, update = theano.scan(
        _step, 
        sequences=[input],
        outputs_info=[tensor.alloc(0.0, nh), tensor.alloc(0.0, nh)],
        non_sequences=[self.hidden_size]
      )
    return results[0] #(_m_list, _c_list)[0]
Example no. 27
    def decode(self, hidden):
        hidden_ = T.alloc(0.,*self.hidden_shape)
        deconv_out = T.alloc(0.,*self.output_shape)
       
        # Zero padding. Is there a simpler way to write this?
        hidden_ = T.set_subtensor(hidden_[:,:,:,self.filter_shape[3]-1:],hidden)

        # Calculate output
        conv_odd = conv.conv2d(
            input = hidden_,
            filters = self.W_odd,
            filter_shape = self.filter_shape,
            image_shape = self.hidden_shape,)
        conv_even = conv.conv2d(
            input = hidden_,
            filters = self.W_even,
            filter_shape = self.filter_shape,
            image_shape = self.hidden_shape,)
        
        deconv_out = T.set_subtensor(deconv_out[:,:,:,::2], conv_odd)
        deconv_out = T.set_subtensor(deconv_out[:,:,:,1::2], conv_even)

        linout = deconv_out + self.b.dimshuffle('x',0,'x','x')
        
        if self.dec_hid == 'tanh':
            convout= T.tanh(linout)
        elif self.dec_hid == 'lin':
            convout=linout
        elif self.dec_hid == 'relu':
            convout=linout * (linout > 0.) + 0. * (linout < 0.)
        else:
            raise ValueError('Invalid dec_hid')
        #### Recurrent connection####
        return convout
Example no. 28
    def generate_lstm(self, context):
        x0 = T.alloc(0., context.shape[0], self.embedding_dim)
        h0 = T.alloc(0., context.shape[0], self.hidden_dim)
        c0 = T.alloc(0., context.shape[0], self.hidden_dim)

        def _step(x_tm_1, h_tm_1, c_tm_1):
            lstm_preactive = T.dot(h_tm_1, self.decode_U)+ \
                             T.dot(context, self.decode_V)+ \
                             T.dot(x_tm_1, self.decode_W) + \
                             self.decode_b

            i = T.nnet.sigmoid(lstm_preactive[:,0:self.hidden_dim])
            f = T.nnet.sigmoid(lstm_preactive[:,self.hidden_dim:self.hidden_dim*2])
            o = T.nnet.sigmoid(lstm_preactive[:,self.hidden_dim*2:self.hidden_dim*3])
            c = T.tanh(lstm_preactive[:,self.hidden_dim*3:self.hidden_dim*4])

            c = f*c_tm_1 + i*c
            h = o*T.tanh(c)

            x_emb = T.dot(h, self.output_W) + self.output_b # (n_samples, embedding_dim)
            x_word = T.dot(x_emb, self.word_W) + self.word_b # (n_samples, n_words)

            x_index = T.argmax(x_word, axis=1)
            x = self.emb[x_index]

            return [x,h,c]

        rval, updates = theano.scan(
                fn=_step,
                outputs_info=[x0,h0,c0],
                n_steps=20)

        generated_sequence = rval[0]
        return generated_sequence
Example no. 29
def ENCODER_R(X, tparams, options):

    # (tensor.alloc(numpy_floatX(1.), options['hidden_size'], 1)-tensor.nnet.sigmoid(tensor.dot(tparams['Wr_Z'], xr) + tensor.dot(tparams['Ur_Z'], hr_tm1))) * hr_tm1\
    #  + tensor.nnet.sigmoid(tensor.dot(tparams['Wr_Z'], xr) + tensor.dot(tparams['Ur_Z'], hr_tm1)) * tensor.tanh(\
    #     tensor.dot(tparams['Wr'], xr) + tensor.dot(tparams['Ur'],  \
    #         (tensor.nnet.sigmoid(tensor.dot(tparams['Wr_R'], xr) + \
    #             tensor.dot(tparams['Ur_R'], hr_tm1)) * hr_tm1)\
    #         )\
    #     )

    # (tensor.alloc(numpy_floatX(1.), options['hidden_size'])-tensor.nnet.sigmoid(tensor.dot\
    # (tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, tparams['Ur_Z']))) * hr_tm1\
    #  + tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, \
    # tparams['Ur_Z'])) *  tensor.tanh(tensor.dot(tparams["Emb"][xr], tparams['Wr']) + \
    #     tensor.dot((tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_R']) + tensor\
    # .dot(hr_tm1, tparams['Ur_R'])) * hr_tm1) , tparams['Ur']))\

    # tparams["Emb"][xr]
    # X_Vec = word2VecLayer(X, tparams)

    results_r, updates = theano.scan(lambda xr, hr_tm1:    (tensor.alloc(numpy_floatX(1.), options['hidden_size'])\
        -tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, tparams['Ur_Z']))) * hr_tm1\
     + tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, \
        tparams['Ur_Z'])) *  tensor.tanh(tensor.dot(tparams["Emb"][xr], tparams['Wr']) + \
        tensor.dot((tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_R']) + tensor.\
            dot(hr_tm1, tparams['Ur_R'])) * hr_tm1) , tparams['Ur']))\
     ,  sequences=[X], outputs_info=tensor.alloc(numpy_floatX(0.), options['hidden_size']))
    # the initial (outputs_info) value of this scan can only be a vector

    return results_r  # the sequence of hidden states h_right: l vectors of length n, i.e. shape (l, n)
Example no. 30
    def __init__(self, input_var, 
                layerid,sequence,n_input_channels=1,
                height=3,width=3,n_filters=8):
        
        X = input_var
        imH, imW = X.shape[-2],X.shape[-1]

        H, W, F, C = height, width, n_filters, n_input_channels
        Tt, N = input_var.shape[0],input_var.shape[1]
        self.n_filters = n_filters


        self.Wx = shared(self.glorot_init(H*W*F,4*F,C,H,W), name='Wx'+layerid)
        self.Wh = shared(self.glorot_init(4*H*W*F,4*F,F,H,W),name='Wh'+layerid)
        self.b = shared(np.zeros(4*F,dtype=np.float32),name='b'+layerid)

        self.params = {
            self.Wx.name: self.Wx,
            self.Wh.name: self.Wh,
            self.b.name: self.b
            }

        [h,c], _ = scan(self.step,
            sequences=[X],
            outputs_info=[
                        T.alloc(np.cast['float32'](0), N,F,imH,imW),
                        T.alloc(np.cast['float32'](0), N,F,imH,imW)
                        ])

        self.output = h
Example no. 31
            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(cgv, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}
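A minimal, self-contained sketch (with made-up toy shapes) of the Rop/Lop pattern used in Gv_step above for a Gauss-Newton vector product: the R-operator gives J*v and the L-operator then gives J^T*(J*v):

import numpy as np
import theano
import theano.tensor as TT

W = theano.shared(np.random.randn(3, 2).astype(theano.config.floatX), name='W')
x = TT.matrix('x')
v = TT.matrix('v')                # the vector to multiply with, same shape as W
y = TT.tanh(TT.dot(x, W))         # toy network output
Jv = TT.Rop(y, W, v)              # J v
Gv = TT.Lop(y, W, Jv)             # J^T (J v), i.e. a Gauss-Newton product with an identity output metric
f = theano.function([x, v], Gv)
print(f(np.random.randn(5, 3).astype(theano.config.floatX),
        np.random.randn(3, 2).astype(theano.config.floatX)).shape)  # (3, 2), same shape as W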
Example no. 32
def gru_cond_layer(tparams,
                   state_below,
                   options,
                   prefix='gru',
                   mask=None,
                   context=None,
                   one_step=False,
                   init_memory=None,
                   init_state=None,
                   context_mask=None,
                   emb_dropout=None,
                   rec_dropout=None,
                   ctx_dropout=None,
                   pctx_=None,
                   truncate_gradient=-1,
                   profile=False,
                   **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    dim = tparams[pp(prefix, 'Wcx')].shape[1]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    # projected context
    assert context.ndim == 3, 'Context must be 3-d: #annotation x #sample x dim'
    if pctx_ is None:
        pctx_ = tensor.dot(context*ctx_dropout[0], tparams[pp(prefix, 'Wc_att')]) +\
            tparams[pp(prefix, 'b_att')]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # state_below is the previous output word embedding
    state_belowx = tensor.dot(state_below*emb_dropout[0], tparams[pp(prefix, 'Wx')]) +\
        tparams[pp(prefix, 'bx')]
    state_below_ = tensor.dot(state_below*emb_dropout[1], tparams[pp(prefix, 'W')]) +\
        tparams[pp(prefix, 'b')]

    def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, rec_dropout,
                    ctx_dropout, U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, U_nl,
                    Ux_nl, b_nl, bx_nl):

        preact1 = tensor.dot(h_ * rec_dropout[0], U)
        preact1 += x_
        preact1 = tensor.nnet.sigmoid(preact1)

        r1 = _slice(preact1, 0, dim)
        u1 = _slice(preact1, 1, dim)

        preactx1 = tensor.dot(h_ * rec_dropout[1], Ux)
        preactx1 *= r1
        preactx1 += xx_

        h1 = tensor.tanh(preactx1)

        h1 = u1 * h_ + (1. - u1) * h1
        h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_

        # attention
        pstate_ = tensor.dot(h1 * rec_dropout[2], W_comb_att)
        pctx__ = pctx_ + pstate_[None, :, :]
        #pctx__ += xc_
        pctx__ = tensor.tanh(pctx__)
        alpha = tensor.dot(pctx__ * ctx_dropout[1], U_att) + c_tt
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha - alpha.max(0, keepdims=True))
        if context_mask:
            alpha = alpha * context_mask
        alpha = alpha / alpha.sum(0, keepdims=True)
        ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context

        preact2 = tensor.dot(h1 * rec_dropout[3], U_nl) + b_nl
        preact2 += tensor.dot(ctx_ * ctx_dropout[2], Wc)
        preact2 = tensor.nnet.sigmoid(preact2)

        r2 = _slice(preact2, 0, dim)
        u2 = _slice(preact2, 1, dim)

        preactx2 = tensor.dot(h1 * rec_dropout[4], Ux_nl) + bx_nl
        preactx2 *= r2
        preactx2 += tensor.dot(ctx_ * ctx_dropout[3], Wcx)

        h2 = tensor.tanh(preactx2)

        h2 = u2 * h1 + (1. - u2) * h2
        h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1

        return h2, ctx_, alpha.T  # pstate_, preact, preactx, r, u

    seqs = [mask, state_below_, state_belowx]
    #seqs = [mask, state_below_, state_belowx, state_belowc]
    _step = _step_slice

    shared_vars = [
        tparams[pp(prefix, 'U')], tparams[pp(prefix, 'Wc')],
        tparams[pp(prefix, 'W_comb_att')], tparams[pp(prefix, 'U_att')],
        tparams[pp(prefix, 'c_tt')], tparams[pp(prefix, 'Ux')],
        tparams[pp(prefix, 'Wcx')], tparams[pp(prefix, 'U_nl')],
        tparams[pp(prefix,
                   'Ux_nl')], tparams[pp(prefix,
                                         'b_nl')], tparams[pp(prefix, 'bx_nl')]
    ]

    if one_step:
        rval = _step(*(
            seqs +
            [init_state, None, None, pctx_, context, rec_dropout, ctx_dropout
             ] + shared_vars))
    else:
        rval, updates = theano.scan(
            _step,
            sequences=seqs,
            outputs_info=[
                init_state,
                tensor.alloc(0., n_samples, context.shape[2]),
                tensor.alloc(0., n_samples, context.shape[0])
            ],
            non_sequences=[pctx_, context, rec_dropout, ctx_dropout] +
            shared_vars,
            name=pp(prefix, '_layers'),
            n_steps=nsteps,
            truncate_gradient=truncate_gradient,
            profile=profile,
            strict=True)
    return rval
Example no. 33
def gru_layer(tparams,
              state_below,
              options,
              prefix='gru',
              mask=None,
              emb_dropout=None,
              rec_dropout=None,
              truncate_gradient=-1,
              profile=False,
              **kwargs):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[pp(prefix, 'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # utility function to slice a tensor
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # state_below is the input word embeddings
    # input to the gates, concatenated
    state_below_ = tensor.dot(state_below*emb_dropout[0], tparams[pp(prefix, 'W')]) + \
        tparams[pp(prefix, 'b')]
    # input to compute the hidden state proposal
    state_belowx = tensor.dot(state_below*emb_dropout[1], tparams[pp(prefix, 'Wx')]) + \
        tparams[pp(prefix, 'bx')]

    # step function to be used by scan
    # arguments    | sequences |outputs-info| non-seqs
    def _step_slice(m_, x_, xx_, h_, U, Ux, rec_dropout):

        preact = tensor.dot(h_ * rec_dropout[0], U)
        preact += x_

        # reset and update gates
        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        # compute the hidden state proposal
        preactx = tensor.dot(h_ * rec_dropout[1], Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        # hidden state proposal
        h = tensor.tanh(preactx)

        # leaky integrate and obtain next hidden state
        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    # prepare scan arguments
    seqs = [mask, state_below_, state_belowx]
    init_states = [tensor.alloc(0., n_samples, dim)]
    _step = _step_slice
    shared_vars = [
        tparams[pp(prefix, 'U')], tparams[pp(prefix, 'Ux')], rec_dropout
    ]

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=init_states,
                                non_sequences=shared_vars,
                                name=pp(prefix, '_layers'),
                                n_steps=nsteps,
                                truncate_gradient=truncate_gradient,
                                profile=profile,
                                strict=True)
    rval = [rval]
    return rval
Example no. 34
def gru_layer(tparams,
              state_below,
              options,
              prefix='gru',
              mask=None,
              **kwargs):
    """
    Forward pass through GRU layer
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    state_below_ = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(
        prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
    U = tparams[_p(prefix, 'U')]
    Ux = tparams[_p(prefix, 'Ux')]

    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(
        _step,
        sequences=seqs,
        outputs_info=[tensor.alloc(0., n_samples, dim)],
        non_sequences=[tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]],
        name=_p(prefix, '_layers'),
        n_steps=nsteps,
        profile=profile,
        strict=True)
    rval = [rval]
    return rval
Example no. 35
def BlockGLSTMScanArrayToArray(rng,
                               inlayer,
                               szgate,
                               szhidden,
                               blocksize=10,
                               warmup=10,
                               outf=T.tanh,
                               noot=False,
                               backwards=False,
                               shareLayer=None,
                               warmupHidden=None,
                               warmupOut=None):
    if backwards:
        inout = inlayer.output[::-1]
    else:
        inout = inlayer.output

    if warmupHidden is not None:
        if backwards:
            whid = warmupHidden.output[::-1]
        else:
            whid = warmupHidden.output

    if warmupOut is not None:
        if backwards:
            wout = warmupOut.output[::-1]
        else:
            wout = warmupOut.output

    #PrepareData
    totblks = (inlayer.output.shape[0] + blocksize - 1) / blocksize

    def oneStep(inp, laststate, lastout):
        inl = SymbolLayer(inp, (totblks, inlayer.output_shape[1]))
        lstmout = LCollect(
            GLSTM(rng,
                  inl,
                  laststate,
                  lastout,
                  szgate,
                  szhidden,
                  outf=outf,
                  noot=noot,
                  shareLayer=shareLayer))
        return lstmout.hidden, lstmout.output

    stackinp = T.alloc(dtypeX(0), totblks, blocksize + warmup,
                       inlayer.output_shape[1])

    #Fill block data
    stackinp = T.set_subtensor(
        stackinp[:-1, warmup:], inout[:(totblks - 1) * blocksize].reshape(
            (totblks - 1, blocksize, inlayer.output.shape[1])))
    stackinp = T.set_subtensor(
        stackinp[-1, warmup:warmup + inlayer.output.shape[0] -
                 (totblks - 1) * blocksize],
        inout[(totblks - 1) * blocksize:].reshape(
            (inlayer.output.shape[0] - (totblks - 1) * blocksize,
             inlayer.output.shape[1])))
    #Fill block warmup data
    stackinp = T.set_subtensor(stackinp[1:, :warmup], stackinp[:-1, -warmup:])
    stackinp = stackinp.dimshuffle(1, 0, 2)
    LPush()
    #A large number
    firsthidden = T.alloc(
        dtypeX(0), totblks, szhidden
    )  #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks]
    if warmupHidden:
        firsthidden = T.set_subtensor(
            firsthidden[warmup // blocksize + 1:],
            whid[-warmup + blocksize * (warmup // blocksize + 1):-warmup +
                 blocksize * totblks:blocksize])
    firstout = T.alloc(
        dtypeX(0), totblks, szhidden
    )  #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks]
    if warmupOut:
        firstout = T.set_subtensor(
            firstout[warmup // blocksize + 1:],
            wout[-warmup + blocksize * (warmup // blocksize + 1):-warmup +
                 blocksize * totblks:blocksize])
    (hiddens,
     outs), updates = theano.scan(fn=oneStep,
                                  outputs_info=[firsthidden, firstout],
                                  sequences=stackinp)
    lstml = LPop()[0]
    #ExpandData
    hiddens = hiddens.dimshuffle(1, 0, 2)
    hiddens = hiddens[:, warmup:].reshape(
        (totblks * blocksize, szhidden))[:inlayer.output.shape[0]]
    outs = outs.dimshuffle(1, 0, 2)
    outs = outs[:, warmup:].reshape(
        (totblks * blocksize, szhidden))[:inlayer.output.shape[0]]
    if backwards:
        hiddens = hiddens[::-1]
        outs = outs[::-1]
    global extraHid
    extraHid = SymbolLayer(hiddens, (inlayer.output_shape[0], szhidden))
    return SymbolLayer(outs, (inlayer.output_shape[0], szhidden)), lstml
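
The block/warmup packing done above with T.alloc and T.set_subtensor is easier to see on its own; the sketch below reproduces just that data-preparation step for a plain 2-D sequence, with made-up blocksize and warmup values (it is not part of the layer above).

import numpy
import theano
import theano.tensor as T

blocksize, warmup, dim = 4, 2, 3
seq = T.matrix('seq')                       # n_steps x dim
n_steps = seq.shape[0]
totblks = (n_steps + blocksize - 1) // blocksize

stack = T.alloc(0., totblks, blocksize + warmup, dim)
# full blocks go into the part after the warmup region
stack = T.set_subtensor(
    stack[:-1, warmup:],
    seq[:(totblks - 1) * blocksize].reshape((totblks - 1, blocksize, dim)))
# the last, possibly partial, block
stack = T.set_subtensor(
    stack[-1, warmup:warmup + n_steps - (totblks - 1) * blocksize],
    seq[(totblks - 1) * blocksize:])
# each block's warmup region is the tail of the previous block
stack = T.set_subtensor(stack[1:, :warmup], stack[:-1, -warmup:])

f = theano.function([seq], stack)
print(f(numpy.arange(30, dtype=theano.config.floatX).reshape(10, 3)).shape)  # (3, 6, 3)
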
Esempio n. 36
0
def BlockLSTMUnrollArrayToArray(rng,
                                inlayer,
                                szhidden,
                                blocksize=10,
                                warmup=10,
                                outf=T.tanh,
                                noot=False,
                                backwards=False,
                                shareLayer=None,
                                warmupHidden=None,
                                warmupOut=None):
    if backwards:
        inout = inlayer.output[::-1]
    else:
        inout = inlayer.output

    if warmupHidden is not None:
        if backwards:
            whid = warmupHidden.output[::-1]
        else:
            whid = warmupHidden.output

    if warmupOut is not None:
        if backwards:
            wout = warmupOut.output[::-1]
        else:
            wout = warmupOut.output

    #PrepareData
    totblks = (inlayer.output.shape[0] + blocksize - 1) // blocksize

    def oneStep(inp, laststate, lastout):
        inl = SymbolLayer(inp, (totblks, inlayer.output_shape[1]))
        lstmout = LSTM(rng,
                       inl,
                       laststate,
                       lastout,
                       szhidden,
                       outf=outf,
                       noot=noot,
                       shareLayer=shareLayer)
        return lstmout.hidden, lstmout.output, lstmout

    stackinp = T.alloc(dtypeX(0), totblks, blocksize + warmup,
                       inlayer.output_shape[1])

    #Fill block data
    stackinp = T.set_subtensor(
        stackinp[:-1, warmup:], inout[:(totblks - 1) * blocksize].reshape(
            (totblks - 1, blocksize, inlayer.output.shape[1])))
    stackinp = T.set_subtensor(
        stackinp[-1, warmup:warmup + inlayer.output.shape[0] -
                 (totblks - 1) * blocksize],
        inout[(totblks - 1) * blocksize:].reshape(
            (inlayer.output.shape[0] - (totblks - 1) * blocksize,
             inlayer.output.shape[1])))
    #Fill block warmup data
    stackinp = T.set_subtensor(stackinp[1:, :warmup], stackinp[:-1, -warmup:])
    stackinp = stackinp.dimshuffle(1, 0, 2)
    #A large number
    firsthidden = T.alloc(
        dtypeX(0), totblks, szhidden
    )  #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks]
    if warmupHidden:
        firsthidden = T.set_subtensor(
            firsthidden[warmup // blocksize + 1:],
            whid[-warmup + blocksize * (warmup // blocksize + 1):-warmup +
                 blocksize * totblks:blocksize])
    firstout = T.alloc(
        dtypeX(0), totblks, szhidden
    )  #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks]
    if warmupOut:
        firstout = T.set_subtensor(
            firstout[warmup // blocksize + 1:],
            wout[-warmup + blocksize * (warmup // blocksize + 1):-warmup +
                 blocksize * totblks:blocksize])

    hiddens = []
    outs = []
    firstshare = None
    for i in range(warmup):
        firsthidden, firstout, shareLayer = oneStep(stackinp[i], firsthidden,
                                                    firstout)
        if firstshare is None: firstshare = shareLayer
    for i in range(blocksize):
        firsthidden, firstout, shareLayer = oneStep(stackinp[i + warmup],
                                                    firsthidden, firstout)
        if firstshare is None: firstshare = shareLayer
        hiddens.append(firsthidden)
        outs.append(firstout)

    hiddens = T.stack(*hiddens)
    outs = T.stack(*outs)
    #ExpandData (warmup is automatically eatten)
    hiddens = hiddens.dimshuffle(1, 0, 2)
    hiddens = hiddens.reshape(
        (totblks * blocksize, szhidden))[:inlayer.output.shape[0]]
    outs = outs.dimshuffle(1, 0, 2)
    outs = outs.reshape(
        (totblks * blocksize, szhidden))[:inlayer.output.shape[0]]
    if backwards:
        hiddens = hiddens[::-1]
        outs = outs[::-1]
    global extraHid
    extraHid = SymbolLayer(hiddens, (inlayer.output_shape[0], szhidden))
    return SymbolLayer(outs, (inlayer.output_shape[0], szhidden)), firstshare
Esempio n. 37
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Maximum number of iterations for inverting the metric
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `device` -> str
                    'gpu' selects the GPU code path; any other value selects
                    the CPU one
            channel: jobman channel or None
            data: dictionary-like object return by numpy.load containing the
                data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store Euclidean gradients
            self.gs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store riemannian gradients
            self.rs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        else:
            # Store Euclidean gradients
            self.gs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store riemannian gradients
            self.rs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(cgv, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              rtol=options['mrtol'],
                              shift=-options['mreg'],
                              maxit=options['miters'],
                              mode=mode,
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       mode=gpu_mode,
                                       profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             mode=mode,
                                             profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
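
The grad_loop above uses an experimental scan interface (the `states=` keyword). The same accumulate-then-average pattern written against the stock theano.scan API would look roughly like the sketch below, where the toy parameter W, the quadratic cost and the chunk sizes are stand-ins for the model objects used above.

import numpy
import theano
import theano.tensor as TT

cbs, n_chunks = 5, 4                          # chunk size, number of chunks
X = TT.matrix('X')                            # (n_chunks * cbs) x n_in
W = theano.shared(numpy.random.randn(3).astype(theano.config.floatX), 'W')

def chunk_cost(xb):
    return (TT.dot(xb, W) ** 2).mean()

def grad_step(idx, acc_gW):
    xb = X[idx * cbs:(idx + 1) * cbs]
    return idx + 1, acc_gW + TT.grad(chunk_cost(xb), W)

rvals, _ = theano.scan(grad_step,
                       outputs_info=[TT.constant(numpy.int64(0)),
                                     TT.zeros_like(W)],
                       n_steps=n_chunks)
mean_gW = rvals[1][-1] / numpy.asarray(n_chunks, dtype=theano.config.floatX)
f = theano.function([X], mean_gW)
# f(numpy.random.randn(n_chunks * cbs, 3).astype(theano.config.floatX))
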
Esempio n. 38
0
    def __init__(self,
                 enc_h,
                 mask,
                 emb_mat,
                 vocab_size,
                 emb_dim,
                 hidden_dim,
                 eos_token,
                 batch_size,
                 max_len,
                 init='uniform',
                 inner_init='orthonormal',
                 activation=T.tanh,
                 params=None,
                 max_response=100):
        self.enc_h = enc_h
        self.mask = mask
        self.eos_token = eos_token
        self.batch_size = batch_size
        self.activation = activation
        self.max_response = max_response
        if params is None:
            self.emb = theano.shared(value=np.asarray(
                emb_mat, dtype=theano.config.floatX),
                                     name='emb',
                                     borrow=True)
            self.W = theano.shared(value=get(identifier=init,
                                             shape=(emb_dim, hidden_dim)),
                                   name='W',
                                   borrow=True)
            self.U = theano.shared(value=get(identifier=inner_init,
                                             shape=(hidden_dim, hidden_dim)),
                                   name='U',
                                   borrow=True)
            self.V = theano.shared(value=get(identifier=init,
                                             shape=(hidden_dim, vocab_size)),
                                   name='V',
                                   borrow=True)
            self.bh = theano.shared(value=get(identifier='zero',
                                              shape=(hidden_dim, )),
                                    name='bh',
                                    borrow=True)
            self.by = theano.shared(value=get(identifier='zero',
                                              shape=(vocab_size, )),
                                    name='by',
                                    borrow=True)
            # to weight 'context' from encoder
            self.c_h = theano.shared(value=get(identifier=init,
                                               shape=(hidden_dim, hidden_dim)),
                                     name='c_h',
                                     borrow=True)
            self.c_y = theano.shared(value=get(identifier=init,
                                               shape=(hidden_dim, vocab_size)),
                                     name='c_y',
                                     borrow=True)
            # to weight 'y_t-1' for decoder's 'y'
            self.y_t1 = theano.shared(value=get(identifier=init,
                                                shape=(emb_dim, vocab_size)),
                                      name='y_t1',
                                      borrow=True)
        else:
            self.emb, self.W, self.U, self.V, self.bh, self.by, self.c_h, self.c_y, self.y_t1 = params

        self.params = [
            self.emb, self.W, self.U, self.V, self.bh, self.by, self.c_h,
            self.c_y, self.y_t1
        ]

        self.h0 = theano.shared(value=get(identifier='zero',
                                          shape=(hidden_dim, )),
                                name='h0',
                                borrow=True)
        # y(t-1) from encoder will always be 'eos' token
        self.y0 = theano.shared(value=np.asarray(np.full((batch_size, ),
                                                         self.eos_token),
                                                 dtype='int32'),
                                name='y0',
                                borrow=True)

        # remember for decoder both h_t and y_t are conditioned on 'enc_h' & 'y_t-1'.
        def recurrence(msk, h_tm_prev, y_tm_prev):
            h_t = self.activation(
                T.dot(self.emb[y_tm_prev], self.W) + T.dot(h_tm_prev, self.U) +
                T.dot(self.enc_h, self.c_h) + self.bh)
            # needed to back-propagate errors
            y_d_t = T.dot(h_t, self.V) + T.dot(self.enc_h, self.c_y) + T.dot(
                self.emb[y_tm_prev], self.y_t1) + self.by
            # ignore padded tokens
            y_d_t = T.batched_dot(y_d_t, msk)
            y_d = T.clip(T.nnet.softmax(y_d_t), 0.0001, 0.9999)
            y_t = T.argmax(y_d, axis=1)
            return h_t, y_d, T.cast(y_t.flatten(), 'int32')

        [_, y_dist, y], _ = theano.scan(
            fn=recurrence,
            sequences=mask.dimshuffle(
                1, 0),  # ugly, but we have to go till the end
            outputs_info=[
                T.alloc(self.h0, self.enc_h.shape[0], hidden_dim), None,
                T.alloc(self.y0, self.enc_h.shape[0])
            ],
            n_steps=max_len)

        self.y = y.dimshuffle(1, 0)
        self.y_dist = y_dist.dimshuffle(1, 0, 2)
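
One detail worth noting in the scan call above is how the 1-D shared h0 and the integer vector y0 are broadcast to batch shape through T.alloc inside outputs_info. A tiny standalone sketch of that pattern, with made-up sizes and a deliberately trivial update rule standing in for the real recurrence:

import numpy as np
import theano
import theano.tensor as T

hidden_dim, batch_size, eos_token = 5, 3, 0
h0 = theano.shared(np.zeros(hidden_dim, dtype=theano.config.floatX), name='h0')
y0 = theano.shared(np.full((batch_size,), eos_token, dtype='int32'), name='y0')

enc_h = T.matrix('enc_h')                    # batch_size x hidden_dim

def toy_step(h_tm1, y_tm1):
    h_t = T.tanh(h_tm1 + enc_h)              # placeholder update, not the real decoder
    y_t = T.cast(T.argmax(h_t, axis=1), 'int32')
    return h_t, y_t

[h_seq, y_seq], _ = theano.scan(
    toy_step,
    outputs_info=[T.alloc(h0, enc_h.shape[0], hidden_dim),   # (batch, hidden_dim)
                  T.alloc(y0, enc_h.shape[0])],              # (batch,)
    n_steps=4)

f = theano.function([enc_h], y_seq)
print(f(np.zeros((batch_size, hidden_dim), dtype=theano.config.floatX)).shape)  # (4, 3)
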
Esempio n. 39
0
def tree_lstm_layer(tparams, inputs, options, prefix='tree_lstm', **kwargs):
    state_below, mask, left_mask, right_mask = inputs

    # state_below: #step x #sample x dim_emb
    # mask: #step x #sample
    # left_mask: #step x #sample x #step
    # right_mask: #step x #sample x #step

    nsteps = state_below.shape[0]
    dim = tparams[_p(prefix, 'U_l')].shape[0]

    n_samples = state_below.shape[1]
    init_state = tensor.alloc(0., n_samples, nsteps, dim)
    init_memory = tensor.alloc(0., n_samples, nsteps, dim)

    # use the slice to calculate all the different gates
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        elif _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    # one time step of the lstm
    def _step(m_, x_, left_mask_, right_mask_, counter_, h_, c_):

        # zero out the input unless this is a leaf node
        # flag = tensor.switch(tensor.eq(tensor.sum(left_mask_, axis=1) + tensor.sum(right_mask_, axis=1), 0), 1., 0.)
        # x_ = x_ * flag[:, None]

        preact_l = tensor.dot(tensor.sum(left_mask_[:, :, None] * h_, axis=1),
                              tparams[_p(prefix, 'U_l')])
        preact_r = tensor.dot(tensor.sum(right_mask_[:, :, None] * h_, axis=1),
                              tparams[_p(prefix, 'U_r')])

        x_ = concatenate([
            _slice(x_, 0, dim),
            _slice(x_, 1, dim),
            _slice(x_, 1, dim),
            _slice(x_, 2, dim),
            _slice(x_, 3, dim)
        ],
                         axis=1)
        preact = preact_l + preact_r + x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        fl = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        fr = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 3, dim))
        u = tensor.tanh(_slice(preact, 4, dim))

        c_temp = fl * tensor.sum(left_mask_[:, :, None] * c_, axis=1) \
                    + fr * tensor.sum(right_mask_[:, :, None] * c_, axis=1) \
                    + i * u
        h_temp = o * tensor.tanh(c_temp)

        h = tensor.set_subtensor(h_[:, counter_, :], h_temp)
        c = tensor.set_subtensor(c_[:, counter_, :], c_temp)

        c = m_[:, None, None] * c + (1. - m_)[:, None, None] * c_
        h = m_[:, None, None] * h + (1. - m_)[:, None, None] * h_

        return h, c, i, fl, fr, o

    state_below = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(
        fn=_step,
        sequences=[
            mask, state_below, left_mask, right_mask,
            tensor.arange(0, nsteps)
        ],
        outputs_info=[init_state, init_memory, None, None, None, None],
        name=_p(prefix, '_layers'),
        profile=False)
    return rval
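
The central indexing trick in _step above is tensor.sum(left_mask_[:, :, None] * h_, axis=1), which picks each sample's left-child state out of the per-node buffer h_. A small numeric illustration with invented shapes:

import numpy as np
import theano
import theano.tensor as T

h = T.tensor3('h')             # n_samples x n_steps x dim
left_mask = T.matrix('left')   # n_samples x n_steps (one-hot over steps, or all zero)

gathered = T.sum(left_mask[:, :, None] * h, axis=1)     # n_samples x dim
f = theano.function([left_mask, h], gathered)

h_val = np.arange(2 * 3 * 4, dtype=theano.config.floatX).reshape(2, 3, 4)
m_val = np.array([[0, 1, 0],    # sample 0: left child is node 1
                  [0, 0, 0]],   # sample 1: a leaf, no left child
                 dtype=theano.config.floatX)
print(f(m_val, h_val))          # row 0 == h_val[0, 1], row 1 == zeros
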
Esempio n. 40
0
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
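
The heart of Gv_step is the pair TT.Lop(out, params, TT.Rop(out, params, v) / factor), i.e. a Gauss-Newton (metric) vector product. Below is a toy standalone version for a single sigmoid output layer, so the 'sigmoid' branch of the factor applies; every name in it is invented for the sketch and unrelated to the model above.

import numpy
import theano
import theano.tensor as TT

x = TT.matrix('x')                                    # cbs x n_in
v = TT.matrix('v')                                    # direction, same shape as W
W = theano.shared(numpy.random.randn(3, 4).astype(theano.config.floatX), 'W')
cbs = x.shape[0]

p = TT.nnet.sigmoid(TT.dot(x, W))                     # model output, cbs x n_out
factor = TT.cast(cbs, theano.config.floatX) * p * (1. - p)

# Jacobian-vector product in the direction v, rescaled, then pulled back:
Gv = TT.Lop(p, W, TT.Rop(p, W, v) / factor)           # same shape as W
f_Gv = theano.function([x, v], Gv)

xv = numpy.random.randn(5, 3).astype(theano.config.floatX)
vv = numpy.random.randn(3, 4).astype(theano.config.floatX)
print(f_Gv(xv, vv).shape)                             # (3, 4)
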
Esempio n. 41
0
    def build_sampler(self, **kwargs):
        x = tensor.matrix('x', dtype=INT)
        xr = x[::-1]
        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        # word embedding (source), forward and backward
        emb = self.tparams['Wemb_enc'][x.flatten()]
        emb = emb.reshape([n_timesteps, n_samples, self.embedding_dim])

        embr = self.tparams['Wemb_enc'][xr.flatten()]
        embr = embr.reshape([n_timesteps, n_samples, self.embedding_dim])

        # encoder
        proj = get_new_layer(self.enc_type)[1](self.tparams,
                                               emb,
                                               prefix='encoder',
                                               layernorm=self.lnorm)
        projr = get_new_layer(self.enc_type)[1](self.tparams,
                                                embr,
                                                prefix='encoder_r',
                                                layernorm=self.lnorm)

        # concatenate forward and backward rnn hidden states
        ctx = [
            tensor.concatenate([proj[0], projr[0][::-1]],
                               axis=proj[0].ndim - 1)
        ]

        for i in range(1, self.n_enc_layers):
            ctx = get_new_layer(self.enc_type)[1](self.tparams,
                                                  ctx[0],
                                                  prefix='deepencoder_%d' % i,
                                                  layernorm=self.lnorm)

        ctx = ctx[0]

        if self.init_cgru == 'text' and 'ff_state_W' in self.tparams:
            # get the input for decoder rnn initializer mlp
            ctx_mean = ctx.mean(0)
            init_state = get_new_layer('ff')[1](self.tparams,
                                                ctx_mean,
                                                prefix='ff_state',
                                                activ='tanh')
        else:
            # assume zero-initialized decoder
            init_state = tensor.alloc(0., n_samples, self.rnn_dim)

        outs = [init_state, ctx]
        self.f_init = theano.function([x], outs, name='f_init')

        # x: 1 x 1
        y1 = tensor.vector('y1_sampler', dtype=INT)
        y2 = tensor.vector('y2_sampler', dtype=INT)
        init_state = tensor.matrix('init_state', dtype=FLOAT)

        # if it's the first word, emb should be all zero and it is indicated by -1
        emb_lem = tensor.switch(
            y1[:, None] < 0,
            tensor.alloc(0., 1, self.tparams['Wemb_dec_lem'].shape[1]),
            self.tparams['Wemb_dec_lem'][y1])
        emb_fact = tensor.switch(
            y2[:, None] < 0,
            tensor.alloc(0., 1, self.tparams['Wemb_dec_fact'].shape[1]),
            self.tparams['Wemb_dec_fact'][y2])

        # Concat the 2 embeddings
        emb_prev = tensor.concatenate([emb_lem, emb_fact], axis=1)

        # apply one step of conditional gru with attention
        # get the next hidden states
        # get the weighted averages of contexts for this target word y
        r = get_new_layer('gru_cond')[1](self.tparams,
                                         emb_prev,
                                         prefix='decoder',
                                         mask=None,
                                         context=ctx,
                                         one_step=True,
                                         init_state=init_state,
                                         layernorm=False)

        next_state = r[0]
        ctxs = r[1]
        alphas = r[2]

        logit_lem = get_new_layer('ff')[1](self.tparams,
                                           emb_lem,
                                           prefix='ff_logit_lem',
                                           activ='linear')
        logit_fact = get_new_layer('ff')[1](self.tparams,
                                            emb_fact,
                                            prefix='ff_logit_fact',
                                            activ='linear')
        logit_ctx = get_new_layer('ff')[1](self.tparams,
                                           ctxs,
                                           prefix='ff_logit_ctx',
                                           activ='linear')
        logit_gru = get_new_layer('ff')[1](self.tparams,
                                           next_state,
                                           prefix='ff_logit_gru',
                                           activ='linear')

        logit1 = tanh(logit_gru + logit_lem + logit_ctx)
        logit2 = tanh(logit_gru + logit_fact + logit_ctx)

        if self.tied_trg_emb is False:
            logit_trg = get_new_layer('ff')[1](self.tparams,
                                               logit1,
                                               prefix='ff_logit_trg',
                                               activ='linear')
            logit_trgmult = get_new_layer('ff')[1](self.tparams,
                                                   logit2,
                                                   prefix='ff_logit_trgmult',
                                                   activ='linear')
        else:
            logit_trg = tensor.dot(logit1, self.tparams['Wemb_dec_lem'].T)
            logit_trgmult = tensor.dot(logit2, self.tparams['Wemb_dec_fact'].T)

        # compute the logsoftmax
        next_log_probs_trg = tensor.nnet.logsoftmax(logit_trg)
        next_log_probs_trgmult = tensor.nnet.logsoftmax(logit_trgmult)

        # Sample from the softmax distribution
        next_probs_trg = tensor.exp(next_log_probs_trg)
        next_probs_trgmult = tensor.exp(next_log_probs_trgmult)
        next_word_trg = self.trng.multinomial(pvals=next_probs_trg).argmax(1)
        next_word_trgmult = self.trng.multinomial(
            pvals=next_probs_trgmult).argmax(1)

        # NOTE: We never use sampling and it incurs performance penalty
        # let's disable it for now
        #next_word = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile a function to do the whole thing above
        # next hidden state to be used
        inputs = [y1, y2, init_state, ctx]
        outs = [next_log_probs_trg, next_log_probs_trgmult, next_state, alphas]

        self.f_next = theano.function(inputs, outs, name='f_next')
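
f_init and f_next are meant to be driven from plain Python during search. A hedged sketch of a greedy decoding loop on top of them is given below; the -1 start tokens follow the comment above, while the eos id, the int64 dtype (whatever the INT constant above resolves to) and the plain argmax selection are assumptions of this sketch, not code from the class.

import numpy as np

def greedy_decode(model, x, max_len=50, eos=0):
    """model: an instance of the class above; x: (n_timesteps, 1) int matrix."""
    init_state, ctx = model.f_init(x)
    y1 = -1 * np.ones((1,), dtype='int64')   # -1 => first word, zero embedding
    y2 = -1 * np.ones((1,), dtype='int64')
    hyp1, hyp2 = [], []
    for _ in range(max_len):
        logp1, logp2, init_state, _ = model.f_next(y1, y2, init_state, ctx)
        y1 = logp1.argmax(axis=1)            # greedy choice for the first stream
        y2 = logp2.argmax(axis=1)            # greedy choice for the second stream
        hyp1.append(int(y1[0]))
        hyp2.append(int(y2[0]))
        if y1[0] == eos:                     # assumed end-of-sentence id
            break
    return hyp1, hyp2
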
Esempio n. 42
0
def gru_decoder_multi(tparams, state_below,
                      ctx1, ctx2, prefix='gru_decoder_multi',
                      input_mask=None, one_step=False,
                      init_state=None, ctx1_mask=None):
    if one_step:
        assert init_state is not None, 'previous state must be provided'

    # Context
    # n_timesteps x n_samples x ctxdim
    assert ctx1 is not None and ctx2 is not None, 'Contexts must be provided'
    assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim'

    # Number of padded source timesteps
    nsteps = state_below.shape[0]

    # Batch or single sample?
    n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

    # if we have no mask, we assume all the inputs are valid
    # tensor.alloc(value, *shape)
    # input_mask: (n_steps, 1) filled with 1
    if input_mask is None:
        input_mask = tensor.alloc(1., nsteps, 1)

    # Infer RNN dimensionality
    dim = tparams[pp(prefix, 'Wcx')].shape[1]

    # initial/previous state
    # if not given, assume it's all zeros
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    # These two dot products are same with gru_layer, refer to the equations.
    # [W_r * X + b_r, W_z * X + b_z]
    state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')]

    # input to compute the hidden state proposal
    # This is the [W*x]_j in the eq. 8 of the paper
    state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')]

    # Wc_att: dimctx -> dimctx
    # Linearly transform the contexts to another space with same dimensionality
    pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')]
    pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')]

    # Step function for the recurrence/scan
    # Sequences
    # ---------
    # m_    : mask
    # x_    : state_below_
    # xx_   : state_belowx
    # outputs_info
    # ------------
    # h_     : init_state,
    # ctx_   : need to be defined as it's returned by _step
    # alpha1_: need to be defined as it's returned by _step
    # alpha2_: need to be defined as it's returned by _step
    # non sequences
    # -------------
    # pctx1_ : pctx1_
    # pctx2_ : pctx2_
    # cc1_   : ctx1
    # cc2_   : ctx2
    # and all the shared weights and biases..
    def _step(m_, x_, xx_,
              h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations
              pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, U_att, c_att, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl):

        # Do a step of classical GRU
        h1 = gru_step(m_, x_, xx_, h_, U, Ux)

        ###########
        # Attention
        ###########
        # h1 X W_comb_att
        # W_comb_att: dim -> dimctx
        # pstate_ should be 2D as we're working with unrolled timesteps
        pstate_ = tensor.dot(h1, W_comb_att)

        # Accumulate in pctx*__ and apply tanh()
        # This becomes the projected context(s) + the current hidden state
        # of the decoder, e.g. this is the information accumulating
        # into the returned original contexts with the knowledge of target
        # sentence decoding.
        pctx1__ = tanh(pctx1_ + pstate_[None, :, :])
        pctx2__ = tanh(pctx2_ + pstate_[None, :, :])

        # Affine transformation for alpha* = (pctx*__ X U_att) + c_att
        # We're now down to scalar alpha's for each accumulated
        # context (0th dim) in the pctx*__
        # alpha1 should be n_timesteps, 1, 1
        alpha1 = tensor.dot(pctx1__, U_att) + c_att
        alpha2 = tensor.dot(pctx2__, U_att) + c_att

        # Drop the last dimension, e.g. (n_timesteps, 1)
        alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]])
        alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]])

        # Exponentiate alpha1
        alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True))
        alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True))

        # If there is a context mask, multiply with it to cancel unnecessary steps
        # We won't have a ctx_mask for image vectors
        if ctx1_mask is not None:
            alpha1 = alpha1 * ctx1_mask

        # Normalize so that the sum makes 1
        alpha1 = alpha1 / alpha1.sum(0, keepdims=True)
        alpha2 = alpha2 / alpha2.sum(0, keepdims=True)

        # Compute the current context ctx*_ as the alpha-weighted sum of
        # the initial contexts ctx*'s
        ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0)
        ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0)
        # n_samples x ctxdim (2000)

        # Sum of contexts
        ctx_ = tanh(ctx1_ + ctx2_)

        ############################################
        # ctx*_ and alpha computations are completed
        ############################################

        ####################################
        # The below code is another GRU cell
        ####################################
        # Affine transformation: h1 X U_nl + b_nl
        # U_nl, b_nl: Stacked dim*2
        preact = tensor.dot(h1, U_nl) + b_nl

        # Transform the weighted context sum with Wc
        # and add it to preact
        # Wc: dimctx -> Stacked dim*2
        preact += tensor.dot(ctx_, Wc)

        # Apply sigmoid nonlinearity
        preact = sigmoid(preact)

        # Slice activations: New gates r2 and u2
        r2 = tensor_slice(preact, 0, dim)
        u2 = tensor_slice(preact, 1, dim)

        preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2
        preactx += tensor.dot(ctx_, Wcx)

        # Candidate hidden
        h2_tilda = tanh(preactx)

        # Leaky integration between the new h2 and the
        # old h1 computed above
        h2 = u2 * h2_tilda + (1. - u2) * h1
        h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1

        return h2, ctx_, alpha1.T, alpha2.T

    # Sequences are the input mask and the transformed target embeddings
    seqs = [input_mask, state_below_, state_belowx]

    # Create a list of shared parameters for easy parameter passing
    shared_vars = [tparams[pp(prefix, 'U')],
                   tparams[pp(prefix, 'Wc')],
                   tparams[pp(prefix, 'W_comb_att')],
                   tparams[pp(prefix, 'U_att')],
                   tparams[pp(prefix, 'c_att')],
                   tparams[pp(prefix, 'Ux')],
                   tparams[pp(prefix, 'Wcx')],
                   tparams[pp(prefix, 'U_nl')],
                   tparams[pp(prefix, 'Ux_nl')],
                   tparams[pp(prefix, 'b_nl')],
                   tparams[pp(prefix, 'bx_nl')]]

    if one_step:
        rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars))
    else:
        outputs_info=[init_state,
                      tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim       (ctx_)
                      tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps  (alpha1)
                      tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps  (alpha2)

        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars,
                                    name=pp(prefix, '_layers'),
                                    n_steps=nsteps,
                                    strict=True)
    return rval
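
Note that the attention weights in _step are normalised over the timestep axis (axis 0), not over the batch axis. A tiny standalone check of that softmax-over-time pattern and of the alpha-weighted context sum, with invented shapes:

import numpy as np
import theano
import theano.tensor as T

scores = T.matrix('scores')                  # n_timesteps x n_samples
alpha = T.exp(scores - scores.max(0, keepdims=True))
alpha = alpha / alpha.sum(0, keepdims=True)  # columns sum to 1

cc = T.tensor3('cc')                         # n_timesteps x n_samples x ctxdim
ctx_weighted = (cc * alpha[:, :, None]).sum(0)   # n_samples x ctxdim

f = theano.function([scores], alpha)
g = theano.function([scores, cc], ctx_weighted)

a = f(np.random.randn(7, 2).astype(theano.config.floatX))
print(a.sum(axis=0))                         # ~[1. 1.]: each sample's weights sum to 1
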
Esempio n. 43
0
# -*- coding: utf-8 -*-
"""
     theano.tensor.alloc(value,*shape):生成一个变化的tensor,维度是shape大小的,
但是值但是由value填充。
"""
import numpy as np
import theano
import theano.tensor as T


X = T.matrix()
e = T.alloc(1, 4, 3)
p = theano.function([X], e + X)
a = np.random.rand(4, 3).astype('float32')
print(a)
print(p(a))
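
Two further points about T.alloc that the snippet does not show: the shape arguments may themselves be symbolic, and a non-scalar value is broadcast to the requested shape rather than copied element by element. A small follow-up sketch (imports repeated so it runs on its own):

import numpy as np
import theano
import theano.tensor as T

Y = T.matrix()
# shape taken from another variable: a zero matrix with as many rows as Y
zeros_like_rows = T.alloc(np.float32(0.), Y.shape[0], 5)

row = T.vector()
tiled = T.alloc(row, 4, row.shape[0])        # broadcast a row vector to 4 rows

g = theano.function([Y], zeros_like_rows)
h = theano.function([row], tiled)
print(g(np.ones((2, 3), dtype=theano.config.floatX)).shape)   # (2, 5)
print(h(np.arange(3, dtype=theano.config.floatX)))            # four copies of [0. 1. 2.]
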
Esempio n. 44
0
def lstm_layer(tparams,
               state_below,
               options,
               prefix='lstm',
               mask=None,
               **kwargs):
    nsteps = state_below.shape[0]
    dim = tparams[_p(prefix, 'U')].shape[0]

    # if we are dealing with a mini-batch
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
        init_state = tensor.alloc(0., n_samples, dim)
        init_memory = tensor.alloc(0., n_samples, dim)
    # during sampling
    else:
        n_samples = 1
        init_state = tensor.alloc(0., dim)
        init_memory = tensor.alloc(0., dim)

    # if we have no mask, we assume all the inputs are valid
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # use the slice to calculate all the different gates
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        elif _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    # one time step of the lstm
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c, i, f, o, preact

    state_below = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(
        _step,
        sequences=[mask, state_below],
        outputs_info=[init_state, init_memory, None, None, None, None],
        name=_p(prefix, '_layers'),
        n_steps=nsteps,
        profile=False)
    return rval
Esempio n. 45
0
# one step function that will be used by scan
def oneStep(x_t, h_tm1, W_x, W_h, W_o):

    h_t = tensor.tanh(tensor.dot(x_t, W_x) + tensor.dot(h_tm1, W_h))
    o_t = tensor.dot(h_t, W_o)

    return h_t, o_t


# spawn theano tensor variable, our symbolic input
# a 3D tensor (n_steps, n_samples, dim)
x = tensor.tensor3(dtype='float32')

# initial state of our rnn
init_state = tensor.alloc(0., n_samples, dim)

# create parameters that we will use,
# note that, parameters are theano shared variables

# parameters for input to hidden states
W_x_ = numpy.random.randn(input_dim, dim).astype('float32')
W_x = theano.shared(W_x_)

# parameters for hidden state transition
W_h_ = numpy.random.randn(dim, dim).astype('float32')
W_h = theano.shared(W_h_)

# parameters from hidden state to output
W_o_ = numpy.random.randn(dim, output_dim).astype('float32')
W_o = theano.shared(W_o_)
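
The fragment above stops before the recurrence itself. A hedged completion that wires oneStep into theano.scan and compiles it is shown below; it assumes n_samples, dim, input_dim and output_dim are plain Python ints defined before the code above (e.g. 16, 64, 32 and 10) and that theano and numpy are imported as in the fragment.

# run oneStep over the leading (time) axis of x; h is recurrent, o is not
(h_vals, o_vals), updates = theano.scan(fn=oneStep,
                                        sequences=[x],
                                        outputs_info=[init_state, None],
                                        non_sequences=[W_x, W_h, W_o],
                                        strict=True)

f_rnn = theano.function([x], o_vals, updates=updates)
# x_val = numpy.random.randn(n_steps, n_samples, input_dim).astype('float32')
# f_rnn(x_val).shape == (n_steps, n_samples, output_dim)
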
Esempio n. 46
0
    def build(self):
        # description string: #words x #samples
        x = tensor.matrix('x', dtype=INT)
        x_mask = tensor.matrix('x_mask', dtype=FLOAT)
        y1 = tensor.matrix('y1', dtype=INT)
        y1_mask = tensor.matrix('y1_mask', dtype=FLOAT)
        y2 = tensor.matrix('y2', dtype=INT)
        y2_mask = tensor.matrix('y2_mask', dtype=FLOAT)

        self.inputs = OrderedDict()
        self.inputs['x'] = x
        self.inputs['x_mask'] = x_mask
        self.inputs['y1'] = y1
        self.inputs['y2'] = y2
        self.inputs['y1_mask'] = y1_mask
        self.inputs['y2_mask'] = y2_mask

        # for the backward rnn, we just need to invert x and x_mask
        xr = x[::-1]
        xr_mask = x_mask[::-1]

        n_timesteps = x.shape[0]
        n_timesteps_trg = y1.shape[0]
        n_timesteps_trgmult = y2.shape[0]
        n_samples = x.shape[1]

        # word embedding for forward rnn (source)
        emb = dropout(self.tparams['Wemb_enc'][x.flatten()], self.trng,
                      self.emb_dropout, self.use_dropout)
        emb = emb.reshape([n_timesteps, n_samples, self.embedding_dim])
        proj = get_new_layer(self.enc_type)[1](self.tparams,
                                               emb,
                                               prefix='encoder',
                                               mask=x_mask,
                                               layernorm=self.lnorm)

        # word embedding for backward rnn (source)
        embr = dropout(self.tparams['Wemb_enc'][xr.flatten()], self.trng,
                       self.emb_dropout, self.use_dropout)
        embr = embr.reshape([n_timesteps, n_samples, self.embedding_dim])
        projr = get_new_layer(self.enc_type)[1](self.tparams,
                                                embr,
                                                prefix='encoder_r',
                                                mask=xr_mask,
                                                layernorm=self.lnorm)

        # context will be the concatenation of forward and backward rnns
        ctx = [
            tensor.concatenate([proj[0], projr[0][::-1]],
                               axis=proj[0].ndim - 1)
        ]

        for i in range(1, self.n_enc_layers):
            ctx = get_new_layer(self.enc_type)[1](self.tparams,
                                                  ctx[0],
                                                  prefix='deepencoder_%d' % i,
                                                  mask=x_mask,
                                                  layernorm=self.lnorm)

        # Apply dropout
        ctx = dropout(ctx[0], self.trng, self.ctx_dropout, self.use_dropout)

        if self.init_cgru == 'text':
            # mean of the context (across time) will be used to initialize decoder rnn
            ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:,
                                                                         None]
            init_state = get_new_layer('ff')[1](self.tparams,
                                                ctx_mean,
                                                prefix='ff_state',
                                                activ='tanh')
        else:
            # Assume zero-initialized decoder
            init_state = tensor.alloc(0., n_samples, self.rnn_dim)

        # word embedding (target), we will shift the target sequence one time step
        # to the right. This is done because of the bi-gram connections in the
        # readout and decoder rnn. The first target will be all zeros and we will
        # not condition on the last output.
        emb_lem = self.tparams['Wemb_dec_lem'][y1.flatten()]
        emb_lem = emb_lem.reshape(
            [n_timesteps_trg, n_samples, self.embedding_dim])
        emb_lem_shifted = tensor.zeros_like(emb_lem)
        emb_lem_shifted = tensor.set_subtensor(emb_lem_shifted[1:],
                                               emb_lem[:-1])
        emb_lem = emb_lem_shifted

        emb_fact = self.tparams['Wemb_dec_fact'][y2.flatten()]
        emb_fact = emb_fact.reshape(
            [n_timesteps_trgmult, n_samples, self.embedding_dim])
        emb_fact_shifted = tensor.zeros_like(emb_fact)
        emb_fact_shifted = tensor.set_subtensor(emb_fact_shifted[1:],
                                                emb_fact[:-1])
        emb_fact = emb_fact_shifted

        # Concat the 2 embeddings
        emb_prev = tensor.concatenate([emb_lem, emb_fact], axis=2)

        # decoder - pass through the decoder conditional gru with attention
        proj = get_new_layer('gru_cond')[1](self.tparams,
                                            emb_prev,
                                            prefix='decoder',
                                            mask=y1_mask,
                                            context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state,
                                            layernorm=False)
        # hidden states of the decoder gru
        proj_h = proj[0]

        # weighted averages of context, generated by attention module
        ctxs = proj[1]

        # weights (alignment matrix)
        self.alphas = proj[2]

        # compute word probabilities
        logit_gru = get_new_layer('ff')[1](self.tparams,
                                           proj_h,
                                           prefix='ff_logit_gru',
                                           activ='linear')
        logit_ctx = get_new_layer('ff')[1](self.tparams,
                                           ctxs,
                                           prefix='ff_logit_ctx',
                                           activ='linear')
        logit_lem = get_new_layer('ff')[1](self.tparams,
                                           emb_lem,
                                           prefix='ff_logit_lem',
                                           activ='linear')
        logit_fact = get_new_layer('ff')[1](self.tparams,
                                            emb_fact,
                                            prefix='ff_logit_fact',
                                            activ='linear')

        logit1 = dropout(tanh(logit_gru + logit_lem + logit_ctx), self.trng,
                         self.out_dropout, self.use_dropout)
        logit2 = dropout(tanh(logit_gru + logit_fact + logit_ctx), self.trng,
                         self.out_dropout, self.use_dropout)

        if self.tied_trg_emb is False:
            logit_trg = get_new_layer('ff')[1](self.tparams,
                                               logit1,
                                               prefix='ff_logit_trg',
                                               activ='linear')
            logit_trgmult = get_new_layer('ff')[1](self.tparams,
                                                   logit2,
                                                   prefix='ff_logit_trgmult',
                                                   activ='linear')

        else:
            logit_trg = tensor.dot(logit1, self.tparams['Wemb_dec_lem'].T)
            logit_trgmult = tensor.dot(logit2, self.tparams['Wemb_dec_fact'].T)

        logit_trg_shp = logit_trg.shape
        logit_trgmult_shp = logit_trgmult.shape

        # Apply stable log-softmax; note these are negative log-probabilities
        log_trg_probs = -tensor.nnet.logsoftmax(
            logit_trg.reshape(
                [logit_trg_shp[0] * logit_trg_shp[1], logit_trg_shp[2]]))
        log_trgmult_probs = -tensor.nnet.logsoftmax(
            logit_trgmult.reshape([
                logit_trgmult_shp[0] * logit_trgmult_shp[1],
                logit_trgmult_shp[2]
            ]))

        # cost
        y1_flat = y1.flatten()
        y2_flat = y2.flatten()
        y1_flat_idx = tensor.arange(
            y1_flat.shape[0]) * self.n_words_trg1 + y1_flat
        y2_flat_idx = tensor.arange(
            y2_flat.shape[0]) * self.n_words_trg2 + y2_flat

        cost_trg = log_trg_probs.flatten()[y1_flat_idx]
        cost_trg = cost_trg.reshape([n_timesteps_trg, n_samples])
        cost_trg = (cost_trg * y1_mask).sum(0)

        cost_trgmult = log_trgmult_probs.flatten()[y2_flat_idx]
        cost_trgmult = cost_trgmult.reshape([n_timesteps_trgmult, n_samples])
        cost_trgmult = (cost_trgmult * y2_mask).sum(0)

        cost = cost_trg + cost_trgmult
        self.f_log_probs = theano.function(list(self.inputs.values()), cost)

        # Attention weights (self.alphas) are kept above for optional alpha regularization

        return cost
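The per-token cost above is gathered with a flat-index trick: the (time*batch, vocab) matrix of negative log-probabilities is flattened and read at position row * vocab_size + target_id. A minimal numpy sketch of that gather, with made-up shapes and values:

import numpy as np

# Illustrative shapes only: 4 target tokens in total, vocabulary of 5 words.
vocab_size = 5
neg_log_probs = np.random.rand(4, vocab_size)  # one row per (time, sample) pair
y = np.array([2, 0, 4, 1])                     # target word id for each row

# Same gather as y1_flat_idx above: row offset times vocab size plus word id.
flat_idx = np.arange(y.shape[0]) * vocab_size + y
per_token_cost = neg_log_probs.flatten()[flat_idx]

# Equivalent, more direct fancy indexing, used here only as a cross-check.
assert np.allclose(per_token_cost, neg_log_probs[np.arange(y.shape[0]), y])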
Esempio n. 47
0
def ReplicateLayer(x, n_times):
    a = T.shape_padleft(x)
    padding = [1] * x.ndim
    b = T.alloc(numpy.float32(1), n_times, *padding)
    return a * b
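ReplicateLayer pads x with a leading axis and multiplies it by an (n_times, 1, ..., 1) block of ones, so broadcasting tiles x n_times along the new leading axis. A small numpy sketch of the same broadcast, with illustrative shapes:

import numpy as np

x = np.arange(6, dtype=np.float32).reshape(2, 3)      # any input tensor
n_times = 4

a = x[None, ...]                                      # T.shape_padleft(x)
b = np.ones((n_times,) + (1,) * x.ndim, np.float32)   # T.alloc(1., n_times, 1, 1)
replicated = a * b                                    # broadcast: (n_times, 2, 3)

assert replicated.shape == (4, 2, 3)
assert np.allclose(replicated[2], x)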
Esempio n. 48
0
        def recurrent_apply(brick, application, application_call, *args,
                            **kwargs):
            """Iterates a transition function.

            Parameters
            ----------
            iterate : bool
                If ``True`` iteration is made. By default ``True``.
            reverse : bool
                If ``True``, the sequences are processed in backward
                direction. ``False`` by default.
            return_initial_states : bool
                If ``True``, initial states are included in the returned
                state tensors. ``False`` by default.

            .. todo::

                * Handle `updates` returned by the :func:`theano.scan`
                    routine.
                * ``kwargs`` has a random order; check if this is a
                    problem.

            """
            # Extract arguments related to iteration and immediately relay the
            # call to the wrapped function if `iterate=False`
            iterate = kwargs.pop('iterate', True)
            if not iterate:
                return application_function(brick, *args, **kwargs)
            reverse = kwargs.pop('reverse', False)
            return_initial_states = kwargs.pop('return_initial_states', False)

            # Push everything to kwargs
            for arg, arg_name in zip(args, arg_names):
                kwargs[arg_name] = arg
            # Separate sequences, states and contexts
            scan_arguments = (application.sequences + application.states +
                              application.contexts)

            # Check what is given and what is not
            def only_given(arg_names):
                return OrderedDict((arg_name, kwargs[arg_name])
                                   for arg_name in arg_names
                                   if kwargs.get(arg_name))

            sequences_given = only_given(application.sequences)
            contexts_given = only_given(application.contexts)

            # TODO Assumes 1 time dim!
            if len(sequences_given):
                shape = list(sequences_given.values())[0].shape
                if not iterate:
                    batch_size = shape[0]
                else:
                    n_steps = shape[0]
                    batch_size = shape[1]
            else:
                # TODO Raise error if n_steps and batch_size not found?
                n_steps = kwargs.pop('n_steps')
                batch_size = kwargs.pop('batch_size')

            # Handle the rest kwargs
            rest_kwargs = {
                key: value
                for key, value in kwargs.items() if key not in scan_arguments
            }
            for value in rest_kwargs.values():
                if (isinstance(value, Variable)
                        and not is_shared_variable(value)):
                    warnings.warn(
                        'Your function uses a non-shared variable other than'
                        ' those given by scan explicitly. That can'
                        ' significantly slow down `tensor.grad` call.'
                        ' Did you forget to declare it in `contexts`?')

            # Ensure that all initial states are available.
            for state_name in application.states:
                dim = brick.get_dim(state_name)
                if state_name in kwargs:
                    if isinstance(kwargs[state_name], NdarrayInitialization):
                        kwargs[state_name] = tensor.alloc(
                            kwargs[state_name].generate(brick.rng, (1, dim)),
                            batch_size, dim)
                    elif isinstance(kwargs[state_name], Application):
                        kwargs[state_name] = \
                            kwargs[state_name](state_name, batch_size,
                                               *args, **kwargs)
                else:
                    # TODO init_func returns 2D-tensor, fails for iterate=False
                    kwargs[state_name] = \
                        brick.initial_state(state_name, batch_size,
                                            *args, **kwargs)
                    assert kwargs[state_name]
            states_given = only_given(application.states)
            assert len(states_given) == len(application.states)

            # Theano issue 1772
            for name, state in states_given.items():
                states_given[name] = tensor.unbroadcast(
                    state, *range(state.ndim))

            def scan_function(*args):
                args = list(args)
                arg_names = (list(sequences_given) + list(states_given) +
                             list(contexts_given))
                kwargs = dict(zip(arg_names, args))
                kwargs.update(rest_kwargs)
                outputs = getattr(brick,
                                  application_function.__name__)(iterate=False,
                                                                 **kwargs)
                # We want to save the computation graph returned by the
                # `application_function` when it is called inside the
                # `theano.scan`.
                application_call.inner_inputs = args
                application_call.inner_outputs = pack(outputs)
                return outputs

            outputs_info = (
                list(states_given.values()) + [None] *
                (len(application.outputs) - len(application.states)))
            result, updates = theano.scan(
                scan_function,
                sequences=list(sequences_given.values()),
                outputs_info=outputs_info,
                non_sequences=list(contexts_given.values()),
                n_steps=n_steps,
                go_backwards=reverse)
            result = pack(result)
            if return_initial_states:
                # Undo Subtensor
                for i in range(len(states_given)):
                    assert isinstance(result[i].owner.op,
                                      tensor.subtensor.Subtensor)
                    result[i] = result[i].owner.inputs[0]
            if updates:
                application_call.updates = dict_union(application_call.updates,
                                                      updates)

            return result
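The wrapper above ultimately reduces to a theano.scan call whose outputs_info lists the given initial states first and a None entry for every extra output. A minimal, self-contained sketch of that scan pattern (the step function and shapes are hypothetical and independent of the wrapper above):

import theano
import theano.tensor as T

x_seq = T.matrix('x_seq')   # (n_steps, batch): the sequence input
s0 = T.vector('s0')         # (batch,): initial state

def step(x_t, s_tm1):
    # One recurrent state plus one extra output that is not fed back.
    s_t = s_tm1 + x_t
    return s_t, 2.0 * s_t

# States get their initial value in outputs_info, pure outputs get None.
(states, doubled), updates = theano.scan(step,
                                         sequences=x_seq,
                                         outputs_info=[s0, None])
f = theano.function([x_seq, s0], [states, doubled], updates=updates)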
Esempio n. 49
0
def lstm_cond_layer(tparams,
                    state_below,
                    options,
                    prefix='lstm',
                    mask=None,
                    init_memory=None,
                    init_state=None,
                    trng=None,
                    use_noise=None,
                    **kwargs):
    """
    Computation graph for the LSTM.
    Note that we removed 'context' and put this into 'state_below'
    Video frames need to be part of scan, since they change at each step
    """
    nsteps = state_below.shape[0]
    n_samples = state_below.shape[1]
    n_annotations = state_below.shape[2]

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_, a_, ct_, dp_=None, dp_att_=None):
        # mask, xt, ht-1, ct-1, alpha, ctx
        # attention
        # print '\n\ncheck\n\n'
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])  # pstate_
        pctx_ = tensor.dot(x_, tparams[_p(prefix, 'Wc_att')]) + tparams[_p(
            prefix, 'b_att')]
        if options['n_layers_att'] > 1:
            for lidx in xrange(1, options['n_layers_att']):
                pctx_ = tensor.dot(pctx_, tparams[_p(
                    prefix, 'W_att_%d' % lidx)]) + tparams[_p(
                        prefix, 'b_att_%d' % lidx)]
                if lidx < options['n_layers_att'] - 1:
                    pctx_ = tanh(pctx_)
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(
            prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape
        alpha = tensor.nnet.softmax(
            options['temperature_inverse'] *
            alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
        ctx_ = (x_ * alpha[:, :, None]).sum(1)  # current context
        # print '\n\ncheck\n\n'
        if options['selector']:
            sel_ = tensor.nnet.sigmoid(
                tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'W')]) + tparams[_p(
            prefix, 'b')]

        i = _slice(preact, 0, dim)  # z_it
        f = _slice(preact, 1, dim)  # z_ft
        o = _slice(preact, 2, dim)  # z_ot
        i = tensor.nnet.sigmoid(i)  # it = sigmoid(z_it)
        f = tensor.nnet.sigmoid(f)  # ft = sigmoid(z_ft)
        o = tensor.nnet.sigmoid(o)  # ot = sigmoid(z_ot)
        c = tensor.tanh(_slice(preact, 3, dim))  # at = tanh(z_at)

        c = f * c_ + i * c  # ct = ft * ct-1 + it * at
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)  # ht = ot * tanh(ct)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        # print '\n\ncheck\n\n'
        return rval

    if options['selector']:
        _step0 = lambda m_, x_, h_, c_, a_, ct_, sel_: _step(
            m_, x_, h_, c_, a_, ct_)
    else:
        _step0 = lambda m_, x_, h_, c_, a_, ct_: _step(m_, x_, h_, c_, a_, ct_)

    seqs = [mask, state_below]
    outputs_info = [
        init_state, init_memory,
        tensor.alloc(0., n_samples, n_annotations),
        tensor.alloc(0., n_samples, options['ctx_dim'])
    ]
    if options['selector']:
        outputs_info += [tensor.alloc(0., n_samples)]
    outputs_info += [None, None, None, None, None, None, None
                     ] + [None]  #*options['n_layers_att']
    rval, updates = theano.scan(_step0,
                                sequences=seqs,
                                outputs_info=outputs_info,
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                profile=False)
    return rval
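The attention block inside _step projects the previous hidden state and the annotations into a joint space, squashes with tanh, scores each annotation with U_att, softmaxes the scores and takes the alpha-weighted sum of annotations as the new context. A numpy sketch of that soft-attention step, with hypothetical shapes for the weights (the optional extra attention layers are omitted):

import numpy as np

def soft_attention(h, x, Wd_att, Wc_att, b_att, U_att, c_tt, temperature_inverse=1.0):
    """h: (batch, dim), x: (batch, n_annotations, ctx_dim) -> (ctx, alpha)."""
    pstate = h.dot(Wd_att)                          # (batch, att_dim)
    pctx = np.tanh(x.dot(Wc_att) + b_att + pstate[:, None, :])
    scores = (pctx.dot(U_att) + c_tt)[:, :, 0] * temperature_inverse
    scores -= scores.max(axis=1, keepdims=True)     # stabilised softmax
    alpha = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    ctx = (x * alpha[:, :, None]).sum(axis=1)       # (batch, ctx_dim)
    return ctx, alpha

# Hypothetical sizes just to exercise the shapes.
batch, n_ann, ctx_dim, dim, att_dim = 2, 7, 6, 5, 4
rng = np.random.RandomState(0)
ctx, alpha = soft_attention(rng.randn(batch, dim), rng.randn(batch, n_ann, ctx_dim),
                            rng.randn(dim, att_dim), rng.randn(ctx_dim, att_dim),
                            np.zeros(att_dim), rng.randn(att_dim, 1), 0.0)
assert ctx.shape == (batch, ctx_dim) and np.allclose(alpha.sum(axis=1), 1.0)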
Esempio n. 50
0
def sample_level_rnn(input_sequences, h0, reset):
    """
    input_sequences.shape: (batch size, seq len)
    h0.shape:              (batch size, N_GRUS, DIM)
    reset.shape:           ()
    output.shape:          (batch size, seq len, Q_LEVELS)
    """

    learned_h0 = lib.param(
        'SampleLevel.h0',
        numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
    )
    learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
    h0 = theano.ifelse.ifelse(reset, learned_h0, h0)

    # Embedded inputs
    #################

    FRAME_SIZE = Q_LEVELS
    frames = lib.ops.Embedding('SampleLevel.Embedding', Q_LEVELS, Q_LEVELS, input_sequences)

    # Real-valued inputs
    ####################

    # 'frames' of size 1
    # FRAME_SIZE = 1
    # frames = input_sequences.reshape((
    #     input_sequences.shape[0],
    #     input_sequences.shape[1],
    #     1
    # ))
    # # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
    # # (a reasonable range to pass as inputs to the RNN)
    # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
    # frames *= lib.floatX(2)

    gru0 = lib.ops.LowMemGRU('SampleLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0])
    # gru0 = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU0FF', DIM, DIM, gru0, initialization='he'))
    grus = [gru0]
    for i in xrange(1, N_GRUS):
        gru = lib.ops.LowMemGRU('SampleLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i])
        # gru = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU'+str(i)+'FF', DIM, DIM, gru, initialization='he'))
        grus.append(gru)

    # We apply the softmax later
    output = lib.ops.Linear(
        'Output',
        N_GRUS*DIM,
        Q_LEVELS,
        T.concatenate(grus, axis=2)
    )
    # output = lib.ops.Linear(
    #     'Output',
    #     DIM,
    #     Q_LEVELS,
    #     grus[-1]
    # )

    last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1)

    return (output, last_hidden)
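The reset branch above keeps a single learned (N_GRUS, DIM) initial state, broadcasts it over the batch with T.alloc, and swaps it in for the carried-over h0 only when reset is set. A numpy sketch of that selection, with illustrative names and shapes:

import numpy as np

n_grus, dim, batch = 2, 5, 3
learned_h0 = np.zeros((n_grus, dim), dtype=np.float32)        # one shared parameter
carried_h0 = np.random.randn(batch, n_grus, dim).astype(np.float32)

reset = True                                                  # scalar flag
tiled_h0 = np.broadcast_to(learned_h0, (batch, n_grus, dim))  # like T.alloc(learned_h0, ...)
h0 = tiled_h0 if reset else carried_h0                        # like theano.ifelse.ifelse

assert h0.shape == (batch, n_grus, dim)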
Esempio n. 51
0
def max_pool_b01c(z, pool_shape, top_down=None, theano_rng=None):
    """
    .. todo::

        WRITEME properly

    An implementation of max_pool but where all 4-tensors use the
    ('b', 0, 1, 'c') format.
    """

    z_name = z.name
    if z_name is None:
        z_name = 'anon_z'

    batch_size, zr, zc, ch = z.shape

    r, c = pool_shape

    zpart = []

    mx = None

    if top_down is None:
        t = 0.
    else:
        t = -top_down

    for i in xrange(r):
        zpart.append([])
        for j in xrange(c):
            cur_part = z[:, i:zr:r, j:zc:c, :]
            if z_name is not None:
                cur_part.name = z_name + '[%d, %d]' % (i, j)
            zpart[i].append(cur_part)
            if mx is None:
                mx = T.maximum(t, cur_part)
                if cur_part.name is not None:
                    mx.name = 'max(-top_down,' + cur_part.name + ')'
            else:
                mx_name = None
                if cur_part.name is not None:
                    mx_name = 'max(' + cur_part.name + ',' + mx.name + ')'
                mx = T.maximum(mx, cur_part)
                mx.name = mx_name
    mx.name = 'local_max(' + z_name + ')'

    pt = []

    for i in xrange(r):
        pt.append([])
        for j in xrange(c):
            z_ij = zpart[i][j]
            safe = z_ij - mx
            safe.name = 'safe_z(%s)' % z_ij.name
            cur_pt = T.exp(safe)
            cur_pt.name = 'pt(%s)' % z_ij.name
            pt[-1].append(cur_pt)

    off_pt = T.exp(t - mx)
    off_pt.name = 'p_tilde_off(%s)' % z_name
    denom = off_pt

    for i in xrange(r):
        for j in xrange(c):
            denom = denom + pt[i][j]
    denom.name = 'denom(%s)' % z_name

    off_prob = off_pt / denom
    p = 1. - off_prob
    p.name = 'p(%s)' % z_name

    hpart = []
    for i in xrange(r):
        hpart.append([pt_ij / denom for pt_ij in pt[i]])

    h = T.alloc(0., batch_size, zr, zc, ch)

    for i in xrange(r):
        for j in xrange(c):
            h = T.set_subtensor(h[:, i:zr:r, j:zc:c, :], hpart[i][j])

    h.name = 'h(%s)' % z_name

    if theano_rng is None:
        return p, h
    else:
        events = []
        for i in xrange(r):
            for j in xrange(c):
                events.append(hpart[i][j])
        events.append(off_prob)

        events = [event.dimshuffle(0, 1, 2, 3, 'x') for event in events]

        events = tuple(events)

        stacked_events = T.concatenate(events, axis=4)

        batch_size, rows, cols, channels, outcomes = stacked_events.shape
        reshaped_events = stacked_events.reshape(
            (batch_size * rows * cols * channels, outcomes))

        multinomial = theano_rng.multinomial(pvals=reshaped_events,
                                             dtype=p.dtype)

        reshaped_multinomial = multinomial.reshape(
            (batch_size, rows, cols, channels, outcomes))

        h_sample = T.alloc(0., batch_size, zr, zc, ch)

        idx = 0
        for i in xrange(r):
            for j in xrange(c):
                h_sample = T.set_subtensor(
                    h_sample[:, i:zr:r, j:zc:c, :],
                    reshaped_multinomial[:, :, :, :, idx])
                idx += 1

        p_sample = 1 - reshaped_multinomial[:, :, :, :, -1]

        return p, h, p_sample, h_sample
Esempio n. 52
0
    def __init__(self,
                 input,
                 n_in,
                 n_hidden,
                 n_out,
                 activation=T.tanh,
                 output_type='real'):

        self.input = input
        self.activation = activation
        self.output_type = output_type

        self.batch_size = T.iscalar()

        # theta is a vector of all trainable parameters
        # it represents the value of W, W_in, W_out, h0, bh, by
        theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \
                      n_hidden + n_hidden + n_out
        self.theta = theano.shared(
            value=np.zeros(theta_shape, dtype=theano.config.floatX))

        # Parameters are reshaped views of theta
        param_idx = 0  # pointer to somewhere along parameter vector

        # recurrent weights as a shared variable
        self.W = self.theta[param_idx:(param_idx + n_hidden**2)].reshape(
            (n_hidden, n_hidden))
        self.W.name = 'W'
        W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden),
                                              low=-0.01,
                                              high=0.01),
                            dtype=theano.config.floatX)
        param_idx += n_hidden**2

        # input to hidden layer weights
        self.W_in = self.theta[param_idx:(param_idx + n_in * \
                                          n_hidden)].reshape((n_in, n_hidden))
        self.W_in.name = 'W_in'
        W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden),
                                                 low=-0.01,
                                                 high=0.01),
                               dtype=theano.config.floatX)
        param_idx += n_in * n_hidden

        # hidden to output layer weights
        self.W_out = self.theta[param_idx:(param_idx + n_hidden * \
                                           n_out)].reshape((n_hidden, n_out))
        self.W_out.name = 'W_out'

        W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out),
                                                  low=-0.01,
                                                  high=0.01),
                                dtype=theano.config.floatX)
        param_idx += n_hidden * n_out

        self.h0 = self.theta[param_idx:(param_idx + n_hidden)]
        self.h0.name = 'h0'
        h0_init = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        param_idx += n_hidden

        self.bh = self.theta[param_idx:(param_idx + n_hidden)]
        self.bh.name = 'bh'
        bh_init = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        param_idx += n_hidden

        self.by = self.theta[param_idx:(param_idx + n_out)]
        self.by.name = 'by'
        by_init = np.zeros((n_out, ), dtype=theano.config.floatX)
        param_idx += n_out

        assert (param_idx == theta_shape)

        # for convenience
        self.params = [
            self.W, self.W_in, self.W_out, self.h0, self.bh, self.by
        ]

        # shortcut to norms (for monitoring)
        self.l2_norms = {}
        for param in self.params:
            self.l2_norms[param] = T.sqrt(T.sum(param**2))

        # initialize parameters
        # DEBUG_MODE gives division by zero error when we leave parameters
        # as zeros
        self.theta.set_value(
            np.concatenate([
                x.ravel() for x in (W_init, W_in_init, W_out_init, h0_init,
                                    bh_init, by_init)
            ]))

        self.theta_update = theano.shared(
            value=np.zeros(theta_shape, dtype=theano.config.floatX))

        # recurrent function (using tanh activation function) and linear output
        # activation function
        def step(x_t, h_tm1):
            h_t = self.activation(T.dot(x_t, self.W_in) + \
                                  T.dot(h_tm1, self.W) + self.bh)
            y_t = T.dot(h_t, self.W_out) + self.by
            return h_t, y_t

        # the hidden state `h` for the entire sequence, and the output for the
        # entire sequence `y` (first dimension is always time)
        # Note the implementation of weight-sharing h0 across variable-size
        # batches using T.ones multiplying h0
        [self.h,
         self.y_pred], _ = theano.scan(step,
                                       sequences=self.input,
                                       outputs_info=[
                                           T.alloc(self.h0,
                                                   self.input.shape[1],
                                                   n_hidden), None
                                       ])
        # outputs_info=[T.ones(shape=(self.input.shape[1],
        # self.h0.shape[0])) * self.h0, None])

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = 0
        self.L1 += abs(self.W.sum())
        self.L1 += abs(self.W_in.sum())
        self.L1 += abs(self.W_out.sum())

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = 0
        self.L2_sqr += (self.W**2).sum()
        self.L2_sqr += (self.W_in**2).sum()
        self.L2_sqr += (self.W_out**2).sum()

        if self.output_type == 'real':
            self.loss = lambda y: self.mse(y)
        elif self.output_type == 'binary':
            # push through sigmoid
            self.p_y_given_x = T.nnet.sigmoid(self.y_pred)  # apply sigmoid
            self.y_out = T.round(self.p_y_given_x)  # round to {0,1}
            self.loss = lambda y: self.nll_binary(y)
        elif self.output_type == 'softmax':
            # push through softmax, computing vector of class-membership
            # probabilities in symbolic form
            #
            # T.nnet.softmax will not operate on T.tensor3 types, only matrices
            # We take our n_steps x n_seq x n_classes output from the net
            # and reshape it into a (n_steps * n_seq) x n_classes matrix
            # apply softmax, then reshape back
            y_p = self.y_pred
            y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1))
            y_p_s = T.nnet.softmax(y_p_m)
            self.p_y_given_x = T.reshape(y_p_s, y_p.shape)

            # compute prediction as class whose probability is maximal
            self.y_out = T.argmax(self.p_y_given_x, axis=-1)
            self.loss = lambda y: self.nll_multiclass(y)

        else:
            raise NotImplementedError
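All parameters above live in one flat theta vector, and each weight matrix is a reshaped view into a slice of it, which is what lets updates (or an external optimizer) treat the whole model as a single vector. A minimal numpy sketch of the same packing idea, with made-up sizes:

import numpy as np

n_in, n_hidden, n_out = 3, 4, 2
sizes = [('W', (n_hidden, n_hidden)),
         ('W_in', (n_in, n_hidden)),
         ('W_out', (n_hidden, n_out))]

theta = np.zeros(sum(np.prod(s) for _, s in sizes), dtype=np.float32)

params, idx = {}, 0
for name, shape in sizes:
    size = int(np.prod(shape))
    params[name] = theta[idx:idx + size].reshape(shape)  # a view into theta
    idx += size
assert idx == theta.size

# Writing through a view updates the flat vector, which is what makes one
# flat parameter vector convenient for optimizers that expect a single array.
params['W_in'][0, 0] = 1.0
assert theta[n_hidden * n_hidden] == 1.0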
Esempio n. 53
0
def max_pool(z, pool_shape, top_down=None, theano_rng=None):
    """
    .. todo::

        WRITEME properly

    z : a theano 4-tensor representing input from below
    pool_shape: tuple of ints. the shape of regions to be pooled
    top_down: (optional) a theano 4-tensor representing input from above
                if None, assumes top-down input is 0
    theano_rng: (optional) a MRG_RandomStreams instance

    returns:
        a theano 4-tensor for the expected value of the detector layer h
        a theano 4-tensor for the expected value of the pooling layer p
        if theano_rng is not None, also returns:
            a theano 4-tensor of samples of the detector layer
            a theano 4-tensor of samples of the pooling layer

    all 4-tensors are formatted with axes ('b', 'c', 0, 1).
    This is for maximum speed when using theano's conv2d
    to generate z and top_down, or when using it to infer conditionals of
    other layers using the return values.

    Detailed description:

    Suppose you have a variable h that lives in a Conv2DSpace h_space and
    you want to pool it down to a variable p that lives in a smaller
    Conv2DSpace p.

    This function does that, using non-overlapping pools.

    Specifically, consider one channel of h. h must have a height that is a
    multiple of pool_shape[0] and a width that is a multiple of pool_shape[1].
    A channel of h can thus be broken down into non-overlapping rectangles
    of shape pool_shape.

    Now consider one rectangular pooled region within one channel of h.
    I now use 'h' to refer just to this rectangle, and 'p' to refer to
    just the one pooling unit associated with that rectangle.
    We assume that the space that h and p live in is constrained such
    that h and p are both binary and p = max(h). To reduce the state-space
    in order to make probabilistic computations cheaper we also
    constrain sum(h) <= 1.
    Suppose h contains k different units. Suppose that the only term
    in the model's energy function involving h is -(z*h).sum()
    (elemwise multiplication) and the only term in
    the model's energy function involving p is -(top_down*p).sum().

    Then P(h[i] = 1) = softmax( [ z[1], z[2], ..., z[k], -top_down] )[i]
    and P(p = 1) = 1-softmax( [z[1], z[2], ..., z[k], -top_down])[k]


    This variation of the function assumes that z, top_down, and all
    return values use Conv2D axes ('b', 'c', 0, 1).
    This variation of the function implements the softmax using a
    theano graph of exp, maximum, sub, and div operations.

    Performance notes:
    It might be possible to make a faster implementation with different
    theano ops. rather than using set_subtensor, it might be possible
    to use the stuff in theano.sandbox.neighbours. Probably not possible,
    or at least nasty, because that code isn't written with multiple
    channels in mind, and I don't think just a reshape can fix it.
    Some work on this in galatea.cond.neighbs.py
    At some point images2neibs' gradient was broken, so check that
    it has been fixed before sinking too much time into this.

    Stabilizing the softmax is also another source of slowness.
    Here it is stabilized with several calls to maximum and sub.
    It might also be possible to stabilize it with
    T.maximum(-top_down,T.signal.downsample.max_pool(z)).
    Don't know if that would be faster or slower.

    Elsewhere in this file I implemented the softmax with a reshape
    and call to Softmax / SoftmaxWithBias.
    This is slower, even though Softmax is faster on the GPU than the
    equivalent max/sub/exp/div graph. Maybe the reshape is too expensive.

    Benchmarks show that most of the time is spent in GpuIncSubtensor
    when running on gpu. So it is mostly that which needs a faster
    implementation. One other way to implement this would be with
    a linear.Conv2D.lmul_T, where the convolution stride is equal to
    the pool width, and the thing to multiply with is the hparts stacked
    along the channel axis. Unfortunately, conv2D doesn't work right
    with stride > 2 and is pretty slow for stride 2. Conv3D is used to
    mitigate some of this, but only has CPU code.
    """

    z_name = z.name
    if z_name is None:
        z_name = 'anon_z'

    batch_size, ch, zr, zc = z.shape

    r, c = pool_shape

    zpart = []

    mx = None

    if top_down is None:
        t = 0.
    else:
        t = -top_down
        t.name = 'neg_top_down'

    for i in xrange(r):
        zpart.append([])
        for j in xrange(c):
            cur_part = z[:, :, i:zr:r, j:zc:c]
            if z_name is not None:
                cur_part.name = z_name + '[%d,%d]' % (i, j)
            zpart[i].append(cur_part)
            if mx is None:
                mx = T.maximum(t, cur_part)
                if cur_part.name is not None:
                    mx.name = 'max(-top_down,' + cur_part.name + ')'
            else:
                mx_name = None
                if cur_part.name is not None:
                    mx_name = 'max(' + cur_part.name + ',' + mx.name + ')'
                mx = T.maximum(mx, cur_part)
                mx.name = mx_name
    mx.name = 'local_max(' + z_name + ')'

    pt = []

    for i in xrange(r):
        pt.append([])
        for j in xrange(c):
            z_ij = zpart[i][j]
            safe = z_ij - mx
            safe.name = 'safe_z(%s)' % z_ij.name
            cur_pt = T.exp(safe)
            cur_pt.name = 'pt(%s)' % z_ij.name
            pt[-1].append(cur_pt)

    off_pt = T.exp(t - mx)
    off_pt.name = 'p_tilde_off(%s)' % z_name
    denom = off_pt

    for i in xrange(r):
        for j in xrange(c):
            denom = denom + pt[i][j]
    denom.name = 'denom(%s)' % z_name

    off_prob = off_pt / denom
    p = 1. - off_prob
    p.name = 'p(%s)' % z_name

    hpart = []
    for i in xrange(r):
        hpart.append([pt_ij / denom for pt_ij in pt[i]])

    h = T.alloc(0., batch_size, ch, zr, zc)

    for i in xrange(r):
        for j in xrange(c):
            h.name = 'h_interm'
            h = T.set_subtensor(h[:, :, i:zr:r, j:zc:c], hpart[i][j])

    h.name = 'h(%s)' % z_name

    if theano_rng is None:
        return p, h
    else:
        events = []
        for i in xrange(r):
            for j in xrange(c):
                events.append(hpart[i][j])
        events.append(off_prob)

        events = [event.dimshuffle(0, 1, 2, 3, 'x') for event in events]

        events = tuple(events)

        stacked_events = T.concatenate(events, axis=4)

        rows = zr // pool_shape[0]
        cols = zc // pool_shape[1]
        outcomes = pool_shape[0] * pool_shape[1] + 1
        assert stacked_events.ndim == 5
        for se, bs, r, c, chv in get_debug_values(stacked_events, batch_size,
                                                  rows, cols, ch):
            assert se.shape[0] == bs
            assert se.shape[1] == r
            assert se.shape[2] == c
            assert se.shape[3] == chv
            assert se.shape[4] == outcomes
        reshaped_events = stacked_events.reshape(
            (batch_size * rows * cols * ch, outcomes))

        multinomial = theano_rng.multinomial(pvals=reshaped_events,
                                             dtype=p.dtype)

        reshaped_multinomial = multinomial.reshape(
            (batch_size, ch, rows, cols, outcomes))

        h_sample = T.alloc(0., batch_size, ch, zr, zc)

        idx = 0
        for i in xrange(r):
            for j in xrange(c):
                h_sample = T.set_subtensor(
                    h_sample[:, :, i:zr:r, j:zc:c],
                    reshaped_multinomial[:, :, :, :, idx])
                idx += 1

        p_sample = 1 - reshaped_multinomial[:, :, :, :, -1]

        return p, h, p_sample, h_sample
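For a single pooled region, the docstring's formula says the detector units and the off state form one (k+1)-way softmax, and the pooling unit is on with probability one minus the off-state probability. A small numpy check of that formula for one region, with made-up inputs:

import numpy as np

# P(h[i] = 1) = softmax([z[1], ..., z[k], -top_down])[i]
# P(p = 1)    = 1 - softmax([z[1], ..., z[k], -top_down])[k]
z = np.array([0.3, -1.2, 2.0, 0.5])      # detector-unit inputs for one pool (k = 4)
top_down = 0.7                           # input from above

logits = np.append(z, -top_down)
logits -= logits.max()                   # same stabilisation as the max/sub graph
probs = np.exp(logits) / np.exp(logits).sum()

h_prob = probs[:-1]                      # expected detector units
p_prob = 1.0 - probs[-1]                 # expected pooling unit

# Because sum(h) <= 1, the events are exclusive and P(p=1) = sum_i P(h[i]=1).
assert np.isclose(h_prob.sum(), p_prob)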
Esempio n. 54
0
    def define_layers(self):
        self.layers = []
        self.params = []

        for i in xrange(self.num_hds):
            if i == 0:
                layer_input = self.X
                h_shape = (self.out_size, self.hidden_size_list[0])
            else:
                layer_input = self.layers[i - 1].activation
                h_shape = (self.hidden_size_list[i - 1],
                           self.hidden_size_list[i])

            if self.cell == "gru":
                hidden_layer = GRULayer(self.rng,
                                        self.prefix + self.layer_id + str(i),
                                        h_shape, layer_input, self.mask,
                                        self.is_train, self.batch_size,
                                        self.drop_rate)
            elif self.cell == "lstm":
                hidden_layer = LSTMLayer(self.rng,
                                         self.prefix + self.layer_id + str(i),
                                         h_shape, layer_input, self.mask,
                                         self.is_train, self.batch_size,
                                         self.drop_rate)
            self.layers.append(hidden_layer)
            self.params += hidden_layer.params

        #the last decoder layer for decoding
        if self.num_hds == 0:
            output_layer_input = self.X
            last_shape = (self.in_size, self.out_size)
        else:
            output_layer_input = self.layers[-1].activation
            last_shape = (self.in_size, self.layers[-1].out_size)

        self.W_hy = init_weights((last_shape[1], last_shape[0]),
                                 self.prefix + "W_hy" + self.layer_id)
        self.b_y = init_bias(last_shape[0],
                             self.prefix + "b_y" + self.layer_id)
        if self.cell == "gru":
            self.decoder = GRULayer(self.rng, self.prefix + self.layer_id,
                                    last_shape, output_layer_input, self.mask,
                                    self.is_train, self.batch_size,
                                    self.drop_rate)

            def _active(m, pre_h, x):
                x = T.reshape(x, (self.batch_size, last_shape[0]))
                pre_h = T.reshape(pre_h, (self.batch_size, last_shape[1]))

                h = self.decoder._active(x, pre_h)
                y = T.nnet.softmax(T.dot(h, self.W_hy) + self.b_y)
                y = y * m[:, None]

                h = T.reshape(h, (1, self.batch_size * last_shape[1]))
                y = T.reshape(y, (1, self.batch_size * last_shape[0]))
                return h, y

            [h, y], updates = theano.scan(
                _active,  #n_steps = self.words,
                sequences=[self.mask],
                outputs_info=[{
                    'initial': output_layer_input,
                    'taps': [-1]
                },
                              T.alloc(floatX(0.), 1,
                                      self.batch_size * last_shape[0])])
        elif self.cell == "lstm":
            self.decoder = LSTMLayer(self.rng, self.prefix + self.layer_id,
                                     last_shape, output_layer_input, self.mask,
                                     self.is_train, self.batch_size,
                                     self.drop_rate)

            def _active(m, pre_h, pre_c, x):
                x = T.reshape(x, (self.batch_size, last_shape[0]))
                pre_h = T.reshape(pre_h, (self.batch_size, last_shape[1]))
                pre_c = T.reshape(pre_c, (self.batch_size, last_shape[1]))

                h, c = self.decoder._active(x, pre_h, pre_c)

                y = T.nnet.softmax(T.dot(h, self.W_hy) + self.b_y)
                y = y * m[:, None]

                h = T.reshape(h, (1, self.batch_size * last_shape[1]))
                c = T.reshape(c, (1, self.batch_size * last_shape[1]))
                y = T.reshape(y, (1, self.batch_size * last_shape[0]))
                return h, c, y

            [h, c, y], updates = theano.scan(
                _active,
                sequences=[self.mask],
                outputs_info=[{
                    'initial': output_layer_input,
                    'taps': [-1]
                }, {
                    'initial': output_layer_input,
                    'taps': [-1]
                },
                              T.alloc(floatX(0.), 1,
                                      self.batch_size * last_shape[0])])

        y = T.reshape(y, (self.words, self.batch_size * last_shape[0]))
        self.activation = y
        self.params += self.decoder.params
        self.params += [self.W_hy, self.b_y]
        # self.layers.append(self.decoder)
        self.hhhh = h
Esempio n. 55
0
def build_sampler(tparams, options, trng):
    x = tensor.tensor3('x', dtype='float32')
    xr = x[::-1]
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding (source), forward and backward
    h = x
    hr = xr
    hidden_sizes = options['dim_enc']

    for i in range(len(hidden_sizes)):
        proj = get_layer(options['encoder'])[1](tparams, h, options,
                                                prefix='encoder'+str(i))
        # word embedding for backward rnn (source)
        projr = get_layer(options['encoder'])[1](tparams, hr, options,
                                                 prefix='encoder_r'+str(i))

        h = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)
        if options['down_sample'][i] == 1:
            h = h[0::2]
        hr = h[::-1]

    ctx = h
    # get the input for decoder rnn initializer mlp
    ctx_mean = ctx.mean(0)
    # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
    init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                    prefix='ff_state', activ='tanh')

    print('Building f_init...',)
    outs = [init_state, ctx]
    f_init = theano.function([x], outs, name='f_init', profile=profile)
    print('Done')

    # x: 1 x 1
    y = tensor.vector('y_sampler', dtype='int64')
    init_state = tensor.matrix('init_state', dtype='float32')
    alpha_past = tensor.matrix('alpha_past', dtype='float32')

    # if it's the first word, emb should be all zero and it is indicated by -1
    emb = tensor.switch(y[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
                        tparams['Wemb_dec'][y])

    # apply one step of conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams, emb, options,
                                            prefix='decoder',
                                            mask=None, context=ctx,
                                            one_step=True,
                                            init_state=init_state, alpha_past = alpha_past)
    # get the next hidden state
    next_state = proj[0]

    # get the weighted averages of context for this target word y
    ctxs = proj[1]
    next_alpha_past = proj[3]

    logit_lstm = get_layer('ff')[1](tparams, next_state, options,
                                    prefix='ff_logit_lstm', activ='linear')
    logit_prev = get_layer('ff')[1](tparams, emb, options,
                                    prefix='ff_logit_prev', activ='linear')
    logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
                                   prefix='ff_logit_ctx', activ='linear')
    logit = logit_lstm+logit_prev+logit_ctx

    # maxout layer
    shape = logit.shape
    shape1 = tensor.cast(shape[1] // 2, 'int64')
    shape2 = tensor.cast(2, 'int64')
    logit = logit.reshape([shape[0], shape1, shape2])  # e.g. batch*256 -> batch*128*2
    logit = logit.max(2)  # maxout over adjacent pairs -> batch*128


    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')

    # compute the softmax probability
    next_probs = tensor.nnet.softmax(logit)

    # sample from softmax distribution to get the sample
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # compile a function to do the whole thing above, next word probability,
    # sampled word for the next target, next hidden state to be used
    print('Building f_next..')
    inps = [y, ctx, init_state, alpha_past]
    outs = [next_probs, next_sample, next_state, next_alpha_past]
    f_next = theano.function(inps, outs, name='f_next', profile=profile, on_unused_input='ignore')
    print('Done')

    return f_init, f_next
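The maxout step above halves the width of logit by reshaping it into non-overlapping pairs and keeping the larger element of each pair. A numpy sketch with illustrative sizes:

import numpy as np

batch, width = 3, 8                      # width must be even, e.g. 2 * dim_word
logit = np.random.randn(batch, width).astype(np.float32)

maxout = logit.reshape(batch, width // 2, 2).max(axis=2)   # (batch, width // 2)

# Each output unit is the max of one non-overlapping pair of inputs.
assert np.allclose(maxout[:, 0], np.maximum(logit[:, 0], logit[:, 1]))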
Esempio n. 56
0
def max_pool_channels(z, pool_size, top_down=None, theano_rng=None):
    """
    .. todo::

        WRITEME properly

    Unlike Honglak's convolutional max pooling, which pools over spatial
    locations within each channels, this does max pooling in a densely
    connected model. Here we pool groups of channels together.

    z : a theano matrix representing a batch of input from below
    pool_size: int. the number of features to combine into one pooled unit
    top_down: (optional) a theano matrix representing input from above
                if None, assumes top-down input is 0
    theano_rng: (optional) a MRG_RandomStreams instance

    returns:
        a theano matrix for the expected value of the detector layer h
        a theano matrix for the expected value of the pooling layer p
        if theano_rng is not None, also returns:
            a theano matrix of samples of the detector layer
            a theano matrix of samples of the pooling layer

    all matrices are formatted as (num_example, num_features)
    """

    z_name = z.name
    if z_name is None:
        z_name = 'anon_z'

    if pool_size == 1:
        if top_down is None:
            top_down = 0.
        total_input = z + top_down
        p = T.nnet.sigmoid(total_input)
        h = p

        if theano_rng is None:
            return p, h
        else:
            t1 = time.time()
            p_samples = theano_rng.binomial(p=p,
                                            size=p.shape,
                                            dtype=p.dtype,
                                            n=1)
            t2 = time.time()
            if t2 - t1 > 0.5:
                warnings.warn("TODO: speed up theano's random number seeding. "
                              "max pooling spent " + str(t2 - t1) +
                              " in a call to theano_rng.binomial.")
            h_samples = p_samples
            return p_samples, h_samples, p_samples, h_samples
    else:
        batch_size, n = z.shape

        mx = None

        if top_down is None:
            t = 0.
        else:
            t = -top_down
            t.name = 'neg_top_down'

        zpart = []
        for i in xrange(pool_size):
            cur_part = z[:, i:n:pool_size]
            if z_name is not None:
                cur_part.name = z_name + '[%d]' % (i)
            zpart.append(cur_part)
            if mx is None:
                mx = T.maximum(t, cur_part)
                if cur_part.name is not None:
                    mx.name = 'max(-top_down,' + cur_part.name + ')'
            else:
                mx_name = None
                if cur_part.name is not None:
                    mx_name = 'max(' + cur_part.name + ',' + mx.name + ')'
                mx = T.maximum(mx, cur_part)
                mx.name = mx_name
        mx.name = 'local_max(' + z_name + ')'

        pt = []

        for i in xrange(pool_size):
            z_i = zpart[i]
            safe = z_i - mx
            safe.name = 'safe_z(%s)' % z_i.name
            cur_pt = T.exp(safe)
            cur_pt.name = 'pt(%s)' % z_i.name
            assert cur_pt.ndim == 2
            pt.append(cur_pt)

        off_pt = T.exp(t - mx)
        assert off_pt.ndim == 2
        off_pt.name = 'p_tilde_off(%s)' % z_name

        denom = off_pt
        for i in xrange(pool_size):
            denom = denom + pt[i]
        assert denom.ndim == 2
        denom.name = 'denom(%s)' % z_name

        off_prob = off_pt / denom
        p = 1. - off_prob
        assert p.dtype == z.dtype

        hpart = [pt_i / denom for pt_i in pt]

        h = T.alloc(0., batch_size, n)

        for i in xrange(pool_size):
            h.name = 'h_interm'
            hp = hpart[i]
            sub_h = h[:, i:n:pool_size]
            assert sub_h.ndim == 2
            assert hp.ndim == 2
            for hv, hsv, hpartv in get_debug_values(h, sub_h, hp):
                print hv.shape
                print hsv.shape
                print hpartv.shape
            h = T.set_subtensor(sub_h, hp)

    p.name = 'p(%s)' % z_name
    h.name = 'h(%s)' % z_name

    if theano_rng is None:
        return p, h
    else:
        events = []
        for i in xrange(pool_size):
            events.append(hpart[i])
        events.append(off_prob)

        events = [event.dimshuffle(0, 1, 'x') for event in events]

        events = tuple(events)

        stacked_events = T.concatenate(events, axis=2)

        outcomes = pool_size + 1
        reshaped_events = stacked_events.reshape(
            (batch_size * n // pool_size, outcomes))

        t1 = time.time()
        multinomial = theano_rng.multinomial(pvals=reshaped_events,
                                             dtype=p.dtype)
        t2 = time.time()
        if t2 - t1 > 0.5:
            warnings.warn("TODO: speed up theano's random number seeding. "
                          "max pooling spent " + str(t2 - t1) +
                          " in a call to theano_rng.multinomial.")

        reshaped_multinomial = multinomial.reshape(
            (batch_size, n // pool_size, outcomes))

        h_sample = T.zeros_like(z)

        idx = 0
        for i in xrange(pool_size):
            h_sample = T.set_subtensor(h_sample[:, i:n:pool_size],
                                       reshaped_multinomial[:, :, idx])
            idx += 1

        p_sample = 1 - reshaped_multinomial[:, :, -1]

        assert h_sample.dtype == z.dtype

        return p, h, p_sample, h_sample
Esempio n. 57
0
    def train(self, savefile, task, recover=True):
        """
        Train the RNN.

        Parameters
        ----------

        savefile : str

        task : function

        recover : bool, optional
                  If `True`, will attempt to recover from a previously saved run.

        """
        N     = self.p['N']
        Nin   = self.p['Nin']
        Nout  = self.p['Nout']
        alpha = self.p['dt']/self.p['tau']

        # Initialize settings
        settings = OrderedDict()

        # Check if file already exists
        if not recover:
            if os.path.isfile(savefile):
                os.remove(savefile)

        #---------------------------------------------------------------------------------
        # Are we using GPUs?
        #---------------------------------------------------------------------------------

        if theanotools.get_processor_type() == 'gpu':
            settings['GPU'] = 'enabled'
        else:
            settings['GPU'] = 'no'

        #---------------------------------------------------------------------------------
        # Random number generator
        #---------------------------------------------------------------------------------

        settings['init seed'] = self.p['seed']
        rng = np.random.RandomState(self.p['seed'])

        #---------------------------------------------------------------------------------
        # Weight initialization
        #---------------------------------------------------------------------------------

        settings['distribution (Win)']  = self.p['distribution_in']
        settings['distribution (Wrec)'] = self.p['distribution_rec']
        settings['distribution (Wout)'] = self.p['distribution_out']

        if Nin > 0:
            Win_0 = self.init_weights(rng, self.p['Cin'], N, Nin,
                                      self.p['distribution_in'])
        Wrec_0 = self.init_weights(rng, self.p['Crec'],
                                   N, N, self.p['distribution_rec'])
        Wout_0 = self.init_weights(rng, self.p['Cout'],
                                   Nout, N, self.p['distribution_out'])

        #---------------------------------------------------------------------------------
        # Enforce Dale's law on the initial weights
        #---------------------------------------------------------------------------------

        settings['Nin/N/Nout'] = '{}/{}/{}'.format(Nin, N, Nout)

        if self.p['ei'] is not None:
            Nexc = len(np.where(self.p['ei'] > 0)[0])
            Ninh = len(np.where(self.p['ei'] < 0)[0])
            settings['Dale\'s law'] = 'E/I = {}/{}'.format(Nexc, Ninh)

            if Nin > 0:
                Win_0 = abs(Win_0) # If Dale, assume inputs are excitatory
            Wrec_0 = abs(Wrec_0)
            Wout_0 = abs(Wout_0)
        else:
            settings['Dale\'s law'] = 'no'

        #---------------------------------------------------------------------------------
        # Fix spectral radius
        #---------------------------------------------------------------------------------

        # Compute spectral radius
        C = self.p['Crec']
        if C is not None:
            Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed
        else:
            Wrec_0_full = Wrec_0
        if self.p['ei'] is not None:
            Wrec_0_full = Wrec_0_full*self.p['ei']
        rho = RNN.spectral_radius(Wrec_0_full)

        # Scale Wrec to have fixed spectral radius
        if self.p['ei'] is not None:
            R = self.p['rho0']/rho
        else:
            R = 1.1/rho
        Wrec_0 *= R
        if C is not None:
            C.mask_fixed *= R

        # Check spectral radius
        if C is not None:
            Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed
        else:
            Wrec_0_full = Wrec_0
        if self.p['ei'] is not None:
            Wrec_0_full = Wrec_0_full*self.p['ei']
        rho = RNN.spectral_radius(Wrec_0_full)
        settings['initial spectral radius'] = '{:.2f}'.format(rho)

        #---------------------------------------------------------------------------------
        # Others
        #---------------------------------------------------------------------------------

        brec_0 = self.p['brec']*np.ones(N)
        bout_0 = self.p['bout']*np.ones(Nout)
        x0_0   = self.p['x0']*np.ones(N)

        #---------------------------------------------------------------------------------
        # RNN parameters
        #---------------------------------------------------------------------------------

        if Nin > 0:
            Win = theanotools.shared(Win_0, name='Win')
        else:
            Win = None
        Wrec = theanotools.shared(Wrec_0, name='Wrec')
        Wout = theanotools.shared(Wout_0, name='Wout')
        brec = theanotools.shared(brec_0, name='brec')
        bout = theanotools.shared(bout_0, name='bout')
        x0   = theanotools.shared(x0_0,   name='x0')

        #---------------------------------------------------------------------------------
        # Parameters to train
        #---------------------------------------------------------------------------------

        trainables = []
        if Win is not None:
            trainables += [Win]
        trainables += [Wrec]
        if Wout is not None:
            trainables += [Wout]

        if self.p['train_brec']:
            settings['train recurrent bias'] = 'yes'
            trainables += [brec]
        else:
            settings['train recurrent bias'] = 'no'

        if self.p['train_bout']:
            settings['train output bias'] = 'yes'
            trainables += [bout]
        else:
            settings['train output bias'] = 'no'

        # In continuous mode it doesn't make sense to train x0, which is forgotten
        if self.p['mode'] == 'continuous':
            self.p['train_x0'] = False

        if self.p['train_x0']:
            settings['train initial conditions'] = 'yes'
            trainables += [x0]
        else:
            settings['train initial conditions'] = 'no'

        #---------------------------------------------------------------------------------
        # Weight matrices
        #---------------------------------------------------------------------------------

        # Input
        if Nin > 0:
            if self.p['Cin'] is not None:
                C = self.p['Cin']
                settings['sparseness (Win)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                                .format(C.p, C.p_plastic))

                Cin_mask_plastic = theanotools.shared(C.mask_plastic)
                Cin_mask_fixed   = theanotools.shared(C.mask_fixed)

                Win_ = Cin_mask_plastic*Win + Cin_mask_fixed
                Win_.name = 'Win_'
            else:
                Win_ = Win

        # Recurrent
        if self.p['Crec'] is not None:
            C = self.p['Crec']
            settings['sparseness (Wrec)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                             .format(C.p, C.p_plastic))

            Crec_mask_plastic = theanotools.shared(C.mask_plastic)
            Crec_mask_fixed   = theanotools.shared(C.mask_fixed)

            Wrec_ = Crec_mask_plastic*Wrec + Crec_mask_fixed
            Wrec_.name = 'Wrec_'
        else:
            Wrec_ = Wrec

        # Output
        if self.p['Cout'] is not None:
            C = self.p['Cout']
            settings['sparseness (Wout)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                             .format(C.p, C.p_plastic))

            Cout_mask_plastic = theanotools.shared(C.mask_plastic)
            Cout_mask_fixed   = theanotools.shared(C.mask_fixed)

            Wout_ = Cout_mask_plastic*Wout + Cout_mask_fixed
            Wout_.name = 'Wout_'
        else:
            Wout_ = Wout

        #---------------------------------------------------------------------------------
        # Dale's law
        #---------------------------------------------------------------------------------

        if self.p['ei'] is not None:
            # Function to keep matrix elements positive
            if self.p['ei_positive_func'] == 'abs':
                settings['E/I positivity function'] = 'absolute value'
                make_positive = abs
            elif self.p['ei_positive_func'] == 'rectify':
                settings['E/I positivity function'] = 'rectify'
                make_positive = theanotools.rectify
            else:
                raise ValueError("Unknown ei_positive_func.")

            # Assume inputs are excitatory
            if Nin > 0:
                Win_ = make_positive(Win_)

            # E/I
            ei    = theanotools.shared(self.p['ei'], name='ei')
            Wrec_ = make_positive(Wrec_)*ei

            Wout_ = make_positive(Wout_)*ei
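        # Dale's law: ei is presumably a length-N vector of +1/-1 signs, so each
        # unit's outgoing weights are first made non-negative and then multiplied
        # (column-wise, via broadcasting) by that unit's excitatory/inhibitory sign.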

        #---------------------------------------------------------------------------------
        # Variables to save
        #---------------------------------------------------------------------------------

        if Nin > 0:
            save_values = [Win_]
        else:
            save_values = [None]
        save_values += [Wrec_, Wout_, brec, bout, x0]

        #---------------------------------------------------------------------------------
        # Activation functions
        #---------------------------------------------------------------------------------

        f_hidden, d_f_hidden = theanotools.hidden_activations[self.p['hidden_activation']]
        settings['hidden activation'] = self.p['hidden_activation']

        act = self.p['output_activation']
        f_output = theanotools.output_activations[act]

        if act == 'sigmoid':
            settings['output activation/loss'] = 'sigmoid/binary cross entropy'
            f_loss = theanotools.binary_crossentropy
        elif act == 'softmax':
            settings['output activation/loss'] = 'softmax/categorical cross entropy'
            f_loss = theanotools.categorical_crossentropy
        else:
            settings['output activation/loss'] = act + '/squared'
            f_loss = theanotools.L2
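        # Note: sigmoid and softmax are paired with their matched cross-entropy
        # losses, which give the usual simple output-layer gradients; any other
        # output activation falls back to a squared-error loss.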

        #---------------------------------------------------------------------------------
        # RNN
        #---------------------------------------------------------------------------------

        # Dims: time, trials, units
        # u[:,:,:Nin]  contains the inputs (including baseline and noise),
        # u[:,:,Nin:]  contains the recurrent noise
        u   = T.tensor3('u')
        x0_ = T.alloc(x0, u.shape[1], x0.shape[0])
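        # T.alloc broadcasts the (learned) initial state x0 across trials, so
        # x0_ has shape (n_trials, N) as required by scan's outputs_info.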

        if Nin > 0:
            def rnn(u_t, x_tm1, r_tm1, WinT, WrecT):
                x_t = ((1 - alpha)*x_tm1
                       + alpha*(T.dot(r_tm1, WrecT)        # Recurrent
                                + brec                     # Bias
                                + T.dot(u_t[:,:Nin], WinT) # Input
                                + u_t[:,Nin:])             # Recurrent noise
                       )
                r_t = f_hidden(x_t)

                return [x_t, r_t]

            [x, r], _ = theano.scan(fn=rnn,
                                    outputs_info=[x0_, f_hidden(x0_)],
                                    sequences=u,
                                    non_sequences=[Win_.T, Wrec_.T])
        else:
            def rnn(u_t, x_tm1, r_tm1, WrecT):
                x_t = ((1 - alpha)*x_tm1
                       + alpha*(T.dot(r_tm1, WrecT) # Recurrent
                                + brec              # Bias
                                + u_t[:,Nin:]) # Recurrent noise
                       )
                r_t = f_hidden(x_t)

                return [x_t, r_t]

            [x, r], _ = theano.scan(fn=rnn,
                                    outputs_info=[x0_, f_hidden(x0_)],
                                    sequences=u,
                                    non_sequences=[Wrec_.T])
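        # scan iterates over the leading (time) axis of u, threading the state x
        # and the rates r from one step to the next; the stacked outputs x and r
        # therefore have shape (time, trials, units).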

        #---------------------------------------------------------------------------------
        # Running mode
        #---------------------------------------------------------------------------------

        if self.p['mode'] == 'continuous':
            settings['mode'] = 'continuous'

            if self.p['n_gradient'] != 1:
                print("[ Trainer.train ] In continuous mode,"
                      " so we're setting n_gradient to 1.")
                self.p['n_gradient'] = 1

            x0_ = x[-1]
        else:
            settings['mode'] = 'batch'

        #---------------------------------------------------------------------------------
        # Readout
        #---------------------------------------------------------------------------------

        z = f_output(T.dot(r, Wout_.T) + bout)

        #---------------------------------------------------------------------------------
        # Deduce whether the task specification contains an output mask -- use a
        # temporary dataset so it doesn't affect the training.
        #---------------------------------------------------------------------------------

        dataset = Dataset(1, task, self.floatX, self.p, name='gradient')
        if dataset.has_output_mask():
            settings['output mask'] = 'yes'
        else:
            settings['output mask'] = 'no'

        #---------------------------------------------------------------------------------
        # Loss
        #---------------------------------------------------------------------------------

        # (time, trials, outputs)
        target = T.tensor3('target')

        # Set mask
        mask     = target[:,:,Nout:]
        masknorm = T.sum(mask)

        # Input-output pairs
        inputs = [u, target]
        # target[:,:,:Nout] contains the target outputs, and
        # target[:,:,Nout:] contains the mask.

        # Loss, not including the regularization terms
        loss = T.sum(f_loss(z, target[:,:,:Nout])*mask)/masknorm

        # Root-mean-squared error
        error = T.sqrt(T.sum(theanotools.L2(z, target[:,:,:Nout])*mask)/masknorm)
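        # The mask zeroes out (time, trial, output) entries that should not count
        # toward the loss; dividing by masknorm (the total mask weight, i.e. the
        # number of valid entries for a binary mask) averages over valid entries only.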

        #---------------------------------------------------------------------------------
        # Regularization terms
        #---------------------------------------------------------------------------------

        regs = 0

        #---------------------------------------------------------------------------------
        # L1 weight regularization
        #---------------------------------------------------------------------------------

        if Nin > 0:
            lambda1 = self.p['lambda1_in']
            if lambda1 > 0:
                settings['L1 weight regularization (Win)'] = ('lambda1_in = {}'
                                                              .format(lambda1))
                regs += lambda1 * T.mean(abs(Win))

        lambda1 = self.p['lambda1_rec']
        if lambda1 > 0:
            settings['L1 weight regularization (Wrec)'] = ('lambda1_rec = {}'
                                                           .format(lambda1))
            regs += lambda1 * T.mean(abs(Wrec))

        lambda1 = self.p['lambda1_out']
        if lambda1 > 0:
            settings['L1 weight regularization (Wout)'] = ('lambda1_out = {}'
                                                           .format(lambda1))
            regs += lambda1 * T.mean(abs(Wout))

        #---------------------------------------------------------------------------------
        # L2 weight regularization
        #---------------------------------------------------------------------------------

        if Nin > 0:
            lambda2 = self.p['lambda2_in']
            if lambda2 > 0:
                settings['L2 weight regularization (Win)'] = ('lambda2_in = {}'
                                                              .format(lambda2))
                regs += lambda2 * T.mean(Win**2)

        lambda2 = self.p['lambda2_rec']
        if lambda2 > 0:
            settings['L2 weight regularization (Wrec)'] = ('lambda2_rec = {}'
                                                           .format(lambda2))
            regs += lambda2 * T.mean(Wrec**2)

        lambda2 = self.p['lambda2_out']
        if lambda2 > 0:
            settings['L2 weight regularization (Wout)'] = ('lambda2_out = {}'
                                                           .format(lambda2))
            regs += lambda2 * T.mean(Wout**2)

        #---------------------------------------------------------------------------------
        # L2 rate regularization
        #---------------------------------------------------------------------------------

        lambda2 = self.p['lambda2_r']
        if lambda2 > 0:
            settings['L2 rate regularization'] = 'lambda2_r = {}'.format(lambda2)
            regs += lambda2 * T.mean(r**2)

        #---------------------------------------------------------------------------------
        # Final costs
        #---------------------------------------------------------------------------------

        costs = [loss, error]

        #---------------------------------------------------------------------------------
        # Datasets
        #---------------------------------------------------------------------------------

        gradient_data   = Dataset(self.p['n_gradient'], task, self.floatX, self.p,
                                  batch_size=self.p['gradient_batch_size'],
                                  seed=self.p['gradient_seed'],
                                  name='gradient')
        validation_data = Dataset(self.p['n_validation'], task, self.floatX, self.p,
                                  batch_size=self.p['validation_batch_size'],
                                  seed=self.p['validation_seed'],
                                  name='validation')

        # Input noise
        if np.isscalar(self.p['var_in']):
            if Nin > 0:
                settings['sigma_in'] = '{}'.format(np.sqrt(self.p['var_in']))
        else:
            settings['sigma_in'] = 'array'

        # Recurrent noise
        if np.isscalar(self.p['var_rec']):
            settings['sigma_rec'] = '{}'.format(np.sqrt(self.p['var_rec']))
        else:
            settings['sigma_rec'] = 'array'

        # Dataset settings
        settings['rectify inputs']            = self.p['rectify_inputs']
        settings['gradient minibatch size']   = gradient_data.minibatch_size
        settings['validation minibatch size'] = validation_data.minibatch_size

        #---------------------------------------------------------------------------------
        # Other settings
        #---------------------------------------------------------------------------------

        settings['dt'] = '{} ms'.format(self.p['dt'])
        if np.isscalar(self.p['tau']):
            settings['tau'] = '{} ms'.format(self.p['tau'])
        else:
            settings['tau'] = 'custom'
        settings['tau_in']            = '{} ms'.format(self.p['tau_in'])
        settings['learning rate']     = '{}'.format(self.p['learning_rate'])
        settings['lambda_Omega']      = '{}'.format(self.p['lambda_Omega'])
        settings['max gradient norm'] = '{}'.format(self.p['max_gradient_norm'])

        #---------------------------------------------------------------------------------
        # A few important Theano settings
        #---------------------------------------------------------------------------------

        settings['(Theano) floatX']   = self.floatX
        settings['(Theano) allow_gc'] = theano.config.allow_gc

        #---------------------------------------------------------------------------------
        # Train!
        #---------------------------------------------------------------------------------

        print_settings(settings)

        sgd = SGD(trainables, inputs, costs, regs, x, z, self.p, save_values,
                  {'Wrec_': Wrec_, 'd_f_hidden': d_f_hidden})
        sgd.train(gradient_data, validation_data, savefile)
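
A minimal NumPy sketch of the discretized leaky-integrator update implemented by the rnn step above; the shapes, the tanh nonlinearity and the random inputs are illustrative assumptions, not values taken from the trainer.

import numpy as np

def leaky_rnn_step(x_prev, u_t, Win, Wrec, brec, alpha, f=np.tanh):
    """One Euler step of x_t = (1-alpha)*x_{t-1} + alpha*(Wrec.r_{t-1} + brec + Win.u_t)."""
    # Illustrative sketch only: shapes and nonlinearity are assumptions.
    r_prev = f(x_prev)                      # firing rates from the previous state
    x_t = ((1 - alpha) * x_prev
           + alpha * (r_prev @ Wrec.T       # recurrent drive
                      + brec                # recurrent bias
                      + u_t @ Win.T))       # external input (noise term omitted)
    return x_t, f(x_t)

rng = np.random.default_rng(0)
Win  = rng.standard_normal((5, 3))          # 5 hidden units, 3 inputs
Wrec = 0.1 * rng.standard_normal((5, 5))
brec = np.zeros(5)
x = np.zeros(5)
for t in range(10):                         # alpha = dt/tau
    x, r = leaky_rnn_step(x, rng.standard_normal(3), Win, Wrec, brec, alpha=0.1)
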
Example no. 58
def max_pool_c01b(z, pool_shape, top_down=None, theano_rng=None):
    """
    .. todo::

        WRITEME properly

    Like max_pool but with all 4-tensors formatted with axes ('c', 0, 1, 'b').
    This is for maximum speed when using cuda convnet.

    Performance notes:
    Stabilizing the softmax is one source of slowness. Here it is stabilized
    with several calls to maximum and sub. It might also be possible to
    stabilize it with T.maximum(-top_down,<cuda convnet max pooling>).
    Don't know if that would be faster or slower.

    Benchmarks show that most of the time is spent in GpuIncSubtensor
    when running on gpu. So it is mostly that which needs a faster
    implementation. One other way to implement this would be with cuda
    convnet convolution, where the convolution stride is equal to the
    pool width, and the thing to multiply with is the hparts stacked
    along the channel axis. This isn't a feasible solution for max_pool
    because of theano convolution's poor support for strides, but for cuda
    convnet it could give a speedup.
    """

    z_name = z.name
    if z_name is None:
        z_name = 'anon_z'

    ch, zr, zc, batch_size = z.shape

    r, c = pool_shape

    zpart = []

    mx = None

    if top_down is None:
        t = 0.
    else:
        t = -top_down
        t.name = 'neg_top_down'

    for i in xrange(r):
        zpart.append([])
        for j in xrange(c):
            cur_part = z[:, i:zr:r, j:zc:c, :]
            if z_name is not None:
                cur_part.name = z_name + '[%d, %d]' % (i, j)
            zpart[i].append(cur_part)
            if mx is None:
                mx = T.maximum(t, cur_part)
                if cur_part.name is not None:
                    mx.name = 'max(-top_down,' + cur_part.name + ')'
            else:
                mx_name = None
                if cur_part.name is not None:
                    mx_name = 'max(' + cur_part.name + ',' + mx.name + ')'
                mx = T.maximum(mx, cur_part)
                mx.name = mx_name
    mx.name = 'local_max(' + z_name + ')'

    pt = []

    for i in xrange(r):
        pt.append([])
        for j in xrange(c):
            z_ij = zpart[i][j]
            safe = z_ij - mx
            safe.name = 'safe_z(%s)' % z_ij.name
            cur_pt = T.exp(safe)
            cur_pt.name = 'pt(%s)' % z_ij.name
            pt[-1].append(cur_pt)

    off_pt = T.exp(t - mx)
    off_pt.name = 'p_tilde_off(%s)' % z_name
    denom = off_pt

    for i in xrange(r):
        for j in xrange(c):
            denom = denom + pt[i][j]
    denom.name = 'denom(%s)' % z_name

    off_prob = off_pt / denom
    p = 1. - off_prob
    p.name = 'p(%s)' % z_name

    hpart = []
    for i in xrange(r):
        hpart.append([pt_ij / denom for pt_ij in pt[i]])

    h = T.alloc(0., ch, zr, zc, batch_size)

    for i in xrange(r):
        for j in xrange(c):
            h.name = 'h_interm'
            h = T.set_subtensor(h[:, i:zr:r, j:zc:c, :], hpart[i][j])

    h.name = 'h(%s)' % z_name

    if theano_rng is None:
        return p, h
    else:
        events = []
        for i in xrange(r):
            for j in xrange(c):
                events.append(hpart[i][j])
        events.append(off_prob)

        events = [event.dimshuffle(0, 1, 2, 3, 'x') for event in events]

        events = tuple(events)

        stacked_events = T.concatenate(events, axis=4)

        ch, rows, cols, batch_size, outcomes = stacked_events.shape
        reshaped_events = stacked_events.reshape(
            (ch * rows * cols * batch_size, outcomes))

        multinomial = theano_rng.multinomial(pvals=reshaped_events,
                                             dtype=p.dtype)

        reshaped_multinomial = multinomial.reshape(
            (ch, rows, cols, batch_size, outcomes))

        h_sample = T.alloc(0., ch, zr, zc, batch_size)

        idx = 0
        for i in xrange(r):
            for j in xrange(c):
                h_sample = T.set_subtensor(
                    h_sample[:, i:zr:r, j:zc:c, :],
                    reshaped_multinomial[:, :, :, :, idx])
                idx += 1

        p_sample = 1 - reshaped_multinomial[:, :, :, :, -1]

        return p, h, p_sample, h_sample
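
As a side note on the docstring's stabilization remark, here is a minimal NumPy sketch of the same softmax-with-off-unit computation for a single pooling region; the function name and the example values are made up for illustration.

import numpy as np

def pooled_softmax(z_pool, top_down=0.0):
    """Stabilized softmax over one pooling region plus an 'off' outcome.

    Mirrors the mx / pt / off_pt / denom computation above on a 1-D pool
    (illustrative sketch, not the c01b-formatted implementation).
    """
    t = -top_down
    mx = max(t, np.max(z_pool))          # shared max keeps the exponentials bounded
    pt = np.exp(z_pool - mx)             # unnormalized detector-unit probabilities
    off_pt = np.exp(t - mx)              # unnormalized 'everything off' probability
    denom = off_pt + pt.sum()
    h = pt / denom                       # detector-layer probabilities
    p = 1.0 - off_pt / denom             # pooling-unit probability
    return p, h

p, h = pooled_softmax(np.array([1.0, 3.0, 2.0, 0.5]))
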
Example no. 59
    def __init__(self, We, params):

        num_inputs = We.shape[1]
        lstm_layers_num = 1
        self.eta = params.eta
        self.num_labels = params.num_labels
        self.en_hidden_size = params.en_hidden_size
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = params.lstm_layers_num
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs = tensor.imatrix()
        decoderInputs, decoderTarget = tensor.imatrices(2)
        encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

        self.lookuptable = theano.shared(We)

        #### the last one is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(
                self.de_hidden_size + 2 * self.en_hidden_size,
                self.num_labels),
            borrow=True)
        self.linear_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)

        #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow = True)

        #self.hidden_bias = theano.shared(
        #        name="Hidden to Bias",
        #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
        #        borrow=True
        #        )

        #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]    #concatenate
        self.params += [self.linear, self.linear_bias, self.de_lookuptable
                        ]  #the initial hidden state of decoder lstm is zeros
        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1], num_inputs))
        for _ in range(self.lstm_layers_num):

            enclstm_f = LSTM(num_inputs, self.en_hidden_size)
            enclstm_b = LSTM(num_inputs, self.en_hidden_size, True)
            self.encoder_lstm_layers.append(enclstm_f)  #append
            self.encoder_lstm_layers.append(enclstm_b)  #append
            self.params += enclstm_f.params + enclstm_b.params  #concatenate

            hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
            hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

            hs = tensor.concatenate([hs_f, hs_b], axis=2)
            Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
            hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
            Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
            #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
            #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
            self.hos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            self.Cos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            state_below = hs

        Encoder = state_below

        ei, di, dt = tensor.imatrices(3)  # placeholders
        em, dm, tf, di0 = tensor.fmatrices(4)

        self.encoder_function = theano.function(inputs=[ei, em],
                                                outputs=Encoder,
                                                givens={
                                                    encoderInputs: ei,
                                                    encoderMask: em
                                                })

        #####################################################
        #####################################################
        state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

        ##### Here we include the representation from the decoder
        decoder_lstm_outputs = tensor.concatenate([state_below, Encoder],
                                                  axis=2)

        linear_outputs = tensor.dot(decoder_lstm_outputs,
                                    self.linear) + self.linear_bias[None,
                                                                    None, :]
        softmax_outputs, _ = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]),
                                        y])

        costs, _ = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        updates = lasagne.updates.adam(loss, self.params, self.eta)
        #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

        ###################################################
        #### using the ground truth when training
        ##################################################
        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, softmax_outputs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })

        #########################################################################
        ### For schedule sampling
        #########################################################################

        ###### always use the previous prediction as the next input
        def _step2(ctx_, state_, hs_, Cs_):
            ### ctx_: b x h
            ### state_ : b x h
            ### hs_ : 1 x b x h    the first dimension is the number of the decoder layers
            ### Cs_ : 1 x b x h    the first dimension is the number of the decoder layers

            hs, Cs = [], []
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                (tensor.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, ctx_.shape[0], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)
            state_below0 = state_below0.reshape(
                (ctx_.shape[0], self.de_hidden_size))
            state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
            newpred = tensor.dot(state_below0,
                                 self.linear) + self.linear_bias[None, :]
            state_below = tensor.nnet.softmax(newpred)

            ##### the begin/start symbol probability is 0
            extra_p = tensor.zeros_like(hs[:, :, 0])
            state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs

        ctx_0, state_0 = tensor.fmatrices(2)
        hs_0 = tensor.ftensor3()
        Cs_0 = tensor.ftensor3()

        state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
        self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                      [state_below_tmp, hs_tmp, Cs_tmp],
                                      name='f_next')

        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[Encoder],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=encoderInputs.shape[0])

        train_predict = train_outputs[0]
        train_costs, _ = theano.scan(
            fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

        train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        ##from adam import adam
        ##train_updates = adam(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
        #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
        from momentum import momentum
        train_updates = momentum(train_loss,
                                 self.params,
                                 params.eta,
                                 momentum=0.9)

        self._train2 = theano.function(
            inputs=[ei, em, di0, dm, dt],
            outputs=[train_loss, train_predict],
            updates=train_updates,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs0: di0,
                decoderMask: dm,
                decoderTarget: dt
            }
            #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
        )

        listof_token_idx = train_predict.argmax(axis=-1)
        self._utter = theano.function(inputs=[ei, em, di0],
                                      outputs=listof_token_idx,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs0: di0
                                      })
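
The scheduled-sampling step _step2 above always feeds the decoder its own previous prediction. The following stripped-down NumPy sketch shows that feedback loop with a toy single-layer recurrence; all names, shapes and weights are illustrative assumptions rather than parts of the class above.

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def greedy_decode(ctx, embed, W_step, W_out, b_out, n_steps, start_id=0):
    """Feed the argmax of the previous output back in as the next input (toy sketch)."""
    batch = ctx.shape[0]
    token = np.full(batch, start_id)                  # begin with a start symbol
    h = np.zeros((batch, W_step.shape[1]))
    outputs = []
    for _ in range(n_steps):
        x = embed[token]                              # embed the previous prediction
        h = np.tanh(x @ W_step + h)                   # toy recurrent update
        logits = np.concatenate([ctx, h], axis=1) @ W_out + b_out
        token = softmax(logits).argmax(axis=-1)       # next input = own prediction
        outputs.append(token)
    return np.stack(outputs)                          # (n_steps, batch) token ids

rng = np.random.default_rng(0)
V, d_e, d_h, d_ctx, batch = 6, 4, 5, 3, 2
embed  = rng.standard_normal((V, d_e))
W_step = rng.standard_normal((d_e, d_h))
W_out  = rng.standard_normal((d_ctx + d_h, V))
b_out  = np.zeros(V)
tokens = greedy_decode(rng.standard_normal((batch, d_ctx)), embed, W_step, W_out,
                       b_out, n_steps=5)
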
Example no. 60
def gru_cond_layer(tparams, state_below, options, prefix='gru',
                   mask=None, context=None, one_step=False,
                   init_memory=None, init_state=None, alpha_past=None,
                   context_mask=None,
                   **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    dim = tparams[_p(prefix, 'Wcx')].shape[1]
    dimctx = tparams[_p(prefix, 'Wcx')].shape[0]
    pad = (tparams[_p(prefix, 'conv_Q')].shape[2]-1)//2

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    # projected context
    assert context.ndim == 3, \
        'Context must be 3-d: #annotation x #sample x dim'

    if alpha_past is None:
        alpha_past = tensor.alloc(0., n_samples, context.shape[0])

    pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\
        tparams[_p(prefix, 'b_att')]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # projected x
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\
        tparams[_p(prefix, 'bx')]
    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\
        tparams[_p(prefix, 'b')]
    state_belowyg = tensor.dot(state_below, tparams[_p(prefix, 'Wyg')]) +\
        tparams[_p(prefix, 'byg')]

    # state_below_ : x_ (2*dim, gate pre-activations) ; state_belowx : xx_ (dim, candidate pre-activation) ; both represent projections of E*y

    def _step_slice(m_, x_, xx_, yg, h_, ctx_, alpha_, alpha_past_, beta, pctx_, cc_,
                    U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl, conv_Q, conv_Uf, conv_b, 
                    Whg, bhg, Umg, W_m_att, U_when_att, c_when_att):
        preact1 = tensor.dot(h_, U)
        preact1 += x_
        preact1 = tensor.nnet.sigmoid(preact1)

        r1 = _slice(preact1, 0, dim) # reset gate
        u1 = _slice(preact1, 1, dim) # update gate

        preactx1 = tensor.dot(h_, Ux)
        preactx1 *= r1
        preactx1 += xx_

        h1 = tensor.tanh(preactx1)

        h1 = u1 * h_ + (1. - u1) * h1
        h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_
        
        g_m = tensor.dot(h_, Whg) + bhg
        g_m += yg
        g_m = tensor.nnet.sigmoid(g_m)
        mt = tensor.dot(h1, Umg)
        mt = tensor.tanh(mt)
        mt *= g_m
        # attention
        pstate_ = tensor.dot(h1, W_comb_att)

        # coverage vector
        cover_F = theano.tensor.nnet.conv2d(alpha_past_[:,None,:,None],conv_Q,border_mode='half') # batch x dim x SeqL x 1
        cover_F = cover_F.dimshuffle(1,2,0,3) # dim x SeqL x batch x 1
        cover_F = cover_F.reshape([cover_F.shape[0],cover_F.shape[1],cover_F.shape[2]])
        assert cover_F.ndim == 3, \
            'Output of conv must be 3-d: #dim x SeqL x batch'
        #cover_F = cover_F[:,pad:-pad,:]
        cover_F = cover_F.dimshuffle(1, 2, 0)
        # cover_F must be SeqL x batch x dimctx
        cover_vector = tensor.dot(cover_F, conv_Uf) + conv_b
        # cover_vector = cover_vector * context_mask[:,:,None]

        pctx__ = pctx_ + pstate_[None, :, :] + cover_vector
        #pctx__ += xc_
        pctx__ = tensor.tanh(pctx__)
        alpha = tensor.dot(pctx__, U_att)+c_tt
        # compute alpha_when
        
        pctx_when = tensor.dot(mt, W_m_att)
        pctx_when += pstate_
        pctx_when = tensor.tanh(pctx_when)
        alpha_when = tensor.dot(pctx_when, U_when_att)+c_when_att # batch * 1
        
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) # SeqL * batch
        alpha = tensor.exp(alpha)
        alpha_when = tensor.exp(alpha_when)
        if context_mask:
            alpha = alpha * context_mask
            alpha_mean = alpha.sum(0, keepdims=True) / context_mask.sum(0, keepdims=True)
        else:
            alpha_mean = alpha.mean(0, keepdims=True)
        alpha_when = concatenate([alpha_mean, alpha_when.T], axis=0) # (SeqL+1)*batch
        alpha = alpha / alpha.sum(0, keepdims=True)
        alpha_when = alpha_when / alpha_when.sum(0, keepdims=True)
        beta = alpha_when[-1, :]
        alpha_past = alpha_past_ + alpha.T
        ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context
        ctx_ = beta[:, None] * mt + (1. - beta)[:, None] * ctx_

        preact2 = tensor.dot(h1, U_nl)+b_nl
        preact2 += tensor.dot(ctx_, Wc)
        preact2 = tensor.nnet.sigmoid(preact2)

        r2 = _slice(preact2, 0, dim)
        u2 = _slice(preact2, 1, dim)

        preactx2 = tensor.dot(h1, Ux_nl)+bx_nl
        preactx2 *= r2
        preactx2 += tensor.dot(ctx_, Wcx)

        h2 = tensor.tanh(preactx2)

        h2 = u2 * h1 + (1. - u2) * h2
        h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1

        return h2, ctx_, alpha.T, alpha_past, beta  # pstate_, preact, preactx, r, u

    seqs = [mask, state_below_, state_belowx, state_belowyg]
    #seqs = [mask, state_below_, state_belowx, state_belowc]
    _step = _step_slice

    shared_vars = [tparams[_p(prefix, 'U')],
                   tparams[_p(prefix, 'Wc')],
                   tparams[_p(prefix, 'W_comb_att')],
                   tparams[_p(prefix, 'U_att')],
                   tparams[_p(prefix, 'c_tt')],
                   tparams[_p(prefix, 'Ux')],
                   tparams[_p(prefix, 'Wcx')],
                   tparams[_p(prefix, 'U_nl')],
                   tparams[_p(prefix, 'Ux_nl')],
                   tparams[_p(prefix, 'b_nl')],
                   tparams[_p(prefix, 'bx_nl')],
                   tparams[_p(prefix, 'conv_Q')],
                   tparams[_p(prefix, 'conv_Uf')],
                   tparams[_p(prefix, 'conv_b')],
                   tparams[_p(prefix, 'Whg')],
                   tparams[_p(prefix, 'bhg')],
                   tparams[_p(prefix, 'Umg')],
                   tparams[_p(prefix, 'W_m_att')],
                   tparams[_p(prefix, 'U_when_att')],
                   tparams[_p(prefix, 'c_when_att')]]

    if one_step:
        rval = _step(*(seqs + [init_state, None, None, alpha_past, None, pctx_, context] +
                       shared_vars))
    else:
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=[init_state,
                                                  tensor.alloc(0., n_samples,
                                                               context.shape[2]),
                                                  tensor.alloc(0., n_samples,
                                                               context.shape[0]),
                                                  tensor.alloc(0., n_samples,
                                                               context.shape[0]),
                                                  tensor.alloc(0., n_samples,)],
                                    non_sequences=[pctx_, context]+shared_vars,
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=profile,
                                    strict=True)
    return rval
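
The alpha_past bookkeeping above implements coverage attention: past attention weights are accumulated and fed back into the attention energies so the model is discouraged from re-attending to positions it has already covered. The original layer derives the coverage feature from a 1-D convolution over alpha_past; the NumPy sketch below substitutes a simple per-position linear feature, so it illustrates the idea rather than reproducing this layer.

import numpy as np

def coverage_attention_step(query, annotations, alpha_past, W_a, U_a, w_cov, v_a):
    """One attention step with a coverage feature (illustrative sketch).

    annotations: (L, d) encoder states; alpha_past: (L,) accumulated weights.
    """
    energies = np.tanh(annotations @ U_a                     # projected annotations
                       + query @ W_a                         # projected decoder state
                       + np.outer(alpha_past, w_cov)) @ v_a  # coverage feature
    alpha = np.exp(energies - energies.max())
    alpha /= alpha.sum()                                     # attention weights over L positions
    context = alpha @ annotations                            # weighted sum of annotations
    return context, alpha, alpha_past + alpha                # accumulate coverage

rng = np.random.default_rng(0)
L, d, d_q, d_att = 7, 4, 3, 5
annotations = rng.standard_normal((L, d))
W_a, U_a = rng.standard_normal((d_q, d_att)), rng.standard_normal((d, d_att))
w_cov, v_a = rng.standard_normal(d_att), rng.standard_normal(d_att)
alpha_past = np.zeros(L)
for _ in range(3):
    ctx, alpha, alpha_past = coverage_attention_step(
        rng.standard_normal(d_q), annotations, alpha_past, W_a, U_a, w_cov, v_a)
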