Example #1
def mlp_layer(tparams, state_below, options, prefix='predictor'):
    layer_num = len(options['dims'])
    for i in range(layer_num - 1):
        if i == 0:
            output = tensor.dot(state_below, tparams[_p(prefix, i)])
            output = tanh(output)
        elif i == layer_num - 2:
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = rectifier(output)
        else:
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = tanh(output)
    return output
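The layer above chains dot products, applying tanh to the hidden layers and a rectifier only to the last one. A rough, framework-free sketch of that same pattern follows; the weight list and dimensions are made up for illustration and are not taken from the original options dict.

import numpy as np

def mlp_forward(x, weights):
    """Chain dense layers: tanh on hidden layers, ReLU on the last one."""
    out = x
    last = len(weights) - 1
    for i, W in enumerate(weights):
        out = out @ W
        out = np.maximum(out, 0.0) if i == last else np.tanh(out)
    return out

# toy usage: dims 5 -> 8 -> 3
rng = np.random.default_rng(0)
ws = [rng.standard_normal((5, 8)), rng.standard_normal((8, 3))]
print(mlp_forward(rng.standard_normal((4, 5)), ws).shape)  # (4, 3)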
    def _step(m_, h_, c_, a_, ct_, pctx_, dp_=None, dp_att_=None):
        if _p(prefix, 'Wct_att') in tparams:
            pstate_ = tensor.dot(h_, tparams[_p(
                prefix, 'Wd_att')]) + tensor.dot(
                    ct_, tparams[_p(prefix, 'Wct_att')])
        else:
            pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ += pstate_[:, None, :]
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(
            pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(prefix, 'c_att')]
        alpha_shp = alpha.shape
        alpha = tensor.nnet.softmax(alpha.reshape(
            (alpha_shp[0], alpha_shp[1])))
        ctx_ = (new_ctx * alpha[:, :, None]).sum(1)  # current context
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) + \
                tensor.dot(ctx_, tparams[_p(prefix, 'Wc')]) + \
                tparams[_p(prefix,'b')]

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))

        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, ctx_, i, f, o]
        return rval
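Note that new_ctx in the step above is captured from the enclosing layer rather than passed as an argument. The attention read itself (project the hidden state, add it to the projected context, squash, score, softmax over annotation locations, weighted sum) can be sketched in plain numpy as follows; names and shapes here are illustrative, not the original Theano graph.

import numpy as np

def soft_attention(h, ctx, pctx, Wd_att, U_att, c_att):
    """h: (n, dim), ctx: (n, loc, ctx_dim), pctx: (n, loc, att_dim) precomputed."""
    pstate = h @ Wd_att                         # project the hidden state
    e = np.tanh(pctx + pstate[:, None, :])      # add it to the projected context
    score = e @ U_att + c_att                   # (n, loc) unnormalised energies
    score = score - score.max(axis=1, keepdims=True)   # for numerical stability
    alpha = np.exp(score) / np.exp(score).sum(axis=1, keepdims=True)
    return (ctx * alpha[:, :, None]).sum(axis=1), alpha  # weighted context, weights

# toy shapes: 2 samples, 5 annotation locations
n, loc, ctx_dim, dim, att_dim = 2, 5, 7, 4, 6
rng = np.random.default_rng(1)
weighted, alpha = soft_attention(rng.standard_normal((n, dim)),
                                 rng.standard_normal((n, loc, ctx_dim)),
                                 rng.standard_normal((n, loc, att_dim)),
                                 rng.standard_normal((dim, att_dim)),
                                 rng.standard_normal(att_dim), 0.0)
print(weighted.shape, alpha.sum(axis=1))  # (2, 7) [1. 1.]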
    def forward(self):
        temp = np.vstack((np.ones((1, self._input.shape[1])), self._input))
        self._forward_cache_acted = [temp]
        self._forward_cache_raw = [temp]
        if not self._constructed:
            raise RuntimeError("use the build method before forwarding.")
        times = len(self._layers) - 1
        for i in range(times):
            # temp = np.vstack((np.ones((1, self._input.shape[1])), temp))
            temp = np.dot(self._weights[i], temp)
            self._forward_cache_raw.append(temp)
            if not self._activations[i]:
                pass
            elif self._activations[i].lower() == 'sigmoid':
                temp = util.sigmoid(temp)
            elif self._activations[i].lower() == 'tanh':
                temp = util.tanh(temp)
            elif self._activations[i].lower() == 'relu':
                temp = util.relu(temp)
            else:
                raise ValueError(
                    "Activation function should be None, 'sigmoid', 'tanh' or 'relu'."
                )
            self._forward_cache_acted.append(temp)

        self._predictions = temp
        return temp
Example #4
    def forward(self, X):
        # Z = relu(X.dot(self.W1) + self.b1)
        Z = tanh(X.dot(self.W1) + self.b1)
        # print("Z.shape" + str(Z.shape))
        # print("self.W2.shape" + str(self.W2.shape))
        # print("self.b2.shape" + str(self.b2.shape))
        ret = sigmoid(Z.dot(self.W2) + self.b2)
        # print("ret.shape" + str(np.array(ret).shape))
        return ret, Z
Example #5
    def forward(self, x_t):
        self.t += 1

        t = self.t
        h = self.h[t-1]

        self.input_gate[t] = sigmoid(np.dot(self.W_hi, h) + np.dot(self.W_xi, x_t) + self.b_i)
        self.forget_gate[t] = sigmoid(np.dot(self.W_hf, h) + np.dot(self.W_xf, x_t) + self.b_f)
        self.output_gate[t] = sigmoid(np.dot(self.W_ho, h) + np.dot(self.W_xo, x_t) + self.b_o)
        self.cell_update[t] = tanh(np.dot(self.W_hj, h) + np.dot(self.W_xj, x_t) + self.b_j)

        self.c[t] = self.input_gate[t] * self.cell_update[t] + self.forget_gate[t] * self.c[t-1]
        self.ct[t] = tanh(self.c[t])
        self.h[t] = self.output_gate[t] * self.ct[t]

        self.x[t] = x_t

        return self.h[t]
Example #6
    def forward_pass(self, inputs):

        # declare variables used in the forward pass
        self.inputs = inputs
        self.n_inp = len(inputs)
        self.vr = []
        self.vz = []
        self.v_h = []
        self.vo = []
        self.r = []
        self.z = []
        self._h = []
        self.h = {}
        self.o = []
        self.h[-1] = np.zeros((self.h_size, 1))

        # performing the recurrence over the input sequence
        for i in range(self.n_inp):

            # calculating reset gate value
            # self.vr.append(np.dot(self.w['ur'],inputs[i]) + np.dot(self.w['wr'], self.h[i-1]) + self.b['r'])
            # self.r.append(sigmoid(self.vr[i]))
            self.r.append(
                sigmoid(
                    np.dot(self.w['ur'], inputs[i]) +
                    np.dot(self.w['wr'], self.h[i - 1]) + self.b['r']))

            # calculating update gate value
            # self.vz.append(np.dot(self.w['uz'],inputs[i]) + np.dot(self.w['wz'], self.h[i-1])  + self.b['z'])
            # self.z.append(sigmoid(self.vz[i]))
            self.z.append(
                sigmoid(
                    np.dot(self.w['uz'], inputs[i]) +
                    np.dot(self.w['wz'], self.h[i - 1]) + self.b['z']))

            # applying reset gate value
            # self.v_h.append(np.dot(self.w['u_h'], inputs[i]) + np.dot(self.w['w_h'], np.multiply(self.h[i - 1], self.r[i])) +  + self.b['_h'])
            # self._h.append(tanh(self.v_h[i]))
            self._h.append(
                tanh(
                    np.dot(self.w['u_h'], inputs[i]) +
                    np.dot(self.w['w_h'], np.multiply(self.h[i - 1], self.r[i])) +
                    self.b['_h']))

            # applying update gate value
            self.h[i] = np.multiply(self.z[i], self.h[i - 1]) + np.multiply(
                1 - self.z[i], self._h[i])

            # calculating output
            # self.vo.append(np.dot(self.w['wo'], self.h[i]) + self.b['o'])
            # self.o.append(softmax(self.vo[i]))
            self.o.append(
                softmax(np.dot(self.w['wo'], self.h[i]) + self.b['o']))

        return self.o
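Collapsing that loop body to a single step, the GRU update it implements can be sketched in numpy as below; the weight shapes and dictionary keys are illustrative and only loosely mirror the ones used above.

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def gru_step(x, h_prev, w, b):
    """One GRU step; x: (in_dim, 1), h_prev: (h_dim, 1)."""
    r = sigmoid(w['ur'] @ x + w['wr'] @ h_prev + b['r'])        # reset gate
    z = sigmoid(w['uz'] @ x + w['wz'] @ h_prev + b['z'])        # update gate
    h_tilde = np.tanh(w['u_h'] @ x + w['w_h'] @ (r * h_prev) + b['_h'])
    return z * h_prev + (1.0 - z) * h_tilde                     # interpolate old and candidate state

# toy usage
rng = np.random.default_rng(2)
in_dim, h_dim = 3, 4
w = {k: rng.standard_normal((h_dim, in_dim)) for k in ('ur', 'uz', 'u_h')}
w.update({k: rng.standard_normal((h_dim, h_dim)) for k in ('wr', 'wz', 'w_h')})
b = {k: np.zeros((h_dim, 1)) for k in ('r', 'z', '_h')}
print(gru_step(rng.standard_normal((in_dim, 1)), np.zeros((h_dim, 1)), w, b).shape)  # (4, 1)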
Example #7
def mlp_attention_layer(tparams, state_below, options, prefix='attention'):
    mean_emb = state_below.mean(1)
    attention_vec = tensor.dot(state_below, tparams[_p(
        prefix, 'W_att')]) + tparams[_p(prefix, 'b')]
    attention_vec += tensor.dot(mean_emb, tparams[_p(prefix, 'Wm')])[:,
                                                                     None, :]
    attention_vec = tanh(attention_vec)
    alpha = tensor.dot(attention_vec, tparams[_p(
        prefix, 'U_att')]) + tparams[_p(prefix, 'c_att')]
    alpha_shp = alpha.shape
    alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))
    output = (state_below * alpha[:, :, None]).sum(1)
    return output
Example #8
    def forward(self, x_t):
        self.t += 1
        t = self.t
        h = self.h[t - 1]

        self.input_gate[t] = sigmoid(
            np.dot(self.W_hi, h) + np.dot(self.W_xi, x_t) + self.b_i)
        self.forget_gate[t] = sigmoid(
            np.dot(self.W_hf, h) + np.dot(self.W_xf, x_t) + self.b_f)
        self.output_gate[t] = sigmoid(
            np.dot(self.W_ho, h) + np.dot(self.W_xo, x_t) + self.b_o)
        self.cell_update[t] = tanh(
            np.dot(self.W_hj, h) + np.dot(self.W_xj, x_t) + self.b_j)

        self.c[t] = self.input_gate[t] * self.cell_update[
            t] + self.forget_gate[t] * self.c[t - 1]
        self.ct[t] = tanh(self.c[t])
        self.h[t] = self.output_gate[t] * self.ct[t]

        self.x[t] = x_t

        return self.h[t]
    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in  equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network]
        pstate_ = tensor.dot(h_, tparams[_p(prefix,'Wd_att')]) + tensor.dot(ct_, tparams[_p(prefix, 'Wct_att')])
        pctx_ = pctx_ + pstate_[:,None,:]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix,'U_att')])+tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
        ctx_ = (context * alpha[:,:,None]).sum(1) # current context
        alpha_sample = alpha # you can return something else reasonable here to debug

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:,None] * c + (1. - m_)[:,None] * c_

        h = o * tensor.tanh(c)
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre]+pctx_list
        return rval
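The step above slices one concatenated pre-activation into the four LSTM gates and uses the mask m_ to freeze padded positions, as the "if the mask is 0, just copy the previous state" comment says. A small numpy sketch of that masked update (illustrative shapes and names, not the original symbolic graph):

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def masked_lstm_step(m, preact, h_prev, c_prev, dim):
    """preact: (n, 4*dim) concatenated gate pre-activations; m: (n,) 0/1 mask."""
    i = sigmoid(preact[:, 0 * dim:1 * dim])
    f = sigmoid(preact[:, 1 * dim:2 * dim])
    o = sigmoid(preact[:, 2 * dim:3 * dim])
    g = np.tanh(preact[:, 3 * dim:4 * dim])
    c = f * c_prev + i * g
    c = m[:, None] * c + (1.0 - m)[:, None] * c_prev   # masked-out rows keep the old memory
    h = o * np.tanh(c)
    h = m[:, None] * h + (1.0 - m)[:, None] * h_prev   # and the old hidden state
    return h, c

# rows whose mask is 0 (padding) are left untouched
n, dim = 3, 2
rng = np.random.default_rng(3)
h0, c0 = np.ones((n, dim)), np.ones((n, dim))
h1, c1 = masked_lstm_step(np.array([1., 0., 1.]), rng.standard_normal((n, 4 * dim)), h0, c0, dim)
print(np.allclose(h1[1], h0[1]), np.allclose(c1[1], c0[1]))  # True True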
Example #10
    def predict(self,input):
        L = np.shape(input)[0]
        az = np.zeros((L,self.Nhidden))
        ar = np.zeros((L,self.Nhidden))
        ahhat = np.zeros((L,self.Nhidden))
        ah = np.zeros((L,self.Nhidden))
        
        a1 = tanh(np.dot(input,self.w1) + self.b1)
        x = np.concatenate((np.zeros((self.Nhidden)),a1[0,:]))
        az[0,:] = sigm(np.dot(x,self.wz) + self.bz)
        ar[0,:] = sigm(np.dot(x,self.wr) + self.br)
        ahhat[0,:] = tanh(np.dot(x,self.wh) + self.bh)
        ah[0,:] = az[0,:]*ahhat[0,:]
        
        for i in range(1,L):
            x = np.concatenate((ah[i-1,:],a1[i,:]))
            az[i,:] = sigm(np.dot(x,self.wz) + self.bz)
            ar[i,:] = sigm(np.dot(x,self.wr) + self.br)
            x = np.concatenate((ar[i,:]*ah[i-1,:],a1[i,:]))
            ahhat[i,:] = tanh(np.dot(x,self.wh) + self.bh)
            ah[i,:] = (1-az[i,:])*ah[i-1,:] + az[i,:]*ahhat[i,:]
 
        a2 = tanh(np.dot(ah,self.w2) + self.b2)
        return [a1,az,ar,ahhat,ah,a2]
    def backward(self, learning_rate=0.01):
        # using mse for loss
        self._gradients = []
        mse = np.average(
            np.square(self._forward_cache_acted[-1] - self._labels))
        d_mse_yhat = np.average(2 *
                                (self._forward_cache_acted[-1] - self._labels))
        times = len(self._layers) - 1
        dx = np.ones((self._forward_cache_raw[times - 2].shape[0], 1))
        for i in range(times - 1, -1, -1):
            # in reverse order
            act = self._activations[i]
            d_act = None
            if not act:
                d_act = np.ones(self._forward_cache_raw[i + 1].shape)
                # d_act = 1
            elif act.lower() == 'sigmoid':
                d_act = util.sigmoid(self._forward_cache_raw[i + 1]) * (
                    1 - util.sigmoid(self._forward_cache_raw[i + 1]))
            elif act.lower() == 'relu':
                d_act = (self._forward_cache_raw[i + 1] > 0).astype('float32')
            elif act.lower() == 'tanh':
                d_act = 1 - np.square(util.tanh(
                    self._forward_cache_raw[i + 1]))
            if i != times - 1:
                dw = np.dot(
                    dx * d_act,
                    self._forward_cache_raw[i].T) / self._labels.shape[1]
            else:
                dw = np.dot(
                    d_act,
                    self._forward_cache_raw[i].T) / self._labels.shape[1]

            dx = np.dot(d_act.T, self._weights[i]).T
            self._gradients.insert(0, dw * d_mse_yhat)
        for i in range(times):
            self._weights[
                i] = self._weights[i] - learning_rate * self._gradients[i]
        return mse
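The backward pass above needs the derivative of each activation evaluated at the cached pre-activation. A hedged numpy sketch of the three derivatives it uses (matching the formulas in the code, with a stand-in sigmoid instead of the util module):

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def activation_derivative(name, raw):
    """Derivative of the activation with respect to its pre-activation raw."""
    if name is None:
        return np.ones_like(raw)                      # identity layer
    name = name.lower()
    if name == 'sigmoid':
        s = sigmoid(raw)
        return s * (1.0 - s)                          # sigma'(x) = sigma(x) * (1 - sigma(x))
    if name == 'tanh':
        return 1.0 - np.square(np.tanh(raw))          # tanh'(x) = 1 - tanh(x)^2
    if name == 'relu':
        return (raw > 0).astype(raw.dtype)            # subgradient of max(0, x)
    raise ValueError("Activation should be None, 'sigmoid', 'tanh' or 'relu'.")

print(activation_derivative('tanh', np.array([0.0, 1.0])))  # tanh'(0) = 1.0, tanh'(1) ~ 0.42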
Example #12
def build_model(tparams, options, sampling=True):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost function

    [This function builds the model described in Section 3.1.2 onwards.
    As the convolutional features are precomputed, some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]
    Returns
    -------
    trng: theano random number generator
        Used for dropout, stochastic attention, etc
    use_noise: theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx]: theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas: theano variables
        Attention weights
    alpha_sample: theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention: [see the learning rule in eq (12)]
    cost: theano variable
        negative log likelihood
    opt_outs: OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #annotations x dim
    ctx = tensor.tensor3('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # index into the word embedding matrix, shift it forward in time
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams,
                                       ctx.dimshuffle(1, 0, 2),
                                       options,
                                       prefix='encoder')[0].dimshuffle(
                                           1, 0, 2)
        ctx_rev = get_layer('lstm')[1](
            tparams,
            ctx.dimshuffle(1, 0, 2)[:, ::-1, :],
            options,
            prefix='encoder_rev')[0][:, ::-1, :].dimshuffle(1, 0, 2)
        ctx0 = tensor.concatenate((ctx_fwd, ctx_rev), axis=2)
    else:
        ctx0 = ctx

    # initial state/cell [top right on page 4]
    ctx_mean = ctx0.mean(1)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams,
                                      ctx_mean,
                                      options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')
    init_memory = get_layer('ff')[1](tparams,
                                     ctx_mean,
                                     options,
                                     prefix='ff_memory',
                                     activ='tanh')
    # lstm decoder
    # [equation (1), (2), (3) in section 3.1.2]
    attn_updates = []
    proj, updates = get_layer('lstm_cond')[1](tparams,
                                              emb,
                                              options,
                                              prefix='decoder',
                                              mask=mask,
                                              context=ctx0,
                                              one_step=False,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              trng=trng,
                                              use_noise=use_noise,
                                              sampling=sampling)
    attn_updates += updates
    proj_h = proj[0]
    # optional deep attention
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state = get_layer('ff')[1](tparams,
                                            ctx_mean,
                                            options,
                                            prefix='ff_state_%d' % lidx,
                                            activ='tanh')
            init_memory = get_layer('ff')[1](tparams,
                                             ctx_mean,
                                             options,
                                             prefix='ff_memory_%d' % lidx,
                                             activ='tanh')
            proj, updates = get_layer('lstm_cond')[1](tparams,
                                                      proj_h,
                                                      options,
                                                      prefix='decoder_%d' %
                                                      lidx,
                                                      mask=mask,
                                                      context=ctx0,
                                                      one_step=False,
                                                      init_state=init_state,
                                                      init_memory=init_memory,
                                                      trng=trng,
                                                      use_noise=use_noise,
                                                      sampling=sampling)
            attn_updates += updates
            proj_h = proj[0]

    alphas = proj[2]
    alpha_sample = proj[3]
    ctxs = proj[4]

    # [beta value explained in note 4.2.1 "doubly stochastic attention"]
    if options['selector']:
        sels = proj[5]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs,
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1] +
                              x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    cost = (masked_cost).sum(0)

    # optional outputs
    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels
    if options['attn_type'] == 'stochastic':
        opt_outs['masked_cost'] = masked_cost  # need this for reinforce later
        opt_outs['attn_updates'] = attn_updates  # this is to update the rng

    return trng, use_noise, [x, mask,
                             ctx], alphas, alpha_sample, cost, opt_outs
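The cost block at the end of build_model flattens the softmax output, picks out each target word's probability by offsetting into the flattened array, and masks out padding before summing the negative log likelihood per sample. The same indexing can be reproduced in numpy with toy numbers (not the model's tensors):

import numpy as np

n_timesteps, n_samples, n_words = 2, 3, 5
rng = np.random.default_rng(4)
logits = rng.standard_normal((n_timesteps * n_samples, n_words))
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)

x = rng.integers(0, n_words, size=(n_timesteps, n_samples))    # target word ids
mask = np.array([[1., 1., 1.],                                  # step 0 is real for all samples
                 [1., 0., 0.]])                                 # step 1 is padding for samples 1 and 2

x_flat = x.flatten()
p_flat = probs.flatten()
# probability assigned to the correct word at every (timestep, sample) position
p_correct = p_flat[np.arange(x_flat.shape[0]) * probs.shape[1] + x_flat]
cost = -np.log(p_correct + 1e-8).reshape(n_timesteps, n_samples)
per_sample_nll = (cost * mask).sum(axis=0)                      # masked sum over time
print(per_sample_nll.shape)  # (3,)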
Example #13
    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in  equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network]
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(
            prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
            alpha_sample = alpha  # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(
                temperature_c *
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(
                        tensor.eq(
                            tensor.arange(alpha_shp[1])[None, :],
                            tensor.argmax(alpha, axis=1, keepdims=True)),
                        theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:, :, None]).sum(
                1)  # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(
                tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        return rval
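In the stochastic branch above, the argmax fallback builds a one-hot attention sample by comparing an index range against each row's argmax. That trick in plain numpy (illustrative only):

import numpy as np

def one_hot_argmax(alpha):
    """Turn each row of attention weights into a one-hot vector at its argmax."""
    idx = np.argmax(alpha, axis=1)[:, None]                    # (n, 1)
    return (np.arange(alpha.shape[1])[None, :] == idx).astype(alpha.dtype)

alpha = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]])
print(one_hot_argmax(alpha))
# [[0. 1. 0.]
#  [1. 0. 0.]]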
Example #14
def lstm_cond_layer(tparams,
                    state_below,
                    options,
                    prefix='lstm',
                    mask=None,
                    context=None,
                    one_step=False,
                    init_memory=None,
                    init_state=None,
                    trng=None,
                    use_noise=None,
                    sampling=True,
                    argmax=False,
                    **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_memory, 'previous memory must be provided'
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # infer lstm dimension
    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    # projected context
    pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + tparams[_p(
        prefix, 'b_att')]
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            pctx_ = tensor.dot(pctx_, tparams[_p(
                prefix, 'W_att_%d' % lidx)]) + tparams[_p(
                    prefix, 'b_att_%d' % lidx)]
            # note to self: this used to be options['n_layers_att'] - 1, so no extra non-linearity if n_layers_att < 3
            if lidx < options['n_layers_att']:
                pctx_ = tanh(pctx_)

    # projected x
    # state_below is timesteps*num samples by d in training (TODO change to notation of paper)
    # this is n * d during sampling
    state_below = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    # additional parameters for stochastic hard attention
    if options['attn_type'] == 'stochastic':
        # temperature for softmax
        temperature = options.get("temperature", 1)
        # [see (Section 4.1): Stochastic "Hard" Attention]
        semi_sampling_p = options.get("semi_sampling_p", 0.5)
        temperature_c = theano.shared(numpy.float32(temperature),
                                      name='temperature_c')
        h_sampling_mask = trng.binomial((1, ),
                                        p=semi_sampling_p,
                                        n=1,
                                        dtype=theano.config.floatX).sum()

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in  equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network]
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(
            prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
            alpha_sample = alpha  # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(
                temperature_c *
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(
                        tensor.eq(
                            tensor.arange(alpha_shp[1])[None, :],
                            tensor.argmax(alpha, axis=1, keepdims=True)),
                        theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:, :, None]).sum(
                1)  # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(
                tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        return rval

    if options['use_dropout_lstm']:
        if options['selector']:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        else:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        dp_shape = state_below.shape
        if one_step:
            dp_mask = tensor.switch(
                use_noise,
                trng.binomial((dp_shape[0], 3 * dim),
                              p=0.5,
                              n=1,
                              dtype=state_below.dtype),
                tensor.alloc(0.5, dp_shape[0], 3 * dim))
        else:
            dp_mask = tensor.switch(
                use_noise,
                trng.binomial((dp_shape[0], dp_shape[1], 3 * dim),
                              p=0.5,
                              n=1,
                              dtype=state_below.dtype),
                tensor.alloc(0.5, dp_shape[0], dp_shape[1], 3 * dim))
    else:
        if options['selector']:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, sel_, pctx_: _step(
                m_, x_, h_, c_, a_, as_, ct_, pctx_)
        else:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, pctx_: _step(
                m_, x_, h_, c_, a_, as_, ct_, pctx_)

    if one_step:
        if options['use_dropout_lstm']:
            if options['selector']:
                rval = _step0(mask, state_below, dp_mask, init_state,
                              init_memory, None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, dp_mask, init_state,
                              init_memory, None, None, None, pctx_)
        else:
            if options['selector']:
                rval = _step0(mask, state_below, init_state, init_memory, None,
                              None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, init_state, init_memory, None,
                              None, None, pctx_)
        return rval
    else:
        seqs = [mask, state_below]
        if options['use_dropout_lstm']:
            seqs += [dp_mask]
        outputs_info = [
            init_state, init_memory,
            tensor.alloc(0., n_samples, pctx_.shape[1]),
            tensor.alloc(0., n_samples, pctx_.shape[1]),
            tensor.alloc(0., n_samples, context.shape[2])
        ]
        if options['selector']:
            outputs_info += [tensor.alloc(0., n_samples)]
        outputs_info += [None, None, None, None, None, None, None
                         ] + [None]  # *options['n_layers_att']
        rval, updates = theano.scan(_step0,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx_],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=False)
        return rval, updates
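The dp_mask built above is a Bernoulli(0.5) mask over the three gated pre-activations while use_noise is on, and the constant 0.5 (the mask's expectation) when it is off. A plain-numpy analogue of that switch, as a sketch only; the real code builds a symbolic Theano expression.

import numpy as np

def gate_dropout_mask(shape, use_noise, p=0.5, rng=None):
    """Bernoulli(p) mask at train time, its expectation p at test time."""
    if use_noise:
        rng = rng or np.random.default_rng()
        return rng.binomial(1, p, size=shape).astype(np.float32)
    return np.full(shape, p, dtype=np.float32)

train_mask = gate_dropout_mask((4, 3 * 5), use_noise=True, rng=np.random.default_rng(5))
test_mask = gate_dropout_mask((4, 3 * 5), use_noise=False)
print(train_mask.min(), train_mask.max(), test_mask[0, 0])  # 0.0 1.0 0.5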
Example #15
    def forwardProp(self,allKids,words_embedded,updateWlab,label,theta,freq):
        (W1,W2,W3,W4,Wlab,b1,b2,b3,blab,WL)=self.getParams(theta)
        sl=np.size(words_embedded,1)
        sentree=rnntree.rnntree(self.d,sl,words_embedded)
        collapsed_sentence = range(sl)
        if updateWlab:
            temp_label=np.zeros(self.cat)
            temp_label[label-1]=1.0
            nodeUnder = np.ones([2*sl-1,1])

            for i in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sentence tree and store in nodeUnder
                kids = allKids[i]
                n1 = nodeUnder[kids[0]]
                n2 = nodeUnder[kids[1]]
                nodeUnder[i] = n1+n2

            cat_size=self.cat
            sentree.catDelta = np.zeros([cat_size, 2*sl-1])
            sentree.catDelta_out = np.zeros([self.d,2*sl-1])

            # classifier on single words
            for i in range(sl):
                sm = softmax(np.dot(Wlab,words_embedded[:,i]) + blab)
                lbl_sm = (1-self.alpha)*(temp_label - sm)
                sentree.nodeScores[i] = 1.0/2.0*(np.dot(lbl_sm,(temp_label- sm)))
                sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm))

            # sm = sigmoid(self.Wlab*words_embedded + self.blab)

            #lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm)
            #sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm))
            #sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm)

            for i in range(sl,2*sl-1):
                kids = allKids[i]

                c1 = sentree.nodeFeatures[:,kids[0]]
                c2 = sentree.nodeFeatures[:,kids[1]]

                # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
                p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + b1)

                # See last paragraph in Section 2.3
                p_norm1 = p/norm(p)

                # Eq. (7) in the paper (for special case of 1d label)
                #sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
                sm=softmax(np.dot(Wlab,p_norm1) + blab)
                beta=0.5
                #lbl_sm = beta * (1.0-self.alpha)*(label - sm)
                lbl_sm = beta * (1.0-self.alpha)*(temp_label - sm)
                #lbl_sm = beta * (1.0-self.alpha) * (temp_label-sm)
                #sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1]
                #J=-(1.0-self.alpha)*np.log(sm[label-1])
                #sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm))
                sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm))
                #J = 1.0/2.0*(np.dot(lbl_sm,(label - sm)))
                J = 1.0/2.0*(np.dot(lbl_sm,(temp_label - sm)))

                sentree.nodeFeatures[:,i] = p_norm1
                sentree.nodeFeatures_unnormalized[:,i] = p
                sentree.nodeScores[i] = J
                sentree.numkids = nodeUnder

            sentree.kids = allKids
        else:
            # Reconstruction Error
            for j in range(sl-1):
                size2=np.size(words_embedded,1)
                c1 = words_embedded[:,0:-1]
                c2 = words_embedded[:,1:]

                freq1 = freq[0:-1]
                freq2 = freq[1:]

                p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + np.reshape(b1,[self.d,1])*([1]*(size2-1)))
                p_norm1 =p/np.sqrt(sum(p**2))

                y1_unnormalized = tanh(np.dot(W3,p_norm1) + np.reshape(b2,[self.d,1])*([1]*(size2-1)))
                y2_unnormalized = tanh(np.dot(W4,p_norm1) + np.reshape(b3,[self.d,1])*([1]*(size2-1)))

                y1 = y1_unnormalized/np.sqrt(sum(y1_unnormalized**2))
                y2 = y2_unnormalized/np.sqrt(sum(y2_unnormalized**2))

                y1c1 = self.alpha*(y1-c1)
                y2c2 = self.alpha*(y2-c2)

                # Eq. (4) in the paper: reconstruction error
                J = 1.0/2.0*sum((y1c1)*(y1-c1) + (y2c2)*(y2-c2))

                # finding the pair with smallest reconstruction error for constructing sentree
                J_min= min(J)
                J_minpos=np.argmin(J)

                sentree.node_y1c1[:,sl+j] = y1c1[:,J_minpos]
                sentree.node_y2c2[:,sl+j] = y2c2[:,J_minpos]
                sentree.nodeDelta_out1[:,sl+j] = np.dot(norm1tanh_prime(y1_unnormalized[:,J_minpos]) , y1c1[:,J_minpos])
                sentree.nodeDelta_out2[:,sl+j] = np.dot(norm1tanh_prime(y2_unnormalized[:,J_minpos]) , y2c2[:,J_minpos])

                words_embedded=np.delete(words_embedded,J_minpos+1,1)
                words_embedded[:,J_minpos]=p_norm1[:,J_minpos]
                sentree.nodeFeatures[:, sl+j] = p_norm1[:,J_minpos]
                sentree.nodeFeatures_unnormalized[:, sl+j]= p[:,J_minpos]
                sentree.nodeScores[sl+j] = J_min
                sentree.pp[collapsed_sentence[J_minpos]] = sl+j
                sentree.pp[collapsed_sentence[J_minpos+1]] = sl+j
                sentree.kids[sl+j,:] = [collapsed_sentence[J_minpos], collapsed_sentence[J_minpos+1]]
                sentree.numkids[sl+j] = sentree.numkids[sentree.kids[sl+j,0]] + sentree.numkids[sentree.kids[sl+j,1]]


                freq=np.delete(freq,J_minpos+1)
                freq[J_minpos] = (sentree.numkids[sentree.kids[sl+j,0]]*freq1[J_minpos] + sentree.numkids[sentree.kids[sl+j,1]]*freq2[J_minpos])/(sentree.numkids[sentree.kids[sl+j,0]]+sentree.numkids[sentree.kids[sl+j,1]])

                collapsed_sentence=np.delete(collapsed_sentence,J_minpos+1)
                collapsed_sentence[J_minpos]=sl+j
        return sentree
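The unsupervised branch above greedily merges the adjacent pair of word vectors with the smallest reconstruction error, overwrites the pair with the new parent vector, and repeats until the sentence collapses to a single node. A stripped-down numpy sketch of that pair-selection loop follows (random toy weights, no tree bookkeeping, and the alpha weighting omitted):

import numpy as np

def greedy_merge(words, W1, W2, W3, W4, b1, b2, b3):
    """words: (d, sl). Repeatedly merge the adjacent pair with the lowest
    reconstruction error until a single vector remains."""
    words = words.copy()
    while words.shape[1] > 1:
        c1, c2 = words[:, :-1], words[:, 1:]
        p = np.tanh(W1 @ c1 + W2 @ c2 + b1)                  # candidate parents, Eq. (2)
        p = p / np.sqrt((p ** 2).sum(axis=0))                 # normalise each column
        y1 = np.tanh(W3 @ p + b2)                             # reconstructions of c1 and c2
        y2 = np.tanh(W4 @ p + b3)
        y1 = y1 / np.sqrt((y1 ** 2).sum(axis=0))
        y2 = y2 / np.sqrt((y2 ** 2).sum(axis=0))
        err = 0.5 * (((y1 - c1) ** 2).sum(axis=0) + ((y2 - c2) ** 2).sum(axis=0))  # Eq. (4)
        j = int(np.argmin(err))                               # best adjacent pair
        words = np.delete(words, j + 1, axis=1)
        words[:, j] = p[:, j]                                 # replace the pair by its parent
    return words[:, 0]

d, sl = 4, 5
rng = np.random.default_rng(6)
Ws = [rng.standard_normal((d, d)) for _ in range(4)]
bs = [np.zeros((d, 1)) for _ in range(3)]
print(greedy_merge(rng.standard_normal((d, sl)), *Ws, *bs).shape)  # (4,)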
Example #16
    def supAnalyser(self,X,freq,vocabulary,top=20):
        result_score=[]
        result_word=[]
        for i in range(self.cat):
            result_score.append([0.0]*top)
            result_word.append(['']*top)

        num_sent=np.size(X,0)
        allKids=[[]]*num_sent

        for i in range(num_sent):
            x=X[i]
            sl=len(x)
            words_embedded=self.WL[:,x]
            unsup_tree = self.forwardProp([],words_embedded,False,None,self.theta,freq)
            allKids[i]=unsup_tree.kids

            sup_tree=rnntree.rnntree(self.d,sl,words_embedded)

            nodeUnder = np.ones([2*sl-1,1])

            for j in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sentence tree and store in nodeUnder
                kids = allKids[i][j]
                n1 = nodeUnder[kids[0]]
                n2 = nodeUnder[kids[1]]
                nodeUnder[j] = n1+n2

            #sentree.catDelta = np.zeros([cat_size, 2*sl-1])
            #sentree.catDelta_out = np.zeros([self.d,2*sl-1])

            for j in range(2*sl-1):
                kids = allKids[i][j]

                c1 = sup_tree.nodeFeatures[:,kids[0]]
                c2 = sup_tree.nodeFeatures[:,kids[1]]

                # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
                p = tanh(np.dot(self.W1,c1) + np.dot(self.W2,c2) + self.b1)

                # See last paragraph in Section 2.3
                p_norm1 = p/norm(p)

                # Eq. (7) in the paper (for special case of 1d label)
                #sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
                sm=softmax(np.dot(self.Wlab,p_norm1) + self.blab)
                #max_score=max(sm)
                for ind in range(self.cat):
                    max_score=sm[ind]
                    #ind=list(sm).index(max_score)
                    min_score=min(result_score[ind])
                    if max_score>min_score:
                        min_ind=result_score[ind].index(min_score)
                        result_score[ind][min_ind]=max_score
                        if j<sl:
                            result_word[ind][min_ind]=vocabulary[x[j]]
                        else:
                            stk=[]
                            stk.extend(list(kids))
                            stk.reverse()
                            words=[]
                            while len(stk)!=0:
                                current=stk.pop()
                                if current<sl:
                                    words.append(vocabulary[x[current]])
                                else:
                                    toExtend=[]
                                    toExtend.extend(list(allKids[i][current]))
                                    toExtend.reverse()
                                    stk.extend(toExtend)

                            result_word[ind][min_ind]=' '.join(words)
        return (result_score,result_word)
Example #17
def lstm_cond_layer(tparams, state_below, options, prefix='lstm',
                    mask=None, context=None, one_step=False,
                    init_memory=None, init_state=None,
                    trng=None, use_noise=None, sampling=True,
                    argmax=False, **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_memory, 'previous memory must be provided'
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # infer lstm dimension
    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    # projected context
    pctx_ = tensor.dot(context, tparams[_p(prefix,'Wc_att')]) + tparams[_p(prefix, 'b_att')]
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            pctx_ = tensor.dot(pctx_, tparams[_p(prefix,'W_att_%d'%lidx)])+tparams[_p(prefix, 'b_att_%d'%lidx)]
            # note to self: this used to be options['n_layers_att'] - 1, so no extra non-linearity if n_layers_att < 3
            if lidx < options['n_layers_att']:
                pctx_ = tanh(pctx_)

    # projected x
    # state_below is timesteps*num samples by d in training (TODO change to notation of paper)
    # this is n * d during sampling
    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]

    # additional parameters for stochastic hard attention
    if options['attn_type'] == 'stochastic':
        # temperature for softmax
        temperature = options.get("temperature", 1)
        # [see (Section 4.1): Stochastic "Hard" Attention]
        semi_sampling_p = options.get("semi_sampling_p", 0.5)
        temperature_c = theano.shared(numpy.float32(temperature), name='temperature_c')
        h_sampling_mask = trng.binomial((1,), p=semi_sampling_p, n=1, dtype=theano.config.floatX).sum()

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in  equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network]
        pstate_ = tensor.dot(h_, tparams[_p(prefix,'Wd_att')])
        pctx_ = pctx_ + pstate_[:,None,:]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix,'U_att')])+tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            ctx_ = (context * alpha[:,:,None]).sum(1) # current context
            alpha_sample = alpha # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(temperature_c*alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(tensor.eq(tensor.arange(alpha_shp[1])[None,:],
                                               tensor.argmax(alpha,axis=1,keepdims=True)), theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:,:,None]).sum(1) # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')])+tparams[_p(prefix,'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:,None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:,None] * c + (1. - m_)[:,None] * c_

        h = o * tensor.tanh(c)
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre]+pctx_list
        return rval

    if options['use_dropout_lstm']:
        if options['selector']:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        else:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        dp_shape = state_below.shape
        if one_step:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], 3*dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], 3 * dim))
        else:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], dp_shape[1], 3*dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], dp_shape[1], 3*dim))
    else:
        if options['selector']:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, sel_, pctx_: _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)
        else:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, pctx_: _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)

    if one_step:
        if options['use_dropout_lstm']:
            if options['selector']:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory, None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory, None, None, None, pctx_)
        else:
            if options['selector']:
                rval = _step0(mask, state_below, init_state, init_memory, None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, init_state, init_memory, None, None, None, pctx_)
        return rval
    else:
        seqs = [mask, state_below]
        if options['use_dropout_lstm']:
            seqs += [dp_mask]
        outputs_info = [init_state,
                        init_memory,
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, context.shape[2])]
        if options['selector']:
            outputs_info += [tensor.alloc(0., n_samples)]
        outputs_info += [None,
                         None,
                         None,
                         None,
                         None,
                         None,
                         None] + [None] # *options['n_layers_att']
        rval, updates = theano.scan(_step0,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx_],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps, profile=False)
        return rval, updates
    def supAnalyser(self, X, freq, vocabulary, top=20):
        result_score = []
        result_word = []
        for i in range(self.cat):
            result_score.append([0.0] * top)
            result_word.append([''] * top)

        num_sent = np.size(X, 0)
        allKids = [[]] * num_sent

        for i in range(num_sent):
            x = X[i]
            sl = len(x)
            words_embedded = self.WL[:, x]
            unsup_tree = self.forwardProp([], words_embedded, False, None,
                                          self.theta, freq)
            allKids[i] = unsup_tree.kids

            sup_tree = rnntree.rnntree(self.d, sl, words_embedded)

            nodeUnder = np.ones([2 * sl - 1, 1])

            for j in range(
                    sl, 2 * sl - 1
            ):  # calculate n1, n2 and n1+n2 for each node in the sentence tree and store in nodeUnder
                kids = allKids[i][j]
                n1 = nodeUnder[kids[0]]
                n2 = nodeUnder[kids[1]]
                nodeUnder[j] = n1 + n2

            #sentree.catDelta = np.zeros([cat_size, 2*sl-1])
            #sentree.catDelta_out = np.zeros([self.d,2*sl-1])

            for j in range(2 * sl - 1):
                kids = allKids[i][j]

                c1 = sup_tree.nodeFeatures[:, kids[0]]
                c2 = sup_tree.nodeFeatures[:, kids[1]]

                # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
                p = tanh(np.dot(self.W1, c1) + np.dot(self.W2, c2) + self.b1)

                # See last paragraph in Section 2.3
                p_norm1 = p / norm(p)

                # Eq. (7) in the paper (for special case of 1d label)
                #sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
                sm = softmax(np.dot(self.Wlab, p_norm1) + self.blab)
                max_score = max(sm)
                ind = list(sm).index(max_score)
                min_score = min(result_score[ind])
                if max_score > min_score:
                    min_ind = result_score[ind].index(min_score)
                    result_score[ind][min_ind] = max_score
                    if j < sl:
                        result_word[ind][min_ind] = vocabulary[x[j]]
                    else:
                        stk = []
                        stk.extend(list(kids))
                        stk.reverse()
                        words = []
                        while len(stk) != 0:
                            current = stk.pop()
                            if current < sl:
                                words.append(vocabulary[x[current]])
                            else:
                                toExtend = []
                                toExtend.extend(list(allKids[i][current]))
                                toExtend.reverse()
                                stk.extend(toExtend)

                        result_word[ind][min_ind] = ' '.join(words)
        return (result_score, result_word)
Example #19
    def forwardProp(self,allKids,words_embedded,updateWlab,label,theta,freq):
        #allKids stores all the nodes: row i holds node i, and its entries are the indices of node i's children
        (W1,W2,W3,W4,Wlab,b1,b2,b3,blab,WL)=self.getParams(theta)
        #sl is the number of words in the sentence (second dimension of words_embedded)
        sl=np.size(words_embedded,1)
        sentree=rnntree.rnntree(self.d,sl,words_embedded)
        collapsed_sentence = range(sl)
        #compute the classification (sentiment) error
        if updateWlab:
            temp_label=np.zeros(self.cat)
            #label is the current class label; label-1 because Python lists are 0-indexed, so the current label's position is set to 1
            temp_label[label-1]=1.0
            nodeUnder = np.ones([2*sl-1,1])
            #n1, n2 are the numbers of leaf nodes under the two kids
            for i in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sentence tree and store in nodeUnder
                kids = allKids[i]
                n1 = nodeUnder[kids[0]] #left child
                n2 = nodeUnder[kids[1]] #right child
                nodeUnder[i] = n1+n2    #number of leaf nodes under node i

            cat_size=self.cat
            sentree.catDelta = np.zeros([cat_size, 2*sl-1])
            sentree.catDelta_out = np.zeros([self.d,2*sl-1])

            # classifier on single words
            for i in range(sl):
                sm = softmax(np.dot(Wlab,words_embedded[:,i]) + blab)
                #the sentiment-error computation here differs slightly from the paper: it uses y_i - h(x) directly as the error
                lbl_sm = (1-self.alpha)*(temp_label - sm)
                #this appears to compute J
                sentree.nodeScores[i] = 1.0/2.0*(np.dot(lbl_sm,(temp_label- sm)))  #sentree.nodeScores has two parts: indices 0..sl-1 are filled here, sl..2*sl-2 below
                sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm))

            # sm = sigmoid(self.Wlab*words_embedded + self.blab)

            #lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm)
            #sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm))
            #sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm)

            for i in range(sl,2*sl-1):
                #kids holds the two child indices; c1, c2 are their node feature vectors
                kids = allKids[i]

                c1 = sentree.nodeFeatures[:,kids[0]]   #feature (word) vector of the left child
                c2 = sentree.nodeFeatures[:,kids[1]]   #feature (word) vector of the right child

                # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
                p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + b1)

                # See last paragraph in Section 2.3
                p_norm1 = p/norm(p)

                # Eq. (7) in the paper (for special case of 1d label)
                #sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
                sm=softmax(np.dot(Wlab,p_norm1) + blab)
                beta=0.5  #beta does not appear in the paper; it is an extra scaling factor added here
                #lbl_sm = beta * (1.0-self.alpha)*(label - sm)
                lbl_sm = beta * (1.0-self.alpha)*(temp_label - sm)
                #lbl_sm = beta * (1.0-self.alpha) * (temp_label-sm)
                #sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1]
                #J=-(1.0-self.alpha)*np.log(sm[label-1])
                #sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm))
                sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm))
                #J = 1.0/2.0*(np.dot(lbl_sm,(label - sm)))
                J = 1.0/2.0*(np.dot(lbl_sm,(temp_label - sm)))

                sentree.nodeFeatures[:,i] = p_norm1
                sentree.nodeFeatures_unnormalized[:,i] = p
                sentree.nodeScores[i] = J
                sentree.numkids = nodeUnder

            sentree.kids = allKids
        #compute the reconstruction error
        else:
            # Reconstruction Error
            for j in range(sl-1):
                size2=np.size(words_embedded,1)
                c1 = words_embedded[:,0:-1] 
                c2 = words_embedded[:,1:]

                freq1 = freq[0:-1]
                freq2 = freq[1:]

                p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + np.reshape(b1,[self.d,1])*([1]*(size2-1)))
                p_norm1 =p/np.sqrt(sum(p**2))
                #y1, y2 below are the reconstructions of c1, c2 in the paper, decoded from p
                y1_unnormalized = tanh(np.dot(W3,p_norm1) + np.reshape(b2,[self.d,1])*([1]*(size2-1)))
                y2_unnormalized = tanh(np.dot(W4,p_norm1) + np.reshape(b3,[self.d,1])*([1]*(size2-1)))

                y1 = y1_unnormalized/np.sqrt(sum(y1_unnormalized**2))
                y2 = y2_unnormalized/np.sqrt(sum(y2_unnormalized**2))

                y1c1 = self.alpha*(y1-c1)
                y2c2 = self.alpha*(y2-c2)

                # Eq. (4) in the paper: reconstruction error
                #summing (y1-c1)*(y1-c1) over rows gives one scalar per candidate pair
                J = 1.0/2.0*sum((y1c1)*(y1-c1) + (y2c2)*(y2-c2))
                
                #the rest of this loop picks the best pair and updates the tree
                # finding the pair with smallest reconstruction error for constructing sentree
                #J holds one reconstruction error per adjacent pair; min(J) picks the smallest
                J_min= min(J)
                J_minpos=np.argmin(J)
                #store the reconstruction vectors (c1', c2') of the pair with the smallest error in the tree
                sentree.node_y1c1[:,sl+j] = y1c1[:,J_minpos]
                sentree.node_y2c2[:,sl+j] = y2c2[:,J_minpos]
                #deltas used later during backpropagation
                sentree.nodeDelta_out1[:,sl+j] = np.dot(norm1tanh_prime(y1_unnormalized[:,J_minpos]) , y1c1[:,J_minpos])
                sentree.nodeDelta_out2[:,sl+j] = np.dot(norm1tanh_prime(y2_unnormalized[:,J_minpos]) , y2c2[:,J_minpos])

                words_embedded=np.delete(words_embedded,J_minpos+1,1)
                words_embedded[:,J_minpos]=p_norm1[:,J_minpos]
                sentree.nodeFeatures[:, sl+j] = p_norm1[:,J_minpos]
                sentree.nodeFeatures_unnormalized[:, sl+j]= p[:,J_minpos]
                sentree.nodeScores[sl+j] = J_min
                sentree.pp[collapsed_sentence[J_minpos]] = sl+j
                sentree.pp[collapsed_sentence[J_minpos+1]] = sl+j
                sentree.kids[sl+j,:] = [collapsed_sentence[J_minpos], collapsed_sentence[J_minpos+1]]
                sentree.numkids[sl+j] = sentree.numkids[sentree.kids[sl+j,0]] + sentree.numkids[sentree.kids[sl+j,1]]


                freq=np.delete(freq,J_minpos+1)
                freq[J_minpos] = (sentree.numkids[sentree.kids[sl+j,0]]*freq1[J_minpos] + sentree.numkids[sentree.kids[sl+j,1]]*freq2[J_minpos])/(sentree.numkids[sentree.kids[sl+j,0]]+sentree.numkids[sentree.kids[sl+j,1]])

                collapsed_sentence=np.delete(collapsed_sentence,J_minpos+1)
                collapsed_sentence[J_minpos]=sl+j
        return sentree
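
For readers who want to see the reconstruction branch in isolation, here is a minimal NumPy sketch of the greedy pair-merging it performs. The parameter shapes (all weights d x d, biases of length d) and the helper name greedy_merge are assumptions for illustration; the original code additionally records the tree structure, the frequency-weighted averaging, and the deltas needed for backpropagation.

import numpy as np

def greedy_merge(words_embedded, W1, W2, W3, W4, b1, b2, b3, alpha=0.2):
    """Repeatedly merge the adjacent pair with the smallest reconstruction
    error until a single vector remains; return the chosen merge positions."""
    n = words_embedded.shape[1]
    emb = words_embedded.copy()
    order = []
    for _ in range(n - 1):
        c1, c2 = emb[:, :-1], emb[:, 1:]                       # all adjacent pairs at once
        p = np.tanh(W1 @ c1 + W2 @ c2 + b1[:, None])           # candidate parents, Eq. (2)
        p = p / np.sqrt((p ** 2).sum(axis=0))                  # normalize each parent
        y1 = np.tanh(W3 @ p + b2[:, None])                     # reconstruct the children
        y2 = np.tanh(W4 @ p + b3[:, None])
        y1 = y1 / np.sqrt((y1 ** 2).sum(axis=0))
        y2 = y2 / np.sqrt((y2 ** 2).sum(axis=0))
        J = 0.5 * alpha * (((y1 - c1) ** 2).sum(0) + ((y2 - c2) ** 2).sum(0))  # Eq. (4)
        k = int(np.argmin(J))                                  # pair with the smallest error
        order.append(k)
        emb[:, k] = p[:, k]                                    # replace the pair by its parent
        emb = np.delete(emb, k + 1, axis=1)
    return order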
def build_model(tparams, options):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost function

    [This function builds the model described from Section 3.1.2 onwards;
    since the convolutional features are precomputed, some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    Returns
    -------
    trng: theano random number generator
        Used for dropout, etc
    use_noise: theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx, cnn_features]: theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas: theano variables
        Attention weights
    alpha_sample: theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention: [see the learning rule in eq (12)]
    cost: theano variable
        negative log likelihood
    opt_outs: OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    # mask: #samples,
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #visual_words x dim
    if options['with_glove']:
        ctx = tensor.tensor3('ctx', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.matrix('ctx', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]
    # fc7 features: #samples x dim
    cnn_features = tensor.matrix('cnn_feats', dtype='float32')

    # index into the word embedding matrix, shift it forward in time, the first element is zero
    # Time step x S x D
    emb = tparams['Wemb'][x.flatten()].reshape(
        [x.shape[0], x.shape[1], options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # forward-backward lstm encoder
    if options['lstm_encoder']:
        rval, encoder_alphas = get_layer('lstm_cond_nox')[1](tparams,
                                                             options,
                                                             prefix='encoder',
                                                             context=new_ctx)
        ctx0 = rval.dimshuffle(1, 0, 2)
    else:
        ctx0 = new_ctx

    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        lstm_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        lstm_inps = proj_h if lidx > 0 else emb
        init_state = get_layer('ff')[1](tparams,
                                        cnn_features,
                                        options,
                                        prefix=init_state_prefix,
                                        activ='tanh')
        init_memory = get_layer('ff')[1](tparams,
                                         cnn_features,
                                         options,
                                         prefix=init_memory_prefix,
                                         activ='tanh')
        attn_updates = []
        proj, updates = get_layer('lstm_cond')[1](tparams,
                                                  lstm_inps,
                                                  options,
                                                  prefix=lstm_prefix,
                                                  mask=mask,
                                                  context=ctx0,
                                                  one_step=False,
                                                  init_state=init_state,
                                                  init_memory=init_memory,
                                                  trng=trng,
                                                  use_noise=use_noise)
        attn_updates += updates
        proj_h = proj[0]

    alphas = proj[2]
    ctxs = proj[4]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs,
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1] +
                              x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    #align_cost = (-standard_aligns*alphas).sum(2)

    cost = masked_cost.sum(0)

    # optional outputs
    opt_outs = dict()
    if options['lstm_encoder']:
        return trng, use_noise, [x, mask, ctx, cnn_features
                                 ], [alphas, encoder_alphas], cost, opt_outs
    else:
        return trng, use_noise, [x, mask, ctx,
                                 cnn_features], [alphas], cost, opt_outs
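
The flattened-index trick used to pick out each target word's probability (the block just before the return) is easy to misread; a plain NumPy rendition of the same cost computation, with shapes assumed to match the comments above (probs is (n_timesteps * n_samples, vocab), x and mask are (n_timesteps, n_samples)), is:

import numpy as np

def masked_nll(probs, x, mask):
    """Negative log likelihood per sample, ignoring padded positions."""
    vocab = probs.shape[1]
    x_flat = x.flatten()
    p_flat = probs.flatten()
    # row i of `probs` starts at offset i * vocab in the flattened view,
    # so i * vocab + x_flat[i] addresses the probability of target word i
    picked = p_flat[np.arange(x_flat.shape[0]) * vocab + x_flat]
    cost = -np.log(picked + 1e-8).reshape(x.shape)
    return (cost * mask).sum(axis=0)   # sum over time, one value per sample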
def build_sampler(tparams, options, use_noise, trng):
    """ Builds a sampler used for generating from the model
    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise: boolean
        If true, add noise to the sampling
    trng: random number generator
    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next: theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    if options['with_glove']:
        ctx = tensor.matrix('ctx_sampler', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.vector('ctx_sampler', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]
    if options['lstm_encoder']:
        ctx0, _ = get_layer('lstm_cond_nox')[1](tparams,
                                                options,
                                                prefix='encoder',
                                                context=new_ctx)
    else:
        ctx0 = new_ctx
    # initial state/cell
    cnn_features = tensor.vector('x_feats', dtype='float32')
    init_state, init_memory = [], []
    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        init_state.append(
            get_layer('ff')[1](tparams,
                               cnn_features,
                               options,
                               prefix=init_state_prefix,
                               activ='tanh'))
        init_memory.append(
            get_layer('ff')[1](tparams,
                               cnn_features,
                               options,
                               prefix=init_memory_prefix,
                               activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx, cnn_features],
                             [ctx0] + init_state + init_memory,
                             name='f_init',
                             profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = []
    init_memory = []
    for lidx in range(options['n_layers_lstm']):
        init_state.append(tensor.matrix('init_state', dtype='float32'))
        init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    next_state, next_memory, ctxs = [], [], []
    for lidx in range(options['n_layers_lstm']):
        decoder_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        inps = proj_h if lidx > 0 else emb
        proj = get_layer('lstm_cond')[1](tparams,
                                         inps,
                                         options,
                                         prefix=decoder_prefix,
                                         context=ctx0,
                                         one_step=True,
                                         init_state=init_state[lidx],
                                         init_memory=init_memory[lidx],
                                         trng=trng,
                                         use_noise=use_noise)
        next_state.append(proj[0])
        next_memory.append(proj[1])
        ctxs.append(proj[4])
        next_alpha = proj[2]
        proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs[-1],
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx0] + init_state + init_memory,
                             [next_probs, next_sample, next_alpha] +
                             next_state + next_memory,
                             name='f_next',
                             profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
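
As a rough usage sketch (not part of the original file), the f_init / f_next pair returned above is typically driven by a decoding loop along the following lines. The output ordering follows the function directly above ([ctx0] + states + memories from f_init; [probs, sample, alpha] + states + memories from f_next); everything else, including the greedy word choice, the end-of-sentence id 0, and the reshaping of the initial states, is an illustrative assumption rather than the project's beam-search code.

import numpy

def greedy_decode(f_init, f_next, ctx, cnn_feats, n_layers, maxlen=30):
    rval = f_init(ctx, cnn_feats)
    ctx0 = rval[0]
    # assume the initial states come back as vectors and need a batch axis
    states = [s.reshape(1, -1) for s in rval[1:1 + n_layers]]
    memories = [m.reshape(1, -1) for m in rval[1 + n_layers:1 + 2 * n_layers]]
    word = numpy.array([-1], dtype='int64')            # -1 codes the start token
    caption = []
    for _ in range(maxlen):
        rval = f_next(*([word, ctx0] + states + memories))
        probs = rval[0]
        states = list(rval[3:3 + n_layers])
        memories = list(rval[3 + n_layers:3 + 2 * n_layers])
        word = numpy.array([probs[0].argmax()], dtype='int64')   # greedy pick
        if word[0] == 0:                               # assume 0 is <eos>
            break
        caption.append(int(word[0]))
    return caption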
Ejemplo n.º 22
0
def build_sampler(tparams, options, use_noise, trng, sampling=True):
    """ Builds a sampler used for generating from the model
    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise: boolean
        If true, add noise to the sampling
    trng: random number generator
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]
    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next: theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx,
                                       options, prefix='encoder')[0]
        ctx_rev = get_layer('lstm')[1](tparams, ctx[::-1,:],
                                       options, prefix='encoder_rev')[0][::-1,:]
        ctx = tensor.concatenate((ctx_fwd, ctx_rev), axis=1)

    # initial state/cell
    ctx_mean = ctx.mean(0)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d'%lidx, activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)
    init_state = [get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh')]
    init_memory = [get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state_%d'%lidx, activ='tanh'))
            init_memory.append(get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory_%d'%lidx, activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx], [ctx]+init_state+init_memory, name='f_init', profile=False, allow_input_downcast=True)
    print 'Done'

    # build f_next
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = [tensor.matrix('init_state', dtype='float32')]
    init_memory = [tensor.matrix('init_memory', dtype='float32')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(tensor.matrix('init_state', dtype='float32'))
            init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:,None] < 0, tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    proj = get_layer('lstm_cond')[1](tparams, emb, options,
                                     prefix='decoder',
                                     mask=None, context=ctx,
                                     one_step=True,
                                     init_state=init_state[0],
                                     init_memory=init_memory[0],
                                     trng=trng,
                                     use_noise=use_noise,
                                     sampling=sampling)

    next_state, next_memory, ctxs = [proj[0]], [proj[1]], [proj[4]]
    proj_h = proj[0]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            proj = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                             prefix='decoder_%d'%lidx,
                                             context=ctx,
                                             one_step=True,
                                             init_state=init_state[lidx],
                                             init_memory=init_memory[lidx],
                                             trng=trng,
                                             use_noise=use_noise,
                                             sampling=sampling)
            next_state.append(proj[0])
            next_memory.append(proj[1])
            ctxs.append(proj[4])
            proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]
    logit = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs[-1], options, prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit_h%d'%lidx, activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx]+init_state+init_memory, [next_probs, next_sample]+next_state+next_memory, name='f_next', profile=False, allow_input_downcast=True)

    return f_init, f_next
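
The forward-backward encoder at the top of this sampler reverses the annotation sequence, runs a second LSTM, flips the result back, and concatenates it with the forward pass. A compact NumPy analogue, with lstm_forward / lstm_backward standing in for get_layer('lstm') purely as placeholders, looks like:

import numpy as np

def bidirectional_encode(ctx, lstm_forward, lstm_backward):
    """ctx: (n_annotations, dim) -> (n_annotations, 2 * hidden_dim)."""
    h_fwd = lstm_forward(ctx)               # left-to-right pass
    h_rev = lstm_backward(ctx[::-1, :])     # right-to-left pass over the reversed input
    h_rev = h_rev[::-1, :]                  # flip back so positions line up again
    return np.concatenate((h_fwd, h_rev), axis=1)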
Ejemplo n.º 23
0
def build_model(tparams, options, sampling=True):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost function

    [This function builds the model described from Section 3.1.2 onwards;
    since the convolutional features are precomputed, some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]
    Returns
    -------
    trng: theano random number generator
        Used for dropout, stochastic attention, etc
    use_noise: theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx]: theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas: theano variables
        Attention weights
    alpha_sample: theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention: [see the learning rule in eq (12)]
    cost: theano variable
        negative log likelihood
    opt_outs: OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #annotations x dim
    ctx = tensor.tensor3('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # index into the word embedding matrix, shift it forward in time
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx.dimshuffle(1,0,2),
                                       options, prefix='encoder')[0].dimshuffle(1,0,2)
        ctx_rev = get_layer('lstm')[1](tparams, ctx.dimshuffle(1,0,2)[:,::-1,:],
                                       options, prefix='encoder_rev')[0][:,::-1,:].dimshuffle(1,0,2)
        ctx0 = tensor.concatenate((ctx_fwd, ctx_rev), axis=2)
    else:
        ctx0 = ctx

    # initial state/cell [top right on page 4]
    ctx_mean = ctx0.mean(1)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d'%lidx, activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh')
    init_memory = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh')
    # lstm decoder
    # [equation (1), (2), (3) in section 3.1.2]
    attn_updates = []
    proj, updates = get_layer('lstm_cond')[1](tparams, emb, options,
                                              prefix='decoder',
                                              mask=mask, context=ctx0,
                                              one_step=False,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              trng=trng,
                                              use_noise=use_noise,
                                              sampling=sampling)
    attn_updates += updates
    proj_h = proj[0]
    # optional deep attention
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state_%d'%lidx, activ='tanh')
            init_memory = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory_%d'%lidx, activ='tanh')
            proj, updates = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                                      prefix='decoder_%d'%lidx,
                                                      mask=mask, context=ctx0,
                                                      one_step=False,
                                                      init_state=init_state,
                                                      init_memory=init_memory,
                                                      trng=trng,
                                                      use_noise=use_noise,
                                                      sampling=sampling)
            attn_updates += updates
            proj_h = proj[0]

    alphas = proj[2]
    alpha_sample = proj[3]
    ctxs = proj[4]

    # [beta value explained in note 4.2.1 "doubly stochastic attention"]
    if options['selector']:
        sels = proj[5]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit_h%d'%lidx, activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0])*probs.shape[1]+x_flat]+1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    cost = (masked_cost).sum(0)

    # optional outputs
    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels
    if options['attn_type'] == 'stochastic':
        opt_outs['masked_cost'] = masked_cost # need this for reinforce later
        opt_outs['attn_updates'] = attn_updates # this is to update the rng

    return trng, use_noise, [x, mask, ctx], alphas, alpha_sample, cost, opt_outs
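
The embedding shift near the top of build_model (emb_shifted) is what makes the decoder condition on the previous word: the input at step t is the embedding of the word at step t-1, and step 0 receives zeros. A short NumPy sketch of the same operation, with emb assumed to be (n_timesteps, n_samples, dim_word):

import numpy as np

def shift_embeddings(emb):
    shifted = np.zeros_like(emb)
    shifted[1:] = emb[:-1]      # the word at t-1 feeds the decoder at step t
    return shifted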
Ejemplo n.º 24
0
    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in  equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network]
        pstate_ = tensor.dot(h_, tparams[_p(prefix,'Wd_att')])
        pctx_ = pctx_ + pstate_[:,None,:]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix,'U_att')])+tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            ctx_ = (context * alpha[:,:,None]).sum(1) # current context
            alpha_sample = alpha # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(temperature_c*alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(tensor.eq(tensor.arange(alpha_shp[1])[None,:],
                                               tensor.argmax(alpha,axis=1,keepdims=True)), theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:,:,None]).sum(1) # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')])+tparams[_p(prefix,'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:,None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:,None] * c + (1. - m_)[:,None] * c_

        h = o * tensor.tanh(c)
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre]+pctx_list
        return rval
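
The attention part of _step (equations (4)-(6)) can be read more easily outside Theano. The NumPy sketch below mirrors the deterministic branch: project the hidden state, add it to the pre-projected context, squash, score every annotation, and take the softmax-weighted sum. Parameter names echo the tparams suffixes, but the shapes and the function itself are assumptions for illustration only.

import numpy as np

def soft_attention(h, context, pctx, Wd_att, U_att, c_att):
    """h: (n_samples, dim); context, pctx: (n_samples, n_annotations, ctx_dim);
    U_att: (ctx_dim, 1); c_att: scalar bias."""
    pstate = h @ Wd_att                                  # (n_samples, ctx_dim)
    pctx = np.tanh(pctx + pstate[:, None, :])            # add the state to every annotation
    e = (pctx @ U_att + c_att)[:, :, 0]                  # one score per annotation
    alpha = np.exp(e - e.max(axis=1, keepdims=True))
    alpha = alpha / alpha.sum(axis=1, keepdims=True)     # softmax over annotations
    ctx_weighted = (context * alpha[:, :, None]).sum(axis=1)
    return alpha, ctx_weighted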
Ejemplo n.º 25
0
def build_sampler(tparams, options, use_noise, trng, sampling=True):
    """ Builds a sampler used for generating from the model
    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise: boolean
        If true, add noise to the sampling
    trng: random number generator
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]
    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next: theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx, options,
                                       prefix='encoder')[0]
        ctx_rev = get_layer('lstm')[1](tparams,
                                       ctx[::-1, :],
                                       options,
                                       prefix='encoder_rev')[0][::-1, :]
        ctx = tensor.concatenate((ctx_fwd, ctx_rev), axis=1)

    # initial state/cell
    ctx_mean = ctx.mean(0)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams,
                                      ctx_mean,
                                      options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)
    init_state = [
        get_layer('ff')[1](tparams,
                           ctx_mean,
                           options,
                           prefix='ff_state',
                           activ='tanh')
    ]
    init_memory = [
        get_layer('ff')[1](tparams,
                           ctx_mean,
                           options,
                           prefix='ff_memory',
                           activ='tanh')
    ]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(
                get_layer('ff')[1](tparams,
                                   ctx_mean,
                                   options,
                                   prefix='ff_state_%d' % lidx,
                                   activ='tanh'))
            init_memory.append(
                get_layer('ff')[1](tparams,
                                   ctx_mean,
                                   options,
                                   prefix='ff_memory_%d' % lidx,
                                   activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx], [ctx] + init_state + init_memory,
                             name='f_init',
                             profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = [tensor.matrix('init_state', dtype='float32')]
    init_memory = [tensor.matrix('init_memory', dtype='float32')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(tensor.matrix('init_state', dtype='float32'))
            init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    proj = get_layer('lstm_cond')[1](tparams,
                                     emb,
                                     options,
                                     prefix='decoder',
                                     mask=None,
                                     context=ctx,
                                     one_step=True,
                                     init_state=init_state[0],
                                     init_memory=init_memory[0],
                                     trng=trng,
                                     use_noise=use_noise,
                                     sampling=sampling)

    next_state, next_memory, ctxs = [proj[0]], [proj[1]], [proj[4]]
    proj_h = proj[0]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            proj = get_layer('lstm_cond')[1](tparams,
                                             proj_h,
                                             options,
                                             prefix='decoder_%d' % lidx,
                                             context=ctx,
                                             one_step=True,
                                             init_state=init_state[lidx],
                                             init_memory=init_memory[lidx],
                                             trng=trng,
                                             use_noise=use_noise,
                                             sampling=sampling)
            next_state.append(proj[0])
            next_memory.append(proj[1])
            ctxs.append(proj[4])
            proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs[-1],
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx] + init_state + init_memory,
                             [next_probs, next_sample] + next_state +
                             next_memory,
                             name='f_next',
                             profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
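
One small but easy-to-miss detail above is the tensor.switch on the previous word: the first word of a caption is coded as -1 and must map to an all-zero embedding rather than indexing Wemb with -1 (which would silently pick the last row). A NumPy sketch of the same convention (names assumed):

import numpy as np

def lookup_with_start_token(x, Wemb):
    """x: (n_samples,) int word ids, -1 for the start token; Wemb: (vocab, dim)."""
    emb = Wemb[np.maximum(x, 0)]     # safe lookup even where x == -1
    emb[x < 0] = 0.                  # the start token gets a zero embedding
    return emb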
Ejemplo n.º 26
0
    def forwardProp(self,allKids,words_embedded,updateWlab,label,theta,freq):
        (W1,W2,W3,W4,Wlab,b1,b2,b3,blab,WL)=self.getParams(theta)
        #sl is the number of columns of words_embedded, i.e. the number of words in the sentence
        # allKids is empty at first: the parse tree has not been built before training and only emerges afterwards, so the contents of allKids change as the algorithm runs
        sl=np.size(words_embedded,1)
        sentree=rnntree.rnntree(self.d,sl,words_embedded)
        collapsed_sentence = range(sl)

        # updateWlab is mainly for computing the sentiment error and updating the sentiment weights
        # the sentiment error also takes p as input, so p has to be computed here as well
        if updateWlab:
            temp_label=np.zeros(self.cat)
            #if cat = 4, temp_label starts as (0,0,0,0); the next line sets the slot for label to 1
            temp_label[label-1]=1.0
            nodeUnder = np.ones([2*sl-1,1])

            # this loop counts how many leaf nodes sit under each node
            # kids stores two values, the left and the right child
            # so allKids[i] describes internal node i: allKids[i][0] is its left child and allKids[i][1] its right child
            for i in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sentree and store in nodeUnder
                kids = allKids[i]
                n1 = nodeUnder[kids[0]]
                n2 = nodeUnder[kids[1]]
                nodeUnder[i] = n1+n2

            cat_size=self.cat
            sentree.catDelta = np.zeros([cat_size, 2*sl-1])
            sentree.catDelta_out = np.zeros([self.d,2*sl-1])

            # classifier on single words
            # handle all single words, i.e. the leaf nodes
            # one open question in the original comments: why do the leaves also get a sentiment error?
            for i in range(sl):
                sm = softmax(np.dot(Wlab,words_embedded[:,i]) + blab)
                #whatever the exact formula, sentree.nodeScores stores the sentiment error
                #sentree.catDelta stores the associated error signal (delta) used to backpropagate the sentiment error
                lbl_sm = (1-self.alpha)*(temp_label - sm)
                sentree.nodeScores[i] = 1.0/2.0*(np.dot(lbl_sm,(temp_label- sm)))
                sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm))

            # sm = sigmoid(self.Wlab*words_embedded + self.blab)

            #lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm)
            #sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm))
            #sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm)

            #nodes with index >= sl are the parents of the words (internal nodes)
            for i in range(sl,2*sl-1):
                kids = allKids[i]
                #c1, c2 are the vectors of the left and right children
                c1 = sentree.nodeFeatures[:,kids[0]]
                c2 = sentree.nodeFeatures[:,kids[1]]

                # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
                #compute p, the parent vector built from the two children
                p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + b1)

                # See last paragraph in Section 2.3
                p_norm1 = p/norm(p)

                # Eq. (7) in the paper (for special case of 1d label)
                #sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
                #this computes the node's sentiment prediction sm
                sm = softmax(np.dot(Wlab,p_norm1) + blab)
                beta=0.5
                #lbl_sm = beta * (1.0-self.alpha)*(label - sm)
                lbl_sm = beta * (1.0-self.alpha)*(temp_label - sm)
                #lbl_sm = beta * (1.0-self.alpha) * (temp_label-sm)
                #sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1]
                #J=-(1.0-self.alpha)*np.log(sm[label-1])
                #sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm))
                sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm))
                #J = 1.0/2.0*(np.dot(lbl_sm,(label - sm)))
                J = 1.0/2.0*(np.dot(lbl_sm,(temp_label - sm)))

                sentree.nodeFeatures[:,i] = p_norm1
                sentree.nodeFeatures_unnormalized[:,i] = p
                sentree.nodeScores[i] = J
                sentree.numkids = nodeUnder

            sentree.kids = allKids
        else:
            # this branch mainly computes the reconstruction error
            # Reconstruction Error
            for j in range(sl-1):
                size2=np.size(words_embedded,1)

                """
                 经过测试,p有多个值
                 也就不难怪这里c1,c2里面分别存了多个单词的向量
                 因此,这个算法并不是一个个依次算p的,而是一次性一起算出来p
                 也因此J的值应该也是有多个值。代表两两单词计算的不同结果。
                """
                c1 = words_embedded[:,0:-1] # drop the last word
                c2 = words_embedded[:,1:]  # drop the first word

                freq1 = freq[0:-1]
                freq2 = freq[1:]

                p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + np.reshape(b1,[self.d,1])*([1]*(size2-1)))
                p_norm1 =p/np.sqrt(sum(p**2))

                y1_unnormalized = tanh(np.dot(W3,p_norm1) + np.reshape(b2,[self.d,1])*([1]*(size2-1)))
                y2_unnormalized = tanh(np.dot(W4,p_norm1) + np.reshape(b3,[self.d,1])*([1]*(size2-1)))

                y1 = y1_unnormalized/np.sqrt(sum(y1_unnormalized**2))
                y2 = y2_unnormalized/np.sqrt(sum(y2_unnormalized**2))

                y1c1 = self.alpha*(y1-c1)
                y2c2 = self.alpha*(y2-c2)

                # Eq. (4) in the paper: reconstruction error
                J = 1.0/2.0*sum((y1c1)*(y1-c1) + (y2c2)*(y2-c2))

                # finding the pair with smallest reconstruction error for constructing sentree
                J_min= min(J)
                J_minpos=np.argmin(J)

                """
                只有非叶子节点才会有重构节点,因此,sentree.node_y1c1需要从sl+j开始存y1c1.
                """
                sentree.node_y1c1[:,sl+j] = y1c1[:,J_minpos]
                sentree.node_y2c2[:,sl+j] = y2c2[:,J_minpos]
                sentree.nodeDelta_out1[:,sl+j] = np.dot(norm1tanh_prime(y1_unnormalized[:,J_minpos]) , y1c1[:,J_minpos])
                sentree.nodeDelta_out2[:,sl+j] = np.dot(norm1tanh_prime(y2_unnormalized[:,J_minpos]) , y2c2[:,J_minpos])

                #once a pair has been merged, delete the corresponding column of words_embedded
                #and put the merged node's vector back into words_embedded
                words_embedded=np.delete(words_embedded,J_minpos+1,1)
                words_embedded[:,J_minpos]=p_norm1[:,J_minpos]
                sentree.nodeFeatures[:, sl+j] = p_norm1[:,J_minpos]
                sentree.nodeFeatures_unnormalized[:, sl+j]= p[:,J_minpos]
                sentree.nodeScores[sl+j] = J_min
                # pp stores each node's parent: both children share the same parent
                sentree.pp[collapsed_sentence[J_minpos]] = sl+j
                sentree.pp[collapsed_sentence[J_minpos+1]] = sl+j
                sentree.kids[sl+j,:] = [collapsed_sentence[J_minpos], collapsed_sentence[J_minpos+1]]
                sentree.numkids[sl+j] = sentree.numkids[sentree.kids[sl+j,0]] + sentree.numkids[sentree.kids[sl+j,1]]


                freq=np.delete(freq,J_minpos+1)
                freq[J_minpos] = (sentree.numkids[sentree.kids[sl+j,0]]*freq1[J_minpos] + sentree.numkids[sentree.kids[sl+j,1]]*freq2[J_minpos])/(sentree.numkids[sentree.kids[sl+j,0]]+sentree.numkids[sentree.kids[sl+j,1]])

                collapsed_sentence=np.delete(collapsed_sentence,J_minpos+1)
                collapsed_sentence[J_minpos]=sl+j
            print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
            print(sentree.pp)
            print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
            print(sentree.kids)
        return sentree
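
The sentiment branch above leans on softmax and softmax_prime helpers defined elsewhere in the repository. One pair of definitions consistent with how they are called here (softmax_prime(sm) is used as a Jacobian inside np.dot(lbl_sm, softmax_prime(sm))) is sketched below; treat it as an assumption about those missing utilities, not their actual source.

import numpy as np

def softmax(z):
    e = np.exp(z - np.max(z))        # shift for numerical stability
    return e / e.sum()

def softmax_prime(s):
    """Jacobian of softmax evaluated at its output s: diag(s) - s s^T."""
    return np.diag(s) - np.outer(s, s)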
    def forward(self, X):
        #Z = relu(X.dot(self.W1)+self.b1)
        Z = tanh(X.dot(self.W1) + self.b1)
        return softmax(Z.dot(self.W2) + self.b2), Z
    def forwardProp(self, allKids, words_embedded, updateWlab, label, theta,
                    freq):
        (W1, W2, W3, W4, Wlab, b1, b2, b3, blab, WL) = self.getParams(theta)
        sl = np.size(words_embedded, 1)
        sentree = rnntree.rnntree(self.d, sl, words_embedded)
        collapsed_sentence = range(sl)
        if updateWlab:
            temp_label = np.zeros(self.cat)
            temp_label[label - 1] = 1.0
            nodeUnder = np.ones([2 * sl - 1, 1])

            for i in range(
                    sl, 2 * sl - 1
            ):  # calculate n1, n2 and n1+n2 for each node in the sentree and store in nodeUnder
                kids = allKids[i]
                n1 = nodeUnder[kids[0]]
                n2 = nodeUnder[kids[1]]
                nodeUnder[i] = n1 + n2

            cat_size = self.cat
            sentree.catDelta = np.zeros([cat_size, 2 * sl - 1])
            sentree.catDelta_out = np.zeros([self.d, 2 * sl - 1])

            # classifier on single words
            for i in range(sl):
                sm = softmax(np.dot(Wlab, words_embedded[:, i]) + blab)
                lbl_sm = (1 - self.alpha) * (temp_label - sm)
                sentree.nodeScores[i] = 1.0 / 2.0 * (np.dot(
                    lbl_sm, (temp_label - sm)))
                sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))

            # sm = sigmoid(self.Wlab*words_embedded + self.blab)

            #lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm)
            #sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm))
            #sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm)

            for i in range(sl, 2 * sl - 1):
                kids = allKids[i]

                c1 = sentree.nodeFeatures[:, kids[0]]
                c2 = sentree.nodeFeatures[:, kids[1]]

                # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
                p = tanh(np.dot(W1, c1) + np.dot(W2, c2) + b1)

                # See last paragraph in Section 2.3
                p_norm1 = p / norm(p)

                # Eq. (7) in the paper (for special case of 1d label)
                #sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
                sm = softmax(np.dot(Wlab, p_norm1) + blab)
                beta = 0.5
                #lbl_sm = beta * (1.0-self.alpha)*(label - sm)
                lbl_sm = beta * (1.0 - self.alpha) * (temp_label - sm)
                #lbl_sm = beta * (1.0-self.alpha) * (temp_label-sm)
                #sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1]
                #J=-(1.0-self.alpha)*np.log(sm[label-1])
                #sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm))
                sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))
                #J = 1.0/2.0*(np.dot(lbl_sm,(label - sm)))
                J = 1.0 / 2.0 * (np.dot(lbl_sm, (temp_label - sm)))

                sentree.nodeFeatures[:, i] = p_norm1
                sentree.nodeFeatures_unnormalized[:, i] = p
                sentree.nodeScores[i] = J
                sentree.numkids = nodeUnder

            sentree.kids = allKids
        else:
            # Reconstruction Error
            for j in range(sl - 1):
                size2 = np.size(words_embedded, 1)
                c1 = words_embedded[:, 0:-1]
                c2 = words_embedded[:, 1:]

                freq1 = freq[0:-1]
                freq2 = freq[1:]

                p = tanh(
                    np.dot(W1, c1) + np.dot(W2, c2) +
                    np.reshape(b1, [self.d, 1]) * ([1] * (size2 - 1)))
                p_norm1 = p / np.sqrt(sum(p**2))

                y1_unnormalized = tanh(
                    np.dot(W3, p_norm1) + np.reshape(b2, [self.d, 1]) *
                    ([1] * (size2 - 1)))
                y2_unnormalized = tanh(
                    np.dot(W4, p_norm1) + np.reshape(b3, [self.d, 1]) *
                    ([1] * (size2 - 1)))

                y1 = y1_unnormalized / np.sqrt(sum(y1_unnormalized**2))
                y2 = y2_unnormalized / np.sqrt(sum(y2_unnormalized**2))

                y1c1 = self.alpha * (y1 - c1)
                y2c2 = self.alpha * (y2 - c2)

                # Eq. (4) in the paper: reconstruction error
                J = 1.0 / 2.0 * sum((y1c1) * (y1 - c1) + (y2c2) * (y2 - c2))

                # finding the pair with smallest reconstruction error for constructing sentree
                J_min = min(J)
                J_minpos = np.argmin(J)

                sentree.node_y1c1[:, sl + j] = y1c1[:, J_minpos]
                sentree.node_y2c2[:, sl + j] = y2c2[:, J_minpos]
                sentree.nodeDelta_out1[:, sl + j] = np.dot(
                    norm1tanh_prime(y1_unnormalized[:, J_minpos]),
                    y1c1[:, J_minpos])
                sentree.nodeDelta_out2[:, sl + j] = np.dot(
                    norm1tanh_prime(y2_unnormalized[:, J_minpos]),
                    y2c2[:, J_minpos])

                words_embedded = np.delete(words_embedded, J_minpos + 1, 1)
                words_embedded[:, J_minpos] = p_norm1[:, J_minpos]
                sentree.nodeFeatures[:, sl + j] = p_norm1[:, J_minpos]
                sentree.nodeFeatures_unnormalized[:, sl + j] = p[:, J_minpos]
                sentree.nodeScores[sl + j] = J_min
                sentree.pp[collapsed_sentence[J_minpos]] = sl + j
                sentree.pp[collapsed_sentence[J_minpos + 1]] = sl + j
                sentree.kids[sl + j, :] = [
                    collapsed_sentence[J_minpos],
                    collapsed_sentence[J_minpos + 1]
                ]
                sentree.numkids[sl + j] = sentree.numkids[sentree.kids[
                    sl + j, 0]] + sentree.numkids[sentree.kids[sl + j, 1]]

                freq = np.delete(freq, J_minpos + 1)
                freq[J_minpos] = (
                    sentree.numkids[sentree.kids[sl + j, 0]] * freq1[J_minpos]
                    +
                    sentree.numkids[sentree.kids[sl + j, 1]] * freq2[J_minpos]
                ) / (sentree.numkids[sentree.kids[sl + j, 0]] +
                     sentree.numkids[sentree.kids[sl + j, 1]])

                collapsed_sentence = np.delete(collapsed_sentence,
                                               J_minpos + 1)
                collapsed_sentence[J_minpos] = sl + j
        return sentree
Ejemplo n.º 29
0
def tanh(x):
    # element-wise hyperbolic tangent; sp is assumed to be scipy (or numpy) imported elsewhere
    return sp.tanh(x)