def _step(self, y_tm1, y_m, s_tm1, h, x_m):
    # attention: score each source position against the previous state
    pctx__ = T.dot(h, self.W_ha) + T.dot(s_tm1, self.W_sa)[None, :, :]
    pctx__ = T.tanh(pctx__)

    e = T.dot(pctx__, self.U_att) + self.b_att
    e = T.exp(e.reshape((e.shape[0], e.shape[1])))
    e = e / e.sum(0, keepdims=True)
    # note: the source mask x_m is applied *after* normalization here, so
    # the surviving weights are not renormalized (cf. Example #16, which
    # masks before the division)
    e = e * x_m

    # context vector: attention-weighted sum of the annotations
    c = (h * e[:, :, None]).sum(0)

    # GRU update conditioned on previous output, previous state, and context
    z = T.nnet.sigmoid(T.dot(y_tm1, self.W_z) + self.b_z + T.dot(s_tm1, self.U_z) + T.dot(c, self.W_cs))
    r = T.nnet.sigmoid(T.dot(y_tm1, self.W_r) + self.b_r + T.dot(s_tm1, self.U_r) + T.dot(c, self.W_cs))
    hh_t = T.tanh(T.dot(y_tm1, self.W_h) + self.b_h + T.dot(r * s_tm1, self.U_h) + T.dot(c, self.W_cy))
    s_t = z * s_tm1 + (1 - z) * hh_t

    # carry the previous state through for samples masked out at this step
    s_t = (1. - y_m)[:, None] * s_tm1 + y_m[:, None] * s_t

    logit = T.tanh(T.dot(s_t, self.W_hl) + T.dot(y_tm1, self.W_yl) + T.dot(c, self.W_cl))

    return T.cast(s_t, dtype=theano.config.floatX), T.cast(logit, dtype=theano.config.floatX)
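Step functions like this one are written to be applied one timestep at a time by theano.scan. A minimal sketch of that wiring, using a toy tanh-RNN step and illustrative names rather than the weights above:

import numpy
import theano
import theano.tensor as T

n_in, n_hid = 8, 16
rng = numpy.random.RandomState(0)
W = theano.shared(rng.randn(n_in, n_hid).astype(theano.config.floatX))
U = theano.shared(rng.randn(n_hid, n_hid).astype(theano.config.floatX))

def _step(x_t, h_tm1):
    # one timestep: h_t = tanh(x_t.W + h_tm1.U)
    return T.tanh(T.dot(x_t, W) + T.dot(h_tm1, U))

X = T.tensor3('X')  # (time, batch, n_in), the layout used throughout
h0 = T.zeros((X.shape[1], n_hid), dtype=theano.config.floatX)
h_seq, updates = theano.scan(_step, sequences=X, outputs_info=h0)
f = theano.function([X], h_seq[-1])  # final hidden state per sample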
Example #2
        def _step(x_, h_, c_, pred_, prob_):
            h_a = []
            c_a = []
            for it in range(self.n_levels):
                preact = T.dot(h_[it], self.U[it])
                preact += T.dot(x_, self.W[it]) + self.b[it]

                i = T.nnet.sigmoid(_slice(preact, 0, self.n_dim))
                f = T.nnet.sigmoid(_slice(preact, 1, self.n_dim))
                o = T.nnet.sigmoid(_slice(preact, 2, self.n_dim))
                c = T.tanh(_slice(preact, 3, self.n_dim))

                c = f * c_[it] + i * c
                h = o * T.tanh(c)

                h_a.append(h)
                c_a.append(c)

                x_ = h

            q = T.dot(h, self.L) + self.b0
            prob = T.nnet.softmax(q)
            pred = T.argmax(prob, axis=1)

            return T.stack(h_a).squeeze(), T.stack(c_a).squeeze(), pred, prob
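Several of these examples call a _slice helper without defining it. In the Theano LSTM tutorial the helper is conventionally written as below; assuming these snippets follow that convention (Example #16's two-argument variant presumably bakes the dimension in):

def _slice(_x, n, dim):
    # pick the n-th dim-wide block out of a fused gate preactivation,
    # so one large dot product serves all four LSTM gates at once
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]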
Example #3
    def step(self, X, previous_hidden, previous_state):
        if self.use_input_peep:
            input_gate = T.nnet.sigmoid(T.dot(X, self.Wi) + T.dot(previous_hidden, self.Ui) + T.dot(previous_state, self.Pi) + self.bi)
        else:
            input_gate = T.nnet.sigmoid(T.dot(X, self.Wi) + T.dot(previous_hidden, self.Ui) + self.bi)
        candidate_state = T.tanh(T.dot(X, self.Wg) + T.dot(previous_hidden, self.Ug) + self.bg)

        if self.use_forget_gate:
            if self.use_forget_peep:
                forget_gate = T.nnet.sigmoid(T.dot(X, self.Wf) + T.dot(previous_hidden, self.Uf) + T.dot(previous_state, self.Pf) + self.bf)
            else:
                forget_gate = T.nnet.sigmoid(T.dot(X, self.Wf) + T.dot(previous_hidden, self.Uf) + self.bf)
            state = candidate_state * input_gate + previous_state * forget_gate
        else:
            state = candidate_state * input_gate + previous_state * 0  # no forget gate: the previous state is dropped

        if self.use_output_peep:
            output_gate = T.nnet.sigmoid(T.dot(X, self.Wo) + T.dot(previous_hidden, self.Uo) + T.dot(previous_state, self.Po) + self.bo)
        else:
            output_gate = T.nnet.sigmoid(T.dot(X, self.Wo) + T.dot(previous_hidden, self.Uo) + self.bo)
        if self.use_tanh_output:
            output = output_gate * T.tanh(state)
        else:
            output = output_gate * state
        return output, state
Example #4
    def __call__(self, x, h, prev_cell):
        z = x.dot(self.W_x) + h.dot(self.U_h) + self.b

        def _get_unit(matrix, unit, dim):
            slice_num = self.units[unit]
            # assume all slices have the same dimension
            return matrix[:, slice_num * dim: (slice_num + 1) * dim]

        # input gate
        i = T.nnet.sigmoid(_get_unit(z, 'i', self.unit_size))

        # candidate memory cell
        candidate = T.tanh(_get_unit(z, 'c', self.unit_size))

        # forget gate
        f = T.nnet.sigmoid(_get_unit(z, 'f', self.unit_size))

        # output gate (note it doesn't involve the current memory cell)
        o = T.nnet.sigmoid(_get_unit(z, 'o', self.unit_size))

        next_cell = i * candidate + f * prev_cell

        h = o * T.tanh(next_cell)

        return [next_cell, h]
Example #5
    def generate(self, h_, c_, x_):
        h_a = []
        c_a = []
        for it in range(self.n_levels):
            preact = T.dot(x_, self.W[it])
            preact += T.dot(h_[it], self.U[it]) + self.b[it]

            i = T.nnet.sigmoid(self.slice(preact, 0, self.n_dim))
            f = T.nnet.sigmoid(self.slice(preact, 1, self.n_dim))
            o = T.nnet.sigmoid(self.slice(preact, 2, self.n_dim))
            c = T.tanh(self.slice(preact, 3, self.n_dim))

            c = f * c_[it] + i * c
            h = o * T.tanh(c)

            h_a.append(h)
            c_a.append(c)

            x_ = h

        q = T.dot(h, self.L) + self.b0
        # mask = T.concatenate([T.alloc(np_floatX(1.), q.shape[0] - 1), T.alloc(np_floatX(0.), 1)])
        prob = T.nnet.softmax(q / 1)  # dividing by 1 is a no-op, presumably a temperature left at its default

        return prob, T.stack(h_a).squeeze(), T.stack(c_a)[0].squeeze()
Example #6
 def dev_loss(self, dev_types, dev_lams, ss_ratio, y):
     su_mask = ss_ratio * T.neq(y, 0).reshape((y.shape[0], 1))
     un_mask = T.eq(y, 0).reshape((y.shape[0], 1))
     ss_mask = su_mask + un_mask
     var_fun = lambda x1, x2: T.sum(((x1 - x2) * ss_mask)**2.0) / T.sum(ss_mask)
     tanh_fun = lambda x1, x2: var_fun(T.tanh(x1), T.tanh(x2))
     norm_fun = lambda x1, x2: var_fun( \
             (x1 / T.sqrt(T.sum(x1**2.0,axis=1,keepdims=1) + 1e-6)), \
             (x2 / T.sqrt(T.sum(x2**2.0,axis=1,keepdims=1) + 1e-6)))
     sigm_fun = lambda x1, x2: var_fun(T.nnet.sigmoid(x1), T.nnet.sigmoid(x2))
     cent_fun = lambda xt, xo: T.sum(T.nnet.binary_crossentropy( \
             T.nnet.sigmoid(xo), T.nnet.sigmoid(xt))) / xt.shape[0]
     L = 0.0
     for i in xrange(self.layer_count):
         if (i < (self.layer_count - 1)):
             x1 = self.layers[i].output
             x2 = self.drop_nets[0][i].output
         else:
             x1 = self.layers[i].linear_output
             x2 = self.drop_nets[0][i].linear_output
         if (dev_types[i] == 1):
             L = L + (dev_lams[i] * norm_fun(x1, x2))
         elif (dev_types[i] == 2):
             L = L + (dev_lams[i] * tanh_fun(x1, x2))
         elif (dev_types[i] == 3):
             L = L + (dev_lams[i] * sigm_fun(x1, x2))
         elif (dev_types[i] == 4):
             L = L + (dev_lams[i] * cent_fun(x1, x2))
         else:
             L = L + (dev_lams[i] * var_fun(x1, x2))
     return L
Example #7
def tanh_actfun(x, scale=None):
    """Compute a rescaled tanh activation for x."""
    if scale is None:
        x_tanh = T.tanh(x)
    else:
        x_tanh = scale * T.tanh(constFX(1/scale) * x)
    return x_tanh
Example #8
 def step(self, i_t, x_t, z_t, y_p, c_p, *other_args):
   # See Unit.scan() for seqs.
   # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
   other_outputs = []
   if self.recurrent_transform:
     state_vars = other_args[:len(self.recurrent_transform.state_vars)]
     self.recurrent_transform.set_sorted_state_vars(state_vars)
     z_r, r_updates = self.recurrent_transform.step(y_p)
     z_t += z_r
     for v in self.recurrent_transform.get_sorted_state_vars():
       other_outputs += [r_updates[v]]
   z_t += T.dot(y_p, self.W_re)
   partition = z_t.shape[1] // 4 #number of units
   forgetgate = T.nnet.sigmoid(z_t[:,:partition])
   propgate = T.nnet.sigmoid(z_t[:,partition:2*partition])
   diffgate = T.nnet.sigmoid(z_t[:,2*partition:3*partition])
   input = T.tanh(z_t[:,3*partition:4*partition])
   # c(t) = (1 - FG(t)) * IN(t) + FG(t) * c(t-1)
   c_t = (1-forgetgate) * input + forgetgate * c_p
    # y(t) = tanh( PG(t) * c(t) + DG(t) * (c(t) - c(t-1)) )  HINT: the additional nonlinearity may not have a significant effect
   y_t = T.tanh(propgate * c_t + diffgate * ( c_t - c_p))
   i_output = T.outer(i_t, self.o_output)
   i_h = T.outer(i_t, self.o_h)
   # return: next outputs (# unit.n_act, y_t, c_t, ...)
   return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
Example #9
 def recurrence(x_t, c_tm1, h_tm1):
     i_t = T.nnet.sigmoid(T.dot(x_t, self.w_xi) + T.dot(h_tm1, self.w_hi) + self.b_i)  # + T.dot(c_tm1, self.w_ci)
     f_t = T.nnet.sigmoid(T.dot(x_t, self.w_xf) + T.dot(h_tm1, self.w_hf) + self.b_f)  # + T.dot(c_tm1, self.w_cf)
     c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.w_xc) + T.dot(h_tm1, self.w_hc) + self.b_c)
     o_t = T.nnet.sigmoid(T.dot(x_t, self.w_xo) + T.dot(h_tm1, self.w_ho) + self.b_o)  # + T.dot(c_t, self.w_co)
     h_t = o_t * T.tanh(c_t)
     return [c_t, h_t]
Example #10
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

        if has_input_gate:
            if has_forget_gate:
                c = f * c_ + i * c
            else:
                c = c_ + i*c
        else:
            if has_forget_gate:
                c = f*c_ + c
            else:
                c = c_ + c

        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        if has_output_gate:
            h = o * tensor.tanh(c)
        else:
            h = tensor.tanh(c)

        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c
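The m_[:, None] lines are the standard padding trick: samples whose mask is 0 at this timestep keep their previous state unchanged, so sequences of different lengths can share a batch. A self-contained demonstration of the broadcast:

import numpy
import theano
import theano.tensor as T

m = T.vector('m')          # (batch,) 1 for real tokens, 0 for padding
h_new = T.matrix('h_new')  # (batch, dim) freshly computed state
h_old = T.matrix('h_old')  # (batch, dim) state from the previous step
h = m[:, None] * h_new + (1. - m)[:, None] * h_old
f = theano.function([m, h_new, h_old], h, allow_input_downcast=True)

out = f(numpy.array([1., 0.]), numpy.ones((2, 3)), numpy.zeros((2, 3)))
# row 0 (mask 1) takes h_new; row 1 (mask 0) keeps h_old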
Example #11
  def step(self, i_t, x_t, z_t, att_p, y_p, c_p, *other_args):
    # See Unit.scan() for seqs.
    # args: seqs (x_t = unit.xc, z_t, i_t), outputs (# unit.n_act, y_p, c_p, ...), non_seqs (none)
    other_outputs = []
    #att_p = theano.printing.Print('att in lstms', attrs=['__str__'])(att_p)
    if self.recurrent_transform:
      state_vars = other_args[:len(self.recurrent_transform.state_vars)]
      self.recurrent_transform.set_sorted_state_vars(state_vars)
      z_r, r_updates = self.recurrent_transform.step(y_p)
      z_t += z_r
      for v in self.recurrent_transform.get_sorted_state_vars():
        other_outputs += [r_updates[v]]
    maxatt = att_p.repeat(z_t.shape[1]).reshape((z_t.shape[0],z_t.shape[1]))#.dimshuffle(1,0)
    #maxatt = theano.printing.Print('maxatt',attrs=['__str__','shape'])(maxatt)
    z_t = T.switch(maxatt>0,z_t,z_t + T.dot(y_p, self.W_re))
    #z_t += T.dot(y_p, self.W_re)
    #z_t = theano.printing.Print('z_t lstms',attrs=['shape'])(z_t)

    partition = z_t.shape[1] // 4
    ingate = T.nnet.sigmoid(z_t[:,:partition])
    forgetgate = ((T.nnet.sigmoid(z_t[:,partition:2*partition])).T * (1.-att_p)).T
    outgate = T.nnet.sigmoid(z_t[:,2*partition:3*partition])
    input = T.tanh(z_t[:,3*partition:4*partition])
    #c_t = ((forgetgate * c_p + ingate * input).T * (1.-T.max(att_p,axis=-1))).T
    c_t = forgetgate * c_p + ingate * input
    y_t = outgate * T.tanh(c_t)
    i_output = T.outer(i_t, self.o_output)
    i_h = T.outer(i_t, self.o_h)
    # return: next outputs (# unit.n_act, y_t, c_t, ...)
    return (y_t * i_output, c_t * i_h + c_p * (1 - i_h)) + tuple(other_outputs)
Example #12
    def step_fn(current_input_to_state, prev_c, prev_h):
        # all args have shape (batch size, output_dim, height)

        # TODO consider learning this padding
        prev_h_padded = T.zeros((batch_size, output_dim, 1+height), dtype=theano.config.floatX)
        prev_h_padded = T.inc_subtensor(prev_h_padded[:,:,1:], prev_h)

        state_to_state = lib.ops.conv1d.Conv1D(
            name+'.StateToState', 
            output_dim, 
            4*output_dim, 
            2, 
            prev_h_padded, 
            biases=False
        )

        gates = current_input_to_state + state_to_state

        o_f_i = T.nnet.sigmoid(gates[:,:3*output_dim,:])
        o = o_f_i[:,0*output_dim:1*output_dim,:]
        f = o_f_i[:,1*output_dim:2*output_dim,:]
        i = o_f_i[:,2*output_dim:3*output_dim,:]
        g = T.tanh(gates[:,3*output_dim:4*output_dim,:])

        new_c = (f * prev_c) + (i * g)
        new_h = o * T.tanh(new_c)

        return (new_c, new_h)
Example #13
        def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
            ''' Inner function encapsulating a propagation step
            This is how we calculated the hidden state in a simple RNN. No longer!
            s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))
            '''
            # Word embedding layer
            x_e = E[:,x_t]
            
            # GRU Layer 1
            z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
            r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev
            
            # GRU Layer 2
            z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
            r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev
            
            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

            return [o_t, s_t1, s_t2]
Example #14
File: rnn.py Project: dwf/pylearn2
    def fprop_step_mask(self, state_below, mask, state_before, U):
        """
        Scan function for case using masks

        Parameters
        ----------
        : todo
        state_below : TheanoTensor
        """

        # state_before packs the hidden and cell states side by side:
        # columns [:dim] hold h, columns [dim:] hold c
        g_on = state_below + tensor.dot(state_before[:, :self.dim], U)
        i_on = tensor.nnet.sigmoid(g_on[:, :self.dim])
        f_on = tensor.nnet.sigmoid(g_on[:, self.dim:2*self.dim])
        o_on = tensor.nnet.sigmoid(g_on[:, 2*self.dim:3*self.dim])

        z = tensor.set_subtensor(state_before[:, self.dim:],
                                 f_on * state_before[:, self.dim:] +
                                 i_on * tensor.tanh(g_on[:, 3*self.dim:]))
        z = tensor.set_subtensor(z[:, :self.dim],
                                 o_on * tensor.tanh(z[:, self.dim:]))

        # Only update the state for non-masked data, otherwise
        # just carry on the previous state until the end
        z = mask[:, None] * z + (1 - mask[:, None]) * state_before

        return z
Example #15
    def step(x_t, m, h_tm1, c_tm1, ctx_t, att, pctx_):
        projected_state = T.dot(h_tm1, Wd_att)
        pctx_ = T.tanh(pctx_ + projected_state[None, :, :])
        new_att = T.dot(pctx_, U_att) + c_att
        new_att = new_att.reshape([new_att.shape[0], new_att.shape[1]])
        new_att = T.exp(new_att) * context_mask
        new_att = new_att / new_att.sum(axis=0, keepdims=True)
        # Current context
        ctx_t = (context * new_att[:, :, None]).sum(axis=0)

        preactivation = T.dot(h_tm1, U)
        preactivation += x_t
        preactivation += T.dot(ctx_t, Wc)

        i_t = T.nnet.sigmoid(_slice(preactivation, 0, hidden_size))
        f_t = T.nnet.sigmoid(_slice(preactivation, 1, hidden_size))
        o_t = T.nnet.sigmoid(_slice(preactivation, 2, hidden_size))
        c_t = T.tanh(_slice(preactivation, 3, hidden_size))

        c_t = f_t * c_tm1 + i_t * c_t
        c_t = m[:, None] * c_t + (1. - m)[:, None] * c_tm1
        h_t = o_t * T.tanh(c_t)
        h_t = m[:, None] * h_t + (1. - m)[:, None] * h_tm1
        return (h_t, c_t, ctx_t, new_att.T, projected_state,
                i_t, f_t, o_t, preactivation)
Example #16
    def step(x_t, m_t, att_i_t,
             h_tm1, ctx_tm1, att_w_tm1,
             proj_hid_att, conc_hidden, U, W, W_cth, W_ctc, Ws_att,
             Wp_att, bp_att, Wc_att, Urz, hidden_mask):
        att_s = tensor.dot(h_tm1, Ws_att)
        att = proj_hid_att + att_s[None, :, :]
        att += att_i_t
        att = tensor.tanh(att)
        att_w_t = tensor.dot(att, Wp_att) + bp_att
        att_w_t = att_w_t.reshape((att_w_t.shape[0], att_w_t.shape[1]))  # drop the trailing unit dim left by the scalar projection
        att_w_t_max = (att_w_t * hidden_mask).max(axis=0, keepdims=True)
        att_w_t = tensor.exp(att_w_t - att_w_t_max)
        att_w_t = hidden_mask * att_w_t
        att_w_t = att_w_t / att_w_t.sum(axis=0, keepdims=True)
        ctx_t = (conc_hidden * att_w_t[:, :, None]).sum(axis=0)

        projected_state = tensor.dot(h_tm1, Urz)
        projected_state += tensor.dot(ctx_t, W_cth)

        r = tensor.nnet.sigmoid(_slice(x_t, 0) + _slice(projected_state, 0))
        z = tensor.nnet.sigmoid(_slice(x_t, 1) + _slice(projected_state, 1))
        candidate_h_t = tensor.tanh(_slice(x_t, 2) + r * tensor.dot(
            h_tm1, U) + tensor.dot(ctx_t, W_ctc))

        h_ti = z * h_tm1 + (1. - z) * candidate_h_t
        h_t = m_t[:, None] * h_ti + (1 - m_t)[:, None] * h_tm1
        return h_t, ctx_t, att_w_t.T
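Unlike Examples #1 and #15, this step subtracts the per-column maximum before exponentiating, which keeps the exponentials from overflowing. The same numerically stable masked softmax, pulled out as a self-contained sketch over the source axis:

import theano.tensor as T

def masked_softmax(scores, mask):
    # scores, mask: (src_len, batch); normalize over axis 0 (source positions)
    scores = scores - (scores * mask).max(axis=0, keepdims=True)
    e = T.exp(scores) * mask       # mask *before* normalizing
    return e / e.sum(axis=0, keepdims=True)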
Example #17
        def pass_edges(input_idx_t, edge_t, edge_mask_t, counter_t, h_tm1, c_tm1, x):
            h_t = h_tm1
            c_t = c_tm1
            # select the input vector to use for this edge (source)
            x_t_i = x[input_idx_t, :]
            # zero out the input unless this is a leaf node
            x_t_0 = T.switch(T.eq(T.sum(edge_mask_t), 0), x_t_i, x_t_i*0)
            # concatenate with the input edge vector
            x_t_edge = T.concatenate([x_t_0, edge_t])

            # compute attention weights, using a manual softmax
            attention_scores = T.dot(self.v_a, T.tanh(T.dot(self.W_h_a, h_tm1))) # (1, n_edges)
            # find the max of the unmasked values
            max_score = T.max(attention_scores + edge_mask_t * 10000.0) - 10000.0
            # exponentiate the differences, masking first to avoid inf, and then to keep only relevant scores
            exp_scores = T.exp((attention_scores - max_score) * edge_mask_t) * edge_mask_t
            # take the sum, and add one if the mask is all zeros to avoid an inf
            exp_scores_sum = T.sum(exp_scores) + T.switch(T.eq(T.sum(edge_mask_t), 0), 1.0, 0.0)
            # normalize to compute the weights
            weighted_mask = exp_scores / exp_scores_sum

            i_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_i) + T.sum(T.dot(self.W_h_i.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_i)
            f_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_f) + T.sum(T.dot(self.W_h_f.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_f)
            o_t = T.nnet.sigmoid(T.dot(x_t_edge, self.W_x_o) + T.sum(T.dot(self.W_h_o.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_o)
            u_t = T.tanh(T.dot(x_t_edge, self.W_x_u) + T.sum(T.dot(self.W_h_u.T, (weighted_mask * h_tm1)).T, axis=0) + self.b_h_u)

            c_temp = i_t * u_t + f_t * T.sum((weighted_mask * c_tm1).T, axis=0)
            h_temp = o_t * T.tanh(c_temp)

            h_t = T.set_subtensor(h_t[:, counter_t], h_temp)
            c_t = T.set_subtensor(c_t[:, counter_t], c_temp)
            return h_t, c_t
Example #18
    def _step_slice(m_, x_, xx_, xc_, h_, ctx_, alpha_, pctx_, cc_,
                    U, Wc, Wd_att, U_att, c_tt, Ux, Wcx):
        # attention
        pstate_ = tensor.dot(h_, Wd_att)
        pctx__ = pctx_ + pstate_[None,:,:] 
        pctx__ += xc_
        pctx__ = tensor.tanh(pctx__)
        alpha = tensor.dot(pctx__, U_att)+c_tt
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha)
        if context_mask:
            alpha = alpha * context_mask
        alpha = alpha / alpha.sum(0, keepdims=True)
        ctx_ = (cc_ * alpha[:,:,None]).sum(0) # current context

        preact = tensor.dot(h_, U)
        preact += x_
        preact += tensor.dot(ctx_, Wc)
        preact = tensor.nnet.sigmoid(preact)

        r = _slice(preact, 0, dim)
        u = _slice(preact, 1, dim)

        preactx = tensor.dot(h_, Ux)
        preactx *= r
        preactx += xx_
        preactx += tensor.dot(ctx_, Wcx)

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        return h, ctx_, alpha.T #, pstate_, preact, preactx, r, u
Example #19
    def _step(self, m_, x_, h_, c_):

        i_preact = (index_dot(x_, self.W_i) +
                    T.dot(h_, self.U_i) + self.b_i)
        i = T.nnet.sigmoid(i_preact)

        f_preact = (index_dot(x_, self.W_f) +
                    T.dot(h_, self.U_f) + self.b_f)
        f = T.nnet.sigmoid(f_preact)

        o_preact = (index_dot(x_, self.W_o) +
                    T.dot(h_, self.U_o) + self.b_o)
        o = T.nnet.sigmoid(o_preact)

        c_preact = (index_dot(x_, self.W_c) +
                    T.dot(h_, self.U_c) + self.b_c)
        c = T.tanh(c_preact)

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c
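index_dot is not defined in this snippet. Given that x_ looks like a batch of integer symbol indices, a plausible definition (an assumption, not confirmed by the source) is a row lookup equivalent to a one-hot dot product:

def index_dot(indices, w):
    # hypothetical helper: one_hot(indices).dot(w), done as a row lookup
    return w[indices.flatten()]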
Example #20
        def _step(c, c_m, hidden, c_matrix):
            node_idx = c[:, 0]
            left_child_idx = c[:, 1]
            right_child_idx = c[:, 2]

            all_samples = T.arange(n_samples)
            recursive = (
                T.dot(hidden[left_child_idx, all_samples, :], self.W)
                + T.dot(hidden[right_child_idx, all_samples, :], self.U)
                + self.b
            )

            i = T.nnet.sigmoid(_slice(recursive, 0, self.dim_proj))
            f1 = T.nnet.sigmoid(_slice(recursive, 1, self.dim_proj))
            f2 = T.nnet.sigmoid(_slice(recursive, 2, self.dim_proj))
            o = T.nnet.sigmoid(_slice(recursive, 3, self.dim_proj))
            c_prime = T.tanh(_slice(recursive, 4, self.dim_proj))

            new_c = (
                i * c_prime
                + f1 * c_matrix[left_child_idx, all_samples, :]
                + f2 * c_matrix[right_child_idx, all_samples, :]
            )

            new_c_masked = c_m[:, None] * new_c + (1.0 - c_m[:, None]) * c_matrix[node_idx, all_samples, :]

            new_h = o * T.tanh(new_c_masked)
            new_h_masked = c_m[:, None] * new_h + (1.0 - c_m[:, None]) * hidden[node_idx, all_samples, :]

            return (
                T.set_subtensor(hidden[node_idx, all_samples], new_h_masked),
                T.set_subtensor(c_matrix[node_idx, all_samples], new_c_masked),
            )
Example #21
        def recurrence( sample_z_t, sample_x_t, h_tm1_enc, h_tm1_dec, c_tm1_enc, c_tm1_dec,  mu_z_t,  sigma_z_t, mu_x_tm1, sigma_x_tm1,  v):
            if v is not None:
                v_hat = v -  ( mu_x_tm1 + (sigma_x_tm1 * sample_x_t.reshape((batch_size, n_visible)) ) )#error input
                r_t = T.concatenate( [v , v_hat], axis = 1 ) 
            else:
                v_hat = mu_x_tm1 -  ( mu_x_tm1 + (sigma_x_tm1 * sample_x_t.reshape((batch_size, n_visible)) ) )#error input
                r_t = T.concatenate( [mu_x_tm1 , v_hat], axis = 1 ) 
            # v_enc = [r_t, h_tm1_dec]
            v_enc = T.concatenate( [r_t, h_tm1_dec] , axis = 1)
        
            #Generate h_t_enc = RNN_enc(h_tm1_enc, v_enc)
            i_t_enc = T.nnet.sigmoid(bi_enc + T.dot(c_tm1_enc, Wci_enc) + T.dot(h_tm1_enc, Whi_enc) + T.dot(v_enc, Wvi_enc))
            f_t_enc = T.nnet.sigmoid(bf_enc + T.dot(c_tm1_enc, Wcf_enc) + T.dot(h_tm1_enc, Whf_enc) + T.dot(v_enc, Wvf_enc))
            c_t_enc = (f_t_enc * c_tm1_enc) + ( i_t_enc * T.tanh( T.dot(v_enc, Wvc_enc) + T.dot( h_tm1_enc, Whc_enc) + bc_enc ))
            o_t_enc = T.nnet.sigmoid(bo_enc + T.dot(c_t_enc, Wco_enc) + T.dot(h_tm1_enc, Who_enc) + T.dot(v_enc, Wvo_enc))
            h_t_enc = o_t_enc * T.tanh( c_t_enc )
        
            # Get z_t
            mu_z_t = T.dot(h_t_enc, Wh_enc_mu_z ) + b_mu_z
            sigma_z_t = sigma_b + T.nnet.softplus(T.dot(h_t_enc, Wh_enc_sig_z ) + b_sig_z)
            #sample =  theano_rng.normal(size=mew_t.shape, avg = 0, std = 1, dtype=theano.config.floatX)
            z_t = mu_z_t + (sigma_z_t * (sample_z_t.reshape((batch_size,n_z))) ) 
            # Generate h_t_dec = RNN_dec(h_tm1_dec, z_t) 
            i_t_dec = T.nnet.sigmoid(bi_dec + T.dot(c_tm1_dec, Wci_dec) + T.dot(h_tm1_dec, Whi_dec) + T.dot(z_t, Wzi_dec))
            f_t_dec = T.nnet.sigmoid(bf_dec + T.dot(c_tm1_dec, Wcf_dec) + T.dot(h_tm1_dec, Whf_dec) + T.dot(z_t , Wzf_dec))
            c_t_dec = (f_t_dec * c_tm1_dec) + ( i_t_dec * T.tanh( T.dot(z_t, Wzc_dec) + T.dot( h_tm1_dec, Whc_dec) + bc_dec ))
            o_t_dec = T.nnet.sigmoid(bo_dec + T.dot(c_t_dec, Wco_dec) + T.dot(h_tm1_dec, Who_dec) + T.dot(z_t, Wzo_dec))
            h_t_dec = o_t_dec * T.tanh( c_t_dec )

            # Get w_t
            mu_x_t = mu_x_tm1 + T.dot(h_t_dec, Wh_dec_mu_x) + b_mu_x
            sigma_x_t = sigma_b +  T.nnet.softplus(T.dot(h_t_dec, Wh_dec_sig_x) + b_sig_x)

            return [ h_t_enc, h_t_dec, c_t_enc, c_t_dec,  mu_z_t, sigma_z_t,  mu_x_t, sigma_x_t]
Example #22
	def step(x,prev_h,prev_c):
		input_gate = T.nnet.sigmoid(
				T.dot(x,P.W_input_in) +\
				T.dot(prev_h,P.W_hidden_in) +\
				T.dot(prev_c,P.W_cell_in) +\
				P.b_in
			)

		forget_gate = T.nnet.sigmoid(
				T.dot(x,P.W_input_forget) +\
				T.dot(prev_h,P.W_hidden_forget) +\
				T.dot(prev_c,P.W_cell_forget) +\
				P.b_forget
			)

		curr_c = forget_gate * prev_c + input_gate * T.tanh(
				T.dot(x,P.W_input_cell) +\
				T.dot(prev_h,P.W_hidden_cell) +\
				P.b_cell
			)

		output_gate = T.nnet.sigmoid(
				T.dot(x,P.W_input_output) +\
				T.dot(prev_h,P.W_hidden_output) +\
				T.dot(curr_c,P.W_cell_output) +\
				P.b_output
			)
		
		curr_h = output_gate * T.tanh(curr_c)

		return curr_h,curr_c
Example #23
    def _step(x_, xb_, h_, c_, hb_, cb_):
        preact = T.dot(h_, tparams[_p(prefix, 'U')])
        preact += T.dot(x_, tparams[_p(prefix, 'W')])
        preact += tparams[_p(prefix, 'b')]

        i = T.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = T.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = T.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = T.tanh(_slice(preact, 3, options['dim_proj']))

        c = f * c_ + i * c
        h = o * T.tanh(c)
        
        preactb = T.dot(hb_, tparams[_p(prefix, 'Ub')])
        preactb += T.dot(xb_, tparams[_p(prefix, 'Wb')])
        preactb += tparams[_p(prefix, 'bb')]

        ib = T.nnet.sigmoid(_slice(preactb, 0, options['dim_proj']))
        fb = T.nnet.sigmoid(_slice(preactb, 1, options['dim_proj']))
        ob = T.nnet.sigmoid(_slice(preactb, 2, options['dim_proj']))
        cb = T.tanh(_slice(preactb, 3, options['dim_proj']))

        cb = fb * cb_ + ib * cb
        hb = ob * T.tanh(cb)
        
        # take the reverse of hb and concatenate with h before feeding into logistic regression
        hhb = T.concatenate([h,hb[::-1]])
        # a single frame prediction given h - the posterior probablity
        one_pred = T.nnet.softmax(T.dot(hhb, tparams['U']) + tparams['b'])
        
        return h, c, hb, cb, one_pred
Example #24
 def get_ht_ct(self, xWxi_t, xWxf_t, xWxc_t, xWxo_t, h_t1, c_t1):
     i_t = T.nnet.sigmoid(xWxi_t + h_t1.dot(self.Whi) + c_t1.dot(self.Wci) + self.bi)
     f_t = T.nnet.sigmoid(xWxf_t + h_t1.dot(self.Whf) + c_t1.dot(self.Wcf) + self.bf)
     c_t = f_t * c_t1 + i_t * T.tanh(xWxc_t + h_t1.dot(self.Whc) + self.bc)
     o_t = T.nnet.sigmoid(xWxo_t + h_t1.dot(self.Who) + c_t.dot(self.Wco) + self.bo)
     h_t = o_t * T.tanh(c_t)
     return h_t, c_t
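The xWxi_t, xWxf_t, ... arguments suggest that the input-to-hidden products are precomputed over the whole sequence, one large matrix multiply per weight, and fed to scan as sequences; this is a common Theano optimization. A minimal sketch of that pattern, with illustrative names and the peephole terms omitted:

import numpy
import theano
import theano.tensor as T

n_in, n_hid = 8, 16
rng = numpy.random.RandomState(0)
mk = lambda *shape: theano.shared(
    rng.randn(*shape).astype(theano.config.floatX))
Wxi, Wxf, Wxc, Wxo = [mk(n_in, n_hid) for _ in range(4)]
Whi, Whf, Whc, Who = [mk(n_hid, n_hid) for _ in range(4)]
bi, bf, bc, bo = [mk(n_hid) for _ in range(4)]

def get_ht_ct(xWxi_t, xWxf_t, xWxc_t, xWxo_t, h_t1, c_t1):
    i_t = T.nnet.sigmoid(xWxi_t + h_t1.dot(Whi) + bi)
    f_t = T.nnet.sigmoid(xWxf_t + h_t1.dot(Whf) + bf)
    c_t = f_t * c_t1 + i_t * T.tanh(xWxc_t + h_t1.dot(Whc) + bc)
    o_t = T.nnet.sigmoid(xWxo_t + h_t1.dot(Who) + bo)
    return o_t * T.tanh(c_t), c_t

X = T.tensor3('X')  # (time, batch, n_in)
# four big products up front instead of four small ones per scan iteration
seqs = [T.dot(X, W) for W in (Wxi, Wxf, Wxc, Wxo)]
init = [T.zeros((X.shape[1], n_hid), dtype=theano.config.floatX)] * 2
(h, c), _ = theano.scan(get_ht_ct, sequences=seqs, outputs_info=init)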
Example #25
        def forward_step(x_t, prev_state, prev_content, prev_state_2, prev_content_2):
            input_gate = T.nnet.hard_sigmoid(T.dot((self.U_input), x_t) + T.dot(self.W_input, prev_state) + self.bias_input)
            forget_gate = T.nnet.hard_sigmoid(
                T.dot((self.U_forget), x_t) + T.dot(self.W_forget, prev_state) + self.bias_forget)
            output_gate = T.nnet.hard_sigmoid(
                T.dot((self.U_output), x_t) + T.dot(self.W_output, prev_state) + self.bias_output)

            stabilized_input = T.tanh(T.dot((self.U), x_t) + T.dot(self.W, prev_state) + self.bias)
            c = forget_gate * prev_content + input_gate * stabilized_input
            s1 = output_gate * T.tanh(c)

            input_gate_2 = T.nnet.hard_sigmoid(
                T.dot((self.U_input_2), s1) + T.dot(self.W_input_2, prev_state_2) + self.bias_input_2)
            forget_gate_2 = T.nnet.hard_sigmoid(
                T.dot((self.U_forget_2), s1) + T.dot(self.W_forget_2, prev_state_2) + self.bias_forget_2)
            output_gate_2 = T.nnet.hard_sigmoid(
                T.dot((self.U_output_2), s1) + T.dot(self.W_output_2, prev_state_2) + self.bias_output_2)

            stabilized_input_2 = T.tanh(T.dot((self.U_2), s1) + T.dot(self.W_2, prev_state_2) + self.bias_2)

            c2 = forget_gate_2 * prev_content_2 + input_gate_2 * stabilized_input_2

            s2 = output_gate_2 * T.tanh(c2)
            o = T.nnet.sigmoid(T.dot(self.O_w, s2) + self.O_bias)

            return [o, s1, c, s2, c2, input_gate, forget_gate, output_gate]
Example #26
def convolutional_model(X, w_1, w_2, w_3, w_4, w_5, w_6, p_1, p_2, p_3, p_4, p_5):
    l1 = dropout(T.tanh( max_pool_2d(T.maximum(conv2d(X, w_1, border_mode='full'),0.), (2, 2),ignore_border=True) + b_1.dimshuffle('x', 0, 'x', 'x') ), p_1)
    l2 = dropout(T.tanh( max_pool_2d(T.maximum(conv2d(l1, w_2), 0.), (2, 2),ignore_border=True) + b_2.dimshuffle('x', 0, 'x', 'x') ), p_2)
    l3 = dropout(T.flatten(T.tanh( max_pool_2d(T.maximum(conv2d(l2, w_3), 0.), (2, 2),ignore_border=True) + b_3.dimshuffle('x', 0, 'x', 'x') ), outdim=2), p_3)# flatten to switch back to 1d layers
    l4 = dropout(T.maximum(T.dot(l3, w_4), 0.), p_4)
    l5 = dropout(T.maximum(T.dot(l4, w_5), 0.), p_5)
    return T.dot(l5, w_6)
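convolutional_model leans on names from its enclosing scope (the biases b_1..b_3, a dropout helper) and on imports it does not show. In Theano code of this vintage those would typically be one of the following, depending on the release (an assumption, since the snippet's own imports are not included):

from theano.tensor.nnet import conv2d
from theano.tensor.signal.pool import pool_2d as max_pool_2d
# older releases used:
#   from theano.tensor.nnet.conv import conv2d
#   from theano.tensor.signal.downsample import max_pool_2d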
Example #27
    def _step(self, y_tm1, s_tm1, h):
        # attention: here the tanh-ed scores are exponentiated directly,
        # with no separate projection down to scalar energies
        pctx__ = T.dot(h, self.W_ha) + T.dot(s_tm1, self.W_sa)
        #pctx__ += T.dot(y_t, self.W_yc)
        e = T.exp(T.tanh(pctx__))
        #e = T.dot(pctx__, self.U_z)
        e = e / e.sum(0, keepdims=True)

        c = T.dot(e.T, h)
        #c = (h * e[:, :, None]).sum(0)

        # note: unlike Example #1, the z and r gates here use tanh rather
        # than the conventional sigmoid
        z = T.tanh(T.dot(y_tm1, self.W_z) + self.b_z + T.dot(s_tm1, self.U_z) + T.dot(c, self.W_cs))
        r = T.tanh(T.dot(y_tm1, self.W_r) + self.b_r + T.dot(s_tm1, self.U_r) + T.dot(c, self.W_cs))
        hh_t = T.tanh(T.dot(y_tm1, self.W_h) + self.b_h + T.dot(r * s_tm1, self.U_h) + T.dot(c, self.W_cy))
        s_t = z * s_tm1 + (1 - z) * hh_t

        logit = T.tanh(T.dot(s_t, self.W_hl) + T.dot(y_tm1, self.W_yl) + T.dot(c, self.W_cl))

        return T.cast(s_t, dtype=theano.config.floatX), logit
Example #28
    def theano_setup(self):
    
        # The matrices Wb and Wc were originally tied.
        # Because of that, I decided to keep Wb and Wc with
        # the same shape (instead of being transposed) to
        # avoid disturbing the code as much as possible.

        Wb = T.dmatrix('Wb')
        Wc = T.dmatrix('Wc')
        b = T.dvector('b')
        c = T.dvector('c')
        s = T.dscalar('s')
        x = T.dmatrix('x')
    
        h_act = T.dot(x, Wc) + c
        if self.act_func[0] == 'tanh':
            h = T.tanh(h_act)
        elif self.act_func[0] == 'sigmoid':
            h = T.nnet.sigmoid(h_act)
        elif self.act_func[0] == 'id':
            # bad idea
            h = h_act
        else:
            raise("Invalid act_func[0]")

        r_act = T.dot(h, Wb.T) + b
        if self.act_func[1] == 'tanh':
            r = s * T.tanh(r_act)
        elif self.act_func[1] == 'sigmoid':
            r = s * T.nnet.sigmoid(r_act)
        elif self.act_func[1] == 'id':
            r = s * r_act
        else:
            raise("Invalid act_func[1]")


        # Another variable to be able to call a function
        # with a noisy x and compare it to a reference x.
        y = T.dmatrix('y')

        loss = ((r - y)**2)
        sum_loss = T.sum(loss)
        
        # theano_encode_decode : vectorial function in argument X.
        # theano_loss : vectorial function in argument X.
        # theano_gradients : returns triplet of gradients, each of
        #                    which involves the all data X summed
        #                    so it's not a "vectorial" function.

        self.theano_encode_decode = function([Wb,Wc,b,c,s,x], r)
        self.theano_loss = function([Wb,Wc,b,c,s,x,y], loss)

        self.theano_gradients = function([Wb,Wc,b,c,s,x,y],
                                         [T.grad(sum_loss, Wb), T.grad(sum_loss, Wc),
                                          T.grad(sum_loss, b),  T.grad(sum_loss, c),
                                          T.grad(sum_loss, s)])
        # other useful theano functions for the experiments that involve
        # adding noise to the hidden states
        self.theano_encode = function([Wc,c,x], h)
        self.theano_decode = function([Wb,b,s,h], r)
Example #29
    def step(x, prev_cell, prev_hidden):
        transformed_x = T.dot(x, P[name_W_input])

        x_i = transformed_x[0 * hidden_size : 1 * hidden_size]
        x_f = transformed_x[1 * hidden_size : 2 * hidden_size]
        x_c = transformed_x[2 * hidden_size : 3 * hidden_size]
        x_o = transformed_x[3 * hidden_size : 4 * hidden_size]

        transformed_hid = T.dot(prev_hidden, P[name_W_hidden])
        h_i = transformed_hid[0 * hidden_size : 1 * hidden_size]
        h_f = transformed_hid[1 * hidden_size : 2 * hidden_size]
        h_c = transformed_hid[2 * hidden_size : 3 * hidden_size]
        h_o = transformed_hid[3 * hidden_size : 4 * hidden_size]

        transformed_cell = T.dot(prev_cell, V_if)
        c_i = transformed_cell[0 * hidden_size : 1 * hidden_size]
        c_f = transformed_cell[1 * hidden_size : 2 * hidden_size]

        in_lin = x_i + h_i + b_i + c_i
        forget_lin = x_f + h_f + b_f + c_f
        cell_lin = x_c + h_c + b_c

        in_gate = T.nnet.sigmoid(in_lin)
        forget_gate = T.nnet.sigmoid(forget_lin)
        cell_updates = T.tanh(cell_lin)

        cell = forget_gate * prev_cell + in_gate * cell_updates

        out_lin = x_o + h_o + b_o + T.dot(cell, V_o)
        out_gate = T.nnet.sigmoid(out_lin)

        hid = out_gate * T.tanh(cell)
        return cell, hid
Example #30
 def b_step_lstm(x_t, h_tm1, c_tm1):
     i_t = T.nnet.sigmoid(T.dot(x_t, self.W_xi_b) + T.dot(h_tm1, self.W_hi_b) + self.b_i_b)
     f_t = T.nnet.sigmoid(T.dot(x_t, self.W_xf_b) + T.dot(h_tm1, self.W_hf_b) + self.b_f_b)
     c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, self.W_xc_b) + T.dot(h_tm1, self.W_hc_b) + self.b_c_b)
     o_t = T.nnet.sigmoid(T.dot(x_t, self.W_xo_b) + T.dot(h_tm1, self.W_ho_b) + self.b_o_b)
     h_t = o_t * T.tanh(c_t)
     return [h_t, c_t]
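A backward step like b_step_lstm is normally run over the sequence in reverse. With theano.scan that is usually done via go_backwards=True; a minimal sketch under that assumption, with a toy tanh step standing in for the LSTM:

import numpy
import theano
import theano.tensor as T

n_in, n_hid = 8, 16
rng = numpy.random.RandomState(0)
W_x = theano.shared(rng.randn(n_in, n_hid).astype(theano.config.floatX))
W_h = theano.shared(rng.randn(n_hid, n_hid).astype(theano.config.floatX))

def b_step(x_t, h_tm1):
    return T.tanh(T.dot(x_t, W_x) + T.dot(h_tm1, W_h))

X = T.tensor3('X')  # (time, batch, n_in)
h0 = T.zeros((X.shape[1], n_hid), dtype=theano.config.floatX)
# go_backwards=True visits X last-to-first; outputs arrive in visiting
# order, so flip them to realign with forward time
h_back, _ = theano.scan(b_step, sequences=X, outputs_info=h0,
                        go_backwards=True)
h_back = h_back[::-1]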
Example #31
import theano.sandbox.rng_mrg as RNG_MRG

from utils import data_tools as data
from recurrent_gsn import generative_stochastic_network
import utils.logger as log
from utils.image_tiler import tile_raster_images
from utils.utils import cast32, logit, trunc, get_shared_weights, get_shared_bias, salt_and_pepper, \
    make_time_units_string

# Default values to use for SEN parameters
defaults = {  # gsn parameters
    "gsn_layers": 3,  # number of hidden layers to use
    "walkbacks":
    5,  # number of walkbacks (generally 2*layers) - need enough to have info from top layer propagate to visible layer
    "hidden_size": 1500,
    "hidden_activation": lambda x: T.tanh(x),
    "visible_activation": lambda x: T.nnet.sigmoid(x),
    "input_sampling": True,
    "MRG": RNG_MRG.MRG_RandomStreams(1),
    # recurrent parameters
    "recurrent_hidden_size": 1500,
    "recurrent_hidden_activation": lambda x: T.tanh(x),
    # sen parameters

    # training parameters
    "load_params": False,
    "cost_function": lambda x, y: T.mean(T.nnet.binary_crossentropy(x, y)),
    "n_epoch": 1000,
    "gsn_batch_size": 100,
    "batch_size": 200,
    "save_frequency": 10,
Example #32
def build_model(tparams, options):

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='float32')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    init_memory = None

    # word embedding (target)
    import scipy.io
    from sklearn.decomposition import PCA
    matlab_data = scipy.io.loadmat('corr_5.mat')
    correlations = matlab_data['corr_5']
    pca = PCA(n_components=options['dim_word'])
    pca.fit(correlations)
    correlations_reduced = pca.transform(correlations)
    n_clusters, dim_reduced = correlations_reduced.shape
    Wemb = numpy.zeros((n_clusters + 1, dim_reduced), dtype=numpy.float32)
    Wemb[1:, :] = numpy.array(correlations_reduced, dtype=numpy.float32)
    Wemb_tensor = tensor.constant(Wemb, dtype=numpy.float32)

    emb = Wemb_tensor[y.flatten()].reshape(
        [n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # decoder
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=y_mask,
                                            context=x.T,
                                            context_mask=x_mask.T,
                                            one_step=False,
                                            init_state=None,
                                            init_memory=init_memory)
    proj_h = proj[0]
    if options['decoder'].startswith('lstm'):
        ctxs = proj[2]
        alphas = proj[3]
    else:
        ctxs = proj[1]
        alphas = proj[2]

    proj_h = dropout_layer(proj_h)  # Drop out here

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams,
                                    proj_h,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_ctx)
    logit = dropout_layer(logit)  # Dropout here
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx] + 1e-8)
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)
    cost = cost.mean()

    return x, x_mask, y, y_mask, alphas, cost
Example #33
def tanh(x):
    return tensor.tanh(x)
Example #34
 def recurrent_fn( u_t, h_tm1, W_hh, W_ux, W_hy,b) :
     x_t = TT.dot(W_ux, u_t)
     h_t = TT.tanh( TT.dot(W_hh, h_tm1) + x_t + b)
     y_t = TT.dot(W_hy, h_t)
     return h_t, y_t
Example #35
 def tanh(self, X):
     return T.tanh(X)
Example #36
    def model(self, activation, RecurrentUnit):
        self.f = activation

        # embedding layer parameters
        we = init_weight(self.V, self.D)

        # hidden layer parameters
        self.hidden_layers = []
        Mi = self.D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        # attention layer parameters
        wa = init_weight(Mi, Mi)
        ba = np.zeros(Mi)
        ua = init_weight(Mi,1)

        self.Wa = theano.shared(wa)
        self.Ba = theano.shared(ba)
        self.Ua = theano.shared(ua)

        # output layer parameters
        wo = init_weight(Mi, self.O)
        bo = np.zeros(self.O)

        # shared variable
        self.We = theano.shared(we, name="Embedding weights")
        self.Wo = theano.shared(wo, name="Output weight")
        self.Bo = theano.shared(bo, name="Output Bias")
        self.params = [self.We, self.Wa, self.Ba, self.Ua, self.Wo, self.Bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        # input variables
        thx = T.ivector('X')
        thy = T.ivector('Y')
        thStartPoints = T.ivector('start_points')
        thEndPoints = T.ivector('end_points')

        # embedding layer computation
        Z = self.We[thx]                                    # size = [? x D]

        # rnn layer computation
        for ru in self.hidden_layers:
            Z = ru.output(Z, thStartPoints)                 # size = [? x H]

        # attention layer computation
        u = T.tanh(Z.dot(self.Wa) + self.Ba)                # size = [? x H]
        alpha = T.nnet.softmax(u.dot(self.Ua))              # size = [? x 1]        ( [? x H].dot([H x 1]) )
        c = T.repeat(alpha, Z.shape[1], axis=1) * Z         # size = [? x H]        ( [? x H]*[? x H] )

        # output layer computation
        py = T.nnet.softmax(c.dot(self.Wo) + self.Bo)       # size = [? x O]        ( [? x H].dot([H x O]) )
        py_x = py[thEndPoints, :]
        prediction = T.argmax(py_x, axis=1)

        self.predict_op = theano.function(
            inputs=[thx, thStartPoints, thEndPoints],
            outputs=prediction,
            allow_input_downcast=True
        )

        return thx, thy, thStartPoints, thEndPoints, py_x, prediction
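A side note on the attention lines above: alpha has symbolic shape [? x 1], but Theano does not flag that dimension as broadcastable, which is presumably why T.repeat is used. The repeated matrix never needs to be materialized, though:

c = T.addbroadcast(alpha, 1) * Z    # same result as the T.repeat version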
Example #37
    def __init__(self,
                 train_X=None,
                 train_Y=None,
                 valid_X=None,
                 valid_Y=None,
                 test_X=None,
                 test_Y=None,
                 args=None,
                 logger=None):
        # Output logger
        self.logger = logger
        self.outdir = args.get("output_path", defaults["output_path"])
        if self.outdir[-1] != '/':
            self.outdir = self.outdir + '/'
        # Input data
        self.train_X = train_X
        self.train_Y = train_Y
        self.valid_X = valid_X
        self.valid_Y = valid_Y
        self.test_X = test_X
        self.test_Y = test_Y

        # variables from the dataset that are used for initialization and image reconstruction
        if train_X is None:
            self.N_input = args.get("input_size")
            if args.get("input_size") is None:
                raise AssertionError(
                    "Please either specify input_size in the arguments or provide an example train_X for input dimensionality."
                )
        else:
            self.N_input = train_X.eval().shape[1]
        self.root_N_input = numpy.sqrt(self.N_input)

        self.is_image = args.get('is_image', defaults['is_image'])
        if self.is_image:
            self.image_width = args.get('width', self.root_N_input)
            self.image_height = args.get('height', self.root_N_input)

        #######################################
        # Network and training specifications #
        #######################################
        self.gsn_layers = args.get(
            'gsn_layers', defaults['gsn_layers'])  # number hidden layers
        self.walkbacks = args.get('walkbacks',
                                  defaults['walkbacks'])  # number of walkbacks
        self.learning_rate = theano.shared(
            cast32(args.get('learning_rate',
                            defaults['learning_rate'])))  # learning rate
        self.init_learn_rate = cast32(
            args.get('learning_rate', defaults['learning_rate']))
        self.momentum = theano.shared(
            cast32(args.get('momentum',
                            defaults['momentum'])))  # momentum term
        self.annealing = cast32(args.get(
            'annealing',
            defaults['annealing']))  # exponential annealing coefficient
        self.noise_annealing = cast32(
            args.get('noise_annealing', defaults['noise_annealing'])
        )  # exponential noise annealing coefficient
        self.batch_size = args.get('batch_size', defaults['batch_size'])
        self.gsn_batch_size = args.get('gsn_batch_size',
                                       defaults['gsn_batch_size'])
        self.n_epoch = args.get('n_epoch', defaults['n_epoch'])
        self.early_stop_threshold = args.get('early_stop_threshold',
                                             defaults['early_stop_threshold'])
        self.early_stop_length = args.get('early_stop_length',
                                          defaults['early_stop_length'])
        self.save_frequency = args.get('save_frequency',
                                       defaults['save_frequency'])

        self.noiseless_h1 = args.get('noiseless_h1', defaults["noiseless_h1"])
        self.hidden_add_noise_sigma = theano.shared(
            cast32(
                args.get('hidden_add_noise_sigma',
                         defaults["hidden_add_noise_sigma"])))
        self.input_salt_and_pepper = theano.shared(
            cast32(
                args.get('input_salt_and_pepper',
                         defaults["input_salt_and_pepper"])))
        self.input_sampling = args.get('input_sampling',
                                       defaults["input_sampling"])
        self.vis_init = args.get('vis_init', defaults['vis_init'])
        self.load_params = args.get('load_params', defaults['load_params'])
        self.hessian_free = args.get('hessian_free', defaults['hessian_free'])

        self.layer_sizes = [self.N_input] + [
            args.get('hidden_size', defaults['hidden_size'])
        ] * self.gsn_layers  # layer sizes, from h0 to hK (h0 is the visible layer)
        self.recurrent_hidden_size = args.get(
            'recurrent_hidden_size', defaults['recurrent_hidden_size'])
        self.top_layer_sizes = [self.recurrent_hidden_size] + [
            args.get('hidden_size', defaults['hidden_size'])
        ] * self.gsn_layers  # layer sizes, from h0 to hK (h0 is the visible layer)

        self.f_recon = None
        self.f_noise = None

        # Activation functions!
        # For the GSN:
        if args.get('hidden_activation') is not None:
            log.maybeLog(self.logger,
                         'Using specified activation for GSN hiddens')
            self.hidden_activation = args.get('hidden_activation')
        elif args.get('hidden_act') == 'sigmoid':
            log.maybeLog(self.logger,
                         'Using sigmoid activation for GSN hiddens')
            self.hidden_activation = T.nnet.sigmoid
        elif args.get('hidden_act') == 'rectifier':
            log.maybeLog(self.logger,
                         'Using rectifier activation for GSN hiddens')
            self.hidden_activation = lambda x: T.maximum(cast32(0), x)
        elif args.get('hidden_act') == 'tanh':
            log.maybeLog(
                self.logger,
                'Using hyperbolic tangent activation for GSN hiddens')
            self.hidden_activation = lambda x: T.tanh(x)
        elif args.get('hidden_act') is not None:
            log.maybeLog(
                self.logger,
                "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens"
                .format(args.get('hidden_act')))
            raise NotImplementedError(
                "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens"
                .format(args.get('hidden_act')))
        else:
            log.maybeLog(self.logger,
                         "Using default activation for GSN hiddens")
            self.hidden_activation = defaults['hidden_activation']
        # For the RNN:
        if args.get('recurrent_hidden_activation') is not None:
            log.maybeLog(self.logger,
                         'Using specified activation for RNN hiddens')
            self.recurrent_hidden_activation = args.get(
                'recurrent_hidden_activation')
        elif args.get('recurrent_hidden_act') == 'sigmoid':
            log.maybeLog(self.logger,
                         'Using sigmoid activation for RNN hiddens')
            self.recurrent_hidden_activation = T.nnet.sigmoid
        elif args.get('recurrent_hidden_act') == 'rectifier':
            log.maybeLog(self.logger,
                         'Using rectifier activation for RNN hiddens')
            self.recurrent_hidden_activation = lambda x: T.maximum(
                cast32(0), x)
        elif args.get('recurrent_hidden_act') == 'tanh':
            log.maybeLog(
                self.logger,
                'Using hyperbolic tangent activation for RNN hiddens')
            self.recurrent_hidden_activation = lambda x: T.tanh(x)
        elif args.get('recurrent_hidden_act') is not None:
            log.maybeLog(
                self.logger,
                "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens"
                .format(args.get('hidden_act')))
            raise NotImplementedError(
                "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens"
                .format(args.get('hidden_act')))
        else:
            log.maybeLog(self.logger,
                         "Using default activation for RNN hiddens")
            self.recurrent_hidden_activation = defaults[
                'recurrent_hidden_activation']
        # Visible layer activation
        if args.get('visible_activation') is not None:
            log.maybeLog(self.logger,
                         'Using specified activation for visible layer')
            self.visible_activation = args.get('visible_activation')
        elif args.get('visible_act') == 'sigmoid':
            log.maybeLog(self.logger,
                         'Using sigmoid activation for visible layer')
            self.visible_activation = T.nnet.sigmoid
        elif args.get('visible_act') == 'softmax':
            log.maybeLog(self.logger,
                         'Using softmax activation for visible layer')
            self.visible_activation = T.nnet.softmax
        elif args.get('visible_act') is not None:
            log.maybeLog(
                self.logger,
                "Did not recognize visible activation {0!s}, please use sigmoid or softmax"
                .format(args.get('visible_act')))
            raise NotImplementedError(
                "Did not recognize visible activation {0!s}, please use sigmoid or softmax"
                .format(args.get('visible_act')))
        else:
            log.maybeLog(self.logger,
                         'Using default activation for visible layer')
            self.visible_activation = defaults['visible_activation']

        # Cost function!
        if args.get('cost_function') is not None:
            log.maybeLog(self.logger,
                         '\nUsing specified cost function for GSN training\n')
            self.cost_function = args.get('cost_function')
        elif args.get('cost_funct') == 'binary_crossentropy':
            log.maybeLog(self.logger, '\nUsing binary cross-entropy cost!\n')
            self.cost_function = lambda x, y: T.mean(
                T.nnet.binary_crossentropy(x, y))
        elif args.get('cost_funct') == 'square':
            log.maybeLog(self.logger, "\nUsing square error cost!\n")
            #cost_function = lambda x,y: T.log(T.mean(T.sqr(x-y)))
            self.cost_function = lambda x, y: T.log(T.sum(T.pow((x - y), 2)))
        elif args.get('cost_funct') is not None:
            log.maybeLog(
                self.logger,
                "\nDid not recognize cost function {0!s}, please use binary_crossentropy or square\n"
                .format(args.get('cost_funct')))
            raise NotImplementedError(
                "Did not recognize cost function {0!s}, please use binary_crossentropy or square"
                .format(args.get('cost_funct')))
        else:
            log.maybeLog(self.logger,
                         '\nUsing default cost function for GSN training\n')
            self.cost_function = defaults['cost_function']

        ############################
        # Theano variables and RNG #
        ############################
        self.X = T.fmatrix('X')  #single (batch) for training gsn
        self.Xs = T.fmatrix('Xs')  #sequence for training rnn
        self.MRG = RNG_MRG.MRG_RandomStreams(1)

        ###############
        # Parameters! #
        ###############
        #visible gsn
        self.weights_list = [
            get_shared_weights(self.layer_sizes[i],
                               self.layer_sizes[i + 1],
                               name="W_{0!s}_{1!s}".format(i, i + 1))
            for i in range(self.gsn_layers)
        ]  # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out))
        self.bias_list = [
            get_shared_bias(self.layer_sizes[i], name='b_' + str(i))
            for i in range(self.gsn_layers + 1)
        ]  # initialize each layer to 0's.

        #recurrent
        self.recurrent_to_gsn_weights_list = [
            get_shared_weights(self.recurrent_hidden_size,
                               self.layer_sizes[layer],
                               name="W_u_h{0!s}".format(layer))
            for layer in range(self.gsn_layers + 1) if layer % 2 != 0
        ]
        self.W_u_u = get_shared_weights(self.recurrent_hidden_size,
                                        self.recurrent_hidden_size,
                                        name="W_u_u")
        self.W_ins_u = get_shared_weights(args.get('hidden_size',
                                                   defaults['hidden_size']),
                                          self.recurrent_hidden_size,
                                          name="W_ins_u")
        self.recurrent_bias = get_shared_bias(self.recurrent_hidden_size,
                                              name='b_u')

        #top layer gsn
        self.top_weights_list = [
            get_shared_weights(self.top_layer_sizes[i],
                               self.top_layer_sizes[i + 1],
                               name="Wtop_{0!s}_{1!s}".format(i, i + 1))
            for i in range(self.gsn_layers)
        ]  # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out))
        self.top_bias_list = [
            get_shared_bias(self.top_layer_sizes[i], name='btop_' + str(i))
            for i in range(self.gsn_layers + 1)
        ]  # initialize each layer to 0's.

        #lists for use with gradients
        self.gsn_params = self.weights_list + self.bias_list
        self.u_params = [self.W_u_u, self.W_ins_u, self.recurrent_bias]
        self.top_params = self.top_weights_list + self.top_bias_list
        self.params = self.gsn_params + self.recurrent_to_gsn_weights_list + self.u_params + self.top_params

        ###################################################
        #          load initial parameters                #
        ###################################################
        if self.load_params:
            params_to_load = 'gsn_params.pkl'
            log.maybeLog(self.logger, "\nLoading existing GSN parameters\n")
            loaded_params = cPickle.load(open(params_to_load, 'rb'))
            for lp, p in zip(loaded_params[:len(self.weights_list)],
                             self.weights_list):
                p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(loaded_params[len(self.weights_list):],
                             self.bias_list):
                p.set_value(lp.get_value(borrow=False))

            params_to_load = 'rnn_params.pkl'
            log.maybeLog(self.logger, "\nLoading existing RNN parameters\n")
            loaded_params = cPickle.load(open(params_to_load, 'rb'))
            n_rec = len(self.recurrent_to_gsn_weights_list)
            for lp, p in zip(loaded_params[:n_rec],
                             self.recurrent_to_gsn_weights_list):
                p.set_value(lp.get_value(borrow=False))
            # W_u_u, W_ins_u, and the recurrent bias are single shared
            # variables, so wrap them in lists before zipping.
            for lp, p in zip(loaded_params[n_rec:n_rec + 1], [self.W_u_u]):
                p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(loaded_params[n_rec + 1:n_rec + 2],
                             [self.W_ins_u]):
                p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(loaded_params[n_rec + 2:],
                             [self.recurrent_bias]):
                p.set_value(lp.get_value(borrow=False))

            params_to_load = 'top_gsn_params.pkl'
            log.maybeLog(self.logger,
                         "\nLoading existing top level GSN parameters\n")
            loaded_params = cPickle.load(open(params_to_load, 'rb'))
            for lp, p in zip(loaded_params[:len(self.top_weights_list)],
                             self.top_weights_list):
                p.set_value(lp.get_value(borrow=False))
            for lp, p in zip(loaded_params[len(self.top_weights_list):],
                             self.top_bias_list):
                p.set_value(lp.get_value(borrow=False))

        self.gsn_args = {
            'weights_list': self.weights_list,
            'bias_list': self.bias_list,
            'hidden_activation': self.hidden_activation,
            'visible_activation': self.visible_activation,
            'cost_function': self.cost_function,
            'layers': self.gsn_layers,
            'walkbacks': self.walkbacks,
            'hidden_size': args.get('hidden_size', defaults['hidden_size']),
            'learning_rate': args.get('learning_rate',
                                      defaults['learning_rate']),
            'momentum': args.get('momentum', defaults['momentum']),
            'annealing': self.annealing,
            'noise_annealing': self.noise_annealing,
            'batch_size': self.gsn_batch_size,
            'n_epoch': self.n_epoch,
            'early_stop_threshold': self.early_stop_threshold,
            'early_stop_length': self.early_stop_length,
            'save_frequency': self.save_frequency,
            'noiseless_h1': self.noiseless_h1,
            'hidden_add_noise_sigma': args.get(
                'hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']),
            'input_salt_and_pepper': args.get(
                'input_salt_and_pepper', defaults['input_salt_and_pepper']),
            'input_sampling': self.input_sampling,
            'vis_init': self.vis_init,
            'output_path': self.outdir + 'gsn/',
            'is_image': self.is_image,
            'input_size': self.N_input
        }

        self.top_gsn_args = {
            'weights_list': self.top_weights_list,
            'bias_list': self.top_bias_list,
            'hidden_activation': self.hidden_activation,
            'visible_activation': self.recurrent_hidden_activation,
            'cost_function': self.cost_function,
            'layers': self.gsn_layers,
            'walkbacks': self.walkbacks,
            'hidden_size': args.get('hidden_size', defaults['hidden_size']),
            'learning_rate': args.get('learning_rate',
                                      defaults['learning_rate']),
            'momentum': args.get('momentum', defaults['momentum']),
            'annealing': self.annealing,
            'noise_annealing': self.noise_annealing,
            'batch_size': self.gsn_batch_size,
            'n_epoch': self.n_epoch,
            'early_stop_threshold': self.early_stop_threshold,
            'early_stop_length': self.early_stop_length,
            'save_frequency': self.save_frequency,
            'noiseless_h1': self.noiseless_h1,
            'hidden_add_noise_sigma': args.get(
                'hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']),
            'input_salt_and_pepper': args.get(
                'input_salt_and_pepper', defaults['input_salt_and_pepper']),
            'input_sampling': self.input_sampling,
            'vis_init': self.vis_init,
            'output_path': self.outdir + 'top_gsn/',
            'is_image': False,
            'input_size': self.recurrent_hidden_size
        }

        ############
        # Sampling #
        ############
        # the input to the sampling function
        X_sample = T.fmatrix("X_sampling")
        self.network_state_input = [X_sample] + [
            T.fmatrix("H_sampling_" + str(i + 1))
            for i in range(self.gsn_layers)
        ]

        # "Output" state of the network (noisy)
        # initialized with input, then we apply updates
        self.network_state_output = [X_sample] + self.network_state_input[1:]
        visible_pX_chain = []

        # ONE update
        log.maybeLog(self.logger,
                     "Performing one walkback in network state sampling.")
        generative_stochastic_network.update_layers(
            self.network_state_output, self.weights_list, self.bias_list,
            visible_pX_chain, True, self.noiseless_h1,
            self.hidden_add_noise_sigma, self.input_salt_and_pepper,
            self.input_sampling, self.MRG, self.visible_activation,
            self.hidden_activation, self.logger)

        ##############################################
        #        Build the graphs for the SEN        #
        ##############################################
        # If `x_t` is given, this is a deterministic recurrence computing u_t; otherwise x_t must be generated first.
        def recurrent_step(x_t, u_tm1, add_noise):
            # Make current guess for hiddens based on U
            for i in range(self.gsn_layers):
                if i % 2 == 0:
                    log.maybeLog(
                        self.logger, "Using {0!s} and {1!s}".format(
                            self.recurrent_to_gsn_weights_list[(i + 1) // 2],
                            self.bias_list[i + 1]))
            h_t = T.concatenate([
                self.hidden_activation(self.bias_list[i + 1] + T.dot(
                    u_tm1, self.recurrent_to_gsn_weights_list[(i + 1) // 2]))
                for i in range(self.gsn_layers) if i % 2 == 0
            ],
                                axis=0)

            # Make a GSN to update U
            _, hs = generative_stochastic_network.build_gsn(
                x_t, self.weights_list, self.bias_list, add_noise,
                self.noiseless_h1, self.hidden_add_noise_sigma,
                self.input_salt_and_pepper, self.input_sampling, self.MRG,
                self.visible_activation, self.hidden_activation,
                self.walkbacks, self.logger)
            htop_t = hs[-1]
            ins_t = htop_t

            ua_t = T.dot(ins_t, self.W_ins_u) + T.dot(
                u_tm1, self.W_u_u) + self.recurrent_bias
            u_t = self.recurrent_hidden_activation(ua_t)
            return [ua_t, u_t, h_t]

        log.maybeLog(self.logger, "\nCreating recurrent step scan.")
        # For training, the deterministic recurrence is used to compute all the
        # {h_t, 1 <= t <= T} given Xs. Conditional GSNs can then be trained
        # in batches using those parameters.
        # initial value for the RNN hidden units
        u0 = T.zeros((self.recurrent_hidden_size,))
        (ua, u, h_t), updates_recurrent = theano.scan(
            fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, True),
            sequences=self.Xs,
            outputs_info=[None, u0, None],
            non_sequences=self.params)

        log.maybeLog(self.logger,
                     "Now for reconstruction sample without noise")
        (_, _, h_t_recon), updates_recurrent_recon = theano.scan(
            fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, False),
            sequences=self.Xs,
            outputs_info=[None, u0, None],
            non_sequences=self.params)
        # put together the hiddens list
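        # Even GSN layers get their values from the recurrent state: slices of
        # h_t, one hidden_size-wide block per even layer. Odd layers get zero
        # placeholders of the right shape, since the GSN recomputes them from
        # the even layers.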
        h_list = [T.zeros_like(self.Xs)]
        for layer, w in enumerate(self.weights_list):
            if layer % 2 != 0:
                h_list.append(T.zeros_like(T.dot(h_list[-1], w)))
            else:
                h_list.append(
                    (h_t.T[(layer // 2) * self.hidden_size:(layer // 2 + 1) *
                           self.hidden_size]).T)

        h_list_recon = [T.zeros_like(self.Xs)]
        for layer, w in enumerate(self.weights_list):
            if layer % 2 != 0:
                h_list_recon.append(T.zeros_like(T.dot(h_list_recon[-1], w)))
            else:
                h_list_recon.append(
                    (h_t_recon.T[(layer // 2) *
                                 self.hidden_size:(layer // 2 + 1) *
                                 self.hidden_size]).T)

        #with noise
        _, cost, show_cost = generative_stochastic_network.build_gsn_given_hiddens(
            self.Xs, h_list, self.weights_list, self.bias_list, True,
            self.noiseless_h1, self.hidden_add_noise_sigma,
            self.input_salt_and_pepper, self.input_sampling, self.MRG,
            self.visible_activation, self.hidden_activation, self.walkbacks,
            self.cost_function, self.logger)
        #without noise for reconstruction
        x_sample_recon, _, _ = generative_stochastic_network.build_gsn_given_hiddens(
            self.Xs, h_list_recon, self.weights_list, self.bias_list, False,
            self.noiseless_h1, self.hidden_add_noise_sigma,
            self.input_salt_and_pepper, self.input_sampling, self.MRG,
            self.visible_activation, self.hidden_activation, self.walkbacks,
            self.cost_function, self.logger)

        updates_train = updates_recurrent
        updates_cost = updates_recurrent

        #############
        #   COSTS   #
        #############
        log.maybeLog(self.logger,
                     '\nCost w.r.t p(X|...) at every step in the graph')
        start_functions_time = time.time()

        # if we are not using Hessian-free training create the normal sgd functions
        if not self.hessian_free:
            gradient = T.grad(cost, self.params)
            gradient_buffer = [
                theano.shared(
                    numpy.zeros(param.get_value().shape, dtype='float32'))
                for param in self.params
            ]
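            # momentum-smoothed gradients: gb <- momentum * gb + (1 - momentum) * g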

            m_gradient = [
                self.momentum * gb + (cast32(1) - self.momentum) * g
                for (gb, g) in zip(gradient_buffer, gradient)
            ]
            param_updates = [(param, param - self.learning_rate * mg)
                             for (param, mg) in zip(self.params, m_gradient)]
            gradient_buffer_updates = list(zip(gradient_buffer, m_gradient))

            updates = OrderedDict(param_updates + gradient_buffer_updates)
            updates_train.update(updates)

            log.maybeLog(self.logger, "rnn-gsn learn...")
            self.f_learn = theano.function(inputs=[self.Xs],
                                           updates=updates_train,
                                           outputs=show_cost,
                                           on_unused_input='warn',
                                           name='rnngsn_f_learn')

            log.maybeLog(self.logger, "rnn-gsn cost...")
            self.f_cost = theano.function(inputs=[self.Xs],
                                          updates=updates_cost,
                                          outputs=show_cost,
                                          on_unused_input='warn',
                                          name='rnngsn_f_cost')

        log.maybeLog(self.logger, "Training/cost functions done.")

        # Denoise some numbers: show the original, noisy, predicted, and reconstructed number
        log.maybeLog(
            self.logger,
            "Creating graph for noisy reconstruction function at checkpoints during training."
        )
        self.f_recon = theano.function(inputs=[self.Xs],
                                       outputs=x_sample_recon[-1],
                                       updates=updates_recurrent_recon,
                                       name='rnngsn_f_recon')

        # a function to add salt and pepper noise
        self.f_noise = theano.function(inputs=[self.X],
                                       outputs=salt_and_pepper(
                                           self.X, self.input_salt_and_pepper),
                                       name='rnngsn_f_noise')
        # Sampling functions
        log.maybeLog(self.logger, "Creating sampling function...")
        if self.gsn_layers == 1:
            self.f_sample = theano.function(
                inputs=[X_sample],
                outputs=visible_pX_chain[-1],
                name='rnngsn_f_sample_single_layer')
        else:
            # on_unused_input='warn' is expected here: the inputs for the odd
            # layers go unused because those layers are computed directly from
            # the even layers during sampling.
            self.f_sample = theano.function(inputs=self.network_state_input,
                                            outputs=self.network_state_output +
                                            visible_pX_chain,
                                            on_unused_input='warn',
                                            name='rnngsn_f_sample')

        log.maybeLog(self.logger, "Done compiling all functions.")
        compilation_time = time.time() - start_functions_time
        # Show the compile time with appropriate easy-to-read units.
        log.maybeLog(
            self.logger, "Total compilation time took " +
            make_time_units_string(compilation_time) + ".\n\n")
Example #38
import numpy
import theano
import theano.tensor as T

# NOTE: `examples`, `features`, and the `l2` helper are not defined in the
# original snippet; plausible definitions are assumed here so the example runs.
examples = 1000
features = 100
hidden = 10


def l2(w):
    return (w ** 2).sum()  # squared-L2 penalty


D = (numpy.random.randn(examples, features),
     numpy.random.randint(size=examples, low=0, high=2))
training_steps = 1000

x = T.dmatrix("x")
y = T.dvector("y")

w1 = theano.shared(numpy.random.randn(features, hidden), name="w1")
b1 = theano.shared(numpy.zeros(hidden), name="b1")

w2 = theano.shared(numpy.random.randn(hidden), name="w2")
b2 = theano.shared(0., name="b2")

p1 = T.tanh(T.dot(x, w1) + b1)
p2 = T.nnet.sigmoid(T.dot(p1, w2) + b2)  # sigmoid (not tanh): binary_crossentropy needs outputs in (0, 1)

prediction = p2 > 0.5

error = T.nnet.binary_crossentropy(p2, y)

loss = error.mean() + 0.01 * (l2(w1) + l2(w2))
gw1, gb1, gw2, gb2 = T.grad(loss, [w1, b1, w2, b2])

train = theano.function(inputs=[x, y],
                        outputs=[p2, error],
                        updates=((w1, w1 - 0.1 * gw1), (b1, b1 - 0.1 * gb1),
                                 (w2, w2 - 0.1 * gw2), (b2, b2 - 0.1 * gb2)))
predict = theano.function(inputs=[x], outputs=[prediction])
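
training_steps is defined above but never used in the snippet; a minimal
training loop under the same assumptions (full-batch gradient steps on D)
might look like this:

for step in range(training_steps):
    # cast the integer labels to float64 to match the dvector input
    pred, err = train(D[0], D[1].astype('float64'))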
Example #39
    def __init__(self, name, config):
        super().__init__(name)
        self.config = config

        pprint(config)
        sys.stdout.flush()

        self.add(Embeddings(
            'src_char_embeddings',
            len(config['src_encoder'].sub_encoder),
            config['src_char_embedding_dims'],
            dropout=config['char_embeddings_dropout']))

        self.add(Embeddings(
            'src_embeddings',
            len(config['src_encoder']),
            config['src_embedding_dims'],
            dropout=config['embeddings_dropout']))

        self.add(Embeddings(
            'trg_embeddings',
            len(config['trg_encoder']),
            config['trg_embedding_dims']))

        self.add(Linear(
            'hidden',
            config['decoder_state_dims'],
            config['trg_embedding_dims'],
            dropout=config['dropout'],
            layernorm=config['layernorm']))

        self.add(Linear(
            'emission',
            config['trg_embedding_dims'],
            len(config['trg_encoder']),
            w=self.trg_embeddings._w.T))
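        # note: `w=self.trg_embeddings._w.T` ties the emission weights to the
        # transposed target embeddings (input/output embedding weight tying)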

        self.add(Linear(
            'proj_h0',
            config['encoder_state_dims'],
            config['decoder_state_dims'],
            dropout=config['dropout'],
            layernorm=config['layernorm']))

        self.add(Linear(
            'proj_c0',
            config['encoder_state_dims'],
            config['decoder_state_dims'],
            dropout=config['dropout'],
            layernorm=config['layernorm']))

        # The total loss is
        #   lambda_o*xent(target sentence) + lambda_a*xent(alignment)
        self.lambda_o = theano.shared(
                np.array(1.0, dtype=theano.config.floatX))
        self.lambda_a = theano.shared(
                np.array(config['alignment_loss'], dtype=theano.config.floatX))
        for prefix, backwards in (('fwd', False), ('back', True)):
            self.add(LSTMSequence(
                prefix+'_char_encoder', backwards,
                config['src_char_embedding_dims'] + (
                    (config['src_embedding_dims'] // 2) if backwards else 0),
                config['src_embedding_dims'] // 2,
                layernorm=config['encoder_layernorm'],
                dropout=config['recurrent_dropout'],
                trainable_initial=True,
                offset=0))
        for prefix, backwards in (('fwd', False), ('back', True)):
            self.add(LSTMSequence(
                prefix+'_encoder', backwards,
                config['src_embedding_dims'] + (
                    config['encoder_state_dims'] if backwards else 0),
                config['encoder_state_dims'],
                layernorm=config['encoder_layernorm'],
                dropout=config['recurrent_dropout'],
                trainable_initial=True,
                offset=0))
        self.add(LSTMSequence(
            'decoder', False,
            config['trg_embedding_dims'],
            config['decoder_state_dims'],
            layernorm=config['decoder_layernorm'],
            dropout=config['recurrent_dropout'],
            attention_dims=config['attention_dims'],
            attended_dims=2*config['encoder_state_dims'],
            trainable_initial=False,
            offset=-1))

        h_t = T.matrix('h_t')
        self.predict_fun = function(
                [h_t],
                T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))

        inputs = T.lmatrix('inputs')
        inputs_mask = T.bmatrix('inputs_mask')
        chars = T.lmatrix('chars')
        chars_mask = T.bmatrix('chars_mask')
        outputs = T.lmatrix('outputs')
        outputs_mask = T.bmatrix('outputs_mask')
        attention = T.tensor3('attention')

        self.x = [inputs, inputs_mask, chars, chars_mask]
        self.y = [outputs, outputs_mask, attention]

        self.encode_fun = function(self.x, self.encode(*self.x))
        self.xent_fun = function(self.x+self.y, self.xent(*(self.x+self.y)))
Example #40
def forward_prop_step(x_t, s_t_prev, U, V, W):
    s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
    o_t = T.nnet.softmax(V.dot(s_t))
    # take o_t[0] because T.nnet.softmax returns a 2D tensor
    return [o_t[0], s_t]
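
This step function is normally driven by theano.scan; a sketch, assuming x is
an ivector of word indices, hidden_dim is the state size, and U, V, W are the
shared parameters (none of which appear in the snippet):

hidden_dim = 100  # assumed recurrent state size
s0 = T.zeros(hidden_dim)
[o, s], updates = theano.scan(forward_prop_step,
                              sequences=x,
                              outputs_info=[None, s0],
                              non_sequences=[U, V, W])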
Example #41
def tanh(x):
    """
    Tanh activation function
    """
    return tensor.tanh(x)
Example #42
def lstm_train(n_in=7,
               n_hidden=10,
               n_i=10,
               n_c=10,
               n_o=10,
               n_f=10,
               n_y=7,
               nb_epochs=300,
               nb_train_examples=1000):
    '''
    # number of input layer dims, the embedded reber grammar (7-bit vector)
        n_in = 7
    
    # number of hidden layer unit for gate
        n_hidden = 10
        n_i = 10
        n_c = 10
        n_o = 10
        n_f = 10
    
    # number of output layer dims (7-bit vector)
        n_y = 7
    '''

    # Weight initialization.
    # The input and output gates switch between open and closed.
    # The forget gate should start out open (so nothing is forgotten from the
    # beginning of training); we try to achieve this through a suitable bias
    # initialization.
    W_xi = theano.shared(ortho_weights(n_in, n_i))
    W_hi = theano.shared(ortho_weights(n_hidden, n_i))
    W_ci = theano.shared(ortho_weights(n_c, n_i))
    b_i = theano.shared(np.cast[config.floatX](np.random.uniform(
        -0.5, 0.5, size=n_i)))  # the input-gate bias can be random

    W_xf = theano.shared(ortho_weights(n_in, n_f))
    W_hf = theano.shared(ortho_weights(n_hidden, n_f))
    W_cf = theano.shared(ortho_weights(n_c, n_f))
    b_f = theano.shared(np.cast[config.floatX](np.random.uniform(
        0, 1, size=n_f)))  # forget-gate bias starts open (sigmoid output reliably above 0.5)

    W_xc = theano.shared(ortho_weights(n_in, n_c))
    W_hc = theano.shared(ortho_weights(n_hidden, n_c))
    b_c = theano.shared(np.zeros(
        n_c, dtype=config.floatX))  # memory-cell bias starts at 0 (neither open nor closed)

    W_xo = theano.shared(ortho_weights(n_in, n_o))
    W_ho = theano.shared(ortho_weights(n_hidden, n_o))
    W_co = theano.shared(ortho_weights(n_c, n_o))
    b_o = theano.shared(np.cast[config.floatX](np.random.uniform(
        -0.5, 0.5, size=n_o)))  # the output-gate bias can be random

    W_hy = theano.shared(ortho_weights(n_hidden, n_y))
    b_y = theano.shared(np.zeros(n_y,
                                 dtype=config.floatX))  # classification-layer bias starts at 0

    c0 = theano.shared(np.zeros(n_c, dtype=config.floatX))  # initial memory-cell state
    h0 = T.tanh(c0)  # initial hidden state

    params = [
        W_xi, W_hi, W_ci, b_i, W_xf, W_hf, W_cf, b_f, W_xc, W_hc, b_c, W_xo,
        W_ho, W_co, b_o, W_hy, b_y, c0
    ]

    # symbolic input vectors (one time step per row)
    v = T.matrix(dtype=config.floatX)

    # symbolic targets (teacher data)
    target = T.matrix(dtype=config.floatX)

    # recurrence
    [h_vals, _, y_vals], _ = theano.scan(
        fn=one_lstm_step,
        #sequences = dict(input=v, taps=[0]),
        sequences=v,
        outputs_info=[h0, c0, None],
        non_sequences=[
            W_xi, W_hi, W_ci, b_i, W_xf, W_hf, W_cf, b_f, W_xc, W_hc, b_c,
            W_xo, W_ho, W_co, b_o, W_hy, b_y
        ])

    # cost: cross-entropy, since this is a multi-class problem
    cost = -T.mean(target * T.log(y_vals) + (1. - target) * T.log(1. - y_vals))

    # shared variable for the learning rate
    lr = np.cast[config.floatX](.1)
    learning_rate = theano.shared(lr)

    # gradient of each parameter
    #gparams = T.grad(cost, params)

    gparams = []
    for param in params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # parameter updates: plain SGD
    updates = []
    for param, gparam in zip(params, gparams):
        updates.append((param, param - gparam * learning_rate))

    # generate the training data
    train_data = reber_grammer.get_n_embedded_examples(nb_train_examples)
    print 'train data length: ', len(train_data)

    # lstm
    learn_rnn_fn = theano.function(inputs=[v, target],
                                   outputs=cost,
                                   updates=updates)
    train_errors = np.ndarray(nb_epochs)

    def train_rnn(train_data):
        for x in range(nb_epochs):
            error = 0.
            for j in range(len(train_data)):
                # pick one example at random from train_data
                index = np.random.randint(0, len(train_data))
                # input vector i, target vector o
                i, o = train_data[index]
                #print 'train vector: ',i
                #print 'train target: ',o
                train_cost = learn_rnn_fn(i, o)
                error += train_cost
            # print the error after each epoch
            print "epochs %i : %f" % (x, error)
            train_errors[x] = error

    train_rnn(train_data)

    plt.plot(np.arange(nb_epochs), train_errors, 'b-')
    plt.xlabel('epochs')
    plt.ylabel('error')
    plt.ylim(0., 50)
    plt.show()
    print params
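
one_lstm_step is passed to theano.scan above but is not included in the
snippet; a sketch consistent with outputs_info=[h0, c0, None] and the
non_sequences list (a standard peephole LSTM step) could look like this:

def one_lstm_step(x_t, h_tm1, c_tm1,
                  W_xi, W_hi, W_ci, b_i,
                  W_xf, W_hf, W_cf, b_f,
                  W_xc, W_hc, b_c,
                  W_xo, W_ho, W_co, b_o,
                  W_hy, b_y):
    # peephole LSTM: input/forget gates also see the previous memory cell
    i_t = T.nnet.sigmoid(T.dot(x_t, W_xi) + T.dot(h_tm1, W_hi)
                         + T.dot(c_tm1, W_ci) + b_i)
    f_t = T.nnet.sigmoid(T.dot(x_t, W_xf) + T.dot(h_tm1, W_hf)
                         + T.dot(c_tm1, W_cf) + b_f)
    c_t = f_t * c_tm1 + i_t * T.tanh(T.dot(x_t, W_xc)
                                     + T.dot(h_tm1, W_hc) + b_c)
    o_t = T.nnet.sigmoid(T.dot(x_t, W_xo) + T.dot(h_tm1, W_ho)
                         + T.dot(c_t, W_co) + b_o)
    h_t = o_t * T.tanh(c_t)
    y_t = T.nnet.sigmoid(T.dot(h_t, W_hy) + b_y)  # one sigmoid per output bit
    return [h_t, c_t, y_t]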
Example #43
    def __init__(self, layer_def, inputs, inputs_shape, rs, clone_from=None):
        """
            Create an Gated Recurrent Unit layer with shared variable internal parameters.
            
            :type layer_def: Element, xml containing configu for Conv layer
            
            :type inputs: list of inputs [input,gate_input,prev_output] 
            :param inputs[0]: input, the input which is a theano.matrix, x_t
            :param inputs[1]: previous state, h_{t-1}, same shape as this layer
            
            :type rs: a random state
            """

        #inputs = [input, previous_state]
        layer_name = layer_def.attrib["name"]
        assert (len(inputs) == 2)
        assert (len(inputs_shape) == 2)
        self.input = inputs[0]
        self.prev_h = inputs[1]
        n_in, _ = inputs_shape[0]
        n_prev_h, bsz = inputs_shape[1]
        assert (bsz == inputs_shape[0][1])

        # clone the num_units
        if clone_from is None:
            self.num_units = int(layer_def.find("numunits").text)
        else:
            self.num_units = clone_from.num_units

        assert (n_prev_h == self.num_units)

        #create the weight matrices
        rng = np.random.RandomState(seed=int(time.time()))
        # initialize weights with random weights
        if clone_from is not None:
            #weight matrices for x_t, the input
            self.W_z = clone_from.W_z
            self.W_r = clone_from.W_r
            self.W = clone_from.W
            #weight matrices for h_{t-1}
            self.U_z = clone_from.U_z
            self.U_r = clone_from.U_r
            self.U = clone_from.U
        else:
            #W_{}: is a matrix of size num_units x n_in
            W_bound = np.sqrt(6. / (self.num_units + n_in))
            # W_z (update gate)
            W_values = np.asarray(rng.normal(loc=0.,
                                             scale=W_bound,
                                             size=(self.num_units, n_in)),
                                  dtype=theano.config.floatX)
            self.W_z = theano.shared(value=W_values,
                                     name=layer_name + '-Wz',
                                     borrow=False)  # num_units x n_in
            # W_r (reset gate)
            W_values = np.asarray(rng.normal(loc=0.,
                                             scale=W_bound,
                                             size=(self.num_units, n_in)),
                                  dtype=theano.config.floatX)
            self.W_r = theano.shared(value=W_values,
                                     name=layer_name + '-Wr',
                                     borrow=False)  # num_units x n_in
            # W (candidate state)
            W_values = np.asarray(rng.normal(loc=0.,
                                             scale=W_bound,
                                             size=(self.num_units, n_in)),
                                  dtype=theano.config.floatX)
            self.W = theano.shared(value=W_values,
                                   name=layer_name + '-W',
                                   borrow=False)  # num_units x n_in

            #U_{}: is a matrix of size num_units x num_units
            U_bound = np.sqrt(6. / (self.num_units + self.num_units))
            # U_z (update gate)
            U_values = np.asarray(rng.normal(loc=0.,
                                             scale=U_bound,
                                             size=(self.num_units,
                                                   self.num_units)),
                                  dtype=theano.config.floatX)
            self.U_z = theano.shared(value=U_values,
                                     name=layer_name + '-Uz',
                                     borrow=False)  #num_units x num_units
            # U_r (reset gate)
            U_values = np.asarray(rng.normal(loc=0.,
                                             scale=U_bound,
                                             size=(self.num_units,
                                                   self.num_units)),
                                  dtype=theano.config.floatX)
            self.U_r = theano.shared(value=U_values,
                                     name=layer_name + '-Ur',
                                     borrow=False)  #num_units x num_units
            # U (candidate state)
            U_values = np.asarray(rng.normal(loc=0.,
                                             scale=U_bound,
                                             size=(self.num_units,
                                                   self.num_units)),
                                  dtype=theano.config.floatX)
            self.U = theano.shared(value=U_values,
                                   name=layer_name + '-U',
                                   borrow=False)  #num_units x num_units

        #calculate the gate values
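        # GRU equations implemented below (column-major: one column per batch element):
        #   z_t  = sigmoid(W_z x_t + U_z h_{t-1})        update gate
        #   r_t  = sigmoid(W_r x_t + U_r h_{t-1})        reset gate
        #   h~_t = tanh(W x_t + U (r_t * h_{t-1}))       candidate state
        #   h_t  = (1 - z_t) * h_{t-1} + z_t * h~_t      new state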
        # num_units x bsz               #num_units x bsz
        self.zgate = T.nnet.sigmoid(
            T.dot(self.W_z, self.input) +
            T.dot(self.U_z, self.prev_h))  #update gate
        # num_units x bsz               #num_units x bsz
        self.rgate = T.nnet.sigmoid(
            T.dot(self.W_r, self.input) +
            T.dot(self.U_r, self.prev_h))  #reset gate
        # num_units x bsz       #num_units x bsz
        self.tilde_h = T.tanh(
            T.dot(self.W, self.input) +
            T.dot(self.U, (self.rgate * self.prev_h)))  #new memory content
        #output is a dictionary
        #only if there is a mem output tag, then provide this output
        self.output = dict()
        self.output_shape = dict()
        #the default output
        self.output[layer_name] = (
            1. - self.zgate) * self.prev_h + self.zgate * self.tilde_h
        self.output_shape[layer_name] = [self.num_units, bsz]
        self.inputs_shape = inputs_shape
        # parameters of the model
        if clone_from is None:
            self.params = [
                self.W_z, self.W_r, self.W, self.U_z, self.U_r, self.U
            ]
        else:
            self.params = []
Example #44
    def build_model(self, **kwargs):
        self.opt_ret.clear()

        use_noise = kwargs.pop('use_noise', theano.shared(np.float32(1.)))
        trng = kwargs.pop('trng', RandomStreams(self.O['seed']))

        dropout_param = None
        if self.O['use_dropout'][0]:
            dropout_param = [use_noise, trng, self.O['use_dropout'][1]]

        x, x_mask, y, y_mask = self.get_input()
        xr, xr_mask = self.reverse_input(x, x_mask)

        n_timestep, n_timestep_tgt, n_samples = self.input_dimensions(x, y)

        # Word embedding for forward rnn (source)
        emb = self.embedding(x, n_timestep, n_samples)
        proj_f = self._encoder(emb,
                               'encoder',
                               mask=x_mask,
                               dropout_param=dropout_param)

        # Word embedding for backward rnn (source)
        embr = self.embedding(xr, n_timestep, n_samples)
        proj_r = self._encoder(embr,
                               'encoder',
                               mask=xr_mask,
                               dropout_param=dropout_param)

        # Context will be the concatenation of forward and backward RNNs
        ctx = concatenate(
            [proj_f[0], proj_r[0][::-1], proj_f[1], proj_r[1][::-1]],
            axis=proj_f[0].ndim - 1)

        # Mean of the context across time, which will be used to initialize decoder LSTM. This is the original code
        ctx_mean = self.get_context_mean(ctx, x_mask)

        # Initial decoder state
        initial_decoder_h = self.fully_connect(ctx_mean, 'initDecoder', T.tanh)

        # Word embedding (target), we will shift the target sequence one time step
        # to the right. This is done because of the bi-gram connections in the
        # readout and decoder rnn. The first target will be all zeros and we will
        # not condition on the last output.
        emb = self.embedding(y, n_timestep_tgt, n_samples, 'Wemb_dec')
        emb_shifted = T.zeros_like(emb)
        emb_shifted = T.set_subtensor(emb_shifted[1:], emb[:-1])
        emb = emb_shifted

        hidden_from_last_layer, ctx_from_1st_layer = self._decoder(
            emb,
            y_mask,
            ctx,
            x_mask,
            initial_decoder_h,
            prefix='decoder',
            one_step=False,
            dropout_param=dropout_param,
        )

        # As suggested on page 14 of the NMT + attention paper (the equation in
        # section A.2.3), implement the deep-output readout below.
        fc_hidden = self.fully_connect(hidden_from_last_layer,
                                       prefix='fc_compress_lastHiddenState',
                                       activ='linear')
        fc_emb = self.fully_connect(emb,
                                    prefix='fc_compress_emb',
                                    activ='linear')
        fc_ctx = self.fully_connect(ctx_from_1st_layer,
                                    prefix='fc_compress_ctx',
                                    activ='linear')

        fc_sum = T.tanh(fc_hidden + fc_emb + fc_ctx)
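        # the three linear 'compress' layers project the decoder state, the
        # shifted target embedding, and the context to a common size; their
        # sum is squashed by tanh (the deep-output readout)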

        # According to Baidu's paper, dropout is only used in LSTM. So I drop the following two lines out (v-yixia)
        # if self.O['use_dropout'][0]:
        #    fc_sum = self.dropout(fc_sum, use_noise, trng, self.O['use_dropout'][1])

        softmax_output = self.fully_connect(fc_sum,
                                            prefix='fc_to_softmax',
                                            activ='linear')
        softmax_output_shp = softmax_output.shape
        probs = T.nnet.softmax(
            softmax_output.reshape([
                softmax_output_shp[0] * softmax_output_shp[1],
                softmax_output_shp[2]
            ]))

        cost = self.get_cost(y, y_mask, probs)

        return x, x_mask, y, y_mask, cost
Example #45
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=10,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_noMT_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fixed random seed so the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_labels, word2id = load_il9_NI_test(
        word2id, maxSentLen)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    test_labels = np.asarray(test_labels, dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-SWA-multicca.d300.SWA.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap the numpy array "rand_values" in a theano shared variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out the features at UNK positions
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size): each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out to zero out the features at UNK positions
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size): each sentence then has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
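    # size breakdown: 5 sentence representations of hidden_size[0] each (two
    # CNNs, one GRU, two attentive CNNs), the bag-of-words embedding of size
    # emb_size, and 4 type-score matrices of type_size each (two DNN-dataless
    # scores, the cosine score, and the top-k cosine score)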
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 3)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        binarize_prob,
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]

    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #46
    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
        """
        Allocate a LeNetConvPoolLayer with shared variable internal parameters.
        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights
        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape
        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)
        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)
        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows, #cols)
        """

        assert image_shape[1] == filter_shape[1]
        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
                   numpy.prod(poolsize))
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(
            numpy.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )

        # the bias is a 1D tensor -- one bias per output feature map
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve input feature maps with filters
        conv_out = conv.conv2d(
            input=input,
            filters=self.W,
            filter_shape=filter_shape,
            image_shape=image_shape
        )

        # downsample each feature map individually, using maxpooling
        pooled_out = downsample.max_pool_2d(
            input=conv_out,
            ds=poolsize,
            ignore_border=True
        )

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))

        # store parameters of this layer
        self.params = [self.W, self.b]

        # keep track of model input
        self.input = input
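
# A minimal usage sketch for the layer above; every shape below is an
# illustrative assumption, not a value from the original source. With 5x5
# filters on 28x28 inputs, 'valid' convolution yields 24x24 feature maps and
# the (2, 2) max-pooling halves that to 12x12.
import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(1234)
x = T.tensor4('x')  # (batch size, input maps, height, width)
layer = LeNetConvPoolLayer(rng, input=x,
                           filter_shape=(20, 1, 5, 5),
                           image_shape=(64, 1, 28, 28),
                           poolsize=(2, 2))
f = theano.function([x], layer.output)
out = f(numpy.zeros((64, 1, 28, 28), dtype=theano.config.floatX))
print(out.shape)  # (64, 20, 12, 12)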
Example #47
    def __init__(self,
                 rng,
                 input,
                 filter_shape,
                 image_shape,
                 poolsize=(2, 2),
                 non_linear="tanh"):
        """
        Allocate a LeNetConvPoolLayer with shared variable internal parameters.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows, #cols)
        """

        assert image_shape[1] == filter_shape[1]
        self.input = input
        self.filter_shape = filter_shape
        self.image_shape = image_shape
        self.poolsize = poolsize
        self.non_linear = non_linear
        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
                   numpy.prod(poolsize))
        # initialize weights with random weights
        if self.non_linear == "none" or self.non_linear == "relu":
            self.W = theano.shared(numpy.asarray(rng.uniform(
                low=-0.01, high=0.01, size=filter_shape),
                                                 dtype=theano.config.floatX),
                                   borrow=True,
                                   name="W_conv")
        else:
            W_bound = numpy.sqrt(6. / (fan_in + fan_out))
            self.W = theano.shared(numpy.asarray(rng.uniform(
                low=-W_bound, high=W_bound, size=filter_shape),
                                                 dtype=theano.config.floatX),
                                   borrow=True,
                                   name="W_conv")
        b_values = numpy.zeros((filter_shape[0], ), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True, name="b_conv")

        # convolve input feature maps with filters
        conv_out = conv.conv2d(input=input,
                               filters=self.W,
                               filter_shape=self.filter_shape,
                               image_shape=self.image_shape)
        if self.non_linear == "tanh":
            conv_out_tanh = T.tanh(conv_out +
                                   self.b.dimshuffle('x', 0, 'x', 'x'))
            self.output = downsample.max_pool_2d(input=conv_out_tanh,
                                                 ds=self.poolsize,
                                                 ignore_border=True)
        elif self.non_linear == "relu":
            conv_out_tanh = ReLU(conv_out +
                                 self.b.dimshuffle('x', 0, 'x', 'x'))
            self.output = downsample.max_pool_2d(input=conv_out_tanh,
                                                 ds=self.poolsize,
                                                 ignore_border=True)
        else:
            pooled_out = downsample.max_pool_2d(input=conv_out,
                                                ds=self.poolsize,
                                                ignore_border=True)
            self.output = pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')
        self.params = [self.W, self.b]
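
# The snippet above calls a ReLU helper that is not shown; a common
# definition (an assumption, not taken from the original source) is:
import theano.tensor as T

def ReLU(x):
    return T.maximum(0.0, x)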
Example #48
def tanh(x):
    return T.tanh(x)
Example #49
 def apply(self, input_):
     return tensor.tanh(input_)
Example #50
 def scrn(X, h, R, P, A, B, b, t):
     #c_t = T.dot(X,B)*(1-T.nnet.sigmoid(C)) + h[:,:n_c]*T.nnet.sigmoid(C)
     c_t = T.dot(X, B) * 0.05 + h[:, :n_c] * 0.95
     h_t = T.tanh(
         T.dot(X, A) + T.dot(c_t, P) + T.dot(h[:, n_c:], R) + b)
     return concatenate([c_t, h_t], axis=1)
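
# The step above is an SCRN cell: the "slow" context units c_t are a
# fixed-rate moving average (the decay is hard-coded to 0.95 here) while h_t
# is an ordinary tanh recurrence. A minimal sketch of unrolling such a step
# with theano.scan; all sizes are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
n_in, n_c, n_h = 10, 4, 8  # illustrative sizes

def _shared(shape):
    return theano.shared((np.random.randn(*shape) * 0.1).astype(floatX))

A, B = _shared((n_in, n_h)), _shared((n_in, n_c))
P, R = _shared((n_c, n_h)), _shared((n_h, n_h))
b = theano.shared(np.zeros(n_h, dtype=floatX))

def scrn_step(X, h):
    # slow context units: fixed-rate moving average of the input projection
    c_t = T.dot(X, B) * 0.05 + h[:, :n_c] * 0.95
    h_t = T.tanh(T.dot(X, A) + T.dot(c_t, P) + T.dot(h[:, n_c:], R) + b)
    return T.concatenate([c_t, h_t], axis=1)

X = T.tensor3('X')  # (time, batch, n_in)
h0 = T.zeros((X.shape[1], n_c + n_h))
states, _ = theano.scan(scrn_step, sequences=X, outputs_info=h0)
f = theano.function([X], states)
print(f(np.zeros((5, 2, n_in), dtype=floatX)).shape)  # (5, 2, 12)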
Example #51
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    # for the backward rnn, we just need to invert x and x_mask
    xr = x[::-1]
    xr_mask = x_mask[::-1]

    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    # word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder',
                                            mask=x_mask)
    # word embedding for backward rnn (source)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    projr = get_layer(options['encoder'])[1](tparams,
                                             embr,
                                             options,
                                             prefix='encoder_r',
                                             mask=xr_mask)

    # context will be the concatenation of forward and backward rnns
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    # mean of the context (across time) will be used to initialize decoder rnn
    ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # or you can use the last state of forward + backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)

    # initial decoder state
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    # word embedding (target), we will shift the target sequence one time step
    # to the right. This is done because of the bi-gram connections in the
    # readout and decoder rnn. The first target will be all zeros and we will
    # not condition on the last output.
    emb = tparams['Wemb'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # decoder - pass through the decoder conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=y_mask,
                                            context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state)
    # hidden states of the decoder gru
    proj_h = proj[0]

    # weighted averages of context, generated by attention module
    ctxs = proj[1]

    # weights (alignment matrix)
    opt_ret['dec_alphas'] = proj[2]

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams,
                                    proj_h,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
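
# The cost block above picks each gold word's probability out of the
# flattened softmax: row i's gold entry sits at flat position
# i * n_words + y_flat[i]. A small numpy illustration with toy numbers
# (shapes assumed):
import numpy as np

n_words = 5
probs = np.arange(20, dtype='float32').reshape(4, n_words) / 100.
y_flat = np.array([2, 0, 4, 1])
flat_idx = np.arange(y_flat.shape[0]) * n_words + y_flat
print(probs.flatten()[flat_idx])    # [0.02 0.05 0.14 0.16]
print(probs[np.arange(4), y_flat])  # identical, via row-wise indexing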
Example #52
def Tanh(x):
    y = T.tanh(x)
    return (y)
Example #53
 def forward_prop_step(x_t, s_t_prev, U, V, W):
     # compute output of the hidden layer
     s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
     # compute output of the softmax layer
     o_t = T.nnet.softmax(V.dot(s_t))
     return [o_t[0], s_t]
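
# A minimal sketch (illustrative sizes, not from the original source) of
# unrolling the step above with theano.scan; note that U[:, x_t] doubles as
# the embedding lookup for word index x_t.
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
vocab, n_hid = 50, 16

U = theano.shared((np.random.randn(n_hid, vocab) * 0.1).astype(floatX))
V = theano.shared((np.random.randn(vocab, n_hid) * 0.1).astype(floatX))
W = theano.shared((np.random.randn(n_hid, n_hid) * 0.1).astype(floatX))

def forward_prop_step(x_t, s_t_prev, U, V, W):
    s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
    o_t = T.nnet.softmax(V.dot(s_t))
    return [o_t[0], s_t]

x = T.ivector('x')  # a sequence of word indices
[o, s], _ = theano.scan(forward_prop_step,
                        sequences=x,
                        outputs_info=[None, T.zeros((n_hid,))],
                        non_sequences=[U, V, W])
f = theano.function([x], o)
print(f(np.array([1, 2, 3], dtype='int32')).shape)  # (3, 50)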
Example #54
def build_sampler(tparams, options, trng, use_noise):
    x = tensor.matrix('x', dtype='int64')
    xr = x[::-1]
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding (source), forward and backward
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])

    # encoder
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder')
    projr = get_layer(options['encoder'])[1](tparams,
                                             embr,
                                             options,
                                             prefix='encoder_r')

    # concatenate forward and backward rnn hidden states
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    # get the input for decoder rnn initializer mlp
    ctx_mean = ctx.mean(0)
    # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    print('Building f_init...')
    outs = [init_state, ctx]
    f_init = theano.function([x], outs, name='f_init', profile=profile)
    print('Done')

    # x: 1 x 1
    y = tensor.vector('y_sampler', dtype='int64')
    init_state = tensor.matrix('init_state', dtype='float32')

    # if it's the first word, emb should be all zero and it is indicated by -1
    emb = tensor.switch(y[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][y])

    # apply one step of conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=None,
                                            context=ctx,
                                            one_step=True,
                                            init_state=init_state)
    # get the next hidden state
    next_state = proj[0]

    # get the weighted averages of context for this target word y
    ctxs = proj[1]

    logit_lstm = get_layer('ff')[1](tparams,
                                    next_state,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')

    # compute the softmax probability
    next_probs = tensor.nnet.softmax(logit)

    # sample from softmax distribution to get the sample
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # compile a function to do the whole thing above, next word probability,
    # sampled word for the next target, next hidden state to be used
    print('Building f_next..')
    inps = [y, ctx, init_state]
    outs = [next_probs, next_sample, next_state]
    f_next = theano.function(inps, outs, name='f_next', profile=profile)
    print('Done')

    return f_init, f_next
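
# A hedged greedy-decoding loop for the two functions compiled above (not
# from the original source); x_src is an assumed (n_timesteps, 1) int64
# array of source word indices and eos_id is an assumed end-of-sequence id.
import numpy

def greedy_decode(f_init, f_next, x_src, max_len=50, eos_id=0):
    state, ctx = f_init(x_src)
    y = -1 * numpy.ones((1,), dtype='int64')  # -1 marks the first step
    sample = []
    for _ in range(max_len):
        probs, _, state = f_next(y, ctx, state)
        y = probs.argmax(axis=1).astype('int64')  # greedy pick, no sampling
        sample.append(int(y[0]))
        if y[0] == eos_id:
            break
    return sample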
Example #55
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    # word embedding (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])

    # pass through encoder gru, recurrence here
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder',
                                            mask=x_mask)

    # last hidden state of encoder rnn will be used to initialize decoder rnn
    ctx = proj[0][-1]
    ctx_mean = ctx

    # initial decoder state
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    # word embedding (target), we will shift the target sequence one time step
    # to the right. This is done because of the bi-gram connections in the
    # readout and decoder rnn. The first target will be all zeros and we will
    # not condition on the last output.
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # decoder - pass through the decoder gru, recurrence here
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=y_mask,
                                            context=ctx,
                                            one_step=False,
                                            init_state=init_state)
    # hidden states of the decoder gru
    proj_h = proj

    # we will condition on the last state of the encoder only
    ctxs = ctx[None, :, :]

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams,
                                    proj_h,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
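
# A toy numpy analogue of the one-step target shift above (shapes are
# illustrative): the first timestep sees an all-zero embedding and each
# later step sees the previous target's embedding.
import numpy as np

emb = np.arange(6).reshape(3, 1, 2)  # (n_timesteps_trg, n_samples, dim_word)
emb_shifted = np.zeros_like(emb)
emb_shifted[1:] = emb[:-1]
print(emb_shifted[:, 0, :])  # [[0 0], [0 1], [2 3]]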
Example #56
    def __init__(self, We, params):

        lstm_layers_num = 1
        en_hidden_size = We.shape[1]
        self.eta = params.eta
        self.num_labels = params.num_labels
        self.en_hidden_size = en_hidden_size
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = params.lstm_layers_num
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs = tensor.imatrix()
        decoderInputs, decoderTarget = tensor.imatrices(2)
        encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)

        self.lookuptable = theano.shared(We)

        #### the last row is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(name="Linear",
                                    value=init_xavier_uniform(
                                        self.de_hidden_size, self.num_labels),
                                    borrow=True)

        self.hidden_decode = theano.shared(name="Hidden to Decode",
                                           value=init_xavier_uniform(
                                               2 * en_hidden_size,
                                               self.de_hidden_size),
                                           borrow=True)

        self.hidden_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)

        self.params += [
            self.linear, self.de_lookuptable, self.hidden_decode,
            self.hidden_bias
        ]  #concatenate

        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1],
             self.en_hidden_size))
        for _ in range(self.lstm_layers_num):

            enclstm_f = LSTM(self.en_hidden_size)
            enclstm_b = LSTM(self.en_hidden_size, True)
            self.encoder_lstm_layers.append(enclstm_f)  #append
            self.encoder_lstm_layers.append(enclstm_b)  #append
            self.params += enclstm_f.params + enclstm_b.params  #concatenate

            hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
            hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

            hs = tensor.concatenate([hs_f, hs_b], axis=2)
            Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
            # note: each trailing comma builds a one-element tuple, so these
            # lines append the projected final states (#append, as above)
            self.hos += tensor.tanh(
                tensor.dot(hs[-1], self.hidden_decode) + self.hidden_bias),
            self.Cos += tensor.tanh(
                tensor.dot(Cs[-1], self.hidden_decode) + self.hidden_bias),
            state_below = hs

        state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

        decoder_lstm_outputs = state_below

        ei, di, dt = tensor.imatrices(3)  # placeholders
        em, dm, tf, di0 = tensor.fmatrices(4)
        #####################################################
        #####################################################
        linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
        softmax_outputs, updates = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]),
                                        y])

        costs, _ = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum()

        updates = lasagne.updates.adam(loss, self.params, self.eta)
        #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

        ###################################################
        #### using the ground truth when training
        ##################################################
        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, softmax_outputs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })

        #########################################################################
        ### For schedule sampling
        #########################################################################

        ###### always use the previous prediction as the next input
        def _step2(state_, hs_, Cs_):

            hs, Cs = [], []
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                (tensor.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, encoderInputs.shape[1], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  # note the one-step mask
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)

            newpred = tensor.dot(state_below0, self.linear).reshape(
                (encoderInputs.shape[1], self.num_labels))
            state_below = tensor.nnet.softmax(newpred)

            return state_below, hs, Cs

        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=encoderInputs.shape[0])

        train_predict = train_outputs[0]
        train_costs, _ = theano.scan(
            fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

        train_loss = train_costs.sum() / decoderMask.sum()

        train_updates = lasagne.updates.adam(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)

        self._train2 = theano.function(
            inputs=[ei, em, di0, dm, dt],
            outputs=[train_loss, train_predict],
            updates=train_updates,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs0: di0,
                decoderMask: dm,
                decoderTarget: dt
            }
            #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
        )

        listof_token_idx = train_predict.argmax(axis=-1)
        self._utter = theano.function(inputs=[ei, em, di0],
                                      outputs=listof_token_idx,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs0: di0
                                      })
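
# A hedged call pattern for the functions compiled above; `model` stands for
# an instance of the enclosing class, and every shape below is an assumption
# for illustration, not a value from the original source.
import numpy as np

T_enc, T_dec, B, n_labels = 7, 5, 4, 10
ei = np.random.randint(0, 100, (T_enc, B)).astype('int32')       # encoder token ids
em = np.ones((T_enc, B), dtype='float32')                        # encoder mask
di = np.random.randint(0, n_labels, (T_dec, B)).astype('int32')  # decoder inputs
dm = np.ones((T_dec, B), dtype='float32')                        # decoder mask
dt = np.random.randint(0, n_labels, (T_dec, B)).astype('int32')  # decoder targets

loss, probs = model._train(ei, em, di, dm, dt)  # teacher-forced training step
di0 = np.zeros((B, n_labels), dtype='float32')  # initial "previous prediction"
pred = model._utter(ei, em, di0)                # decode from own predictions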
Example #57
    def _setup_functions(self, X_sym, y_sym, X_mask, y_mask, layer_sizes):
        input_variable = X_sym

        # layer_sizes consists of input size, all hidden sizes, and output size
        hidden_sizes = layer_sizes[1:-1]
        # set these to stop pep8 vim plugin from complaining
        input_size = None
        output_size = None
        for n in range(len(hidden_sizes)):
            if (n - 1) < 0:
                input_size = layer_sizes[0]
            else:
                if self.bidirectional:
                    # Account for the concatenated fwd/bwd hiddens
                    input_size = 2 * output_size
                else:
                    input_size = output_size
            hidden_size = hidden_sizes[n]
            if (n + 1) != len(hidden_sizes):
                output_size = hidden_sizes[n + 1]
            else:
                output_size = layer_sizes[-1]

            forward_hidden, forward_params = self.recurrent_function(
                input_size, hidden_size, output_size, input_variable, X_mask,
                self.random_state)

            if self.bidirectional:
                backward_hidden, backward_params = self.recurrent_function(
                    input_size, hidden_size, output_size, input_variable[::-1],
                    X_mask[::-1], self.random_state)
                params = forward_params + backward_params
                input_variable = concatenate(
                    [forward_hidden, backward_hidden[::-1]],
                    axis=forward_hidden.ndim - 1)
            else:
                params = forward_params
                input_variable = forward_hidden

        if self.bidirectional:
            # Account for the concatenated fwd/bwd hiddens
            sz = 2 * hidden_sizes[-1]
        else:
            sz = hidden_sizes[-1]

        if self.cost == "softmax":
            # easy mode
            output, output_params = build_linear_layer(sz, output_size,
                                                       input_variable,
                                                       self.random_state)
            params = params + output_params
            shp = output.shape
            output = output.reshape([shp[0] * shp[1], shp[2]])
            y_hat_sym = T.nnet.softmax(output)
            y_sym_reshaped = y_sym.reshape([shp[0] * shp[1], shp[2]])
            cost = -T.mean((y_sym_reshaped * T.log(y_hat_sym)).sum(axis=1))

        elif self.cost == "encdec":
            # hardmode
            context = input_variable
            context_mean = context[0]

            init_state, state_params = build_tanh_layer(sz, hidden_sizes[-1],
                                                        context_mean,
                                                        self.random_state)
            init_memory, memory_params = build_tanh_layer(sz, hidden_sizes[-1],
                                                          context_mean,
                                                          self.random_state)
            # partial sampler setup
            self._encode = theano.function([X_sym, X_mask],
                                           [init_state, init_memory, context])
            init_state_sampler = T.matrix()
            init_memory_sampler = T.matrix()
            y_sw_sampler = T.tensor3()
            y_sw_mask = T.alloc(1., y_sw_sampler.shape[0], 1)

            # need this style of init to reuse params for the sampler and the
            # actual training graph. This makes this part quite nasty; a
            # dictionary for initialization and params looks more and more
            # sensible. The conditional params are reused below.
            conditional_params = init_recurrent_conditional_lstm_layer(
                output_size, hidden_sizes[-1], sz, self.random_state)

            rval, _p = build_recurrent_conditional_lstm_layer_from_params(
                conditional_params, y_sw_sampler, y_sw_mask, context, X_mask,
                init_state_sampler, init_memory_sampler,
                self.random_state, one_step=True)
            next_state, next_memory, sampler_contexts, _ = rval
            #end sampler parts... for now

            params = params + state_params + memory_params
            shifted_labels = T.zeros_like(y_sym)
            shifted_labels = T.set_subtensor(shifted_labels[1:], y_sym[:-1])
            y_sym = shifted_labels

            rval, _p = build_recurrent_conditional_lstm_layer_from_params(
                conditional_params, shifted_labels, y_mask, context, X_mask,
                init_state, init_memory, self.random_state)
            projected_hidden, _, contexts, attention = rval

            params = params + conditional_params

            # once again, need to use same params for sample gen
            lh_params = init_linear_layer(hidden_sizes[-1], output_size,
                                          self.random_state)
            logit_hidden, _ = build_linear_layer_from_params(lh_params,
                                                             projected_hidden)
            params = params + lh_params

            lo_params = init_linear_layer(output_size, output_size,
                                          self.random_state)
            logit_out, _ = build_linear_layer_from_params(lo_params,
                                                          y_sym)
            params = params + lo_params

            lc_params = init_linear_layer(sz, output_size,
                                          self.random_state)
            logit_contexts, _ = build_linear_layer_from_params(lc_params,
                                                               contexts)
            params = params + lc_params

            logit = T.tanh(logit_hidden + logit_out + logit_contexts)
            output_params = init_linear_layer(output_size, output_size,
                                              self.random_state)
            output, _ = build_linear_layer_from_params(output_params,
                                                       logit)
            params = params + output_params

            shp = output.shape
            output = output.reshape([shp[0] * shp[1], shp[2]])
            y_hat_sym = T.nnet.softmax(output)

            # Apply the mask so that padded positions do not contribute
            # to the cost
            y_sym_reshaped = (y_sym * y_mask.dimshuffle(0, 1, 'x')).reshape(
                [shp[0] * shp[1], shp[2]])
            cost = -T.mean((y_sym_reshaped * T.log(y_hat_sym)).sum(axis=1))

            # Finish sampler
            logit_sampler_hidden, _ = build_linear_layer_from_params(lh_params,
                                                                     next_state)
            logit_sampler_out, _ = build_linear_layer_from_params(lo_params,
                                                                  y_sw_sampler)
            logit_sampler_contexts, _ = build_linear_layer_from_params(
                lc_params, sampler_contexts)
            logit_sampler = T.tanh(logit_sampler_hidden + logit_sampler_out
                                   + logit_sampler_contexts)
            output_sampler, _ = build_linear_layer_from_params(output_params,
                                                               logit_sampler)
            shp = output_sampler.shape
            output_sampler = output_sampler.reshape([shp[0] * shp[1], shp[2]])
            y_hat_sampler = T.nnet.softmax(output_sampler)
            self._sampler_step = theano.function(
                [y_sw_sampler, context, X_mask, init_state_sampler,
                 init_memory_sampler],
                [y_hat_sampler, next_state, next_memory])

        else:
            raise ValueError("Value of %s not a valid cost!"
                             % self.cost)

        self.params_ = params

        if self.learning_alg == "sgd":
            updates = self.get_clip_sgd_updates(
                X_sym, y_sym, params, cost, self.learning_rate, self.momentum)
        elif self.learning_alg == "rmsprop":
            updates = self.get_clip_rmsprop_updates(
                X_sym, y_sym, params, cost, self.learning_rate, self.momentum)
        elif self.learning_alg == "sfg":
            updates = self.get_sfg_updates(
                X_sym, y_sym, params, cost, self.learning_rate, self.momentum)
        else:
            raise ValueError("Value of %s not a valid learning_alg!"
                             % self.learning_alg)

        if self.cost == "softmax":
            self.fit_function = theano.function(inputs=[X_sym, y_sym, X_mask,
                                                        y_mask],
                                                outputs=cost,
                                                updates=updates,
                                                on_unused_input="ignore")

            self.loss_function = theano.function(inputs=[X_sym, y_sym, X_mask,
                                                        y_mask],
                                                outputs=cost,
                                                on_unused_input="ignore")

            self.predict_function = theano.function(
                inputs=[X_sym, X_mask],
                outputs=y_hat_sym,
                on_unused_input="ignore")

        else:
            self.fit_function = theano.function(inputs=[X_sym, y_sym, X_mask,
                                                        y_mask],
                                                outputs=cost,
                                                updates=updates,
                                                on_unused_input="warn")

            self.loss_function = theano.function(inputs=[X_sym, y_sym, X_mask,
                                                        y_mask],
                                                outputs=cost,
                                                on_unused_input="warn")

            self.predict_function = theano.function(
                inputs=[X_sym, X_mask, y_sym, y_mask],
                outputs=y_hat_sym)
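
# A hedged sketch of driving the encdec sampler pieces compiled above
# (_encode and _sampler_step); `model` stands for an assumed instance of the
# enclosing class and all shapes are illustrative, not from the original.
import numpy as np

def sample_greedy(model, X, X_mask, output_size, max_len=20):
    state, memory, context = model._encode(X, X_mask)
    # one-hot "previous output", all zeros at the first step
    y_prev = np.zeros((1, X.shape[1], output_size), dtype='float32')
    outputs = []
    for _ in range(max_len):
        y_hat, state, memory = model._sampler_step(y_prev, context, X_mask,
                                                   state, memory)
        idx = y_hat.argmax(axis=-1)  # (batch,) greedy label picks
        y_prev = np.eye(output_size, dtype='float32')[idx][None]
        outputs.append(idx)
    return outputs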
Example #58
def build_sampler(tparams, options, trng, use_noise):
    x = tensor.matrix('x', dtype='int64')
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])

    # encoder
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder')
    ctx = proj[0][-1]
    ctx_mean = ctx
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    print 'Building f_init...',
    outs = [init_state, ctx]
    f_init = theano.function([x], outs, name='f_init', profile=profile)
    print 'Done'

    # y: 1 x 1
    y = tensor.vector('y_sampler', dtype='int64')
    init_state = tensor.matrix('init_state', dtype='float32')

    # if it's the first word, emb should be all zero
    emb = tensor.switch(y[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
                        tparams['Wemb_dec'][y])

    # apply one step of gru layer
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=None,
                                            context=ctx,
                                            one_step=True,
                                            init_state=init_state)
    next_state = proj
    ctxs = ctx

    # compute the output probability dist and sample
    logit_lstm = get_layer('ff')[1](tparams,
                                    next_state,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    print 'Building f_next..',
    inps = [y, ctx, init_state]
    outs = [next_probs, next_sample, next_state]
    f_next = theano.function(inps, outs, name='f_next', profile=profile)
    print 'Done'

    return f_init, f_next
Example #59
def build_tanh_layer_from_params(params, input_variable):
    W, b = params
    output_variable = T.tanh(T.dot(input_variable, W) + b)
    return output_variable, params
Example #60
    def __init__(self, rng, input, input_u, input_p, mask, n_wordin, n_usrin, n_prdin, n_out, name, prefix=None):
        self.input = input
        self.inputu = input_u
        self.inputp = input_p

        if prefix is None:
            W_values = numpy.asarray(                                              
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_wordin + n_out)),
                    high=numpy.sqrt(6. / (n_wordin + n_out)),
                    size=(n_wordin, n_out)
                ),
                dtype=numpy.float32
            )
            W = theano.shared(value=W_values, name='W', borrow=True)

            '''
            v_values = numpy.zeros((n_out,), dtype=theano.config.floatX)            
            v = theano.shared(value=v_values, name='v', borrow=True)
            '''
            v_values = numpy.asarray(
                rng.normal(scale=0.1, size=(n_out,)),
                dtype=numpy.float32
            )
            v = theano.shared(value=v_values, name='v', borrow=True)
            
            Wu_values = numpy.asarray(                                              
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_usrin + n_out)),
                    high=numpy.sqrt(6. / (n_usrin + n_out)),
                    size=(n_usrin, n_out)
                ),
                dtype=numpy.float32
            )
            Wu = theano.shared(value=Wu_values, name='Wu', borrow=True)
            
            Wp_values = numpy.asarray(                                              
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_prdin + n_out)),
                    high=numpy.sqrt(6. / (n_prdin + n_out)),
                    size=(n_prdin, n_out)
                ),
                dtype=numpy.float32
            )
            Wp = theano.shared(value=Wp_values, name='Wp', borrow=True)
            
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)
 
        else:
            f = open(prefix + name + '.save', 'rb')
            W = pickle.load(f)
            v = pickle.load(f)
            Wu = pickle.load(f)
            Wp = pickle.load(f)
            b = pickle.load(f)
            f.close()

        self.W = W
        self.v = v
        self.Wu = Wu
        self.Wp = Wp
        self.b = b

        attenu = T.dot(input_u, self.Wu)
        attenp = T.dot(input_p, self.Wp)

        atten = T.tanh(T.dot(input, self.W) + attenu + attenp + self.b)
        atten = T.sum(atten * self.v, axis=2, acc_dtype='float32')
        atten = softmask(atten.dimshuffle(1, 0),
                         mask.dimshuffle(1, 0)).dimshuffle(1, 0)
        output = atten.dimshuffle(0, 1, 'x') * input
        self.output = T.sum(output, axis=0, acc_dtype='float32')

        self.params = [self.W, self.v, self.Wu, self.Wp, self.b]
        self.name = name
        self.atten = atten
        self.mask = mask
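
# The layer above relies on a softmask helper that is not shown; a plausible
# implementation (an assumption, not the original) is a numerically
# stabilized softmax that zeroes out padded positions before normalizing.
import theano.tensor as T

def softmask(x, mask):
    # x, mask: (batch, time); mask is 1.0 for real tokens, 0.0 for padding
    x = x - x.max(axis=1, keepdims=True)  # stabilize the exponent
    e = T.exp(x) * mask                   # remove padded positions
    return e / (e.sum(axis=1, keepdims=True) + 1e-8)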