Code Example #1
File: cnn.py Project: afcarl/deep-srl-system
    def convolution_max(self, h, n_prds):
        """
        :param h: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
        :return: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
        """

        # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
        m = h.reshape((h.shape[0], h.shape[1] / n_prds, n_prds, h.shape[2]))
        # 1D: batch_size, 2D: n_prds * n_words, 3D: n_h
        m = m.dimshuffle((1, 2, 0, 3))
        m = m.reshape((m.shape[0], m.shape[1] * m.shape[2], m.shape[3]))

        # 1D: batch_size, 2D: n_h
        h_m = T.max(relu(T.dot(m, self.W_m)), axis=1)
        # 1D: batch_size, 2D: n_h
        h_m = h_m.reshape((h_m.shape[0], -1)).dimshuffle((0, 'x', 'x', 1))
        # 1D: batch_size, 2D: n_prds, 3D: n_h
        h_m = T.repeat(h_m, n_prds, 1)
        # 1D: batch_size, 2D: n_prds, 3D: n_words, 4D: n_h
        h_m = T.repeat(h_m, h.shape[0], 2)
        # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
        h_m = h_m.dimshuffle((2, 0, 1, 3))
        # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
        h_m = h_m.reshape((h_m.shape[0], h_m.shape[1] * h_m.shape[2], h_m.shape[3]))

        return relu(T.dot(T.concatenate([h, h_m], axis=2), self.W_c))
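Note: the snippets on this page call relu without showing its definition. A minimal sketch of what such a helper could look like for the Theano-based examples (an assumption; each project may define it differently, e.g. with a leaky slope):

import theano.tensor as T

def relu(x):
    # Elementwise rectified linear unit on a symbolic tensor: max(0, x).
    return T.maximum(0., x)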
Code Example #2
def gru_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = GRU(n_i=n_fin, n_h=n_h)
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))
            # h0: 1D: Batch, 2D: n_h
            h0 = T.zeros((batch, n_h), dtype=theano.config.floatX)
        else:
            layer = GRU(n_i=n_h * 2, n_h=n_h)
            # h: 1D: n_words, 2D: Batch, 3D: n_h
            layer_input = relu(
                T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]
            h0 = layer_input[0]

        xr = T.dot(layer_input, layer.W_xr)
        xz = T.dot(layer_input, layer.W_xz)
        xh = T.dot(layer_input, layer.W_xh)

        h, _ = theano.scan(fn=layer.forward,
                           sequences=[xr, xz, xh],
                           outputs_info=[h0])
        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit
Code Example #3
def gru_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = GRU(n_i=n_fin, n_h=n_h)
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))
            # h0: 1D: Batch, 2D: n_h
            h0 = T.zeros((batch, n_h), dtype=theano.config.floatX)
        else:
            layer = GRU(n_i=n_h * 2, n_h=n_h)
            # h: 1D: n_words, 2D: Batch, 3D: n_h
            layer_input = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]
            h0 = layer_input[0]

        xr = T.dot(layer_input, layer.W_xr)
        xz = T.dot(layer_input, layer.W_xz)
        xh = T.dot(layer_input, layer.W_xh)

        h, _ = theano.scan(fn=layer.forward, sequences=[xr, xz, xh], outputs_info=[h0])
        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit
Code Example #4
File: cnn.py Project: afcarl/deep-srl-system
    def convolution2(self, h, n_prds):
        """
        :param h: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
        :return: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
        """

        # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
        m = h.reshape((h.shape[0], h.shape[1] / n_prds, n_prds, h.shape[2]))
        # 1D: n_words, 2D: batch_size, 3D: n_h
        h_m = self.pooling(relu(T.dot(m, self.W_m)), axis=2)
        # 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
        h_m = T.repeat(h_m, n_prds, 1)

        return relu(T.dot(T.concatenate([h, h_m], axis=2), self.W_c))
Code Example #5
File: attention.py Project: hjpwhu/sirnn
    def forward(self, A, a_res):
        """
        :param A: 1D: batch, 2D: n_agents, 3D: dim_h
        :param a_res: 1D: batch, 2D: dim_h
        :return: h: 1D: batch, 2D: dim_h
        """

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M = tanh(
            T.dot(A, self.W1_c) +
            T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

        # 1D: batch, 2D: n_agents
        u = T.dot(M, self.w)

        # 1D: batch, 2D: n_agents, 3D: 1
        alpha = T.nnet.softmax(u)
        alpha = alpha.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: dim_h
        r = T.sum(A * alpha, axis=1)

        # 1D: batch, 2D: dim_h
        h = relu(T.dot(r, self.W2_r))
        return h
Code Example #6
def linear_activation_forward(A_prev, W, b, activation):
    '''
    Implements forward propagation.

    Arguments:
    A_prev -- activation from previous layers
    W -- weight matrix
    b -- bias matrix
    activation -- the activation used in this layer, 'sigmoid' or 'relu'

    Returns:
    A -- the output activation 
    cache -- a python dictionary containing 'linear_cache' and 'activation_cache'
    '''
    
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == 'relu':
        A, activation_cache = nn_utils.relu(Z)
    
    elif activation == 'sigmoid':
        A, activation_cache = nn_utils.sigmoid(Z)

    cache = (linear_cache, activation_cache)

    return A, cache
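The NumPy-based examples here and below rely on nn_utils-style helpers that return both the activation and a cache, plus a linear_forward step. A minimal sketch under that assumption (the actual nn_utils module may differ):

import numpy as np

def sigmoid(Z):
    # Elementwise sigmoid; Z is returned as the cache for the backward pass.
    return 1.0 / (1.0 + np.exp(-Z)), Z

def relu(Z):
    # Elementwise ReLU; Z is returned as the cache for the backward pass.
    return np.maximum(0, Z), Z

def linear_forward(A_prev, W, b):
    # Affine step Z = W . A_prev + b; the inputs are cached for the backward pass.
    Z = np.dot(W, A_prev) + b
    return Z, (A_prev, W, b)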
Code Example #7
def linear_activation_forward(A_prev, W, b, activation):
    '''
    Implements the forward propagation for the Linear->Activation layer.

    Arguments:
    A_prev -- activation from previous layer(or input data)
    W -- weight matrix
    b -- bias matrix
    activation -- the activation used in this layer, 'relu' or 'sigmoid'

    Returns:
    A -- the output of activation function
    cache -- tuple containing 'linear_cache' and 'activation_cache',
             stored for computing backward pass efficiently
    '''

    Z, linear_cache = linear_forward(A_prev, W, b)
    ## calling linear_forward function
    ## to get the value of Z and linear cache

    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
        ## calling sigmoid function defined in nn_utils

    elif activation == 'relu':
        A, activation_cache = relu(Z)
        ## calling relu function defined in nn_utils

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    ## assertion for checking the shape of A

    cache = (linear_cache, activation_cache)
    return A, cache
Code Example #8
 def forward(self, x, h):
     """
     :param x: 1D: n_words, 2D: batch, 3D: dim_h
     :param h: 1D: n_words, 2D: batch, 3D: dim_h
     :return: 1D: n_words, 2D: batch, 3D: n_labels
     """
     return relu(T.dot(T.concatenate([x, h], 2), self.W))
Code Example #9
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python dictionary containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """

    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = nn_utils.sigmoid(Z)

    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = nn_utils.relu(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache
Code Example #10
def lstm_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = LSTM(n_i=n_fin, n_h=n_h)
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))  # x: 1D: Batch, 2D: n_words, 3D: n_fin
            h0 = layer.h0 * T.ones((batch, n_h))  # h0: 1D: Batch, 2D: n_h
            c0 = layer.c0 * T.ones((batch, n_h))  # c0: 1D: Batch, 2D: n_h
        else:
            layer = LSTM(n_i=n_h * 2, n_h=n_h)
            layer_input = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]  # h: 1D: n_words, 2D: Batch, 3D: n_h
            h0 = layer_input[0]
            c0 = c[-1]

        xi = T.dot(layer_input, layer.W_xi)  # layer_input: 1D: n_words, 2D: Batch, 3D: n_fin
        xf = T.dot(layer_input, layer.W_xf)
        xc = T.dot(layer_input, layer.W_xc)
        xo = T.dot(layer_input, layer.W_xo)

        [h, c], _ = theano.scan(fn=layer.forward,
                                sequences=[xi, xf, xc, xo],
                                outputs_info=[h0, c0])

        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit
Code Example #11
File: nn_word.py Project: afcarl/neural-pos-tagger
    def __init__(self, name, x, y, lr, init_emb, vocab_size, emb_dim,
                 hidden_dim, output_dim, window, opt):

        assert window % 2 == 1, 'Window size must be odd'
        """ input """
        self.name = name
        self.x = x
        self.y = y
        self.lr = lr
        self.input = [self.x, self.y, self.lr]

        n_words = x.shape[0]
        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(
            sample_weights(hidden_dim, 1, window, emb_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim, 1))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]
        """ pad """
        self.zero = theano.shared(
            np.zeros(shape=(1, 1, window / 2, emb_dim),
                     dtype=theano.config.floatX))
        """ look up embedding """
        self.x_emb = self.emb[self.x]  # x_emb: 1D: n_words, 2D: n_emb
        """ convolution """
        self.x_in = self.conv(self.x_emb)
        """ feed-forward computation """
        self.h = relu(
            self.x_in.reshape((self.x_in.shape[1], self.x_in.shape[2])) +
            T.repeat(self.b_in, T.cast(self.x_in.shape[2], 'int32'), 1)).T
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)
        """ prediction """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)
        """ cost function """
        self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb,
                               self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb,
                                    self.x_emb, self.x, self.lr)
Code Example #12
File: nn_word.py Project: hiroki13/neural-pos-tagger
    def __init__(self, name, x, y, lr, init_emb, vocab_size, emb_dim, hidden_dim, output_dim, window, opt):

        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.name = name
        self.x = x
        self.y = y
        self.lr = lr
        self.input = [self.x, self.y, self.lr]

        n_words = x.shape[0]

        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(sample_weights(hidden_dim, 1, window, emb_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim, 1))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

        """ pad """
        self.zero = theano.shared(np.zeros(shape=(1, 1, window / 2, emb_dim), dtype=theano.config.floatX))

        """ look up embedding """
        self.x_emb = self.emb[self.x]  # x_emb: 1D: n_words, 2D: n_emb

        """ convolution """
        self.x_in = self.conv(self.x_emb)

        """ feed-forward computation """
        self.h = relu(self.x_in.reshape((self.x_in.shape[1], self.x_in.shape[2])) + T.repeat(self.b_in, T.cast(self.x_in.shape[2], 'int32'), 1)).T
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ prediction """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ cost function """
        self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
Code Example #13
File: cnn.py Project: afcarl/deep-srl-system
    def convolution(self, h, n_prds):
        """
        :param h: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
        :return: 1D: n_words, 2D: batch_size * n_prds, 3D: n_h
        """

        # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
        m = h.reshape((h.shape[0], h.shape[1] / n_prds, n_prds, h.shape[2]))
        m = m.dimshuffle((1, 2, 0, 3))
#        n_words = m.shape[2]

        # 1D: batch_size, 2D: n_prds * n_words, 3D: n_h
        m1 = m.reshape((m.shape[0], m.shape[1] * m.shape[2], m.shape[3]))
        m3 = m.dimshuffle((0, 2, 1, 3))

        # 1D: batch_size, 2D: n_h
        h_m1 = T.max(relu(T.dot(m1, self.W_m1)), axis=1)
        h_m1 = h_m1.dimshuffle((0, 'x', 'x', 1))
#        h_m1 = T.repeat(h_m1, n_prds, 1)
#        h_m1 = T.repeat(h_m1, n_words, 2)

        # 1D: batch_size, 2D: n_prds, 3D: n_h
        h_m2 = T.max(relu(T.dot(m, self.W_m2)), axis=2)
        h_m2 = h_m2.dimshuffle((0, 1, 'x', 2))
#        h_m2 = T.repeat(h_m2, n_words, 2)

        # 1D: batch_size, 2D: n_words, 3D: n_h
        h_m3 = T.max(relu(T.dot(m3, self.W_m3)), axis=2)
        h_m3 = h_m3.dimshuffle((0, 'x', 1, 2))
#        h_m3 = T.repeat(h_m3, n_prds, 1)

        # 1D: batch_size, 2D: n_prds, 3D: n_words, 4D: n_h
        h_m = h_m1 + h_m2 + h_m3
        # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
        h_m = h_m.dimshuffle((2, 0, 1, 3))
        # 1D: n_words, 2D: batch_size, 3D: n_prds, 4D: n_h
        h_m = h_m.reshape((h_m.shape[0], h_m.shape[1] * h_m.shape[2], h_m.shape[3]))

        return relu(T.dot(T.concatenate([h, h_m], axis=2), self.W_c))
Code Example #14
def lstm_layers(x, batch, n_fin, n_h, n_y, n_layers=1):
    params = []

    for i in xrange(n_layers):
        if i == 0:
            layer = LSTM(n_i=n_fin, n_h=n_h)
            layer_input = relu(T.dot(x.dimshuffle(1, 0, 2), layer.W))  # x: 1D: Batch, 2D: n_words, 3D: n_fin
            h0 = layer.h0 * T.ones((batch, n_h))  # h0: 1D: Batch, 2D: n_h
            c0 = layer.c0 * T.ones((batch, n_h))  # c0: 1D: Batch, 2D: n_h
        else:
            layer = LSTM(n_i=n_h * 2, n_h=n_h)
            layer_input = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))[::-1]  # h: 1D: n_words, 2D: Batch, 3D: n_h
            h0 = layer_input[0]
            c0 = c[-1]

        xi = T.dot(layer_input, layer.W_xi)  # layer_input: 1D: n_words, 2D: Batch, 3D: n_fin
        xf = T.dot(layer_input, layer.W_xf)
        xc = T.dot(layer_input, layer.W_xc)
        xo = T.dot(layer_input, layer.W_xo)

        [h, c], _ = theano.scan(fn=layer.forward,
                                sequences=[xi, xf, xc, xo],
                                outputs_info=[h0, c0])

        params.extend(layer.params)

    layer = CRF(n_i=n_h * 2, n_h=n_y)
    params.extend(layer.params)
    h = relu(T.dot(T.concatenate([layer_input, h], 2), layer.W))

    if n_layers % 2 == 0:
        emit = h[::-1]
    else:
        emit = h

    return params, layer, emit
Code Example #15
    def __init__(self, x, y, n_words, batch_size, lr, init_emb, vocab_size,
                 emb_dim, hidden_dim, output_dim, window, opt):
        assert window % 2 == 1, 'Window size must be odd'
        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr
        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(sample_weights(emb_dim * window, hidden_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]
        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # x_emb: 1D: batch_size * n_words * window, 2D: emb_dim
        """ forward """
        self.h = relu(T.dot(self.x_emb.reshape((batch_size * n_words, emb_dim * window)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)
        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)
        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words),
                                             self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb,
                               self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb,
                                    self.x_emb, self.x, self.lr)
Code Example #16
File: layers.py Project: afcarl/deep-srl-system
 def grid_propagate(self, h):
     """
     :param h: 1D: batch, 2D: n_prds, 3D: n_words, 4D: dim_h
     :return: 1D: batch, 2D: n_prds, 3D: n_words, 4D: dim_h
     """
     h0_lr = T.zeros((h.shape[0], h.shape[1], h.shape[3]),
                     dtype=theano.config.floatX)
     h0_ud = T.zeros((h.shape[0], h.shape[2], h.shape[3]),
                     dtype=theano.config.floatX)
     for i in xrange(0, self.depth):
         h_lr = self.forward_lr(self.layers[i * 3], h, h0_lr)
         h_ud = self.forward_ud(self.layers[(i * 3) + 1], h, h0_ud)
         h = h + relu(self.dot(self.layers[(i * 3) + 2], h_lr, h_ud))
         h = self.flip(h)
     if (self.depth % 2) == 1:
         h = self.flip(h)
     return h
Code Example #17
    def __init__(self, x, y, n_words, batch_size, lr, init_emb, vocab_size, emb_dim, hidden_dim, output_dim, window, opt):
        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(sample_weights(emb_dim * window, hidden_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # x_emb: 1D: batch_size * n_words * window, 2D: emb_dim

        """ forward """
        self.h = relu(T.dot(self.x_emb.reshape((batch_size * n_words, emb_dim * window)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
Code Example #18
File: cnn.py Project: afcarl/deep-srl-system
def layers(x, window, dim_emb, dim_hidden, n_layers, activation=tanh):
    params = []
    zero = T.zeros((1, dim_emb * window), dtype=theano.config.floatX)

    def zero_pad_gate(matrix):
        return T.neq(T.sum(T.eq(matrix, zero), 1, keepdims=True), dim_emb * window)

    for i in xrange(n_layers):
        if i == 0:
            W = theano.shared(sample_weights(dim_emb * window, dim_hidden))
#            h = zero_pad_gate(x) * relu(T.dot(x, W))
            h = relu(T.dot(x, W))
        else:
            W = theano.shared(sample_weights(dim_hidden, dim_hidden))
            h = activation(T.dot(h, W))
        params.append(W)

    return h, params
Code Example #19
    def forward_one_layer(A_previous, W, b, activation_function):
        """
            Forwarding only 1 layer ahead
            W.shape = (l, l-1)
            Z, A.shape = (l, m); A_prev.shape = (l-1, m)
            Returns A and [cache = A_prev, W, b, Z]
        """
        Z = np.dot(W, A_previous) + b  # W.shape: (l, l-1) / A_previous.shape: (l-1, m)

        A = None
        if activation_function == "sigmoid":
            A = nn_utils.sigmoid(Z)
        elif activation_function == "relu":
            A = nn_utils.relu(Z)

        cache = A_previous, W, b, Z

        return A, cache
Code Example #20
File: attention.py Project: hjpwhu/sirnn
    def forward_bi(self, A, a_res):
        """
        :param A: 1D: batch, 2D: n_agents, 3D: dim_h
        :param a_res: 1D: batch, 2D: dim_h
        :return: h: 1D: batch, 2D: dim_h
        """

        # 1D: batch, 2D: n_agents
        M = T.sum(T.dot(A, self.W1_c) * a_res.dimshuffle(0, 'x', 1), axis=2)

        # 1D: batch, 2D: n_agents, 3D: 1
        alpha = T.nnet.softmax(M)
        alpha = alpha.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: dim_h
        r = T.sum(A * alpha, axis=1)

        # 1D: batch, 2D: dim_h
        h = relu(T.dot(r, self.W2_r))
        return h
Code Example #21
File: attention.py Project: hjpwhu/sirnn
    def forward_second_order(self, A, a_res):
        """
        :param A: 1D: batch, 2D: n_agents, 3D: dim_h
        :param a_res: 1D: batch, 2D: dim_h
        :return: h: 1D: batch, 2D: dim_h
        """

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M = tanh(
            T.dot(A, self.W1_c) +
            T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

        # 1D: batch, 2D: n_agents
        M_a = T.dot(M, self.w)

        # 1D: batch, 2D: n_agents, 3D: 1
        alpha = T.nnet.softmax(M_a)
        alpha = alpha.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: dim_h
        r_a = T.sum(A * alpha, axis=1)

        # 1D: n_agents, 2D: dim_h
        w = self.w.dimshuffle(('x', 'x', 0))
        w = T.repeat(w, M_a.shape[1], axis=1)

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M_a = M_a.dimshuffle((0, 1, 'x'))
        M_a = T.repeat(M_a, M.shape[2], axis=2)
        M_b = M - T.sum(M_a * w) / self.w.norm(2)

        beta = T.nnet.softmax(T.dot(M_b, self.w_b))
        beta = beta.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: dim_h
        r_b = T.sum(A * beta, axis=1)

        # 1D: batch, 2D: dim_h
        h = relu(T.dot(T.concatenate([r_a, r_b], axis=1), self.W2_r))
        return h
Code Example #22
    def forward_one_layer(self, A_previous, W, b, activation_function):
        """
            Forwarding only 1 layer ahead
            W.shape = (nl, nl-1)
            Z, A.shape = (nl, m); A_prev.shape = (nl-1, m)
            Returns A and [cache = A_prev, W, b, Z]
        """

        print("--------------------")
        print(A_previous.shape)
        print(W.shape)
        print("--------------------")

        Z = np.dot(W, A_previous) + b

        if activation_function == "sigmoid":
            A = nn_utils.sigmoid(Z)
        elif activation_function == "relu":
            A = nn_utils.relu(Z)

        cache = A_previous, W, b, Z

        return A, cache
Code Example #23
File: attention.py Project: hjpwhu/sirnn
    def forward_double(self, A, a_res):
        """
        :param A: 1D: batch, 2D: n_agents, 3D: dim_h
        :param a_res: 1D: batch, 2D: dim_h
        :return: h: 1D: batch, 2D: dim_h
        """

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M = tanh(
            T.dot(A, self.W1_c) +
            T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

        # 1D: batch, 2D: n_agents
        M_a = T.dot(M, self.w)

        # 1D: batch, 2D: dim_h
        M_b = T.max(M, axis=1)
        M_b = T.dot(M_b, self.W_m)

        # 1D: batch, 2D: n_agents, 3D: 1
        alpha = T.nnet.softmax(M_a)
        alpha = alpha.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: 1, 3D: dim_h
        beta = T.nnet.softmax(M_b)
        beta = beta.dimshuffle((0, 'x', 1))

        # 1D: batch, 2D: n_agents, 3D: dim_h
        #        gamma = - (T.log(alpha) + T.log(beta))
        gamma = alpha * beta

        # 1D: batch, 2D: dim_h
        r = T.sum(A * gamma, axis=1)

        # 1D: batch, 2D: dim_h
        h = relu(T.dot(r, self.W2_r))
        return h
Code Example #24
    def __init__(self, x, c, y, n_words, batch_size, lr, init_emb, vocab_w_size, w_emb_dim, w_hidden_dim,
                 c_emb_dim, c_hidden_dim, output_dim, vocab_c_size, window, opt):
        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        n_phi = (w_emb_dim + c_hidden_dim) * window
        max_len_char = T.cast(self.c.shape[2], 'int32')

        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.pad = build_shared_zeros((1, c_emb_dim))
        self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
        self.emb_c = T.concatenate([self.pad, self.e_c], 0)

        self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
        self.W_c = theano.shared(sample_weights(c_emb_dim * window, c_hidden_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.e_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c, self.b_y]

        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
        self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
        self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))

        """ convolution """
        self.c_phi = T.max(T.dot(self.c_emb.reshape((batch_size * n_words, window, max_len_char, -1)), self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
        self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)

        """ forward """
        self.h = relu(T.dot(self.x_phi.reshape((batch_size * n_words, n_phi)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
Code Example #25
    def __init__(self, x, c, y, n_words, batch_size, lr, init_emb,
                 vocab_w_size, w_emb_dim, w_hidden_dim, c_emb_dim,
                 c_hidden_dim, output_dim, vocab_c_size, window, opt):
        assert window % 2 == 1, 'Window size must be odd'
        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        n_phi = (w_emb_dim + c_hidden_dim) * window
        max_len_char = T.cast(self.c.shape[2], 'int32')
        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.pad = build_shared_zeros((1, c_emb_dim))
        self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
        self.emb_c = T.concatenate([self.pad, self.e_c], 0)

        self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
        self.W_c = theano.shared(
            sample_weights(c_emb_dim * window, c_hidden_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [
            self.e_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c,
            self.b_y
        ]
        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
        self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
        self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))
        """ convolution """
        self.c_phi = T.max(
            T.dot(
                self.c_emb.reshape(
                    (batch_size * n_words, window, max_len_char, -1)),
                self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
        self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)
        """ forward """
        self.h = relu(T.dot(self.x_phi.reshape((batch_size * n_words, n_phi)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)
        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)
        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words),
                                             self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb,
                               self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb,
                                    self.x_emb, self.x, self.lr)
Code Example #26
 def dot(self, x):
     return relu(T.dot(x, self.W))
Code Example #27
    def __init__(self, x_span, x_word, x_ctx, x_dist, y, init_emb, n_vocab, dim_w, dim_d, dim_h, L2_reg):
        """
        :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
        :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
        :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
        :param x_dist: 1D: batch; elem=distance between sentences of ant and ment
        :param y     : 1D: batch
        """

        self.input  = [x_span, x_word, x_ctx, x_dist, y]
        self.x_span = x_span
        self.x_word = x_word
        self.x_ctx  = x_ctx
        self.x_dist = x_dist
        self.y      = y

        dim_x = dim_w * (2 + 4 + 20) + 1
        batch = y.shape[0]

        """ Params """
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab, dim_w))
        else:
            self.emb = theano.shared(init_emb)

        self.W_d = theano.shared(sample_weights(dim_d))
        self.W_i = theano.shared(sample_weights(dim_x, dim_h*3))
        self.W_h = theano.shared(sample_weights(dim_h*3, dim_h))
        self.W_o = theano.shared(sample_weights(dim_h))
        self.params = [self.W_d, self.W_i, self.W_h, self.W_o]

        """ Input Layer """
        x_s = self.emb[x_span]     # 1D: batch, 2D: limit * 2,      3D: dim_w
        x_w = self.emb[x_word]     # 1D: batch, 2D: 4,              3D: dim_w
        x_c = self.emb[x_ctx]      # 1D: batch, 2D: window * 2 * 2, 3D: dim_w
        x_d = self.W_d[x_dist]     # 1D: batch
        x_s_avg = T.concatenate([T.mean(x_s[:, :x_s.shape[1]/2], 1), T.mean(x_s[:, x_s.shape[1]/2:], 1)], 1)
        x = T.concatenate([x_s_avg, x_w.reshape((batch, -1)), x_c.reshape((batch, -1)), x_d.reshape((batch, 1))], 1)

        """ Intermediate Layers """
        h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h
        h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

        """ Output Layer """
        p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

        """ Predicts """
        self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], dtype=theano.config.floatX))
        self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
        self.y_hat_index = T.argmax(p_y)
        self.p_y_hat = p_y[self.y_hat_index]

        """ Cost Function """
        self.nll = - T.sum(y * T.log(p_y) + (1. - y) * T.log((1. - p_y)))  # TODO: ranking criterion
        self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

        """ Update """
        self.grad = T.grad(self.cost, self.params)
        self.updates = adam(self.params, self.grad)

        """ Check Results """
        self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
        self.total_p = T.sum(self.y_hat, 0)
        self.total_r = T.sum(y, keepdims=True)
        self.correct = T.sum(self.result, 0)
        self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))
Code Example #28
    def __init__(self, name, w, c, b, y, lr,
                 init_w_emb, vocab_w_size, vocab_c_size,
                 w_emb_dim, c_emb_dim, w_hidden_dim, c_hidden_dim, output_dim,
                 window, opt):

        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.name = name
        self.w = w
        self.c = c
        self.b = b
        self.y = y
        self.lr = lr
        self.input = [self.w, self.c, self.b, self.y, self.lr]

        n_phi = w_emb_dim + c_emb_dim * window
        n_words = w.shape[0]

        """ params """
        if init_w_emb is not None:
            self.emb = theano.shared(init_w_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.emb_c = theano.shared(sample_norm_dist(vocab_c_size, c_emb_dim))
        self.W_in = theano.shared(sample_weights(w_hidden_dim, 1, window, n_phi))
        self.W_c = theano.shared(sample_weights(c_hidden_dim, 1, window, c_emb_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim, 1))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        """ pad """
        self.zero = theano.shared(np.zeros(shape=(1, 1, window / 2, n_phi), dtype=theano.config.floatX))
        self.zero_c = theano.shared(np.zeros(shape=(1, 1, window / 2, c_emb_dim), dtype=theano.config.floatX))

        self.params = [self.emb_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c, self.b_y]

        """ look up embedding """
        x_emb = self.emb[self.w]  # x_emb: 1D: n_words, 2D: w_emb_dim
        c_emb = self.emb_c[self.c]  # c_emb: 1D: n_chars, 2D: c_emb_dim

        """ create feature """
        c_phi = self.create_char_feature(self.b, c_emb, self.zero_c) + self.b_c  # 1D: n_words, 2D: c_hidden_dim(50)
        x_phi = T.concatenate([x_emb, c_phi], axis=1)  # 1D: n_words, 2D: w_emb_dim(100) + c_hidden_dim(50)

        """ convolution """
        x_padded = T.concatenate([self.zero, x_phi.reshape((1, 1, x_phi.shape[0], x_phi.shape[1])), self.zero], axis=2)  # x_padded: 1D: n_words + n_pad, 2D: n_phi
        x_in = conv2d(input=x_padded, filters=self.W_in)  # 1D: 1, 2D: w_hidden_dim(300), 3D: n_words, 4D: 1

        """ feed-forward computation """
        h = relu(x_in.reshape((x_in.shape[1], x_in.shape[2])) + T.repeat(self.b_in, T.cast(x_in.shape[2], 'int32'), 1)).T
        self.o = T.dot(h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ prediction """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ cost function """
        self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, x_emb, self.w, self.lr)
Code Example #29
 def dot(self, x):
     return relu(T.dot(x, self.W))
Code Example #30
    def __init__(self, x_span, x_word, x_ctx, x_dist, x_slen, y, init_emb, n_vocab, dim_w, dim_d, dim_h, L2_reg):
        """
        :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
        :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
        :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
        :param x_dist: 1D: batch; 2D: 2; elem=[sent dist, ment dist]
        :param x_slen: 1D: batch; 2D: 3; elem=[m_span_len, a_span_len, head_match]
        :param y     : 1D: batch
        """

        self.input  = [x_span, x_word, x_ctx, x_dist, y]
        self.x_span = x_span
        self.x_word = x_word
        self.x_ctx  = x_ctx
        self.x_dist = x_dist
        self.x_slen = x_slen
        self.y      = y

        dim_x = dim_w * (10 + 4 + 4 + 2 + 3)
        batch = y.shape[0]

        """ Params """
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab, dim_w))
        else:
            self.emb = theano.shared(init_emb)

        self.W_d = theano.shared(sample_weights(dim_d, dim_w))
        self.W_l = theano.shared(sample_weights(7, dim_w))
        self.W_i = theano.shared(sample_weights(dim_x, dim_h))
        self.W_h = theano.shared(sample_weights(dim_h, dim_h))
        self.W_o = theano.shared(sample_weights(dim_h))
        self.params = [self.W_d, self.W_l, self.W_i, self.W_h, self.W_o]

        """ Input Layer """
        x_vec = T.concatenate([x_span, x_word, x_ctx], 1).flatten()  # 1D: batch * (limit * 2 + 4 + 20)
        x_in = self.emb[x_vec]     # 1D: batch, 2D: limit * 2, 3D: dim_w
        x_d = self.W_d[x_dist]     # 1D: batch, 2D: 2, 3D: dim_w
        x_l = self.W_l[x_slen]     # 1D: batch, 2D: 2, 3D: dim_w
        x = T.concatenate([x_in.reshape((batch, -1)), x_d.reshape((batch, -1)), x_l.reshape((batch, -1))], 1)

        """ Intermediate Layers """
        h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h
        h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

        """ Output Layer """
        p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

        """ Cost Function """
        self.nll = - T.sum(y * T.log(p_y) + (1. - y) * T.log((1. - p_y)))  # TODO: ranking criterion
        self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

        """ Update """
        self.updates = sgd(self.cost, self.params, self.emb, x_in)

        """ Predicts """
        self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], dtype=theano.config.floatX))
        self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
        self.y_hat_index = T.argmax(p_y)
        self.p_y_hat = p_y[self.y_hat_index]

        """ Check Results """
        self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
        self.total_p = T.sum(self.y_hat, 0)
        self.total_r = T.sum(y, keepdims=True)
        self.correct = T.sum(self.result, 0)
        self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))
Code Example #31
    def __init__(self, x_span, x_word, x_ctx, x_dist, y, init_emb, n_vocab,
                 dim_w, dim_d, dim_h, L2_reg):
        """
        :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
        :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
        :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
        :param x_dist: 1D: batch; elem=distance between sentences of ant and ment
        :param y     : 1D: batch
        """

        self.input = [x_span, x_word, x_ctx, x_dist, y]
        self.x_span = x_span
        self.x_word = x_word
        self.x_ctx = x_ctx
        self.x_dist = x_dist
        self.y = y

        dim_x = dim_w * (2 + 4 + 20) + 1
        batch = y.shape[0]
        """ Params """
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab, dim_w))
        else:
            self.emb = theano.shared(init_emb)

        self.W_d = theano.shared(sample_weights(dim_d))
        self.W_i = theano.shared(sample_weights(dim_x, dim_h * 3))
        self.W_h = theano.shared(sample_weights(dim_h * 3, dim_h))
        self.W_o = theano.shared(sample_weights(dim_h))
        self.params = [self.W_d, self.W_i, self.W_h, self.W_o]
        """ Input Layer """
        x_s = self.emb[x_span]  # 1D: batch, 2D: limit * 2,      3D: dim_w
        x_w = self.emb[x_word]  # 1D: batch, 2D: 4,              3D: dim_w
        x_c = self.emb[x_ctx]  # 1D: batch, 2D: window * 2 * 2, 3D: dim_w
        x_d = self.W_d[x_dist]  # 1D: batch
        x_s_avg = T.concatenate([
            T.mean(x_s[:, :x_s.shape[1] / 2], 1),
            T.mean(x_s[:, x_s.shape[1] / 2:], 1)
        ], 1)
        x = T.concatenate([
            x_s_avg,
            x_w.reshape((batch, -1)),
            x_c.reshape((batch, -1)),
            x_d.reshape((batch, 1))
        ], 1)
        """ Intermediate Layers """
        h1 = relu(T.dot(x, self.W_i))  # h1: 1D: batch, 2D: dim_h
        h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h
        """ Output Layer """
        p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch
        """ Predicts """
        self.thresholds = theano.shared(
            np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                       dtype=theano.config.floatX))
        self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
        self.y_hat_index = T.argmax(p_y)
        self.p_y_hat = p_y[self.y_hat_index]
        """ Cost Function """
        self.nll = -T.sum(y * T.log(p_y) + (1. - y) * T.log(
            (1. - p_y)))  # TODO: ranking criterion
        self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2
        """ Update """
        self.grad = T.grad(self.cost, self.params)
        self.updates = adam(self.params, self.grad)
        """ Check Results """
        self.result = T.eq(self.y_hat, y.reshape(
            (y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
        self.total_p = T.sum(self.y_hat, 0)
        self.total_r = T.sum(y, keepdims=True)
        self.correct = T.sum(self.result, 0)
        self.correct_t, self.correct_f = correct_tf(self.result,
                                                    y.reshape((y.shape[0], 1)))