Example #1
class InputModule(MergeLayer):
    # Input Module, which uses SemMemModule and GRULayer (lasagne)
    def __init__(self, incomings, voc_size, hid_state_size,
                 SemMem=None, GRU=None, **kwargs):
        super(InputModule, self).__init__(incomings, **kwargs)
        
        if SemMem is not None:
            self.SemMem = SemMem
        else:
            self.SemMem = SemMemModule(incomings[0], voc_size, hid_state_size, **kwargs)
        if GRU is not None:
            self.GRU = GRU
        else:
            self.GRU = GRULayer(self.SemMem, hid_state_size)
        self.voc_size = voc_size
        self.hid_state_size = hid_state_size

    def get_params(self, **tags):
        # Because InputModule uses an external GRULayer's parameters,
        # we have to expose them here so that the GRU's parameters get trained.
        return self.GRU.get_params(**tags)

    def get_output_shape_for(self, input_shape):
        return (None, None, self.hid_state_size)

    def get_output_for(self, inputs, **kwargs):
        # input with size (batch, sentences, words)
        input = inputs[0]
        # original size of input_word is (batch, sentences)
        # input_word with size (batch x sentences, ) after flatten
        input_word = T.flatten(inputs[1])
        word_dropout = inputs[2]
        
        # Apply word embedding
        # With size (batch x sentence, word, emb_dim)
        sentence_rep = self.SemMem.get_output_for([input, word_dropout])
        
        # Apply GRU Layer
        # 'gru_outs' with size (batch x sentence, word, hid_state_size)
        gru_outs = self.GRU.get_output_for([sentence_rep])
        
        # Extract candidate fact from GRU's output by input_word variable
        # resolving input with additional word
        # e.g. John went to the hallway nil nil nil -> [GRU1, ... ,GRU8] -> GRU5
        #
        # hid_extract with size (batch x sentence, hid_state_size)
        hid_extract = gru_outs[T.arange(gru_outs.shape[0], dtype='int16'), input_word - 1]

        # candidate_facts with size (batch, sentences, hid_state_size)
        candidate_facts = T.reshape(x=hid_extract, newshape=(-1, input.shape[1], self.hid_state_size))
        return candidate_facts
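A minimal hypothetical sketch (the variable names, shapes, and dummy data below are assumptions, not part of the original module) of the indexing-and-reshape trick used in get_output_for above: pick one GRU step per flattened sentence by its word count, then unflatten the (batch x sentences) axis.

import numpy as np
import theano
import theano.tensor as T

gru_outs = T.tensor3('gru_outs')   # (batch*sentences, words, hid)
lengths = T.ivector('lengths')     # number of real words per sentence
n_sentences = T.iscalar('n_sentences')

# take the hidden state at the last real word of every sentence
picked = gru_outs[T.arange(gru_outs.shape[0]), lengths - 1]       # (batch*sentences, hid)
# fold the flat axis back into (batch, sentences, hid)
facts = T.reshape(picked, (-1, n_sentences, gru_outs.shape[2]))
extract = theano.function([gru_outs, lengths, n_sentences], facts)

dummy = np.zeros((6, 4, 5), dtype=theano.config.floatX)           # 2 batches x 3 sentences
print(extract(dummy, np.array([4, 2, 3, 1, 4, 2], dtype='int32'), 3).shape)  # (2, 3, 5)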
Example #2
def test_gru_hid_init_layer_eval():
    # Test `hid_init` as a `Layer` with some dummy input. Compare the output of
    # a network with a `Layer` as input to `hid_init` to a network with a
    # `np.array` as input to `hid_init`
    n_units = 7
    n_test_cases = 2
    in_shp = (n_test_cases, 2, 3)
    in_h_shp = (1, n_units)

    # dummy inputs
    X_test = np.ones(in_shp, dtype=theano.config.floatX)
    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))

    # network with `Layer` initializer for hid_init
    l_inp = InputLayer(in_shp)
    l_inp_h = InputLayer(in_h_shp)
    l_rec_inp_layer = GRULayer(l_inp, n_units, hid_init=l_inp_h)

    # network with `np.array` initializer for hid_init
    l_rec_nparray = GRULayer(l_inp, n_units, hid_init=Xh_test)

    # copy network parameters from l_rec_inp_layer to l_rec_nparray
    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
    for k, v in l_rn_param.items():
        if k in l_il_param:
            v.set_value(l_il_param[k].get_value())

    # build the theano functions
    X = T.tensor3()
    Xh = T.matrix()
    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer,
                                                 {l_inp: X, l_inp_h: Xh})
    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})

    # test both nets with dummy input
    output_val_inp_layer = output_inp_layer.eval({X: X_test,
                                                  Xh: Xh_test_batch})
    output_val_nparray = output_nparray.eval({X: X_test})

    # check output given `Layer` is the same as with `np.array`
    assert np.allclose(output_val_inp_layer, output_val_nparray)
Example #3
class InputModule(MergeLayer):
    # Input Module, which uses SemMemModule and GRULayer (lasagne)
    def __init__(self, incomings, voc_size, hid_state_size,
                 SemMem=None, GRU=None, **kwargs):
        super(InputModule, self).__init__(incomings, **kwargs)
        
        if SemMem is not None:
            self.SemMem = SemMem
        else:
            self.SemMem = SemMemModule(incomings[0],voc_size,hid_state_size,**kwargs)
        if GRU is not None:
            self.GRU = GRU
        else:
            self.GRU = GRULayer(self.SemMem, hid_state_size)
        self.voc_size = voc_size
        self.hid_state_size = hid_state_size

    def get_params(self, **tags):
        # Because InputModule uses an external GRULayer's parameters,
        # we have to expose them here so that they get trained.
        return self.GRU.get_params(**tags)
    def get_output_shape_for(self, input_shape):
        return (None, None, self.hid_state_size)
    def get_output_for(self, inputs, **kwargs):
        input          = inputs[0]
        input_word     = T.flatten(inputs[1])
        word_dropout   = inputs[2]        
        
        # Apply word embedding
        sentence_rep = self.SemMem.get_output_for([input, word_dropout])
        
        # Apply GRU Layer
        gru_outs = self.GRU.get_output_for([sentence_rep])
        
        # Extract candidate fact from GRU's output by input_word variable
        # resolving input with additional word
        # e.g. John went to the hallway nil nil nil -> [GRU1, ... ,GRU8] -> GRU5
        candidate_facts = T.reshape(
            gru_outs[T.arange(gru_outs.shape[0],dtype='int32'), input_word-1], 
            (-1, input.shape[1], self.hid_state_size))
        return candidate_facts
Example #4
def test_gru_bck():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    x = T.tensor3()
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)

    x_in = np.ones(in_shp).astype('float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_gru_fwd = GRULayer(l_inp, num_units=num_units, backwards=False)
    lasagne.random.get_rng().seed(1234)
    l_gru_bck = GRULayer(l_inp, num_units=num_units, backwards=True)
    output_fwd = helper.get_output(l_gru_fwd, x)
    output_bck = helper.get_output(l_gru_bck, x)

    output_fwd_val = output_fwd.eval({x: x_in})
    output_bck_val = output_bck.eval({x: x_in})

    # test that the backwards model produces the forward outputs in reverse order
    np.testing.assert_almost_equal(output_fwd_val, output_bck_val[:, ::-1])
Example #5
def test_gru_hid_init_mask():
    # test that you can set hid_init to be a layer when a mask is provided
    l_inp = InputLayer((2, 2, 3))
    l_inp_h = InputLayer((2, 5))
    l_inp_msk = InputLayer((2, 2))
    l_gru = GRULayer(l_inp, 5, hid_init=l_inp_h, mask_input=l_inp_msk)

    x = T.tensor3()
    h = T.matrix()
    msk = T.matrix()

    inputs = {l_inp: x, l_inp_h: h, l_inp_msk: msk}
    output = lasagne.layers.get_output(l_gru, inputs)
Example #6
def test_gru_passthrough():
    # Tests that the GRU can simply pass through its input
    l_in = InputLayer((4, 5, 6))
    zero = lasagne.init.Constant(0.)
    one = lasagne.init.Constant(1.)
    pass_gate = Gate(zero, zero, None, one, None)
    no_gate = Gate(zero, zero, None, zero, None)
    in_pass_gate = Gate(
        np.eye(6).astype(theano.config.floatX), zero, None, zero, None)
    l_rec = GRULayer(l_in, 6, no_gate, pass_gate, in_pass_gate)
    out = lasagne.layers.get_output(l_rec)
    inp = np.arange(4 * 5 * 6).reshape(4, 5, 6).astype(theano.config.floatX)
    np.testing.assert_almost_equal(out.eval({l_in.input_var: inp}), inp)
Example #7
    def __init__(self, incomings, voc_size, hid_state_size,
                 SemMem=None, GRU=None, **kwargs):
        super(InputModule, self).__init__(incomings, **kwargs)

        if SemMem is not None:
            self.SemMem = SemMem
        else:
            self.SemMem = SemMemModule(incomings[0], voc_size, hid_state_size, **kwargs)
        if GRU is not None:
            self.GRU = GRU
        else:
            self.GRU = GRULayer(self.SemMem, hid_state_size)
        self.voc_size = voc_size
        self.hid_state_size = hid_state_size
Example #8
def test_gru_return_shape():
    num_batch, seq_len, n_features1, n_features2 = 5, 3, 10, 11
    num_units = 6
    x = T.tensor4()
    in_shp = (num_batch, seq_len, n_features1, n_features2)
    l_inp = InputLayer(in_shp)
    l_rec = GRULayer(l_inp, num_units=num_units)

    x_in = np.random.random(in_shp).astype('float32')
    output = helper.get_output(l_rec, x)
    output_val = output.eval({x: x_in})

    assert helper.get_output_shape(l_rec, x_in.shape) == output_val.shape
    assert output_val.shape == (num_batch, seq_len, num_units)
Example #9
    def _add_decoder(self):
        """
        The decoder returns a batch of sequences of thought vectors, one per decoded token,
        then reshapes this 3D tensor into a 2D matrix so that the next Dense layer can convert
        each thought vector into a probability distribution vector.
        """

        self._net['hid_states_decoder'] = InputLayer(
            shape=(None, self._decoder_depth, None),
            input_var=T.tensor3('hid_inits_decoder'),
            name='hid_states_decoder')

        # repeat along the sequence axis output_seq_len times, where output_seq_len is inferred from input tensor
        self._net['enc_repeated'] = RepeatLayer(
            incoming=self._net[
                'enc_result'],  # input shape = (batch_size, encoder_output_dimension)
            n=self._output_seq_len,
            name='repeat_layer')

        self._net['emb_condition_id_repeated'] = RepeatLayer(
            incoming=self._net['emb_condition_id'],
            n=self._output_seq_len,
            name='embedding_condition_id_repeated')

        self._net['dec_concated_input'] = ConcatLayer(
            incomings=[
                self._net['emb_y'], self._net['enc_repeated'],
                self._net['emb_condition_id_repeated']
            ],
            axis=2,
            name='decoder_concated_input')
        # shape = (batch_size, input_seq_len, encoder_output_dimension)

        self._net['dec_0'] = self._net['dec_concated_input']

        for dec_layer_id in xrange(1, self._decoder_depth + 1):
            # input shape = (batch_size, input_seq_len, embedding_dimension + hidden_dimension)
            self._net['dec_' + str(dec_layer_id)] = GRULayer(
                incoming=self._net['dec_' + str(dec_layer_id - 1)],
                num_units=self._hidden_layer_dim,
                grad_clipping=self._grad_clip,
                only_return_final=False,
                name='decoder_' + str(dec_layer_id),
                mask_input=self._net['input_y_mask'],
                hid_init=SliceLayer(self._net['hid_states_decoder'],
                                    dec_layer_id - 1,
                                    axis=1))

        self._net['dec'] = self._net['dec_' + str(self._decoder_depth)]
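A minimal hypothetical sketch (layer names and sizes are assumptions) of the hid_init wiring used above: the per-layer initial hidden states arrive as a single (batch, depth, hidden) tensor and are sliced along axis 1, one slice per stacked GRU.

import theano.tensor as T
from lasagne.layers import InputLayer, GRULayer, SliceLayer

hid = 16
l_x = InputLayer((None, None, 8))                             # (batch, time, features)
l_hid_states = InputLayer((None, 2, hid),
                          input_var=T.tensor3('hid_inits'))   # (batch, depth, hid)

# each GRU reads its own row of the hid_inits tensor as initial state
l_dec_1 = GRULayer(l_x, hid, hid_init=SliceLayer(l_hid_states, 0, axis=1))
l_dec_2 = GRULayer(l_dec_1, hid, hid_init=SliceLayer(l_hid_states, 1, axis=1))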
Example #10
def test_gru_precompute():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    l_mask_inp = InputLayer(in_shp[:2])

    x_in = np.random.random(in_shp).astype('float32')
    mask_in = np.ones((num_batch, seq_len), dtype='float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_gru_precompute = GRULayer(l_inp,
                                num_units=num_units,
                                precompute_input=True,
                                mask_input=l_mask_inp)
    lasagne.random.get_rng().seed(1234)
    l_gru_no_precompute = GRULayer(l_inp,
                                   num_units=num_units,
                                   precompute_input=False,
                                   mask_input=l_mask_inp)
    output_precompute = helper.get_output(l_gru_precompute).eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})
    output_no_precompute = helper.get_output(l_gru_no_precompute).eval(
        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})

    # test that precomputing the input does not change the output
    np.testing.assert_almost_equal(output_precompute, output_no_precompute)
Example #11
def test_gru_variable_input_size():
    # check that seq_len and batch_size of None work
    num_batch, n_features1 = 6, 5
    num_units = 13
    x = T.tensor3()

    in_shp = (None, None, n_features1)
    l_inp = InputLayer(in_shp)
    x_in1 = np.ones((num_batch + 1, 10, n_features1)).astype('float32')
    x_in2 = np.ones((num_batch, 15, n_features1)).astype('float32')
    l_rec = GRULayer(l_inp, num_units=num_units, backwards=False)
    output = helper.get_output(l_rec, x)

    output.eval({x: x_in1})
    output.eval({x: x_in2})
Example #12
def test_gru_unroll_scan_fwd():
    num_batch, seq_len, n_features1 = 2, 3, 4
    num_units = 2
    in_shp = (num_batch, seq_len, n_features1)
    l_inp = InputLayer(in_shp)
    l_mask_inp = InputLayer(in_shp[:2])

    x_in = np.random.random(in_shp).astype('float32')
    mask_in = np.ones(in_shp[:2]).astype('float32')

    # need to set random seed.
    lasagne.random.get_rng().seed(1234)
    l_gru_scan = GRULayer(l_inp,
                          num_units=num_units,
                          backwards=False,
                          unroll_scan=False,
                          mask_input=l_mask_inp)
    lasagne.random.get_rng().seed(1234)
    l_gru_unrolled = GRULayer(l_inp,
                              num_units=num_units,
                              backwards=False,
                              unroll_scan=True,
                              mask_input=l_mask_inp)
    output_scan = helper.get_output(l_gru_scan)
    output_unrolled = helper.get_output(l_gru_unrolled)

    output_scan_val = output_scan.eval({
        l_inp.input_var: x_in,
        l_mask_inp.input_var: mask_in
    })
    output_unrolled_val = output_unrolled.eval({
        l_inp.input_var: x_in,
        l_mask_inp.input_var: mask_in
    })

    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
Example #13
    def __init__(self, num_batch, max_len, n_features, hidden=[200, 200], **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2 ** 30))

        # params
        initial_W = np.asarray(
            rng.uniform(
                    low=-4 * np.sqrt(6. / (self.hidden[1] + self.n_features)),
                    high=4 * np.sqrt(6. / (self.hidden[1] + self.n_features)),
                    size=(self.hidden[1], self.n_features)
            ),
            dtype=theano.config.floatX
        )

        self.W = theano.shared(value=initial_W, name='W', borrow=True)
        # # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True)
        self.b = theano.shared(
                value=np.zeros(
                    self.n_features,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        # self.b_y_kappa = theano.shared(
        #         value=np.zeros(
        #             self.n_features,
        #             dtype=theano.config.floatX
        #         ),
        #         name='b',
        #         borrow=True
        #     )


        # I could directly create the model here since it is fixed
        self.l_in = InputLayer(shape=(None, self.max_len, self.n_features))
        self.mask_input = InputLayer(shape=(None, self.max_len))
        first_hidden = GRULayer(self.l_in, mask_input=self.mask_input, num_units=hidden[0])
        # l_shp = ReshapeLayer(first_hidden, (-1, hidden[0]))
        # l_dense = DenseLayer(l_shp, num_units=self.hidden[0], nonlinearity=rectify)
        # l_drop = DropoutLayer(l_dense, p=0.5)
        # l_shp = ReshapeLayer(l_drop, (-1, self.max_len, self.hidden[0]))
        self.model = GRULayer(first_hidden, num_units=hidden[1])
Example #14
def gru_column(input, num_units, hidden, **kwargs):
    kwargs.pop("only_return_final", None)
    assert isinstance(hidden, (list, tuple))

    name = kwargs.pop("name", "default")
    column = [input]
    for i, l_hidden in enumerate(hidden):
        kwargs_ = kwargs.copy()
        if isinstance(l_hidden, Layer):
            kwargs_.pop("learn_init", None)
            kwargs_["hid_init"] = l_hidden

        layer = GRULayer(column[-1], num_units,
                         name=os.path.join(name, "gru_%02d" % i),
                         **kwargs_)
        column.append(layer)
    return column[1:]
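A hypothetical usage sketch (input size, unit count, and names are assumptions): the hidden list sets the depth of the stack, and any Layer entry in it is forwarded as hid_init of the corresponding GRU, while non-Layer entries leave the default initializer in place.

from lasagne.layers import InputLayer

l_in = InputLayer((None, None, 32), name="features")
# two plain GRU layers; put a Layer in `hidden` to seed that layer's state
column = gru_column(l_in, num_units=64, hidden=[None, None],
                    learn_init=True, name="column")
l_top = column[-1]  # deepest GRU layer of the stack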
Example #15
    def _add_utterance_encoder(self):
        # input shape = (batch_size * input_context_size, input_seq_len, embedding_dimension)
        self._add_forward_backward_encoder_layer()

        for enc_layer_id in xrange(1, self._encoder_depth):
            is_last_encoder_layer = enc_layer_id == self._encoder_depth - 1
            return_only_final_state = is_last_encoder_layer

            # input shape = (batch_size * input_context_size, input_seq_len, embedding_dimension)
            self._net['enc_' + str(enc_layer_id)] = GRULayer(
                incoming=self._net['enc_' + str(enc_layer_id - 1)],
                num_units=self._hidden_layer_dim,
                grad_clipping=self._grad_clip,
                only_return_final=return_only_final_state,
                name='encoder_' + str(enc_layer_id),
                mask_input=self._net['input_x_mask'])

        self._net['enc'] = self._net['enc_' + str(self._encoder_depth - 1)]
Example #16
def test_gru_nparams_hid_init_layer():
    # test that you can see layers through hid_init
    l_inp = InputLayer((2, 2, 3))
    l_inp_h = InputLayer((2, 5))
    l_inp_h_de = DenseLayer(l_inp_h, 7)
    l_gru = GRULayer(l_inp, 7, hid_init=l_inp_h_de)

    # directly check the layers can be seen through hid_init
    assert lasagne.layers.get_all_layers(l_gru) == [l_inp, l_inp_h, l_inp_h_de,
                                                    l_gru]

    # 3*n_gates + 2
    # the 3 is because we have  hid_to_gate, in_to_gate and bias for each gate
    # 2 is for the W and b parameters in the DenseLayer
    assert len(lasagne.layers.get_all_params(l_gru, trainable=True)) == 11

    # GRU bias params(3) + Dense bias params(1)
    assert len(lasagne.layers.get_all_params(l_gru, regularizable=False)) == 4
Example #17
    def __init__(self, num_batch, max_len, n_features, hidden=[200, 200], **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2 ** 30))

        # params
        initial_W = np.asarray(
            rng.uniform(
                    low=1e-5,
                    high=1,
                    size=(self.hidden[1], self.n_features)
            ),
            dtype=theano.config.floatX
        )

        self.W_y_theta = theano.shared(value=initial_W, name='W_y_theta', borrow=True)
        # # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True)
        self.b_y_theta = theano.shared(
                value=np.zeros(
                    self.n_features,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        # self.b_y_kappa = theano.shared(
        #         value=np.zeros(
        #             self.n_features,
        #             dtype=theano.config.floatX
        #         ),
        #         name='b',
        #         borrow=True
        #     )


        # I could directly create the model here since it is fixed
        self.l_in = InputLayer(shape=(self.num_batch, self.max_len, self.n_features))
        self.mask_input = InputLayer(shape=(self.num_batch, self.max_len))
        first_hidden = GRULayer(self.l_in, mask_input=self.mask_input, num_units=hidden[0])
        self.model = GRULayer(first_hidden, num_units=hidden[1])
Example #18
def gated_layer(incoming,
                num_units,
                grad_clipping,
                only_return_final,
                backwards,
                gated_layer_type,
                mask_input=None,
                cell_init=lasagne.init.Constant(0.),
                hid_init=lasagne.init.Constant(0.),
                resetgate=lasagne.layers.Gate(W_cell=None),
                updategate=lasagne.layers.Gate(W_cell=None),
                hidden_update=lasagne.layers.Gate(
                    W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
                name=None):
    if gated_layer_type == "gru":
        return GRULayer(incoming,
                        num_units,
                        mask_input=mask_input,
                        grad_clipping=grad_clipping,
                        only_return_final=only_return_final,
                        backwards=backwards,
                        hid_init=hid_init,
                        resetgate=resetgate,
                        updategate=updategate,
                        hidden_update=hidden_update,
                        name=name)
    else:
        return LSTMLayer(incoming,
                         num_units,
                         mask_input=mask_input,
                         grad_clipping=grad_clipping,
                         nonlinearity=lasagne.nonlinearities.tanh,
                         only_return_final=only_return_final,
                         backwards=backwards,
                         cell_init=cell_init,
                         hid_init=hid_init,
                         resetgate=resetgate,
                         updategate=updategate,
                         hidden_update=hidden_update,
                         name=name)
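A hypothetical usage sketch (shapes, unit count, and names are assumptions): the same call builds either a GRU or an LSTM encoder, switched only by gated_layer_type.

l_in = lasagne.layers.InputLayer((None, 20, 50))
l_mask = lasagne.layers.InputLayer((None, 20))
l_enc = gated_layer(l_in, num_units=128, grad_clipping=10.,
                    only_return_final=True, backwards=False,
                    gated_layer_type="gru", mask_input=l_mask,
                    name="encoder")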
Example #19
def test_gru_hid_init_layer_eval():
    # Test `hid_init` as a `Layer` with some dummy input. Compare the output of
    # a network with a `Layer` as input to `hid_init` to a network with a
    # `np.array` as input to `hid_init`
    n_units = 7
    n_test_cases = 2
    in_shp = (n_test_cases, 2, 3)
    in_h_shp = (1, n_units)

    # dummy inputs
    X_test = np.ones(in_shp, dtype=theano.config.floatX)
    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))

    # network with `Layer` initializer for hid_init
    l_inp = InputLayer(in_shp)
    l_inp_h = InputLayer(in_h_shp)
    l_rec_inp_layer = GRULayer(l_inp, n_units, hid_init=l_inp_h)

    # network with `np.array` initializer for hid_init
    l_rec_nparray = GRULayer(l_inp, n_units, hid_init=Xh_test)

    # copy network parameters from l_rec_inp_layer to l_rec_nparray
    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
    for k, v in l_rn_param.items():
        if k in l_il_param:
            v.set_value(l_il_param[k].get_value())

    # build the theano functions
    X = T.tensor3()
    Xh = T.matrix()
    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer, {
        l_inp: X,
        l_inp_h: Xh
    })
    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})

    # test both nets with dummy input
    output_val_inp_layer = output_inp_layer.eval({
        X: X_test,
        Xh: Xh_test_batch
    })
    output_val_nparray = output_nparray.eval({X: X_test})

    # check output given `Layer` is the same as with `np.array`
    assert np.allclose(output_val_inp_layer, output_val_nparray)
Example #20
    def _add_context_encoder(self):
        self._net['batched_enc'] = reshape(
            self._net['enc'], (self._batch_size, self._input_context_size, get_output_shape(self._net['enc'])[-1]))

        self._net['context_enc'] = GRULayer(
            incoming=self._net['batched_enc'],
            num_units=self._hidden_layer_dim,
            grad_clipping=self._grad_clip,
            only_return_final=True,
            name='context_encoder')

        self._net['switch_enc_to_tv'] = T.iscalar(name='switch_enc_to_tv')

        self._net['thought_vector'] = InputLayer(
            shape=(None, self._hidden_layer_dim), input_var=T.fmatrix(name='thought_vector'), name='thought_vector')

        self._net['enc_result'] = SwitchLayer(
            incomings=[self._net['thought_vector'], self._net['context_enc']], condition=self._net['switch_enc_to_tv'])

        # We need the following to pass as 'givens' argument when compiling theano functions:
        self._default_thoughts_vector = T.zeros((self._batch_size, self._hidden_layer_dim))
        self._default_input_x = T.zeros(shape=(self._net['thought_vector'].input_var.shape[0], 1, 1), dtype=np.int32)
Example #21
	def __init__(self, in_path, concat=False, wsi_path=None, dat_path='data/dat.pkl', supp_path='data/supp.pkl'):
		self.in_path = in_path
		self.concat = concat
		self.wsi_path = wsi_path
		self.lm_mode = 'default'
		with open(self.in_path, 'rb') as f:
			p = pk.load(f)

		self.do_brnn = False
		if 'do_brnn' in p:
			self.do_brnn = p['do_brnn']
		self.is_lstm = 'Wxo' in p['params']
		self.is_gru = 'Whr' in p['params']
		if 'Wt' not in p: self.lm_mode = 'iden'
		elif p['Wt'].get_value().ndim == 1: self.lm_mode = 'diag'
		self.params = p['params']
		self.dwe = self.params['dwe'] # disambiguated word embeddings
		self.td = self.dwe.get_value().shape[1]
		self.hd = self.params['L'].get_value().shape[1]
		self.gc = 2
		self.l_mask = InputLayer((None, None), trainable=False)

		if self.is_lstm:
			self.l_gru_emb = LSTMLayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
				ingate=Gate(W_in=self.params['Wxr'], W_hid=self.params['Whr'], b=self.params['br'], W_cell=None), \
				forgetgate=Gate(W_in=self.params['Wxu'], W_hid=self.params['Whu'], b=self.params['bu'], W_cell=None), \
				outgate=Gate(W_in=self.params['Wxo'], W_hid=self.params['Who'], b=self.params['bo'], W_cell=None), \
				cell=Gate(W_in=self.params['Wxc'], W_hid=self.params['Whc'], b=self.params['bc'], W_cell=None,\
					nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, peepholes=False)
			if self.do_brnn:
				self.l_bgru_emb = LSTMLayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
					ingate=Gate(W_in=self.params['bWxr'], W_hid=self.params['bWhr'], b=self.params['bbr'], W_cell=None), \
					forgetgate=Gate(W_in=self.params['bWxu'], W_hid=self.params['bWhu'], b=self.params['bbu'], W_cell=None), \
					outgate=Gate(W_in=self.params['bWxo'], W_hid=self.params['bWho'], b=self.params['bbo'], W_cell=None), \
					cell=Gate(W_in=self.params['bWxc'], W_hid=self.params['bWhc'], b=self.params['bbc'], W_cell=None,\
						nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, peepholes=False, backwards=True)
		elif self.is_gru:
			self.l_gru_emb = GRULayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
				resetgate=Gate(W_in=self.params['Wxr'], W_hid=self.params['Whr'], b=self.params['br'], W_cell=None), \
				updategate=Gate(W_in=self.params['Wxu'], W_hid=self.params['Whu'], b=self.params['bu'], W_cell=None), \
				hidden_update=Gate(W_in=self.params['Wxc'], W_hid=self.params['Whc'], b=self.params['bc'], W_cell=None,\
					nonlinearity=nonlinearities.tanh),mask_input=self.l_mask)
			if self.do_brnn:
				self.l_bgru_emb = GRULayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
					resetgate=Gate(W_in=self.params['bWxr'], W_hid=self.params['bWhr'], b=self.params['bbr'], W_cell=None), \
					updategate=Gate(W_in=self.params['bWxu'], W_hid=self.params['bWhu'], b=self.params['bbu'], W_cell=None), \
					hidden_update=Gate(W_in=self.params['bWxc'], W_hid=self.params['bWhc'], b=self.params['bbc'], W_cell=None,\
						nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, backwards=True)
		else:
			self.is_nlm = True
	
		with open(dat_path, 'rb') as f:
			d = pk.load(f)
		self.nw, self.mw, self.ms = d['def'].shape # num words, max num of words, max num of senses
		self.dw = d['dw'] # dw to index
		self.aw = d['aw']
		self.no = len(d['aw'])
		if 'spriors' in d:
			self.sense_priors = d['spriors']
		else:
			self.sense_priors = np.ones((self.no, self.ms))

		with open(supp_path, 'rb') as f:
			s = pk.load(f)
		self.id2aw = s['id2aw']
		self.id2dw = s['id2dw']
		self.aw2dw = s['aw2dw']

		self.build_encoder()
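A minimal hypothetical sketch (shapes and names are assumptions) of the pattern the class above relies on: a GRULayer constructed from a shape tuple can be applied as a standalone op via get_output_for, with the mask passed as the second element of the inputs list.

import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import GRULayer, InputLayer

l_mask = InputLayer((None, None))
gru = GRULayer((None, None, 10), 16, mask_input=l_mask)

x = T.tensor3('x')
mask = T.matrix('mask')
h = gru.get_output_for([x, mask])   # (batch, time, 16)
run = theano.function([x, mask], h)
out = run(np.zeros((2, 5, 10), dtype=theano.config.floatX),
          np.ones((2, 5), dtype=theano.config.floatX))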
Example #22
def test_gru_grad_clipping():
    # test that you can set grad_clip variable
    x = T.tensor3()
    l_rec = GRULayer(InputLayer((2, 2, 3)), 5, grad_clipping=1)
    output = lasagne.layers.get_output(l_rec, x)
Example #23
        hidden.append(slice_)
    return hidden


###############################################################################
#                                   ENCODER                                   #
###############################################################################
# Encoder's Recurrent subnetwork
l_encoder_mask = InputLayer((None, None), name="encoder/mask")
l_encoder_embed = InputLayer((None, None, n_embed_char), name="encoder/input")

bidi_gru = []
bidi_gru.append(
    GRULayer(l_encoder_embed,
             n_hidden_encoder,
             learn_init=True,
             name="encoder/gru_f",
             backwards=False))
bidi_gru.append(
    GRULayer(l_encoder_embed,
             n_hidden_encoder,
             learn_init=True,
             name="encoder/gru_b",
             backwards=True))

l_encoder_context = ConcatLayer(bidi_gru, axis=-1, name="encoder/cat")

###############################################################################
#                                   DECODER                                   #
###############################################################################
# Decoder's Recurrent subnetwork
Example #24
class PRAE:
    def __init__(self,
                 num_batch,
                 max_len,
                 n_features,
                 hidden=[200, 200],
                 **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2**30))

        # params
        initial_W = np.asarray(rng.uniform(low=1e-5,
                                           high=1,
                                           size=(self.hidden[1],
                                                 self.n_features)),
                               dtype=theano.config.floatX)

        self.W_y_theta = theano.shared(value=initial_W,
                                       name='W_y_theta',
                                       borrow=True)
        # # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True)
        self.b_y_theta = theano.shared(value=np.zeros(
            self.n_features, dtype=theano.config.floatX),
                                       borrow=True)
        # self.b_y_kappa = theano.shared(
        #         value=np.zeros(
        #             self.n_features,
        #             dtype=theano.config.floatX
        #         ),
        #         name='b',
        #         borrow=True
        #     )

        # I could directly create the model here since it is fixed
        self.l_in = InputLayer(shape=(self.num_batch, self.max_len,
                                      self.n_features))
        self.mask_input = InputLayer(shape=(self.num_batch, self.max_len))
        first_hidden = GRULayer(self.l_in,
                                mask_input=self.mask_input,
                                num_units=hidden[0])
        self.model = GRULayer(first_hidden, num_units=hidden[1])
        # need some reshape voodoo
        # l_shp = ReshapeLayer(second_hidden, (-1, hidden[1]))
        # after the reshape I have batch*max_len X features
        # self.model = DenseLayer(l_shp, num_units=self.n_features, nonlinearity=rectify)
        # if now I put a dense layer this will collect all the output temporally which is what I want, I'll have to fix
        # the dimensions probably later
        # For every gaussian in the sum I need 3 values plus a value for the total scale
        # the output of this layer will be (num_batch, num_units, max_len) TODO check size

    def get_output_shape_for(self):
        return self.model.get_output_shape_for(
            (self.num_batch, self.max_len, self.hidden[1]))

    def get_output_y(self, output):
        # (batch, time, hidden) X (hidden, features) + (, features) => (batch, time, features)
        theta_out = T.nnet.relu(T.dot(output, self.W_y_theta) + self.b_y_theta)
        #kappa_out = T.nnet.relu(T.dot(output, self.W_y_kappa) + self.b_y_kappa)
        return theta_out

    def get_log_x(self, x, theta_out):
        # DIM = (batch, time, hidden)
        # everything is elementwise
        log_x = T.log(theta_out + 1e-8) - theta_out * x
        log_x = log_x.sum(axis=2, dtype=theano.config.floatX
                          )  # sum over x cause I assume they are independent
        return log_x

    def build_model(self, train_x, train_mask_x, train_mask_out, train_target,
                    test_x, test_mask_x, test_mask_out, test_target):
        self.train_x = train_x
        self.train_mask_x = train_mask_x
        self.train_mask_out = train_mask_out
        self.train_target = train_target
        self.test_x = test_x
        self.test_mask_x = test_mask_x
        self.test_mask_out = test_mask_out
        self.test_target = test_target
        self.index = T.iscalar('index')
        self.num_batch_test = T.iscalar('index')
        self.b_slice = slice(self.index * self.num_batch,
                             (self.index + 1) * self.num_batch)

        sym_x = T.dtensor3()
        sym_mask_x = T.dmatrix()
        sym_target = T.dtensor3()
        # sym_mask_out = T.dtensor3() should not be useful since output is still zero
        # TODO think about this if it is true

        output = lasagne.layers.get_output(self.model,
                                           inputs={
                                               self.l_in: sym_x,
                                               self.mask_input: sym_mask_x
                                           })
        theta = self.get_output_y(output)
        log_px = self.get_log_x(sym_target, theta)
        log_px_sum_time = log_px.sum(axis=1,
                                     dtype=theano.config.floatX)  # sum over tx
        loss = -T.sum(log_px_sum_time) / self.num_batch  # average over batch
        ##
        log_px_test = self.get_log_x(sym_target, theta)
        log_px_sum_time_test = log_px_test.sum(
            axis=1, dtype=theano.config.floatX)  # sum over time
        loss_test = -T.sum(
            log_px_sum_time_test) / self.num_batch_test  # average over batch
        # loss = T.mean(lasagne.objectives.squared_error(mu, sym_target))
        all_params = [self.W_y_theta] + [
            self.b_y_theta
        ] + lasagne.layers.get_all_params(self.model)
        all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss, all_params)]
        all_grads_target = lasagne.updates.total_norm_constraint(
            all_grads_target, 3)
        updates_target = adam(all_grads_target, all_params)

        train_model = theano.function(
            [self.index], [loss, theta, log_px],
            givens={
                sym_x: self.train_x[self.b_slice],
                sym_mask_x: self.train_mask_x[self.b_slice],
                sym_target: self.train_target[self.b_slice]
            },
            updates=updates_target)
        test_model = theano.function(
            [self.num_batch_test], [loss_test, theta],
            givens={
                sym_x: self.test_x,
                sym_mask_x: self.test_mask_x,
                sym_target: self.test_target
            })

        return train_model, test_model
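A hypothetical usage sketch (all sizes and dummy arrays are assumptions, and it presumes the class's module-level imports such as adam and RandomStreams): since build_model feeds data in through givens, the datasets must live in theano shared variables, float64 here to match the dtensor3/dmatrix symbols above.

import numpy as np
import theano

n_train, n_test, max_len, n_feat, batch = 64, 16, 20, 8, 32
share = lambda a: theano.shared(a, borrow=True)

prae = PRAE(num_batch=batch, max_len=max_len, n_features=n_feat)
train_fn, test_fn = prae.build_model(
    share(np.zeros((n_train, max_len, n_feat))),   # train_x
    share(np.ones((n_train, max_len))),            # train_mask_x
    None,                                          # train_mask_out (stored, unused here)
    share(np.zeros((n_train, max_len, n_feat))),   # train_target
    share(np.zeros((n_test, max_len, n_feat))),    # test_x
    share(np.ones((n_test, max_len))),             # test_mask_x
    None,                                          # test_mask_out
    share(np.zeros((n_test, max_len, n_feat))))    # test_target

for idx in range(n_train // batch):
    loss, theta, log_px = train_fn(idx)
loss_test, theta_test = test_fn(n_test)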
Example #25
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16):
    # lasagne way
    l_in = InputLayer(
        (None, seq_len, input_dim),
        input_var=theano.shared(
            np.random.normal(size=[batch_size, seq_len, input_dim])),
        name='input seq')

    l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm')
    l_gru0 = GRULayer(l_in, n_hidden, name='gru')

    f_predict0 = theano.function([], get_output([l_lstm0, l_gru0]))

    # agentnet way
    s_in = InputLayer((None, input_dim), name='in')

    s_prev_cell = InputLayer((None, n_hidden), name='cell')
    s_prev_hid = InputLayer((None, n_hidden), name='hid')
    s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell,
                                       s_prev_hid,
                                       s_in,
                                       name='lstm')

    s_prev_gru = InputLayer((None, n_hidden), name='hid')
    s_gru = GRUCell(s_prev_gru, s_in, name='gru')

    rec = Recurrence(state_variables=OrderedDict({
        s_lstm_cell: s_prev_cell,
        s_lstm_hid: s_prev_hid,
        s_gru: s_prev_gru
    }),
                     input_sequences={s_in: l_in},
                     unroll_scan=False)

    state_seqs, _ = rec.get_sequence_layers()

    l_lstm1 = state_seqs[s_lstm_hid]
    l_gru1 = state_seqs[s_gru]

    f_predict1 = theano.function([], get_output([l_lstm1, l_gru1]))

    # lstm param transfer
    old_params = sorted(get_all_params(l_lstm0, trainable=True),
                        key=lambda p: p.name)
    new_params = sorted(get_all_params(s_lstm_hid, trainable=True),
                        key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print old.name, '<-', new.name
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    # gru param transfer
    old_params = sorted(get_all_params(l_gru0, trainable=True),
                        key=lambda p: p.name)
    new_params = sorted(get_all_params(s_gru, trainable=True),
                        key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print old.name, '<-', new.name
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    lstm0_out, gru0_out = f_predict0()
    lstm1_out, gru1_out = f_predict1()

    assert np.allclose(lstm0_out, lstm1_out)
    assert np.allclose(gru0_out, gru1_out)
Example #26
class PRAE:
    def __init__(self, num_batch, max_len, n_features, hidden=[200, 200], **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2 ** 30))

        # params
        initial_W = np.asarray(
            rng.uniform(
                    low=-4 * np.sqrt(6. / (self.hidden[1] + self.n_features)),
                    high=4 * np.sqrt(6. / (self.hidden[1] + self.n_features)),
                    size=(self.hidden[1], self.n_features)
            ),
            dtype=theano.config.floatX
        )

        self.W = theano.shared(value=initial_W, name='W', borrow=True)
        # # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True)
        self.b = theano.shared(
                value=np.zeros(
                    self.n_features,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        # self.b_y_kappa = theano.shared(
        #         value=np.zeros(
        #             self.n_features,
        #             dtype=theano.config.floatX
        #         ),
        #         name='b',
        #         borrow=True
        #     )


        # I could directly create the model here since it is fixed
        self.l_in = InputLayer(shape=(None, self.max_len, self.n_features))
        self.mask_input = InputLayer(shape=(None, self.max_len))
        first_hidden = GRULayer(self.l_in, mask_input=self.mask_input, num_units=hidden[0])
        # l_shp = ReshapeLayer(first_hidden, (-1, hidden[0]))
        # l_dense = DenseLayer(l_shp, num_units=self.hidden[0], nonlinearity=rectify)
        # l_drop = DropoutLayer(l_dense, p=0.5)
        # l_shp = ReshapeLayer(l_drop, (-1, self.max_len, self.hidden[0]))
        self.model = GRULayer(first_hidden, num_units=hidden[1])
        # self.model = ConcatLayer([first_hidden, second_hidden], axis=2)
        # l_shp = ReshapeLayer(second_hidden, (-1, hidden[1]))
        # l_dense = DenseLayer(l_shp, num_units=self.n_features, nonlinearity=rectify)
        # To reshape back to our original shape, we can use the symbolic shape
        # variables we retrieved above.
        #self.model = ReshapeLayer(l_dense, (-1, self.max_len, self.n_features))
        # if now I put a dense layer this will collect all the output temporally which is what I want, I'll have to fix
        # the dimensions probably later
        # For every gaussian in the sum I need 3 values plus a value for the total scale
        # the output of this layer will be (num_batch, num_units, max_len) TODO check size

    def get_output_shape_for(self):
        return self.model.get_output_shape_for((self.num_batch, self.max_len, self.hidden[1]))

    def get_output_y(self, x):
        return T.nnet.relu(T.dot(x, self.W) + self.b)


    def build_model(self, train_x, train_mask_x, train_mask_out, train_target,
                    test_x, test_mask_x, test_mask_out, test_target):
        self.train_x = train_x
        self.train_mask_x = train_mask_x
        self.train_mask_out = train_mask_out
        self.train_target = train_target
        self.test_x = test_x
        self.test_mask_x = test_mask_x
        self.test_mask_out = test_mask_out
        self.test_target = test_target
        self.index = T.iscalar('index')
        self.num_batch_test = T.iscalar('index')
        self.b_slice = slice(self.index * self.num_batch, (self.index + 1) * self.num_batch)

        sym_x = T.dtensor3()
        sym_mask_x = T.dmatrix()
        sym_target = T.dtensor3()
        sym_mask_out = T.dtensor3()
        # sym_mask_out = T.dtensor3() should not be useful since output is still zero
        # TODO think about this if it is true

        out = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x})
        out_out = self.get_output_y(out)
        loss = T.mean(lasagne.objectives.squared_error(out_out, sym_target)) / self.num_batch

        out_test = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x})
        out_out_test = self.get_output_y(out_test)
        loss_test = T.mean(lasagne.objectives.squared_error(out_out_test, sym_target)) / self.num_batch_test

        all_params = [self.W] + [self.b] + lasagne.layers.get_all_params(self.model)
        all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss, all_params)]
        all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 3)
        updates_target = adam(all_grads_target, all_params)

        train_model = theano.function([self.index],
                                      [loss, out_out],
                                      givens={sym_x: self.train_x[self.b_slice],
                                              sym_mask_x: self.train_mask_x[self.b_slice],
                                              sym_target: self.train_target[self.b_slice],
                                              },
                                      updates=updates_target)
        test_model = theano.function([self.num_batch_test],
                                     [loss_test, out_out_test],
                                     givens={sym_x: self.test_x,
                                             sym_mask_x: self.test_mask_x,
                                             sym_target: self.test_target,
                                             })

        return train_model, test_model
Example #27
def test_gru_init_val_error():
    # check that an error is raised when hid_init is a non-matrix TensorVariable
    vector = T.vector()
    with pytest.raises(ValueError):
        l_rec = GRULayer(InputLayer((2, 2, 3)), 5, hid_init=vector)
Example #28
	def __init__(self, model_name, max_seq_len, num_features_pitch, num_features_duration, num_gru_layer_units=25, set_x_input_to_zero=False, in_dropout_p=0, out_dropout_p=0, use_l2_penalty=False):
		super(GRU_Network, self).__init__(model_name, max_seq_len, num_features_pitch, num_features_duration, num_gru_layer_units, set_x_input_to_zero, in_dropout_p, use_l2_penalty)
		
		self.out_dropout_p = out_dropout_p

		##### THE LAYERS OF THE NEXT-STEP PREDICTION GRU NETWORK #####

		### INPUT NETWORK ###
		# Two input layers receiving Onehot-encoded data
		l_in_pitch = InputLayer((None, None, self.num_features_pitch), name="l_in_pitch")
		l_in_duration = InputLayer((None, None, self.num_features_duration), name="l_in_duration")

		# Layer merging the two input layers
		l_in_merge = ConcatLayer([l_in_pitch, l_in_duration], axis=2, name="l_in_merge")

		# Dropout in input network
		l_in_intermediate = l_in_merge
		if self.in_dropout_p > 0:
			l_in_intermediate = DropoutLayer(l_in_intermediate, rescale=False, p=self.in_dropout_p, shared_axes=(1,2))


		# The mask layer for ignoring time-steps after <eos> in the GRU layer
		l_in_mask = InputLayer((None, self.max_seq_len), name="l_in_mask")


		### OUTPUT NETWORK ###
		# A normal GRU layer
		self.l_out_gru = GRULayer(l_in_intermediate, num_units=self.num_gru_layer_units, name='GRULayer', mask_input=l_in_mask)

		# Dropout in output network
		l_out_intermediate = self.l_out_gru
		if self.out_dropout_p > 0:
			l_out_intermediate = DropoutLayer(l_out_intermediate, rescale=False, p=self.out_dropout_p)

		# We need to do some reshape voodoo to connect a softmax layer.
		# See http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html#examples 
		# In short this line changes the shape from 
		# (batch_size, decode_len, num_dec_units) -> (batch_size*decode_len, num_dec_units).
		# We need to do this since the softmax is applied to the last dimension and we want to 
		# softmax the output at each position individually
		l_out_reshape = ReshapeLayer(l_out_intermediate, (-1, [2]), name="l_out_reshape")


		# Setting up the output-layers as softmax-encoded pitch and duration vectors from the dense layers. (Two dense layers with softmax output, e.g. prediction probabilities for next note in melody)
		l_out_softmax_pitch = DenseLayer(l_out_reshape, num_units=self.num_features_pitch, nonlinearity=lasagne.nonlinearities.softmax, name='SoftmaxOutput_pitch')
		l_out_softmax_duration = DenseLayer(l_out_reshape, num_units=self.num_features_duration, nonlinearity=lasagne.nonlinearities.softmax, name='SoftmaxOutput_duration')

		# reshape back to 3d format (batch_size, decode_len, num_dec_units). Here we tied the batch size to the shape of the symbolic variable for X allowing 
		#us to use different batch sizes in the model.
		self.l_out_pitch = ReshapeLayer(l_out_softmax_pitch, (-1, self.max_seq_len, self.num_features_pitch), name="l_out_pitch")	
		self.l_out_duration = ReshapeLayer(l_out_softmax_duration, (-1, self.max_seq_len, self.num_features_duration), name="l_out_duration")

		### NETWORK OUTPUTS ###
		# Setting up the output as softmax-encoded pitch and duration vectors from the dense softmax layers.
		# (OBS: This is bypassing the onehot layers, so we evaluate the model on the softmax-outputs directly)
		output_pitch_train = get_output(self.l_out_pitch, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = False)
		output_duration_train = get_output(self.l_out_duration, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = False)

		output_pitch_eval = get_output(self.l_out_pitch, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = True)
		output_duration_eval = get_output(self.l_out_duration, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = True)

		output_gru = get_output(self.l_out_gru, {l_in_pitch: self.x_pitch_sym, l_in_duration: self.x_duration_sym, l_in_mask: self.x_mask_sym}, deterministic = True)

		#Get parameters from all layers except nondeterministic (dropout)
		all_parameters = get_all_params([self.l_out_pitch, self.l_out_duration], trainable=True)

		print "Trainable Model Parameters"
		print "-"*40
		for param in all_parameters:
		    print param, param.get_value().shape
		print "-"*40

		# Compute costs
		# For indeterministic training 
		cost_pitch_train, acc_pitch_train = eval(output_pitch_train, self.y_pitch_sym, self.num_features_pitch, self.y_mask_sym)
		cost_duration_train, acc_duration_train = eval(output_duration_train, self.y_duration_sym, self.num_features_duration, self.y_mask_sym)
		if self.use_l2_penalty:
			l2_penalty = regularize_layer_params([self.l_out_pitch, self.l_out_duration], l2)
		else: 
			l2_penalty = 0  

		total_cost = cost_pitch_train + cost_duration_train + l2_penalty

		# and deterministic evaluation
		cost_pitch_eval, acc_pitch_eval = eval(output_pitch_eval, self.y_pitch_sym, self.num_features_pitch, self.y_mask_sym)
		cost_duration_eval, acc_duration_eval = eval(output_duration_eval, self.y_duration_sym, self.num_features_duration, self.y_mask_sym)

		#add grad clipping to avoid exploding gradients
		all_grads = [T.clip(g,-3,3) for g in T.grad(total_cost, all_parameters)]
		all_grads = lasagne.updates.total_norm_constraint(all_grads,3)

		#Compile Theano functions.
		updates = lasagne.updates.adam(all_grads, all_parameters, learning_rate=0.005)

		self.f_train = theano.function([self.x_pitch_sym, self.y_pitch_sym, self.x_duration_sym, self.y_duration_sym, self.x_mask_sym, self.y_mask_sym], [cost_pitch_train, acc_pitch_train, output_pitch_train, cost_duration_train, acc_duration_train, output_duration_train], updates=updates)
		
		#since we have stochasticity in the network when dropout is used we will use the evaluation graph without any updates given and deterministic=True.
		self.f_eval = theano.function([self.x_pitch_sym, self.y_pitch_sym, self.x_duration_sym, self.y_duration_sym, self.x_mask_sym, self.y_mask_sym], [cost_pitch_eval, acc_pitch_eval, output_pitch_eval, cost_duration_eval, acc_duration_eval, output_duration_eval])

		self.f_eval_gru = theano.function([self.x_pitch_sym, self.x_duration_sym, self.x_mask_sym], output_gru)
Example #29
class Sent2Vec:
	def __init__(self, in_path, concat=False, wsi_path=None, dat_path='data/dat.pkl', supp_path='data/supp.pkl'):
		self.in_path = in_path
		self.concat = concat
		self.wsi_path = wsi_path
		self.lm_mode = 'default'
		with open(self.in_path, 'rb') as f:
			p = pk.load(f)

		self.do_brnn = False
		if 'do_brnn' in p:
			self.do_brnn = p['do_brnn']
		self.is_lstm = 'Wxo' in p['params']
		self.is_gru = 'Whr' in p['params']
		if 'Wt' not in p: self.lm_mode = 'iden'
		elif p['Wt'].get_value().ndim == 1: self.lm_mode = 'diag'
		self.params = p['params']
		self.dwe = self.params['dwe'] # disambiguated word embeddings
		self.td = self.dwe.get_value().shape[1]
		self.hd = self.params['L'].get_value().shape[1]
		self.gc = 2
		self.l_mask = InputLayer((None, None), trainable=False)

		if self.is_lstm:
			self.l_gru_emb = LSTMLayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
				ingate=Gate(W_in=self.params['Wxr'], W_hid=self.params['Whr'], b=self.params['br'], W_cell=None), \
				forgetgate=Gate(W_in=self.params['Wxu'], W_hid=self.params['Whu'], b=self.params['bu'], W_cell=None), \
				outgate=Gate(W_in=self.params['Wxo'], W_hid=self.params['Who'], b=self.params['bo'], W_cell=None), \
				cell=Gate(W_in=self.params['Wxc'], W_hid=self.params['Whc'], b=self.params['bc'], W_cell=None,\
					nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, peepholes=False)
			if self.do_brnn:
				self.l_bgru_emb = LSTMLayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
					ingate=Gate(W_in=self.params['bWxr'], W_hid=self.params['bWhr'], b=self.params['bbr'], W_cell=None), \
					forgetgate=Gate(W_in=self.params['bWxu'], W_hid=self.params['bWhu'], b=self.params['bbu'], W_cell=None), \
					outgate=Gate(W_in=self.params['bWxo'], W_hid=self.params['bWho'], b=self.params['bbo'], W_cell=None), \
					cell=Gate(W_in=self.params['bWxc'], W_hid=self.params['bWhc'], b=self.params['bbc'], W_cell=None,\
						nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, peepholes=False, backwards=True)
		elif self.is_gru:
			self.l_gru_emb = GRULayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
				resetgate=Gate(W_in=self.params['Wxr'], W_hid=self.params['Whr'], b=self.params['br'], W_cell=None), \
				updategate=Gate(W_in=self.params['Wxu'], W_hid=self.params['Whu'], b=self.params['bu'], W_cell=None), \
				hidden_update=Gate(W_in=self.params['Wxc'], W_hid=self.params['Whc'], b=self.params['bc'], W_cell=None,\
					nonlinearity=nonlinearities.tanh),mask_input=self.l_mask)
			if self.do_brnn:
				self.l_bgru_emb = GRULayer((None, None, self.td), self.hd, grad_clipping=self.gc, \
					resetgate=Gate(W_in=self.params['bWxr'], W_hid=self.params['bWhr'], b=self.params['bbr'], W_cell=None), \
					updategate=Gate(W_in=self.params['bWxu'], W_hid=self.params['bWhu'], b=self.params['bbu'], W_cell=None), \
					hidden_update=Gate(W_in=self.params['bWxc'], W_hid=self.params['bWhc'], b=self.params['bbc'], W_cell=None,\
						nonlinearity=nonlinearities.tanh),mask_input=self.l_mask, backwards=True)
		else:
			self.is_nlm = True
	
		with open(dat_path, 'rb') as f:
			d = pk.load(f)
		self.nw, self.mw, self.ms = d['def'].shape # num words, max num of words, max num of senses
		self.dw = d['dw'] # dw to index
		self.aw = d['aw']
		self.no = len(d['aw'])
		if 'spriors' in d:
			self.sense_priors = d['spriors']
		else:
			self.sense_priors = np.ones((self.no, self.ms))

		with open(supp_path, 'rb') as f:
			s = pk.load(f)
		self.id2aw = s['id2aw']
		self.id2dw = s['id2dw']
		self.aw2dw = s['aw2dw']

		self.build_encoder()

	# assume xml-style input
	# output: 'lemma.pos instance-id sense-name/rating'
	def perform_wsi(self):
		expr = '[' + string.punctuation + ']'
		jaccard = False
		for d in os.listdir(self.wsi_path):
			f = os.path.join(self.wsi_path, d)
			if not os.path.isfile(f): continue
			with open(f) as fin:
				wsi = xd.parse(fin.read())
			for inst in wsi['instances']['instance']:
				tok = inst['@token']
				txt = re.sub(expr, ' ', inst['#text'])
				lemma = inst['@lemma']
				pos = inst['@partOfSpeech']
				inst_id = inst['@id']
				ind = txt.split().index(tok)
				s, m, ptmp = self.to_indexes(txt, token=tok, pos=pos, lem=lemma)
				'''s = s.reshape(1, *s.shape)
				m = m.reshape(1, *m.shape)
				fu = np.asarray([ptmp]).astype(np.int32)
				weights = self.get_weights(s, m, fu, np.ones_like(s).astype(np.float32)) # mw x ms'''
				weights = self.get_vector([txt], mode='w', token=tok, pos=pos, lem=lemma)
				senses = s[ind, :]
				sweight = weights[0][ind, :]
				ratings = [(self.id2dw[senses[i]], sweight[i]) \
					for i in range(len(sweight)) \
						if self.id2dw[senses[i]].split('.')[0] == lemma and sweight[i] > 0.02]
				ratings.sort(key=lambda k: k[1], reverse=True)
				if len(ratings) == 0: pdb.set_trace()
				l = min(3, len(ratings))
				if jaccard:
					r = [k[0] for k in ratings[0:2]]
				else:
					r = [k[0] + '/' + str(k[1]) for k in ratings[0:l]]
				print '{}.{} {} {}'.format(lemma, pos, inst_id, ' '.join(r))

	def build_encoder(self):
		def to_vect(d, m, p):
			L0 = self.params['L0']
			hid_inp = self.dwe[d, :] # mw x ms x hd
			logit = T.exp(T.dot(hid_inp, L0)[:,:,p])# (mw x ms) x mw
			mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
			mask = mk.dimshuffle(0, 'x', 'x')
			l2 = logit * mask # mw x ms x mw
			l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m # mw x ms 
			w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x')
			w1 = T.switch(T.isnan(w0), 0, w0)
			w = w1.dimshuffle(0, 1, 'x') # mw x ms x 1
			res = T.sum(w * hid_inp, axis=1) # mw x hd
			return res #, logit, weights

		def to_weights(d, m, p, prior):
			hid_inp = self.dwe[d, :] # mw x ms x hd
			if self.is_lstm or self.is_gru:
				L0 = self.params['L0']
				logit = T.exp(T.dot(hid_inp, L0)[:,:,p])# (mw x ms) x mw
				mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
				mask = mk.dimshuffle(0, 'x', 'x')
				l2 = logit * mask # mw x ms x mw
				l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m # mw x ms 
				w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x')
				w1 = T.switch(T.isnan(w0), 0, w0)
			else:
				if self.lm_mode == 'diag':
					B = hid_inp * self.params['Wt'].dimshuffle('x', 'x', 0)
					tmp = T.tensordot(B, B.T, axes=1)
				elif self.lm_mode == 'iden':
					logit = T.tensordot(self.dwe[d, :], self.dwe.T, axes=1)[:,:,d] # mw x ms x mw x ms
					cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0) # 1 x 1 x mw
					logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1), axis=3) / cnt # mw x ms x mw
					logit = T.exp(10*T.switch(T.isnan(logit), 0, logit)) # mw x ms x mw
					logit = T.prod(logit, axis=2) * prior # mw x ms
					sm = T.sum(logit * m, axis=1, keepdims=True) # mw x 1
					logit = (logit * m) / sm # mw x ms
					return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit)
				else:
					tmp = T.tensordot(T.dot(hid_inp, self.params['Wt']), hid_inp.T, axes=1) # mw x ms x ms x mw
				tmp = T.exp(tmp.dimshuffle(0, 1, 3, 2)) # mw x ms x mw x ms
				tmp = tmp * m.dimshuffle('x', 'x', 0, 1)
				nrm = T.sum(tmp, axis=3)
				tmp = tmp / nrm.dimshuffle(0, 1, 2, 'x')
				tmp = T.switch(T.isnan(tmp), 0, tmp)
				mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
				tmp = T.max(tmp, axis=3) * mk.dimshuffle('x', 'x', 0) # mw x ms x mw
				tmp = T.exp(T.sum(T.log(T.switch(T.eq(tmp, 0), 1, tmp)), axis=2)) * m # mw x ms
				tmp = tmp * prior
				tmp = tmp / T.sum(tmp, axis=1).dimshuffle(0, 'x')
				w1 = T.switch(T.isnan(tmp), 0, tmp)
			return w1

		st = T.itensor3('st') # bs x len x ms
		pd = T.imatrix('wi') # bs x len
		mk = T.itensor3('mk') # bs x len x ms
		wv = T.dmatrix('wv') # bs x hd
		pe = T.imatrix('pe') # bs x mew
		pr = T.tensor3('pr') # bs x len x ms
		weights, _ = theano.scan(fn = to_weights, sequences = [st, mk, pd, pr]) # bs x mw x ms
		mask = T.ones_like(pd).astype(theano.config.floatX) # bs x len

		if self.is_lstm or self.is_gru:
			enc, _ = theano.scan(fn = to_vect, sequences = [st, mk, pd]) # bs x mw x hd
			enc = enc.astype(theano.config.floatX)
			fdef_emb = self.l_gru_emb.get_output_for([enc, mask]) # bs x hd
			if self.do_brnn:
				bdef_emb = self.l_bgru_emb.get_output_for([enc, mask])
				if self.concat:
					def_emb = T.concatenate([fdef_emb[:,-1,:], bdef_emb[:,0,:]], axis=1)
				else:
					def_emb = T.dot(fdef_emb[:, -1, :], self.params['Wf']) + \
						T.dot(bdef_emb[:, 0, :], self.params['Wb']) + \
						self.params['by'].dimshuffle('x', 0) # bs x hd
			else:
				def_emb = fdef_emb[:, -1, :]
		else:
			hid_inp = self.dwe[st, :]
			dat = T.sum(weights.dimshuffle(0, 1, 2, 'x') * hid_inp, axis=2)
			def_emb = T.sum(T.dot(dat, self.params['L']), axis = 1)

		self.encode = theano.function([st, mk, pd, pr], def_emb)
		self.get_weights = theano.function([st, mk, pd, pr], weights)

	def preproc_word(self, w, pos=None):
		if pos == 'j': pos = 'a'
		w = re.sub(r'[\$,\{\}\[\]\(\)`\'\":;!\?\.]', '', w).lower()
		w = re.sub(r'\-', '_', w) # hyphen -> underscore
		if w == 'an': w = 'a' # dirty hack....
		if w == 'oclock': w = 'o\'clock'
		if w.isdigit(): w = '<NUM>'
		wp = wn.morphy(w, pos=pos)
		if wp is None: wp = w
		return wp
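	# preproc_word: illustrative behaviour (exact outputs depend on WordNet's morphy):
	#   preproc_word('Dogs,')      -> 'dog'
	#   preproc_word('well-known') -> 'well_known'   (hyphen mapped to underscore, if morphy leaves it unchanged)
	#   preproc_word('1984')       -> '<NUM>'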

	# 'sents' is a list of sentences
	def get_vector(self, sents, mode='v', token=None, pos='any', lem=None):
		mw = max([len(s.split()) for s in sents])
		s = np.ones((len(sents), mw, self.ms), dtype=np.int32) * -1
		m = np.zeros(s.shape, dtype=np.int32)
		p = np.ones((len(sents), mw), dtype=np.int32) * -1
		pr = np.ones((len(sents), mw, self.ms), dtype=np.float32)
		sp = self.sense_priors # no x ms
		for (si, sn) in enumerate(sents):
			s[si], m[si], p_tmp = self.to_indexes(sn, mw, token=token, pos=pos, lem=lem)
			p[si][0:len(p_tmp)] = p_tmp
			for i in range(mw):
				if i >= len(sn.split()): break # compare against the word count, not the character count
				pwid = p[si][i]
				pr[si][i] = sp[pwid, :]
		if mode == 'v':
			return self.encode(s, m, p, pr)
		else:
			return self.get_weights(s, m, p, pr)

	# 'sent' is a single string
	# mw is the maximum number of words (if called from get_vector())
	# Setting token = w and pos = p will restrict the processing of 'w' to ones having POS tag 'p'
	def to_indexes(self, sent, mw = None, token = None, pos = None, lem = None):
		def same_pos(a, b):
			if a is None or b is None or a == b: return True
			if (a == 'a' or a == 's') and b == 'j': return True
			return False

		p_tmp = []
		sn = sent.split()
		if mw is None:
			mw = len(sn)
		s = np.ones((mw, self.ms), dtype=np.int32) * -1
		m = np.zeros(s.shape, dtype=np.int32)
		for (ind, w) in enumerate(sn):
			filt = (token is not None) and (w == token) #filter the token using pos
			if filt: _pos = pos
			else: _pos = None
			w = self.preproc_word(w, pos=_pos)
			if w not in self.aw2dw or len(self.aw2dw[w]) == 0:
				s[ind, 0] = self.dw['<UNK>']
				m[ind, 0] = 1.0
			else:
				l = min(10, len(self.aw2dw[w]))
				if filt:
					cands = []
					if lem is not None: w = lem
					for wp in self.aw2dw[w]:
						try:
							if same_pos(wn.synset(wp).pos(), pos) and wp.split('.')[0] == w:
								cands.append(wp)
						except:
							continue
					#cands = [wp for wp in self.aw2dw[w] if same_pos(wn.synset(wp).pos(), pos)]
					l = min(25, len(cands))
					s[ind][0:l] = [self.dw[wp] for wp in cands][0:l]
				else:
					s[ind][0:l] = [self.dw[wp] for wp in self.aw2dw[w][0:l]]
				m[ind][0:l] = np.ones((l,))
				if l == 0: pdb.set_trace()
			if w in self.aw:
				p_tmp.append(self.aw[w])
			else:
				p_tmp.append(0)
		return s, m, p_tmp
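
The sense-weight normalization used in to_vect/to_weights above divides each word's masked sense scores by their row sum and then zeroes out the NaNs produced by fully-masked rows. A minimal NumPy sketch of just that step (illustrative values, not part of the original example):

import numpy as np

# l2: per-word sense scores (mw x ms), already multiplied by the sense mask m
l2 = np.array([[2.0, 1.0, 0.0],
               [0.0, 0.0, 0.0]])        # second word has no valid senses

w0 = l2 / l2.sum(axis=1, keepdims=True) # all-zero row -> 0/0 -> NaN
w1 = np.where(np.isnan(w0), 0.0, w0)    # same guard as T.switch(T.isnan(w0), 0, w0)
print(w1)                               # [[0.667 0.333 0.   ] [0.    0.    0.   ]]
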
Example #30
class PRAE:
    def __init__(self, num_batch, max_len, n_features, hidden=[200, 200], **kwargs):
        self.num_batch = num_batch
        self.n_features = n_features
        self.max_len = max_len
        self.hidden = hidden
        rng = np.random.RandomState(123)
        self.drng = rng
        self.rng = RandomStreams(rng.randint(2 ** 30))

        # params
        initial_W = np.asarray(
            rng.uniform(
                    low=1e-5,
                    high=1,
                    size=(self.hidden[1], self.n_features)
            ),
            dtype=theano.config.floatX
        )

        self.W_y_theta = theano.shared(value=initial_W, name='W_y_theta', borrow=True)
        # # self.W_y_kappa = theano.shared(value=initial_W, name='W_y_kappa', borrow=True)
        self.b_y_theta = theano.shared(
                value=np.zeros(
                    self.n_features,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        # self.b_y_kappa = theano.shared(
        #         value=np.zeros(
        #             self.n_features,
        #             dtype=theano.config.floatX
        #         ),
        #         name='b',
        #         borrow=True
        #     )


        # I could directly create the model here since it is fixed
        self.l_in = InputLayer(shape=(self.num_batch, self.max_len, self.n_features))
        self.mask_input = InputLayer(shape=(self.num_batch, self.max_len))
        first_hidden = GRULayer(self.l_in, mask_input=self.mask_input, num_units=hidden[0])
        self.model = GRULayer(first_hidden, num_units=hidden[1])
        # need some reshape voodoo
        # l_shp = ReshapeLayer(second_hidden, (-1, hidden[1]))
        # after the reshape I have batch*max_len X features
        # self.model = DenseLayer(l_shp, num_units=self.n_features, nonlinearity=rectify)
        # if now I put a dense layer this will collect all the output temporally which is what I want, I'll have to fix
        # the dimensions probably later
        # For every gaussian in the sum I need 3 values plus a value for the total scale
        # the output of this layer will be (num_batch, num_units, max_len) TODO check size



    def get_output_shape_for(self):
        # the model's input has hidden[0] features; the GRU output keeps the batch/time dims with hidden[1] units
        return self.model.get_output_shape_for((self.num_batch, self.max_len, self.hidden[0]))

    def get_output_y(self, output):
        # (batch, time, hidden) X (hidden, features) + (, features) => (batch, time, features)
        theta_out = T.nnet.relu(T.dot(output, self.W_y_theta) + self.b_y_theta)
        #kappa_out = T.nnet.relu(T.dot(output, self.W_y_kappa) + self.b_y_kappa)
        return theta_out

    def get_log_x(self, x, theta_out):
        # DIM = (batch, time, hidden)
        # everything is elementwise
        log_x = T.log(theta_out + 1e-8) - theta_out * x
        log_x = log_x.sum(axis=2, dtype=theano.config.floatX)  # sum over x cause I assume they are independent
        return log_x
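    # Note: get_log_x above is the exponential log-density log p(x | theta) = log(theta) - theta * x
    # (with a 1e-8 floor inside the log for numerical stability), summed over the feature axis
    # because the features are modelled as independent; e.g. theta = 2.0, x = 0.5 gives
    # log(2) - 1.0 ~= -0.307 per feature.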

    def build_model(self, train_x, train_mask_x, train_mask_out, train_target,
                    test_x, test_mask_x, test_mask_out, test_target):
        self.train_x = train_x
        self.train_mask_x = train_mask_x
        self.train_mask_out = train_mask_out
        self.train_target = train_target
        self.test_x = test_x
        self.test_mask_x = test_mask_x
        self.test_mask_out = test_mask_out
        self.test_target = test_target
        self.index = T.iscalar('index')
        self.num_batch_test = T.iscalar('num_batch_test')
        self.b_slice = slice(self.index * self.num_batch, (self.index + 1) * self.num_batch)

        sym_x = T.dtensor3()
        sym_mask_x = T.dmatrix()
        sym_target = T.dtensor3()
        # sym_mask_out = T.dtensor3() should not be useful since output is still zero
        # TODO think about this if it is true

        output = lasagne.layers.get_output(self.model, inputs={self.l_in: sym_x, self.mask_input: sym_mask_x})
        theta = self.get_output_y(output)
        log_px = self.get_log_x(sym_target, theta)
        log_px_sum_time = log_px.sum(axis=1, dtype=theano.config.floatX) # sum over time
        loss = - T.sum(log_px_sum_time) / self.num_batch # average over batch
        ##
        log_px_test = self.get_log_x(sym_target, theta)
        log_px_sum_time_test = log_px_test.sum(axis=1, dtype=theano.config.floatX) # sum over time
        loss_test = - T.sum(log_px_sum_time_test) / self.num_batch_test  # average over batch
        # loss = T.mean(lasagne.objectives.squared_error(mu, sym_target))
        all_params = [self.W_y_theta] + [self.b_y_theta] + lasagne.layers.get_all_params(self.model)
        all_grads_target = [T.clip(g, -3, 3) for g in T.grad(loss, all_params)]
        all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 3)
        updates_target = adam(all_grads_target, all_params)

        train_model = theano.function([self.index],
                                      [loss, theta, log_px],
                                      givens={sym_x: self.train_x[self.b_slice],
                                              sym_mask_x: self.train_mask_x[self.b_slice],
                                              sym_target: self.train_target[self.b_slice]},
                                      updates=updates_target)
        test_model = theano.function([self.num_batch_test],
                                     [loss_test, theta],
                                     givens={sym_x: self.test_x,
                                             sym_mask_x: self.test_mask_x,
                                             sym_target: self.test_target})

        return train_model, test_model
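
A hypothetical driver for PRAE above, kept deliberately small. The sizes, random data, and epoch loop are illustrative assumptions; build_model slices the training set symbolically inside givens, so the data must already be Theano shared variables, and the dtensor3/dmatrix declarations imply double-precision inputs (Theano's default floatX).

import numpy as np
import theano

# hypothetical sizes; PRAE and its module-level imports are assumed to be available
num_batch, max_len, n_features, n_total = 32, 50, 10, 320
prae = PRAE(num_batch, max_len, n_features)

def shared64(a):
    # match the float64 symbolic inputs (T.dtensor3 / T.dmatrix) declared in build_model
    return theano.shared(np.asarray(a, dtype='float64'))

X = shared64(np.random.rand(n_total, max_len, n_features))
M = shared64(np.ones((n_total, max_len)))

# reuse X as its own target and the training split as the test split, purely for illustration
train_model, test_model = prae.build_model(X, M, None, X, X, M, None, X)

for epoch in range(5):
    for idx in range(n_total // num_batch):
        loss, theta, log_px = train_model(idx)
    test_loss, test_theta = test_model(n_total)
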
Example #31
        def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
            """
            notes
            Headers from paper in all caps
            mem = n_article_slots if is_article else n_title_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_a: attention weights for article memory
            :param M_a: article memory
            :param args: gru_weights, maybe w_t, maybe M_t
                   gru_weights: weights with which to initialize GRULayer on each time step
                   w_t: attention weights for titles memory
                   M_t: titles memory
            :param kwargs: is_training, is_article
                   is_training:
                   is_article: we use different parts of memory when working with an article
            :return: [y = model outputs,
                      y_max = argmax predictions,
                      next_idxs = incremented index (or y_max when generating),
                      h = hidden state,
                      w, M = attention weights and memory (see above)]
            """
            is_training = kwargs['is_training']
            is_article = kwargs['is_article']
            gru_weights = args[:depth]
            if len(args) > depth:
                w_t = args[depth]
                M_t = args[depth + 1]
            else:
                w_t = M_t = None  # only the article pass may omit the title memory (asserted below)

            i_type = T.iscalar if is_article or is_training else T.ivector
            assert i.type == i_type

            if not is_article:
                assert w_t is not None and M_t is not None

            word_idxs = i
            if is_article or is_training:
                # get representation of word window
                document = articles if is_article else titles  # [instances, bucket_width]
                word_idxs = document[:, i:i + 1]  # [instances, 1]
            # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]

            input = InputLayer(shape=(None, 1), input_var=word_idxs)
            embed = EmbeddingLayer(input, num_embeddings, embedding_dim)
            gru = GRULayer(incoming=embed,
                           num_units=embedding_dim,
                           hid_init=self.gru0)
            for weight in gru_weights:
                gru = GRULayer(incoming=gru,
                               num_units=embedding_dim,
                               hid_init=weight)
            x_i = get_output(gru).flatten(ndim=2)
            x_i = Print('x_i')(x_i)  # [instances, embedding_dim]

            gru_weights = []

            if is_article:
                M_read = M_a  # [instances, memory_size, n_article_slots]
                w_read = w_a  # [instances, n_article_slots]
            else:
                M_read = T.concatenate(
                    [M_a, M_t],
                    axis=2)  # [instances, memory_size, n_title_slots]
                w_read = T.concatenate([w_a, w_t],
                                       axis=1)  # [instances, n_title_slots]

            # eqn 15
            c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

            # EXTERNAL MEMORY READ
            def get_attention(Wg, bg, M, w):
                g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

                # eqn 11
                k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

                # eqn 13
                beta = T.dot(h_tm1, self.Wb) + self.bb
                beta = T.nnet.softplus(beta)
                beta = T.addbroadcast(beta, 1)  # [instances, 1]

                # eqn 12
                w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

                # eqn 14
                return (1 - g) * w + g * w_hat  # [instances, mem]

            w_a = get_attention(self.Wg_a, self.bg_a, M_a,
                                w_a)  # [instances, n_article_slots]
            if not is_article:
                w_t = get_attention(self.Wg_t, self.bg_t, M_t,
                                    w_t)  # [instances, n_title_slots]

            # MODEL INPUT AND OUTPUT
            # eqn 9
            h = T.dot(c, self.Wh) + T.dot(
                x_i, self.Wx) + self.bh  # [instances, hidden_size]

            # eqn 10
            y = T.nnet.softmax(T.dot(h, self.W) +
                               self.b)  # [instances, nclasses]

            # EXTERNAL MEMORY UPDATE
            def update_memory(We, be, w_update, M_update):
                # eqn 17
                e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
                f = 1. - w_update * e  # [instances, mem]

                # eqn 16
                v = T.tanh(T.dot(h, self.Wv) +
                           self.bv)  # [instances, memory_size]

                # need to add broadcast layers for memory update
                f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

                # eqn 19
                return M_update * f + T.batched_dot(v, u) * (
                    1 - f)  # [instances, memory_size, mem]

            M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
            attention_and_memory = [w_a, M_a]
            if not is_article:
                M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
                attention_and_memory += [w_t, M_t]

            y_max = y.argmax(axis=1).astype('int32')
            next_idxs = i + 1 if is_training or is_article else y_max
            return [y, y_max, next_idxs, h] + attention_and_memory
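
recurrence is written to be driven by theano.scan: the first two returned values (y, y_max) are plain per-step outputs, while next_idxs, h and the attention/memory tensors feed back into the next call. A minimal, generic sketch of that scan pattern (names like step, W and h0 are illustrative, not the author's wiring), showing how None entries in outputs_info mark outputs that are not recurrent:

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.eye(3, dtype=theano.config.floatX))
h0 = T.vector('h0')

def step(h_tm1):
    h = T.tanh(T.dot(h_tm1, W))                  # recurrent state, fed back into the next step
    y = T.nnet.softmax(h.dimshuffle('x', 0))[0]  # pure output, not fed back
    return [y, h]                                # order must match outputs_info below

(ys, hs), _ = theano.scan(step, outputs_info=[None, h0], n_steps=4)
f = theano.function([h0], [ys, hs])
print(f(np.ones(3, dtype=theano.config.floatX)))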