Example #1
    def get_embedding_layer(self, l_in, extra_vars):
        language = extra_vars[0]
        context_vars = extra_vars[1:]

        id_tag = (self.id + '/') if self.id else ''

        l_lang = InputLayer(shape=(None, ),
                            input_var=language,
                            name=id_tag + 'lang_input')

        if self.options.bilingual_en_embed_file:
            en_embeddings = load_embeddings(
                self.options.bilingual_en_embed_file, self.seq_vec)
            en_embed_size = en_embeddings.shape[1]
        else:
            en_embeddings = Normal()
            en_embed_size = self.options.bilingual_embed_size

        if self.options.bilingual_zh_embed_file:
            zh_embeddings = load_embeddings(
                self.options.bilingual_zh_embed_file, self.seq_vec)
            zh_embed_size = zh_embeddings.shape[1]
        else:
            zh_embeddings = Normal()
            zh_embed_size = self.options.bilingual_embed_size

        l_en = EmbeddingLayer(l_in,
                              input_size=len(self.seq_vec.tokens),
                              output_size=en_embed_size,
                              W=en_embeddings,
                              name=id_tag + 'desc_embed_en')
        l_en_transformed = dimshuffle(l_en, (0, 2, 1))
        l_en_transformed = NINLayer(l_en_transformed,
                                    num_units=self.options.listener_cell_size,
                                    nonlinearity=None,
                                    name=id_tag + 'desc_embed_en_transformed')
        l_en_transformed = dimshuffle(l_en_transformed, (0, 2, 1))

        l_zh = EmbeddingLayer(l_in,
                              input_size=len(self.seq_vec.tokens),
                              output_size=zh_embed_size,
                              W=zh_embeddings,
                              name=id_tag + 'desc_embed_zh')
        l_zh_transformed = dimshuffle(l_zh, (0, 2, 1))
        l_zh_transformed = NINLayer(l_zh_transformed,
                                    num_units=self.options.listener_cell_size,
                                    nonlinearity=None,
                                    name=id_tag + 'desc_embed_zh_transformed')
        l_zh_transformed = dimshuffle(l_zh_transformed, (0, 2, 1))
        l_merged = SwitchLayer(l_lang, [l_en_transformed, l_zh_transformed],
                               name=id_tag + 'desc_embed_switch')
        return (l_merged, context_vars)
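The dimshuffle/NINLayer/dimshuffle sandwich above applies one shared linear projection to every time step of the embedded sequence. A minimal self-contained sketch of the pattern (the sizes are illustrative, not from the original):

import theano.tensor as T
from lasagne.layers import InputLayer, EmbeddingLayer, NINLayer, dimshuffle

# (batch, seq_len) token ids -> (batch, seq_len, emb_size) embeddings.
l_in = InputLayer((None, 5), input_var=T.imatrix())
l_emb = EmbeddingLayer(l_in, input_size=100, output_size=8)

# NINLayer mixes along axis 1, so move emb_size there and back again.
l_proj = dimshuffle(l_emb, (0, 2, 1))               # (batch, emb_size, seq_len)
l_proj = NINLayer(l_proj, num_units=16, nonlinearity=None)
l_proj = dimshuffle(l_proj, (0, 2, 1))              # (batch, seq_len, num_units)
print(l_proj.output_shape)                          # (None, 5, 16)

NINLayer applies its weight matrix across axis 1 and broadcasts over all trailing axes, which is why the embedding dimension has to be moved into that position first.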
Example #2
    def build_model(self):
        # Define tensor variables.
        x_user = T.ivector("x_user")
        x_user_context = T.ivector("x_user_context")

        y_labels = T.vector("y_emb")

        ################################################################################################################
        # Unsupervised embedding learning.
        ################################################################################################################
        l_in_user = InputLayer(shape=(None, ), input_var=x_user)
        l_in_user_context = InputLayer(shape=(None, ),
                                       input_var=x_user_context)

        l1_user = EmbeddingLayer(l_in_user,
                                 input_size=self.number_of_users,
                                 output_size=self.embedding_size,
                                 W=lasagne.init.GlorotUniform(gain=1.0))
        l1_user_context = EmbeddingLayer(
            l_in_user_context,
            input_size=self.number_of_users,
            output_size=self.embedding_size,
            W=lasagne.init.GlorotUniform(gain=1.0))

        l_user_user_merge = lasagne.layers.ElemwiseMergeLayer(
            [l1_user, l1_user_context], T.mul)

        self.l.append(l_user_user_merge)

        user_user_embedding_merge = lasagne.layers.get_output(
            l_user_user_merge)

        user_user_loss = -T.log(
            T.nnet.sigmoid(
                T.sum(user_user_embedding_merge, axis=1) * y_labels)).sum()

        l_user_user_merge_params = lasagne.layers.get_all_params(
            l_user_user_merge, trainable=True)

        user_user_updates = lasagne.updates.adam(
            user_user_loss,
            l_user_user_merge_params,
            learning_rate=self.learning_rate)

        user_user_batch_train_function = theano.function(
            [x_user, x_user_context, y_labels],
            user_user_loss,
            updates=user_user_updates,
            on_unused_input="ignore")

        return user_user_batch_train_function, \
               l1_user
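The training function above implements a skip-gram-style objective: the loss -log sigmoid(y * <u, c>) pulls a user and a context user together when y = +1 and apart when y = -1. A self-contained sketch of the same objective (the sizes and the toy batch are illustrative):

import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import (InputLayer, EmbeddingLayer, ElemwiseMergeLayer,
                            get_output, get_all_params)

n_users, emb_size = 100, 16  # illustrative sizes
x_u = T.ivector('x_u')
x_c = T.ivector('x_c')
y = T.vector('y')            # +1 for observed pairs, -1 for negative samples

l_u = EmbeddingLayer(InputLayer((None,), x_u), n_users, emb_size)
l_c = EmbeddingLayer(InputLayer((None,), x_c), n_users, emb_size)
l_dot = ElemwiseMergeLayer([l_u, l_c], T.mul)

score = T.sum(get_output(l_dot), axis=1)           # dot product <u, c>
loss = -T.log(T.nnet.sigmoid(score * y)).sum()
updates = lasagne.updates.adam(loss, get_all_params(l_dot, trainable=True))
train_fn = theano.function([x_u, x_c, y], loss, updates=updates)

train_fn(np.int32([0, 1]), np.int32([1, 0]),
         np.ones(2, dtype=theano.config.floatX))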
Example #3
    def _add_word_embeddings(self):
        self._net['input_x'] = InputLayer(shape=(None, None, None),
                                          input_var=T.itensor3(name='input_x'),
                                          name='input_x')

        self._net['input_y'] = InputLayer(shape=(None, None),
                                          input_var=T.imatrix(name='input_y'),
                                          name='input_y')

        # Infer these sizes from the data passed to the computation graph,
        # since the batch shape may differ between training and prediction.
        self._batch_size = self._net['input_x'].input_var.shape[0]
        self._input_context_size = self._net['input_x'].input_var.shape[1]
        self._input_seq_len = self._net['input_x'].input_var.shape[2]
        self._output_seq_len = self._net['input_y'].input_var.shape[1]

        self._net['input_x_batched'] = \
            reshape(self._net['input_x'], (self._batch_size * self._input_context_size, self._input_seq_len))

        self._net['input_x_mask'] = NotEqualMaskLayer(
            incoming=self._net['input_x_batched'],
            x=self._skip_token_id,
            name='mask_x')

        self._net['emb_x'] = EmbeddingLayer(
            incoming=self._net['input_x_batched'],
            input_size=self._vocab_size,
            output_size=self._word_embedding_dim,
            W=self._W_init_embedding,
            name='emb_x')
        # output shape (batch_size * input_context_size, input_seq_len, embedding_dimension)

        self._net['input_y_mask'] = NotEqualMaskLayer(
            incoming=self._net['input_y'],
            x=self._skip_token_id,
            name='mask_y')

        self._net['emb_y'] = EmbeddingLayer(
            incoming=self._net['input_y'],
            input_size=self._vocab_size,
            output_size=self._word_embedding_dim,
            W=self._W_init_embedding,
            name='emb_y')
        # output shape (batch_size, output_seq_len, embedding_dimension)

        if not self._train_word_embedding:
            self._net['emb_x'].params[self._net['emb_x'].W].remove('trainable')
            self._net['emb_y'].params[self._net['emb_y'].W].remove('trainable')
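The final two lines are the standard Lasagne idiom for freezing parameters: removing the 'trainable' tag makes get_all_params(..., trainable=True), and hence any optimizer built from it, skip those weights. The idiom in isolation:

import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, EmbeddingLayer

l_in = InputLayer((None, None), input_var=T.imatrix())
l_emb = EmbeddingLayer(l_in, input_size=10, output_size=4)

# Strip the 'trainable' tag so optimizers skip this weight matrix.
l_emb.params[l_emb.W].remove('trainable')
assert lasagne.layers.get_all_params(l_emb, trainable=True) == []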
Example #4
    def _get_l_out(self, input_vars):
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]

        l_in = InputLayer(shape=(None,), input_var=input_var,
                          name=id_tag + 'desc_input')
        embed_size = self.options.listener_cell_size or self.color_vec.num_types
        l_in_embed = EmbeddingLayer(l_in, input_size=len(self.seq_vec.tokens),
                                    output_size=embed_size,
                                    name=id_tag + 'desc_embed')

        if self.options.listener_cell_size == 0:
            l_scores = l_in_embed  # BiasLayer(l_in_embed, name=id_tag + 'bias')
        else:
            l_hidden = DenseLayer(l_in_embed, num_units=self.options.listener_cell_size,
                                  nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
                                  name=id_tag + 'hidden')
            if self.options.listener_dropout > 0.0:
                l_hidden_drop = DropoutLayer(l_hidden, p=self.options.listener_dropout,
                                             name=id_tag + 'hidden_drop')
            else:
                l_hidden_drop = l_hidden

            l_scores = DenseLayer(l_hidden_drop, num_units=self.color_vec.num_types,
                                  nonlinearity=None, name=id_tag + 'scores')
        l_out = NonlinearityLayer(l_scores, nonlinearity=softmax, name=id_tag + 'out')

        return l_out, [l_in]
Example #5
def test_embedding_2D_input():
    import numpy as np
    import theano
    import theano.tensor as T
    from lasagne.layers import EmbeddingLayer, InputLayer, helper
    x = T.imatrix()
    batch_size = 2
    seq_len = 3
    emb_size = 5
    vocab_size = 3
    l_in = InputLayer((None, seq_len))
    W = np.arange(
        vocab_size*emb_size).reshape((vocab_size, emb_size)).astype('float32')
    l1 = EmbeddingLayer(l_in, input_size=vocab_size, output_size=emb_size,
                        W=W)

    x_test = np.array([[0, 1, 2], [0, 0, 2]], dtype='int32')

    # check output shape
    assert helper.get_output_shape(
        l1, (batch_size, seq_len)) == (batch_size, seq_len, emb_size)

    output = helper.get_output(l1, x)
    f = theano.function([x], output)
    np.testing.assert_array_almost_equal(f(x_test), W[x_test])
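The final assertion relies on NumPy advanced indexing: W[x_test] selects one row of W per integer id, which is exactly what the embedding lookup computes. A quick standalone check:

import numpy as np

W = np.arange(15).reshape(3, 5)            # vocab_size=3, emb_size=5
ids = np.array([[0, 1, 2], [0, 0, 2]])     # same ids as x_test above
print(W[ids].shape)                        # (2, 3, 5): one row of W per id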
Example #6
    def __init__(self, vocab):
        ### THEANO GRAPH INPUT ###
        self.input_phrase = T.imatrix("encoder phrase tokens")
        ##########################

        self.l_in = InputLayer((None, None),
                               self.input_phrase,
                               name='context input')
        self.l_mask = InputLayer((None, None),
                                 T.neq(self.input_phrase, vocab.PAD_ix),
                                 name='context mask')

        self.l_emb = EmbeddingLayer(self.l_in,
                                    vocab.n_tokens,
                                    Config.EMB_SIZE,
                                    name="context embedding")

        self.l_lstm = LSTMLayer(self.l_emb,
                                Config.N_LSTM_UNITS,
                                name='encoder_lstm',
                                grad_clipping=Config.LSTM_LAYER_GRAD_CLIP,
                                mask_input=self.l_mask,
                                only_return_final=True,
                                peepholes=False)

        self.output = self.l_lstm
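only_return_final=True makes the LSTM return only its last hidden state, so this encoder maps a (batch, seq_len) token matrix to a single (batch, N_LSTM_UNITS) phrase vector. A shape sketch with illustrative sizes:

import theano.tensor as T
from lasagne.layers import InputLayer, EmbeddingLayer, LSTMLayer

l_in = InputLayer((None, None), input_var=T.imatrix())
l_emb = EmbeddingLayer(l_in, input_size=1000, output_size=32)

print(LSTMLayer(l_emb, 64).output_shape)                          # (None, None, 64)
print(LSTMLayer(l_emb, 64, only_return_final=True).output_shape)  # (None, 64)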
Example #7
def build_lstm_decorer():
    net = collections.OrderedDict()
    net['sent_input'] = InputLayer((None, CFG['SEQUENCE LENGTH'] - 1),
                                   input_var=T.imatrix())
    net['word_emb'] = EmbeddingLayer(net['sent_input'], input_size=CFG['VOCAB SIZE'],\
                                    output_size=CFG['EMBEDDING SIZE'])
    net['vis_input'] = InputLayer((None, CFG['VIS SIZE']),
                                  input_var=T.matrix())
    net['vis_emb'] = DenseLayer(net['vis_input'],
                                num_units=CFG['EMBEDDING SIZE'],
                                nonlinearity=lasagne.nonlinearities.identity)
    net['vis_emb_reshp'] = ReshapeLayer(net['vis_emb'],
                                        (-1, 1, CFG['EMBEDDING SIZE']))
    net['decorder_input'] = ConcatLayer(
        [net['vis_emb_reshp'], net['word_emb']])
    net['feat_dropout'] = DropoutLayer(net['decorder_input'], p=0.5)

    net['mask_input'] = InputLayer((None, CFG['SEQUENCE LENGTH']))
    net['lstm'] = LSTMLayer(net['feat_dropout'],num_units=CFG['EMBEDDING SIZE'], \
                            mask_input=net['mask_input'], grad_clipping=5.)
    net['lstm_dropout'] = DropoutLayer(net['lstm'], p=0.5)
    net['lstm_reshp'] = ReshapeLayer(net['lstm_dropout'],
                                     (-1, CFG['EMBEDDING SIZE']))
    net['word_prob'] = DenseLayer(net['lstm_reshp'],
                                  num_units=CFG['VOCAB SIZE'] + 2,
                                  nonlinearity=softmax)
    net['sent_prob'] = ReshapeLayer(
        net['word_prob'], (-1, CFG['SEQUENCE LENGTH'], CFG['VOCAB SIZE'] + 2))
    return net
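A hedged sketch of how a network dict like this is typically compiled for training; the flattened-target trick and the mean reduction are assumptions (and mask weighting is omitted for brevity), not part of the original:

import theano
import theano.tensor as T
import lasagne
from lasagne.layers import get_output, get_all_params

net = build_lstm_decorer()
prob = get_output(net['sent_prob'])           # (batch, seq_len, vocab+2)
targets = T.imatrix('targets')                # gold word ids, one per step

# Flatten so categorical_crossentropy can be applied row-wise.
loss = lasagne.objectives.categorical_crossentropy(
    prob.reshape((-1, prob.shape[-1])), targets.flatten()).mean()

params = get_all_params(net['sent_prob'], trainable=True)
train_fn = theano.function(
    [net['sent_input'].input_var, net['vis_input'].input_var,
     net['mask_input'].input_var, targets],
    loss, updates=lasagne.updates.adam(loss, params))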
Example #8
    def build_model(batch_size=128):
        x = T.matrix('input', dtype='int32')

        l_in = lasagne.layers.InputLayer(shape=(None, None,), input_var=x)
        # We can retrieve symbolic references to the input variable's shape, which
        # we will later use in reshape layers.
        batchsize, seqlen = l_in.input_var.shape

        W = np.random.rand(vocab_size, embedding_size).astype(np.float32)
        ebd = EmbeddingLayer(l_in, input_size=vocab_size, output_size=embedding_size, W=W)

        # All gradients above this threshold will be clipped.
        GRAD_CLIP = 100
        # We now build the LSTM layers, taking `ebd` as the input layer.
        # Gradients are clipped at GRAD_CLIP to prevent them from exploding.

        l_forward_1 = lasagne.layers.LSTMLayer(
            ebd, hiddenLayDim, grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh)

        l_forward_2 = lasagne.layers.LSTMLayer(
            l_forward_1, hiddenLayDim, grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh)

        # the output size of l_forward_2 will be (batch_size, seqlen, hiddenLayDim)

        # In order to connect a recurrent layer to a dense layer, we need to
        # flatten the first two dimensions (batch_size, seqlen); this will
        # cause each time step of each sequence to be processed independently
        l_shp = ReshapeLayer(l_forward_2, (-1, hiddenLayDim))
        l_out = lasagne.layers.DenseLayer(l_shp, num_units=vocab_size,
                                          W=lasagne.init.Normal(),
                                          nonlinearity=None)
        # Don't reshape back: keeping the flattened shape makes it easier
        # to compute the categorical crossentropy.
        # l_out = ReshapeLayer(l_dense, (batchsize, seqlen, vocab_size))
        return l_out, x
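As the closing comment says, leaving the output flattened as (batch_size * seqlen, vocab_size) lets the loss be computed directly against flattened integer targets. A hedged sketch, assuming the same globals the example itself relies on; note that l_out is linear, so the softmax is applied inside the loss expression:

import theano.tensor as T
import lasagne
from lasagne.layers import get_output

l_out, x = build_model()
target = T.imatrix('target')                 # (batch, seqlen) gold ids

logits = get_output(l_out)                   # (batch*seqlen, vocab_size)
probs = T.nnet.softmax(logits)               # l_out has nonlinearity=None
loss = lasagne.objectives.categorical_crossentropy(
    probs, target.flatten()).mean()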
Example #9
def build_res_stafg():

    net = collections.OrderedDict()
    # INPUTS----------------------------------------
    net['sent_input'] = InputLayer((None, CFG['SEQUENCE LENGTH']),
                                   input_var=T.imatrix())
    net['word_emb'] = EmbeddingLayer(net['sent_input'], input_size=CFG['VOCAB SIZE']+3,\
                                    output_size=CFG['WORD VECTOR SIZE'],W=np.copy(CFG['wemb']))

    net['vis_input'] = InputLayer((None,CFG['VISUAL LENGTH'], CFG['VIS SIZE']))
    # key words model-------------------------------------
    net['vis_mean_pool'] = FeaturePoolLayer(net['vis_input'],
                                                CFG['VISUAL LENGTH'],pool_function=T.mean)
    net['ctx_vis_reshp'] = ReshapeLayer(net['vis_mean_pool'],(-1,CFG['VIS SIZE']))
    net['global_vis'] = DenseLayer(net['ctx_vis_reshp'],num_units=CFG['EMBEDDING SIZE'],nonlinearity=linear)
    net['key_words_prob'] = DenseLayer(DropoutLayer(net['global_vis']), num_units=CFG['VOCAB SIZE']+3,nonlinearity=sigmoid)
    # gru model--------------------------------------
    net['mask_input'] = InputLayer((None, CFG['SEQUENCE LENGTH']))
    net['sgru'] = GRULayer(net['word_emb'],num_units=CFG['EMBEDDING SIZE'], \
                            mask_input=net['mask_input'],hid_init=net['global_vis'])
    net['sta_gru'] = CTXAttentionGRULayer([net['sgru'],net['vis_input'],net['global_vis']],
                                           num_units=CFG['EMBEDDING SIZE'],
                                           mask_input=net['mask_input'])
    net['fusion'] = DropoutLayer(ConcatLayer([net['sta_gru'], net['sgru']], axis=2), p=0.5)
    net['fusion_reshp'] = ReshapeLayer(net['fusion'], (-1,CFG['EMBEDDING SIZE']*2))
    net['word_prob'] = DenseLayer(net['fusion_reshp'], num_units=CFG['VOCAB SIZE']+3,
                                  nonlinearity=softmax)
    net['sent_prob'] = ReshapeLayer(net['word_prob'],(-1,CFG['SEQUENCE LENGTH'], CFG['VOCAB SIZE']+3))
    return net
Example #11
    def _add_condition_embeddings(self):
        self._net['input_condition_id'] = InputLayer(
            shape=(None, ), input_var=T.ivector(name='in_condition_id'), name='input_condition_id')

        self._net['emb_condition_id'] = EmbeddingLayer(
            incoming=self._net['input_condition_id'],
            input_size=self._condition_ids_num,
            output_size=self._condition_embedding_dim,
            name='embedding_condition_id')
Example #12
    def embedding(self, input_dim, cats, output_dim):
        words = np.random.uniform(-0.05, 0.05,
                                  (cats, output_dim)).astype("float32")
        w = theano.shared(value=words.astype(theano.config.floatX))
        embed_input = InputLayer((None, input_dim), input_var=T.imatrix())
        e = EmbeddingLayer(embed_input,
                           input_size=cats,
                           output_size=output_dim,
                           W=w)
        return e
Example #13
def test_lstm_get_emb_output():
    hid_size = 10
    inp_size = 10
    out_size = 40
    l_in = InputLayer((None, None), input_var=T.imatrix())
    l_emb = EmbeddingLayer(l_in, inp_size, out_size)
    l_lstm = LSTMLayer(l_emb, hid_size)

    emb_output = lasagne.layers.get_output(l_emb)
    output = lasagne.layers.get_output(l_lstm)
    output_for = l_lstm.get_output_for([emb_output])
Example #14
    def _get_l_out(self, input_vars):
        listener.check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1

        l_hidden = DenseLayer(
            l_rec1_drop,
            num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden,
                                         p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden
        l_out = DenseLayer(l_hidden_drop,
                           num_units=3,
                           nonlinearity=softmax,
                           name=id_tag + 'scores')

        return l_out, [l_in]
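The forgetgate=Gate(b=Constant(...)) pattern above sets the forget-gate bias at initialization, a common trick that biases the LSTM toward remembering early in training. The idiom in isolation (sizes illustrative):

import theano.tensor as T
from lasagne.layers import InputLayer, EmbeddingLayer, LSTMLayer, Gate
from lasagne.init import Constant

l_in = InputLayer((None, 20), input_var=T.imatrix())
l_emb = EmbeddingLayer(l_in, input_size=500, output_size=32)

# A bias of 5.0 keeps the forget gate near 1 ("remember") at the start.
l_lstm = LSTMLayer(l_emb, num_units=64, forgetgate=Gate(b=Constant(5.0)))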
Example #15
def test_lnlstm_get_emb_output():
    hid_size = 10
    inp_size = 10
    out_size = 40
    n_batches = 23
    seqlen = 47
    l_in = InputLayer((n_batches, seqlen), input_var=T.imatrix('input_var'), name="l_in")
    l_emb = EmbeddingLayer(l_in, inp_size, out_size, name="l_emb")
    l_lstm = LNLSTMLayer(l_emb, hid_size, name="l_lstm")

    emb_output = lasagne.layers.get_output(l_emb)
    output = lasagne.layers.get_output(l_lstm)
    output_for = l_lstm.get_output_for([emb_output])
Example #16
    def __init__(self, vocab, num_users):
        self.vocab = vocab

        self._user_id = T.ivector('user ids')
        self._good_utterance = T.imatrix('utterance from user')
        self._bad_utterance = T.imatrix('utterance not from user')

        self.l_utt_enc = Enc(vocab)

        self._user_inp = InputLayer((None, ),
                                    input_var=self._user_id,
                                    name='user ids layer')
        self.l_user_emb = EmbeddingLayer(self._user_inp,
                                         num_users,
                                         DssmConfig.USER_EMB_SIZE,
                                         name='user embedding')
        self.l_user_semantic = DenseLayer(self.l_user_emb,
                                          DssmConfig.SEMANTIC_SPACE_SIZE,
                                          name='user representation')
        self.l_user_semantic = dropout(self.l_user_semantic,
                                       p=DssmConfig.DROPOUT_RATE)

        self.l_utt_semantic = DenseLayer(self.l_utt_enc.output,
                                         DssmConfig.SEMANTIC_SPACE_SIZE,
                                         name='utterance representation')
        self.l_utt_semantic = dropout(self.l_utt_semantic,
                                      p=DssmConfig.DROPOUT_RATE)

        self.user_semantic = get_output(self.l_user_semantic)
        self.user_semantic_d = get_output(self.l_user_semantic,
                                          deterministic=True)

        self.good_utt_semantic = get_output(
            self.l_utt_semantic,
            inputs={self.l_utt_enc.l_in: self._good_utterance})
        self.good_utt_semantic_d = get_output(
            self.l_utt_semantic,
            inputs={self.l_utt_enc.l_in: self._good_utterance},
            deterministic=True)

        self.bad_utt_semantic = get_output(
            self.l_utt_semantic,
            inputs={self.l_utt_enc.l_in: self._bad_utterance})
        self.bad_utt_semantic_d = get_output(
            self.l_utt_semantic,
            inputs={self.l_utt_enc.l_in: self._bad_utterance},
            deterministic=True)

        self._build_loss_and_ops()
Example #17
    def get_input_layer(self, input_vars, recurrent_length=0, cell_size=20, context_len=1, id=None):
        id_tag = (id + '/') if id else ''
        (input_var,) = input_vars
        shape = ((None, context_len * len(self.buckets))
                 if recurrent_length == 0 else
                 (None, recurrent_length, context_len * len(self.buckets)))
        l_color = InputLayer(shape=shape, input_var=input_var,
                             name=id_tag + 'color_input')
        l_color_embed = EmbeddingLayer(l_color, input_size=sum(b.num_types for b in self.buckets),
                                       output_size=cell_size,
                                       name=id_tag + 'color_embed')

        dims = (([0], -1) if recurrent_length == 0 else ([0], [1], -1))
        l_color_flattened = reshape(l_color_embed, dims)

        return l_color_flattened, [l_color]
Example #18
    def get_input_layer(self, input_vars, recurrent_length=0, cell_size=20,
                        context_len=1, id=None):
        id_tag = (id + '/') if id else ''
        (input_var,) = input_vars
        input_shape = ((None, context_len)
                       if recurrent_length == 0 else
                       (None, recurrent_length, context_len))
        l_color = InputLayer(shape=input_shape, input_var=input_var,
                             name=id_tag + 'color_input')
        l_color_embed = EmbeddingLayer(l_color, input_size=self.num_types,
                                       output_size=cell_size,
                                       name=id_tag + 'color_embed')
        output_shape = (([0], context_len * cell_size)
                        if recurrent_length == 0 else
                        ([0], recurrent_length, context_len * cell_size))
        l_color_shape = reshape(l_color_embed, output_shape,
                                name=id_tag + 'color_embed_flattened')
        return l_color_shape, [l_color]
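The [0] (and [1]) entries in the reshape specs are Lasagne's placeholders for copying an input dimension symbolically, which is what lets the batch axis stay unspecified. A tiny demonstration with illustrative sizes:

import theano.tensor as T
from lasagne.layers import InputLayer, EmbeddingLayer, reshape

l_in = InputLayer((None, 4), input_var=T.imatrix())     # context_len = 4
l_emb = EmbeddingLayer(l_in, input_size=50, output_size=20)

# ([0], -1) keeps the batch axis and flattens the rest: (None, 4*20).
print(reshape(l_emb, ([0], -1)).output_shape)           # (None, 80)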
Example #19
    def modify_context(self, l_context_repr, extra_vars):
        language = extra_vars[0]
        id_tag = (self.id + '/') if self.id else ''

        print('l_context_repr: {}'.format(l_context_repr.output_shape))

        l_lang_input = InputLayer(shape=(None, ),
                                  input_var=language,
                                  name=id_tag + 'lang_input')
        l_lang_embed = EmbeddingLayer(
            l_lang_input,
            input_size=len(self.lang_vec.tokens),
            output_size=self.options.bilingual_lang_embed_size,
            name=id_tag + 'lang_embed')
        print('l_lang_embed: {}'.format(l_lang_embed.output_shape))

        l_modified_context = ConcatLayer([l_lang_embed, l_context_repr])
        print('l_modified_context: {}'.format(l_modified_context.output_shape))

        return (l_modified_context, [l_lang_input])
Example #20
    def __init__(self, vocab, enc):
        # Define inputs of decoder at each time step.
        self.prev_cell = InputLayer((None, Config.N_LSTM_UNITS), name='cell')
        self.prev_hid = InputLayer((None, Config.N_LSTM_UNITS), name='hid')
        self.input_word = InputLayer((None, ))
        self.encoder_lstm = InputLayer((None, Config.N_LSTM_UNITS),
                                       name='encoder')

        # Embed input word and use the same embeddings as in the encoder.
        self.word_embedding = EmbeddingLayer(self.input_word,
                                             vocab.n_tokens,
                                             Config.EMB_SIZE,
                                             W=enc.l_emb.W,
                                             name='emb')

        # Note: this is LSTMCell, not LSTMLayer; the *Cell variant is used for
        # one-tick (single-step) networks.
        self.new_cell, self.new_hid = LSTMCell(
            self.prev_cell,
            self.prev_hid,
            input_or_inputs=[self.word_embedding, self.encoder_lstm],
            name='decoder_lstm',
            peepholes=False)

        # Define parts for new word prediction. Bottleneck is a hack for reducing time complexity.
        self.bottleneck = DenseLayer(self.new_hid,
                                     Config.BOTTLENECK_UNITS,
                                     nonlinearity=T.tanh,
                                     name='decoder intermediate')

        self.next_word_probs = DenseLayer(
            self.bottleneck,
            vocab.n_tokens,
            nonlinearity=lambda probs: T.nnet.softmax(probs / Config.TEMPERATURE),
            name='decoder next word probas')

        self.next_words = ProbabilisticResolver(self.next_word_probs,
                                                assume_normalized=True)
Example #21
def multi_task_classifier(args,
                          input_var,
                          target_var,
                          wordEmbeddings,
                          seqlen,
                          num_feats,
                          lambda_val=0.5 * 1e-4):

    print("Building multi task model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1
    filter_size = wordDim
    pool_size = num_filters

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape

    #span
    emb1 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape1 = ReshapeLayer(emb1, (batchsize, seqlen, num_feats * wordDim))
    conv1d_1 = DimshuffleLayer(
        Conv1DLayer(reshape1,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size)
    hid_1 = DenseLayer(maxpool_1,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax)
    """
    #DocTimeRel
    emb2 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape2 = ReshapeLayer(emb2, (batchsize, seqlen, num_feats*wordDim))
    conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape2, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size)  
    hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_2 = DenseLayer(hid_2, num_units=5, nonlinearity=softmax)
    """

    #Type
    emb3 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape3 = ReshapeLayer(emb3, (batchsize, seqlen, num_feats * wordDim))
    conv1d_3 = DimshuffleLayer(
        Conv1DLayer(reshape3,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size)
    hid_3 = DenseLayer(maxpool_3,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_3 = DenseLayer(hid_3, num_units=4, nonlinearity=softmax)

    #Degree
    emb4 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape4 = ReshapeLayer(emb4, (batchsize, seqlen, num_feats * wordDim))
    conv1d_4 = DimshuffleLayer(
        Conv1DLayer(reshape4,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size)
    hid_4 = DenseLayer(maxpool_4,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_4 = DenseLayer(hid_4, num_units=4, nonlinearity=softmax)

    #Polarity
    emb5 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape5 = ReshapeLayer(emb5, (batchsize, seqlen, num_feats * wordDim))
    conv1d_5 = DimshuffleLayer(
        Conv1DLayer(reshape5,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size)
    hid_5 = DenseLayer(maxpool_5,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_5 = DenseLayer(hid_5, num_units=3, nonlinearity=softmax)

    #ContextualModality
    emb6 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape6 = ReshapeLayer(emb6, (batchsize, seqlen, num_feats * wordDim))
    conv1d_6 = DimshuffleLayer(
        Conv1DLayer(reshape6,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size)
    hid_6 = DenseLayer(maxpool_6,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_6 = DenseLayer(hid_6, num_units=5, nonlinearity=softmax)
    """
    #ContextualAspect
    emb7 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape7 = ReshapeLayer(emb7, (batchsize, seqlen, num_feats*wordDim))
    conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape7, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size)  
    hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_7 = DenseLayer(hid_7, num_units=4, nonlinearity=softmax)
    """
    """
    #Permanence
    emb8 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape8 = ReshapeLayer(emb8, (batchsize, seqlen, num_feats*wordDim))
    conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape8, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size)  
    hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_8 = DenseLayer(hid_8, num_units=4, nonlinearity=softmax)
    """

    # Is this important?
    """
    network_1_out, network_2_out, network_3_out, network_4_out, \
    network_5_out, network_6_out, network_7_out, network_8_out = \
    get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8])
    """
    network_1_out = get_output(network_1)
    network_3_out = get_output(network_3)
    network_4_out = get_output(network_4)
    network_5_out = get_output(network_5)
    network_6_out = get_output(network_6)

    loss_1 = T.mean(binary_crossentropy(
        network_1_out, target_var)) + regularize_layer_params_weighted(
            {
                emb1: lambda_val,
                conv1d_1: lambda_val,
                hid_1: lambda_val,
                network_1: lambda_val
            }, l2)
    updates_1 = adagrad(loss_1,
                        get_all_params(network_1, trainable=True),
                        learning_rate=args.step)
    train_fn_1 = theano.function([input_var, target_var],
                                 loss_1,
                                 updates=updates_1,
                                 allow_input_downcast=True)
    val_acc_1 = T.mean(
        binary_accuracy(get_output(network_1, deterministic=True), target_var))
    val_fn_1 = theano.function([input_var, target_var],
                               val_acc_1,
                               allow_input_downcast=True)
    """
    loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb2:lambda_val, conv1d_2:lambda_val, 
                hid_2:lambda_val, network_2:lambda_val} , l2)
    updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step)
    train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True)
    val_acc_2 =  T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var))
    val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True)
    """

    loss_3 = T.mean(categorical_crossentropy(
        network_3_out, target_var)) + regularize_layer_params_weighted(
            {
                emb3: lambda_val,
                conv1d_3: lambda_val,
                hid_3: lambda_val,
                network_3: lambda_val
            }, l2)
    updates_3 = adagrad(loss_3,
                        get_all_params(network_3, trainable=True),
                        learning_rate=args.step)
    train_fn_3 = theano.function([input_var, target_var],
                                 loss_3,
                                 updates=updates_3,
                                 allow_input_downcast=True)
    val_acc_3 = T.mean(
        categorical_accuracy(get_output(network_3, deterministic=True),
                             target_var))
    val_fn_3 = theano.function([input_var, target_var],
                               val_acc_3,
                               allow_input_downcast=True)

    loss_4 = T.mean(categorical_crossentropy(
        network_4_out, target_var)) + regularize_layer_params_weighted(
            {
                emb4: lambda_val,
                conv1d_4: lambda_val,
                hid_4: lambda_val,
                network_4: lambda_val
            }, l2)
    updates_4 = adagrad(loss_4,
                        get_all_params(network_4, trainable=True),
                        learning_rate=args.step)
    train_fn_4 = theano.function([input_var, target_var],
                                 loss_4,
                                 updates=updates_4,
                                 allow_input_downcast=True)
    val_acc_4 = T.mean(
        categorical_accuracy(get_output(network_4, deterministic=True),
                             target_var))
    val_fn_4 = theano.function([input_var, target_var],
                               val_acc_4,
                               allow_input_downcast=True)

    loss_5 = T.mean(categorical_crossentropy(
        network_5_out, target_var)) + regularize_layer_params_weighted(
            {
                emb5: lambda_val,
                conv1d_5: lambda_val,
                hid_5: lambda_val,
                network_5: lambda_val
            }, l2)
    updates_5 = adagrad(loss_5,
                        get_all_params(network_5, trainable=True),
                        learning_rate=args.step)
    train_fn_5 = theano.function([input_var, target_var],
                                 loss_5,
                                 updates=updates_5,
                                 allow_input_downcast=True)
    val_acc_5 = T.mean(
        categorical_accuracy(get_output(network_5, deterministic=True),
                             target_var))
    val_fn_5 = theano.function([input_var, target_var],
                               val_acc_5,
                               allow_input_downcast=True)

    loss_6 = T.mean(categorical_crossentropy(
        network_6_out, target_var)) + regularize_layer_params_weighted(
            {
                emb6: lambda_val,
                conv1d_6: lambda_val,
                hid_6: lambda_val,
                network_6: lambda_val
            }, l2)
    updates_6 = adagrad(loss_6,
                        get_all_params(network_6, trainable=True),
                        learning_rate=args.step)
    train_fn_6 = theano.function([input_var, target_var],
                                 loss_6,
                                 updates=updates_6,
                                 allow_input_downcast=True)
    val_acc_6 = T.mean(
        categorical_accuracy(get_output(network_6, deterministic=True),
                             target_var))
    val_fn_6 = theano.function([input_var, target_var],
                               val_acc_6,
                               allow_input_downcast=True)
    """
    loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb7:lambda_val, conv1d_7:lambda_val, 
                hid_7:lambda_val, network_7:lambda_val} , l2)
    updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step)
    train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True)
    val_acc_7 =  T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var))
    val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True)

    loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb8:lambda_val, conv1d_8:lambda_val, 
                hid_8:lambda_val, network_8:lambda_val} , l2)
    updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step)
    train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True)
    val_acc_8 =  T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var))
    val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True)
    """
    """
    return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \
            network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
            train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8
    """
    return train_fn_1, val_fn_1, network_1, train_fn_3, val_fn_3, \
            network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
            train_fn_6, val_fn_6, network_6
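The five live task heads above differ only in their number of output classes, so the repeated embed/reshape/conv/pool/dense chain can be factored into a single helper. A hedged sketch (the helper and its parameter list are illustrative, assuming the same imports as the example):

def build_task_head(input_layer, batchsize, seqlen, num_feats, wordDim,
                    vocab_size, wordEmbeddings, num_filters, pool_size,
                    hidden_dim, num_classes):
    # Same embed -> reshape -> conv1d -> maxpool -> dense -> softmax chain
    # as each head above, parameterized by the number of output classes.
    emb = EmbeddingLayer(input_layer, input_size=vocab_size,
                         output_size=wordDim, W=wordEmbeddings.T)
    resh = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))
    conv = DimshuffleLayer(
        Conv1DLayer(resh, num_filters=num_filters, filter_size=wordDim,
                    stride=1, nonlinearity=tanh, W=GlorotUniform()),
        (0, 2, 1))
    pool = MaxPool1DLayer(conv, pool_size=pool_size)
    hid = DenseLayer(pool, num_units=hidden_dim, nonlinearity=sigmoid)
    return DenseLayer(hid, num_units=num_classes, nonlinearity=softmax)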
Example #22
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]
        context_vars = input_vars[1:]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed,
                      name=id_tag + 'rec1',
                      only_return_final=True,
                      **cell_kwargs)
        if self.options.listener_bidi:
            l_rec1_backwards = cell(l_in_embed,
                                    name=id_tag + 'rec1_back',
                                    backwards=True,
                                    only_return_final=True,
                                    **cell_kwargs)
            l_rec1 = ConcatLayer([l_rec1, l_rec1_backwards],
                                 axis=1,
                                 name=id_tag + 'rec1_bidi_concat')
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1

        # (batch_size, repr_size)
        l_pred_mean = DenseLayer(l_rec1_drop,
                                 num_units=self.color_vec.output_size,
                                 nonlinearity=None,
                                 name=id_tag + 'pred_mean')
        # (batch_size, repr_size * repr_size)
        l_pred_covar_vec = DenseLayer(
            l_rec1_drop,
            num_units=self.color_vec.output_size**2,
            # initially produce identity matrix
            b=np.eye(self.color_vec.output_size,
                     dtype=theano.config.floatX).ravel(),
            nonlinearity=None,
            name=id_tag + 'pred_covar_vec')
        # (batch_size, repr_size, repr_size)
        l_pred_covar = reshape(
            l_pred_covar_vec,
            ([0], self.color_vec.output_size, self.color_vec.output_size),
            name=id_tag + 'pred_covar')

        # Context repr has shape (batch_size, context_len * repr_size)
        l_context_repr, context_inputs = self.color_vec.get_input_layer(
            context_vars,
            cell_size=self.options.listener_cell_size,
            context_len=self.context_len,
            id=self.id)
        l_context_points = reshape(
            l_context_repr,
            ([0], self.context_len, self.color_vec.output_size))

        l_unnorm_scores = GaussianScoreLayer(l_context_points,
                                             l_pred_mean,
                                             l_pred_covar,
                                             name=id_tag + 'gaussian_score')

        l_scores = NonlinearityLayer(l_unnorm_scores,
                                     nonlinearity=softmax,
                                     name=id_tag + 'scores')

        return l_scores, [l_in] + context_inputs
Example #23
    def __init__(self,
                 pre_trained_w_embs=None,
                 pre_trained_c_embs=None,
                 w_grams=(3, 4, 5),
                 w_nfs=(50, 50, 50),
                 c_grams=(4, 5, 6),
                 c_nfs=(50, 50, 50),
                 mlp_layers=(2, ),
                 mlp_dropouts=(0.5, ),
                 mlp_nonlinearities=(softmax, ),
                 opt_method=lasagne.updates.adadelta,
                 opt_args={
                     'learning_rate': 0.1,
                     'rho': 0.95,
                     'epsilon': 1e-6
                 },
                 **kwargs):
        parameters = locals()
        del parameters['self']
        parameters.update(kwargs)
        self.parameters = parameters

        assert pre_trained_w_embs is not None
        assert pre_trained_c_embs is not None
        if isinstance(pre_trained_w_embs, tuple):
            w_vocab_size = pre_trained_w_embs[0]
            w_emb_dim = pre_trained_w_embs[1]
            pre_trained_w_embs = Uniform(0.25).sample(
                (w_vocab_size, w_emb_dim))
        else:
            w_vocab_size = pre_trained_w_embs.shape[0]
            w_emb_dim = pre_trained_w_embs.shape[1]
        t_pre_trained_w_embs = theano.shared(pre_trained_w_embs,
                                             name='w_embs',
                                             borrow=True)

        if isinstance(pre_trained_c_embs, tuple):
            c_vocab_size = pre_trained_c_embs[0]
            c_emb_dim = pre_trained_c_embs[1]
            pre_trained_c_embs = Uniform(0.25).sample(
                (c_vocab_size, c_emb_dim))
        else:
            c_vocab_size = pre_trained_c_embs.shape[0]
            c_emb_dim = pre_trained_c_embs.shape[1]
        t_pre_trained_c_embs = theano.shared(pre_trained_c_embs,
                                             name='c_embs',
                                             borrow=True)

        w_sents = T.imatrix(name='w_sents')
        c_sents = T.imatrix(name='c_sents')
        labels = T.ivector(name='labels')

        w_input = InputLayer((None, None), input_var=w_sents)
        c_input = InputLayer((None, None), input_var=c_sents)

        w_embs = EmbeddingLayer(w_input,
                                input_size=w_vocab_size,
                                output_size=w_emb_dim,
                                W=t_pre_trained_w_embs)
        w_embs = ReshapeLayer(w_embs, ([0], 1, [1], [2]))
        c_embs = EmbeddingLayer(c_input,
                                input_size=c_vocab_size,
                                output_size=c_emb_dim,
                                W=t_pre_trained_c_embs)
        c_embs = ReshapeLayer(c_embs, ([0], 1, [1], [2]))

        conv_layers = []
        for w_gram, w_nf in zip(w_grams, w_nfs):
            conv_layer = Conv2DLayer(w_embs,
                                     w_nf, (w_gram, w_emb_dim),
                                     pad=(w_gram - 1, 0))
            # shape (batch_size, nfs[i], 1, 1)
            pooled_layer = MaxLayer(conv_layer, axis=2)
            # shape (batch_size, nfs[i])
            flatten_layer = ReshapeLayer(pooled_layer, ([0], [1]))
            conv_layers.append(flatten_layer)

        for c_gram, c_nf in zip(c_grams, c_nfs):
            # shape (batch_size, nfs[i], num_features, 1)
            conv_layer = Conv2DLayer(c_embs,
                                     c_nf, (c_gram, c_emb_dim),
                                     pad=(c_gram - 1, 0))
            # shape (batch_size, nfs[i], 1, 1)
            pooled_layer = MaxLayer(conv_layer, axis=2)
            # shape (batch_size, nfs[i])
            flatten_layer = ReshapeLayer(pooled_layer, ([0], [1]))
            conv_layers.append(flatten_layer)

        network = ConcatLayer(conv_layers, axis=1)

        for mlp_layer, mlp_dropout, mlp_nonlinearity in zip(
                mlp_layers, mlp_dropouts, mlp_nonlinearities):
            if mlp_dropout is not None:
                network = DropoutLayer(network, p=mlp_dropout)
            network = DenseLayer(network,
                                 num_units=mlp_layer,
                                 nonlinearity=mlp_nonlinearity)

        self.network = network

        # Create a loss expression for training, i.e., the negative log
        # likelihood we want to minimize:
        train_predict = get_output(network)
        train_loss = negative_log_likelihood(train_predict, labels)
        train_acc = T.sum(T.eq(T.argmax(train_predict, axis=1), labels),
                          axis=0)

        # We could add some weight decay as well here, see lasagne.regularization.

        # Create update expressions for training, i.e., how to modify the
        # parameters at each training step, using the optimizer passed in as
        # opt_method (Adadelta by default); Lasagne offers plenty more.
        params = get_all_params(network, trainable=True)
        updates = opt_method(train_loss, params, **opt_args)
        # Correct the updates for row 0 of each embedding matrix by resetting
        # it to its previous value, so the padding embedding never changes.
        updates[t_pre_trained_w_embs] = T.set_subtensor(
            updates[t_pre_trained_w_embs][0, :], t_pre_trained_w_embs[0])
        updates[t_pre_trained_c_embs] = T.set_subtensor(
            updates[t_pre_trained_c_embs][0, :], t_pre_trained_c_embs[0])

        # Create a loss expression for validation/testing. The crucial difference
        # here is that we do a deterministic forward pass through the network,
        # disabling dropout layers.
        test_prediction = get_output(network, deterministic=True)
        test_loss = negative_log_likelihood(test_prediction, labels)
        test_acc = T.sum(T.eq(T.argmax(test_prediction, axis=1), labels),
                         axis=0)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        self.train_fn = theano.function([w_sents, c_sents, labels],
                                        [train_loss, train_acc],
                                        updates=updates)

        # Compile a second function computing the validation loss and accuracy:
        self.test_fn = theano.function([w_sents, c_sents, labels],
                                       [test_loss, test_acc])
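The update override a few lines up pins row 0 of each embedding matrix (typically the padding token) to its current value on every step. The trick in isolation, with illustrative shapes:

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.random.randn(10, 4).astype(theano.config.floatX))
loss = T.sum(W ** 2)
updates = {W: W - 0.1 * T.grad(loss, W)}

# After computing the normal update, overwrite row 0 with its old value
# so the padding embedding never moves.
updates[W] = T.set_subtensor(updates[W][0, :], W[0])
step = theano.function([], loss, updates=updates)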
Example #24
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        color_mask_var, prev_output_var, mask_var = input_vars[-3:]
        color_input_vars = input_vars[:-3]

        num_contexts = color_mask_var.shape[0]
        num_colors = color_mask_var.shape[1]
        l_color_repr, color_inputs = self.color_vec.get_input_layer(
            color_input_vars,
            recurrent_length=0,
            cell_size=self.options.speaker_cell_size,
            context_len=None,
            id=self.id)
        l_color_reshaped = ReshapeLayer(
            l_color_repr,
            (num_contexts, num_colors, self.color_vec.output_size),
            name=id_tag + 'color_reshaped')
        l_color_mask_in = InputLayer(shape=(None, None),
                                     input_var=color_mask_var,
                                     name=id_tag + 'color_mask')

        cell = CELLS[self.options.speaker_cell]
        cell_kwargs = {
            'mask_input':
            (None if self.options.speaker_no_mask else l_color_mask_in),
            'grad_clipping':
            self.options.speaker_grad_clipping,
            'num_units':
            self.options.speaker_cell_size,
        }
        if self.options.speaker_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.speaker_forget_bias))
        if self.options.speaker_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.speaker_nonlinearity]

        l_context_out = cell(l_color_reshaped,
                             name=id_tag + 'reccontext',
                             only_return_final=True,
                             **cell_kwargs)
        l_context_tiled = RepeatLayer(l_context_out,
                                      self.seq_vec.max_len - 1,
                                      name=id_tag + 'reccontext_tiled')

        l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=id_tag + 'prev_input')
        l_prev_embed = EmbeddingLayer(
            l_prev_out,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.speaker_cell_size,
            name=id_tag + 'prev_embed')
        l_in = ConcatLayer([l_context_tiled, l_prev_embed],
                           axis=2,
                           name=id_tag + 'color_prev')
        l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                               input_var=mask_var,
                               name=id_tag + 'mask_input')
        l_rec_drop = l_in

        cell_kwargs['mask_input'] = (None if self.options.speaker_no_mask else
                                     l_mask_in)

        for i in range(1, self.options.speaker_recurrent_layers):
            l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
            if self.options.speaker_dropout > 0.0:
                l_rec_drop = DropoutLayer(l_rec,
                                          p=self.options.speaker_dropout,
                                          name=id_tag + 'rec%d_drop' % i)
            else:
                l_rec_drop = l_rec
        l_rec = cell(l_rec_drop,
                     name=id_tag +
                     'rec%d' % self.options.speaker_recurrent_layers,
                     **cell_kwargs)
        l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size),
                               name=id_tag + 'reshape')
        l_hidden_out = l_shape
        for i in range(1, self.options.speaker_hidden_out_layers + 1):
            l_hidden_out = DenseLayer(
                l_hidden_out,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_out%d' % i)
        l_softmax = DenseLayer(l_hidden_out,
                               num_units=len(self.seq_vec.tokens),
                               nonlinearity=softmax,
                               name=id_tag + 'softmax')
        l_out = ReshapeLayer(
            l_softmax,
            (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
            name=id_tag + 'out')

        return l_out, color_inputs + [l_color_mask_in, l_prev_out, l_mask_in]
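A net like this is typically trained with token-level crossentropy weighted by the mask. A minimal sketch under assumed names (target_var, an integer matrix of gold next tokens, is not part of the original method; l_out and mask_var come from the code above):

import theano.tensor as T
from lasagne.layers import get_output

probs = get_output(l_out)                          # (batch, max_len - 1, vocab)
flat_probs = probs.reshape((-1, probs.shape[-1]))  # one row per time step
ce = T.nnet.categorical_crossentropy(flat_probs, target_var.flatten())
# Zero out padded positions and normalize by the number of real tokens.
loss = (ce * mask_var.flatten()).sum() / mask_var.sum()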
Example #25
    def _get_net(self):
        net = OrderedDict()

        net['l_in_x'] = InputLayer(shape=(None, None),
                                   input_var=T.imatrix(name="enc_ix"),
                                   name="encoder_seq_ix")

        net['l_in_y'] = InputLayer(shape=(None, None),
                                   input_var=T.imatrix(name="dec_ix"),
                                   name="decoder_seq_ix")

        net['l_emb_x'] = EmbeddingLayer(
            incoming=net['l_in_x'],
            input_size=self.vocab_size,
            output_size=TOKEN_REPRESENTATION_SIZE,
            W=self.W,
            name="embeddings_layer_x"
        )

        net['l_emb_y'] = EmbeddingLayer(
            incoming=net['l_in_y'],
            input_size=self.vocab_size,
            output_size=TOKEN_REPRESENTATION_SIZE,
            W=self.W,
            name="embeddings_layer_y"
        )
        if not LEARN_WORD_EMBEDDINGS:
            net['l_emb_x'].params[net['l_emb_x'].W].remove('trainable')
            net['l_emb_y'].params[net['l_emb_y'].W].remove('trainable')

        # encoder ###############################################
        net['l_enc'] = self.rnn_layer(
            incoming=net['l_emb_x'],
            num_units=HIDDEN_LAYER_DIMENSION,
            grad_clipping=self.gc,
            only_return_final=True,
            name='lstm_encoder'
        )

        # decoder ###############################################

        net['l_dec'] = self.rnn_layer(
            incoming=net['l_emb_y'],
            num_units=HIDDEN_LAYER_DIMENSION,
            hid_init=net['l_enc'],
            grad_clipping=GRAD_CLIP,
            name='lstm_decoder'
        )

        # The decoder returns a batch of sequences of thought vectors, one per
        # decoded token. Reshape this 3D tensor to a 2D matrix so that the next
        # Dense layer can map each thought vector to a probability distribution
        # over the vocabulary.

        # output ###############################################
        # Cut off the last probability vector of every sequence: it corresponds
        # to the token that would follow EOS_TOKEN, which we are not interested in.

        net['l_slice'] = SliceLayer(
            incoming=net['l_dec'],
            indices=slice(0, -1),  # keep all but the last time step
            axis=1,  # sequence axis
            name='slice_layer'
        )

        net['l_dec_long'] = ReshapeLayer(
            incoming=net['l_slice'],
            shape=(-1, HIDDEN_LAYER_DIMENSION),
            name='reshape_layer'
        )

        net['l_dist'] = DenseLayer(
            incoming=net['l_dec_long'],
            num_units=self.vocab_size,
            nonlinearity=lasagne.nonlinearities.softmax,
            name="dense_output_probas"
        )

        # No need to reshape back: this "long" output can be compared directly
        # against the true one-hot vectors.

        return net
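A minimal sketch of how a net like this might be compiled for training; the Adam optimizer and the shift-by-one target alignment are assumptions consistent with the slice above (outputs at steps 0..T-2 predict tokens 1..T-1), not code from the original class:

import theano
import theano.tensor as T
import lasagne

probs = lasagne.layers.get_output(net['l_dist'])    # (batch * (T - 1), vocab)
targets = net['l_in_y'].input_var[:, 1:].flatten()  # next-token targets
loss = T.nnet.categorical_crossentropy(probs, targets).mean()
params = lasagne.layers.get_all_params(net['l_dist'], trainable=True)
updates = lasagne.updates.adam(loss, params)
train_fn = theano.function(
    [net['l_in_x'].input_var, net['l_in_y'].input_var],
    loss,
    updates=updates)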
Example #26
    def test_clone(self):
        # Data for unit testing
        X_unit = ['abcdef', 'abcdef', 'qwerty']
        X_unit = [[ord(c) for c in w] for w in X_unit]
        X_unit = np.array(X_unit, dtype='int8')
        n_alerts_unit, l_alerts_unit = X_unit.shape
        mask_unit = np.ones(X_unit.shape, dtype='int8')

        # Dimensions
        n_alerts = None
        l_alerts = None
        n_alphabet = 2**7  # All ASCII chars
        num_units = 10

        # Symbolic variables
        input_var, input_var2 = T.imatrices('inputs', 'inputs2')
        mask_var, mask_var2 = T.matrices('masks', 'masks2')
        target_var = T.dvector('targets')

        # build net for testing
        l_in = InputLayer(shape=(n_alerts, l_alerts),
                          input_var=input_var,
                          name='INPUT-LAYER')
        l_emb = EmbeddingLayer(l_in,
                               n_alphabet,
                               n_alphabet,
                               W=np.eye(n_alphabet),
                               name='EMBEDDING-LAYER')
        l_emb.params[l_emb.W].remove('trainable')  # Fix weight
        l_mask = InputLayer(shape=(n_alerts, l_alerts),
                            input_var=mask_var,
                            name='MASK-INPUT-LAYER')
        l_lstm = LSTMLayer(l_emb,
                           num_units=num_units,
                           name='LSTM-LAYER',
                           mask_input=l_mask)
        l_slice = SliceLayer(l_lstm, indices=-1, axis=1,
                             name="SLICE-LAYER")  # Only last timestep

        net = l_slice

        # clone
        l_in2 = InputLayer(shape=(n_alerts, l_alerts),
                           input_var=input_var2,
                           name='INPUT-LAYER2')
        l_mask2 = InputLayer(shape=(n_alerts, l_alerts),
                             input_var=mask_var2,
                             name='MASK-INPUT-LAYER2')
        net2 = lstm_rnn_tied_weights.clone(net, l_in2, l_mask2)

        self.assertNotEqual(repr(net), repr(net2))

        pred_unit = layers.get_output(net,
                                      inputs={
                                          l_in: input_var,
                                          l_mask: mask_var
                                      }).eval({
                                          input_var: X_unit,
                                          mask_var: mask_unit
                                      })

        pred_unit2 = layers.get_output(net2,
                                       inputs={
                                           l_in2: input_var2,
                                           l_mask2: mask_var2
                                       }).eval({
                                           input_var2: X_unit,
                                           mask_var2: mask_unit
                                       })

        self.assert_array_equal(pred_unit, pred_unit2)
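Since clone is expected to tie weights rather than copy them, the check below would make that explicit; it is an illustrative addition assuming the clone reuses the original shared variables, not part of the original test:

        # Tied weights mean both nets resolve to the same shared variables,
        # so their parameter sets should coincide.
        params = set(layers.get_all_params(net))
        params2 = set(layers.get_all_params(net2))
        self.assertEqual(params, params2)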
Example #27
    def __init__(self,
                 vocab_size,
                 n_entities,
                 embedding_size,
                 n_hidden_que,
                 n_hidden_con,
                 n_out_hidden,
                 residual=False,
                 depth_rnn=1,
                 grad_clipping=10,
                 skip_connections=False,
                 bidir=False,
                 dropout=False,
                 **kwargs):
        ReaderTwoSeqModel.__init__(self, vocab_size, n_entities,
                                   embedding_size, residual, depth_rnn,
                                   grad_clipping, skip_connections, bidir,
                                   dropout)

        self.n_hidden_question = n_hidden_que
        self.n_hidden_context = n_hidden_con
        self.n_out_hidden = n_out_hidden

        ##################
        # SEQ PROCESSING #
        ##################

        embed_con = EmbeddingLayer(self.in_con, vocab_size, embedding_size)
        embed_que = EmbeddingLayer(self.in_que,
                                   vocab_size,
                                   embedding_size,
                                   W=embed_con.W)

        gru_con = create_deep_rnn(embed_con,
                                  GRULayer,
                                  depth_rnn,
                                  layer_mask=self.in_con_mask,
                                  num_units=n_hidden_con,
                                  grad_clipping=grad_clipping,
                                  residual=residual,
                                  skip_connections=skip_connections,
                                  bidir=bidir)[-1]
        gru_que = create_deep_rnn(embed_que,
                                  GRULayer,
                                  depth_rnn,
                                  layer_mask=self.in_que_mask,
                                  num_units=n_hidden_que,
                                  grad_clipping=grad_clipping,
                                  residual=residual,
                                  skip_connections=skip_connections,
                                  bidir=bidir)[-1]

        #############
        # ATTENTION #
        #############

        que_condition = SliceLayer(gru_que, indices=-1, axis=1)
        batch_size = self.seq_con.shape[0]
        att = self.create_attention(gru_con, self.in_con_mask, que_condition,
                                    batch_size, n_hidden_con, **kwargs)

        ##########
        # OUTPUT #
        ##########

        out_att = DenseLayer(att, n_out_hidden, nonlinearity=None)
        out_que = DenseLayer(que_condition, n_out_hidden, nonlinearity=None)

        out_sum = ElemwiseSumLayer([out_att, out_que])
        if dropout:
            out_sum = DropoutLayer(out_sum, dropout)
        out_tanh = NonlinearityLayer(out_sum, nonlinearity=T.tanh)

        out = DenseLayer(out_tanh, self.n_entities, nonlinearity=None)
        if dropout:
            out = DropoutLayer(out, dropout)

        self.net = CandidateOutputLayer(out, self.in_cand, self.in_cand_mask)
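create_attention is implemented elsewhere in the class; for orientation, here is a minimal sketch of one common choice it could resemble (masked dot-product attention in raw Theano; every name below is illustrative):

import theano.tensor as T

def dot_attention(h_con, q, mask):
    """h_con: (batch, seq, units); q: (batch, units); mask: (batch, seq)."""
    scores = (h_con * q.dimshuffle(0, 'x', 1)).sum(axis=2)  # (batch, seq)
    scores = T.switch(mask > 0, scores, -1e8)               # ignore padding
    alpha = T.nnet.softmax(scores)                          # (batch, seq)
    # Attention-weighted sum of context states: (batch, units)
    return (h_con * alpha.dimshuffle(0, 1, 'x')).sum(axis=1)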
Example #28
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]
        context_vars = input_vars[1:]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        # Context repr has shape (batch_size, seq_len, context_len * repr_size)
        l_context_repr, context_inputs = self.color_vec.get_input_layer(
            context_vars,
            recurrent_length=self.seq_vec.max_len,
            cell_size=self.options.listener_cell_size,
            context_len=self.context_len,
            id=self.id)
        l_context_repr = reshape(
            l_context_repr,
            ([0], [1], self.context_len, self.color_vec.output_size))
        l_hidden_context = dimshuffle(l_context_repr, (0, 3, 1, 2),
                                      name=id_tag + 'shuffle_in')
        for i in range(1, self.options.listener_hidden_color_layers + 1):
            l_hidden_context = NINLayer(
                l_hidden_context,
                num_units=self.options.listener_cell_size,
                nonlinearity=NONLINEARITIES[
                    self.options.listener_nonlinearity],
                b=Constant(0.1),
                name=id_tag + 'hidden_context%d' % i)
        l_pool = FeaturePoolLayer(l_hidden_context,
                                  pool_size=self.context_len,
                                  axis=3,
                                  pool_function=T.mean,
                                  name=id_tag + 'pool')
        l_pool_squeezed = reshape(l_pool, ([0], [1], [2]),
                                  name=id_tag + 'pool_squeezed')
        l_pool_shuffle = dimshuffle(l_pool_squeezed, (0, 2, 1),
                                    name=id_tag + 'shuffle_out')
        l_concat = ConcatLayer([l_pool_shuffle, l_in_embed],
                               axis=2,
                               name=id_tag + 'concat_inp_context')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_concat, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop,
                      name=id_tag + 'rec2',
                      only_return_final=True,
                      **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2

        l_rec2_drop = NINLayer(l_rec2_drop,
                               num_units=self.options.listener_cell_size,
                               nonlinearity=None,
                               name=id_tag + 'rec2_dense')

        # Context is fed into the RNN as one copy for each time step; just use
        # the first time step for output.
        # Input shape: (batch_size, repr_size, seq_len, context_len)
        # Output shape: (batch_size, repr_size, context_len)
        l_context_nonrec = SliceLayer(l_hidden_context,
                                      indices=0,
                                      axis=2,
                                      name=id_tag + 'context_nonrec')
        l_pool_nonrec = SliceLayer(l_pool_squeezed,
                                   indices=0,
                                   axis=2,
                                   name=id_tag + 'pool_nonrec')

        # Output shape: (batch_size, repr_size, context_len)
        l_sub = broadcast_sub_layer(
            l_pool_nonrec,
            l_context_nonrec,
            feature_dim=self.options.listener_cell_size,
            id_tag=id_tag)
        # Output shape: (batch_size, repr_size * 2, context_len)
        l_concat_sub = ConcatLayer([l_context_nonrec, l_sub],
                                   axis=1,
                                   name=id_tag + 'concat_context_sub')
        # Output shape: (batch_size, cell_size, context_len)
        l_hidden = NINLayer(l_concat_sub,
                            num_units=self.options.listener_cell_size,
                            nonlinearity=None,
                            name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden,
                                         p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden

        l_dot = broadcast_dot_layer(
            l_rec2_drop,
            l_hidden_drop,
            feature_dim=self.options.listener_cell_size,
            id_tag=id_tag)
        l_dot_bias = l_dot  # BiasLayer(l_dot, name=id_tag + 'dot_bias')
        l_dot_clipped = NonlinearityLayer(
            l_dot_bias,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            name=id_tag + 'dot_clipped')
        l_scores = NonlinearityLayer(l_dot_clipped,
                                     nonlinearity=softmax,
                                     name=id_tag + 'scores')

        return l_scores, [l_in] + context_inputs
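broadcast_sub_layer and broadcast_dot_layer are helpers defined elsewhere; conceptually, the dot step scores the utterance representation against each candidate color in the context. An illustrative Theano equivalent of that scoring step (names assumed, shapes as in the comments above):

import theano.tensor as T

# rec: (batch, cell_size) utterance repr; ctx: (batch, cell_size, context_len)
# scores[n, j] = dot(rec[n], ctx[n, :, j]) -- one score per context color
scores = (rec.dimshuffle(0, 1, 'x') * ctx).sum(axis=1)  # (batch, context_len)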
Example #29
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training model.
    :param exp_name:
    :param embed_data:
    :param train_data:
    :param train_data_stats:
    :param val_data:
    :param val_data_stats:
    :param test_data:
    :param test_data_stats:
    :param log_path:
    :param batch_size:
    :param num_epochs:
    :param unroll_steps:
    :param learn_rate:
    :param num_dense: Number of dense fully connected layers to add after concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1
    :param penalty: Penalty to use for regularization
    :param reg_weight: Regularization coeff to use for each layer of network; may
                       want to support different coefficient for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings

    train_prem, train_hyp = generate_data(train_data,
                                          train_data_stats,
                                          "left",
                                          "right",
                                          table,
                                          seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data,
                                      val_data_stats,
                                      "left",
                                      "right",
                                      table,
                                      seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # To test for overfitting capabilities of model
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")

    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem,
                                  input_size=vocab_size,
                                  output_size=dim_embeddings,
                                  W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp,
                                 input_size=vocab_size,
                                 output_size=dim_embeddings,
                                 W=embeddings_mat)

    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat
    # Add 'num_dense' dense layers; hidden layers use tanh,
    # and the top layer is a softmax.
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(
                    l_in,
                    num_units=NUM_DENSE_UNITS,
                    nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in,
                                  num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in,
                              num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)

    network_output = get_output(l_output, {
        l_in_prem: x_p,
        l_in_hyp: x_h
    })  # Will have shape (batch_size, 3)
    f_dense_output = theano.function([x_p, x_h],
                                     network_output,
                                     on_unused_input='warn')

    # Compute cost
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1
    else:
        raise ValueError("Unsupported penalty: {0}".format(penalty))

    layers = lasagne.layers.get_all_layers(l_output)
    # layer_dict already weights every layer by reg_coeff, so multiplying the
    # regularization term by reg_coeff again would square the coefficient.
    layer_dict = {l: reg_coeff for l in layers}
    reg_cost = regularize_layer_params_weighted(layer_dict, p_metric)
    cost = T.nnet.categorical_crossentropy(network_output,
                                           target_values).mean() + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs

    stats = Stats(exp_name)
    acc_num = 10

    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in range(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))

                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)

                stats.recordCost(total_num_ex, cost_val)
                # Periodically compute and log train/dev accuracy
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp,
                                                 train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")

    except KeyboardInterrupt:
        pass
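getMinibatchesIdx is imported from elsewhere; judging from the loop above, it returns (index, minibatch-indices) pairs. A behavior-compatible sketch (an assumption, not the original helper):

import numpy as np

def get_minibatches_idx(n, batch_size, shuffle=False):
    """Split range(n) into consecutive minibatches of at most batch_size."""
    idx = np.arange(n, dtype='int64')
    if shuffle:
        np.random.shuffle(idx)
    batches = [idx[i:i + batch_size] for i in range(0, n, batch_size)]
    return list(enumerate(batches))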
Example #30
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        prev_output_var, mask_var = input_vars[-2:]
        color_input_vars = input_vars[:-2]

        context_len = self.context_len if hasattr(self, 'context_len') else 1
        l_color_repr, color_inputs = self.color_vec.get_input_layer(
            color_input_vars,
            recurrent_length=self.seq_vec.max_len - 1,
            cell_size=self.options.speaker_cell_size,
            context_len=context_len,
            id=self.id)
        l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
        for i in range(1, self.options.speaker_hidden_color_layers + 1):
            l_hidden_color = NINLayer(
                l_hidden_color,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_color%d' % i)
        l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

        l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=id_tag + 'prev_input')
        l_prev_embed = EmbeddingLayer(
            l_prev_out,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.speaker_cell_size,
            name=id_tag + 'prev_embed')
        l_in = ConcatLayer([l_hidden_color, l_prev_embed],
                           axis=2,
                           name=id_tag + 'color_prev')
        l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                               input_var=mask_var,
                               name=id_tag + 'mask_input')
        l_rec_drop = l_in

        cell = CELLS[self.options.speaker_cell]
        cell_kwargs = {
            'mask_input':
            (None if self.options.speaker_no_mask else l_mask_in),
            'grad_clipping': self.options.speaker_grad_clipping,
            'num_units': self.options.speaker_cell_size,
        }
        if self.options.speaker_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.speaker_forget_bias))
        if self.options.speaker_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.speaker_nonlinearity]

        for i in range(1, self.options.speaker_recurrent_layers):
            l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
            if self.options.speaker_dropout > 0.0:
                l_rec_drop = DropoutLayer(l_rec,
                                          p=self.options.speaker_dropout,
                                          name=id_tag + 'rec%d_drop' % i)
            else:
                l_rec_drop = l_rec
        l_rec = cell(l_rec_drop,
                     name=id_tag +
                     'rec%d' % self.options.speaker_recurrent_layers,
                     **cell_kwargs)
        l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size),
                               name=id_tag + 'reshape')
        l_hidden_out = l_shape
        for i in range(1, self.options.speaker_hidden_out_layers + 1):
            l_hidden_out = DenseLayer(
                l_hidden_out,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_out%d' % i)
        l_softmax = DenseLayer(l_hidden_out,
                               num_units=len(self.seq_vec.tokens),
                               nonlinearity=softmax,
                               name=id_tag + 'softmax')
        l_out = ReshapeLayer(
            l_softmax,
            (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
            name=id_tag + 'out')

        return l_out, color_inputs + [l_prev_out, l_mask_in]
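A speaker net like this is usually decoded greedily by feeding the argmax token back into prev_output_var one step at a time. A minimal sketch under assumed names (the start token id, mask dtype, and the layout of color_feats are all assumptions; the original codebase may decode differently):

import numpy as np
import theano
from lasagne.layers import get_output

# l_out and the input layers come from _get_l_out(...) above.
probs_fn = theano.function(
    [l.input_var for l in color_inputs + [l_prev_out, l_mask_in]],
    get_output(l_out, deterministic=True))

def greedy_decode(color_feats, max_len, start_token):
    prev = np.full((1, max_len - 1), start_token, dtype='int32')
    mask = np.ones((1, max_len - 1), dtype='float32')  # dtype is an assumption
    tokens = []
    for t in range(max_len - 1):
        # Re-run the full sequence each step; only position t is new.
        probs = probs_fn(*(list(color_feats) + [prev, mask]))
        tok = int(probs[0, t].argmax())
        tokens.append(tok)
        if t + 1 < max_len - 1:
            prev[0, t + 1] = tok
    return tokens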