Example #1
    def create_model(self):
        input_dim = self.input_dim
        x = self.x
        y = self.y
        p = self.p
        mask = self.mask
        hidden_dim = self.hidden_dim
        embedding_dim = self.embedding_dim
        lookup = LookupTable(self.dict_size,
                             embedding_dim,
                             weights_init=IsotropicGaussian(0.001),
                             name='LookupTable')
        x_to_h = Linear(embedding_dim,
                        hidden_dim * 4,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(0.001),
                        biases_init=Constant(0.0))
        lstm = LSTM(hidden_dim,
                    name='lstm',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
        h_to_o = MLP([Logistic()], [hidden_dim, 1],
                     weights_init=IsotropicGaussian(0.001),
                     biases_init=Constant(0),
                     name='h_to_o')

        lookup.initialize()
        x_to_h.initialize()
        lstm.initialize()
        h_to_o.initialize()

        embed = lookup.apply(x).reshape(
            (x.shape[0], x.shape[1], self.embedding_dim))
        embed.name = "embed_vec"
        x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
        x_transform.name = "Transformed X"
        self.lookup = lookup
        self.x_to_h = x_to_h
        self.lstm = lstm
        self.h_to_o = h_to_o

        #if mask is None:
        h, c = lstm.apply(x_transform)
        #else:
        #h, c = lstm.apply(x_transform, mask=mask)
        h.name = "hidden_state"
        c.name = "cell_state"
        # only values of hidden units of the last timeframe are used for
        # the classification
        indices = T.sum(mask, axis=0) - 1
        rel_hid = h[indices, T.arange(h.shape[1])]
        out = self.h_to_o.apply(rel_hid)

        probs = out
        return probs
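
As a sanity check on the final-timestep gather above: `T.sum(mask, axis=0) - 1` yields the index of the last valid step of each sequence, and pairing it with `T.arange(h.shape[1])` selects that step per batch column. A minimal NumPy sketch of the same trick (toy shapes and values, illustrative only):

import numpy as np

# h: (time, batch, hidden) as produced by lstm.apply
h = np.arange(2 * 3 * 4, dtype='float32').reshape(2, 3, 4)
# mask: (time, batch); the middle sequence is one step shorter
mask = np.array([[1., 1., 1.],
                 [1., 0., 1.]])

indices = mask.sum(axis=0).astype('int64') - 1   # last valid step per sequence
rel_hid = h[indices, np.arange(h.shape[1])]      # (batch, hidden)
print(rel_hid.shape)                             # (3, 4)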
Example #2
    def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512):
        self.hidden_size = hidden_size
        self.input1_size = input1_size
        self.input2_size = input2_size
        self.lookup1_dim = lookup1_dim
        self.lookup2_dim = lookup2_dim

        x1 = tensor.lmatrix('durations')
        x2 = tensor.lmatrix('syllables')
        y = tensor.lmatrix('pitches')

        lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size, name='lookup1',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup1.initialize()
        lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size, name='lookup2',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup2.initialize()
        merge = Merge(['lookup1', 'lookup2'], [self.lookup1_dim, self.lookup2_dim], self.hidden_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        merge.initialize()
        recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                              weights_init=initialization.Uniform(width=0.01)) #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
        recurrent_block.initialize()
        linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        linear.initialize()
        softmax = NDimensionalSoftmax()

        l1 = lookup1.apply(x1)
        l2 = lookup2.apply(x2)
        m = merge.apply(l1, l2)
        h = recurrent_block.apply(m)
        a = linear.apply(h)

        y_hat = softmax.apply(a, extra_ndim=1)
        # a plain 2-d softmax here would raise: ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

        self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

        self.ComputationGraph = ComputationGraph(self.Cost)

        self.Model = Model(y_hat)
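
The ValueError noted in the comment above is the reason for NDimensionalSoftmax: a plain Softmax.apply expects a 2-d input, while the recurrent output a is 3-d (time, batch, vocab). With extra_ndim=1, the brick folds the extra leading axis into the batch axis, applies the softmax over the last axis, and restores the original shape. A rough NumPy equivalent of that reshape dance (toy shapes, illustrative only):

import numpy as np

def softmax_2d(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

a = np.random.randn(5, 2, 7)                # (time, batch, vocab): 3-d
flat = a.reshape(-1, a.shape[-1])           # fold time into batch -> 2-d
y_hat = softmax_2d(flat).reshape(a.shape)   # softmax over vocab, shape restored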
def construct_model(vocab_size, embedding_dim, ngram_order, hidden_dims,
                    activations):
    # Construct the model
    x = tensor.lmatrix('features')
    y = tensor.lvector('targets')

    lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup')
    hidden = MLP(activations=activations + [None],
                 dims=[ngram_order * embedding_dim] + hidden_dims +
                 [vocab_size])

    embeddings = lookup.apply(x)
    embeddings = embeddings.flatten(ndim=2)  # Concatenate embeddings
    activations = hidden.apply(embeddings)
    cost = Softmax().categorical_cross_entropy(y, activations)

    # Initialize parameters
    lookup.weights_init = IsotropicGaussian(0.001)
    hidden.weights_init = IsotropicGaussian(0.01)
    hidden.biases_init = Constant(0.001)
    lookup.initialize()
    hidden.initialize()

    return cost
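
A possible call site for construct_model (the argument values below are invented for illustration; Tanh comes from blocks.bricks like the other bricks used above):

from blocks.bricks import Tanh

# Trigram LM over a 1000-word vocabulary with one 64-unit hidden layer;
# construct_model appends the final linear (None) activation itself.
cost = construct_model(vocab_size=1000, embedding_dim=32, ngram_order=3,
                       hidden_dims=[64], activations=[Tanh()])
cost.name = 'cost'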
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        #embed.weights_init = IsotropicGaussian(0.01)
        embed.weights_init = Constant(
            init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'

        # Match LSTM
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        # Attention mechanism MLP start
        attention_mlp_start = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_start')
        attention_clinear_start = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attm_start')  # Wym
        bricks += [attention_mlp_start, attention_clinear_start]
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(
            attention_clinear_start.apply(
                menc.reshape(
                    (menc.shape[0] * menc.shape[1], menc.shape[2]))).reshape(
                        (menc.shape[0], menc.shape[1],
                         config.attention_mlp_hidden[0])))
        att_weights_start = attention_mlp_start.apply(
            layer1_start.reshape(
                (layer1_start.shape[0] * layer1_start.shape[1],
                 layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape(
            (layer1_start.shape[0], layer1_start.shape[1]))
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        attention_mlp_end = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_end')
        attention_qlinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='atts_end')  #Wum
        attention_clinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attm_end')  # Wym
        bricks += [
            attention_mlp_end, attention_qlinear_end, attention_clinear_end
        ]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(
            attention_clinear_end.apply(
                menc.reshape((menc.shape[0] * menc.shape[1], menc.shape[2]
                              ))).reshape((menc.shape[0], menc.shape[1],
                                           config.attention_mlp_hidden[0])) +
            attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(
            layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1],
                                layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape(
            (layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        att_weights_start = tensor.dot(
            tensor.le(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(
            tensor.ge(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_end)

        # add attention from left and right
        att_weights = att_weights_start * att_weights_end
        #att_weights = tensor.minimum(att_weights_start, att_weights_end)

        att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]),
                                  dtype=theano.config.floatX)
        att_target = tensor.set_subtensor(
            att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1)
        att_target = att_target.dimshuffle(1, 0)
        #att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
        #                       tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        self.predictions = tensor.gt(att_weights, 0.25) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        #att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_weights_start, att_weights_end,
            att_weights, att_target
        ]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
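
The two tensor.le / tensor.ge dot products above are cumulative sums in disguise: multiplying the start distribution by the (j <= i) indicator matrix gives P(start <= i), multiplying the end distribution by the (j >= i) matrix gives P(end >= i), and their elementwise product is the probability that position i falls inside the predicted span. A small NumPy check of that reading (toy distributions, a single example rather than a batch):

import numpy as np

n = 4
att_start = np.array([0.1, 0.6, 0.2, 0.1])   # P(start = j)
att_end = np.array([0.05, 0.15, 0.5, 0.3])   # P(end = j)

le = (np.arange(n)[None, :] <= np.arange(n)[:, None]).astype(float)  # le[i, j] = j <= i
ge = (np.arange(n)[None, :] >= np.arange(n)[:, None]).astype(float)  # ge[i, j] = j >= i

p_started = le.dot(att_start)        # P(start <= i): a cumulative sum
p_not_ended = ge.dot(att_end)        # P(end >= i): a reversed cumulative sum
p_inside = p_started * p_not_ended   # P(position i lies inside the span)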
Example #6
    def create_model(self, symbols_num = 500):

        # Hyperparameters

        # The dimension of the hidden state of the GRUs in each direction.
        hidden_states = self.args.encoder_hidden_dims
        # Dimension of the word-embedding space
        embedding_dims = self.args.source_embeddings_dim


        ###################
        # Declaration of the Theano variables that come from the data stream
        ###################

        # The context document.
        context_bt = tt.lmatrix('context')
        # Context document mask used to distinguish real symbols from the sequence and padding symbols that are at the end
        context_mask_bt = tt.matrix('context_mask')

        # The question
        question_bt = tt.lmatrix('question')
        question_mask_bt = tt.matrix('question_mask')

        # The correct answer
        y = tt.lmatrix('answer')
        y = y[:,0] # originally answers are in a 2d matrix, here we convert it to a vector

        # The candidates among which the answer is selected
        candidates_bi = tt.lmatrix("candidates")
        candidates_bi_mask = tt.matrix("candidates_mask")



        ###################
        # Network's components
        ###################

        # Lookup table with randomly initialized word embeddings
        lookup = LookupTable(symbols_num, embedding_dims, weights_init=Uniform(width=0.2))

        # bidirectional encoder that translates context
        context_encoder = self.create_bidi_encoder("context_encoder", embedding_dims, hidden_states)

        # bidirectional encoder for question
        question_encoder = self.create_bidi_encoder("question_encoder", embedding_dims, hidden_states)

        # Initialize the components (where not done upon creation)
        lookup.initialize()



        ###################
        # Wiring the components together
        #
        # Where present, the 3 letters at the end of the variable name identify its dimensions:
        # b ... position of the example within the batch
        # t ... position of the word within the document/question
        # f ... features of the embedding vector
        ###################

        ### Read the context document
        # Map token indices to word embeddings
        context_embedding_tbf = lookup.apply(context_bt.T)

        # Read the embedded context document using the bidirectional GRU and produce the contextual embedding of each word
        memory_encoded_btf = context_encoder.apply(context_embedding_tbf, context_mask_bt.T).dimshuffle(1,0,2)
        memory_encoded_btf.name = "memory_encoded_btf"

        ### Correspondingly, read the query
        x_embedded_tbf = lookup.apply(question_bt.T)
        x_encoded_btf = question_encoder.apply(x_embedded_tbf, question_mask_bt.T).dimshuffle(1,0,2)
        # The query encoding is a concatenation of the final states of the forward and backward GRU encoder
        x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states]
        x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2]
        query_representation_bf = tt.concatenate([x_forward_encoded_bf,x_backward_encoded_bf],axis=1)

        # Compute the attention on each word in the context as a dot product of its contextual embedding and the query
        mem_attention_presoft_bt = tt.batched_dot(query_representation_bf, memory_encoded_btf.dimshuffle(0,2,1))

        # TODO is this pre-masking necessary?
        mem_attention_presoft_masked_bt = tt.mul(mem_attention_presoft_bt,context_mask_bt)

        # Normalize the attention using softmax
        mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(mem_attention_presoft_masked_bt,context_mask_bt)

        if self.args.weighted_att:
            # compute weighted attention over original word vectors
            att_weighted_responses_bf = theano.tensor.batched_dot(mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2))


            # compare desired response to all candidate responses
            # select relevant candidate answer words
            candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1)

            # convert it to output symbol probabilities
            y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi)
            y_hat = SoftmaxWithMask(name="output_softmax").apply(y_hat_presoft,candidates_bi_mask)

        else:
            # Sum the attention of each candidate word across the whole context document,
            # this is the key innovation of the model

            # TODO: Get rid of sentence-by-sentence processing?
            # TODO: Rewrite into matrix notation instead of scans?
            def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs):
                word_ixs_in_sentence = tt.eq(sentence_ixs,word_ix).nonzero()[0]
                return sentence_attention_probs[word_ixs_in_sentence].sum()

            def sum_probs_single_sentence(candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t):
                result, updates = theano.scan(
                    fn=sum_prob_of_word,
                    sequences=[candidate_indices_i],
                    non_sequences=[sentence_ixs_t, sentence_attention_probs_t])
                return result

            def sum_probs_batch(candidate_indices_bt,sentence_ixs_bt, sentence_attention_probs_bt):
                result, updates = theano.scan(
                    fn=sum_probs_single_sentence,
                    sequences=[candidate_indices_bt, sentence_ixs_bt, sentence_attention_probs_bt],
                    non_sequences=None)
                return result

            # Sum the attention of each candidate word across the whole context document
            y_hat = sum_probs_batch(candidates_bi, context_bt, mem_attention_bt)
        y_hat.name = "y_hat"

        # We use the convention that ground truth is always at index 0, so the following are the target answers
        y = y.zeros_like()

        # We use Cross Entropy as the training objective
        cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
        cost.name = "cost"


        predicted_response_index = tt.argmax(y_hat,axis=1)
        accuracy = tt.eq(y,predicted_response_index).mean()
        accuracy.name = "accuracy"

        return cost, accuracy, mem_attention_bt, y_hat, context_bt, candidates_bi, candidates_bi_mask, y, context_mask_bt, question_bt, question_mask_bt
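
The scan-based branch above implements the attention-sum step: the score of each candidate answer is the total attention mass over every context position where that word occurs. The same computation for a single example in plain NumPy (toy token ids and weights):

import numpy as np

context = np.array([7, 3, 7, 9, 3])              # token ids of one document
attention = np.array([0.1, 0.2, 0.4, 0.1, 0.2])  # softmax over positions
candidates = np.array([7, 3, 9])                 # candidate answer ids

# P(candidate) = sum of attention over all positions holding that word
y_hat = np.array([attention[context == c].sum() for c in candidates])
print(y_hat)  # [0.5 0.4 0.1]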
Example #7
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        cqembed = tensor.concatenate([
            cembed,
            tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)
        ],
                                     axis=2)
        clstms, chidden_list = make_bidir_lstm_stack(
            cqembed, config.embed_size + qenc_dim,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism Bilinear
        attention_clinear_1 = Linear(input_dim=cenc_dim,
                                     output_dim=qenc_dim,
                                     name='attc_1')
        bricks += [attention_clinear_1]
        att_start = qenc[None, :, :] * attention_clinear_1.apply(
            cenc.reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape(
                    (cenc.shape[0], cenc.shape[1], cenc.shape[2]))
        att_start = att_start.sum(axis=2)
        att_start = tensor.nnet.softmax(att_start.T).T

        attention_clinear_2 = Linear(input_dim=cenc_dim,
                                     output_dim=qenc_dim,
                                     name='attc_2')
        bricks += [attention_clinear_2]
        att_end = qenc[None, :, :] * attention_clinear_2.apply(
            cenc.reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape(
                    (cenc.shape[0], cenc.shape[1], cenc.shape[2]))
        att_end = att_end.sum(axis=2)
        att_end = tensor.nnet.softmax(att_end.T).T

        att_start = tensor.dot(
            tensor.le(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_start)
        att_end = tensor.dot(
            tensor.ge(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_end)

        # add attention from left and right
        att_weights = att_start * att_end

        att_target = tensor.eq(
            tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
            tensor.tile(context[:, None, :],
                        (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)

        self.predictions = tensor.gt(att_weights, 0.25) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        #cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum()
        cost = (((att_weights - att_target)**2) *
                context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_start.name = 'att_start'
        att_end.name = 'att_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_start, att_end, att_weights, att_target
        ]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
class NeuralLM:

    def __init__(self, x, y, vocab_size, hidden_size, num_layers, pretrained_embeds=None):
        """
        Implements a neural language model using an LSTM.
        Word y_n+1 ~ Softmax(U * h_n)
        :param x A minibatch: each row is an instance (a sequence),
            with batch_size rows
        :param y x shifted by 1, which are the target words to predict
            for the language modeling objective based on the hidden LSTM
            state
        :param vocab_size The number of types in the training data
        :param hidden_size The dimensionality of the word embeddings
        :param pretrained_embeds Pretrained embeddings for initailization as an ND array
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Initialize the word embedding table.  If we have pretrained embeddings, we use those
        self.word_embedding_lookup = LookupTable(length=vocab_size, dim=hidden_size, name="word_embeddings")
        if pretrained_embeds is None:
            initialize(self.word_embedding_lookup, 0.8)
        else:
            assert pretrained_embeds.shape[0] == vocab_size and pretrained_embeds.shape[1] == hidden_size
            self.word_embedding_lookup.weights_init = Constant(pretrained_embeds)
            self.word_embedding_lookup.biases_init = Constant(0)
            self.word_embedding_lookup.initialize()

        self.word_embeddings = self.word_embedding_lookup.W

        self.y_hat, self._cost, self.cells = self.nn_fprop(x, y, num_layers)

    def lstm_layer(self, h, n):
        """
        Performs the LSTM update for a batch of word sequences
        :param h The word embeddings for this update
        :param n The number of layers of the LSTM
        """
        # Maps the word embedding to a dimensionality to be used in the LSTM
        linear = Linear(input_dim=self.hidden_size, output_dim=self.hidden_size * 4, name='linear_lstm' + str(n))
        initialize(linear, sqrt(6.0 / (5 * self.hidden_size)))
        lstm = LSTM(dim=self.hidden_size, name='lstm' + str(n))
        initialize(lstm, 0.08)
        return lstm.apply(linear.apply(h))

    def softmax_layer(self, h, y):
        """
        Perform Softmax over the hidden state in order to
        predict the next word in the sequence and compute
        the loss.
        :param h The hidden state sequence
        :param y The target words
        """
        hidden_to_output = Linear(name='hidden_to_output', input_dim=self.hidden_size,
                                  output_dim=self.vocab_size)
        initialize(hidden_to_output, sqrt(6.0 / (self.hidden_size + self.vocab_size)))

        linear_output = hidden_to_output.apply(h)
        linear_output.name = 'linear_output'
        softmax = NDimensionalSoftmax(name="lm_softmax")
        y_hat = softmax.log_probabilities(linear_output, extra_ndim=1)
        y_hat.name = 'y_hat'

        cost = softmax.categorical_cross_entropy(y, linear_output, extra_ndim=1).mean()

        cost.name = 'cost'
        return y_hat, cost

    def nn_fprop(self, x, y, num_layers):
        h = T.nnet.sigmoid(self.word_embedding_lookup.apply(x)) # constrain the word embeddings
        cells = []
        for i in range(num_layers):
            h, c = self.lstm_layer(h, i)
            cells.append(c)
        return self.softmax_layer(h, y) + (cells, )

    @property
    def cost(self):
        # the value is stored as self._cost so the property does not recurse
        return self._cost

    @property
    def embeddings(self):
        return self.word_embeddings
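
A hedged usage sketch for NeuralLM (tensor names and sizes below are invented; it assumes the same theano/blocks imports as the class itself):

from theano import tensor as T

x = T.lmatrix('features')   # (batch, time) word ids
y = T.lmatrix('targets')    # x shifted by one position

lm = NeuralLM(x, y, vocab_size=10000, hidden_size=256, num_layers=2)
training_cost = lm.cost     # scalar cross-entropy, via the property
embeddings = lm.embeddings  # the shared LookupTable weight matrix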
Example #9
    def _build_lookup(self, name, word_num, dim=1, *args, **kwargs):
        lookup = LookupTable(length=word_num, dim=dim, name=name)
        lookup.weights_init = Constant(1. / word_num**0.25)
        lookup.initialize()
        return lookup
Example #10
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    brown = BrownDataset(corpus)

    INPUT_DIMS = brown.get_vocabulary_size()

    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph
    input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)

    # Look up the embedding of every word in the context and then compute
    # the average.
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # Initialize the weights with random variables and set the bias vectors to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)

    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown, iteration_scheme=mini_batch)

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        # TrainingDataMonitoring(variables=[cost]),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
        # Plot(
        #     'Word Embeddings',
        #     channels=[
        #         [
        #             'cost_with_regularization'
        #         ]
        #     ])
    ]

    logger.info("Starting main loop...")
    main = MainLoop(data_stream=data_stream,
                    algorithm=algorithm,
                    extensions=extensions)

    main.run()

    with open('%scg.pickle' % path, 'wb') as f:
        pickle.dump(cg, f)
Example #11
    def __init__(self, config, vocab_size):
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        context_bag = to_bag(context, vocab_size)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt')
        #embed.weights_init = Constant(embeddings_initial_value)

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Build the encoder bricks
        transition = GatedRecurrent(activation=Tanh(),
                                    dim=config.generator_lstm_size,
                                    name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=cenc_dim,
            match_dim=config.generator_lstm_size,
            name="attention")
        readout = Readout(readout_dim=vocab_size,
                          source_names=[
                              transition.apply.states[0],
                              attention.take_glimpses.outputs[0]
                          ],
                          emitter=MaskedSoftmaxEmitter(context_bag=context_bag,
                                                       name='emitter'),
                          feedback_brick=LookupFeedback(
                              vocab_size, config.feedback_size),
                          name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      name="generator")

        cost = generator.cost(answer,
                              answer_mask.astype(theano.config.floatX),
                              attended=cenc,
                              attended_mask=context_mask.astype(
                                  theano.config.floatX),
                              name="cost")
        self.predictions = generator.generate(
            n_steps=7,
            batch_size=config.batch_size,
            attended=cenc,
            attended_mask=context_mask.astype(theano.config.floatX),
            iterate=True)[1]

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # initialize new stuff manually (change!)
        generator.weights_init = IsotropicGaussian(0.01)
        generator.biases_init = Constant(0)
        generator.push_allocation_config()
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example #12
    def create_model(self):
        input_dim = self.input_dim
        x = self.x
        y = self.y
        p = self.p
        mask = self.mask
        hidden_dim = self.hidden_dim
        embedding_dim = self.embedding_dim
        lookup = LookupTable(self.dict_size,
                             embedding_dim,
                             weights_init=IsotropicGaussian(0.001),
                             name='LookupTable')
        x_to_h = Linear(embedding_dim,
                        hidden_dim * 4,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(0.001),
                        biases_init=Constant(0.0))
        lstm = LSTM(hidden_dim,
                    name='lstm',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
        h_to_o = MLP([Logistic()], [hidden_dim, 1],
                     weights_init=IsotropicGaussian(0.001),
                     biases_init=Constant(0),
                     name='h_to_o')

        lookup.initialize()
        x_to_h.initialize()
        lstm.initialize()
        h_to_o.initialize()

        embed = lookup.apply(x).reshape(
            (x.shape[0], x.shape[1], self.embedding_dim))
        embed.name = "embed_vec"
        x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
        x_transform.name = "Transformed X"
        self.lookup = lookup
        self.x_to_h = x_to_h
        self.lstm = lstm
        self.h_to_o = h_to_o

        #if mask is None:
        h, c = lstm.apply(x_transform)
        #else:
        #h, c = lstm.apply(x_transform, mask=mask)
        h.name = "hidden_state"
        c.name = "cell_state"
        # only values of hidden units of the last timeframe are used for
        # the classification
        indices = T.sum(mask, axis=0) - 1
        rel_hid = h[indices, T.arange(h.shape[1])]
        out = self.h_to_o.apply(rel_hid)

        probs = 1 - out
        probs.name = "probability"
        y = y.dimshuffle(0, 'x')
        # Class-prior-weighted reward: positives scaled by 1/p, negatives by 1/(1-p)
        pos_ex = (y * probs) / p
        neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
        reward = pos_ex + neg_ex
        cost = -reward  # Negative of reward, so minimizing cost maximizes reward
        cost.name = "cost"
        return cost
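
The returned cost is an importance-weighted reward: positive examples are up-weighted by 1/p and negatives by 1/(1-p), so both classes contribute equally in expectation whatever the class prior p is. A toy NumPy check (numbers made up):

import numpy as np

p = 0.1                                  # prior probability of the positive class
y = np.array([[1.], [0.], [0.]])         # labels, shape (batch, 1)
probs = np.array([[0.9], [0.2], [0.3]])  # predicted P(y = 1)

pos_ex = (y * probs) / p                 # up-weight the rare positives
neg_ex = (1 - y) * (1 - probs) / (1 - p)
reward = pos_ex + neg_ex
cost = -reward                           # negated: minimizing cost maximizes reward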
Example #13
x = tensor.imatrix('features')
y = tensor.ivector('targets')

v = dataset.get_vocab_size()

input_to_hidden = LookupTable(name='input_to_hidden', length=v, dim=hidden_size)
h = tensor.mean(input_to_hidden.apply(x), axis=1)


hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size, output_dim=v)
y_hat = Softmax().apply(hidden_to_output.apply(h))

input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)

input_to_hidden.initialize()
hidden_to_output.initialize()

cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)

W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.005 * (abs(W1)).sum() + 0.005 * (abs(W2)).sum()
cost.name = 'cost'


algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))
Example #14
def main(model_path, recurrent_type):
    dataset_options = dict(dictionary=char2code, level="character",
                           preprocess=_lower)
    dataset = OneBillionWord("training", [99], **dataset_options)
    data_stream = dataset.get_example_stream()
    data_stream = Filter(data_stream, _filter_long)
    data_stream = Mapping(data_stream, _make_target,
                          add_sources=('target',))
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    target = tensor.lmatrix('target')
    target_mask = tensor.matrix('target_mask')

    dim = 100
    lookup = LookupTable(len(all_chars), dim,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0.))

    if recurrent_type == 'lstm':
        rnn = LSTM(dim // 4, Tanh(),
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0.))
    elif recurrent_type == 'simple':
        rnn = SimpleRecurrent(dim, Tanh())
        rnn = Bidirectional(rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0.))
    else:
        raise ValueError('Unknown RNN type')
    rnn.initialize()
    lookup.initialize()
    y_hat = rnn.apply(lookup.apply(features), mask=features_mask)

    print(len(all_chars))
    linear = Linear(2 * dim, len(all_chars),
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0.))
    linear.initialize()
    y_hat = linear.apply(y_hat)
    seq_length = y_hat.shape[0]
    batch_size = y_hat.shape[1]
    y_hat = Softmax().apply(y_hat.reshape((seq_length * batch_size, -1))).reshape(y_hat.shape)
    cost = CategoricalCrossEntropy().apply(
        target.flatten(),
        y_hat.reshape((-1, len(all_chars)))) * seq_length * batch_size
    cost.name = 'cost'
    cost_per_character = cost / features_mask.sum()
    cost_per_character.name = 'cost_per_character'

    cg = ComputationGraph([cost, cost_per_character])
    model = Model(cost)
    algorithm = GradientDescent(step_rule=Adam(), cost=cost,
                                params=cg.parameters)

    train_monitor = TrainingDataMonitoring(
        [cost, cost_per_character], prefix='train',
        after_batch=True)
    extensions = [train_monitor, Printing(every_n_batches=40),
                  Dump(model_path, every_n_batches=200),
                  #Checkpoint('rnn.pkl', every_n_batches=200)
                  ]
    main_loop = MainLoop(model=model, algorithm=algorithm,
                         data_stream=data_stream, extensions=extensions)
    main_loop.run()
Example #15
def main(config): 
	vocab_src, _ = text_to_dict([config['train_src'],
		config['dev_src'], config['test_src']])
	vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
		config['dev_tgt']])

	# Create Theano variables
	logger.info('Creating theano variables')
	source_sentence = tensor.lmatrix('source')
	source_sentence_mask = tensor.matrix('source_mask')
	target_sentence = tensor.lmatrix('target')
	target_sentence_mask = tensor.matrix('target_mask')
	source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
										[1, 4, 8, 4, 8, 4, 8],]
	source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
											[1, 0, 1, 0, 1, 0, 1],]
	target_sentence.tag.test_value = [[0,1,1,5],
										[2,0,1,0],]
	target_sentence_mask.tag.test_value = [[0,1,1,0],
											[1,1,1,0],]


	logger.info('Building RNN encoder-decoder')
	### Building Encoder 
	embedder = LookupTable(
		length=len(vocab_src), 
		dim=config['embed_src'], 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='embedder')
	transformer = Linear(
		config['embed_src'], 
		config['hidden_src']*4, 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='transformer')

	lstminit = np.asarray([0.0,]*config['hidden_src']+[0.0,]*config['hidden_src']+[1.0,]*config['hidden_src']+[0.0,]*config['hidden_src'])
	encoder = Bidirectional(
		LSTM(
			dim=config['hidden_src'], 
			weights_init=IsotropicGaussian(0.01),
			biases_init=Constant(lstminit)),
		name='encoderBiLSTM'
		)
	encoder.prototype.weights_init = Orthogonal()
	
	### Building Decoder 
	lstminit = np.asarray([0.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']+[1.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt'])
	transition = LSTM2GO(
		attended_dim=config['hidden_tgt'], 
		dim=config['hidden_tgt'], 
		weights_init=IsotropicGaussian(0.01),
		biases_init=Constant(lstminit), 
		name='decoderLSTM')

	attention = SequenceContentAttention( 
		state_names=transition.apply.states, # default activation is Tanh
		state_dims=[config['hidden_tgt']],
		attended_dim=config['hidden_src']*2,
		match_dim=config['hidden_tgt'], 
		name="attention")

	readout = Readout(
		source_names=['states', 
			'feedback', 
			attention.take_glimpses.outputs[0]],
		readout_dim=len(vocab_tgt),
		emitter = SoftmaxEmitter(
			name='emitter'), 
		feedback_brick = LookupFeedback(
			num_outputs=len(vocab_tgt), 
			feedback_dim=config['embed_tgt'], 
			name='feedback'), 
		post_merge=InitializableFeedforwardSequence([
			Bias(dim=config['hidden_tgt'], 
				name='softmax_bias').apply,
			Linear(input_dim=config['hidden_tgt'], 
				output_dim=config['embed_tgt'],
				use_bias=False, 
				name='softmax0').apply,
			Linear(input_dim=config['embed_tgt'], 
				name='softmax1').apply]),
		merged_dim=config['hidden_tgt'])

	decoder = SequenceGenerator(
		readout=readout, 
		transition=transition, 
		attention=attention, 
		weights_init=IsotropicGaussian(0.01), 
		biases_init=Constant(0),
		name="generator",
		fork=Fork(
			[name for name in transition.apply.sequences if name != 'mask'], 
			prototype=Linear()),
		add_contexts=True)
	decoder.transition.weights_init = Orthogonal()

	#printchildren(encoder, 1)
	# Initialize model
	logger.info('Initializing model')
	embedder.initialize()
	transformer.initialize()
	encoder.initialize()
	decoder.initialize()
	
	# Apply model 
	embedded = embedder.apply(source_sentence)
	transformed = transformer.apply(embedded)
	encoded = encoder.apply(transformed)[0]
	generated = decoder.generate(
		n_steps=2*source_sentence.shape[1], 
		batch_size=source_sentence.shape[0], 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask=tensor.ones(source_sentence.shape).T
		)
	print('Generated: ', generated)
	# generator_generate_outputs
	#samples = generated[1] # For GRU 
	samples = generated[2] # For LSTM
	samples.name = 'samples'
	#samples_cost = generated[4] # For GRU 
	samples_cost = generated[5] # For LSTM
	samples_cost.name = 'sampling_cost'
	cost = decoder.cost(
		mask = target_sentence_mask.T, 
		outputs = target_sentence.T, 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask = source_sentence_mask.T)
	cost.name = 'target_cost'
	cost.tag.aggregation_scheme = TakeLast(cost)
	model = Model(cost)
	
	logger.info('Creating computational graph')
	cg = ComputationGraph(cost)
	
	# apply dropout for regularization
	if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog
		logger.info('Applying dropout')
		dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output']
		cg = apply_dropout(cg, dropout_inputs, config['dropout'])

	######## 
	# Print shapes
	shapes = [param.get_value().shape for param in cg.parameters]
	logger.info("Parameter shapes: ")
	for shape, count in Counter(shapes).most_common():
		logger.info('	{:15}: {}'.format(shape, count))
	logger.info("Total number of parameters: {}".format(len(shapes)))

	printchildren(embedder, 1)
	printchildren(transformer, 1)
	printchildren(encoder, 1)
	printchildren(decoder, 1)
	# Print parameter names
	# enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters())
	# enc_dec_param_dict = merge(Selector(decoder).get_parameters())
	# logger.info("Parameter names: ")
	# for name, value in enc_dec_param_dict.items():
	# 	logger.info('	{:15}: {}'.format(value.get_value().shape, name))
	# logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))
	##########

	# Training data 
	train_stream = get_train_stream(config, 
		[config['train_src'],], [config['train_tgt'],], 
		vocab_src, vocab_tgt)
	dev_stream = get_dev_stream(
		[config['dev_src'],], [config['dev_tgt'],], 
		vocab_src, vocab_tgt)
	test_stream = get_test_stream([config['test_src'],], vocab_src)

	# Set extensions
	logger.info("Initializing extensions")
	extensions = [
		FinishAfter(after_n_batches=config['finish_after']),
		ProgressBar(),
		TrainingDataMonitoring([cost], 
			prefix="tra", 
			after_batch=True),
		DataStreamMonitoring(variables=[cost], 
			data_stream=dev_stream, 
			prefix="dev", 
			after_batch=True), 
		Sampler(
			model=Model(samples), 
			data_stream=dev_stream,
			vocab=cabvo,
			saveto=config['saveto']+'dev',
			every_n_batches=config['save_freq']), 
		Sampler(
			model=Model(samples), 
			data_stream=test_stream,
			vocab=cabvo,
			saveto=config['saveto']+'test',
			after_n_batches=1, 
			on_resumption=True,
			before_training=True), 
		Plotter(saveto=config['saveto'], after_batch=True),
		Printing(after_batch=True),
		Checkpoint(
			path=config['saveto'], 
			parameters = cg.parameters,
			save_main_loop=False,
			every_n_batches=config['save_freq'])]
	if BOKEH_AVAILABLE:
		extensions.append(
			Plot('Training cost', channels=[['target_cost']], after_batch=True))
	if config['reload']: 
		extensions.append(Load(path=config['saveto'], 
			load_iteration_state=False, 
			load_log=False))
	else: 
		with open(config['saveto']+'.txt', 'w') as f: 
			pass 

	# Set up training algorithm
	logger.info("Initializing training algorithm")
	algorithm = GradientDescent(cost=cost, 
		parameters=cg.parameters,
		step_rule=CompositeRule([StepClipping(config['step_clipping']), 
			eval(config['step_rule'])()])
    )

	# Initialize main loop
	logger.info("Initializing main loop")
	main_loop = MainLoop(
		model=model,
		algorithm=algorithm,
		data_stream=train_stream,
		extensions=extensions)
	main_loop.run()
Example #16
    def __init__(self,
                 input1_size,
                 input2_size,
                 lookup1_dim=200,
                 lookup2_dim=200,
                 hidden_size=512):
        self.hidden_size = hidden_size
        self.input1_size = input1_size
        self.input2_size = input2_size
        self.lookup1_dim = lookup1_dim
        self.lookup2_dim = lookup2_dim

        x1 = tensor.lmatrix('durations')
        x2 = tensor.lmatrix('syllables')
        y = tensor.lmatrix('pitches')

        lookup1 = LookupTable(dim=self.lookup1_dim,
                              length=self.input1_size,
                              name='lookup1',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup1.initialize()
        lookup2 = LookupTable(dim=self.lookup2_dim,
                              length=self.input2_size,
                              name='lookup2',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup2.initialize()
        merge = Merge(['lookup1', 'lookup2'],
                      [self.lookup1_dim, self.lookup2_dim],
                      self.hidden_size,
                      weights_init=initialization.Uniform(width=0.01),
                      biases_init=Constant(0))
        merge.initialize()
        recurrent_block = LSTM(
            dim=self.hidden_size,
            activation=Tanh(),
            weights_init=initialization.Uniform(width=0.01)
        )  #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
        recurrent_block.initialize()
        linear = Linear(input_dim=self.hidden_size,
                        output_dim=self.input1_size,
                        weights_init=initialization.Uniform(width=0.01),
                        biases_init=Constant(0))
        linear.initialize()
        softmax = NDimensionalSoftmax()

        l1 = lookup1.apply(x1)
        l2 = lookup2.apply(x2)
        m = merge.apply(l1, l2)
        h = recurrent_block.apply(m)
        a = linear.apply(h)

        y_hat = softmax.apply(a, extra_ndim=1)
        # a plain 2-d softmax here would raise: ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

        self.Cost = softmax.categorical_cross_entropy(y, a,
                                                      extra_ndim=1).mean()

        self.ComputationGraph = ComputationGraph(self.Cost)

        self.Model = Model(y_hat)
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')

    x_int = x.astype(dtype='int32').T
    train_dataset = IMDB()
    idx_sort = numpy.argsort(
        [len(s) for s in
         train_dataset.indexables[
             train_dataset.sources.index('features')]]
    )
    n_voc = len(train_dataset.dict.keys())
    for idx in range(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(lstm_biases)  # apply the forget-gate bias of 4 computed above
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
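    # despite the name, this takes the hidden state at the final time step
    # (rnn_out[0] is the sequence of hidden states), not a mean over time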
    rnn_out_mean_pooled = rnn_out[0][-1]

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * tensor.log(probs)
              + (1 - y) * tensor.log(1 - probs)
              ).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5)
                         + (1 - y) * (probs > 0.5)
                         ).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.),
                        Adam()
                        ]
        )
    )

    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))
    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train, n_train + n_valid),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train + n_valid,
                               train_dataset.num_examples),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        test_data_stream,
        prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        valid_data_stream,
        prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example #18
    def _embed(self, sample_num, dim, name, *args, **kwargs):
        embed = LookupTable(sample_num, dim, name=name)
        embed.weights_init = IsotropicGaussian(std=1 / numpy.sqrt(dim))
        embed.initialize()
        return embed
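    # --- hypothetical usage sketch (not part of the original snippet) ---
    # assuming _embed is a method on a model class as above, with `vocab_size`
    # and `word_ids` (an int matrix of token indices) defined elsewhere:
    #
    #     word_embed = self._embed(vocab_size, 256, name='word_embed')
    #     embedded = word_embed.apply(word_ids)  # (batch, length, 256)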
Example #19
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
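        # reading innermost to outermost: stream 1B-Word characters, keep
        # lines of at most 100 characters, add reversed-word targets, batch
        # by tens, pad, and finally transpose every array to time-major layout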
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99],
                                char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(GatedRecurrent(dim=dimension,
                                               activation=Tanh()),
                                weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([
            name
            for name in encoder.prototype.apply.sequences if name != 'mask'
        ],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension,
                             dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension,
                                attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension,
            name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(
                                    readout_dimension, dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in params.items()],
                                   width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets,
            targets_mask,
            attended=encoder.apply(**dict_union(fork.apply(
                lookup.lookup(chars), return_dict=True),
                                                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(VariableFilter(application=readout.readout,
                                         name="output")(cg.variables),
                          singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule([
                                        GradientClipping(10.0),
                                        SteepestDescent(0.01)
                                    ]))

        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(
                    observables, prefix="average", every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path,
                                  every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
Example #20
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        # embed.weights_init = Constant(init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        question_context_attention = att_weights_question.dimshuffle(2, 1, 0)
        question_context_attention.name = "question_context_attention"

        self.analyse_vars = [question_context_attention]
        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'

        # Match LSTM
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        # pointer-network decoder LSTM and attention parameters
        params = init_params(data_dim=config.decoder_data_dim,
                             lstm_dim=config.decoder_lstm_output_dim)
        tparams = init_tparams(params)

        self.theano_params = []
        add_role(tparams['lstm_de_W'], WEIGHT)
        add_role(tparams['lstm_de_U'], WEIGHT)
        add_role(tparams['lstm_de_b'], BIAS)
        add_role(tparams['ptr_b1'], BIAS)
        add_role(tparams['ptr_b2'], BIAS)
        add_role(tparams['ptr_v'], WEIGHT)
        add_role(tparams['ptr_W1'], WEIGHT)
        add_role(tparams['ptr_W2'], WEIGHT)
        self.theano_params = tparams.values()

        #n_steps = length , n_samples = batch_size
        n_steps = ans_indices.shape[0]
        n_samples = ans_indices.shape[1]
        preds, generations = ptr_network(
            tparams, cqembed, context_mask.astype(theano.config.floatX),
            ans_indices, ans_indices_mask.astype(theano.config.floatX),
            config.decoder_lstm_output_dim, menc)

        self.generations = generations

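        # preds presumably has shape (n_steps, context_length, n_samples);
        # the idx_steps / idx_samples grids turn the fancy indexing into
        # probs[t, b] = probability assigned to the gold answer position at
        # decoding step t for sample b, and `off` below guards against log(0)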
        idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, ans_indices, idx_samples]
        # probs *= y_mask
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        # probs += (1 - y_mask)  # change unmasked position to 1, since log(1) = 0
        probs += off
        # probs_printed = theano.printing.Print('this is probs')(probs)
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        # self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        # self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example #21
source_path = 'dataset/normalized_syllables_rhythm_notes.json-seqlen-30.hdf5'

train_dataset = T_H5PYDataset(source_path, which_sets=('train', ))

hidden_layer_dim = 1000

x = tensor.lmatrix('syllables')
y = tensor.lmatrix('durations')

lookup_input = LookupTable(name='lookup_input',
                           length=train_dataset.syllables_vocab_size() + 1,
                           dim=hidden_layer_dim,
                           weights_init=initialization.Uniform(width=0.01),
                           biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(name='linear_input',
                      input_dim=hidden_layer_dim,
                      output_dim=hidden_layer_dim,
                      weights_init=initialization.Uniform(width=0.01),
                      biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(name='hidden',
                      dim=hidden_layer_dim,
                      activation=Tanh(),
                      weights_init=initialization.Uniform(width=0.01))
rnn.initialize()

linear_output = Linear(name='linear_output',
Example #22
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')

    x_int = x.astype(dtype='int32').T - 2
    train_dataset = IMDB()
    idx_sort = numpy.argsort(
        [len(s) for s in
         train_dataset.indexables[
             train_dataset.sources.index('features')]]
    )
    n_voc = len(train_dataset.dict.keys())
    for idx in xrange(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 10
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = Bidirectional(LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    ))
    rnn.initialize()
    score_layer = Linear(
        input_dim=2*n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
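    # unlike the last-state variant above, this really is mean pooling of the
    # bidirectional hidden states over the time axis (axis 0)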
    rnn_out_mean_pooled = tensor.mean(rnn_out[0], axis=0)

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * tensor.log(probs)
              + (1 - y) * tensor.log(1 - probs)
              ).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5)
                         + (1 - y) * (probs > 0.5)
                         ).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.),
                        Adam()
                        ]
        )
    )

    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))
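    # n_train / n_valid are computed but the streams below slice small
    # hard-coded ranges (100/10/10 examples), presumably for quick debugging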
    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100, 110),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(110, 120),
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        test_data_stream,
        prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        valid_data_stream,
        prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example #23
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')

    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))

    n_voc = len(train_dataset.dict.keys())

    # empirical distribution over sequence-initial words: count how many
    # sequences start with each word w
    init_probs = numpy.array(
        [len(filter(lambda idx: idx == w,
                    [s[0] for s in train_dataset.indexables[
                        train_dataset.sources.index('features')]]
                    )) for w in xrange(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()

    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
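    # leftover from the LSTM variants above: `lstm_biases` is never used in
    # this snippet, since the recurrent layer below is a SimpleRecurrent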
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = SimpleRecurrent(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0]
    )
    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Adam()
    )

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
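    # Gumbel-max trick: argmax(log p + g) with g = -log(-log(U)) draws exact
    # samples from the categorical distribution p, which keeps the whole
    # sampling loop inside the Theano graph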
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)

    def sampling_step(g_noise, states, samples_step):
        embedding_step = linear_embedding.apply(samples_step)
        next_states = rnn.apply(inputs=embedding_step,
                                states=states,
                                iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step)
                        + g_noise).argmax(axis=-1)

        return next_states, next_samples

    [_, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_samples]
    )

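    # `samples` is a sliced view into scan's output buffer;
    # samples.owner.inputs[0] recovers the full buffer, which also contains
    # the initial samples produced before the loop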
    sampling = theano.function([], samples.owner.inputs[0].T)

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example #24
train_dataset = MyDataset(source_path, which_sets=('train',))


hidden_layer_dim = 1000

x = tensor.lmatrix('x')
y = tensor.lmatrix('y')

lookup_input = LookupTable(
    name='lookup_input',
    length=charset_size+1,
    dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
lookup_input.initialize()

linear_input = Linear(
    name='linear_input',
    input_dim=hidden_layer_dim,
    output_dim=hidden_layer_dim,
    weights_init=initialization.Uniform(width=0.01),
    biases_init=Constant(0))
linear_input.initialize()

rnn = SimpleRecurrent(
    name='hidden',
    dim=hidden_layer_dim,
    activation=Tanh(),
    weights_init=initialization.Uniform(width=0.01))
rnn.initialize()
Example #25
    def create_model(self, symbols_num = 500):

        hidden_states = self.args.encoder_hidden_dims
        embedding_dims = self.args.source_embeddings_dim

        # dimensions of the sequence embeddings created by the bidirectional net, so the dimensionality is twice that of a single net
        thought_dim = hidden_states * 2

        #query_dims = self.args.recurrent_stack_depth * self.args.encoder_hidden_dims

        # batch X input symbols
        context = tt.lmatrix('context')
        context_mask = tt.matrix('context_mask')
        context_mask = decorate(context_mask, "context_mask",level=1)
        # batch X output symbols
        x = tt.lmatrix('question')
        x_mask = tt.matrix('question_mask')
        # answer ix for each example in the batch
        y = tt.lmatrix('answer')


        # candidate answer words for each example, batch X candidate words (10 per each example)
        candidates_bi = tt.lmatrix("candidates")
        candidates_bi_mask = tt.matrix("candidates_mask")


        # TODO: y can contain long sequences; here we use just the first symbol of each answer (which may be longer).
        # This has to be adjusted when the response can be a sequence and not only a single symbol.
        y = decorate(y, "output")
        y = y[:,0]


        ###################
        # create model parts
        ###################

        lookup = LookupTable(symbols_num, embedding_dims, weights_init=Uniform(width=0.2))

        context_encoder = self.create_bidi_encoder("context_encoder", embedding_dims, hidden_states)

        question_encoder = self.create_bidi_encoder("question_encoder", embedding_dims, hidden_states)


        # inits
        lookup.initialize()
        #rnn.initialize()


        ###################
        # wire the model together
        ###################

        context = decorate(context, "CONTEXT",1)

        context_embedding_tbf = lookup.apply(context.T)
        #memory_encoded_btf = rnn.apply(context_embedding_tbf[:,0,:])[1]  # use cells
        memory_encoded_btf = context_encoder.apply(context_embedding_tbf.T,context_mask).dimshuffle(1,0,2)

        memory_encoded_btf.name = "memory_encoded_btf"
        memory_encoded_btf = decorate(memory_encoded_btf,"MEM ENC")

        # batch X features
        x = decorate(x,"X")
        x_embedded_btf = lookup.apply(x.T)
        x_embedded_btf = decorate(x_embedded_btf,"QUESTION EMB")
        x_encoded_btf = question_encoder.apply(x_embedded_btf.T, x_mask).dimshuffle(1,0,2)
        x_last = x_encoded_btf[-1]
        # the forward RNN's outputs occupy the first half of the bidirectional encoder's feature axis
        x_encoded_btf = decorate(x_encoded_btf,"QUESTION ENC")

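        # fixed-size question summary: the forward net's last state
        # concatenated with the backward net's first state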
        x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states]
        x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2]

        query_representation_bf = tt.concatenate([x_forward_encoded_bf,x_backward_encoded_bf],axis=1)

        # bidirectional representation of question is used as the search key
        search_key = query_representation_bf
        #search_key = x_last

        #search_key = W_um.apply(x_encoded)
        search_key = decorate(search_key,"SEARCH KEY")

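        # batched dot of the (batch, dim) query against the (batch, dim, time)
        # memory yields (batch, time) attention logits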
        mem_attention_pre = tt.batched_dot(search_key, memory_encoded_btf.dimshuffle(0,2,1))
        mem_attention_pre = decorate(mem_attention_pre,"ATT presoftmax")

        # use masking on attention, this might be unnecessary but we do it just to be sure
        mem_attention_pre_masked_bt = tt.mul(mem_attention_pre,context_mask)
        mem_attention_pre_masked_bt = decorate(mem_attention_pre_masked_bt,"ATT presoftmax masked")

        #mem_attention_bt = Softmax(name="memory_query_softmax").apply(mem_attention_pre_masked_bt)
        mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(mem_attention_pre_masked_bt,context_mask)

        mem_attention_bt = decorate(mem_attention_bt,"ATT",level=2)

        # compute weighted attention over original word vectors
        att_weighted_responses_bf = theano.tensor.batched_dot(mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2))

        # use the mask to remove probability mass from the masked-out candidates
        #word_probs_bi = word_probs_bi * candidates_bi_mask

        # compare desired response to all candidate responses
        # select relevant candidate answer words
        candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1)

        # convert it to output symbol probabilities
        y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi)
        y_hat = SoftmaxWithMask(name="output_softmax").apply(y_hat_presoft,candidates_bi_mask)

        y_hat.name = "y_hat"
        y_hat = decorate(y_hat,"y_hat",level=2)

        # the correct answer is always the first among the candidates, so we can use zeros as the index of the ground truth
        y = y.zeros_like()

        # cost associated with prediction error
        cost_prediction = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
        cost_prediction.name = "cost_prediction"

        cost = cost_prediction

        attention_cost_weight = None

        cost_attention = None

        cost.name = "cost"


        predicted_response_index = tt.argmax(y_hat,axis=1)
        accuracy = tt.eq(y,predicted_response_index).mean()
        accuracy.name = "accuracy"

        return cost, accuracy, mem_attention_bt, y_hat, attention_cost_weight, cost_prediction, cost_attention, context, candidates_bi, candidates_bi_mask, y, context_mask, x, x_mask
Example #26
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        better = tensor.imatrix('better')
        better_mask = tensor.imatrix('better_mask')
        worse = tensor.imatrix('worse')
        worse_mask = tensor.imatrix('worse_mask')
        b_left = tensor.imatrix('b_left')
        b_left_mask = tensor.imatrix('b_left_mask')
        b_right = tensor.imatrix('b_right')
        b_right_mask = tensor.imatrix('b_right_mask')
        w_left = tensor.imatrix('w_left')
        w_left_mask = tensor.imatrix('w_left_mask')
        w_right = tensor.imatrix('w_right')
        w_right_mask = tensor.imatrix('w_right_mask')


        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)

        better = better.dimshuffle(1, 0)
        better_mask = better_mask.dimshuffle(1, 0)

        worse = worse.dimshuffle(1, 0)
        worse_mask = worse_mask.dimshuffle(1, 0)

        b_left = b_left.dimshuffle(1, 0)
        b_left_mask = b_left_mask.dimshuffle(1, 0)

        b_right = b_right.dimshuffle(1, 0)
        b_right_mask = b_right_mask.dimshuffle(1, 0)

        w_left = w_left.dimshuffle(1, 0)
        w_left_mask = w_left_mask.dimshuffle(1, 0)

        w_right = w_right.dimshuffle(1, 0)
        w_right_mask = w_right_mask.dimshuffle(1, 0)

        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')


        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # candidate encoders
        candidates_hidden_list = []

        candidate_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_fwd_lstm_in_0_0')
        candidate_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_fwd_lstm_0')

        candidate_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_bwd_lstm_in_0_0')
        candidate_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_bwd_lstm_0')

        #adding encoding bricks for initialization
        bricks = bricks + [candidate_fwd_lstm, candidate_bwd_lstm, candidate_fwd_lstm_ins, candidate_bwd_lstm_ins]

        #computing better encoding
        better_embed = embed.apply(better)
        better_fwd_tmp = candidate_fwd_lstm_ins.apply(better_embed)
        better_bwd_tmp = candidate_bwd_lstm_ins.apply(better_embed)
        better_fwd_hidden, _ = candidate_fwd_lstm.apply(better_fwd_tmp, mask=better_mask.astype(theano.config.floatX))
        better_bwd_hidden, _ = candidate_bwd_lstm.apply(better_bwd_tmp[::-1], mask=better_mask.astype(theano.config.floatX)[::-1])
        better_hidden_list = [better_fwd_hidden, better_bwd_hidden]
        better_enc_dim = 2*sum(config.ctx_lstm_size)
        better_enc = tensor.concatenate([h[-1,:,:] for h in better_hidden_list], axis=1)  # concatenate the last states of the fwd and bwd LSTMs -> (batch_size, 2*dim)
        better_enc.name = 'better_enc'
        candidates_hidden_list = candidates_hidden_list + [better_fwd_hidden, better_bwd_hidden]

        #computing worse encoding
        worse_embed = embed.apply(worse)
        worse_fwd_tmp = candidate_fwd_lstm_ins.apply(worse_embed)
        worse_bwd_tmp = candidate_bwd_lstm_ins.apply(worse_embed)
        worse_fwd_hidden, _ = candidate_fwd_lstm.apply(worse_fwd_tmp, mask=worse_mask.astype(theano.config.floatX))
        worse_bwd_hidden, _ = candidate_bwd_lstm.apply(worse_bwd_tmp[::-1], mask=worse_mask.astype(theano.config.floatX)[::-1])
        worse_hidden_list = [worse_fwd_hidden, worse_bwd_hidden]
        worse_enc_dim = 2*sum(config.ctx_lstm_size)
        worse_enc = tensor.concatenate([h[-1,:,:] for h in worse_hidden_list], axis=1)
        worse_enc.name = 'worse_enc'
        candidates_hidden_list = candidates_hidden_list + [worse_fwd_hidden, worse_bwd_hidden]


        #left encoders
        left_context_hidden_list = []

        left_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_fwd_lstm_in_0_0')
        left_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_fwd_lstm_0')

        left_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_bwd_lstm_in_0_0')
        left_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_bwd_lstm_0')

        #adding encoding bricks for initialization
        bricks = bricks + [left_context_fwd_lstm, left_context_bwd_lstm, left_context_fwd_lstm_ins, left_context_bwd_lstm_ins]

        #right encoders
        right_context_hidden_list = []

        right_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_fwd_lstm_in_0_0')
        right_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_fwd_lstm_0')

        right_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_bwd_lstm_in_0_0')
        right_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_bwd_lstm_0')

        #adding encoding bricks for initialization
        bricks = bricks + [right_context_fwd_lstm, right_context_bwd_lstm, right_context_fwd_lstm_ins, right_context_bwd_lstm_ins]


        #left half encodings
        better_left_embed = embed.apply(b_left)
        better_left_fwd_tmp = left_context_fwd_lstm_ins.apply(better_left_embed)
        better_left_bwd_tmp = left_context_bwd_lstm_ins.apply(better_left_embed)
        better_left_fwd_hidden, _ = left_context_fwd_lstm.apply(better_left_fwd_tmp, mask=b_left_mask.astype(theano.config.floatX))
        better_left_bwd_hidden, _ = left_context_bwd_lstm.apply(better_left_bwd_tmp[::-1], mask=b_left_mask.astype(theano.config.floatX)[::-1])
        better_left_hidden_list = [better_left_fwd_hidden, better_left_bwd_hidden]
        better_left_enc_dim = 2*sum(config.ctx_lstm_size)
        better_left_enc = tensor.concatenate([h[-1,:,:] for h in better_left_hidden_list], axis=1)  # concatenate the last states of the fwd and bwd LSTMs -> (batch_size, 2*dim)
        better_left_enc.name = 'better_left_enc'
        left_context_hidden_list = left_context_hidden_list + [better_left_fwd_hidden, better_left_bwd_hidden]

        worse_left_embed = embed.apply(w_left)
        worse_left_fwd_tmp = left_context_fwd_lstm_ins.apply(worse_left_embed)
        worse_left_bwd_tmp = left_context_bwd_lstm_ins.apply(worse_left_embed)
        worse_left_fwd_hidden, _ = left_context_fwd_lstm.apply(worse_left_fwd_tmp, mask=w_left_mask.astype(theano.config.floatX))
        worse_left_bwd_hidden, _ = left_context_bwd_lstm.apply(worse_left_bwd_tmp[::-1], mask=w_left_mask.astype(theano.config.floatX)[::-1])
        worse_left_hidden_list = [worse_left_fwd_hidden, worse_left_bwd_hidden]
        worse_left_enc_dim = 2*sum(config.ctx_lstm_size)
        worse_left_enc = tensor.concatenate([h[-1,:,:] for h in worse_left_hidden_list], axis=1)  # concatenate the last states of the fwd and bwd LSTMs -> (batch_size, 2*dim)
        worse_left_enc.name = 'worse_left_enc'
        left_context_hidden_list = left_context_hidden_list + [worse_left_fwd_hidden, worse_left_bwd_hidden]


        #right half encoding
        better_right_embed = embed.apply(b_right)
        better_right_fwd_tmp = right_context_fwd_lstm_ins.apply(better_right_embed)
        better_right_bwd_tmp = right_context_bwd_lstm_ins.apply(better_right_embed)
        better_right_fwd_hidden, _ = right_context_fwd_lstm.apply(better_right_fwd_tmp, mask=b_right_mask.astype(theano.config.floatX))
        better_right_bwd_hidden, _ = right_context_bwd_lstm.apply(better_right_bwd_tmp[::-1], mask=b_right_mask.astype(theano.config.floatX)[::-1])
        better_right_hidden_list = [better_right_fwd_hidden, better_right_bwd_hidden]
        better_right_enc_dim = 2*sum(config.ctx_lstm_size)
        better_right_enc = tensor.concatenate([h[-1,:,:] for h in better_right_hidden_list], axis=1)  # concatenate the last states of the fwd and bwd LSTMs -> (batch_size, 2*dim)
        better_right_enc.name = 'better_right_enc'
        right_context_hidden_list = right_context_hidden_list + [better_right_fwd_hidden, better_right_bwd_hidden]

        worse_right_embed = embed.apply(w_right)
        worse_right_fwd_tmp = right_context_fwd_lstm_ins.apply(worse_right_embed)
        worse_right_bwd_tmp = right_context_bwd_lstm_ins.apply(worse_right_embed)
        worse_right_fwd_hidden, _ = right_context_fwd_lstm.apply(worse_right_fwd_tmp, mask=w_right_mask.astype(theano.config.floatX))
        worse_right_bwd_hidden, _ = right_context_bwd_lstm.apply(worse_right_bwd_tmp[::-1], mask=w_right_mask.astype(theano.config.floatX)[::-1])
        worse_right_hidden_list = [worse_right_fwd_hidden, worse_right_bwd_hidden]
        worse_right_enc_dim = 2*sum(config.ctx_lstm_size)
        worse_right_enc = tensor.concatenate([h[-1,:,:] for h in worse_right_hidden_list], axis=1)  # concatenate the last states of the fwd and bwd LSTMs -> (batch_size, 2*dim)
        worse_right_enc.name = 'worse_right_enc'
        right_context_hidden_list = right_context_hidden_list + [worse_right_fwd_hidden, worse_right_bwd_hidden]


        # F1 prediction MLP
        prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1],
                             activations=config.prediction_mlp_activations[1:] + [Identity()],
                             name='prediction_mlp')

        # each Linear feeds a quarter of the MLP's first hidden layer
        # (integer division: Linear dims must be ints, not floats)
        prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=config.prediction_mlp_hidden[0] // 4, name='preq')
        prediction_cand_linear = Linear(input_dim=worse_enc_dim, output_dim=config.prediction_mlp_hidden[0] // 4, use_bias=False, name='precand')
        prediction_left_half_linear = Linear(input_dim=better_left_enc_dim, output_dim=config.prediction_mlp_hidden[0] // 4, use_bias=False, name='preleft')
        prediction_right_half_linear = Linear(input_dim=better_right_enc_dim, output_dim=config.prediction_mlp_hidden[0] // 4, use_bias=False, name='preright')
        bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear, prediction_left_half_linear, prediction_right_half_linear]
        better_layer1 = Tanh('tan1').apply(tensor.concatenate([prediction_cand_linear.apply(better_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(better_left_enc), prediction_right_half_linear.apply(better_right_enc)],axis=1))
        better_layer1.name = 'better_layer1'

        worse_layer1 = Tanh('tan2').apply(tensor.concatenate([prediction_cand_linear.apply(worse_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(worse_left_enc), prediction_right_half_linear.apply(worse_right_enc)],axis=1))
        worse_layer1.name = 'worse_layer1'



        better_pred_weights = Tanh('rec1').apply(prediction_mlp.apply(better_layer1)) #batch_size
        worse_pred_weights = Tanh('rec2').apply(prediction_mlp.apply(worse_layer1)) #batch_size

        # numpy.set_printoptions(edgeitems=500)
        # better_pred_weights = theano.printing.Print('better')(better_pred_weights)
        # worse_pred_weights = theano.printing.Print('better')(worse_pred_weights)
        # cost: max(0, -score_better + score_worse + margin)
        margin = config.margin
        conditions = tensor.lt(better_pred_weights, worse_pred_weights + margin).astype(theano.config.floatX)
        self.predictions = conditions
        cost = (-better_pred_weights + worse_pred_weights + margin) * conditions
        cost = cost.mean()

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + candidates_hidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example #27
VOCAB_DIM = dataset.vocabulary_size
print "vocab size:", VOCAB_DIM
EMBEDDING_DIM = 100

Xs = tensor.imatrix("context")
y = tensor.ivector('center')

w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM)
w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM)

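# CBOW-style word2vec: average the context-word embeddings, then predict the
# center word with a softmax over the vocabulary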
hidden = tensor.mean(w1.apply(Xs), axis=1)
y_hat = Softmax().apply(w2.apply(hidden))

w1.weights_init = w2.weights_init = IsotropicGaussian(0.01)
w1.biases_init = w2.biases_init = Constant(0)
w1.initialize()
w2.initialize()

cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

cost = cost + 0.005 * (W1**2).sum() + 0.005 * (W2**2).sum()
cost.name = "loss"

#
# the actual training of the model
#
main = MainLoop(
    data_stream=DataStream.default_stream(dataset,
Example #28
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

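        # binary bag-of-words over the context: 1 where a vocabulary item
        # occurs at least once (computed here but not used further in this
        # snippet)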
        context_bag = tensor.eq(context[:, :, None],
                                tensor.arange(vocab_size)).sum(axis=1).clip(
                                    0, 1)

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)
        # embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt')
        # embed.weights_init = Constant(embeddings_initial_value)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)

        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'
        # embed size: 200, lstm_size = 256
        # qenc: batch_size x (2 * lstm_size), the concatenated final fwd/bwd states

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        cqembed = tensor.concatenate(
            [
                cembed,
                tensor.extra_ops.repeat(
                    qenc[None, :, :], cembed.shape[0], axis=0)
            ],
            axis=2
        )  #length * batch_size * (embed+2*lstm_size) this is what goes into encoder
        clstms, chidden_list = make_bidir_lstm_stack(
            cqembed, config.embed_size + qenc_dim,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.question_lstm_size[-1]  # note: uses question_lstm_size; ctx_lstm_size[-1] was probably intended
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'
        #cenc: length * batch_size * (2*lstm_size)

        # pointer-network decoder LSTM and attention parameters
        params = init_params(data_dim=config.decoder_data_dim,
                             lstm_dim=config.decoder_lstm_output_dim)
        tparams = init_tparams(params)

        self.theano_params = []
        add_role(tparams['lstm_de_W'], WEIGHT)
        add_role(tparams['lstm_de_U'], WEIGHT)
        add_role(tparams['lstm_de_b'], BIAS)
        add_role(tparams['ptr_v'], WEIGHT)
        add_role(tparams['ptr_W1'], WEIGHT)
        add_role(tparams['ptr_W2'], WEIGHT)
        self.theano_params = tparams.values()
        # for p in tparams.values():
        #     add_role(p, WEIGHT)
        #     self.theano_params.append(p)

        #n_steps = length , n_samples = batch_size
        n_steps = ans_indices.shape[0]
        n_samples = ans_indices.shape[1]
        preds, generations = ptr_network(
            tparams, cqembed, context_mask.astype(theano.config.floatX),
            ans_indices, ans_indices_mask.astype(theano.config.floatX),
            config.decoder_lstm_output_dim, cenc)

        self.generations = generations

        idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, ans_indices, idx_samples]
        # probs *= y_mask
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        # probs += (1 - y_mask)  # change unmasked position to 1, since log(1) = 0
        probs += off
        # probs_printed = theano.printing.Print('this is probs')(probs)
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()
        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
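
The fancy indexing above is easy to misread: preds has shape (n_steps, context_length, n_samples), and the idx_steps/idx_samples outer products select, for every decoding step and batch item, the probability the pointer assigned to the gold answer position. A minimal NumPy sketch of the same gather and the mask-normalized NLL (all array names here are illustrative, not taken from the model):

import numpy as np

n_steps, ctx_len, n_samples = 4, 7, 3
preds = np.random.dirichlet(np.ones(ctx_len), size=(n_steps, n_samples))
preds = preds.transpose(0, 2, 1)  # (n_steps, ctx_len, n_samples): one pointer distribution per step
ans_indices = np.random.randint(0, ctx_len, size=(n_steps, n_samples))
mask = np.ones((n_steps, n_samples))
mask[3, 1:] = 0.  # pretend the last step is padding for two samples

idx_steps = np.outer(np.arange(n_steps), np.ones(n_samples, dtype=np.int64))
idx_samples = np.outer(np.ones(n_steps, dtype=np.int64), np.arange(n_samples))
probs = preds[idx_steps, ans_indices, idx_samples]  # (n_steps, n_samples) gold-position probs

nll = -np.log(probs + 1e-8) * mask  # zero out padded steps
cost = (nll.sum(axis=0) / mask.sum(axis=0)).mean()  # per-sequence mean, then batch mean
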
readout = Readout(source_names=[transition.apply.states[0]] +  # head reconstructed; the source
                  [attention.take_glimpses.outputs[0]],        # snippet begins mid-call
                  emitter=emitter,
                  name="readout")

generator = SequenceGenerator(readout=readout,
                              attention=attention,
                              transition=transition,
                              name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.001)
generator.push_initialization_config()

lookup.weights_init = IsotropicGaussian(0.01)
lookup.biases_init = Constant(0.001)
lookup.initialize()

#generator.transition.weights_init = initialization.Identity(0.98)
#generator.transition.biases_init = IsotropicGaussian(0.01,0.9)
generator.transition.push_initialization_config()
generator.initialize()

cost_matrix = generator.cost_matrix(x,
                                    x_mask,
                                    attended=embed,
                                    attended_mask=context_mask)
cost = cost_matrix.sum(axis=0).mean()
cost.name = "nll"

cg = ComputationGraph(cost)
model = Model(cost)
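
From here the training wiring follows the same recipe as the IMDB example further down the page: wrap the cost in a GradientDescent algorithm and drive it with a MainLoop. A minimal sketch, assuming a train_stream built elsewhere and the older Blocks API (params=) that the rest of this page uses:

from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam
from blocks.extensions import Printing, Timing
from blocks.main_loop import MainLoop

algorithm = GradientDescent(cost=cost,
                            params=cg.parameters,
                            step_rule=CompositeRule([StepClipping(threshold=10),
                                                     Adam()]))
main_loop = MainLoop(model=model,
                     data_stream=train_stream,  # assumed to be built elsewhere
                     algorithm=algorithm,
                     extensions=[Timing(), Printing()])
main_loop.run()
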
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.ctx_lstm_size[-1]  # fw & bw of the last context layer
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP fwd
        attention_mlp_fwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_fwd')
        attention_qlinear_fwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_fwd')
        attention_clinear_fwd = Linear(
            input_dim=cenc_dim / 2,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_fwd')
        bricks += [
            attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd
        ]
        layer1_fwd = Tanh(name='tanh_fwd')
        layer1_fwd = layer1_fwd.apply(
            attention_clinear_fwd.apply(cenc[:, :, :cenc_dim / 2].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] /
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_fwd.apply(qenc)[None, :, :])
        att_weights_fwd = attention_mlp_fwd.apply(
            layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1],
                                layer1_fwd.shape[2])))
        att_weights_fwd = att_weights_fwd.reshape(
            (layer1_fwd.shape[0], layer1_fwd.shape[1]))
        att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T)
        att_weights_fwd.name = 'att_weights_fwd'

        attended_fwd = tensor.sum(cenc[:, :, :cenc_dim / 2] *
                                  att_weights_fwd.T[:, :, None],
                                  axis=0)
        attended_fwd.name = 'attended_fwd'

        # Attention mechanism MLP bwd
        attention_mlp_bwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_bwd')
        attention_qlinear_bwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_bwd')
        attention_clinear_bwd = Linear(
            input_dim=cenc_dim / 2,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_bwd')
        bricks += [
            attention_mlp_bwd, attention_qlinear_bwd, attention_clinear_bwd
        ]
        layer1_bwd = Tanh(name='tanh_bwd')
        layer1_bwd = layer1_bwd.apply(
            attention_clinear_bwd.apply(cenc[:, :, cenc_dim / 2:].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] /
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_bwd.apply(qenc)[None, :, :])
        att_weights_bwd = attention_mlp_bwd.apply(
            layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1],
                                layer1_bwd.shape[2])))
        att_weights_bwd = att_weights_bwd.reshape(
            (layer1_bwd.shape[0], layer1_bwd.shape[1]))
        att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T)
        att_weights_bwd.name = 'att_weights_bwd'

        attended_bwd = tensor.sum(cenc[:, :, cenc_dim / 2:] *
                                  att_weights_bwd.T[:, :, None],
                                  axis=0)
        attended_bwd.name = 'attended_bwd'

        ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc],
                                          axis=1)
        ctx_question.name = 'ctx_question'

        answer_bag = to_bag(answer, vocab_size)
        answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
        relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

        def createSequences(j, index, c_enc, c_enc_dim, c_context,
                            c_window_size):
            sequence = tensor.concatenate([
                c_context[j:j + index, :],
                tensor.zeros((c_window_size - index, c_context.shape[1]))
            ],
                                          axis=0)
            enc = tensor.concatenate([
                c_enc[j + index - 1, :, :], c_enc[j, :, :-1],
                tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))
            ],
                                     axis=1)
            return enc, sequence

        def createTargetValues(j, index, c_context, c_vocab_size):
            sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
            sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
            selected_items = sequence_bag.sum(axis=1,
                                              dtype=theano.config.floatX)
            tp = (sequence_bag * answer_bag).sum(axis=1,
                                                 dtype=theano.config.floatX)
            precision = tp / (selected_items + 0.00001)
            recall = tp / (relevant_items + 0.00001)
            #precision = tensor.set_subtensor(precision[tensor.isnan(precision)], 0.0)
            #recall = tensor.set_subtensor(recall[tensor.isnan(recall)], 1.0)
            macroF1 = (2 *
                       (precision * recall)) / (precision + recall + 0.00001)
            #macroF1 = tensor.set_subtensor(macroF1[tensor.isnan(macroF1)], 0.0)
            return macroF1

        window_size = 3
        senc = []
        sequences = []
        pred_targets = []
        for i in range(1, window_size + 1):
            (all_enc, all_sequence), _ = theano.scan(
                fn=createSequences,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, cenc, cenc_dim, context, window_size])
            (all_macroF1), _ = theano.scan(
                fn=createTargetValues,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, context, vocab_size])
            senc.append(all_enc)
            sequences.append(all_sequence)
            pred_targets.append(all_macroF1)

        senc = tensor.concatenate(senc, axis=0)
        sequences = tensor.concatenate(sequences, axis=0)
        pred_targets = tensor.concatenate(pred_targets, axis=0)

        # F1 prediction Bilinear
        prediction_linear = Linear(input_dim=2 * cenc_dim,
                                   output_dim=cenc_dim + qenc_dim,
                                   name='pred_linear')
        bricks += [prediction_linear]
        pred_weights = ctx_question[None, :, :] * prediction_linear.apply(
            senc.reshape(
                (senc.shape[0] * senc.shape[1], senc.shape[2]))).reshape(
                    (senc.shape[0], senc.shape[1], cenc_dim + qenc_dim))
        pred_weights = pred_weights.sum(axis=2)
        pred_weights = tensor.nnet.sigmoid(pred_weights.T).T
        pred_weights.name = 'pred_weights'

        pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
        pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

        #numpy.set_printoptions(edgeitems=500)
        #pred_targets = theano.printing.Print('pred_targets')(pred_targets)
        #pred_weights = theano.printing.Print('pred_weights')(pred_weights)

        cost = tensor.nnet.binary_crossentropy(pred_weights,
                                               pred_targets).mean()
        self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                     tensor.arange(sequences.shape[2])].T

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
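
The createTargetValues helper above scores each candidate window by its bag-of-words F1 against the answer bag. The same computation in plain NumPy, on toy bags (the 1e-5 terms guard against division by zero exactly as in the Theano version):

import numpy as np

eps = 1e-5
answer_bag = np.array([[0., 1., 1., 0., 1.]])  # gold answer tokens as a bag over the vocab
window_bag = np.array([[0., 1., 0., 0., 1.]])  # tokens inside one candidate window

tp = (window_bag * answer_bag).sum(axis=1)       # true positives
precision = tp / (window_bag.sum(axis=1) + eps)  # selected_items in the model
recall = tp / (answer_bag.sum(axis=1) + eps)     # relevant_items in the model
f1 = 2 * precision * recall / (precision + recall + eps)  # the regression target
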
Exemple #31
0
def main():
    x = T.imatrix('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x_int = x.astype(dtype='int32').T
    x_int = x.T

    train_dataset = IMDB('train')
    n_voc = len(train_dataset.dict.keys())
    n_h = 2
    lookup = LookupTable(
            length=n_voc+2,
            dim = n_h*4,
            weights_init = Uniform(std=0.01),
            biases_init = Constant(0.)
        )
    lookup.initialize()

    #rnn = SimpleRecurrent(
            #dim = n_h,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)
    rnn = LSTM(
            dim = n_h,
            activation=Tanh(),
            weights_init = Uniform(std=0.01),
            biases_init = Constant(0.)
        )

    rnn.initialize()
    score_layer = Linear(
            input_dim = n_h,
            output_dim = 1,
            weights_init = Uniform(std=0.01),
            biases_init = Constant(0.))
    score_layer.initialize()

    embedding = lookup.apply(x_int) * T.shape_padright(m.T)
    #embedding = lookup.apply(x_int) + m.T.mean()*0
    #embedding = lookup.apply(x_int) + m.T.mean()*0

    rnn_states, rnn_cells = rnn.apply(embedding, mask=m.T)  # LSTM.apply returns (states, cells)
    rnn_out_mean_pooled = rnn_states[-1]  # last hidden state; the mask freezes state past padding
    #rnn_out_mean_pooled = rnn_states.mean()

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'


    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
            cost = cost,
            params=params,
            step_rule = CompositeRule([
                StepClipping(threshold=10),
                Adam(),
                #AdaDelta(),
                ])

            )


    # ========

    test_dataset = IMDB('test')
    batch_size = 64
    n_train = train_dataset.num_examples
    train_stream = DataStream(
            dataset=train_dataset,
            iteration_scheme=ShuffledScheme(
                examples=n_train,
                batch_size=batch_size)
            )
    train_padded = Padding(
            data_stream=train_stream,
            mask_sources=('features',)
            #mask_sources=[]
            )


    test_stream = DataStream(
            dataset=test_dataset,
            iteration_scheme=ShuffledScheme(
                examples=test_dataset.num_examples,
                batch_size=batch_size)
            )
    test_padded = Padding(
            data_stream=test_stream,
            mask_sources=('features',)
            #mask_sources=[]
            )
    #import ipdb
    #ipdb.set_trace()

    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True
        ))

    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_padded,
        prefix='test',
        after_epoch=True
        ))
    extensions.append(Timing())
    extensions.append(Printing())

    main_loop = MainLoop(
            model=model,
            data_stream=train_padded,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
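
The Padding transformer used above is what creates the features_mask source consumed as m: it pads each batch's variable-length feature sequences to a common length and emits a matching 0/1 mask. A NumPy sketch of the equivalent batch preparation (toy data):

import numpy as np

batch = [np.array([3, 1, 4]), np.array([1, 5])]  # two variable-length examples
maxlen = max(len(ex) for ex in batch)
features = np.zeros((len(batch), maxlen), dtype='int64')
mask = np.zeros((len(batch), maxlen), dtype='float32')
for i, ex in enumerate(batch):
    features[i, :len(ex)] = ex
    mask[i, :len(ex)] = 1.  # 1 where the token is real, 0 where it is padding
# features and mask correspond to the 'features' and 'features_mask' sources
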
Exemple #32
0
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.ctx_lstm_size[-1]  # fw & bw of the last context layer
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] +
                            [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   name='attq')
        attention_clinear = Linear(input_dim=cenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   use_bias=False,
                                   name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]
        layer1 = Tanh().apply(
            attention_clinear.apply(
                cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]
                              ))).reshape((cenc.shape[0], cenc.shape[1],
                                           config.attention_mlp_hidden[0])) +
            attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(
            layer1.reshape(
                (layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights = tensor.nnet.sigmoid(att_weights.T).T
        att_weights.name = 'att_weights'

        att_target = tensor.eq(
            tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
            tensor.tile(context[:, None, :],
                        (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)
        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()
        self.predictions = tensor.gt(att_weights, 0.1) * context

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
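
The att_target construction in this example is worth unpacking: tiling answer against context and summing over the answer axis yields, for each context position, a 0/1 flag saying whether that token occurs anywhere in the answer. The same idea in NumPy, using broadcasting instead of tile (toy arrays, shaped length x batch as in the model):

import numpy as np

context = np.array([[5, 9], [7, 9], [5, 2]])  # (ctx_len=3, batch=2)
answer = np.array([[5, 2], [8, 9]])           # (ans_len=2, batch=2)

# (ctx_len, ans_len, batch): does context token i equal answer token j?
matches = context[:, None, :] == answer[None, :, :]
att_target = matches.sum(axis=1).clip(0, 1)   # (ctx_len, batch) 0/1 membership flags
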
Exemple #33
0
hidden_layer_size = 200

layer1 = LookupTable(name='layer1',
                     length=train_dataset.words_bag_size,
                     dim=hidden_layer_size,
                     weights_init=Uniform(mean=0, std=0.01),
                     biases_init=Constant(0))
act1_mean = tensor.mean(layer1.apply(x), axis=1)
layer2 = Linear(name='layer2',
                input_dim=layer1.output_dim,
                output_dim=train_dataset.words_bag_size,
                weights_init=Uniform(mean=0, std=0.01),
                biases_init=Constant(0))
act2_softmax = Softmax().apply(layer2.apply(act1_mean))

layer1.initialize()
layer2.initialize()

missclass = MisclassificationRate().apply(y, act2_softmax)

cost = CategoricalCrossEntropy().apply(y, act2_softmax)

cg = ComputationGraph([cost])

W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

cost = cost + 0.00001 * (W1**2).sum() + 0.00005 * (W2**2).sum()

cost.name = 'cost'

from blocks.algorithms import GradientDescent, Scale
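
The snippet ends right at the optimizer import, but the remaining wiring is the same as in the other examples on this page. A minimal sketch of how training would typically continue from here, assuming a data_stream built elsewhere:

from blocks.main_loop import MainLoop
from blocks.model import Model
from blocks.extensions import Printing

algorithm = GradientDescent(cost=cost,
                            params=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))
main_loop = MainLoop(model=Model(cost),
                     data_stream=data_stream,  # assumed to be built elsewhere
                     algorithm=algorithm,
                     extensions=[Printing()])
main_loop.run()
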
Exemple #34
0
VOCAB_DIM = dataset.vocabulary_size
print "vocab size:", VOCAB_DIM
EMBEDDING_DIM = 100

Xs = tensor.imatrix("context")
y = tensor.ivector('center')

w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM)
w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM)

hidden = tensor.mean(w1.apply(Xs), axis=1)
y_hat = Softmax().apply(w2.apply(hidden))

w1.weights_init = w2.weights_init = IsotropicGaussian(0.01)
w1.biases_init = w2.biases_init = Constant(0)
w1.initialize()
w2.initialize()

cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = "loss"


#
# the actual training of the model
#
main = MainLoop(data_stream = DataStream.default_stream(
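
The call above is cut off mid-expression. A plausible completion under the page's usual pattern; the dataset object, batch size, and step rule below are assumptions, not recovered from the source:

from fuel.schemes import SequentialScheme
from blocks.algorithms import GradientDescent, Scale
from blocks.model import Model
from blocks.extensions import Printing

main = MainLoop(
    data_stream=DataStream.default_stream(
        dataset,  # assumed dataset object
        iteration_scheme=SequentialScheme(dataset.num_examples,
                                          batch_size=128)),
    algorithm=GradientDescent(cost=cost,
                              params=cg.parameters,
                              step_rule=Scale(learning_rate=0.01)),
    model=Model(cost),
    extensions=[Printing()])
main.run()
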
Exemple #35
0
    def __init__(self, config, vocab_size):
        unsorted = tensor.imatrix('unsorted')
        unsorted_mask = tensor.imatrix('unsorted_mask')

        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        unsorted = unsorted.dimshuffle(1, 0)
        unsorted_mask = unsorted_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed unsorted sequence
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        #make_bidir_lstm_stack(seq, seq_dim, mask, sizes, skip=True, name=''):
        unsorted_embed = embed.apply(unsorted)
        unsorted_lstms, unsorted_hidden_list = make_bidir_lstm_stack(
            unsorted_embed, config.embed_size,
            unsorted_mask.astype(theano.config.floatX), config.lstm_size,
            config.match_skip_connections, 'u')  # shape: len_unsorted, batch_size, lstm_dim

        bricks = bricks + unsorted_lstms
        unsorted_enc_dim = 2 * sum(config.lstm_size)
        unsorted_enc = tensor.concatenate(
            unsorted_hidden_list,
            axis=2)  #concatenate fwd & bwd lstm hidden states
        unsorted_enc.name = 'unsorted_enc'

        #pointer networks decoder LSTM and Attention parameters
        params = init_params(data_dim=config.decoder_data_dim,
                             lstm_dim=config.decoder_lstm_output_dim)
        tparams = init_tparams(params)
        add_role(tparams['lstm_de_W'], WEIGHT)
        add_role(tparams['lstm_de_U'], WEIGHT)
        add_role(tparams['lstm_de_b'], BIAS)
        add_role(tparams['ptr_b1'], BIAS)
        add_role(tparams['ptr_b2'], BIAS)
        add_role(tparams['ptr_v'], WEIGHT)
        add_role(tparams['ptr_W1'], WEIGHT)
        add_role(tparams['ptr_W2'], WEIGHT)
        self.theano_params = tparams.values()

        #n_steps = length , n_samples = batch_size
        n_steps = answer.shape[0]
        n_samples = answer.shape[1]
        preds, generations = ptr_network(
            tparams, unsorted_embed,
            unsorted_mask.astype(theano.config.floatX), answer,
            answer_mask.astype(theano.config.floatX),
            config.decoder_lstm_output_dim, unsorted_enc)

        self.generations = generations

        idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, answer, idx_samples]
        # probs *= y_mask
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6

        probs += off
        # probs_printed = theano.printing.Print('probs')(probs)
        cost = -tensor.log(probs)
        cost *= answer_mask
        cost = cost.sum(axis=0) / answer_mask.sum(axis=0)
        cost = cost.mean()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, unsorted_hidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
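
The ptr_W1/ptr_W2/ptr_v parameters registered above suggest the standard pointer-network attention: each encoder state is scored against the current decoder state with v^T tanh(W1 e + W2 d), and a softmax over encoder positions gives the pointer distribution. A NumPy sketch of one decoding step (dimensions and names are illustrative):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

enc_dim, dec_dim, att_dim, src_len = 8, 6, 5, 4
W1 = np.random.randn(enc_dim, att_dim) * 0.1  # ~ ptr_W1
W2 = np.random.randn(dec_dim, att_dim) * 0.1  # ~ ptr_W2
v = np.random.randn(att_dim) * 0.1            # ~ ptr_v

enc = np.random.randn(src_len, enc_dim)  # encoder states, one per input position
dec = np.random.randn(dec_dim)           # current decoder state

scores = np.tanh(enc.dot(W1) + dec.dot(W2)).dot(v)  # (src_len,) alignment scores
pointer = softmax(scores)                           # distribution over input positions
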
Exemple #36
0

generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator")

generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.0)
generator.push_initialization_config()

generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()

generator.initialize()

lookup.weights_init = IsotropicGaussian(0.001)
lookup.biases_init = Constant(0.0)
lookup.initialize()

# states = {}
states = [state for state in generator.transition.apply.outputs if state != "step"]

# ipdb.set_trace()

states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent)) for name in states}

cost_matrix = generator.cost_matrix(x, attended=context, **states)

cost = cost_matrix.mean() + 0.0 * start_flag
cost.name = "nll"

cg = ComputationGraph(cost)
Exemple #37
0
def main():
    x = T.imatrix('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x_int = x.astype(dtype='int32').T
    x_int = x.T

    train_dataset = IMDB('train')
    n_voc = len(train_dataset.dict.keys())
    n_h = 2
    lookup = LookupTable(length=n_voc + 2,
                         dim=n_h * 4,
                         weights_init=Uniform(std=0.01),
                         biases_init=Constant(0.))
    lookup.initialize()

    #rnn = SimpleRecurrent(
    #dim = n_h,
    #activation=Tanh(),
    #weights_init = Uniform(std=0.01),
    #biases_init = Constant(0.)
    #)
    rnn = LSTM(dim=n_h,
               activation=Tanh(),
               weights_init=Uniform(std=0.01),
               biases_init=Constant(0.))

    rnn.initialize()
    score_layer = Linear(input_dim=n_h,
                         output_dim=1,
                         weights_init=Uniform(std=0.01),
                         biases_init=Constant(0.))
    score_layer.initialize()

    embedding = lookup.apply(x_int) * T.shape_padright(m.T)
    #embedding = lookup.apply(x_int) + m.T.mean()*0
    #embedding = lookup.apply(x_int) + m.T.mean()*0

    rnn_states, rnn_cells = rnn.apply(embedding, mask=m.T)  # LSTM.apply returns (states, cells)
    rnn_out_mean_pooled = rnn_states[-1]  # last hidden state; the mask freezes state past padding
    #rnn_out_mean_pooled = rnn_states.mean()

    probs = Sigmoid().apply(score_layer.apply(rnn_out_mean_pooled))

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            Adam(),
            #AdaDelta(),
        ]))

    # ========

    test_dataset = IMDB('test')
    batch_size = 64
    n_train = train_dataset.num_examples
    train_stream = DataStream(dataset=train_dataset,
                              iteration_scheme=ShuffledScheme(
                                  examples=n_train, batch_size=batch_size))
    train_padded = Padding(data_stream=train_stream,
                           mask_sources=('features', )
                           #mask_sources=[]
                           )

    test_stream = DataStream(dataset=test_dataset,
                             iteration_scheme=ShuffledScheme(
                                 examples=test_dataset.num_examples,
                                 batch_size=batch_size))
    test_padded = Padding(data_stream=test_stream,
                          mask_sources=('features', )
                          #mask_sources=[]
                          )
    #import ipdb
    #ipdb.set_trace()

    #======
    model = Model(cost)
    extensions = []
    extensions.append(
        EpochProgress(
            batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(
        TrainingDataMonitoring([cost, misclassification],
                               prefix='train',
                               after_epoch=True))

    extensions.append(
        DataStreamMonitoring([cost, misclassification],
                             data_stream=test_padded,
                             prefix='test',
                             after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_padded,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()