def get_presoft(h, args):
    """Project ``h`` through a freshly initialized linear output layer.

    The layer's input width is ``args.layers * args.state_dim`` when the
    states of all layers are used (``args.skip_connections``,
    ``args.skip_output`` or an ``args.rnn_type`` of "clockwork"/"soft"),
    and ``args.state_dim`` otherwise.  Its output width comes from
    ``get_output_size(args.dataset)``.

    :param h: variable fed to the output layer (presumably the hidden
        states, with a last dimension matching the input width chosen
        here -- TODO confirm against the caller).
    :param args: options object providing ``dataset``, ``skip_connections``,
        ``skip_output``, ``rnn_type``, ``layers`` and ``state_dim``.
    :returns: the output variable, named ``'presoft'``.  A ``Tanh`` is
        applied on top of the linear layer when ``has_indices(args.dataset)``
        is false.
    """
    output_size = get_output_size(args.dataset)

    # Coerce to a real bool: the flags may be any truthy value, and the width
    # must be picked by a branch rather than by arithmetic on the flag.
    use_all_states = bool(
        args.skip_connections or args.skip_output or
        args.rnn_type in ("clockwork", "soft"))
    if use_all_states:
        input_dim = args.layers * args.state_dim
    else:
        input_dim = args.state_dim

    output_layer = Linear(
        input_dim=input_dim, output_dim=output_size, name="output_layer")
    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
def get_presoft(h, args):
    """Build the linear output layer, apply it to ``h`` and return the result.

    The returned variable is named ``'presoft'``.  When
    ``has_indices(args.dataset)`` is false the linear output is additionally
    passed through ``Tanh``.
    """
    n_outputs = get_output_size(args.dataset)

    # Truthy when the states of every layer feed the output layer; the flag
    # is then used arithmetically to select between the two input widths:
    #   all states -> args.layers * args.state_dim
    #   otherwise  -> args.state_dim
    all_states = (args.skip_connections or args.skip_output
                  or args.rnn_type in ["clockwork", "soft"])
    stacked_width = all_states * args.layers * args.state_dim
    single_width = (1 - all_states) * args.state_dim

    projection = Linear(input_dim=stacked_width + single_width,
                        output_dim=n_outputs,
                        name="output_layer")
    projection.weights_init = initialization.IsotropicGaussian(0.1)
    projection.biases_init = initialization.Constant(0)
    projection.initialize()

    result = projection.apply(h)
    if not has_indices(args.dataset):
        result = Tanh().apply(result)
    result.name = 'presoft'
    return result
예제 #3
0
    def __init__(self, config, vocab_size, id_to_vocab, logger):
        """Build the symbolic graph of the attention-based reader.

        Creates the input variables, a shared embedding table, the question
        and context bidirectional LSTM stacks, the attention MLP and the
        output MLP.  The training cost and the monitored variables are stored
        on ``self`` and every brick is initialized with the initializers
        taken from ``config``.

        :param config: object providing the hyper-parameters read below
            (``embed_size``, ``question_lstm_size``, ``ctx_lstm_size``,
            ``attention_mlp_hidden``, ``out_mlp_hidden``, ``n_entities``,
            ``w_noise``, ``dropout``, ``weights_init``, ``biases_init``, ...).
        :param vocab_size: number of entries of the embedding lookup table.
        :param id_to_vocab: not used by this method.
        :param logger: not used by this method.
        """
        float_type = theano.config.floatX

        # Symbolic int32 inputs.  Question/context and their masks get their
        # two axes swapped immediately; the other inputs are used as given.
        question = tensor.imatrix('question').dimshuffle(1, 0)
        question_mask = tensor.imatrix('question_mask').dimshuffle(1, 0)
        context = tensor.imatrix('context').dimshuffle(1, 0)
        context_mask = tensor.imatrix('context_mask').dimshuffle(1, 0)
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        # A single lookup table embeds both the question and the context.
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        q_embedded = embed.apply(question)
        c_embedded = embed.apply(context)

        q_lstms, q_states = make_bidir_lstm_stack(
            q_embedded, config.embed_size, question_mask.astype(float_type),
            config.question_lstm_size, config.question_skip_connections, 'q')
        c_lstms, c_states = make_bidir_lstm_stack(
            c_embedded, config.embed_size, context_mask.astype(float_type),
            config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        all_bricks = [embed] + q_lstms + c_lstms

        # Question encoding (u): last step of every hidden state with skip
        # connections, otherwise of the final two hidden states only.
        if config.question_skip_connections:
            q_selected = q_states
            qenc_dim = 2 * sum(config.question_lstm_size)
        else:
            q_selected = q_states[-2:]
            qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([state[-1, :, :] for state in q_selected],
                                  axis=1)
        qenc.name = 'qenc'

        # Context encoding: the same selection, but every step is kept and
        # the states are joined along the last axis.
        if config.ctx_skip_connections:
            c_selected = c_states
            cenc_dim = 2 * sum(config.ctx_lstm_size)
        else:
            c_selected = c_states[-2:]
            cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(c_selected, axis=2)
        cenc.name = 'cenc'

        # Attention mechanism: the first layer is the sum of a projection of
        # the context and a projection of the question, followed by tanh.
        att_hidden = config.attention_mlp_hidden[0]
        attention_mlp = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=att_hidden,
                                   name='attq')
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=att_hidden,
                                   use_bias=False, name='attc')
        all_bricks.extend([attention_mlp, attention_qlinear, attention_clinear])

        # The linear brick is applied on a 2-D view of the 3-D context
        # encoding, whose first two axes are restored afterwards.
        cenc_flat = cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
        ctx_part = attention_clinear.apply(cenc_flat).reshape(
            (cenc.shape[0], cenc.shape[1], att_hidden))
        q_part = attention_qlinear.apply(qenc)[None, :, :]
        layer1 = Tanh().apply(ctx_part + q_part)
        layer1.name = 'layer1'

        layer1_flat = layer1.reshape(
            (layer1.shape[0] * layer1.shape[1], layer1.shape[2]))
        att_weights = attention_mlp.apply(layer1_flat)
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        # Attended context (r): softmax over the first axis of the weights
        # (hence the double transpose), then a weighted sum over that axis.
        normalized = tensor.nnet.softmax(att_weights.T).T
        attended = tensor.sum(cenc * normalized[:, :, None], axis=0)
        attended.name = 'attended'

        # Output MLP (g^AR) on the attended context joined with the question.
        out_mlp = MLP(
            dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
            activations=config.out_mlp_activations + [Identity()],
            name='out_mlp')
        all_bricks.append(out_mlp)
        probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        probs.name = 'probs'

        # Entities that are not among the (masked) candidates get a score of
        # -1000; masked-out candidate slots are replaced by -1, which matches
        # no entity index.
        entity_ids = tensor.arange(config.n_entities, dtype='int32')
        valid_candidates = tensor.switch(candidates_mask, candidates,
                                         -tensor.ones_like(candidates))
        is_candidate = tensor.eq(entity_ids[None, None, :],
                                 valid_candidates[:, :, None]).sum(axis=1)
        probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

        # Prediction, cost and error rate.
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Regularized graph: optional weight noise, then optional dropout on
        # the LSTM hidden states.
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            cg = apply_noise(cg, VariableFilter(roles=[WEIGHT])(cg),
                             config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, q_states + c_states, config.dropout)
        cost_reg, error_rate_reg = cg.outputs

        # Regularized and plain variants share the same names.
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize every brick with the configured initializers.
        for brick in all_bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
예제 #4
0
    def __init__(self, config, vocab_size):
        """Build the graph of the per-position (sigmoid) attention model.

        The question and the context are embedded with a shared lookup table
        and encoded by bidirectional LSTM stacks.  An attention MLP yields one
        sigmoid score per context position; the cost is the masked binary
        cross-entropy between these scores and a target marking the context
        positions whose token also occurs in the answer.

        Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``
        and ``self.monitor_vars_valid`` and initializes every brick.

        :param config: object providing the hyper-parameters read below
            (``embed_size``, ``question_lstm_size``, ``ctx_lstm_size``,
            ``attention_mlp_hidden``, ``w_noise``, ``dropout``,
            ``weights_init``, ``biases_init``, ...).
        :param vocab_size: number of entries of the embedding lookup table.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        # Swap the two axes of every input.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)  # not used further below

        # Embed questions and context.  The table keeps its own initializer
        # and is therefore deliberately left out of ``bricks``.
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Question encoding: last step of the selected hidden states.
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Context encoding: every step of the selected hidden states.
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The width must follow the *context* LSTM sizes: ``cenc`` is
            # built from the context stack, not from the question stack.
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] +
                            [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   name='attq')
        attention_clinear = Linear(input_dim=cenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   use_bias=False,
                                   name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]

        # The linear brick is applied on a 2-D view of the 3-D context
        # encoding; the question projection is added at every position.
        cenc_flat = cenc.reshape((cenc.shape[0] * cenc.shape[1],
                                  cenc.shape[2]))
        layer1 = Tanh().apply(
            attention_clinear.apply(cenc_flat).reshape(
                (cenc.shape[0], cenc.shape[1],
                 config.attention_mlp_hidden[0])) +
            attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(
            layer1.reshape(
                (layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights = tensor.nnet.sigmoid(att_weights.T).T
        att_weights.name = 'att_weights'

        # Target: 1 at context positions whose token equals at least one
        # answer token (compared within the same column), 0 elsewhere.
        att_target = tensor.eq(
            tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
            tensor.tile(context[:, None, :],
                        (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)
        # Cross-entropy averaged over the unmasked context positions.
        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()
        # Context tokens whose score exceeds 0.1 (zero elsewhere).
        self.predictions = tensor.gt(att_weights, 0.1) * context

        # Regularized graph: optional weight noise, then optional dropout on
        # the LSTM hidden states.
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        # NOTE(review): validation also monitors the regularized cost here;
        # confirm whether the plain ``cost`` was intended.
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
예제 #5
0
    def __init__(self, config, vocab_size):
        """Build the graph of the pairwise ("better" vs "worse") ranking model.

        The question is encoded by a bidirectional LSTM stack.  The two
        candidates and their left/right contexts are encoded by three
        single-layer bidirectional encoders (candidate, left context, right
        context), each shared between the "better" and the "worse" side.  A
        prediction MLP scores both sides and the cost is the margin ranking
        loss ``max(0, -score_better + score_worse + margin)`` averaged over
        the batch.

        Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``
        and ``self.monitor_vars_valid`` and initializes every brick.

        :param config: object providing the hyper-parameters read below
            (``embed_size``, ``question_lstm_size``, ``ctx_lstm_size``,
            ``prediction_mlp_hidden``, ``margin``, ``w_noise``, ``dropout``,
            ``weights_init``, ``biases_init``, ...).
        :param vocab_size: number of entries of the embedding lookup table.
        """
        float_type = theano.config.floatX

        def transposed_input(name):
            # int32 matrix input with its two axes swapped.
            return tensor.imatrix(name).dimshuffle(1, 0)

        question = transposed_input('question')
        question_mask = transposed_input('question_mask')
        # Declared like the other inputs, but not used in the graph below.
        answer = transposed_input('answer')
        answer_mask = transposed_input('answer_mask')
        better = transposed_input('better')
        better_mask = transposed_input('better_mask')
        worse = transposed_input('worse')
        worse_mask = transposed_input('worse_mask')
        b_left = transposed_input('b_left')
        b_left_mask = transposed_input('b_left_mask')
        b_right = transposed_input('b_right')
        b_right_mask = transposed_input('b_right_mask')
        w_left = transposed_input('w_left')
        w_left_mask = transposed_input('w_left_mask')
        w_right = transposed_input('w_right')
        w_right_mask = transposed_input('w_right_mask')

        bricks = []

        # Shared embedding table.  It keeps its own initializer and is
        # therefore deliberately left out of ``bricks``.
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Question encoding: last step of the selected hidden states.
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size, question_mask.astype(float_type),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Every candidate/context encoder is one forward plus one backward
        # LSTM of size ctx_lstm_size[0], so an encoding (both last states
        # concatenated) is exactly twice that wide.
        lstm_size = config.ctx_lstm_size[0]
        enc_dim = 2 * lstm_size

        def make_encoder(prefix):
            # Create the input projections and LSTMs of one bidirectional
            # encoder and register them for initialization.
            fwd_in = Linear(input_dim=config.embed_size, output_dim=4 * lstm_size,
                            name=prefix + '_fwd_lstm_in_0_0')
            fwd_lstm = LSTM(dim=lstm_size, activation=Tanh(),
                            name=prefix + '_fwd_lstm_0')
            bwd_in = Linear(input_dim=config.embed_size, output_dim=4 * lstm_size,
                            name=prefix + '_bwd_lstm_in_0_0')
            bwd_lstm = LSTM(dim=lstm_size, activation=Tanh(),
                            name=prefix + '_bwd_lstm_0')
            bricks.extend([fwd_lstm, bwd_lstm, fwd_in, bwd_in])
            return fwd_in, fwd_lstm, bwd_in, bwd_lstm

        def encode(tokens, mask, encoder, name):
            # Run one encoder over ``tokens``; return the concatenated last
            # forward/backward states and the two full hidden sequences.
            fwd_in, fwd_lstm, bwd_in, bwd_lstm = encoder
            embedded = embed.apply(tokens)
            float_mask = mask.astype(float_type)
            fwd_inputs = fwd_in.apply(embedded)
            bwd_inputs = bwd_in.apply(embedded)
            fwd_hidden, _ = fwd_lstm.apply(fwd_inputs, mask=float_mask)
            # The backward LSTM reads the sequence (and its mask) reversed.
            bwd_hidden, _ = bwd_lstm.apply(bwd_inputs[::-1], mask=float_mask[::-1])
            hidden = [fwd_hidden, bwd_hidden]
            encoding = tensor.concatenate([h[-1, :, :] for h in hidden], axis=1)
            encoding.name = name
            return encoding, hidden

        candidate_encoder = make_encoder('candidate')
        left_encoder = make_encoder('left_context')
        right_encoder = make_encoder('right_context')

        # Candidate encodings.
        better_enc, better_hidden = encode(better, better_mask, candidate_encoder, 'better_enc')
        worse_enc, worse_hidden = encode(worse, worse_mask, candidate_encoder, 'worse_enc')
        candidates_hidden_list = better_hidden + worse_hidden

        # Left / right context encodings.
        better_left_enc, _ = encode(b_left, b_left_mask, left_encoder, 'better_left_enc')
        worse_left_enc, _ = encode(w_left, w_left_mask, left_encoder, 'worse_left_enc')
        better_right_enc, _ = encode(b_right, b_right_mask, right_encoder, 'better_right_enc')
        worse_right_enc, _ = encode(w_right, w_right_mask, right_encoder, 'worse_right_enc')

        # F1 prediction MLP
        prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1],
                             activations=config.prediction_mlp_activations[1:] + [Identity()],
                             name='prediction_mlp')

        # Each of the four projections provides a quarter of the MLP's first
        # layer.  Layer widths must be integers, hence the floor division
        # (prediction_mlp_hidden[0] is expected to be a multiple of 4 so that
        # the four parts add up to the MLP input width).
        quarter_dim = config.prediction_mlp_hidden[0] // 4
        prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=quarter_dim, name='preq')
        prediction_cand_linear = Linear(input_dim=enc_dim, output_dim=quarter_dim,
                                        use_bias=False, name='precand')
        prediction_left_half_linear = Linear(input_dim=enc_dim, output_dim=quarter_dim,
                                             use_bias=False, name='preleft')
        prediction_right_half_linear = Linear(input_dim=enc_dim, output_dim=quarter_dim,
                                              use_bias=False, name='preright')
        bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear,
                   prediction_left_half_linear, prediction_right_half_linear]

        better_layer1 = Tanh('tan1').apply(tensor.concatenate(
            [prediction_cand_linear.apply(better_enc),
             prediction_qlinear.apply(qenc),
             prediction_left_half_linear.apply(better_left_enc),
             prediction_right_half_linear.apply(better_right_enc)], axis=1))
        better_layer1.name = 'better_layer1'

        worse_layer1 = Tanh('tan2').apply(tensor.concatenate(
            [prediction_cand_linear.apply(worse_enc),
             prediction_qlinear.apply(qenc),
             prediction_left_half_linear.apply(worse_left_enc),
             prediction_right_half_linear.apply(worse_right_enc)], axis=1))
        worse_layer1.name = 'worse_layer1'

        # Scores of both sides, squashed by tanh.
        better_pred_weights = Tanh('rec1').apply(prediction_mlp.apply(better_layer1))
        worse_pred_weights = Tanh('rec2').apply(prediction_mlp.apply(worse_layer1))

        # cost: max(0, -score_better + score_worse + margin).  ``conditions``
        # is 1 where the margin is violated and 0 elsewhere.
        margin = config.margin
        conditions = tensor.lt(better_pred_weights, worse_pred_weights + margin).astype(float_type)
        self.predictions = conditions
        cost = (-better_pred_weights + worse_pred_weights + margin) * conditions
        cost = cost.mean()

        # Regularized graph: optional weight noise, then optional dropout.
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            # NOTE(review): only the question and candidate hidden states are
            # dropped; the left/right context states are not -- confirm this
            # is intended.
            cg = apply_dropout(cg, qhidden_list + candidates_hidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
예제 #6
0
    def __init__(self, config, vocab_size):
        # Builds the graph of an attention-based reader extended with four
        # multiple-choice answer inputs (ans1..ans4).  Question and context
        # are embedded with a shared lookup table, encoded by bidirectional
        # LSTM stacks, combined through an attention MLP and fed to an output
        # MLP producing `probs`.  Stores probs, cost, sgd_cost and the
        # monitored variables on self and initializes all bricks.
        #
        # NOTE(review): the answer-scoring section in the middle of this
        # method looks unfinished (undefined names, a function called without
        # inputs); see the notes placed next to the affected lines.
        question = tensor.imatrix('question')

        # set up 32-bit integer matrices
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        # and the multiple choice answers:
        ans1 = tensor.ivector('ans1')
        ans1_mask = tensor.ivector('ans1_mask')
        ans2 = tensor.ivector('ans2')
        ans2_mask = tensor.ivector('ans2_mask')
        ans3 = tensor.ivector('ans3')
        ans3_mask = tensor.ivector('ans3_mask')
        ans4 = tensor.ivector('ans4')
        ans4_mask = tensor.ivector('ans4_mask')

        bricks = []

        # inverts 1st and 2nd dimensions of matrix
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        bricks.append(embed)

        qembed = embed.apply(question)
        cembed = embed.apply(context)
        # NOTE(review): a1embed..a4embed are not referenced again in this
        # method; the scoring code below uses a1enc..a4enc instead.
        a1embed = embed.apply(ans1)
        a2embed = embed.apply(ans2)
        a3embed = embed.apply(ans3)
        a4embed = embed.apply(ans4)

        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + qlstms + clstms

        # Calculate question encoding (concatenate layer1): last step of all
        # hidden states with skip connections, else of the final two only.
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1): every step is
        # kept and the selected states are joined along the last axis.
        if config.ctx_skip_connections:
            cenc_dim = 2*sum(config.ctx_lstm_size)
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq')
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]
        # The context projection is computed on a 2-D view of cenc and
        # reshaped back; the question projection is added at every position.
        layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2])))
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        # Softmax over the first axis of the weights (hence the double
        # transpose), then a weighted sum of cenc over that axis.
        attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
        attended.name = 'attended'

        # Now we can calculate our output
        out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                      activations=config.out_mlp_activations + [Identity()],
                      name='out_mlp')
        bricks += [out_mlp]
        probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        probs.name = 'probs'

        # not needed anymore, since we're not only looking at entities
        # is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :],
        #                          tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
        # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

        # Calculate prediction, cost and error rate

        # vocab = tensor.arange(10)
        # probs = numpy.asarray([0, 0.8, 0, 0.2], dtype=numpy.float32)
        # context = numpy.asarray([3, 2, 8, 1], dtype=numpy.int32)
        # ans3 =  numpy.asarray([2, 8, 1], dtype=numpy.int32)
        # ans1 =  numpy.asarray([1, 3, 4], dtype=numpy.int32)
        # ans2 =  numpy.asarray([1, 1, 4], dtype=numpy.int32)

        # convert probs vector to one that's the same size as vocab, with all zeros except probs:
        # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))
        # NOTE(review): zeros_like is given vocab_size itself, so the result
        # takes the shape of that value rather than a vector of length
        # vocab_size; it is then indexed with cembed, the embedded context,
        # not with token ids -- confirm what was intended.
        probsPadded = tensor.zeros_like(vocab_size, dtype=numpy.float32)
        probsSubset = probsPadded[cembed] #TODO this should be masked
        b = tensor.set_subtensor(probsSubset, probs)

        # get the similarity score of each (masked) answer with the context probs:
        # NOTE(review): a1enc..a4enc are not defined anywhere in this method;
        # unless they exist at module level these lines raise NameError.
        # Only ans1 is masked; ans2_mask..ans4_mask are never applied.
        ans1probs = b[a1enc]
        ans1score = tensor.switch(ans1_mask, ans1probs, tensor.zeros_like(ans1probs)).sum()
        ans2probs = b[a2enc]
        ans2score = ans2probs.sum()
        ans3probs = b[a3enc]
        ans3score = ans3probs.sum()
        ans4probs = b[a4enc]
        ans4score = ans4probs.sum()

        # and pick the best one:
        allans = tensor.stacklists([ans1score, ans2score, ans3score, ans4score])
        pred = tensor.argmax(allans)

        # NOTE(review): debugging leftover -- the compiled function is called
        # without arguments and `out` is never used afterwards.
        cg = ComputationGraph([ans1probs, ans1score, ans2probs, ans2score, ans3probs, ans3score, ans4probs, ans4score, allans, pred])
        f = cg.get_theano_function()
        out = f()

        #pred = probs.argmax(axis=1)
        #print "pred"
        #print pred TODO CHANGE THIS!
        # NOTE(review): the cost is still computed from `probs`, while the
        # error rate compares the vector `answer` with the single index
        # `pred` chosen among the four answer scores.
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Regularized graph: optional weight noise, then optional dropout on
        # the LSTM hidden states.
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg, error_rate_reg] = cg.outputs

        # Other stuff: regularized and plain variants share the same names.
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'


        self.probs = probs
        self.probs.name = "probs"
        self.cost = cost
        self.cost.name = "cost"
        #
        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks with the initializers taken from config
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
예제 #7
0
def get_prediction_function():
    """Compile a Theano function computing ``probs`` for a question/context pair.

    The graph is assembled from already-constructed bricks taken out of the
    module-level ``bricks`` list and from the module-level ``config`` object;
    both must exist before this function is called.

    Side effects:
        Rebinds the module-level names ``_qembed``, ``_cembed``, ``_qhidden``,
        ``_chidden``, ``_qenc``, ``_cenc``, ``_attention_clinear``,
        ``_attention_qlinear`` and ``_attended`` to compiled Theano functions
        that expose the corresponding intermediate values.

    Returns:
        A compiled function taking ``(question, question_mask, context,
        context_mask)`` and returning the output of the ``bricks[-1]`` brick
        applied to the concatenation of the attended context and the question
        encoding.
    """
    global _qembed, _cembed, _qhidden, _chidden, _qenc, _cenc
    global _attention_clinear, _attention_qlinear, _attended

    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')

    # The compiled functions take the untransposed inputs; the graph itself
    # works on the transposed versions.
    question_t = question.dimshuffle(1, 0)
    context_t = context.dimshuffle(1, 0)
    question_mask_t = question_mask.dimshuffle(1, 0).astype(theano.config.floatX)
    context_mask_t = context_mask.dimshuffle(1, 0).astype(theano.config.floatX)

    # NOTE(review): the brick positions used below (-5, -2, 4, 11, -1) assume
    # a specific ordering of the module-level ``bricks`` list -- confirm it
    # against the code that builds that list.
    embed = bricks[-5]

    # Embed questions and context
    qembed = embed.apply(question_t)
    cembed = embed.apply(context_t)
    _qembed = theano.function([question], qembed)
    _cembed = theano.function([context], cembed)

    qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size, question_mask_t,
        config.question_lstm_size, config.question_skip_connections, 'q')
    chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size, context_mask_t,
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    _qhidden = theano.function([question, question_mask], qhidden_list)
    _chidden = theano.function([context, context_mask], chidden_list)

    # Question encoding: the last row (axis 0) of each selected hidden state,
    # concatenated along the feature axis.  Without skip connections only the
    # last two entries of the stack are used.
    if config.question_skip_connections:
        q_states = qhidden_list
    else:
        q_states = qhidden_list[-2:]
    qenc = tensor.concatenate([h[-1, :, :] for h in q_states], axis=1)
    qenc.name = 'qenc'

    # Context encoding: the selected hidden states kept at every position and
    # concatenated along the feature axis.
    if config.ctx_skip_connections:
        c_states = chidden_list
    else:
        c_states = chidden_list[-2:]
    cenc = tensor.concatenate(c_states, axis=2)
    cenc.name = 'cenc'

    _qenc = theano.function([question, question_mask], qenc)
    _cenc = theano.function([context, context_mask], cenc)

    # Attention mechanism MLP
    attention_mlp = bricks[-2]      # attention_mlp
    attention_qlinear = bricks[4]   # attq
    attention_clinear = bricks[11]  # attc

    # The linear bricks take 2-D input, so the two leading axes of ``cenc``
    # are merged for the projection and restored afterwards.
    cenc_proj = attention_clinear.apply(
        cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
    ).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0]))
    qenc_proj = attention_qlinear.apply(qenc)[None, :, :]

    _attention_clinear = theano.function([context, context_mask], cenc_proj)
    _attention_qlinear = theano.function([question, question_mask], qenc_proj)

    layer1 = Tanh().apply(cenc_proj + qenc_proj)
    layer1.name = 'layer1'

    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    # Softmax is taken over axis 0 of ``att_weights`` (hence the transposes),
    # then used to form a weighted sum of ``cenc`` over that axis.
    attended = tensor.sum(
        cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'

    _attended = theano.function(
        [question, question_mask, context, context_mask], attended)

    # Now we can calculate our output
    out_mlp = bricks[-1]  # out_mlp
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    return theano.function(
        [question, question_mask, context, context_mask], probs)
예제 #8
0
    def __init__(self, config, vocab_size):
        """Build the window-scoring graph and initialize all bricks.

        Question and context are embedded with a shared lookup table and
        encoded with ``make_bidir_lstm_stack``.  A bilinear attention over the
        context produces ``attended``.  Every context window of 1 to 3 rows is
        then enumerated, given a target equal to its bag-of-words F1 against
        the answer, and scored by an MLP.  Targets and scores are normalized
        along axis 0 and compared with binary cross-entropy.

        Args:
            config: object providing ``embed_size``, ``question_lstm_size``,
                ``ctx_lstm_size``, ``question_skip_connections``,
                ``ctx_skip_connections``, ``prediction_mlp_hidden``,
                ``prediction_mlp_activations``, ``w_noise``, ``dropout``,
                ``weights_init`` and ``biases_init``.
            vocab_size: number of entries of the embedding table; also passed
                to ``to_bag``.

        Attributes set:
            predictions: for each column, the enumerated window whose
                normalized score is the largest.
            sgd_cost: cost taken from the graph after weight noise / dropout.
            monitor_vars, monitor_vars_valid: variables exposed for monitoring.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')

        bricks = []

        # All inputs are transposed up front; the rest of the graph indexes
        # the transposed layout (presumably (time, batch) -- TODO confirm
        # against the data stream).
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)

        # Embed questions and context.  ``embed`` is deliberately kept out of
        # ``bricks`` so that it keeps its own weight initializer.
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The context width comes from the context stack, not from the
            # question stack.
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism Bilinear
        attention_clinear = Linear(input_dim=cenc_dim,
                                   output_dim=qenc_dim,
                                   name='attc')
        bricks += [attention_clinear]
        # The brick maps cenc_dim -> qenc_dim, so the projected tensor has
        # qenc_dim features once the two leading axes are restored.
        cenc_proj = attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
        ).reshape((cenc.shape[0], cenc.shape[1], qenc_dim))
        att_weights = (qenc[None, :, :] * cenc_proj).sum(axis=2)
        att_weights = tensor.nnet.softmax(att_weights.T).T
        att_weights.name = 'att_weights'

        # NOTE(review): ``att_weights`` has already been through a softmax
        # above, so a second softmax is applied here.  This looks unintended,
        # but it is kept because removing it changes the model's numerics.
        attended = tensor.sum(cenc *
                              tensor.nnet.softmax(att_weights.T).T[:, :, None],
                              axis=0)
        attended.name = 'attended'

        # Bag representation of the answer with columns 0..2 cleared
        # (presumably reserved vocabulary ids -- TODO confirm).
        answer_bag = to_bag(answer, vocab_size)
        answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
        relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

        def createSequences(j, index, c_enc, c_enc_dim, c_context,
                            c_window_size):
            """Scan step: window of ``index`` rows starting at row ``j``.

            Returns ``(enc, sequence)``.  ``sequence`` holds the context rows
            ``j:j + index`` zero-padded to ``c_window_size`` rows.  ``enc``
            concatenates the first half of the features at the window's last
            row, the second half (minus its last feature) at the window's
            first row, and one column filled with ``c_window_size``.
            """
            # Floor division keeps the slice bound an integer; ``/`` on the
            # integer scalar is true division under Python 3.
            half = c_enc_dim // 2
            sequence = tensor.concatenate([
                c_context[j:j + index, :],
                tensor.zeros((c_window_size - index, c_context.shape[1]))
            ], axis=0)
            enc = tensor.concatenate([
                c_enc[j + index - 1, :, :half],
                # Dropping the last feature makes room for the extra column,
                # so ``enc`` ends up with ``c_enc_dim`` features.
                c_enc[j, :, half:-1],
                tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))
            ], axis=1)
            return enc, sequence

        def createTargetValues(j, index, c_context, c_vocab_size):
            """Scan step: bag-of-words F1 of window ``j:j + index`` vs answer."""
            sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
            sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
            selected_items = sequence_bag.sum(axis=1,
                                              dtype=theano.config.floatX)
            tp = (sequence_bag * answer_bag).sum(axis=1,
                                                 dtype=theano.config.floatX)
            # The small constants avoid division by zero for empty bags.
            precision = tp / (selected_items + 0.00001)
            recall = tp / (relevant_items + 0.00001)
            macroF1 = (2 *
                       (precision * recall)) / (precision + recall + 0.00001)
            return macroF1

        # Enumerate every window of 1..window_size rows.
        window_size = 3
        senc = []
        sequences = []
        pred_targets = []
        for i in range(1, window_size + 1):
            (all_enc, all_sequence), _ = theano.scan(
                fn=createSequences,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, cenc, cenc_dim, context, window_size])
            (all_macroF1), _ = theano.scan(
                fn=createTargetValues,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, context, vocab_size])
            senc.append(all_enc)
            sequences.append(all_sequence)
            pred_targets.append(all_macroF1)

        senc = tensor.concatenate(senc, axis=0)
        sequences = tensor.concatenate(sequences, axis=0)
        pred_targets = tensor.concatenate(pred_targets, axis=0)

        # F1 prediction MLP
        prediction_mlp = MLP(
            dims=config.prediction_mlp_hidden + [1],
            activations=config.prediction_mlp_activations[1:] + [Identity()],
            name='prediction_mlp')
        prediction_qlinear = Linear(input_dim=qenc_dim,
                                    output_dim=config.prediction_mlp_hidden[0],
                                    name='preq')
        prediction_clinear = Linear(input_dim=cenc_dim,
                                    output_dim=config.prediction_mlp_hidden[0],
                                    use_bias=False,
                                    name='prec')
        prediction_slinear = Linear(input_dim=cenc_dim,
                                    output_dim=config.prediction_mlp_hidden[0],
                                    use_bias=False,
                                    name='pres')
        bricks += [
            prediction_mlp, prediction_qlinear, prediction_clinear,
            prediction_slinear
        ]
        layer1 = Tanh().apply(
            prediction_clinear.apply(attended)[None, :, :] +
            prediction_qlinear.apply(qenc)[None, :, :] +
            prediction_slinear.apply(
                senc.reshape((senc.shape[0] * senc.shape[1], senc.shape[2]))
            ).reshape((senc.shape[0], senc.shape[1],
                       config.prediction_mlp_hidden[0])))
        layer1.name = 'layer1'
        pred_weights = prediction_mlp.apply(
            layer1.reshape(
                (layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
        pred_weights = pred_weights.reshape((layer1.shape[0], layer1.shape[1]))
        # Sigmoid is elementwise, so no transpose is needed around it.
        pred_weights = tensor.nnet.sigmoid(pred_weights)

        # Normalize targets and scores over the enumerated windows (axis 0).
        pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
        pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

        cost = tensor.nnet.binary_crossentropy(pred_weights,
                                               pred_targets).mean()
        self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                     tensor.arange(sequences.shape[2])].T

        # Apply weight noise and dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        # NOTE(review): validation monitors the noisy/dropout cost rather than
        # the clean ``cost``.  Kept as-is because switching would rename the
        # monitored channel -- confirm whether this is intended.
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
    def __init__(self, config, vocab_size):
        """Build the attention-based reader graph and initialize its bricks.

        Question and context share one lookup table and are encoded by
        ``make_bidir_lstm_stack``.  An MLP scores every context position
        against the question encoding, the softmax-weighted context is
        concatenated with the question encoding, and an output MLP produces
        one score per entity.  Scores of entities that are not among the
        (masked) candidates are replaced by -1000 before the cross-entropy
        cost and the error rate are computed.

        Args:
            config: object providing ``embed_size``, ``question_lstm_size``,
                ``ctx_lstm_size``, ``question_skip_connections``,
                ``ctx_skip_connections``, ``attention_mlp_hidden``,
                ``attention_mlp_activations``, ``out_mlp_hidden``,
                ``out_mlp_activations``, ``n_entities``, ``w_noise``,
                ``dropout``, ``weights_init`` and ``biases_init``.
            vocab_size: number of entries of the embedding table.

        Attributes set:
            sgd_cost: cost taken from the graph after weight noise / dropout.
            monitor_vars: ``[[cost_reg], [error_rate_reg]]``.
            monitor_vars_valid: ``[[cost], [error_rate]]``.
        """

        def merge_leading_axes(x):
            # Collapse the two leading axes of a 3-D tensor into one.
            return x.reshape((x.shape[0] * x.shape[1], x.shape[2]))

        # Symbolic inputs.
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        # The text inputs and their masks are transposed before use;
        # candidates are consumed as given.
        question_t = question.dimshuffle(1, 0)
        question_mask_t = question_mask.dimshuffle(1, 0)
        context_t = context.dimshuffle(1, 0)
        context_mask_t = context_mask.dimshuffle(1, 0)

        # Shared embedding followed by one encoder stack per input.
        embed = LookupTable(vocab_size, config.embed_size,
                            name='question_embed')
        qembed = embed.apply(question_t)
        cembed = embed.apply(context_t)

        q_lstms, q_states = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask_t.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        c_lstms, c_states = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask_t.astype(theano.config.floatX),
            config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')

        # Brick order matters to code that looks bricks up by position.
        bricks = [embed] + q_lstms + c_lstms

        # Question encoding: last row of each selected hidden state.
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            q_selected = q_states
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            q_selected = q_states[-2:]
        qenc = tensor.concatenate([state[-1, :, :] for state in q_selected],
                                  axis=1)
        qenc.name = 'qenc'

        # Context encoding: selected hidden states at every position.
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)
            c_selected = c_states
        else:
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            c_selected = c_states[-2:]
        cenc = tensor.concatenate(c_selected, axis=2)
        cenc.name = 'cenc'

        # Attention: tanh(W_c * cenc + W_q * qenc) fed to an MLP that emits
        # one scalar per context position.
        att_hidden = config.attention_mlp_hidden[0]
        attention_mlp = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=att_hidden,
                                   name='attq')
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=att_hidden,
                                   use_bias=False, name='attc')
        bricks.extend([attention_mlp, attention_qlinear, attention_clinear])

        ctx_proj = attention_clinear.apply(merge_leading_axes(cenc)).reshape(
            (cenc.shape[0], cenc.shape[1], att_hidden))
        q_proj = attention_qlinear.apply(qenc)[None, :, :]
        layer1 = Tanh().apply(ctx_proj + q_proj)
        layer1.name = 'layer1'

        att_weights = attention_mlp.apply(merge_leading_axes(layer1))
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        # Softmax over axis 0 of the weights, then weighted sum of the context.
        att_probs = tensor.nnet.softmax(att_weights.T).T
        attended = tensor.sum(cenc * att_probs[:, :, None], axis=0)
        attended.name = 'attended'

        # Output MLP over [attended, qenc].
        out_mlp = MLP(
            dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden
            + [config.n_entities],
            activations=config.out_mlp_activations + [Identity()],
            name='out_mlp')
        bricks.append(out_mlp)
        raw_probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        raw_probs.name = 'probs'

        # An entity counts as a candidate when its id matches at least one
        # unmasked entry of ``candidates``; masked entries are mapped to -1 so
        # they never match.
        entity_ids = tensor.arange(config.n_entities,
                                   dtype='int32')[None, None, :]
        visible_candidates = tensor.switch(candidates_mask, candidates,
                                           -tensor.ones_like(candidates))
        is_candidate = tensor.eq(entity_ids,
                                 visible_candidates[:, :, None]).sum(axis=1)
        probs = tensor.switch(is_candidate, raw_probs,
                              -1000 * tensor.ones_like(raw_probs))

        # Prediction, cost and error rate.
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Weight noise and dropout are applied to a copy of the graph.
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noisy_weights = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noisy_weights, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, q_states + c_states, config.dropout)
        cost_reg, error_rate_reg = cg.outputs

        # Clean and regularized variables deliberately share their names.
        cost_reg.name = 'cost'
        cost.name = 'cost'
        error_rate_reg.name = 'error_rate'
        error_rate.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Every brick gets the configured initializers before initialization.
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()