Beispiel #1
0
def construct_model(vocab_size, embedding_dim, ngram_order, hidden_dims,
                    activations):
    """Build the training cost of a feed-forward n-gram language model.

    Word indices are embedded, the embeddings of one example are laid side by
    side, and an MLP maps the result to one score per vocabulary entry. The
    returned cost is the categorical cross-entropy of those scores against
    the target indices.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the lookup table and width of the output layer.
    embedding_dim : int
        Width of one word embedding.
    ngram_order : int
        Number of embedded words per example; the MLP input width is
        ``ngram_order * embedding_dim``.
    hidden_dims : list of int
        Widths of the hidden layers.
    activations : list
        One activation per hidden layer; a trailing ``None`` is appended for
        the output layer.

    Returns
    -------
    The symbolic cross-entropy cost.
    """
    # Symbolic inputs: an int64 matrix of features and an int64 vector of
    # targets.
    features = tensor.lmatrix('features')
    targets = tensor.lvector('targets')

    layer_activations = activations + [None]
    layer_dims = [ngram_order * embedding_dim] + hidden_dims + [vocab_size]

    lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup')
    hidden = MLP(activations=layer_activations, dims=layer_dims)

    # Flattening to two dimensions concatenates the embeddings of each row.
    concatenated = lookup.apply(features).flatten(ndim=2)
    scores = hidden.apply(concatenated)
    cost = Softmax().categorical_cross_entropy(targets, scores)

    # Configure the initialisation schemes, then initialise the lookup table
    # first and the MLP second.
    lookup.weights_init = IsotropicGaussian(0.001)
    hidden.weights_init = IsotropicGaussian(0.01)
    hidden.biases_init = Constant(0.001)
    for brick in (lookup, hidden):
        brick.initialize()

    return cost
Beispiel #2
0
def construct_model(vocab_size, embedding_dim, ngram_order, hidden_dims,
                    activations):
    """Return the cross-entropy cost of an n-gram MLP language model.

    The graph embeds the integer ``features`` matrix with a lookup table,
    flattens each row's embeddings into a single vector of width
    ``ngram_order * embedding_dim``, feeds it through an MLP whose last layer
    has ``vocab_size`` units and no activation, and scores the output against
    the integer ``targets`` vector.

    ``hidden_dims`` lists the hidden layer widths and ``activations`` the
    matching activations (one per hidden layer).
    """
    word_ids = tensor.lmatrix('features')
    next_word = tensor.lvector('targets')

    lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup')

    # The output layer is linear, hence the extra ``None`` activation.
    mlp_dims = [ngram_order * embedding_dim] + hidden_dims + [vocab_size]
    hidden = MLP(activations=activations + [None], dims=mlp_dims)

    embedded = lookup.apply(word_ids)
    # Two-dimensional flatten: one concatenated embedding vector per row.
    mlp_input = embedded.flatten(ndim=2)
    output_scores = hidden.apply(mlp_input)
    cost = Softmax().categorical_cross_entropy(next_word, output_scores)

    # Parameter initialisation: small Gaussians for the weights, a small
    # positive constant for the MLP biases.
    lookup.weights_init = IsotropicGaussian(0.001)
    hidden.weights_init = IsotropicGaussian(0.01)
    hidden.biases_init = Constant(0.001)

    lookup.initialize()
    hidden.initialize()

    return cost
Beispiel #3
0
    feedback_brick=feedback,
    name="readout",
)

# Assemble the sequence generator from the readout, transition and attention
# bricks built earlier in this script.
generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator")

# Default initialisation schemes for the generator, pushed down to its
# children before any per-child override is made.
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.0)
generator.push_initialization_config()

# Override the bias scheme of the transition only, and push again so the
# override reaches the transition's own children.
# NOTE(review): the second positional argument is presumably the mean of the
# Gaussian (biases drawn around 1) -- confirm against the Blocks signature.
generator.transition.biases_init = IsotropicGaussian(0.01, 1)
generator.transition.push_initialization_config()

generator.initialize()

# The lookup brick is initialised separately with its own schemes.
lookup.weights_init = IsotropicGaussian(0.001)
lookup.biases_init = Constant(0.0)
lookup.initialize()

# Names of everything the transition's apply method outputs, except "step".
states = [state for state in generator.transition.apply.outputs if state != "step"]

# Map each of those names to a zero-filled shared variable of shape
# (batch_size, hidden_size_recurrent); the dict is passed to the cost below
# as keyword arguments.
states = {name: shared_floatx_zeros((batch_size, hidden_size_recurrent)) for name in states}

cost_matrix = generator.cost_matrix(x, attended=context, **states)

# The `0.0 * start_flag` term adds nothing numerically.
# NOTE(review): presumably it is there so that start_flag stays an input of
# the cost graph -- confirm with the training loop.
cost = cost_matrix.mean() + 0.0 * start_flag
cost.name = "nll"
Beispiel #4
0
    def __init__(self, config, vocab_size):
        """Build the span-attention reader graph and initialise its bricks.

        The question is encoded by a bidirectional LSTM stack. The context is
        encoded by a second stack whose input is each context embedding
        concatenated with the question encoding. Two bilinear attentions
        score every context position as a span start and as a span end; the
        cumulative "start at or before i" and "end at or after i" masses are
        multiplied to give a per-position weight, which is regressed (squared
        error) onto a normalised indicator of the context tokens that also
        occur in the answer.

        Parameters
        ----------
        config : object
            Hyper-parameter container. Attributes read here: ``embed_size``,
            ``question_lstm_size``, ``question_skip_connections``,
            ``ctx_lstm_size``, ``ctx_skip_connections``, ``w_noise``,
            ``dropout``, ``weights_init`` and ``biases_init``.
        vocab_size : int
            Number of rows of the shared embedding table.

        Sets ``predictions``, ``sgd_cost``, ``monitor_vars``,
        ``monitor_vars_valid`` and ``analyse_vars`` on the instance.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        # Declared alongside the other sources; not used in the graph below.
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        # Swap the two axes of every input so that axis 0 is the one the
        # recurrent stacks iterate over.
        # NOTE(review): assumes the data stream is (batch, time) -- confirm.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # One embedding table shared by question and context.
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Question encoding: last time step of the selected hidden sequences,
        # concatenated along the feature axis.
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Context encoding: every context embedding is concatenated with the
        # question encoding (repeated along axis 0) before entering the stack.
        cembed = embed.apply(context)
        cqembed = tensor.concatenate([
            cembed,
            tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)
        ], axis=2)
        clstms, chidden_list = make_bidir_lstm_stack(
            cqembed, config.embed_size + qenc_dim,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: forward & backward
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The context encoding comes from the context stack, so its width
            # must be taken from ctx_lstm_size (the original read
            # question_lstm_size here, which is only right when both match).
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        def bilinear_attention(name):
            """Softmax over axis 0 of <qenc, W * cenc> for a new Linear W."""
            projection = Linear(input_dim=cenc_dim,
                                output_dim=qenc_dim,
                                name=name)
            bricks.append(projection)
            flat_cenc = cenc.reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
            # The projection has width qenc_dim, so that is the last axis to
            # restore (the original reused cenc.shape[2], which fails when
            # the two encodings differ in width).
            projected = projection.apply(flat_cenc).reshape(
                (cenc.shape[0], cenc.shape[1], qenc_dim))
            scores = (qenc[None, :, :] * projected).sum(axis=2)
            # tensor.nnet.softmax normalises rows, hence the transposes.
            return tensor.nnet.softmax(scores.T).T

        # Bilinear attention for the span start and the span end.
        att_start = bilinear_attention('attc_1')
        att_end = bilinear_attention('attc_2')

        # col[i, j] == j and row[i, j] == i, so le(col, row) keeps j <= i and
        # ge(col, row) keeps j >= i. The dot products are therefore the
        # cumulative start mass up to i and the cumulative end mass from i on.
        positions = tensor.arange(context.shape[0])
        col = tensor.tile(positions[None, :], (context.shape[0], 1))
        row = tensor.tile(positions[:, None], (1, context.shape[0]))
        att_start = tensor.dot(tensor.le(col, row), att_start)
        att_end = tensor.dot(tensor.ge(col, row), att_end)

        # Combine the two cumulative masses multiplicatively.
        att_weights = att_start * att_end

        # 1 where the context token at a position equals any answer token.
        att_target = tensor.eq(
            tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
            tensor.tile(context[:, None, :],
                        (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)

        # Predictions use the weights before they are normalised below:
        # context tokens whose weight exceeds 0.25, zero elsewhere.
        self.predictions = tensor.gt(att_weights, 0.25) * context

        # Normalise both along axis 0; the epsilon guards an all-zero column.
        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        # Masked mean squared error between weights and target.
        cost = (((att_weights - att_target)**2) *
                context_mask).sum() / context_mask.sum()

        # Regularised cost: optional weight noise, then optional dropout on
        # the hidden states of both stacks.
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Name the variables that are monitored or analysed.
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_start.name = 'att_start'
        att_end.name = 'att_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_start, att_end, att_weights, att_target
        ]

        # The embedding keeps its own Gaussian scheme; every other brick
        # takes the schemes from the config.
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Beispiel #5
0
    def __init__(self, config, vocab_size):
        """Build the MLP-attention reader graph and initialise its bricks.

        Question and context are embedded with a shared table and encoded by
        two independent bidirectional LSTM stacks. Every context position is
        combined with the question encoding in a tanh layer, scored by an
        MLP, and squashed by a sigmoid. The cost is the masked binary
        cross-entropy between those scores and an indicator of the context
        tokens that also occur in the answer.

        Parameters
        ----------
        config : object
            Hyper-parameter container. Attributes read here: ``embed_size``,
            ``question_lstm_size``, ``question_skip_connections``,
            ``ctx_lstm_size``, ``ctx_skip_connections``,
            ``attention_mlp_hidden``, ``attention_mlp_activations``,
            ``w_noise``, ``dropout``, ``weights_init`` and ``biases_init``.
        vocab_size : int
            Number of rows of the shared embedding table.

        Sets ``predictions``, ``sgd_cost``, ``monitor_vars`` and
        ``monitor_vars_valid`` on the instance.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        # Declared alongside the other sources; not used in the graph below.
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        # Swap the two axes of every input so that axis 0 is the one the
        # recurrent stacks iterate over.
        # NOTE(review): assumes the data stream is (batch, time) -- confirm.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # One embedding table shared by question and context.
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Question encoding: last time step of the selected hidden sequences,
        # concatenated along the feature axis.
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Context encoding: the full hidden sequences, concatenated along the
        # feature axis.
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: forward & backward
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The context encoding comes from the context stack, so its width
            # must be taken from ctx_lstm_size (the original read
            # question_lstm_size here, which is only right when both match).
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention MLP. Its first layer is built by hand below (two Linear
        # bricks summed under a Tanh), which is why the first configured
        # activation is skipped when constructing the MLP brick.
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] +
                            [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   name='attq')
        attention_clinear = Linear(input_dim=cenc_dim,
                                   output_dim=config.attention_mlp_hidden[0],
                                   use_bias=False,
                                   name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]

        # Linear bricks take matrices: flatten the two leading axes of the
        # context encoding, project, and restore them.
        flat_cenc = cenc.reshape(
            (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
        projected_cenc = attention_clinear.apply(flat_cenc).reshape(
            (cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0]))
        # The question projection is broadcast over axis 0.
        layer1 = Tanh().apply(
            projected_cenc + attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'

        att_weights = attention_mlp.apply(
            layer1.reshape(
                (layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        # The sigmoid is element-wise, so no transposition is needed.
        att_weights = tensor.nnet.sigmoid(att_weights)
        att_weights.name = 'att_weights'

        # 1 where the context token at a position equals any answer token.
        att_target = tensor.eq(
            tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)),
            tensor.tile(context[:, None, :],
                        (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1)
        # Masked mean of the per-position binary cross-entropy.
        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()
        # Context tokens whose weight exceeds 0.1, zero elsewhere.
        self.predictions = tensor.gt(att_weights, 0.1) * context

        # Regularised cost: optional weight noise, then optional dropout on
        # the hidden states of both stacks.
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # The embedding keeps its own Gaussian scheme; every other brick
        # takes the schemes from the config.
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
    def __init__(self, config, vocab_size):
        """Build the pointer-network reader graph and initialise its bricks.

        The question is encoded by a bidirectional LSTM stack; the context is
        encoded by a second stack fed each context embedding concatenated
        with the question encoding. ``ptr_network`` then decodes a sequence
        of pointers, and the cost is the masked negative log-probability it
        assigns to the reference indices ``ans_indices``, averaged over the
        steps of each sample and then over samples.

        Parameters
        ----------
        config : object
            Hyper-parameter container. Attributes read here: ``embed_size``,
            ``question_lstm_size``, ``question_skip_connections``,
            ``ctx_lstm_size``, ``ctx_skip_connections``,
            ``decoder_data_dim``, ``decoder_lstm_output_dim``, ``w_noise``,
            ``dropout``, ``weights_init`` and ``biases_init``.
        vocab_size : int
            Number of rows of the shared embedding table.

        Sets ``theano_params`` (list of the decoder's shared variables),
        ``generations``, ``sgd_cost``, ``monitor_vars`` and
        ``monitor_vars_valid`` on the instance.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        # answer / answer_mask are declared alongside the other sources but
        # are not used in the graph below.
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        bricks = []

        # Swap the two axes of every input so that axis 0 is the one the
        # recurrent code iterates over; after this ans_indices is
        # n_steps x n_samples.
        # NOTE(review): assumes the data stream is (batch, time) -- confirm.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # One embedding table shared by question and context.
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Question encoding: last time step of the selected hidden sequences,
        # concatenated along the feature axis.
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Encoder input: every context embedding concatenated with the
        # question encoding, i.e. width embed_size + qenc_dim.
        cembed = embed.apply(context)
        cqembed = tensor.concatenate(
            [
                cembed,
                tensor.extra_ops.repeat(
                    qenc[None, :, :], cembed.shape[0], axis=0)
            ],
            axis=2)
        clstms, chidden_list = make_bidir_lstm_stack(
            cqembed, config.embed_size + qenc_dim,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  # 2: forward & backward
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The context encoding comes from the context stack, so its width
            # must be taken from ctx_lstm_size (the original read
            # question_lstm_size here, which is only right when both match).
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Decoder LSTM and pointer-attention parameters live outside Blocks
        # as plain shared variables; tag them with roles so that role-based
        # filters (e.g. the weight-noise filter below) can find them.
        params = init_params(data_dim=config.decoder_data_dim,
                             lstm_dim=config.decoder_lstm_output_dim)
        tparams = init_tparams(params)
        for key in ('lstm_de_W', 'lstm_de_U', 'ptr_v', 'ptr_W1', 'ptr_W2'):
            add_role(tparams[key], WEIGHT)
        add_role(tparams['lstm_de_b'], BIAS)
        # A real list, so the attribute can be indexed and iterated more
        # than once regardless of the Python version.
        self.theano_params = list(tparams.values())

        n_steps = ans_indices.shape[0]
        n_samples = ans_indices.shape[1]
        preds, generations = ptr_network(
            tparams, cqembed, context_mask.astype(theano.config.floatX),
            ans_indices, ans_indices_mask.astype(theano.config.floatX),
            config.decoder_lstm_output_dim, cenc)

        self.generations = generations

        # Index grids: idx_steps[i, j] == i and idx_samples[i, j] == j, so
        # the advanced indexing below reads, for every (step, sample), the
        # entry of preds selected by the reference index.
        idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, ans_indices, idx_samples]

        # Small offset so that the logarithm never sees an exact zero; a
        # larger one is used for half precision.
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        probs += off

        # Masked negative log-likelihood: mean over the unmasked steps of
        # each sample, then mean over samples.
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()

        # Regularised cost: optional weight noise, then optional dropout on
        # the hidden states of both encoder stacks.
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # The embedding keeps its own Gaussian scheme; every other brick
        # takes the schemes from the config.
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Beispiel #7
0
    def __init__(self, config, vocab_size):
        """Build a pairwise ranking model over candidate answers.

        A "better" and a "worse" candidate are each described by three token
        sequences: the candidate itself and its left and right contexts.
        Every sequence is encoded by a single-layer bidirectional LSTM (one
        encoder shared by both candidates, one by both left contexts, one by
        both right contexts); the question is encoded by its own stack. The
        four encodings of a candidate are projected, concatenated, and scored
        by an MLP followed by a tanh. The cost is the hinge loss
        ``max(0, margin - score_better + score_worse)`` averaged over the
        batch.

        Parameters
        ----------
        config : object
            Hyper-parameter container. Attributes read here: ``embed_size``,
            ``question_lstm_size``, ``question_skip_connections``,
            ``ctx_lstm_size`` (only its first entry is used),
            ``prediction_mlp_hidden``, ``prediction_mlp_activations``,
            ``margin``, ``w_noise``, ``dropout``, ``weights_init`` and
            ``biases_init``.
        vocab_size : int
            Number of rows of the shared embedding table.

        Raises
        ------
        ValueError
            If ``config.prediction_mlp_hidden[0]`` is not a multiple of 4:
            the first hidden layer is the concatenation of four equally wide
            projections.

        Sets ``predictions``, ``sgd_cost``, ``monitor_vars`` and
        ``monitor_vars_valid`` on the instance.
        """
        # Declare every source together with its mask and swap their two
        # axes so that axis 0 is the one the LSTMs iterate over. 'answer' is
        # declared like the others but is not used in the graph below.
        # NOTE(review): assumes the data stream is (batch, time) -- confirm.
        sources = ('question', 'answer', 'better', 'worse',
                   'b_left', 'b_right', 'w_left', 'w_right')
        inputs = {}
        for source in sources:
            tokens = tensor.imatrix(source).dimshuffle(1, 0)
            mask = tensor.imatrix(source + '_mask').dimshuffle(1, 0)
            inputs[source] = (tokens, mask)

        bricks = []

        # One embedding table shared by every sequence.
        embed = LookupTable(vocab_size, config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Question encoding: last time step of the selected hidden sequences,
        # concatenated along the feature axis.
        question, question_mask = inputs['question']
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # The sequence encoders are single-layer, so one encoding is the
        # forward and backward final states of width ctx_lstm_size[0] each.
        # (The original declared 2 * sum(ctx_lstm_size), which is only
        # correct when ctx_lstm_size has a single entry.)
        lstm_dim = config.ctx_lstm_size[0]
        enc_dim = 2 * lstm_dim

        def make_encoder(prefix):
            """Create the input projections and LSTMs of one encoder."""
            fwd_in = Linear(input_dim=config.embed_size,
                            output_dim=4 * lstm_dim,
                            name=prefix + '_fwd_lstm_in_0_0')
            fwd_lstm = LSTM(dim=lstm_dim, activation=Tanh(),
                            name=prefix + '_fwd_lstm_0')
            bwd_in = Linear(input_dim=config.embed_size,
                            output_dim=4 * lstm_dim,
                            name=prefix + '_bwd_lstm_in_0_0')
            bwd_lstm = LSTM(dim=lstm_dim, activation=Tanh(),
                            name=prefix + '_bwd_lstm_0')
            return fwd_in, fwd_lstm, bwd_in, bwd_lstm

        def encode(source, encoder, name):
            """Encode one source; return (encoding, [fwd, bwd] hiddens)."""
            fwd_in, fwd_lstm, bwd_in, bwd_lstm = encoder
            tokens, mask = inputs[source]
            float_mask = mask.astype(theano.config.floatX)
            embedded = embed.apply(tokens)
            fwd_tmp = fwd_in.apply(embedded)
            bwd_tmp = bwd_in.apply(embedded)
            fwd_hidden, _ = fwd_lstm.apply(fwd_tmp, mask=float_mask)
            # The backward LSTM reads the sequence (and its mask) reversed.
            bwd_hidden, _ = bwd_lstm.apply(bwd_tmp[::-1],
                                           mask=float_mask[::-1])
            # Last state of each direction, concatenated on the feature axis.
            encoding = tensor.concatenate(
                [fwd_hidden[-1, :, :], bwd_hidden[-1, :, :]], axis=1)
            encoding.name = name
            return encoding, [fwd_hidden, bwd_hidden]

        def register(encoder):
            """Return the encoder's bricks in initialisation order."""
            fwd_in, fwd_lstm, bwd_in, bwd_lstm = encoder
            return [fwd_lstm, bwd_lstm, fwd_in, bwd_in]

        # Candidate encoder, shared by the better and the worse candidate.
        candidate_encoder = make_encoder('candidate')
        bricks = bricks + register(candidate_encoder)
        better_enc, better_hidden = encode(
            'better', candidate_encoder, 'better_enc')
        worse_enc, worse_hidden = encode(
            'worse', candidate_encoder, 'worse_enc')
        candidates_hidden_list = better_hidden + worse_hidden

        # Left- and right-context encoders, each shared by both candidates.
        left_encoder = make_encoder('left_context')
        bricks = bricks + register(left_encoder)
        right_encoder = make_encoder('right_context')
        bricks = bricks + register(right_encoder)

        better_left_enc, _ = encode('b_left', left_encoder, 'better_left_enc')
        worse_left_enc, _ = encode('w_left', left_encoder, 'worse_left_enc')
        better_right_enc, _ = encode(
            'b_right', right_encoder, 'better_right_enc')
        worse_right_enc, _ = encode(
            'w_right', right_encoder, 'worse_right_enc')

        # Scoring MLP. Its first hidden layer is built by hand as the
        # concatenation of four projections, so each projection gets a
        # quarter of the width; the width must be an integer, hence the
        # integer division and the divisibility check (the original divided
        # by 4.0 and handed a float dimension to Linear).
        first_hidden = config.prediction_mlp_hidden[0]
        if first_hidden % 4 != 0:
            raise ValueError(
                'config.prediction_mlp_hidden[0] must be a multiple of 4, '
                'got %r' % (first_hidden,))
        part_dim = first_hidden // 4

        prediction_mlp = MLP(
            dims=config.prediction_mlp_hidden + [1],
            activations=config.prediction_mlp_activations[1:] + [Identity()],
            name='prediction_mlp')
        prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=part_dim,
                                    name='preq')
        prediction_cand_linear = Linear(input_dim=enc_dim,
                                        output_dim=part_dim,
                                        use_bias=False, name='precand')
        prediction_left_half_linear = Linear(input_dim=enc_dim,
                                             output_dim=part_dim,
                                             use_bias=False, name='preleft')
        prediction_right_half_linear = Linear(input_dim=enc_dim,
                                              output_dim=part_dim,
                                              use_bias=False, name='preright')
        bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear,
                   prediction_left_half_linear, prediction_right_half_linear]

        better_layer1 = Tanh('tan1').apply(tensor.concatenate([
            prediction_cand_linear.apply(better_enc),
            prediction_qlinear.apply(qenc),
            prediction_left_half_linear.apply(better_left_enc),
            prediction_right_half_linear.apply(better_right_enc),
        ], axis=1))
        better_layer1.name = 'better_layer1'

        worse_layer1 = Tanh('tan2').apply(tensor.concatenate([
            prediction_cand_linear.apply(worse_enc),
            prediction_qlinear.apply(qenc),
            prediction_left_half_linear.apply(worse_left_enc),
            prediction_right_half_linear.apply(worse_right_enc),
        ], axis=1))
        worse_layer1.name = 'worse_layer1'

        # One tanh-squashed score per sample and candidate.
        better_pred_weights = Tanh('rec1').apply(
            prediction_mlp.apply(better_layer1))
        worse_pred_weights = Tanh('rec2').apply(
            prediction_mlp.apply(worse_layer1))

        # Hinge loss: max(0, margin - score_better + score_worse).
        # `conditions` is 1 where the margin is violated; it doubles as the
        # exposed prediction.
        margin = config.margin
        conditions = tensor.lt(
            better_pred_weights,
            worse_pred_weights + margin).astype(theano.config.floatX)
        self.predictions = conditions
        cost = (-better_pred_weights + worse_pred_weights + margin) * conditions
        cost = cost.mean()

        # Regularised cost: optional weight noise, then optional dropout on
        # the question and candidate hidden states.
        # NOTE(review): the left/right context hidden states were collected
        # by the original code but never passed to dropout; that behaviour
        # is kept here -- confirm whether they should be included.
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + candidates_hidden_list,
                               config.dropout)
        [cost_reg] = cg.outputs

        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # The embedding keeps its own Gaussian scheme; every other brick
        # takes the schemes from the config.
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Beispiel #8
0
    def __init__(self, config, vocab_size):
        """Assemble the attention-based answer generator and its cost.

        The embedded context is encoded by a bidirectional LSTM stack.  A
        ``SequenceGenerator`` (gated recurrent transition, content attention
        over the context encoding, readout with a ``MaskedSoftmaxEmitter``
        that receives the bag built from the context) is trained on the
        answer sequence.

        Attributes set: ``self.predictions`` (second output of
        ``generator.generate`` run for 7 steps), ``self.sgd_cost``,
        ``self.monitor_vars`` and ``self.monitor_vars_valid``.  All bricks
        created here are initialized before returning.

        :param config: configuration object; the fields read here are
            ``embed_size``, ``ctx_lstm_size``, ``ctx_skip_connections``,
            ``generator_lstm_size``, ``feedback_size``, ``batch_size``,
            ``w_noise``, ``dropout``, ``weights_init`` and ``biases_init``.
        :param vocab_size: size of the embedding table, the feedback lookup
            and the readout.
        """

        def as_floatx(var):
            # Masks are declared as int matrices; consumers get a floatX cast.
            return var.astype(theano.config.floatX)

        # Symbolic inputs, each with its two axes swapped right away.
        context = tensor.imatrix('context').dimshuffle(1, 0)
        context_mask = tensor.imatrix('context_mask').dimshuffle(1, 0)
        answer = tensor.imatrix('answer').dimshuffle(1, 0)
        answer_mask = tensor.imatrix('answer_mask').dimshuffle(1, 0)

        bag_of_context = to_bag(context, vocab_size)

        # Embedding table shared by everything that embeds tokens here.
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Context encoder: bidirectional LSTM stack over the embeddings.
        ctx_lstm_bricks, ctx_states = make_bidir_lstm_stack(
            embed.apply(context), config.embed_size,
            as_floatx(context_mask), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = list(ctx_lstm_bricks)

        # With skip connections every layer's states are used; otherwise
        # only the last two entries of the state list.
        if config.ctx_skip_connections:
            encoded_states = ctx_states
            cenc_dim = 2 * sum(config.ctx_lstm_size)
        else:
            encoded_states = ctx_states[-2:]
            cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(encoded_states, axis=2)
        cenc.name = 'cenc'

        # Generator components.
        transition = GatedRecurrent(activation=Tanh(),
                                    dim=config.generator_lstm_size,
                                    name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=cenc_dim,
            match_dim=config.generator_lstm_size,
            name="attention")
        emitter = MaskedSoftmaxEmitter(context_bag=bag_of_context,
                                       name='emitter')
        feedback = LookupFeedback(vocab_size, config.feedback_size)
        readout_sources = [transition.apply.states[0],
                           attention.take_glimpses.outputs[0]]
        readout = Readout(readout_dim=vocab_size,
                          source_names=readout_sources,
                          emitter=emitter,
                          feedback_brick=feedback,
                          name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      name="generator")

        # Training cost on the reference answer.
        cost = generator.cost(answer,
                              as_floatx(answer_mask),
                              attended=cenc,
                              attended_mask=as_floatx(context_mask),
                              name="cost")

        # Free-running generation for a fixed 7 steps; keep output #1.
        generated = generator.generate(n_steps=7,
                                       batch_size=config.batch_size,
                                       attended=cenc,
                                       attended_mask=as_floatx(context_mask),
                                       iterate=True)
        self.predictions = generated[1]

        # Regularized copy of the graph: weight noise, then dropout on the
        # context LSTM states.
        graph = ComputationGraph([cost])
        if config.w_noise > 0:
            weight_vars = VariableFilter(roles=[WEIGHT])(graph)
            graph = apply_noise(graph, weight_vars, config.w_noise)
        if config.dropout > 0:
            graph = apply_dropout(graph, ctx_states, config.dropout)
        (cost_reg,) = graph.outputs

        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # The generator gets its own initialization scheme; the transition's
        # weights are overridden after the config has been pushed down.
        generator.weights_init = IsotropicGaussian(0.01)
        generator.biases_init = Constant(0)
        generator.push_allocation_config()
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Remaining bricks use the initializers supplied by the config.
        embed.initialize()
        for lstm_brick in bricks:
            lstm_brick.weights_init = config.weights_init
            lstm_brick.biases_init = config.biases_init
            lstm_brick.initialize()
Beispiel #9
0
    def __init__(self, config, vocab_size):
        """Build the question-attentive pointer-network model and its cost.

        Question and context are embedded and encoded by one LSTM each.  For
        every context position an attention distribution over the question
        positions is computed; the context states concatenated with the
        attended question feed a bidirectional "match" LSTM stack, and
        ``ptr_network`` decodes from that.  The cost is the masked negative
        log of the probabilities ``ptr_network`` assigns to ``ans_indices``.

        Attributes set: ``self.analyse_vars``, ``self.theano_params``,
        ``self.generations``, ``self.sgd_cost``, ``self.monitor_vars`` and
        ``self.monitor_vars_valid``.  All bricks created here are
        initialized before returning.

        :param config: configuration object; the fields read here are
            ``embed_size``, ``pre_lstm_size``, ``match_lstm_size``,
            ``match_skip_connections``, ``decoder_data_dim``,
            ``decoder_lstm_output_dim``, ``w_noise``, ``dropout``,
            ``weights_init`` and ``biases_init``.
        :param vocab_size: number of entries of the embedding lookup table.
        """
        # Symbolic inputs.
        # NOTE(review): `answer` / `answer_mask` are declared and transposed
        # below but never used afterwards in this method.
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        # Bricks collected here are initialized in the loop at the end.
        bricks = []

        # Swap the two axes of every input.
        # (presumably batch-major -> time-major; confirm against the data
        # stream)
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context (one shared lookup table)
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        # embed.weights_init = Constant(init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        # The Linear bricks project embeddings to 4 * pre_lstm_size, the
        # input width handed to the LSTM bricks below.
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        # Only the first output of each LSTM (the hidden states) is kept.
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        # score[c, q, b] = sum_d q_hidden[q, b, d] * Linear(c_hidden)[c, b, d]
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        # c_hidden is flattened to 2-D for the Linear brick, restored to
        # 3-D, and both operands get a broadcast axis for the product.
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        # Move the question axis last and flatten to 2-D so the row-wise
        # softmax normalizes over question positions.
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        # NOTE(review): question_mask is not applied to these scores before
        # the softmax.
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        # Attention weights reordered to bs,lq,lc and exposed for analysis.
        question_context_attention = att_weights_question.dimshuffle(2, 1, 0)
        question_context_attention.name = "question_context_attention"

        self.analyse_vars = [question_context_attention]
        # Attention-weighted sum of the question states per context position.
        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'

        # Match LSTM
        # Input: context states concatenated with the attended question
        # along the feature axis (width 2 * pre_lstm_size).
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        # With skip connections all entries of the state list are
        # concatenated; otherwise only the last two.
        # NOTE(review): menc_dim is computed but not used later in this
        # method.
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        #pointer networks decoder LSTM and Attention parameters
        params = init_params(data_dim=config.decoder_data_dim,
                             lstm_dim=config.decoder_lstm_output_dim)
        tparams = init_tparams(params)

        # NOTE(review): this empty list is overwritten by the assignment
        # after the add_role calls.
        self.theano_params = []
        # Tag the decoder parameters with WEIGHT / BIAS roles; the
        # role-based VariableFilter below selects on WEIGHT.
        add_role(tparams['lstm_de_W'], WEIGHT)
        add_role(tparams['lstm_de_U'], WEIGHT)
        add_role(tparams['lstm_de_b'], BIAS)
        add_role(tparams['ptr_b1'], BIAS)
        add_role(tparams['ptr_b2'], BIAS)
        add_role(tparams['ptr_v'], WEIGHT)
        add_role(tparams['ptr_W1'], WEIGHT)
        add_role(tparams['ptr_W2'], WEIGHT)
        self.theano_params = tparams.values()

        #n_steps = length , n_samples = batch_size
        n_steps = ans_indices.shape[0]
        n_samples = ans_indices.shape[1]
        preds, generations = ptr_network(
            tparams, cqembed, context_mask.astype(theano.config.floatX),
            ans_indices, ans_indices_mask.astype(theano.config.floatX),
            config.decoder_lstm_output_dim, menc)

        self.generations = generations

        # Index grids of shape (n_steps, n_samples): idx_steps[i, j] = i and
        # idx_samples[i, j] = j.  Together with ans_indices they pick, for
        # every step and sample, the entry of `preds` at the target index.
        idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, ans_indices, idx_samples]
        # probs *= y_mask
        # Small offset added before the log; larger for float16.
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        # probs += (1 - y_mask)  # change unmasked position to 1, since log(1) = 0
        probs += off
        # probs_printed = theano.printing.Print('this is probs')(probs)
        # Masked negative log-likelihood: summed over axis 0, divided by
        # the mask sum over axis 0, then averaged.
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()

        # Apply dropout
        # (optional weight noise first, then dropout on the match LSTM
        # states)
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        # self.predictions.name = 'pred'

        # The regularized cost is both optimized and monitored.
        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        # self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target]

        # Initialize bricks
        # The embedding keeps its own initializer; every collected brick
        # gets the initializers from the config.
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Beispiel #10
0
 def _embed(self, sample_num, dim, name, *args, **kwargs):
     """Create and initialize a ``LookupTable`` with Gaussian weights.

     The table has ``sample_num`` entries of size ``dim`` and is named
     ``name``; weights are drawn from an isotropic Gaussian whose std is
     ``1 / sqrt(dim)``.  Extra positional and keyword arguments are
     accepted but ignored.  Returns the initialized brick.
     """
     table = LookupTable(sample_num, dim, name=name)
     init_std = 1 / numpy.sqrt(dim)
     table.weights_init = IsotropicGaussian(std=init_std)
     table.initialize()
     return table
Beispiel #11
0
    def __init__(self, config, vocab_size):
        """Assemble the pointer-network sorting model and its cost.

        The embedded ``unsorted`` sequence is encoded by a bidirectional
        LSTM stack; ``ptr_network`` decodes from the embeddings and the
        encoder states.  The cost is the masked negative log of the
        probabilities that ``ptr_network`` assigns to ``answer``.

        Attributes set: ``self.theano_params``, ``self.generations``,
        ``self.sgd_cost``, ``self.monitor_vars`` and
        ``self.monitor_vars_valid``.  All bricks created here are
        initialized before returning.

        :param config: configuration object; the fields read here are
            ``embed_size``, ``lstm_size``, ``match_skip_connections``,
            ``decoder_data_dim``, ``decoder_lstm_output_dim``, ``w_noise``,
            ``dropout``, ``weights_init`` and ``biases_init``.
        :param vocab_size: number of entries of the embedding lookup table.
        """

        def as_floatx(var):
            # Masks are declared as int matrices; consumers get a floatX cast.
            return var.astype(theano.config.floatX)

        # Symbolic inputs, each with its two axes swapped right away.
        unsorted = tensor.imatrix('unsorted').dimshuffle(1, 0)
        unsorted_mask = tensor.imatrix('unsorted_mask').dimshuffle(1, 0)
        answer = tensor.imatrix('answer').dimshuffle(1, 0)
        answer_mask = tensor.imatrix('answer_mask').dimshuffle(1, 0)

        # Embedding followed by the bidirectional LSTM encoder.
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        unsorted_embed = embed.apply(unsorted)
        encoder_bricks, encoder_states = make_bidir_lstm_stack(
            unsorted_embed, config.embed_size, as_floatx(unsorted_mask),
            config.lstm_size, config.match_skip_connections,
            'u')  # lu,bs,lstm_dim
        bricks = list(encoder_bricks)

        # Width of the concatenated encoding (kept for reference only).
        unsorted_enc_dim = 2 * sum(config.lstm_size)
        # All forward and backward hidden states joined on the feature axis.
        unsorted_enc = tensor.concatenate(encoder_states, axis=2)
        unsorted_enc.name = 'unsorted_enc'

        # Decoder LSTM and pointer-attention parameters, tagged with roles
        # so that the WEIGHT filter below finds them.
        tparams = init_tparams(
            init_params(data_dim=config.decoder_data_dim,
                        lstm_dim=config.decoder_lstm_output_dim))
        param_roles = (
            ('lstm_de_W', WEIGHT),
            ('lstm_de_U', WEIGHT),
            ('lstm_de_b', BIAS),
            ('ptr_b1', BIAS),
            ('ptr_b2', BIAS),
            ('ptr_v', WEIGHT),
            ('ptr_W1', WEIGHT),
            ('ptr_W2', WEIGHT),
        )
        for param_name, role in param_roles:
            add_role(tparams[param_name], role)
        self.theano_params = tparams.values()

        # n_steps = answer length, n_samples = batch size
        n_steps, n_samples = answer.shape[0], answer.shape[1]
        preds, generations = ptr_network(
            tparams, unsorted_embed, as_floatx(unsorted_mask), answer,
            as_floatx(answer_mask), config.decoder_lstm_output_dim,
            unsorted_enc)
        self.generations = generations

        # Index grids of shape (n_steps, n_samples) used to pick, per step
        # and sample, the entry of `preds` at the target index.
        step_grid = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        sample_grid = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[step_grid, answer, sample_grid]

        # Small offset added before the log; larger for float16.
        off = 1e-6 if probs.dtype == 'float16' else 1e-8
        probs = probs + off

        # Masked negative log-likelihood: summed over axis 0, divided by
        # the mask sum over axis 0, then averaged.
        token_nll = -tensor.log(probs)
        masked_nll = token_nll * answer_mask
        per_sequence = masked_nll.sum(axis=0) / answer_mask.sum(axis=0)
        cost = per_sequence.mean()

        # Regularized copy of the graph: weight noise, then dropout on the
        # encoder states.
        graph = ComputationGraph([cost])
        if config.w_noise > 0:
            weight_vars = VariableFilter(roles=[WEIGHT])(graph)
            graph = apply_noise(graph, weight_vars, config.w_noise)
        if config.dropout > 0:
            graph = apply_dropout(graph, encoder_states, config.dropout)
        (cost_reg,) = graph.outputs

        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # The embedding keeps its own initializer; the encoder bricks use
        # the ones supplied by the config.
        embed.initialize()
        for lstm_brick in bricks:
            lstm_brick.weights_init = config.weights_init
            lstm_brick.biases_init = config.biases_init
            lstm_brick.initialize()
    def __init__(self, config, vocab_size):
        """Build the window-scoring question-answering model and its cost.

        Question and context are embedded with a shared lookup table and
        encoded by bidirectional LSTM stacks.  Two MLP attention mechanisms
        (one per half of the context encoding) summarize the context given
        the question.  Every context window of length 1..3 is then scored
        against that summary with a bilinear form, and the scores are
        trained towards the window's bag-of-words F1 against the answer.

        Attributes set: ``self.predictions`` (the best-scoring window per
        sample), ``self.sgd_cost``, ``self.monitor_vars`` and
        ``self.monitor_vars_valid``.  All bricks created here are
        initialized before returning.

        :param config: configuration object; the fields read here are
            ``embed_size``, ``question_lstm_size``,
            ``question_skip_connections``, ``ctx_lstm_size``,
            ``ctx_skip_connections``, ``attention_mlp_hidden``,
            ``attention_mlp_activations``, ``w_noise``, ``dropout``,
            ``weights_init`` and ``biases_init``.
        :param vocab_size: number of entries of the embedding lookup table,
            also passed to ``to_bag``.
        """
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        # Bricks collected here are initialized in the loop at the end.
        bricks = []

        # Swap the two axes of every input.
        # (presumably batch-major -> time-major; confirm against the data
        # stream)
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context (one shared lookup table)
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding: last time step of each hidden-state
        # sequence, concatenated along the feature axis.
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding: full hidden-state sequences,
        # concatenated along the feature axis.
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            # The context encoder is sized by ctx_lstm_size, so its width
            # must come from there, not from the question LSTM sizes.
            cenc_dim = 2 * config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Each attention mechanism sees one half of the context features.
        # Floor division keeps this an int (usable as a layer size and as a
        # slice bound) regardless of the Python version's `/` semantics.
        cenc_half = cenc_dim // 2

        # Attention mechanism MLP fwd (first half of the context features)
        attention_mlp_fwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_fwd')
        attention_qlinear_fwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_fwd')
        attention_clinear_fwd = Linear(
            input_dim=cenc_half,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_fwd')
        bricks += [
            attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd
        ]
        # First layer: tanh(Linear(context half) + Linear(question)); the
        # context is flattened to 2-D for the Linear brick and restored,
        # the question projection is broadcast over axis 0.
        layer1_fwd = Tanh(name='tanh_fwd')
        layer1_fwd = layer1_fwd.apply(
            attention_clinear_fwd.apply(cenc[:, :, :cenc_half].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] //
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_fwd.apply(qenc)[None, :, :])
        att_weights_fwd = attention_mlp_fwd.apply(
            layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1],
                                layer1_fwd.shape[2])))
        att_weights_fwd = att_weights_fwd.reshape(
            (layer1_fwd.shape[0], layer1_fwd.shape[1]))
        # Transposed so the row-wise softmax normalizes over what was
        # axis 0 of the context encoding.
        att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T)
        att_weights_fwd.name = 'att_weights_fwd'

        attended_fwd = tensor.sum(cenc[:, :, :cenc_half] *
                                  att_weights_fwd.T[:, :, None],
                                  axis=0)
        attended_fwd.name = 'attended_fwd'

        # Attention mechanism MLP bwd (second half of the context features)
        attention_mlp_bwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_bwd')
        attention_qlinear_bwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_bwd')
        attention_clinear_bwd = Linear(
            input_dim=cenc_half,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_bwd')
        bricks += [
            attention_mlp_bwd, attention_qlinear_bwd, attention_clinear_bwd
        ]
        layer1_bwd = Tanh(name='tanh_bwd')
        layer1_bwd = layer1_bwd.apply(
            attention_clinear_bwd.apply(cenc[:, :, cenc_half:].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] //
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_bwd.apply(qenc)[None, :, :])
        att_weights_bwd = attention_mlp_bwd.apply(
            layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1],
                                layer1_bwd.shape[2])))
        att_weights_bwd = att_weights_bwd.reshape(
            (layer1_bwd.shape[0], layer1_bwd.shape[1]))
        att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T)
        att_weights_bwd.name = 'att_weights_bwd'

        attended_bwd = tensor.sum(cenc[:, :, cenc_half:] *
                                  att_weights_bwd.T[:, :, None],
                                  axis=0)
        attended_bwd.name = 'attended_bwd'

        # Question-aware summary, feature width cenc_dim + qenc_dim.
        ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc],
                                          axis=1)
        ctx_question.name = 'ctx_question'

        # Bag representation of the answer with columns 0..2 zeroed
        # (presumably reserved/special token ids -- confirm against the
        # vocabulary).
        answer_bag = to_bag(answer, vocab_size)
        answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
        relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

        def createSequences(j, index, c_enc, c_enc_dim, c_context,
                            c_window_size):
            """Scan step: window of length ``index`` starting at ``j``.

            Returns the window's encoding and the window's rows of
            ``c_context`` zero-padded along axis 0 to ``c_window_size``.
            The encoding concatenates the state at the window's last
            position, the state at its first position minus the final
            feature, and one column holding ``c_window_size``, so its width
            is twice that of ``c_enc``.
            """
            sequence = tensor.concatenate([
                c_context[j:j + index, :],
                tensor.zeros((c_window_size - index, c_context.shape[1]))
            ],
                                          axis=0)
            enc = tensor.concatenate([
                c_enc[j + index - 1, :, :], c_enc[j, :, :-1],
                tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))
            ],
                                     axis=1)
            return enc, sequence

        def createTargetValues(j, index, c_context, c_vocab_size):
            """Scan step: F1 of the window's bag against the answer's bag.

            Small constants in the denominators avoid division by zero.
            """
            sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
            sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
            selected_items = sequence_bag.sum(axis=1,
                                              dtype=theano.config.floatX)
            tp = (sequence_bag * answer_bag).sum(axis=1,
                                                 dtype=theano.config.floatX)
            precision = tp / (selected_items + 0.00001)
            recall = tp / (relevant_items + 0.00001)
            #precision = tensor.set_subtensor(precision[tensor.isnan(precision)], 0.0)
            #recall = tensor.set_subtensor(recall[tensor.isnan(recall)], 1.0)
            macroF1 = (2 *
                       (precision * recall)) / (precision + recall + 0.00001)
            #macroF1 = tensor.set_subtensor(macroF1[tensor.isnan(macroF1)], 0.0)
            return macroF1

        # Enumerate all windows of length 1..window_size; for each length
        # one scan runs over every valid start position.
        window_size = 3
        senc = []
        sequences = []
        pred_targets = []
        for i in range(1, window_size + 1):
            (all_enc, all_sequence), _ = theano.scan(
                fn=createSequences,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, cenc, cenc_dim, context, window_size])
            (all_macroF1), _ = theano.scan(
                fn=createTargetValues,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, context, vocab_size])
            senc.append(all_enc)
            sequences.append(all_sequence)
            pred_targets.append(all_macroF1)

        # Stack the candidates of all window lengths along axis 0.
        senc = tensor.concatenate(senc, axis=0)
        sequences = tensor.concatenate(sequences, axis=0)
        pred_targets = tensor.concatenate(pred_targets, axis=0)

        # F1 prediction Bilinear: score = ctx_question . Linear(senc)
        prediction_linear = Linear(input_dim=2 * cenc_dim,
                                   output_dim=cenc_dim + qenc_dim,
                                   name='pred_linear')
        bricks += [prediction_linear]
        # The projected candidates have cenc_dim + qenc_dim features (the
        # Linear brick's output_dim), so the 3-D shape is restored with
        # that width rather than with the input width senc.shape[2].
        pred_weights = ctx_question[None, :, :] * prediction_linear.apply(
            senc.reshape(
                (senc.shape[0] * senc.shape[1], senc.shape[2]))).reshape(
                    (senc.shape[0], senc.shape[1], cenc_dim + qenc_dim))
        pred_weights = pred_weights.sum(axis=2)
        pred_weights = tensor.nnet.sigmoid(pred_weights.T).T
        pred_weights.name = 'pred_weights'

        # Normalize targets and predictions over the candidate axis.
        pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
        pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

        #numpy.set_printoptions(edgeitems=500)
        #pred_targets = theano.printing.Print('pred_targets')(pred_targets)
        #pred_weights = theano.printing.Print('pred_weights')(pred_weights)

        cost = tensor.nnet.binary_crossentropy(pred_weights,
                                               pred_targets).mean()
        # For every sample pick the padded window with the highest score.
        self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                     tensor.arange(sequences.shape[2])].T

        # Apply dropout (optional weight noise first, then dropout on the
        # question and context LSTM states)
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        # The regularized cost is both optimized and monitored.
        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks: the embedding keeps its own initializer, every
        # collected brick gets the initializers from the config.
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
# Readout brick: its sources are `source_names` plus the first glimpse
# output of the attention brick; `emitter` is defined earlier in the script.
readout = Readout(readout_dim=readout_size,
                  source_names=source_names +
                  [attention.take_glimpses.outputs[0]],
                  emitter=emitter,
                  name="readout")

# Sequence generator assembled from the readout, attention and transition.
generator = SequenceGenerator(readout=readout,
                              attention=attention,
                              transition=transition,
                              name="generator")

# Set the initializers on the generator and push them to its children
# before anything is initialized.
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.001)
generator.push_initialization_config()

# The lookup brick is configured and initialized on its own.
# NOTE(review): other LookupTable bricks in this file only receive
# weights_init; biases_init here presumably has no effect -- confirm.
lookup.weights_init = IsotropicGaussian(0.01)
lookup.biases_init = Constant(0.001)
lookup.initialize()

# Optional overrides for the transition's initializers (disabled); the
# push below is what would propagate them to the transition's children.
#generator.transition.weights_init = initialization.Identity(0.98)
#generator.transition.biases_init = IsotropicGaussian(0.01,0.9)
generator.transition.push_initialization_config()
generator.initialize()

# Cost matrix of `x` under the generator, attending over `embed`; it is
# summed over axis 0 and then averaged to give the scalar named "nll".
cost_matrix = generator.cost_matrix(x,
                                    x_mask,
                                    attended=embed,
                                    attended_mask=context_mask)
cost = cost_matrix.sum(axis=0).mean()
cost.name = "nll"
Beispiel #14
0
 def _build_lookup(self, name, word_num, dim=1, *args, **kwargs):
     """Create and initialize a lookup table filled with a single constant.

     The table has ``word_num`` rows of width ``dim`` and is named ``name``.
     Its weights are initialized with ``Constant(1. / word_num**0.25)``.
     Extra positional and keyword arguments are accepted but ignored.

     Returns the initialized ``LookupTable`` brick.
     """
     table = LookupTable(length=word_num, dim=dim, name=name)
     # Same starting value for every entry; it shrinks as the vocabulary grows.
     fill_value = 1. / word_num**0.25
     table.weights_init = Constant(fill_value)
     table.initialize()
     return table
    def __init__(self, config, vocab_size):
        """Build the symbolic training graph of the model.

        The graph embeds the question and the context, encodes each with an
        LSTM, attends over the question for every context position, runs a
        bidirectional "match" LSTM stack over the result, and scores every
        context position with a "start" and an "end" attention MLP. The two
        score vectors are accumulated along the context, multiplied, and
        compared with a target built from ``ans_indices`` using a masked
        binary cross-entropy.

        Parameters
        ----------
        config
            Configuration object. This method reads ``embed_size``,
            ``pre_lstm_size``, ``match_lstm_size``,
            ``match_skip_connections``, ``attention_mlp_hidden``,
            ``attention_mlp_activations``, ``w_noise``, ``dropout``,
            ``weights_init`` and ``biases_init`` from it.
        vocab_size
            Passed as the first argument of the embedding ``LookupTable``.

        Attributes set
        --------------
        predictions, sgd_cost, monitor_vars, monitor_vars_valid, analyse_vars

        Shape legend used in the comments below (taken from the original
        inline comments): ``lq`` / ``lc`` = question / context length,
        ``bs`` = batch size, ``dim`` = feature size.
        """
        # Symbolic integer-matrix inputs. `answer`, `answer_mask` and
        # `ans_indices_mask` are declared and transposed but not otherwise
        # referenced by the active code visible in this method.
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        # Bricks collected here get config.weights_init / config.biases_init
        # and are initialized at the very end of this method.
        bricks = []

        # Swap the two axes of every input.
        # NOTE(review): presumably (batch, time) -> (time, batch), which is
        # what the shape comments further down assume -- confirm against the
        # data stream.
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        # `embed` is deliberately NOT added to `bricks`, so the constant
        # table loaded from file is not overwritten by config.weights_init.
        # NOTE(review): `init_embedding_table` is defined elsewhere; it
        # presumably returns a pretrained (vocab_size, embed_size) array.
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        #embed.weights_init = IsotropicGaussian(0.01)
        embed.weights_init = Constant(
            init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        # Each LSTM is fed by a Linear brick projecting the embeddings to
        # 4 * pre_lstm_size, the input width Blocks' LSTM expects for a
        # state of size pre_lstm_size.
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        # Masks are integer inputs, so they are cast to floatX before being
        # handed to the LSTMs. Only the first output of each LSTM (the hidden
        # states) is kept.
        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        # Score of (context position, question position) = dot product of the
        # question state with a linear map of the context state. The context
        # states are flattened to 2D for the Linear brick, reshaped back, and
        # then broadcast against the question states.
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        # The softmax below works row-wise on a 2D matrix, so the question
        # axis is moved last and the two leading axes are merged first.
        # NOTE(review): `question_mask` is not applied to these scores.
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        # Attention-weighted sum of question states, one per context position.
        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'

        # Match LSTM
        # Input is each context state concatenated with its attended question
        # summary, hence the input width 2 * pre_lstm_size.
        # NOTE(review): `make_bidir_lstm_stack` is defined elsewhere; it
        # presumably returns the created bricks and the list of per-layer
        # forward/backward hidden states.
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        # With skip connections every entry of mhidden_list is concatenated;
        # otherwise only the last two entries are used.
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        # Attention mechanism MLP start
        # First layer = Linear + Tanh applied by hand; the remaining layers
        # are the MLP, ending in a single identity-activated score.
        attention_mlp_start = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_start')
        attention_clinear_start = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attm_start')  # Wym
        bricks += [attention_mlp_start, attention_clinear_start]
        # `layer1_start` first names the Tanh brick and is then rebound to
        # that brick's output.
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(
            attention_clinear_start.apply(
                menc.reshape(
                    (menc.shape[0] * menc.shape[1], menc.shape[2]))).reshape(
                        (menc.shape[0], menc.shape[1],
                         config.attention_mlp_hidden[0])))
        att_weights_start = attention_mlp_start.apply(
            layer1_start.reshape(
                (layer1_start.shape[0] * layer1_start.shape[1],
                 layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape(
            (layer1_start.shape[0], layer1_start.shape[1]))
        # Transpose, softmax row-wise, transpose back: normalizes over axis 0.
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        # Summary of `menc` weighted by the start distribution; it feeds the
        # end-attention below.
        attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        # Same structure as the start attention, except that the first layer
        # also adds a linear map of `attended`, broadcast over axis 0.
        attention_mlp_end = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_end')
        attention_qlinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='atts_end')  #Wum
        # No bias here: the sibling Linear above already contributes one to
        # the sum.
        attention_clinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attm_end')  # Wym
        bricks += [
            attention_mlp_end, attention_qlinear_end, attention_clinear_end
        ]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(
            attention_clinear_end.apply(
                menc.reshape((menc.shape[0] * menc.shape[1], menc.shape[2]
                              ))).reshape((menc.shape[0], menc.shape[1],
                                           config.attention_mlp_hidden[0])) +
            attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(
            layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1],
                                layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape(
            (layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        # Accumulate the two distributions along the context axis by
        # multiplying with 0/1 triangular matrices:
        #   start: entry [i, j] is (j <= i), so row i sums positions 0..i;
        #   end:   entry [i, j] is (j >= i), so row i sums positions i..end.
        att_weights_start = tensor.dot(
            tensor.le(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(
            tensor.ge(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_end)

        # Combine the accumulated start and end weights by elementwise
        # product (the commented-out alternative takes their minimum).
        att_weights = att_weights_start * att_weights_end
        #att_weights = tensor.minimum(att_weights_start, att_weights_end)

        # Target: for each column of `ans_indices`, put a 1 at every index it
        # lists, then swap axes so the layout matches `att_weights`.
        att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]),
                                  dtype=theano.config.floatX)
        att_target = tensor.set_subtensor(
            att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1)
        att_target = att_target.dimshuffle(1, 0)
        #att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
        #                       tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        # Keep the context token id wherever the combined weight exceeds
        # 0.25; every other position becomes 0.
        self.predictions = tensor.gt(att_weights, 0.25) * context

        # Normalize the target over axis 0; the small constant avoids a
        # division by zero when a column has no marked position.
        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        #att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        # Binary cross-entropy averaged over the positions kept by
        # `context_mask`.
        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()

        # Regularization: optional weight noise on all WEIGHT-role variables
        # and optional dropout on the match-LSTM hidden states. `cost` stays
        # the clean cost; `cost_reg` is the output of the transformed graph.
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Name the variables that are exposed below.
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        # The regularized cost is both the training objective and the only
        # monitored quantity.
        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_weights_start, att_weights_end,
            att_weights, att_target
        ]

        # Initialize bricks
        # `embed` keeps the constant initializer set above; every collected
        # brick gets the initializers from `config`.
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()