Esempio n. 1
0
class ShallowEnergyComputer(Initializable, Feedforward):
    """A simple energy computer: first tanh, then weighted sum."""
    @lazy()
    def __init__(self, **kwargs):
        super(ShallowEnergyComputer, self).__init__(**kwargs)
        self.tanh = Tanh()
        self.linear = Linear(use_bias=False)
        self.children = [self.tanh, self.linear]

    @application
    def apply(self, *args):
        output = args
        output = self.tanh.apply(*pack(output))
        output = self.linear.apply(*pack(output))
        return output

    @property
    def input_dim(self):
        return self.children[1].input_dim

    @input_dim.setter
    def input_dim(self, value):
        self.children[1].input_dim = value

    @property
    def output_dim(self):
        return self.children[1].output_dim

    @output_dim.setter
    def output_dim(self, value):
        self.children[1].output_dim = value
Esempio n. 2
0
    def __init__(self, config):
        inp = tensor.imatrix('bytes')

        embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX),
                              name='embedding_matrix')
        in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim))
        in_repr.name = 'in_repr'

        bricks = []
        states = []

        # Construct predictive GRU hierarchy
        hidden = []
        costs = []
        next_target = in_repr.dimshuffle(1, 0, 2)
        for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                                   config.cost_factors,
                                                   config.hidden_q)):
            init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
                                       name='st0_%d'%i)

            linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
                            name="lstm_in_%d"%i)
            lstm = GatedRecurrent(dim=hdim, activation=config.activation_function,
                        name="lstm_rec_%d"%i)
            linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i)
            tanh = Tanh('lstm_out_tanh_%d'%i)
            bricks += [linear, lstm, linear2, tanh]
            if i > 0:
                linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim,
                                 name='lstm_in2_%d'%i)
                bricks += [linear1]

            next_target = tensor.cast(next_target, dtype=theano.config.floatX)
            inter = linear.apply(theano.gradient.disconnected_grad(next_target))
            if i > 0:
                inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:]))
            new_hidden = lstm.apply(inputs=inter[:,:,:hdim],
                                    gate_inputs=inter[:,:,hdim:],
                                    states=init_state)
            states.append((init_state, new_hidden[-1, :, :]))

            hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)]
            pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:]))
            costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()]
            costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]
            diff = next_target - pred
            next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)


        # Construct output from hidden states
        hidden = [s.dimshuffle(1, 0, 2) for s in hidden]

        out_parts = []
        out_dims = config.out_hidden + [config.io_dim]
        for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
            pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                                name='pred_linear_%d'%i)
            bricks.append(pred_linear)
            lin = theano.gradient.disconnected_grad(state)
            out_parts.append(pred_linear.apply(lin))

        # Do prediction and calculate cost
        out = sum(out_parts)

        if len(out_dims) > 1:
            out = config.out_hidden_act[0](name='out_act0').apply(out)
            mlp = MLP(dims=out_dims,
                      activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])]
                                 +[Identity()],
                      name='out_mlp')
            bricks.append(mlp)
            out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1))
                           ).reshape((inp.shape[0],inp.shape[1]+1,-1))

        pred = out.argmax(axis=2)

        cost = Softmax().categorical_cross_entropy(inp.flatten(),
                                                   out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1],
                                                                config.io_dim))).mean()
        error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean()

        sgd_cost = cost + sum(costs)
            
        # Initialize all bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

        # apply noise
        cg = ComputationGraph([sgd_cost, cost, error_rate]+costs)
        if config.weight_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.weight_noise)
        sgd_cost = cg.outputs[0]
        cost = cg.outputs[1]
        error_rate = cg.outputs[2]
        costs = cg.outputs[3:]


        # put stuff into self that is usefull for training or extensions
        self.sgd_cost = sgd_cost

        sgd_cost.name = 'sgd_cost'
        for i in range(len(costs)):
            costs[i].name = 'pred_cost_%d'%i
        cost.name = 'cost'
        error_rate.name = 'error_rate'
        self.monitor_vars = [costs, [cost],
                             [error_rate]]

        self.out = out[:,1:,:]
        self.pred = pred[:,1:]

        self.states = states
Esempio n. 3
0
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.
    Parameters
    ----------
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_mu")
        self.sigma = MLP(
            activations=[SoftPlus()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_sigma"
        )

        self.coeff = MLP(activations=[Identity()], dims=[frnn_hidden_size, k], name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size, name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    @application
    def emit(self, readouts):
        """
        keep_parameters is True if mu,sigma,coeffs must be stacked and returned
        if false, only the result is given, the others will be empty list.

        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1], self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals=coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0, std=1.0, dtype=mu.dtype)

            result = mu + sigma * epsilon  # *0.6 #reduce variance.
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            # this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) + self.frnn_linear_transition_input.apply(result)
                )

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1) + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            index = self.frnn_step_size
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1) + [slice(index, index + self.frnn_step_size)])
            ]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(freq_inputs)
                )

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        sigmas = sigmas.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        # actually prob not necessary
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff, frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
Esempio n. 4
0
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim / 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in
                          self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['representation', 'target_sentence_mask',
                         'target_sentence'], outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        contexts = {key: value.dimshuffle('x', 0, 1)
                    if key not in self.transition.apply.states else value
                    for key, value
                    in self.fork.apply(representation, as_dict=True).items()}
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {'mask': target_sentence_mask,
                       'outputs': target_sentence,
                       'readout_context': representation.dimshuffle('x', 0, 1)}
        ))

        return (cost * target_sentence_mask).sum() / target_sentence_mask.shape[1]
Esempio n. 5
0
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.
    Parameters
    ----------
    """
    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, \
            frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()],
                dims=[frnn_hidden_size, k*frnn_step_size],
                name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                dims=[frnn_hidden_size, k*frnn_step_size],
                name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                dims=[frnn_hidden_size, k],
                name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim = self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        #self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(
            name="frnn_activation")

        self.frnn_linear_transition_state = Linear (
            input_dim = frnn_hidden_size,
            output_dim= frnn_hidden_size,
            name="frnn_linear_transition_state")

        self.frnn_linear_transition_input = Linear (
            input_dim = self.frnn_step_size,
            output_dim = frnn_hidden_size,
            name="frnn_linear_transition_input")

        #self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [self.mlp,self.mu,self.sigma,self.coeff,
            self.coeff2,self.frnn_initial_state,self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input]

    @application
    def emit(self,readouts):
        """
        keep_parameters is True if mu,sigma,coeffs must be stacked and returned
        if false, only the result is given, the others will be empty list.

        """
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size,self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size,self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals = coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis = -1)
            #idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            #shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :,idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :,idx]

            epsilon = self.theano_rng.normal(
                size=mu.shape,
                avg=0.,
                std=1.,
                dtype=mu.dtype)

            result = mu + sigma*epsilon#*0.6 #reduce variance.
            result = result.reshape(shape_result, ndim = ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            #this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                        self.frnn_linear_transition_state.apply(state) +
                        self.frnn_linear_transition_input.apply(result))

        results = tensor.stack(results,axis=-1)
        results = tensor.flatten(results,outdim=results.ndim-1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0,None)] * \
                (results.ndim-1) +[slice(0,self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1,self.frnn_step_size,self.k))
            freq_sigma = freq_sigma.reshape((-1,self.frnn_step_size,self.k))
            freq_coeff = freq_coeff.reshape((-1,self.k))
            #mu,sigma: shape (-1,fs,k)
            #coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            index = self.frnn_step_size
            freq_inputs = inputs[tuple([slice(0,None)] * \
                (inputs.ndim-1) +[slice(index,index+self.frnn_step_size)])]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(freq_inputs))

        mus = tensor.stack(mus,axis=-2)
        sigmas = tensor.stack(sigmas,axis=-2)
        coeffs = tensor.stack(coeffs,axis=-2)

        mus = mus.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        sigmas = sigmas.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        coeffs = coeffs.repeat(self.frnn_step_size,axis=-2)

        mus = mus[tuple([slice(0,None)] * \
                (mus.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        sigmas = sigmas[tuple([slice(0,None)] * \
                (sigmas.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        coeffs = coeffs[tuple([slice(0,None)] * \
                (coeffs.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        # actually prob not necessary
        mu = mus.reshape((-1,self.target_size))
        sigma = sigmas.reshape((-1,self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL (y=outputs, mu=mu, sig=sigma, coeff=coeff,\
            frame_size=self.frame_size,k=self.k)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
	# modification here to ensure the right dim.
        if name == 'outputs':
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.question_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP fwd
        attention_mlp_fwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_fwd')
        attention_qlinear_fwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_fwd')
        attention_clinear_fwd = Linear(
            input_dim=cenc_dim / 2,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_fwd')
        bricks += [
            attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd
        ]
        layer1_fwd = Tanh(name='tanh_fwd')
        layer1_fwd = layer1_fwd.apply(
            attention_clinear_fwd.apply(cenc[:, :, :cenc_dim / 2].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] /
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_fwd.apply(qenc)[None, :, :])
        att_weights_fwd = attention_mlp_fwd.apply(
            layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1],
                                layer1_fwd.shape[2])))
        att_weights_fwd = att_weights_fwd.reshape(
            (layer1_fwd.shape[0], layer1_fwd.shape[1]))
        att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T)
        att_weights_fwd.name = 'att_weights_fwd'

        attended_fwd = tensor.sum(cenc[:, :, :cenc_dim / 2] *
                                  att_weights_fwd.T[:, :, None],
                                  axis=0)
        attended_fwd.name = 'attended_fwd'

        # Attention mechanism MLP bwd
        attention_mlp_bwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_bwd')
        attention_qlinear_bwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_bwd')
        attention_clinear_bwd = Linear(
            input_dim=cenc_dim / 2,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_bwd')
        bricks += [
            attention_mlp_bwd, attention_qlinear_bwd, attention_clinear_bwd
        ]
        layer1_bwd = Tanh(name='tanh_bwd')
        layer1_bwd = layer1_bwd.apply(
            attention_clinear_bwd.apply(cenc[:, :, cenc_dim / 2:].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] /
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_bwd.apply(qenc)[None, :, :])
        att_weights_bwd = attention_mlp_bwd.apply(
            layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1],
                                layer1_bwd.shape[2])))
        att_weights_bwd = att_weights_bwd.reshape(
            (layer1_bwd.shape[0], layer1_bwd.shape[1]))
        att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T)
        att_weights_bwd.name = 'att_weights_bwd'

        attended_bwd = tensor.sum(cenc[:, :, cenc_dim / 2:] *
                                  att_weights_bwd.T[:, :, None],
                                  axis=0)
        attended_bwd.name = 'attended_bwd'

        ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc],
                                          axis=1)
        ctx_question.name = 'ctx_question'

        answer_bag = to_bag(answer, vocab_size)
        answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
        relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

        def createSequences(j, index, c_enc, c_enc_dim, c_context,
                            c_window_size):
            sequence = tensor.concatenate([
                c_context[j:j + index, :],
                tensor.zeros((c_window_size - index, c_context.shape[1]))
            ],
                                          axis=0)
            enc = tensor.concatenate([
                c_enc[j + index - 1, :, :], c_enc[j, :, :-1],
                tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))
            ],
                                     axis=1)
            return enc, sequence

        def createTargetValues(j, index, c_context, c_vocab_size):
            sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
            sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
            selected_items = sequence_bag.sum(axis=1,
                                              dtype=theano.config.floatX)
            tp = (sequence_bag * answer_bag).sum(axis=1,
                                                 dtype=theano.config.floatX)
            precision = tp / (selected_items + 0.00001)
            recall = tp / (relevant_items + 0.00001)
            #precision = tensor.set_subtensor(precision[tensor.isnan(precision)], 0.0)
            #recall = tensor.set_subtensor(recall[tensor.isnan(recall)], 1.0)
            macroF1 = (2 *
                       (precision * recall)) / (precision + recall + 0.00001)
            #macroF1 = tensor.set_subtensor(macroF1[tensor.isnan(macroF1)], 0.0)
            return macroF1

        window_size = 3
        senc = []
        sequences = []
        pred_targets = []
        for i in range(1, window_size + 1):
            (all_enc, all_sequence), _ = theano.scan(
                fn=createSequences,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, cenc, cenc_dim, context, window_size])
            (all_macroF1), _ = theano.scan(
                fn=createTargetValues,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, context, vocab_size])
            senc.append(all_enc)
            sequences.append(all_sequence)
            pred_targets.append(all_macroF1)

        senc = tensor.concatenate(senc, axis=0)
        sequences = tensor.concatenate(sequences, axis=0)
        pred_targets = tensor.concatenate(pred_targets, axis=0)

        # F1 prediction Bilinear
        prediction_linear = Linear(input_dim=2 * cenc_dim,
                                   output_dim=cenc_dim + qenc_dim,
                                   name='pred_linear')
        bricks += [prediction_linear]
        pred_weights = ctx_question[None, :, :] * prediction_linear.apply(
            senc.reshape(
                (senc.shape[0] * senc.shape[1], senc.shape[2]))).reshape(
                    (senc.shape[0], senc.shape[1], senc.shape[2]))
        pred_weights = pred_weights.sum(axis=2)
        pred_weights = tensor.nnet.sigmoid(pred_weights.T).T
        pred_weights.name = 'pred_weights'

        pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
        pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

        #numpy.set_printoptions(edgeitems=500)
        #pred_targets = theano.printing.Print('pred_targets')(pred_targets)
        #pred_weights = theano.printing.Print('pred_weights')(pred_weights)

        cost = tensor.nnet.binary_crossentropy(pred_weights,
                                               pred_targets).mean()
        self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                     tensor.arange(sequences.shape[2])].T

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Esempio n. 7
0
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=1000).apply,
                Maxout(num_pieces=2).apply,
                Linear(input_dim=state_dim / 2, output_dim=100,
                       use_bias=False).apply,
                Linear(input_dim=100).apply
            ]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(),
                                                    dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([
            name for name in self.transition.apply.contexts +
            self.transition.apply.states if name != 'readout_context'
        ],
                         prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork_inputs=[
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [
            self.state_dim for _ in self.fork.output_names
        ]

    @application(
        inputs=['representation', 'target_sentence_mask', 'target_sentence'],
        outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        contexts = {
            key: value.dimshuffle('x', 0, 1)
            if key not in self.transition.apply.states else value
            for key, value in self.fork.apply(representation,
                                              as_dict=True).items()
        }
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {
                'mask': target_sentence_mask,
                'outputs': target_sentence,
                'readout_context': representation.dimshuffle('x', 0, 1)
            }))

        return (cost *
                target_sentence_mask).sum() / target_sentence_mask.shape[1]
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        cqembed = tensor.concatenate([cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)], axis=2)
        clstms, chidden_list = make_bidir_lstm_stack(cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2*sum(config.ctx_lstm_size) #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.question_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP start
        attention_mlp_start = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp_start')
        attention_qlinear_start = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_start') #Wum
        attention_clinear_start = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_start') # Wym
        bricks += [attention_mlp_start, attention_qlinear_start, attention_clinear_start]
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(attention_clinear_start.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear_start.apply(qenc)[None, :, :])
        att_weights_start = attention_mlp_start.apply(layer1_start.reshape((layer1_start.shape[0]*layer1_start.shape[1], layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape((layer1_start.shape[0], layer1_start.shape[1]))
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        attended = tensor.sum(cenc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        attention_mlp_end = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp_end')
        attention_qlinear_end = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_end') #Wum
        attention_clinear_end = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_end') # Wym
        bricks += [attention_mlp_end, attention_qlinear_end, attention_clinear_end]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(attention_clinear_end.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(layer1_end.reshape((layer1_end.shape[0]*layer1_end.shape[1], layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape((layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        att_weights_start = tensor.dot(tensor.le(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)),
                                         tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(tensor.ge(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)),
                                       tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_end)

        # add attention from left and right
        #att_weights = att_weights_start * att_weights_end
        att_weights = tensor.minimum(att_weights_start, att_weights_end)

        att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
                               tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        self.predictions = tensor.gt(att_weights, 0.5) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_target.name = 'att_target'
        att_weights.name = 'att_weights'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        #embed.weights_init = IsotropicGaussian(0.01)
        embed.weights_init = Constant(
            init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'

        # Match LSTM
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        # Attention mechanism MLP start
        attention_mlp_start = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_start')
        attention_clinear_start = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attm_start')  # Wym
        bricks += [attention_mlp_start, attention_clinear_start]
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(
            attention_clinear_start.apply(
                menc.reshape(
                    (menc.shape[0] * menc.shape[1], menc.shape[2]))).reshape(
                        (menc.shape[0], menc.shape[1],
                         config.attention_mlp_hidden[0])))
        att_weights_start = attention_mlp_start.apply(
            layer1_start.reshape(
                (layer1_start.shape[0] * layer1_start.shape[1],
                 layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape(
            (layer1_start.shape[0], layer1_start.shape[1]))
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        attention_mlp_end = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_end')
        attention_qlinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='atts_end')  #Wum
        attention_clinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attm_end')  # Wym
        bricks += [
            attention_mlp_end, attention_qlinear_end, attention_clinear_end
        ]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(
            attention_clinear_end.apply(
                menc.reshape((menc.shape[0] * menc.shape[1], menc.shape[2]
                              ))).reshape((menc.shape[0], menc.shape[1],
                                           config.attention_mlp_hidden[0])) +
            attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(
            layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1],
                                layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape(
            (layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        att_weights_start = tensor.dot(
            tensor.le(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(
            tensor.ge(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_end)

        # add attention from left and right
        att_weights = att_weights_start * att_weights_end
        #att_weights = tensor.minimum(att_weights_start, att_weights_end)

        att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]),
                                  dtype=theano.config.floatX)
        att_target = tensor.set_subtensor(
            att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1)
        att_target = att_target.dimshuffle(1, 0)
        #att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
        #                       tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        self.predictions = tensor.gt(att_weights, 0.25) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        #att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_weights_start, att_weights_end,
            att_weights, att_target
        ]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()