Python Tanh.apply Examples

Programming Language: Python

Namespace/Package Name: blocks.bricks

Class/Type: Tanh

Method/Function: apply

Examples at hotexamples.com: 9

Python Tanh.apply - 9 examples found. These are the top rated real world Python examples of blocks.bricks.Tanh.apply extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Tanh(30)

apply(6)

reshape(6)

name(5)

Example #1

Show file

File: match_functions.py Project: zlpmichelle/HRAN

class ShallowEnergyComputer(Initializable, Feedforward):
    """A simple energy computer: first tanh, then weighted sum."""
    @lazy()
    def __init__(self, **kwargs):
        super(ShallowEnergyComputer, self).__init__(**kwargs)
        self.tanh = Tanh()
        self.linear = Linear(use_bias=False)
        self.children = [self.tanh, self.linear]

    @application
    def apply(self, *args):
        output = args
        output = self.tanh.apply(*pack(output))
        output = self.linear.apply(*pack(output))
        return output

    @property
    def input_dim(self):
        return self.children[1].input_dim

    @input_dim.setter
    def input_dim(self, value):
        self.children[1].input_dim = value

    @property
    def output_dim(self):
        return self.children[1].output_dim

    @output_dim.setter
    def output_dim(self, value):
        self.children[1].output_dim = value

Example #2

Show file

File: hpc_gru.py Project: Alexis211/text_rnn

    def __init__(self, config):
        inp = tensor.imatrix('bytes')

        embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX),
                              name='embedding_matrix')
        in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim))
        in_repr.name = 'in_repr'

        bricks = []
        states = []

        # Construct predictive GRU hierarchy
        hidden = []
        costs = []
        next_target = in_repr.dimshuffle(1, 0, 2)
        for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                                   config.cost_factors,
                                                   config.hidden_q)):
            init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
                                       name='st0_%d'%i)

            linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
                            name="lstm_in_%d"%i)
            lstm = GatedRecurrent(dim=hdim, activation=config.activation_function,
                        name="lstm_rec_%d"%i)
            linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i)
            tanh = Tanh('lstm_out_tanh_%d'%i)
            bricks += [linear, lstm, linear2, tanh]
            if i > 0:
                linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim,
                                 name='lstm_in2_%d'%i)
                bricks += [linear1]

            next_target = tensor.cast(next_target, dtype=theano.config.floatX)
            inter = linear.apply(theano.gradient.disconnected_grad(next_target))
            if i > 0:
                inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:]))
            new_hidden = lstm.apply(inputs=inter[:,:,:hdim],
                                    gate_inputs=inter[:,:,hdim:],
                                    states=init_state)
            states.append((init_state, new_hidden[-1, :, :]))

            hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)]
            pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:]))
            costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()]
            costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]
            diff = next_target - pred
            next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)


        # Construct output from hidden states
        hidden = [s.dimshuffle(1, 0, 2) for s in hidden]

        out_parts = []
        out_dims = config.out_hidden + [config.io_dim]
        for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
            pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                                name='pred_linear_%d'%i)
            bricks.append(pred_linear)
            lin = theano.gradient.disconnected_grad(state)
            out_parts.append(pred_linear.apply(lin))

        # Do prediction and calculate cost
        out = sum(out_parts)

        if len(out_dims) > 1:
            out = config.out_hidden_act[0](name='out_act0').apply(out)
            mlp = MLP(dims=out_dims,
                      activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])]
                                 +[Identity()],
                      name='out_mlp')
            bricks.append(mlp)
            out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1))
                           ).reshape((inp.shape[0],inp.shape[1]+1,-1))

        pred = out.argmax(axis=2)

        cost = Softmax().categorical_cross_entropy(inp.flatten(),
                                                   out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1],
                                                                config.io_dim))).mean()
        error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean()

        sgd_cost = cost + sum(costs)
            
        # Initialize all bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

        # apply noise
        cg = ComputationGraph([sgd_cost, cost, error_rate]+costs)
        if config.weight_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.weight_noise)
        sgd_cost = cg.outputs[0]
        cost = cg.outputs[1]
        error_rate = cg.outputs[2]
        costs = cg.outputs[3:]


        # put stuff into self that is usefull for training or extensions
        self.sgd_cost = sgd_cost

        sgd_cost.name = 'sgd_cost'
        for i in range(len(costs)):
            costs[i].name = 'pred_cost_%d'%i
        cost.name = 'cost'
        error_rate.name = 'error_rate'
        self.monitor_vars = [costs, [cost],
                             [error_rate]]

        self.out = out[:,1:,:]
        self.pred = pred[:,1:]

        self.states = states

Example #3

Show file

File: frnn_model.py Project: TiSU32/ift6266h16

class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.
    Parameters
    ----------
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_mu")
        self.sigma = MLP(
            activations=[SoftPlus()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_sigma"
        )

        self.coeff = MLP(activations=[Identity()], dims=[frnn_hidden_size, k], name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size, name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    @application
    def emit(self, readouts):
        """
        keep_parameters is True if mu,sigma,coeffs must be stacked and returned
        if false, only the result is given, the others will be empty list.

        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1], self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals=coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0, std=1.0, dtype=mu.dtype)

            result = mu + sigma * epsilon  # *0.6 #reduce variance.
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            # this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) + self.frnn_linear_transition_input.apply(result)
                )

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1) + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            index = self.frnn_step_size
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1) + [slice(index, index + self.frnn_step_size)])
            ]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(freq_inputs)
                )

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        sigmas = sigmas.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        # actually prob not necessary
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff, frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)

Example #4

Show file

File: model_encdec.py Project: rizar/NMT

class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim / 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in
                          self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['representation', 'target_sentence_mask',
                         'target_sentence'], outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        contexts = {key: value.dimshuffle('x', 0, 1)
                    if key not in self.transition.apply.states else value
                    for key, value
                    in self.fork.apply(representation, as_dict=True).items()}
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {'mask': target_sentence_mask,
                       'outputs': target_sentence,
                       'readout_context': representation.dimshuffle('x', 0, 1)}
        ))

        return (cost * target_sentence_mask).sum() / target_sentence_mask.shape[1]

Example #5

Show file

File: frnn_model.py Project: TiSU32/ift6266h16

class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.
    Parameters
    ----------
    """
    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, \
            frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()],
                dims=[frnn_hidden_size, k*frnn_step_size],
                name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                dims=[frnn_hidden_size, k*frnn_step_size],
                name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                dims=[frnn_hidden_size, k],
                name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim = self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        #self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(
            name="frnn_activation")

        self.frnn_linear_transition_state = Linear (
            input_dim = frnn_hidden_size,
            output_dim= frnn_hidden_size,
            name="frnn_linear_transition_state")

        self.frnn_linear_transition_input = Linear (
            input_dim = self.frnn_step_size,
            output_dim = frnn_hidden_size,
            name="frnn_linear_transition_input")

        #self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [self.mlp,self.mu,self.sigma,self.coeff,
            self.coeff2,self.frnn_initial_state,self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input]

    @application
    def emit(self,readouts):
        """
        keep_parameters is True if mu,sigma,coeffs must be stacked and returned
        if false, only the result is given, the others will be empty list.

        """
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size,self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size,self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals = coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis = -1)
            #idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            #shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :,idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :,idx]

            epsilon = self.theano_rng.normal(
                size=mu.shape,
                avg=0.,
                std=1.,
                dtype=mu.dtype)

            result = mu + sigma*epsilon#*0.6 #reduce variance.
            result = result.reshape(shape_result, ndim = ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            #this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                        self.frnn_linear_transition_state.apply(state) +
                        self.frnn_linear_transition_input.apply(result))

        results = tensor.stack(results,axis=-1)
        results = tensor.flatten(results,outdim=results.ndim-1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0,None)] * \
                (results.ndim-1) +[slice(0,self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(\
            self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = (i == self.number_of_steps - 1)

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state),\
                extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1,self.frnn_step_size,self.k))
            freq_sigma = freq_sigma.reshape((-1,self.frnn_step_size,self.k))
            freq_coeff = freq_coeff.reshape((-1,self.k))
            #mu,sigma: shape (-1,fs,k)
            #coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            index = self.frnn_step_size
            freq_inputs = inputs[tuple([slice(0,None)] * \
                (inputs.ndim-1) +[slice(index,index+self.frnn_step_size)])]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(freq_inputs))

        mus = tensor.stack(mus,axis=-2)
        sigmas = tensor.stack(sigmas,axis=-2)
        coeffs = tensor.stack(coeffs,axis=-2)

        mus = mus.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        sigmas = sigmas.reshape((-1,self.frnn_step_size*self.number_of_steps,self.k))
        coeffs = coeffs.repeat(self.frnn_step_size,axis=-2)

        mus = mus[tuple([slice(0,None)] * \
                (mus.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        sigmas = sigmas[tuple([slice(0,None)] * \
                (sigmas.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        coeffs = coeffs[tuple([slice(0,None)] * \
                (coeffs.ndim-2) +[slice(0,self.frame_size)] + [slice(0,None)])]
        # actually prob not necessary
        mu = mus.reshape((-1,self.target_size))
        sigma = sigmas.reshape((-1,self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL (y=outputs, mu=mu, sig=sigma, coeff=coeff,\
            frame_size=self.frame_size,k=self.k)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
	# modification here to ensure the right dim.
        if name == 'outputs':
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)

Example #6

Show file

File: sequence_scorer_steroids.py Project: shubhampachori12110095/Question-Answering

    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(
            qembed, config.embed_size,
            question_mask.astype(theano.config.floatX),
            config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2 * sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list],
                                      axis=1)
        else:
            qenc_dim = 2 * config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                      axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        clstms, chidden_list = make_bidir_lstm_stack(
            cembed, config.embed_size,
            context_mask.astype(theano.config.floatX), config.ctx_lstm_size,
            config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2 * config.question_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP fwd
        attention_mlp_fwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_fwd')
        attention_qlinear_fwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_fwd')
        attention_clinear_fwd = Linear(
            input_dim=cenc_dim / 2,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_fwd')
        bricks += [
            attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd
        ]
        layer1_fwd = Tanh(name='tanh_fwd')
        layer1_fwd = layer1_fwd.apply(
            attention_clinear_fwd.apply(cenc[:, :, :cenc_dim / 2].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] /
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_fwd.apply(qenc)[None, :, :])
        att_weights_fwd = attention_mlp_fwd.apply(
            layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1],
                                layer1_fwd.shape[2])))
        att_weights_fwd = att_weights_fwd.reshape(
            (layer1_fwd.shape[0], layer1_fwd.shape[1]))
        att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T)
        att_weights_fwd.name = 'att_weights_fwd'

        attended_fwd = tensor.sum(cenc[:, :, :cenc_dim / 2] *
                                  att_weights_fwd.T[:, :, None],
                                  axis=0)
        attended_fwd.name = 'attended_fwd'

        # Attention mechanism MLP bwd
        attention_mlp_bwd = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_bwd')
        attention_qlinear_bwd = Linear(
            input_dim=qenc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attq_bwd')
        attention_clinear_bwd = Linear(
            input_dim=cenc_dim / 2,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attc_bwd')
        bricks += [
            attention_mlp_bwd, attention_qlinear_bwd, attention_clinear_bwd
        ]
        layer1_bwd = Tanh(name='tanh_bwd')
        layer1_bwd = layer1_bwd.apply(
            attention_clinear_bwd.apply(cenc[:, :, cenc_dim / 2:].reshape(
                (cenc.shape[0] * cenc.shape[1], cenc.shape[2] /
                 2))).reshape((cenc.shape[0], cenc.shape[1],
                               config.attention_mlp_hidden[0])) +
            attention_qlinear_bwd.apply(qenc)[None, :, :])
        att_weights_bwd = attention_mlp_bwd.apply(
            layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1],
                                layer1_bwd.shape[2])))
        att_weights_bwd = att_weights_bwd.reshape(
            (layer1_bwd.shape[0], layer1_bwd.shape[1]))
        att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T)
        att_weights_bwd.name = 'att_weights_bwd'

        attended_bwd = tensor.sum(cenc[:, :, cenc_dim / 2:] *
                                  att_weights_bwd.T[:, :, None],
                                  axis=0)
        attended_bwd.name = 'attended_bwd'

        ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc],
                                          axis=1)
        ctx_question.name = 'ctx_question'

        answer_bag = to_bag(answer, vocab_size)
        answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0)
        relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX)

        def createSequences(j, index, c_enc, c_enc_dim, c_context,
                            c_window_size):
            sequence = tensor.concatenate([
                c_context[j:j + index, :],
                tensor.zeros((c_window_size - index, c_context.shape[1]))
            ],
                                          axis=0)
            enc = tensor.concatenate([
                c_enc[j + index - 1, :, :], c_enc[j, :, :-1],
                tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1))
            ],
                                     axis=1)
            return enc, sequence

        def createTargetValues(j, index, c_context, c_vocab_size):
            sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size)
            sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0)
            selected_items = sequence_bag.sum(axis=1,
                                              dtype=theano.config.floatX)
            tp = (sequence_bag * answer_bag).sum(axis=1,
                                                 dtype=theano.config.floatX)
            precision = tp / (selected_items + 0.00001)
            recall = tp / (relevant_items + 0.00001)
            #precision = tensor.set_subtensor(precision[tensor.isnan(precision)], 0.0)
            #recall = tensor.set_subtensor(recall[tensor.isnan(recall)], 1.0)
            macroF1 = (2 *
                       (precision * recall)) / (precision + recall + 0.00001)
            #macroF1 = tensor.set_subtensor(macroF1[tensor.isnan(macroF1)], 0.0)
            return macroF1

        window_size = 3
        senc = []
        sequences = []
        pred_targets = []
        for i in range(1, window_size + 1):
            (all_enc, all_sequence), _ = theano.scan(
                fn=createSequences,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, cenc, cenc_dim, context, window_size])
            (all_macroF1), _ = theano.scan(
                fn=createTargetValues,
                sequences=tensor.arange(cenc.shape[0] - i + 1),
                non_sequences=[i, context, vocab_size])
            senc.append(all_enc)
            sequences.append(all_sequence)
            pred_targets.append(all_macroF1)

        senc = tensor.concatenate(senc, axis=0)
        sequences = tensor.concatenate(sequences, axis=0)
        pred_targets = tensor.concatenate(pred_targets, axis=0)

        # F1 prediction Bilinear
        prediction_linear = Linear(input_dim=2 * cenc_dim,
                                   output_dim=cenc_dim + qenc_dim,
                                   name='pred_linear')
        bricks += [prediction_linear]
        pred_weights = ctx_question[None, :, :] * prediction_linear.apply(
            senc.reshape(
                (senc.shape[0] * senc.shape[1], senc.shape[2]))).reshape(
                    (senc.shape[0], senc.shape[1], senc.shape[2]))
        pred_weights = pred_weights.sum(axis=2)
        pred_weights = tensor.nnet.sigmoid(pred_weights.T).T
        pred_weights.name = 'pred_weights'

        pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001)
        pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001)

        #numpy.set_printoptions(edgeitems=500)
        #pred_targets = theano.printing.Print('pred_targets')(pred_targets)
        #pred_weights = theano.printing.Print('pred_weights')(pred_weights)

        cost = tensor.nnet.binary_crossentropy(pred_weights,
                                               pred_targets).mean()
        self.predictions = sequences[pred_weights.argmax(axis=0), :,
                                     tensor.arange(sequences.shape[2])].T

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

Example #7

Show file

File: model_encdec.py Project: miradel51/NMT

class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=1000).apply,
                Maxout(num_pieces=2).apply,
                Linear(input_dim=state_dim / 2, output_dim=100,
                       use_bias=False).apply,
                Linear(input_dim=100).apply
            ]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(),
                                                    dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([
            name for name in self.transition.apply.contexts +
            self.transition.apply.states if name != 'readout_context'
        ],
                         prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork_inputs=[
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [
            self.state_dim for _ in self.fork.output_names
        ]

    @application(
        inputs=['representation', 'target_sentence_mask', 'target_sentence'],
        outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        contexts = {
            key: value.dimshuffle('x', 0, 1)
            if key not in self.transition.apply.states else value
            for key, value in self.fork.apply(representation,
                                              as_dict=True).items()
        }
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {
                'mask': target_sentence_mask,
                'outputs': target_sentence,
                'readout_context': representation.dimshuffle('x', 0, 1)
            }))

        return (cost *
                target_sentence_mask).sum() / target_sentence_mask.shape[1]

Example #8

Show file

File: index_attention.py Project: shubhampachori12110095/Question-Answering

    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        cembed = embed.apply(context)
        cqembed = tensor.concatenate([cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0)], axis=2)
        clstms, chidden_list = make_bidir_lstm_stack(cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + clstms
        if config.ctx_skip_connections:
            cenc_dim = 2*sum(config.ctx_lstm_size) #2 : fw & bw
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.question_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP start
        attention_mlp_start = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp_start')
        attention_qlinear_start = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_start') #Wum
        attention_clinear_start = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_start') # Wym
        bricks += [attention_mlp_start, attention_qlinear_start, attention_clinear_start]
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(attention_clinear_start.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear_start.apply(qenc)[None, :, :])
        att_weights_start = attention_mlp_start.apply(layer1_start.reshape((layer1_start.shape[0]*layer1_start.shape[1], layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape((layer1_start.shape[0], layer1_start.shape[1]))
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        attended = tensor.sum(cenc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        attention_mlp_end = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp_end')
        attention_qlinear_end = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_end') #Wum
        attention_clinear_end = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_end') # Wym
        bricks += [attention_mlp_end, attention_qlinear_end, attention_clinear_end]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(attention_clinear_end.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(layer1_end.reshape((layer1_end.shape[0]*layer1_end.shape[1], layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape((layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        att_weights_start = tensor.dot(tensor.le(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)),
                                         tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(tensor.ge(tensor.tile(theano.tensor.arange(context.shape[0])[None,:], (context.shape[0], 1)),
                                       tensor.tile(theano.tensor.arange(context.shape[0])[:,None], (1, context.shape[0]))), att_weights_end)

        # add attention from left and right
        #att_weights = att_weights_start * att_weights_end
        att_weights = tensor.minimum(att_weights_start, att_weights_end)

        att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
                               tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        self.predictions = tensor.gt(att_weights, 0.5) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_target.name = 'att_target'
        att_weights.name = 'att_weights'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

Example #9

Show file

File: match_lstm.py Project: shubhampachori12110095/Question-Answering

    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        #embed.weights_init = IsotropicGaussian(0.01)
        embed.weights_init = Constant(
            init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'

        # Match LSTM
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        # Attention mechanism MLP start
        attention_mlp_start = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_start')
        attention_clinear_start = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attm_start')  # Wym
        bricks += [attention_mlp_start, attention_clinear_start]
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(
            attention_clinear_start.apply(
                menc.reshape(
                    (menc.shape[0] * menc.shape[1], menc.shape[2]))).reshape(
                        (menc.shape[0], menc.shape[1],
                         config.attention_mlp_hidden[0])))
        att_weights_start = attention_mlp_start.apply(
            layer1_start.reshape(
                (layer1_start.shape[0] * layer1_start.shape[1],
                 layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape(
            (layer1_start.shape[0], layer1_start.shape[1]))
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        attention_mlp_end = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_end')
        attention_qlinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='atts_end')  #Wum
        attention_clinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attm_end')  # Wym
        bricks += [
            attention_mlp_end, attention_qlinear_end, attention_clinear_end
        ]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(
            attention_clinear_end.apply(
                menc.reshape((menc.shape[0] * menc.shape[1], menc.shape[2]
                              ))).reshape((menc.shape[0], menc.shape[1],
                                           config.attention_mlp_hidden[0])) +
            attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(
            layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1],
                                layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape(
            (layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        att_weights_start = tensor.dot(
            tensor.le(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(
            tensor.ge(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_end)

        # add attention from left and right
        att_weights = att_weights_start * att_weights_end
        #att_weights = tensor.minimum(att_weights_start, att_weights_end)

        att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]),
                                  dtype=theano.config.floatX)
        att_target = tensor.set_subtensor(
            att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1)
        att_target = att_target.dimshuffle(1, 0)
        #att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
        #                       tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        self.predictions = tensor.gt(att_weights, 0.25) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        #att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_weights_start, att_weights_end,
            att_weights, att_target
        ]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()