Example 1
def build_model(args):
    x = tensor.tensor3('features', dtype=floatX)
    y = tensor.tensor3('targets', dtype=floatX)

    linear = Linear(input_dim=1, output_dim=4 * args.units)
    rnn = LSTM(dim=args.units, activation=Tanh())
    linear2 = Linear(input_dim=args.units, output_dim=1)

    prediction = Tanh().apply(linear2.apply(rnn.apply(linear.apply(x))))

    prediction = prediction[:-1, :, :]

    # SquaredError does not work on 3D tensor
    y = y.reshape((y.shape[0] * y.shape[1], y.shape[2]))
    prediction = prediction.reshape((prediction.shape[0] * prediction.shape[1],
                                     prediction.shape[2]))

    cost = SquaredError().apply(y, prediction)

    # Initialization
    linear.weights_init = IsotropicGaussian(0.1)
    linear2.weights_init = IsotropicGaussian(0.1)
    linear.biases_init = Constant(0)
    linear2.biases_init = Constant(0)
    rnn.weights_init = Orthogonal()

    return cost
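
A minimal usage sketch for build_model (not part of the original example): the function only sets the bricks' *_init attributes, and in Blocks the parameters are not actually drawn until initialize() is called. The args namespace and the use of blocks.model.Model to recover the bricks from the cost graph are assumptions for illustration.

from argparse import Namespace
from blocks.model import Model

args = Namespace(units=32)       # hypothetical hyper-parameter container
cost = build_model(args)

# Model(cost) walks the annotated Theano graph, so the bricks created
# inside build_model can be recovered and initialized from the outside.
for brick in Model(cost).get_top_bricks():
    brick.initialize()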
Example 2
def get_presoft(h, args):
    output_size = get_output_size(args.dataset)
    # If args.skip_connections: dim = args.layers * args.state_dim
    # else: dim = args.state_dim
    use_all_states = args.skip_connections or args.skip_output or (args.rnn_type in ["clockwork", "soft"])
    output_layer = Linear(
        input_dim=use_all_states * args.layers *
        args.state_dim + (1 - use_all_states) * args.state_dim,
        output_dim=output_size, name="output_layer")

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()
    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
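
The input_dim arithmetic above folds an if/else into boolean arithmetic; a more explicit equivalent, shown here with hypothetical values, is:

layers, state_dim = 3, 256        # hypothetical values for illustration
use_all_states = True
input_dim = layers * state_dim if use_all_states else state_dim
assert input_dim == (use_all_states * layers * state_dim
                     + (1 - use_all_states) * state_dim)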
Example 3
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation
        children = [activation, gate_activation]
        kwargs.setdefault('children', []).extend(children)
        super(ZoneoutGRU, self).__init__(**kwargs)
Example 4
def getBidir(input_dim, input_var):
    """SimpleRecurrent-based bidirectional"""
    bidir = Bidirectional(weights_init=Orthogonal(),
                               prototype=SimpleRecurrent(
                                   dim=input_dim, activation=Tanh()))
    #bidir.allocate()
    bidir.initialize()
    h = bidir.apply(input_var)

    net = add_softmax_layer(h, input_dim, 2)

    return net
Example 5
    def __init__(self, enc_transition, dims, dim_input, subsample, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.subsample = subsample

        for layer_num, (dim_under, dim) in enumerate(
                zip([dim_input] + list(2 * numpy.array(dims)), dims)):
            bidir = Bidirectional(RecurrentWithFork(enc_transition(
                dim=dim, activation=Tanh()).apply,
                                                    dim_under,
                                                    name='with_fork'),
                                  name='bidir{}'.format(layer_num))
            self.children.append(bidir)
Example 6
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        super(GatedRecurrent, self).__init__(**kwargs)
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Sigmoid()
        self.activation = activation
        self.gate_activation = gate_activation

        self.children = [activation, gate_activation]
Example 7
    def __init__(self, dim, activation=None, mlp=None, **kwargs):
        super(SoftGatedRecurrent, self).__init__(**kwargs)
        self.dim = dim

        if not activation:
            activation = Tanh()
        self.activation = activation

        # The activation of the mlp should be a Logistic function
        self.mlp = mlp

        self.children = [activation, mlp]
Example 8
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        # pop so that 'children' is not also passed through **kwargs below
        children = [activation, gate_activation] + kwargs.pop('children', [])
        super(GatedRecurrent, self).__init__(children=children, **kwargs)
Example 9
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        super(LSTMGraves, self).__init__(**kwargs)
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        self.children = [activation, gate_activation]
Example 10
    def __init__(self, dimension, alphabet_size, **kwargs):
        super(WordReverser, self).__init__(**kwargs)
        encoder = Bidirectional(
            SimpleRecurrent(dim=dimension, activation=Tanh()))
        fork = Fork([
            name for name in encoder.prototype.apply.sequences
            if name != 'mask'
        ])
        fork.input_dim = dimension
        fork.output_dims = [
            encoder.prototype.get_dim(name) for name in fork.input_names
        ]
        lookup = LookupTable(alphabet_size, dimension)
        transition = SimpleRecurrent(activation=Tanh(),
                                     dim=dimension,
                                     name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dimension,
            match_dim=dimension,
            name="attention")
        readout = Readout(readout_dim=alphabet_size,
                          source_names=[
                              transition.apply.states[0],
                              attention.take_glimpses.outputs[0]
                          ],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedback_brick=LookupFeedback(
                              alphabet_size, dimension),
                          name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      name="generator")

        self.lookup = lookup
        self.fork = fork
        self.encoder = encoder
        self.generator = generator
        self.children = [lookup, fork, encoder, generator]
Example 11
 def __init__(self, vocab_size, topical_embedding_dim, state_dim, word_num,
              batch_size, **kwargs):
     super(topicalq_transformer, self).__init__(**kwargs)
     self.vocab_size = vocab_size
     self.word_embedding_dim = topical_embedding_dim
     self.state_dim = state_dim
     self.word_num = word_num
     self.batch_size = batch_size
     self.look_up = LookupTable(name='topical_embeddings')
     self.transformer = MLP(
         activations=[Tanh()],
         dims=[self.word_embedding_dim * self.word_num, self.state_dim],
         name='topical_transformer')
     self.children = [self.look_up, self.transformer]
Example 12
    def __init__(self,
                 dim,
                 attended_dim,
                 activation=None,
                 gate_activation=None,
                 **kwargs):
        super(GRU, self).__init__(**kwargs)
        self.dim = dim
        self.attended_dim = attended_dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        self.initial_transformer = MLP(activations=[Tanh()],
                                       dims=[attended_dim, self.dim],
                                       name='state_initializer')

        self.children = [activation, gate_activation, self.initial_transformer]
Example 13
def test_highway_activation():
    x = T.matrix()
    highway = Highway(input_dim=100,
                      output_activation=Tanh(),
                      transform_activation=Identity())
    highway.biases_init = Constant(0.0)
    highway.weights_init = init_Identity()
    y = highway.apply(x)
    highway.initialize()
    _func = theano.function([x], y)
    x_val = np.ones((4, 100), dtype=theano.config.floatX)
    ret = _func(x_val)
    assert_allclose(ret, np.tanh(np.ones((4, 100))))
Example 14
    def __init__(self, attended_dim, context_dim, **kwargs):
        super(GRUInitialStateWithInitialStateSumContext,
              self).__init__(**kwargs)
        self.attended_dim = attended_dim
        self.context_dim = context_dim

        # two MLPs which map to the same dimension, then we sum
        # the motivation here is to allow the network to pretrain on the normal MT task,
        # then keep some params static, and continue training with the context-enhanced task
        # the state transformer
        self.initial_transformer = MLP(activations=[Tanh()],
                                       dims=[attended_dim, self.dim],
                                       name='state_initializer')

        # the context transformer
        self.context_transformer = MLP(
            activations=[Tanh(), Tanh(), Tanh()],
            dims=[context_dim, 2000, 1000, self.dim],
            name='context_initializer')

        self.children.extend(
            [self.initial_transformer, self.context_transformer])
Example 15
def test_convolutional_sequence_with_convolutions_raw_activation():
    seq = ConvolutionalSequence(
        [Convolutional(filter_size=(3, 3), num_filters=4),
         Rectifier(),
         Convolutional(filter_size=(5, 5), num_filters=3, step=(2, 2)),
         Tanh()],
        num_channels=2,
        image_size=(21, 39))
    seq.allocate()
    x = theano.tensor.tensor4()
    out = seq.apply(x).eval({x: numpy.ones((10, 2, 21, 39),
                                           dtype=theano.config.floatX)})
    assert out.shape == (10, 3, 8, 17)
Example 16
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    main_loop = MainLoop(
        algorithm,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(mnist_train.num_examples,
                                                     50)),
        model=Model(cost),
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=num_epochs),
            DataStreamMonitoring([cost, error_rate],
                                 DataStream(mnist_test,
                                            iteration_scheme=SequentialScheme(
                                                mnist_test.num_examples, 500)),
                                 prefix="test"),
            TrainingDataMonitoring([
                cost, error_rate,
                aggregation.mean(algorithm.total_gradient_norm)
            ],
                                   prefix="train",
                                   after_epoch=True),
            Checkpoint(save_to),
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]),
            Printing()
        ])
    main_loop.run()
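
A hypothetical entry point for the training script above; the checkpoint path and epoch count are arbitrary, and it assumes the MNIST data is available to Fuel and that a Bokeh server is running for the Plot extension.

if __name__ == "__main__":
    main(save_to="mnist_mlp.tar", num_epochs=5)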
Example 17
def test_activations():
    x = tensor.vector()
    x_val = numpy.random.rand(8).astype(theano.config.floatX)
    exp_x_val = numpy.exp(x_val)

    assert_allclose(x_val, Identity().apply(x).eval({x: x_val}))
    assert_allclose(numpy.tanh(x_val), Tanh().apply(x).eval({x: x_val}),
                    rtol=1e-06)
    assert_allclose(numpy.log(1 + exp_x_val),
                    Softplus(x).apply(x).eval({x: x_val}), rtol=1e-6)
    assert_allclose(exp_x_val / numpy.sum(exp_x_val),
                    Softmax(x).apply(x).eval({x: x_val}).flatten(), rtol=1e-6)
    assert_allclose(1.0 / (1.0 + numpy.exp(-x_val)),
                    Logistic(x).apply(x).eval({x: x_val}), rtol=1e-6)
Example 18
File: model.py Project: npow/DCNMT
 def __init__(self,
              vocab_size,
              embedding_dim,
              igru_state_dim,
              igru_depth,
              trg_dgru_depth,
              emitter=None,
              feedback_brick=None,
              merge=None,
              merge_prototype=None,
              post_merge=None,
              merged_dim=None,
              igru=None,
              **kwargs):
     # for compatibility with the single-layer case
     if igru_depth == 1:
         self.igru = IGRU(dim=igru_state_dim)
     else:
         self.igru = RecurrentStack(
             [IGRU(dim=igru_state_dim, name='igru')] + [
                 UpperIGRU(dim=igru_state_dim,
                           activation=Tanh(),
                           name='upper_igru' + str(i))
                 for i in range(1, igru_depth)
             ],
             skip_connections=True)
     self.igru_depth = igru_depth
     self.trg_dgru_depth = trg_dgru_depth
     self.lookup = LookupTable(name='embeddings')
     self.vocab_size = vocab_size
     self.igru_state_dim = igru_state_dim
     self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                  output_dim=vocab_size)
     self.embedding_dim = embedding_dim
     self.gru_fork = Fork([
         name for name in self.igru.apply.sequences
         if name != 'mask' and name != 'input_states'
     ],
                          prototype=Linear(),
                          name='gru_fork')
     kwargs['children'] = [
         self.igru, self.lookup, self.gru_to_softmax, self.gru_fork
     ]
     super(Interpolator, self).__init__(emitter=emitter,
                                        feedback_brick=feedback_brick,
                                        merge=merge,
                                        merge_prototype=merge_prototype,
                                        post_merge=post_merge,
                                        merged_dim=merged_dim,
                                        **kwargs)
Example 19
    def __init__(self, dimension, input_size, embed_input=False, **kwargs):
        super(LSTMEncoder, self).__init__(**kwargs)
        if embed_input:
            self.embedder = LookupTable(input_size, dimension)
        else:
            self.embedder = Linear(input_size, dimension)
        self.fork = Fork(['inputs'],
                         dimension,
                         output_dims=[dimension],
                         prototype=Linear(dimension, 4 * dimension))
        encoder = Bidirectional(LSTM(dim=dimension, activation=Tanh()))

        self.encoder = encoder
        self.children = [encoder, self.embedder, self.fork]
Example 20
def example4():
    """LSTM -> Plante lors de l'initialisation du lstm."""

    x = tensor.tensor3('x')
    dim = 3

    #    gate_inputs = theano.function([x],x*4)
    gate_inputs = Linear(input_dim=dim,
                         output_dim=dim * 4,
                         name="linear",
                         weights_init=initialization.Identity(),
                         biases_init=Constant(2))

    lstm = LSTM(dim=dim,
                activation=Tanh(),
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))

    gate_inputs.initialize()
    hg = gate_inputs.apply(x)

    #print(gate_inputs.parameters)
    #print(gate_inputs.parameters[1].get_value())

    lstm.initialize()
    h, cells = lstm.apply(hg)
    print(lstm.parameters)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(4 * np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    print("Good Job!")

    #    lstm_output =

    #Initial State
    h0 = tensor.matrix('h0')
    c = tensor.matrix('cells')
    h, c1 = lstm.apply(
        inputs=x, states=h0,
        cells=c)  # lstm.apply(states=h0,cells=cells,inputs=gate_inputs)

    f = theano.function([x, h0, c], h)
    print("a")
    print(
        f(np.ones((3, 1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX)))
Example 21
 def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
              state_dim, **kwargs):
     """Sole constructor.
     
     Args:
         vocab_size (int): Source vocabulary size
         embedding_dim (int): Dimension of the embedding layer
         n_layers (int): Number of layers. Layers share the same
                         weight matrices.
         skip_connections (bool): Skip connections connect the
                                  source word embeddings directly 
                                  with deeper layers to propagate 
                                  the gradient more efficiently
         state_dim (int): Number of hidden units in the recurrent
                          layers.
     """
     super(DeepBidirectionalEncoder, self).__init__(**kwargs)
     self.vocab_size = vocab_size
     self.embedding_dim = embedding_dim
     self.n_layers = n_layers
     self.state_dim = state_dim
     self.skip_connections = skip_connections
     self.lookup = LookupTable(name='embeddings')
     self.bidirs = []
     self.fwd_forks = []
     self.back_forks = []
     for i in xrange(self.n_layers):
         bidir = BidirectionalWMT15(GatedRecurrent(activation=Tanh(),
                                                   dim=state_dim),
                                    name='bidir%d' % i)
         self.bidirs.append(bidir)
         self.fwd_forks.append(
             Fork([
                 name for name in bidir.prototype.apply.sequences
                 if name != 'mask'
             ],
                  prototype=Linear(),
                  name='fwd_fork%d' % i))
         self.back_forks.append(
             Fork([
                 name for name in bidir.prototype.apply.sequences
                 if name != 'mask'
             ],
                  prototype=Linear(),
                  name='back_fork%d' % i))
     self.children = [self.lookup] \
                     + self.bidirs \
                     + self.fwd_forks \
                     + self.back_forks
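
A hypothetical instantiation of the encoder documented by the docstring above; the dimensions are arbitrary, and weights_init/biases_init would still have to be set on the brick before calling initialize().

encoder = DeepBidirectionalEncoder(vocab_size=30000,
                                   embedding_dim=620,
                                   n_layers=2,
                                   skip_connections=False,
                                   state_dim=1000,
                                   name='encoder')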
Example 22
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    #
    W = LookupTable(
        name="W1",
        #dim = hidden_dim*4,
        dim=hidden_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(hidden_dim,
                 name='H',
                 weights_init=initialization.IsotropicGaussian(0.01),
                 biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    #
    S = Linear(name="W2",
               input_dim=hidden_dim,
               output_dim=vocab_dim,
               weights_init=initialization.IsotropicGaussian(0.01),
               biases_init=initialization.Constant(0))

    A = NDimensionalSoftmax(name="softmax")

    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)  #[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return cg, layers, y_hat, cost
Example 23
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        self.transition = GRUInitialState(attended_dim=state_dim,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim,
                          merge_prototype=Linear(use_bias=True))

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Example 24
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        super(GatedRecurrent, self).__init__(**kwargs)
        self.dim = dim

        self.recurrent_weights_init = None
        self.initial_states_init = None

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        self.children = [activation, gate_activation]
Example 25
    def __init__(self, feature_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalAudioEncoder, self).__init__(**kwargs)
        self.feature_size = feature_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.embedding = BidirectionalWMT15(GatedRecurrent(activation=Tanh(), dim=state_dim), name="audio_embeddings")
        self.embedding_fwd_fork = Fork(
            [name for name in self.embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='embedding_fwd_fork')
        self.embedding_back_fork = Fork(
            [name for name in self.embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='embedding_back_fork')

        self.bidir = BidirectionalWMT15(GatedRecurrent(activation=Tanh(), dim=state_dim), name="audio_representation")
        self.fwd_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='fwd_fork')
        self.back_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='back_fork')

        self.children = [self.bidir, self.embedding,
                         self.fwd_fork, self.back_fork, self.embedding_fwd_fork, self.embedding_back_fork]
Example 26
def apply_fc(x, fc_layers, fc_ws, fc_bs):
    out = x
    for layer in fc_layers:
        name, shape, act = layer
        w = {w.name: w for w in fc_ws}[name + '_w']
        b = {b.name: b for b in fc_bs}[name + '_b']
        if act == 'relu':
            act = Rectifier().apply
        elif act == 'tanh':
            act = Tanh().apply
        elif act == 'lin':
            act = lambda n: n
        out = tensor.dot(out, w)
        out = act(out + b)
    return out
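
A minimal usage sketch for apply_fc: the (name, (n_in, n_out), activation) layer-spec format and the '<name>_w' / '<name>_b' naming of the shared variables are assumptions inferred from the dictionary lookups above.

import numpy
import theano
from theano import tensor

fc_layers = [('fc1', (128, 64), 'relu'),
             ('fc2', (64, 10), 'lin')]
fc_ws = [theano.shared(numpy.zeros(shape, dtype=theano.config.floatX),
                       name=name + '_w') for name, shape, _ in fc_layers]
fc_bs = [theano.shared(numpy.zeros(shape[1], dtype=theano.config.floatX),
                       name=name + '_b') for name, shape, _ in fc_layers]

x = tensor.matrix('x')
f = theano.function([x], apply_fc(x, fc_layers, fc_ws, fc_bs))
print(f(numpy.ones((2, 128), dtype=theano.config.floatX)).shape)  # (2, 10)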
Example 27
    def __init__(self, dim, activation=None, mlp=None, **kwargs):
        super(HardGatedRecurrent, self).__init__(**kwargs)
        self.dim = dim

        if not activation:
            activation = Tanh()
        self.activation = activation

        # The activation of the mlp should be a Logistic function
        self.mlp = mlp

        # The random stream
        self.randomstream = MRG_RandomStreams()

        self.children = [activation, mlp]
Example 28
    def __init__(self, dim, activation=None, gate_activation=None,
                 model_type=6, ogates_zoneout=False, **kwargs):
        self.dim = dim
        self.model_type = model_type
        self.ogates_zoneout = ogates_zoneout

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [self.activation, self.gate_activation]
        kwargs.setdefault('children', []).extend(children)
        super(ZoneoutLSTM, self).__init__(**kwargs)
Example 29
    def __init__(self, enc_transition, dims, dim_input, subsample, bidir,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.subsample = subsample

        dims_under = [dim_input] + list(
            (2 if bidir else 1) * numpy.array(dims))
        for layer_num, (dim_under, dim) in enumerate(zip(dims_under, dims)):
            layer = RecurrentWithFork(enc_transition(dim=dim,
                                                     activation=Tanh()).apply,
                                      dim_under,
                                      name='with_fork{}'.format(layer_num))
            if bidir:
                layer = Bidirectional(layer, name='bidir{}'.format(layer_num))
            self.children.append(layer)
        self.dim_encoded = (2 if bidir else 1) * dims[-1]
Example 30
def test_super_in_recurrent_overrider():
    # A regression test for the issue #475
    class SimpleRecurrentWithContext(SimpleRecurrent):
        @application(contexts=['context'])
        def apply(self, context, *args, **kwargs):
            kwargs['inputs'] += context
            return super(SimpleRecurrentWithContext, self).apply(*args,
                                                                 **kwargs)

        @apply.delegate
        def apply_delegate(self):
            return super(SimpleRecurrentWithContext, self).apply

    brick = SimpleRecurrentWithContext(100, Tanh())
    inputs = tensor.tensor3('inputs')
    context = tensor.matrix('context').dimshuffle('x', 0, 1)
    brick.apply(context, inputs=inputs)
Example 31
 def _build_bricks(self, *args, **kwargs):
     super(AttentionEUTHM2, self)._build_bricks()
     self.word_shift = MLP(
         activations=[Tanh('word_shift_tanh')],
         dims=[
             self.config.user_embed_dim + self.config.word_embed_dim,
             self.config.word_embed_dim
         ],
         name='word_shift_mlp')
     self.word_shift.weights_init = IsotropicGaussian(
         std=1 / numpy.sqrt(self.config.word_embed_dim +
                            self.config.user_embed_dim))
     self.word_shift.biases_init = Constant(0)
     self.word_shift.initialize()
     self.word_shift_bias = Bias(dim=1, name='word_shift_bias')
     self.word_shift_bias.biases_init = Constant(0)
     self.word_shift_bias.initialize()
Example 32
def test_integer_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(name="transition",
                                activation=Tanh(),
                                dim=dim,
                                weights_init=Orthogonal())
    generator = SequenceGenerator(LinearReadout(
        readout_dim=readout_dim,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(readout_dim, feedback_dim),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.initialize()

    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask],
                                [costs])(numpy.zeros((n_steps, batch_size),
                                                     dtype='int64'),
                                         numpy.ones((n_steps, batch_size),
                                                    dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = generator.generate(iterate=True,
                                                batch_size=batch_size,
                                                n_steps=n_steps)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=costs.owner.inputs[0].owner.tag.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
Example 33
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim / 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in
                          self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]
Example 34
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        bricks.append(embed)

        qembed = embed.apply(question)
        cembed = embed.apply(context)

        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + qlstms + clstms

        # Calculate question encoding (concatenate layer1)
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate context encoding (concatenate layer1)
        if config.ctx_skip_connections:
            cenc_dim = 2*sum(config.ctx_lstm_size)
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq')
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]
        layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2])))
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
        attended.name = 'attended'

        # Now we can calculate our output
        out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                      activations=config.out_mlp_activations + [Identity()],
                      name='out_mlp')
        bricks += [out_mlp]
        probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        probs.name = 'probs'

        is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :],
                                 tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
        probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

        # Calculate prediction, cost and error rate
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Apply dropout
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg, error_rate_reg] = cg.outputs

        # Other stuff
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example 35
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.
    Parameters
    ----------
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1
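        # e.g. frame_size=10, frnn_step_size=4 gives number_of_steps=3
        # (chunks of 4 + 4 + 2); emit()/cost() below truncate the padded
        # final chunk back to frame_size.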

        self.mu = MLP(activations=[Identity()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_mu")
        self.sigma = MLP(
            activations=[SoftPlus()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_sigma"
        )

        self.coeff = MLP(activations=[Identity()], dims=[frnn_hidden_size, k], name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size, name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]

    @application
    def emit(self, readouts):
        """
        keep_parameters is True if mu,sigma,coeffs must be stacked and returned
        if false, only the result is given, the others will be empty list.

        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1], self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals=coeff, dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis = -1) use this line for using most likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0, std=1.0, dtype=mu.dtype)

            result = mu + sigma * epsilon  # *0.6 #reduce variance.
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            # this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) + self.frnn_linear_transition_input.apply(result)
                )

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1) + [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            # take the next chunk of the teacher-forced target frame
            index = i * self.frnn_step_size
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1) + [slice(index, index + self.frnn_step_size)])
            ]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(freq_inputs)
                )

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        sigmas = sigmas.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        # this reshape is probably not necessary
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff, frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
Example 36
def get_prediction_function():
	question = tensor.imatrix('question')
	question_mask = tensor.imatrix('question_mask')
	context = tensor.imatrix('context')
	context_mask = tensor.imatrix('context_mask')
	answer = tensor.ivector('answer')
	candidates = tensor.imatrix('candidates')
	candidates_mask = tensor.imatrix('candidates_mask')

	"""	
	question = question.dimshuffle(1, 0)
	question_mask = question_mask.dimshuffle(1, 0)
	context = context.dimshuffle(1, 0)
	context_mask = context_mask.dimshuffle(1, 0)
	"""
	# Embed questions and context
	embed = bricks[-5]

	qembed = embed.apply(question.dimshuffle(1, 0))
	cembed = embed.apply(context.dimshuffle(1, 0))
	global _qembed,_cembed
	_qembed = theano.function([question], qembed)
	_cembed = theano.function([context], cembed)

	qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.dimshuffle(1, 0).astype(theano.config.floatX),
												config.question_lstm_size, config.question_skip_connections, 'q')
	chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.dimshuffle(1, 0).astype(theano.config.floatX),
												config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
	
	global _qhidden, _chidden
	_qhidden = theano.function([question, question_mask], qhidden_list)
	_chidden = theano.function([context, context_mask], chidden_list)

	# Calculate question encoding (concatenate layer1)
	if config.question_skip_connections:
		qenc_dim = 2*sum(config.question_lstm_size)
		qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
	else:
		qenc_dim = 2*config.question_lstm_size[-1]
		qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
	qenc.name = 'qenc'

	# Calculate context encoding (concatenate layer1)
	if config.ctx_skip_connections:
		cenc_dim = 2*sum(config.ctx_lstm_size)
		cenc = tensor.concatenate(chidden_list, axis=2)
	else:
		cenc_dim = 2*config.ctx_lstm_size[-1]
		cenc = tensor.concatenate(chidden_list[-2:], axis=2)
	cenc.name = 'cenc'

	global _qenc, _cenc
	_qenc = theano.function([question, question_mask], qenc)	
	_cenc = theano.function([context, context_mask], cenc)	

	# Attention mechanism MLP
	attention_mlp = bricks[-2]      #attention_mlp  
	attention_qlinear = bricks[4]	#attq
	attention_clinear = bricks[11] # attc
	layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
							.reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear.apply(qenc)[None, :, :])

	global _attention_clinear, _attention_qlinear
	_attention_clinear = theano.function([context, context_mask], attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))).reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0])))
	_attention_qlinear = theano.function([question, question_mask], attention_qlinear.apply(qenc)[None, :, :])

	layer1.name = 'layer1'
	att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2])))
	att_weights.name = 'att_weights_0'
	att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
	att_weights.name = 'att_weights'

	attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
	attended.name = 'attended'

	global _attended
	_attended = theano.function([question, question_mask, context, context_mask], attended)

	# Now we can calculate our output
	out_mlp = bricks[-1] #out_mlp
	probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
	probs.name = 'probs'

	f = theano.function([question, question_mask, context, context_mask], probs)
	return f
Example 37
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim / 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in
                          self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['representation', 'target_sentence_mask',
                         'target_sentence'], outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        contexts = {key: value.dimshuffle('x', 0, 1)
                    if key not in self.transition.apply.states else value
                    for key, value
                    in self.fork.apply(representation, as_dict=True).items()}
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {'mask': target_sentence_mask,
                       'outputs': target_sentence,
                       'readout_context': representation.dimshuffle('x', 0, 1)}
        ))

        return (cost * target_sentence_mask).sum() / target_sentence_mask.shape[1]
Example 38
    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size, frnn_step_size, const=1e-5, **kwargs):

        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim

        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_mu")
        self.sigma = MLP(
            activations=[SoftPlus()], dims=[frnn_hidden_size, k * frnn_step_size], name=self.name + "_sigma"
        )

        self.coeff = MLP(activations=[Identity()], dims=[frnn_hidden_size, k], name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim, output_dim=frnn_hidden_size, name="frnn_initial_state"
        )

        # self.frnn_hidden = Linear(
        #    input_dim=frnn_hidden_size,
        #    output_dim=frnn_hidden_size,
        #    activation=Tanh(),
        #    name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_state"
        )

        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size, output_dim=frnn_hidden_size, name="frnn_linear_transition_input"
        )

        # self.frnn_linear_transition_output = Linear (
        #    input_dim = frnn_hidden_size,
        #    output_dim = self.rnn_hidden_dim,
        #    name="frnn_linear_transition_output")

        self.children = [
            self.mlp,
            self.mu,
            self.sigma,
            self.coeff,
            self.coeff2,
            self.frnn_initial_state,
            self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input,
        ]
Example 39
    def __init__(self, config):
        inp = tensor.imatrix('bytes')

        embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX),
                              name='embedding_matrix')
        in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim))
        in_repr.name = 'in_repr'

        bricks = []
        states = []

        # Construct predictive GRU hierarchy
        hidden = []
        costs = []
        next_target = in_repr.dimshuffle(1, 0, 2)
        for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                                   config.cost_factors,
                                                   config.hidden_q)):
            init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
                                       name='st0_%d'%i)

            linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
                            name="lstm_in_%d"%i)
            lstm = GatedRecurrent(dim=hdim, activation=config.activation_function,
                        name="lstm_rec_%d"%i)
            linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i)
            tanh = Tanh('lstm_out_tanh_%d'%i)
            bricks += [linear, lstm, linear2, tanh]
            if i > 0:
                linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim,
                                 name='lstm_in2_%d'%i)
                bricks += [linear1]

            next_target = tensor.cast(next_target, dtype=theano.config.floatX)
            inter = linear.apply(theano.gradient.disconnected_grad(next_target))
            if i > 0:
                inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:]))
            new_hidden = lstm.apply(inputs=inter[:,:,:hdim],
                                    gate_inputs=inter[:,:,hdim:],
                                    states=init_state)
            states.append((init_state, new_hidden[-1, :, :]))

            hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)]
            pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:]))
            costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()]
            costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]
            diff = next_target - pred
            next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)


        # Construct output from hidden states
        hidden = [s.dimshuffle(1, 0, 2) for s in hidden]

        out_parts = []
        out_dims = config.out_hidden + [config.io_dim]
        for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
            pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                                name='pred_linear_%d'%i)
            bricks.append(pred_linear)
            lin = theano.gradient.disconnected_grad(state)
            out_parts.append(pred_linear.apply(lin))

        # Do prediction and calculate cost
        out = sum(out_parts)

        if len(out_dims) > 1:
            out = config.out_hidden_act[0](name='out_act0').apply(out)
            mlp = MLP(dims=out_dims,
                      activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])]
                                 +[Identity()],
                      name='out_mlp')
            bricks.append(mlp)
            out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1))
                           ).reshape((inp.shape[0],inp.shape[1]+1,-1))

        pred = out.argmax(axis=2)

        cost = Softmax().categorical_cross_entropy(inp.flatten(),
                                                   out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1],
                                                                config.io_dim))).mean()
        error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean()

        sgd_cost = cost + sum(costs)
            
        # Initialize all bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()

        # apply noise
        cg = ComputationGraph([sgd_cost, cost, error_rate]+costs)
        if config.weight_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.weight_noise)
        sgd_cost = cg.outputs[0]
        cost = cg.outputs[1]
        error_rate = cg.outputs[2]
        costs = cg.outputs[3:]


        # put things into self that are useful for training or extensions
        self.sgd_cost = sgd_cost

        sgd_cost.name = 'sgd_cost'
        for i in range(len(costs)):
            costs[i].name = 'pred_cost_%d'%i
        cost.name = 'cost'
        error_rate.name = 'error_rate'
        self.monitor_vars = [costs, [cost],
                             [error_rate]]

        self.out = out[:,1:,:]
        self.pred = pred[:,1:]

        self.states = states
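
The constructor above reads everything from a config object. The following is a hypothetical minimal config, an assumption rather than the original: the field names are taken from the attribute accesses above, the values are illustrative only.

# Hypothetical config for the byte-level predictive GRU hierarchy above.
class Config:
    num_seqs = 32                      # batch size (number of parallel sequences)
    io_dim = 256                       # byte vocabulary / output size
    repr_dim = 256                     # input representation dimension
    embedding_matrix = numpy.eye(256, dtype='float32')   # one-hot byte embedding
    hidden_dims = [512, 512]           # one GatedRecurrent layer per entry
    cost_factors = [1.0, 1.0]          # weight of each layer's prediction cost
    hidden_q = [0.1, 0.1]              # sparsity penalty weight per layer
    activation_function = Tanh()
    out_hidden = [512]                 # hidden sizes of the output MLP
    out_hidden_act = [Tanh]            # one activation class per out_hidden entry
    weights_init = IsotropicGaussian(0.05)
    biases_init = Constant(0)
    weight_noise = 0.01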
Example n. 40
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')

        # set up 32-bit integer matrices
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        # and the multiple-choice answers:
        ans1 = tensor.ivector('ans1')
        ans1_mask = tensor.ivector('ans1_mask')
        ans2 = tensor.ivector('ans2')
        ans2_mask = tensor.ivector('ans2_mask')
        ans3 = tensor.ivector('ans3')
        ans3_mask = tensor.ivector('ans3_mask')
        ans4 = tensor.ivector('ans4')
        ans4_mask = tensor.ivector('ans4_mask')

        bricks = []

        # swap the 1st and 2nd dimensions (time-major layout)
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        bricks.append(embed)

        qembed = embed.apply(question)
        cembed = embed.apply(context)
        a1embed = embed.apply(ans1)
        a2embed = embed.apply(ans2)
        a3embed = embed.apply(ans3)
        a4embed = embed.apply(ans4)

        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')
        clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX),
                                                     config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
        bricks = bricks + qlstms + clstms

        # Calculate the question encoding (concatenate the final forward/backward states)
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # Calculate the context encoding (concatenate the per-timestep forward/backward states)
        if config.ctx_skip_connections:
            cenc_dim = 2*sum(config.ctx_lstm_size)
            cenc = tensor.concatenate(chidden_list, axis=2)
        else:
            cenc_dim = 2*config.ctx_lstm_size[-1]
            cenc = tensor.concatenate(chidden_list[-2:], axis=2)
        cenc.name = 'cenc'

        # Attention mechanism MLP
        attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                            activations=config.attention_mlp_activations[1:] + [Identity()],
                            name='attention_mlp')
        attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq')
        attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc')
        bricks += [attention_mlp, attention_qlinear, attention_clinear]
        layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2])))
                                        .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0]))
                             + attention_qlinear.apply(qenc)[None, :, :])
        layer1.name = 'layer1'
        att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2])))
        att_weights.name = 'att_weights_0'
        att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
        att_weights.name = 'att_weights'

        attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
        attended.name = 'attended'

        # Now we can calculate our output
        out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities],
                      activations=config.out_mlp_activations + [Identity()],
                      name='out_mlp')
        bricks += [out_mlp]
        probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
        probs.name = 'probs'

        # not needed anymore, since we're not only looking at entities
        # is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :],
        #                          tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
        # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))

        # Calculate prediction, cost and error rate

        # vocab = tensor.arange(10)
        # probs = numpy.asarray([0, 0.8, 0, 0.2], dtype=numpy.float32)
        # context = numpy.asarray([3, 2, 8, 1], dtype=numpy.int32)
        # ans3 =  numpy.asarray([2, 8, 1], dtype=numpy.int32)
        # ans1 =  numpy.asarray([1, 3, 4], dtype=numpy.int32)
        # ans2 =  numpy.asarray([1, 1, 4], dtype=numpy.int32)

        # convert probs to a vector that's the same size as the vocab, with zeros
        # everywhere except at the context token positions (see the toy example above;
        # the original indexed with `cembed`, the embedded context, which cannot be
        # used as an index, so the context token ids are used here). Note that, as in
        # the toy example, this assumes unbatched 1-D tensors.
        # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs))
        probsPadded = tensor.zeros((vocab_size,), dtype=theano.config.floatX)
        probsSubset = probsPadded[context]  # TODO this should be masked
        b = tensor.set_subtensor(probsSubset, probs)

        # get the similarity score of each (masked) answer with the context probs;
        # the original referenced undefined `a1enc`..`a4enc`, so the answer token ids
        # are used instead, and every answer is masked like the first one:
        ans1probs = b[ans1]
        ans1score = tensor.switch(ans1_mask, ans1probs, tensor.zeros_like(ans1probs)).sum()
        ans2probs = b[ans2]
        ans2score = tensor.switch(ans2_mask, ans2probs, tensor.zeros_like(ans2probs)).sum()
        ans3probs = b[ans3]
        ans3score = tensor.switch(ans3_mask, ans3probs, tensor.zeros_like(ans3probs)).sum()
        ans4probs = b[ans4]
        ans4score = tensor.switch(ans4_mask, ans4probs, tensor.zeros_like(ans4probs)).sum()

        # and pick the best one:
        allans = tensor.stacklists([ans1score, ans2score, ans3score, ans4score])
        pred = tensor.argmax(allans)

        # debug-only graph check; compiling and calling it here would require feeding
        # the input variables, so it is left commented out:
        # cg = ComputationGraph([ans1probs, ans1score, ans2probs, ans2score,
        #                        ans3probs, ans3score, ans4probs, ans4score, allans, pred])
        # f = cg.get_theano_function()
        # out = f()

        #pred = probs.argmax(axis=1)
        #print "pred"
        #print pred TODO CHANGE THIS!
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Apply weight noise and dropout
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg, error_rate_reg] = cg.outputs

        # Other stuff
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'


        self.probs = probs
        self.probs.name = "probs"
        self.cost = cost
        self.cost.name = "cost"
        #
        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
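
As with the previous example, this constructor is driven by a config object (plus `vocab_size`, which is passed in explicitly). A hypothetical minimal config is sketched below; the field names mirror the attribute accesses above, while the values are illustrative assumptions, not the original settings.

# Hypothetical config for the attentive reader above.
class Config:
    embed_size = 200
    question_lstm_size = [256]
    question_skip_connections = False
    ctx_lstm_size = [256]
    ctx_skip_connections = False
    attention_mlp_hidden = [100]
    attention_mlp_activations = [Tanh()]   # only entries after the first feed the MLP above
    out_mlp_hidden = []                    # empty: the output MLP is a single linear layer
    out_mlp_activations = []
    n_entities = 550                       # size of the output layer
    dropout = 0.1
    w_noise = 0.05
    weights_init = IsotropicGaussian(0.01)
    biases_init = Constant(0)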