    def __init__(self, feature_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalPhonemeAudioEncoder, self).__init__(**kwargs)
        self.feature_size = feature_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.audio_embedding = BidirectionalWMT15(GatedRecurrent(activation=Tanh(), dim=state_dim), name="audio_embeddings")
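        # The Fork bricks produce the GRU's non-mask input sequences
        # ('inputs' and 'gate_inputs') separately for the forward and
        # backward directions; the same pattern repeats for the phoneme
        # and word encoders below.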
        self.audio_fwd_fork = Fork(
            [name for name in self.audio_embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='audio_fwd_fork')
        self.audio_back_fork = Fork(
            [name for name in self.audio_embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='audio_back_fork')

        self.phoneme_embedding = BidirectionalWMT15(GatedRecurrent(activation=Tanh(), dim=state_dim), name="phoneme_embeddings")
        self.phoneme_fwd_fork = Fork(
            [name for name in self.phoneme_embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='phoneme_fwd_fork')
        self.phoneme_back_fork = Fork(
            [name for name in self.phoneme_embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='phoneme_back_fork')

        self.words_embedding = BidirectionalWMT15(GatedRecurrent(activation=Tanh(), dim=state_dim), name="words_embeddings")
        self.words_fwd_fork = Fork(
            [name for name in self.words_embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='words_fwd_fork')
        self.words_back_fork = Fork(
            [name for name in self.words_embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='words_back_fork')

        self.children = [self.phoneme_embedding, self.audio_embedding,
                         self.words_embedding, self.phoneme_fwd_fork,
                         self.phoneme_back_fork, self.audio_fwd_fork,
                         self.audio_back_fork, self.words_fwd_fork,
                         self.words_back_fork]
Example #2
    def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalPhonesEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.lookup = LookupTable(name='phones_embeddings')
        self.embedding = BidirectionalWMT15(GatedRecurrent(activation=Tanh(), dim=state_dim), name="audio_embeddings")
        self.embedding_fwd_fork = Fork(
            [name for name in self.embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='embedding_fwd_fork')
        self.embedding_back_fork = Fork(
            [name for name in self.embedding.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='embedding_back_fork')

        self.bidir = BidirectionalWMT15(GatedRecurrent(activation=Tanh(), dim=state_dim), name="audio_representation")
        self.fwd_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='fwd_fork')
        self.back_fork = Fork(
            [name for name in self.bidir.prototype.apply.sequences
             if name != 'mask'], prototype=Linear(), name='back_fork')

        self.children = [self.lookup, self.bidir, self.embedding,
                         self.fwd_fork, self.back_fork, self.embedding_fwd_fork, self.embedding_back_fork]
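
A typical way such encoder bricks are wired up before use in Blocks is to assign initialization schemes and then call initialize(); a minimal sketch, assuming the initialization bricks (IsotropicGaussian, Constant, Orthogonal) come from blocks.initialization and that the dimensions are purely illustrative:

encoder = BidirectionalPhonesEncoder(vocab_size=50, embedding_dim=100,
                                     state_dim=250)
encoder.weights_init = IsotropicGaussian(0.01)
encoder.biases_init = Constant(0)
encoder.push_initialization_config()
encoder.bidir.prototype.weights_init = Orthogonal()
encoder.initialize()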
Example #3
 def setUp(self):
     self.gated = GatedRecurrent(
         dim=3, activation=Tanh(),
         gate_activation=Tanh(), weights_init=Constant(2))
     self.gated.initialize()
     self.reset_only = GatedRecurrent(
         dim=3, activation=Tanh(),
         gate_activation=Tanh(),
         weights_init=IsotropicGaussian(), seed=1)
     self.reset_only.initialize()
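
A hypothetical companion test method, showing how a brick configured this way is applied to symbolic sequences; the tensor names and shape comments are illustrative ('inputs' and 'gate_inputs' are GatedRecurrent's non-mask sequences, as the Fork in Example #15 also shows):

 def test_apply_shape(self):
     x = tensor.tensor3('x')   # inputs, (time, batch, 3)
     z = tensor.tensor3('z')   # gate inputs, (time, batch, 2 * 3)
     h = self.gated.apply(inputs=x, gate_inputs=z)
     assert h.ndim == 3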
Example #4
 def setUp(self):
     self.gated = GatedRecurrent(
         dim=3, weights_init=Constant(2),
         activation=Tanh(), gate_activation=Tanh())
     self.gated.initialize()
     self.reset_only = GatedRecurrent(
         dim=3, weights_init=IsotropicGaussian(),
         activation=Tanh(), gate_activation=Tanh(),
         use_update_gate=False, rng=numpy.random.RandomState(1))
     self.reset_only.initialize()
Example #5
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim,
                output_dims=[dim, dim * 2],
                name='fork',
                output_names=["linear", "gates"],
                weights_init=initialization.Identity(),
                biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim,
                         weights_init=initialization.Identity(),
                         biases_init=Constant(0))

    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
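    # 'linear' carries dim features per step and 'gate_inputs' carries
    # 2 * dim (update and reset gates), matching the Fork's output_dims.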
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=dim,
                     output_dim=dim,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
Example #6
    def __init__(
            self,
            encoder_type,
            num_characters,
            input_dim,
            encoder_dim,
            **kwargs):
        assert encoder_type in [None, 'bidirectional']
        self.encoder_type = encoder_type
        super(Encoder, self).__init__(**kwargs)

        self.children = []

        if encoder_type in ['lookup', 'bidirectional']:
            self.embed_label = LookupTable(
                num_characters,
                input_dim,
                name='embed_label')
            self.children += [
                self.embed_label]
        else:
            # If there is no encoder.
            assert num_characters == input_dim

        if encoder_type == 'bidirectional':
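            # RecurrentWithFork wraps the GRU so its input and gate
            # sequences are produced from input_dim-sized features;
            # Bidirectional runs the wrapped transition in both directions.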
            transition = RecurrentWithFork(
                GatedRecurrent(dim=encoder_dim).apply,
                input_dim, name='encoder_transition')
            self.encoder = Bidirectional(transition, name='encoder')
            self.children.append(self.encoder)
Example #7
    def __init__(self, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        # Dimension of the word embeddings taken as input
        self.embedding_dim = embedding_dim
        # Hidden state dimension
        self.state_dim = state_dim

        # The bidir GRU
        self.bidir = BidirectionalFromDict(
            GatedRecurrent(activation=Tanh(), dim=state_dim))
        # Forks to administer the inputs of GRU gates
        self.fwd_fork = Fork([
            name
            for name in self.bidir.prototype.apply.sequences if name != 'mask'
        ],
                             prototype=Linear(),
                             name='fwd_fork')
        self.back_fork = Fork([
            name
            for name in self.bidir.prototype.apply.sequences if name != 'mask'
        ],
                              prototype=Linear(),
                              name='back_fork')

        self.children = [self.bidir, self.fwd_fork, self.back_fork]
Example #8
    def __init__(self, hidden_size_recurrent, k, **kwargs):
        super(Scribe, self).__init__(**kwargs)

        readout_size = 6 * k + 1
        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     name="gru_{}".format(i))
                      for i in range(3)]

        transition = RecurrentStack(transition,
                                    name="transition",
                                    skip_connections=True)

        emitter = BivariateGMMEmitter(k=k)

        source_names = [name for name in transition.apply.states
                        if 'states' in name]

        readout = Readout(
            readout_dim=readout_size,
            source_names=source_names,
            emitter=emitter,
            name="readout")

        self.generator = SequenceGenerator(readout=readout,
                                           transition=transition,
                                           name="generator")

        self.children = [self.generator]
Example #9
 def __init__(self, embedding_dim, state_dim, **kwargs):
     """Constructor. Note that this implementation only supports
     single layer architectures.
     
     Args:
         embedding_dim (int): Dimensionality of the word vectors
                              defined by the sparse feature map.
         state_dim (int): Size of the recurrent layer.
     """
     super(NoLookupEncoder, self).__init__(**kwargs)
     self.embedding_dim = embedding_dim
     self.state_dim = state_dim
     self.bidir = BidirectionalWMT15(
         GatedRecurrent(activation=Tanh(), dim=state_dim))
     self.fwd_fork = Fork([
         name
         for name in self.bidir.prototype.apply.sequences if name != 'mask'
     ],
                          prototype=Linear(),
                          name='fwd_fork')
     self.back_fork = Fork([
         name
         for name in self.bidir.prototype.apply.sequences if name != 'mask'
     ],
                           prototype=Linear(),
                           name='back_fork')
     self.children = [self.bidir, self.fwd_fork, self.back_fork]
Example #10
def gru_layer(dim, h, n):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n), input_dim=dim, output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)
Example #11
def test_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(
        name="transition", activation=Tanh(), dim=dim,
        weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=output_dim, source_names=["states"],
                      emitter=TestEmitter(name="emitter"), name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.initialize()

    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask], [costs])(
        numpy.zeros((n_steps, batch_size, output_dim), dtype=floatX),
        numpy.ones((n_steps, batch_size), dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = [variable.eval() for variable in
                              generator.generate(
                                  iterate=True, batch_size=batch_size,
                                  n_steps=n_steps)]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
Example #12
    def __init__(self,
                 k=20,
                 rec_h_dim=400,
                 att_size=10,
                 num_letters=68,
                 sampling_bias=0.,
                 attention_type="graves",
                 epsilon=1e-6,
                 attention_alignment=1.,
                 **kwargs):
        super(Scribe, self).__init__(**kwargs)

        # For now only softmax and graves are supported.
        assert attention_type in ["graves", "softmax"]

        readouts_dim = 1 + 6 * k

        self.k = k
        self.rec_h_dim = rec_h_dim
        self.att_size = att_size
        self.num_letters = num_letters
        self.sampling_bias = sampling_bias
        self.attention_type = attention_type
        self.epsilon = epsilon
        self.attention_alignment = attention_alignment

        self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')
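        # The Fork bricks below each emit cell1's two input streams:
        # 'cell1_inputs' (rec_h_dim) and 'cell1_gates' (2 * rec_h_dim).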

        self.inp_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                              input_dim=3,
                              output_dims=[rec_h_dim, 2 * rec_h_dim],
                              name='inp_to_h1')

        self.h1_to_readout = Linear(input_dim=rec_h_dim,
                                    output_dim=readouts_dim,
                                    name='h1_to_readout')

        self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                              input_dim=rec_h_dim,
                              output_dims=[att_size] * 3,
                              name='h1_to_att')

        self.att_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                              input_dim=num_letters,
                              output_dims=[rec_h_dim, 2 * rec_h_dim],
                              name='att_to_h1')

        self.att_to_readout = Linear(input_dim=num_letters,
                                     output_dim=readouts_dim,
                                     name='att_to_readout')

        self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

        self.children = [
            self.cell1, self.inp_to_h1, self.h1_to_readout, self.h1_to_att,
            self.att_to_h1, self.att_to_readout, self.emitter
        ]
Example #13
def gru_layer(dim, h, n, x_mask, first, **kwargs):
    fork = Fork(output_names=['linear' + str(n), 'gates' + str(n)],
                name='fork' + str(n),
                input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + str(n))
    initialize([fork, gru])
    linear, gates = fork.apply(h)
    if first:
        gruApply = gru.apply(linear, gates, mask=x_mask, **kwargs)
    else:
        gruApply = gru.apply(linear, gates, **kwargs)
    return gruApply
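
A sketch of how this helper could be chained into a two-layer stack; the tensor names, mask and dimension below are illustrative, not part of the original code:

x = tensor.tensor3('x')           # (time, batch, 256) features
x_mask = tensor.matrix('x_mask')  # (time, batch) padding mask
h1 = gru_layer(256, x, 1, x_mask, first=True)
h2 = gru_layer(256, h1, 2, x_mask, first=False)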
Example #14
 def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
              state_dim, **kwargs):
     """Sole constructor.
     
     Args:
         vocab_size (int): Source vocabulary size
         embedding_dim (int): Dimension of the embedding layer
         n_layers (int): Number of layers. Layers share the same
                         weight matrices.
         skip_connections (bool): Skip connections connect the
                                  source word embeddings directly 
                                  with deeper layers to propagate 
                                  the gradient more efficiently
         state_dim (int): Number of hidden units in the recurrent
                          layers.
     """
     super(DeepBidirectionalEncoder, self).__init__(**kwargs)
     self.vocab_size = vocab_size
     self.embedding_dim = embedding_dim
     self.n_layers = n_layers
     self.state_dim = state_dim
     self.skip_connections = skip_connections
     self.lookup = LookupTable(name='embeddings')
     self.bidirs = []
     self.fwd_forks = []
     self.back_forks = []
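      # One bidirectional GRU plus a forward and a backward input fork
      # are created per encoder layer.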
     for i in xrange(self.n_layers):
         bidir = BidirectionalWMT15(GatedRecurrent(activation=Tanh(),
                                                   dim=state_dim),
                                    name='bidir%d' % i)
         self.bidirs.append(bidir)
         self.fwd_forks.append(
             Fork([
                 name for name in bidir.prototype.apply.sequences
                 if name != 'mask'
             ],
                  prototype=Linear(),
                  name='fwd_fork%d' % i))
         self.back_forks.append(
             Fork([
                 name for name in bidir.prototype.apply.sequences
                 if name != 'mask'
             ],
                  prototype=Linear(),
                  name='back_fork%d' % i))
     self.children = [self.lookup] \
                     + self.bidirs \
                     + self.fwd_forks \
                     + self.back_forks
Example #15
    def __init__(self, dimension, input_size, embed_input=False, **kwargs):
        super(GRUEncoder, self).__init__(**kwargs)
        if embed_input:
            self.embedder = LookupTable(input_size, dimension)
        else:
            self.embedder = Linear(input_size, dimension)
        self.fork = Fork(['inputs', 'gate_inputs'],
                         dimension,
                         output_dims=[dimension, 2 * dimension],
                         prototype=Linear())
        encoder = Bidirectional(
            GatedRecurrent(dim=dimension, activation=Tanh()))

        self.encoder = encoder
        self.children = [encoder, self.embedder, self.fork]
Example #16
    def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
        self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')
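        # Both forks below produce the inner GRU's non-mask sequences;
        # one is driven by the inner input, the other by the outer input.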

        self.inner_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=inner_input_dim, name='inner_input_fork')
        self.outer_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=outer_input_dim, name='inner_outer_fork')

        super(InnerRecurrent, self).__init__(**kwargs)

        self.children = [
            self.inner_gru, self.inner_input_fork, self.outer_input_fork]
Example #17
def test_integer_sequence_generator():
    # Disclaimer: here we only check shapes, not values.

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(name="transition",
                                activation=Tanh(),
                                dim=dim,
                                weights_init=Orthogonal())
    generator = SequenceGenerator(LinearReadout(
        readout_dim=readout_dim,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(readout_dim, feedback_dim),
        name="readout"),
                                  transition,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  name="generator")
    generator.initialize()

    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask],
                                [costs])(numpy.zeros((n_steps, batch_size),
                                                     dtype='int64'),
                                         numpy.ones((n_steps, batch_size),
                                                    dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = generator.generate(iterate=True,
                                                batch_size=batch_size,
                                                n_steps=n_steps)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=costs.owner.inputs[0].owner.tag.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
Example #18
    def __init__(self,
                 hidden_dim,
                 activation=None,
                 gate_activation=None,
                 state_to_state_init=None,
                 state_to_update_init=None,
                 state_to_reset_init=None,
                 input_to_state_transform=None,
                 input_to_update_transform=None,
                 input_to_reset_transform=None,
                 **kwargs):

        super(GatedRecurrentFull, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim

        self.state_to_state_init = state_to_state_init
        self.state_to_update_init = state_to_update_init
        self.state_to_reset_init = state_to_reset_init

        self.input_to_state_transform = input_to_state_transform
        self.input_to_update_transform = input_to_update_transform
        self.input_to_reset_transform = input_to_reset_transform
        self.input_to_state_transform.name += "_input_to_state_transform"
        self.input_to_update_transform.name += "_input_to_update_transform"
        self.input_to_reset_transform.name += "_input_to_reset_transform"

        self.use_mine = True
        if self.use_mine:
            self.rnn = GatedRecurrentFast(weights_init=Constant(np.nan),
                                          dim=self.hidden_dim,
                                          activation=activation,
                                          gate_activation=gate_activation)
        else:
            self.rnn = GatedRecurrent(weights_init=Constant(np.nan),
                                      dim=self.hidden_dim,
                                      activation=activation,
                                      gate_activation=gate_activation)

        self.children = [
            self.rnn, self.input_to_state_transform,
            self.input_to_update_transform, self.input_to_reset_transform
        ]
        self.children.extend(self.rnn.children)
Example #19
    def __init__(self, src_vocab_size, embedding_dim, dgru_state_dim,
                 state_dim, src_dgru_depth, bidir_encoder_depth, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.state_dim = state_dim
        self.dgru_state_dim = dgru_state_dim
        self.decimator = Decimator(src_vocab_size, embedding_dim,
                                   dgru_state_dim, src_dgru_depth)
        self.bidir = Bidirectional(RecurrentWithFork(GatedRecurrent(
            activation=Tanh(), dim=state_dim),
                                                     dgru_state_dim,
                                                     name='with_fork'),
                                   name='bidir0')

        self.children = [self.decimator, self.bidir]
        for layer_n in range(1, bidir_encoder_depth):
            self.children.append(copy.deepcopy(self.bidir))
            for child in self.children[-1].children:
                child.input_dim = 2 * state_dim
            self.children[-1].name = 'bidir{}'.format(layer_n)
Example #20
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 reverse=True,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
        self.fork = Fork([
            name for name in self.transition.apply.sequences if name != 'mask'
        ],
                         prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]
Example #21
 def __init__(self,
              base_encoder,
              state_dim=1000,
              self_attendable=False,
              **kwargs):
     """Constructor.
     
     Args:
         base_encoder (Brick): Low level encoder network which
                               produces annotations to attend to
         state_dim (int): Size of the recurrent layer.
         self_attendable (bool): If true, the annotator can attend
                                 to its own previous states. If 
                                 false it can only attend to base
                                 annotations
     """
     super(HierarchicalAnnotator, self).__init__(**kwargs)
     self.state_dim = state_dim * 2
     self.base_encoder = base_encoder
     self.self_attendable = self_attendable
     trans_core = GatedRecurrent(activation=Tanh(), dim=self.state_dim)
     if self_attendable:
         self.attention = SelfAttendableContentAttention(
             state_names=trans_core.apply.states,
             attended_dim=self.state_dim,
             match_dim=self.state_dim,
             num_steps=10,
             name="hier_attention")
     else:
         self.attention = SequenceContentAttention(
             state_names=trans_core.apply.states,
             attended_dim=self.state_dim,
             match_dim=self.state_dim,
             name="hier_attention")
     self.transition = AttentionRecurrent(trans_core,
                                          self.attention,
                                          name="hier_att_trans")
     self.children = [self.transition]
Example #22
    def __init__(self,
                 batch_size,
                 frame_size,
                 k,
                 depth,
                 size,
                 **kwargs):
        super(PyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        depth_context = depth
        hidden_size_mlp_context = 32 * size
        context_size = 32 * size

        activations_x = [Rectifier()] * depth_x

        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta

        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        activations_context = [Rectifier()] * depth_context

        dims_context = [frame_size] + \
                       [hidden_size_mlp_context] * (depth_context - 1) + \
                       [context_size]

        mlp_x = MLP(activations=activations_x,
                    dims=dims_x,
                    name="mlp_x")

        feedback = DeepTransitionFeedback(mlp=mlp_x)

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]

        transition = RecurrentStack(transition,
                                    name="transition",
                                    skip_connections=True)

        self.transition = transition

        mlp_theta = MLP(activations=activations_theta,
                        dims=dims_theta,
                        name="mlp_theta")

        mlp_gmm = GMMMLP(mlp=mlp_theta,
                         dim=target_size,
                         k=k,
                         const=0.00001,
                         name="gmm_wrap")

        gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                 output_size=frame_size,
                                 k=k)

        source_names = [name for name in transition.apply.states
                        if 'states' in name]

        attention = SimpleSequenceAttention(
            state_names=source_names,
            state_dims=[hidden_size_recurrent],
            attended_dim=context_size,
            name="attention")

        # ipdb.set_trace()
        # Verify source names
        readout = Readout(
            readout_dim=hidden_size_recurrent,
            source_names=source_names + ['feedback'] + ['glimpses'],
            emitter=gmm_emitter,
            feedback_brick=feedback,
            name="readout")

        self.generator = SequenceGenerator(readout=readout,
                                           transition=transition,
                                           attention=attention,
                                           name="generator")

        self.mlp_context = MLP(activations=activations_context,
                               dims=dims_context)

        self.children = [self.generator, self.mlp_context]
        self.final_states = []
Example #23
activations_x = [Rectifier()] * depth_x

dims_x = [frame_size] + [hidden_size_mlp_x]*(depth_x-1) + \
         [hidden_size_recurrent]

activations_theta = [Rectifier()] * depth_theta

dims_theta = [hidden_size_recurrent] + \
             [hidden_size_mlp_theta]*depth_theta

mlp_x = MLP(activations=activations_x, dims=dims_x)

feedback = DeepTransitionFeedback(mlp=mlp_x)

transition = [
    GatedRecurrent(dim=hidden_size_recurrent, name="gru_{}".format(i))
    for i in range(depth_recurrent)
]
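# Stack the per-layer GRUs into a single multi-layer recurrent transition.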

transition = RecurrentStack(transition,
                            name="transition",
                            skip_connections=True)

mlp_theta = MLP(activations=activations_theta, dims=dims_theta)

mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001)

emitter = GMMEmitter(gmmmlp=mlp_gmm,
                     output_size=frame_size,
                     k=k,
                     name="emitter")
Example #24
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99],
                                char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(GatedRecurrent(dim=dimension,
                                               activation=Tanh()),
                                weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([
            name
            for name in encoder.prototype.apply.sequences if name != 'mask'
        ],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension,
                             dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension,
                                attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension,
            name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(
                                    readout_dimension, dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in params.items()],
                                   width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets,
            targets_mask,
            attended=encoder.apply(**dict_union(fork.apply(
                lookup.lookup(chars), return_dict=True),
                                                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(VariableFilter(application=readout.readout,
                                         name="output")(cg.variables),
                          singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule([
                                        GradientClipping(10.0),
                                        SteepestDescent(0.01)
                                    ]))

        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(
                    observables, prefix="average", every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path,
                                  every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
Example #25
def train():
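    # Resume from an existing checkpoint if present; otherwise build the
    # model, cost and training algorithm from scratch.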

    if os.path.isfile('trainingdata.tar'):
        with open('trainingdata.tar', 'rb') as f:
            main = load(f)
    else:
        hidden_size = 512
        filename = 'warpeace.hdf5'

        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len, hidden_size, name='feedback'),
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout'
        )

        transition = GatedRecurrent(
            activation=Tanh(),
            dim=hidden_size)
        transition.weights_init = IsotropicGaussian(0.01)

        gen = SequenceGenerator(readout=readout,
                                transition=transition,
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                name='sequencegenerator')

        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'

        cg = ComputationGraph(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=Scale(0.5))

        train_set = encoder.get_dataset()
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(
                train_set.num_examples, batch_size=128))

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)
            ])

    main.run()
Example #26
def test_integer_sequence_generator():
    """Test a sequence generator with integer outputs.

    Such sequence generators can be used to e.g. model language.

    """
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim,
                                              feedback_dim)),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join([generator.name, generator.cost.name,
                             'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # Test generate
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph(states + outputs + costs)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.91811, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.863, rtol=1e-5)
    assert outputs_val.sum() == 630

    # Test that the cost is unaffected by mask padding
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
Example #27
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 dgru_state_dim,
                 igru_state_dim,
                 state_dim,
                 representation_dim,
                 transition_depth,
                 trg_igru_depth,
                 trg_dgru_depth,
                 trg_space_idx,
                 trg_bos,
                 theano_seed=None,
                 **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.igru_state_dim = igru_state_dim
        self.state_dim = state_dim
        self.trg_space_idx = trg_space_idx
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = RecurrentStack([
            GRUInitialState(attended_dim=state_dim,
                            dim=state_dim,
                            activation=Tanh(),
                            name='decoder_gru_withinit')
        ] + [
            GatedRecurrent(
                dim=state_dim, activation=Tanh(), name='decoder_gru' + str(i))
            for i in range(1, transition_depth)
        ],
                                         skip_connections=False)

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        self.interpolator = Interpolator(
            vocab_size=vocab_size,
            embedding_dim=embedding_dim,
            igru_state_dim=igru_state_dim,
            igru_depth=trg_igru_depth,
            trg_dgru_depth=trg_dgru_depth,
            source_names=[
                'states', 'feedback', self.attention.take_glimpses.outputs[0]
            ],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=trg_bos,
                                   theano_seed=theano_seed),
            feedback_brick=TargetWordEncoder(vocab_size, embedding_dim,
                                             self.dgru_state_dim,
                                             trg_dgru_depth))

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGeneratorDCNMT(
            trg_space_idx=self.trg_space_idx,
            readout=self.interpolator,
            transition=self.transition,
            attention=self.attention,
            transition_depth=transition_depth,
            igru_depth=trg_igru_depth,
            trg_dgru_depth=trg_dgru_depth,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))
        self.children = [self.sequence_generator]
Example #28
    def __init__(self,
                 batch_size,
                 frame_size,
                 k,
                 depth,
                 size,
                 **kwargs):
        super(SimplePyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        activations_x = [Rectifier()] * depth_x

        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta

        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        self.mlp_x = MLP(activations=activations_x,
                         dims=dims_x,
                         name="mlp_x")

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]

        self.transition = RecurrentStack(transition,
                                         name="transition",
                                         skip_connections=True)

        mlp_theta = MLP(activations=activations_theta,
                        dims=dims_theta,
                        name="mlp_theta")

        mlp_gmm = GMMMLP(mlp=mlp_theta,
                         dim=target_size,
                         k=k,
                         const=0.00001,
                         name="gmm_wrap")

        self.gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                      output_size=frame_size,
                                      k=k)

        normal_inputs = [name for name in self.transition.apply.sequences
                         if 'mask' not in name]

        self.fork = Fork(normal_inputs,
                         input_dim=4 * hidden_size_recurrent,
                         output_dims=self.transition.get_dims(normal_inputs))

        self.children = [self.mlp_x, self.transition,
                         self.gmm_emitter, self.fork]
Example #29
    def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
                 state_dim, **kwargs):
        """Sole constructor.
        
        Args:
            vocab_size (int): Source vocabulary size
            embedding_dim (int): Dimension of the embedding layer
            n_layers (int): Number of layers. Layers share the same
                            weight matrices.
            skip_connections (bool): Skip connections connect the
                                     source word embeddings directly 
                                     with deeper layers to propagate 
                                     the gradient more efficiently
            state_dim (int): Number of hidden units in the recurrent
                             layers.
        """
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.state_dim = state_dim
        self.skip_connections = skip_connections

        self.lookup = LookupTable(name='embeddings')
        if self.n_layers >= 1:
            self.bidir = BidirectionalWMT15(
                GatedRecurrent(activation=Tanh(), dim=state_dim))
            self.fwd_fork = Fork([
                name for name in self.bidir.prototype.apply.sequences
                if name != 'mask'
            ],
                                 prototype=Linear(),
                                 name='fwd_fork')
            self.back_fork = Fork([
                name for name in self.bidir.prototype.apply.sequences
                if name != 'mask'
            ],
                                  prototype=Linear(),
                                  name='back_fork')
            self.children = [
                self.lookup, self.bidir, self.fwd_fork, self.back_fork
            ]
            if self.n_layers > 1:  # Deep encoder
                self.mid_fwd_fork = Fork([
                    name for name in self.bidir.prototype.apply.sequences
                    if name != 'mask'
                ],
                                         prototype=Linear(),
                                         name='mid_fwd_fork')
                self.mid_back_fork = Fork([
                    name for name in self.bidir.prototype.apply.sequences
                    if name != 'mask'
                ],
                                          prototype=Linear(),
                                          name='mid_back_fork')
                self.children.append(self.mid_fwd_fork)
                self.children.append(self.mid_back_fork)
        elif self.n_layers == 0:
            self.embedding_dim = state_dim * 2
            self.children = [self.lookup]
        else:
            logging.fatal("Number of encoder layers must be non-negative")
Example #30
    def __init__(
            self,
            input_dim=420,  # Dimension of the text labels
            output_dim=63,  # Dimension of the vocoder frame
            rnn_h_dim=1024,  # Size of rnn hidden state
            readouts_dim=1024,  # Size of readouts (summary of rnn)
            weak_feedback=False,  # Feedback to the top rnn layer
            full_feedback=False,  # Feedback to all rnn layers
            feedback_noise_level=None,  # Amount of noise in feedback
            layer_norm=False,  # Use simple normalization?
            use_speaker=False,  # Condition on the speaker id?
            num_speakers=21,  # Number of speakers
            speaker_dim=128,  # Size of speaker embedding
            which_cost='MSE',  # Train with MSE or GMM
            k_gmm=20,  # How many components in the GMM
            sampling_bias=0,  # Make samples more likely (Graves13)
            epsilon=1e-5,  # Numerical stabilities
            num_characters=43,  # how many chars in the labels
            attention_type='graves',  # graves or softmax
            attention_size=10,  # number of gaussians in the attention
            attention_alignment=1.,  # audio steps per letter at initialization
            sharpening_coeff=1.,
            timing_coeff=1.,
            encoder_type=None,
            encoder_dim=128,
            **kwargs):

        super(Parrot, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.rnn_h_dim = rnn_h_dim
        self.readouts_dim = readouts_dim
        self.layer_norm = layer_norm
        self.which_cost = which_cost
        self.use_speaker = use_speaker
        self.full_feedback = full_feedback
        self.feedback_noise_level = feedback_noise_level
        self.epsilon = epsilon

        self.num_characters = num_characters
        self.attention_type = attention_type
        self.attention_alignment = attention_alignment
        self.attention_size = attention_size
        self.sharpening_coeff = sharpening_coeff
        self.timing_coeff = timing_coeff

        self.encoder_type = encoder_type
        self.encoder_dim = encoder_dim

        self.encoded_input_dim = input_dim

        if self.encoder_type == 'bidirectional':
            self.encoded_input_dim = 2 * encoder_dim

        if self.feedback_noise_level is not None:
            self.noise_level_var = tensor.scalar('feedback_noise_level')

        self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
        self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
        self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')
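        # Three stacked GRUs; the Fork bricks defined below provide each
        # one's 'inputs' and 'gates' streams from the encoded input, the
        # lower layers and, optionally, the feedback and speaker embedding.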

        self.h1_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h1_to_readout')

        self.h2_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h2_to_readout')

        self.h3_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h3_to_readout')

        self.h1_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h2')

        self.h1_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h3')

        self.h2_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h2_to_h3')

        if which_cost == 'MSE':
            self.readout_to_output = Linear(input_dim=readouts_dim,
                                            output_dim=output_dim,
                                            name='readout_to_output')
        elif which_cost == 'GMM':
            self.sampling_bias = sampling_bias
            self.k_gmm = k_gmm
            self.readout_to_output = Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=readouts_dim,
                output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name='readout_to_output')

        self.encoder = Encoder(encoder_type,
                               num_characters,
                               input_dim,
                               encoder_dim,
                               name='encoder')

        self.children = [
            self.encoder, self.rnn1, self.rnn2, self.rnn3, self.h1_to_readout,
            self.h2_to_readout, self.h3_to_readout, self.h1_to_h2,
            self.h1_to_h3, self.h2_to_h3, self.readout_to_output
        ]

        self.inp_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h1')

        self.inp_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h2')

        self.inp_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h3')

        self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3]

        self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                              input_dim=rnn_h_dim,
                              output_dims=[attention_size] * 3,
                              name='h1_to_att')

        self.att_to_readout = Linear(input_dim=self.encoded_input_dim,
                                     output_dim=readouts_dim,
                                     name='att_to_readout')

        self.children += [self.h1_to_att, self.att_to_readout]

        if use_speaker:
            self.num_speakers = num_speakers
            self.speaker_dim = speaker_dim
            self.embed_speaker = LookupTable(num_speakers, speaker_dim)

            self.speaker_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h1')

            self.speaker_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h2')

            self.speaker_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h3')

            self.speaker_to_readout = Linear(input_dim=speaker_dim,
                                             output_dim=readouts_dim,
                                             name='speaker_to_readout')

            if which_cost == 'MSE':
                self.speaker_to_output = Linear(input_dim=speaker_dim,
                                                output_dim=output_dim,
                                                name='speaker_to_output')
            elif which_cost == 'GMM':
                self.speaker_to_output = Fork(
                    output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                    input_dim=speaker_dim,
                    output_dims=[
                        output_dim * k_gmm, output_dim * k_gmm, k_gmm
                    ],
                    name='speaker_to_output')

            self.children += [
                self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2,
                self.speaker_to_h3, self.speaker_to_readout,
                self.speaker_to_output
            ]

        if full_feedback:
            self.out_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h2')

            self.out_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h3')
            self.children += [self.out_to_h2, self.out_to_h3]
            weak_feedback = True

        self.weak_feedback = weak_feedback

        if weak_feedback:
            self.out_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h1')
            self.children += [self.out_to_h1]