l_enc = lasagne.layers.GRULayer(l_in,
                                num_units=NUM_UNITS_ENC,
                                name='GRUEncoder',
                                mask_input=l_mask_enc)
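
# Sanity check: the encoder returns a hidden state for every input position,
# so this should print (batch_size, sequence_len, NUM_UNITS_ENC).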
print lasagne.layers.get_output(l_enc,
                                inputs={
                                    l_in: x_sym,
                                    l_mask_enc: xmask_sym
                                }).eval({
                                    x_sym: inputs,
                                    xmask_sym: input_masks
                                }).shape

##### START OF DECODER #####
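
# Unlike a plain encoder-decoder, the attention decoder looks at the full
# sequence of encoder hidden states (one per input position), which is why the
# encoder above returns all steps instead of using only_return_final=True.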

l_dec = LSTMAttentionDecodeFeedbackLayer(l_enc,
                                         num_units=NUM_UNITS_DEC,
                                         aln_num_units=20,
                                         n_decodesteps=MAX_DIGITS + 1,
                                         name='LSTMDecoder')
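
# As above, we can sanity-check the shape: the decoder unrolls for
# n_decodesteps, so this should print (batch_size, MAX_DIGITS + 1, NUM_UNITS_DEC).
print lasagne.layers.get_output(l_dec,
                                inputs={
                                    l_in: x_sym,
                                    l_mask_enc: xmask_sym
                                }).eval({
                                    x_sym: inputs,
                                    xmask_sym: input_masks
                                }).shape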

# We need to do some reshape voodoo to connect a softmax layer to the decoder.
# See http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html#examples
# In short, this line changes the shape from
# (batch_size, decode_len, num_dec_units) -> (batch_size*decode_len, num_dec_units).
# We need to do this because the softmax is applied to the last dimension, and we
# want to softmax the output at each decode position individually.
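# In the shape spec below, -1 tells Lasagne to infer the collapsed
# batch_size*decode_len axis, and [2] means "keep the size of the input's
# dimension 2" (num_dec_units).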
l_reshape = lasagne.layers.ReshapeLayer(l_dec, (-1, [2]))

l_softmax = lasagne.layers.DenseLayer(
    l_reshape,
    num_units=NUM_OUTPUTS,
    nonlinearity=lasagne.nonlinearities.softmax,
    name='SoftmaxOutput')
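
# To recover per-sequence predictions one can reshape back to three dimensions.
# A minimal sketch, not part of the original notebook: it assumes the batch size
# is read from the symbolic input at run time (Lasagne's ReshapeLayer accepts
# Theano scalar expressions in the shape spec).
l_out = lasagne.layers.ReshapeLayer(l_softmax,
                                    (x_sym.shape[0], -1, NUM_OUTPUTS))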