Example #1
def test_lstm_hid_init_layer_eval():
    # Test `hid_init` given as a `Layer`, using some dummy input. Compare the
    # output of a network whose `hid_init` is a `Layer` with the output of a
    # network whose `hid_init` is a `np.array`.
    n_units = 7
    n_test_cases = 2
    in_shp = (n_test_cases, 2, 3)
    in_h_shp = (1, n_units)
    in_cell_shp = (1, n_units)

    # dummy inputs
    X_test = np.ones(in_shp, dtype=theano.config.floatX)
    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
    Xc_test = np.ones(in_cell_shp, dtype=theano.config.floatX)
    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))
    Xc_test_batch = np.tile(Xc_test, (n_test_cases, 1))

    # network with `Layer` initializer for hid_init
    l_inp = InputLayer(in_shp)
    l_inp_h = InputLayer(in_h_shp)
    l_inp_cell = InputLayer(in_cell_shp)
    l_rec_inp_layer = LSTMLayer(l_inp, n_units, hid_init=l_inp_h,
                                cell_init=l_inp_cell, nonlinearity=None)

    # network with `np.array` initializer for hid_init
    l_rec_nparray = LSTMLayer(l_inp, n_units, hid_init=Xh_test,
                              cell_init=Xc_test, nonlinearity=None)

    # copy network parameters from l_rec_inp_layer to l_rec_nparray
    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
    for k, v in l_rn_param.items():
        if k in l_il_param:
            v.set_value(l_il_param[k].get_value())

    # build the theano functions
    X = T.tensor3()
    Xh = T.matrix()
    Xc = T.matrix()
    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer,
                                                 {l_inp: X, l_inp_h:
                                                  Xh, l_inp_cell: Xc})
    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})

    # test both nets with dummy input
    output_val_inp_layer = output_inp_layer.eval({X: X_test, Xh: Xh_test_batch,
                                                  Xc: Xc_test_batch})
    output_val_nparray = output_nparray.eval({X: X_test})

    # check output given `Layer` is the same as with `np.array`
    assert np.allclose(output_val_inp_layer, output_val_nparray)
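
The parameter copy in the test above matches parameters by name rather than by position: when `hid_init`/`cell_init` are given as arrays, the LSTM stores them as extra parameters of the layer, so the two networks do not have identical parameter lists and positional copying (e.g. with `lasagne.layers.set_all_param_values`) could misalign. A minimal stand-alone sketch of the same name-based copy (the helper name is illustrative, not a Lasagne API):

def copy_params_by_name(src_layer, dst_layer):
    # copy shared-variable values from src_layer to dst_layer, matching by name
    src = {p.name: p for p in src_layer.get_params()}
    for p in dst_layer.get_params():
        if p.name in src:
            p.set_value(src[p.name].get_value())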
Example #2
def test_lstm_hid_init_layer_eval():
    # Test `hid_init` given as a `Layer`, using some dummy input. Compare the
    # output of a network whose `hid_init` is a `Layer` with the output of a
    # network whose `hid_init` is a `np.array`.
    n_units = 7
    n_test_cases = 2
    in_shp = (n_test_cases, 2, 3)
    in_h_shp = (1, n_units)
    in_cell_shp = (1, n_units)

    # dummy inputs
    X_test = np.ones(in_shp, dtype=theano.config.floatX)
    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
    Xc_test = np.ones(in_cell_shp, dtype=theano.config.floatX)
    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))
    Xc_test_batch = np.tile(Xc_test, (n_test_cases, 1))

    # network with `Layer` initializer for hid_init
    l_inp = InputLayer(in_shp)
    l_inp_h = InputLayer(in_h_shp)
    l_inp_cell = InputLayer(in_cell_shp)
    l_rec_inp_layer = LSTMLayer(l_inp, n_units, hid_init=l_inp_h,
                                cell_init=l_inp_cell, nonlinearity=None)

    # network with `np.array` initializer for hid_init
    l_rec_nparray = LSTMLayer(l_inp, n_units, hid_init=Xh_test,
                              cell_init=Xc_test, nonlinearity=None)

    # copy network parameters from l_rec_inp_layer to l_rec_nparray
    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
    for k, v in list(l_rn_param.items()):
        if k in l_il_param:
            v.set_value(l_il_param[k].get_value())

    # build the theano functions
    X = T.tensor3()
    Xh = T.matrix()
    Xc = T.matrix()
    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer,
                                                 {l_inp: X, l_inp_h:
                                                  Xh, l_inp_cell: Xc})
    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})

    # test both nets with dummy input
    output_val_inp_layer = output_inp_layer.eval({X: X_test, Xh: Xh_test_batch,
                                                  Xc: Xc_test_batch})
    output_val_nparray = output_nparray.eval({X: X_test})

    # check output given `Layer` is the same as with `np.array`
    assert np.allclose(output_val_inp_layer, output_val_nparray)
Example #3
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (
        LSTMLayer,
        InputLayer,
        DenseLayer,
        NonlinearityLayer,
        ReshapeLayer,
        EmbeddingLayer,
        RecurrentLayer,
    )
    import theano
    import theano.tensor as T
    import numpy as np

    # `floatX` and the `ctc_cost` module used below are assumed to come from the
    # enclosing test file; `floatX` is Theano's default float type
    floatX = theano.config.floatX

    num_batch, input_seq_len = 1, 12
    num_classes = 5
    target_seq_len = 3
    num_rnn_units = 50

    def print_pred(y_hat):
        blank_symbol = num_classes
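        # greedy best-path decoding: given a per-frame argmax sequence, drop
        # blank symbols and collapse consecutive repeats of the same label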
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype="int64")
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (num_classes = 5 here)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype("int32")

    y = T.imatrix("phonemes")
    x = T.imatrix()  # (batch_size, input_seq_len)

    print "num_batch =", num_batch, "input_seq_len =", input_seq_len
    print "num_classes =", num_classes
    # set up the Lasagne recurrent network
    # The network produces two outputs, both reshaped to
    # (batch_size, input_seq_len, num_classes + 1):
    #  a) output_lin_ctc: the linear activations before the softmax
    #  b) output_softmax: the output after the softmax
    l_inp = InputLayer((num_batch, input_seq_len))
    netshape = lasagne.layers.get_output_shape(l_inp)
    print ("Layer l_inp shape:")
    print (netshape)
    l_emb = EmbeddingLayer(
        l_inp, input_size=num_classes + 1, output_size=num_classes + 1, W=np.identity(num_classes + 1).astype("float32")
    )
    netshape = lasagne.layers.get_output_shape(l_emb)
    print ("Layer l_emb shape:")
    print (netshape)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units)
    netshape = lasagne.layers.get_output_shape(l_rnn)
    print ("Layer l_rnn shape:")
    print (netshape)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    netshape = lasagne.layers.get_output_shape(l_rnn_shp)
    print ("Layer l_rnn_shp shape:")
    print (netshape)
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1, nonlinearity=lasagne.nonlinearities.identity)  # + blank

    netshape = lasagne.layers.get_output_shape(l_out)
    print ("Layer l_out shape:")
    print (netshape)
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_shp)
    print ("Layer l_out_shp shape:")
    print (netshape)

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    # l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(l_out, nonlinearity=lasagne.nonlinearities.softmax)
    netshape = lasagne.layers.get_output_shape(l_out_softmax)
    print ("Layer l_out_softmax shape:")
    print (netshape)
    l_out_softmax_shp = ReshapeLayer(l_out_softmax, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_softmax_shp)
    print ("Layer l_out_softmax_shp shape:")
    print (netshape)

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    # train only the LSTM's own parameters; the identity embedding (and the
    # dense output layer) are left untrained
    all_params = l_rnn.get_params(trainable=True)
    print "x type:", type(x)
    print "x shape", x.shape
    print "y type:", type(y)
    print "y shape", y.shape

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross-entropy between y and the network's linear output;
    # output_lin_ctc has shape (num_batch, t, num_classes + 1) = (1, 12, 6) here
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # gradients of the (pseudo) CTC cost with respect to the network parameters
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    # x shape (1,12)
    # y shape (1,3)

    train = theano.function([x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost], updates=updates)
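
    # the parameter updates use the gradient of the differentiable pseudo-cost on
    # the linear (pre-softmax) output; the true CTC cost is returned only so it
    # can be monitored during training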

    # Create test dataset
    num_samples = 10
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print(this_input, this_output)

    input_arr = np.concatenate([input_lst]).astype("int32")
    y_arr = np.concatenate([output_lst]).astype("int32")

    print "y_arr shape:", y_arr.shape

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype="float32")
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype="float32")

    for nn in range(1000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch : (i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(input_arr[idx], y_arr[idx])
            print "x=", input_arr[idx]
            # x shape (1,12)
            print "x shape", input_arr[idx].shape
            print "y=", y_arr[idx]
            # y shape (1,3)
            print "y shape", y_arr[idx].shape
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            # testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            # testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn + 1) % 20 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print "pred =", pred, "true =", true
Example #4
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import LSTMLayer, InputLayer, DenseLayer,\
        NonlinearityLayer, ReshapeLayer, EmbeddingLayer, RecurrentLayer
    import theano
    import theano.tensor as T
    import numpy as np

    # `floatX` and the `ctc_cost` module used below are assumed to come from the
    # enclosing test file; `floatX` is Theano's default float type
    floatX = theano.config.floatX
    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50

    input_seq_len += target_seq_len

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)),
                       dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (num_classes = 10 here)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()  # (batch_size, input_seq_len)

    # set up the Lasagne recurrent network
    # The network produces two outputs, both reshaped to
    # (batch_size, input_seq_len, num_classes + 1):
    #  a) output_lin_ctc: the linear activations before the softmax
    #  b) output_softmax: the output after the softmax
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp,
                           input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb,
                      num_units=num_rnn_units,
                      peepholes=True,
                      W_in_to_ingate=ini,
                      W_hid_to_ingate=ini,
                      b_ingate=zero,
                      W_in_to_forgetgate=ini,
                      W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini,
                      W_hid_to_cell=ini,
                      b_cell=zero,
                      W_in_to_outgate=ini,
                      W_hid_to_outgate=ini,
                      b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp,
                       num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank

    l_out_shp = ReshapeLayer(l_out,
                             (num_batch, input_seq_len, num_classes + 1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes + 1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    # train only the LSTM's own parameters; the identity embedding (and the
    # dense output layer) are left untrained
    all_params = l_rnn.get_params(trainable=True)
    print(all_params)

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross-entropy between y and the network's linear output
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # gradients of the (pseudo) CTC cost with respect to the network parameters
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad,
                                      all_params,
                                      learning_rate=sh_lr)

    train = theano.function(
        [x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost],
        updates=updates)

    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print(this_input, this_output)

    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn + 1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print(pred, true)
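
The per-gate keyword arguments used above (`W_in_to_ingate`, `W_hid_to_ingate`, ...) appear to follow an older recurrent-layer API. In released Lasagne the same initializers are grouped with `lasagne.layers.Gate`; assuming that API, a roughly equivalent construction (reusing `l_emb` and `num_rnn_units` from the example) would be:

ini = lasagne.init.Uniform(0.1)
zero = lasagne.init.Constant(0.0)
cell_ini = lasagne.init.Uniform(0.1)
gate = lasagne.layers.Gate(W_in=ini, W_hid=ini, W_cell=cell_ini, b=zero)
l_rnn = lasagne.layers.LSTMLayer(
    l_emb, num_units=num_rnn_units, peepholes=True,
    ingate=gate, forgetgate=gate, outgate=gate,
    cell=lasagne.layers.Gate(W_in=ini, W_hid=ini, W_cell=None, b=zero,
                             nonlinearity=lasagne.nonlinearities.tanh),
    cell_init=lasagne.init.Constant(0.),
    hid_init=lasagne.init.Constant(0.))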
Example #5
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import LSTMLayer, InputLayer, DenseLayer,\
        NonlinearityLayer, ReshapeLayer, EmbeddingLayer, RecurrentLayer
    import theano
    import theano.tensor as T
    import numpy as np

    # `floatX` and the `ctc_cost` module used below are assumed to come from the
    # enclosing test file; `floatX` is Theano's default float type
    floatX = theano.config.floatX
    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50

    input_seq_len += target_seq_len
    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-"*target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (num_classes = 10 here)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random(
        (num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()  # (batch_size, input_seq_len)



    # set up the Lasagne recurrent network
    # The network produces two outputs, both reshaped to
    # (batch_size, input_seq_len, num_classes + 1):
    #  a) output_lin_ctc: the linear activations before the softmax
    #  b) output_softmax: the output after the softmax
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp,
                           input_size=num_classes+1,
                           output_size=num_classes+1,
                           W=np.identity(num_classes+1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb,
                      num_units=num_rnn_units,
                      peepholes=True,
                      W_in_to_ingate=ini,
                      W_hid_to_ingate=ini,
                      b_ingate=zero,
                      W_in_to_forgetgate=ini,
                      W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini,
                      W_hid_to_cell=ini,
                      b_cell=zero,
                      W_in_to_outgate=ini,
                      W_hid_to_outgate=ini,
                      b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch*input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes+1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank

    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes+1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes+1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    # train only the LSTM's own parameters; the identity embedding (and the
    # dense output layer) are left untrained
    all_params = l_rnn.get_params(trainable=True)
    print(all_params)

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross-entropy between y and the network's linear output
    pseudo_cost = ctc_cost.pseudo_cost(
        y, output_lin_ctc)

    # gradients of the (pseudo) CTC cost with respect to the network parameters
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, learning_rate=sh_lr)

    train = theano.function([x, y],
                            [output_lin_ctc, output_softmax, cost, pseudo_cost],
                            updates=updates)


    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class]*3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input))*[this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print(this_input, this_output)

    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples//num_batch):
            idx = shuffle[i*num_batch:(i+1)*num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx],
                y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn+1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn+1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len-len(pred)) * " "
                print(pred, true)