def test_ctc_symmetry_logscale():
    LENGTH = 5000
    BATCHES = 3
    CLASSES = 4
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX)
    Y_hat[:, :, 0] = .3
    Y_hat[:, :, 1] = .2
    Y_hat[:, :, 2] = .4
    Y_hat[:, :, 3] = .1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    # default blank symbol is the highest class index (3 in this case)
    Y = np.repeat(np.array([0, 1, 2, 1, 2, 0, 2, 2, 2]),
                  BATCHES).reshape((9, BATCHES))
    # the masks for this test should be all ones.
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
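    # With per-frame probabilities that are constant over time, reversing the
    # label sequence should leave the CTC cost unchanged: time reversal maps
    # every alignment of Y onto an equally probable alignment of Y[::-1].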
    forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y,
                                  y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1],
                                   y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    testing.assert_almost_equal(forward_cost[0], backward_cost[0])
    assert not np.isnan(forward_cost[0])
    assert not np.isnan(backward_cost[0])
    assert not np.isinf(np.abs(forward_cost[0]))
    assert not np.isinf(np.abs(backward_cost[0]))
def test_ctc_exact():
    LENGTH = 4
    BATCHES = 1
    CLASSES = 2
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=False)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX)
    Y_hat[:, :, 0] = .7
    Y_hat[:, :, 1] = .3
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    # default blank symbol is the highest class index (1 in this case, since CLASSES = 2)
    Y = np.zeros((2, 1), dtype='int64')
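    # the five length-4 paths below collapse to the target [0, 0] ('-' is the blank)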
    # -0-0
    # 0-0-
    # 0--0
    # 0-00
    # 00-0
    answer = np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y,
                                  y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1],
                                   y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    assert not np.isnan(forward_cost[0])
    assert not np.isnan(backward_cost[0])
    assert not np.isinf(np.abs(forward_cost[0]))
    assert not np.isinf(np.abs(backward_cost[0]))
    testing.assert_almost_equal(-forward_cost[0], answer)
    testing.assert_almost_equal(-backward_cost[0], answer)
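# A standalone sanity check of the closed-form answer above (a sketch, not part
# of the original tests): brute-force enumeration of all length-4 paths over
# {0: label, 1: blank}, keeping only those that collapse to [0, 0].
import itertools

def brute_force_ctc_prob(probs, target, blank, length):
    # probs maps class index -> per-frame probability (constant over time here)
    total = 0.0
    for path in itertools.product(range(len(probs)), repeat=length):
        collapsed = [s for i, s in enumerate(path)
                     if s != blank and (i == 0 or s != path[i - 1])]
        if collapsed == target:
            total += np.prod([probs[s] for s in path])
    return total

# np.log(brute_force_ctc_prob({0: .7, 1: .3}, [0, 0], blank=1, length=4))
# reproduces np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3)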
Example 3
    def setup(self):
        # setup Lasagne Recurrent network
        # The output from the network is shape
        #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
        #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
        l_inp = InputLayer(shape=(self.num_batch, self.input_seq_len, self.num_inputs))
        l_mask = InputLayer(shape=(self.num_batch, self.input_seq_len))
        l_emb = EmbeddingLayer(l_inp, input_size=self.num_inputs, output_size=self.num_features)

        l_rnn = LSTMLayer(l_inp, num_units=self.num_units, peepholes=True, mask_input=l_mask)

        l_rnn_shp = ReshapeLayer(l_rnn, shape=(-1, self.num_units))
        l_out = DenseLayer(l_rnn_shp, num_units=self.num_outputs, nonlinearity=identity)
        l_out_shp = ReshapeLayer(l_out, shape=(-1, self.input_seq_len, self.num_outputs))

        # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
        #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

        l_out_softmax = NonlinearityLayer(l_out, nonlinearity=softmax)
        l_out_softmax_shp = ReshapeLayer(l_out_softmax, shape=(-1, self.input_seq_len, self.num_outputs))

        # calculate grad and cost
        output_lin_ctc = get_output(l_out_shp, {l_inp: self.x, l_mask: self.mask_x})
        output_softmax = get_output(l_out_softmax_shp, {l_inp: self.x, l_mask: self.mask_x})

        all_params = get_all_params(l_out_softmax_shp, trainable=True)  # don't learn the embedding layer

        # the CTC cross entropy between y and the linear output of the network
        pseudo_cost = ctc_cost.pseudo_cost(self.y, output_lin_ctc, self.mask_y, self.mask_x)

        # calculate the gradients of the CTC cost w.r.t. the linear output of the network
        pseudo_grad = T.grad(pseudo_cost.sum() / self.num_batch, all_params)
        true_cost = ctc_cost.cost(self.y, output_softmax, self.mask_y, self.mask_x)
        cost = T.mean(true_cost)

        shared_lr = theano.shared(lasagne.utils.floatX(0.001))
        #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=shared_lr)
        #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
        updates = lasagne.updates.rmsprop(pseudo_grad, all_params, learning_rate=shared_lr)

        self.train = theano.function([self.x, self.mask_x, self.y, self.mask_y],
                                     [output_softmax, cost], updates=updates)
        self.test = theano.function([self.x, self.mask_x], [output_softmax])
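        # Hypothetical usage (x_batch, mask_x_batch, y_batch, mask_y_batch are
        # assumed numpy arrays matching the shapes declared above):
        # softmax_out, batch_cost = self.train(x_batch, mask_x_batch,
        #                                      y_batch, mask_y_batch)
        # (softmax_out,) = self.test(x_batch, mask_x_batch)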
Example 4
def test_ctc_exact():
    LENGTH = 4
    BATCHES = 1
    CLASSES = 2
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=False)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX)
    Y_hat[:, :, 0] = .7
    Y_hat[:, :, 1] = .3
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    # default blank symbol is the highest class index (1 in this case, since CLASSES = 2)
    Y = np.zeros((2, 1), dtype='int64')
    # -0-0
    # 0-0-
    # 0--0
    # 0-00
    # 00-0
    answer = np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    forward_cost = ctc_cost_t.eval({
        y_hat: Y_hat,
        y: Y,
        y_hat_mask: Y_hat_mask,
        y_mask: Y_mask
    })
    backward_cost = ctc_cost_t.eval({
        y_hat: Y_hat,
        y: Y[::-1],
        y_hat_mask: Y_hat_mask,
        y_mask: Y_mask
    })
    assert not np.isnan(forward_cost[0])
    assert not np.isnan(backward_cost[0])
    assert not np.isinf(np.abs(forward_cost[0]))
    assert not np.isinf(np.abs(backward_cost[0]))
    testing.assert_almost_equal(-forward_cost[0], answer)
    testing.assert_almost_equal(-backward_cost[0], answer)
def finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2, n_steps=None):
    y_hat = T.tensor3('features')
    y_hat_mask = T.matrix('features_mask')
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)
    get_cost = theano.function([y, y_hat, y_mask, y_hat_mask],
                               ctc_cost_t.sum())
    diff_grad = np.zeros_like(Y_hat)
    
    for grad, val in islice(izip(np.nditer(diff_grad, op_flags=['readwrite']),
                                 np.nditer(Y_hat, op_flags=['readwrite'])),
                            0, n_steps):
        val += eps
        error_inc = get_cost(Y, Y_hat, Y_mask, Y_hat_mask)
        val -= 2.0 * eps
        error_dec = get_cost(Y, Y_hat, Y_mask, Y_hat_mask)
        grad[...] = .5 * (error_inc - error_dec) / eps
        val += eps

    return diff_grad
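# A possible companion check (a sketch, not from the original file): compare the
# numerical gradient from finite_diff against Theano's analytic gradient of the
# same CTC cost with respect to y_hat.
def analytic_grad(Y, Y_hat, Y_mask, Y_hat_mask):
    y_hat = T.tensor3('features')
    y_hat_mask = T.matrix('features_mask')
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)
    get_grad = theano.function([y, y_hat, y_mask, y_hat_mask],
                               T.grad(ctc_cost_t.sum(), y_hat))
    return get_grad(Y, Y_hat, Y_mask, Y_hat_mask)

# np.testing.assert_array_almost_equal(
#     analytic_grad(Y, Y_hat, Y_mask, Y_hat_mask),
#     finite_diff(Y, Y_hat, Y_mask, Y_hat_mask), decimal=2)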
Example 6
def finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2, n_steps=None):
    y_hat = T.tensor3('features')
    y_hat_mask = T.matrix('features_mask')
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)
    get_cost = theano.function([y, y_hat, y_mask, y_hat_mask],
                               ctc_cost_t.sum())
    diff_grad = np.zeros_like(Y_hat)

    for grad, val in islice(
            izip(np.nditer(diff_grad, op_flags=['readwrite']),
                 np.nditer(Y_hat, op_flags=['readwrite'])), 0, n_steps):
        val += eps
        error_inc = get_cost(Y, Y_hat, Y_mask, Y_hat_mask)
        val -= 2.0 * eps
        error_dec = get_cost(Y, Y_hat, Y_mask, Y_hat_mask)
        grad[...] = .5 * (error_inc - error_dec) / eps
        val += eps

    return diff_grad
Example 7
def test_ctc_symmetry_logscale():
    LENGTH = 5000
    BATCHES = 3
    CLASSES = 4
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX)
    Y_hat[:, :, 0] = .3
    Y_hat[:, :, 1] = .2
    Y_hat[:, :, 2] = .4
    Y_hat[:, :, 3] = .1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    # default blank symbol is the highest class index (3 in this case)
    Y = np.repeat(np.array([0, 1, 2, 1, 2, 0, 2, 2, 2]), BATCHES).reshape(
        (9, BATCHES))
    # the masks for this test should be all ones.
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    forward_cost = ctc_cost_t.eval({
        y_hat: Y_hat,
        y: Y,
        y_hat_mask: Y_hat_mask,
        y_mask: Y_mask
    })
    backward_cost = ctc_cost_t.eval({
        y_hat: Y_hat,
        y: Y[::-1],
        y_hat_mask: Y_hat_mask,
        y_mask: Y_mask
    })
    testing.assert_almost_equal(forward_cost[0], backward_cost[0])
    assert not np.isnan(forward_cost[0])
    assert not np.isnan(backward_cost[0])
    assert not np.isinf(np.abs(forward_cost[0]))
    assert not np.isinf(np.abs(backward_cost[0]))
Example 8
    net['conv4b'].b.tag.grad_scale = 2
    net['conv5a'].b.tag.grad_scale = 2
    net['conv5b'].b.tag.grad_scale = 2
    net['fc6-1'].b.tag.grad_scale = 2
    # net['fc7-1'].b.tag.grad_scale = 2
    # net['fc8-1'].b.tag.grad_scale = 2
    net['fc8-1'].W.tag.grad_scale = 10
    net['fc8-1'].b.tag.grad_scale = 20

output_train = lasagne.layers.get_output(net['prob'], deterministic=False)
output_eval = lasagne.layers.get_output(net['prob'], deterministic=True)

# compute the cost for training
output_flat = T.reshape(output_train, (num_batch, clip_length, num_classes))
#cost = T.mean(T.nnet.categorical_crossentropy(output_flat+TOL, sym_y.flatten()))
cost = T.mean(ctc_cost.cost(output_flat + TOL, sym_y))

# maybe it is necessary to add l2_penalty to the cost
regularizable_params = lasagne.layers.get_all_params(net['prob'],
                                                     regularizable=True)
print 'the regularizable_params are:'
for p in regularizable_params:
    print p.name

l2_w = 0.0005
all_layers = lasagne.layers.get_all_layers(net['prob'])
l2_penalty = lasagne.regularization.regularize_layer_params(
    all_layers, lasagne.regularization.l2) * l2_w
cost += l2_penalty

# compute the cost for evaluation
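# A sketch of how the evaluation cost is presumably computed, mirroring the
# training cost above and the fuller build_model example later in this listing
# (num_batch_eval is an assumed name for the evaluation batch size):
output_eval_flat = T.reshape(output_eval,
                             (num_batch_eval, clip_length, num_classes))
cost_eval = T.mean(ctc_cost.cost(output_eval_flat + TOL, sym_y))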
Example 9
    def setup(self):
        # setup Lasagne Recurrent network
        # The output from the network is shape
        #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
        #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
        l_inp = InputLayer(shape=(self.num_batch, self.input_seq_len,
                                  self.num_inputs))
        l_mask = InputLayer(shape=(self.num_batch, self.input_seq_len))
        l_emb = EmbeddingLayer(l_inp,
                               input_size=self.num_inputs,
                               output_size=self.num_features)

        l_rnn = LSTMLayer(l_inp,
                          num_units=self.num_units,
                          peepholes=True,
                          mask_input=l_mask)

        l_rnn_shp = ReshapeLayer(l_rnn, shape=(-1, self.num_units))
        l_out = DenseLayer(l_rnn_shp,
                           num_units=self.num_outputs,
                           nonlinearity=identity)
        l_out_shp = ReshapeLayer(l_out,
                                 shape=(-1, self.input_seq_len,
                                        self.num_outputs))

        # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
        #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

        l_out_softmax = NonlinearityLayer(l_out, nonlinearity=softmax)
        l_out_softmax_shp = ReshapeLayer(l_out_softmax,
                                         shape=(-1, self.input_seq_len,
                                                self.num_outputs))

        # calculate grad and cost
        output_lin_ctc = get_output(l_out_shp, {
            l_inp: self.x,
            l_mask: self.mask_x
        })
        output_softmax = get_output(l_out_softmax_shp, {
            l_inp: self.x,
            l_mask: self.mask_x
        })

        all_params = get_all_params(
            l_out_softmax_shp, trainable=True)  # don't learn the embedding layer

        # the CTC cross entropy between y and the linear output of the network
        pseudo_cost = ctc_cost.pseudo_cost(self.y, output_lin_ctc, self.mask_y,
                                           self.mask_x)

        # calculate the gradients of the CTC cost w.r.t. the linear output of the network
        pseudo_grad = T.grad(pseudo_cost.sum() / self.num_batch, all_params)
        true_cost = ctc_cost.cost(self.y, output_softmax, self.mask_y,
                                  self.mask_x)
        cost = T.mean(true_cost)

        shared_lr = theano.shared(lasagne.utils.floatX(0.001))
        #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=shared_lr)
        #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
        updates = lasagne.updates.rmsprop(pseudo_grad,
                                          all_params,
                                          learning_rate=shared_lr)

        self.train = theano.function(
            [self.x, self.mask_x, self.y, self.mask_y], [output_softmax, cost],
            updates=updates)
        self.test = theano.function([self.x, self.mask_x], [output_softmax])
Example 10
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (
        LSTMLayer,
        InputLayer,
        DenseLayer,
        NonlinearityLayer,
        ReshapeLayer,
        EmbeddingLayer,
        RecurrentLayer,
    )
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 1, 12
    num_classes = 5
    target_seq_len = 3
    num_rnn_units = 50

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype="int64")
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (5 in this case, i.e. num_classes)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype("int32")

    y = T.imatrix("phonemes")
    x = T.imatrix()  # batch_size x input_seq_len (class indices)

    print "num_batch =", num_batch, "input_seq_len =", input_seq_len
    print "num_classes =", num_classes
    # setup Lasagne Recurrent network
    # The output from the network is shape
    #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
    #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    netshape = lasagne.layers.get_output_shape(l_inp)
    print ("Layer l_inp shape:")
    print (netshape)
    l_emb = EmbeddingLayer(
        l_inp, input_size=num_classes + 1, output_size=num_classes + 1, W=np.identity(num_classes + 1).astype("float32")
    )
    netshape = lasagne.layers.get_output_shape(l_emb)
    print ("Layer l_emb shape:")
    print (netshape)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units)
    netshape = lasagne.layers.get_output_shape(l_rnn)
    print ("Layer l_rnn shape:")
    print (netshape)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    netshape = lasagne.layers.get_output_shape(l_rnn_shp)
    print ("Layer l_rnn_shp shape:")
    print (netshape)
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1, nonlinearity=lasagne.nonlinearities.identity)  # + blank

    netshape = lasagne.layers.get_output_shape(l_out)
    print ("Layer l_out shape:")
    print (netshape)
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_shp)
    print ("Layer l_out_shp shape:")
    print (netshape)

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    # l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(l_out, nonlinearity=lasagne.nonlinearities.softmax)
    netshape = lasagne.layers.get_output_shape(l_out_softmax)
    print ("Layer l_out_softmax shape:")
    print (netshape)
    l_out_softmax_shp = ReshapeLayer(l_out_softmax, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_softmax_shp)
    print ("Layer l_out_softmax_shp shape:")
    print (netshape)

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print "x type:", type(x)
    print "x shape", x.shape
    print "y type:", type(y)
    print "y shape", y.shape

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and the linear output of the network
    # (num_batch,t,class+1)
    # output_lin_ctc shape (1,12,6)
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # calculate the gradients of the CTC cost w.r.t. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    # x shape (1,12)
    # y shape (1,3)

    train = theano.function([x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost], updates=updates)

    # Create test dataset
    num_samples = 10
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output

    input_arr = np.concatenate([input_lst]).astype("int32")
    y_arr = np.concatenate([output_lst]).astype("int32")

    print "y_arr shape:", y_arr.shape

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype="float32")
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype="float32")

    for nn in range(1000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch : (i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(input_arr[idx], y_arr[idx])
            print "x=", input_arr[idx]
            # x shape (1,12)
            print "x shape", input_arr[idx].shape
            print "y=", y_arr[idx]
            # y shape (1,3)
            print "y shape", y_arr[idx].shape
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            # testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            # testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn + 1) % 20 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print "pred =", pred, "true =", true
Example 11
#    dims_bidir=conf.dims_transition, dims_top=[num_classes],
#    bidir_trans=GatedRecurrent, bottom_activation=None)

# ******************* output *******************
y_hat = recognizer.apply(x, x_m)
y_hat.name = 'outputs'
y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim=y_hat.ndim - 2)
y_hat_softmax.name = 'outputs_softmax'

# There are two cost functions, one for training and one for monitoring: the training
# cost is more stable for computing gradients and seems to be more memory efficient,
# but it does not compute the true CTC cost.
if conf.task == 'CTC':
    cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean()
    cost_train.name = "cost_train"

    cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean()
    cost_monitor.name = "cost_monitor"
elif conf.task == 'framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name = 'cost'
    cost_monitor = cost_train
else:
    raise ValueError(conf.task)

recognizer.initialize()
cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m])

weights = VariableFilter(roles=[WEIGHT])(cg.variables)
cg = apply_noise(cg, weights, conf.weight_noise)

#************* training algorithm *************
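# A possible continuation under the Blocks framework (a sketch; the step rule
# and the conf.learning_rate attribute are assumptions, not from the original):
from blocks.algorithms import GradientDescent, Scale

algorithm = GradientDescent(cost=cost_train,
                            parameters=cg.parameters,
                            step_rule=Scale(learning_rate=conf.learning_rate))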
Example 12
    def build_model(self, Dir_features, args):

        self._set_model_param(Dir_features)

        # try to scale the gradients on the level of parameters like caffe
        # by now only change the code with sgd
        scale_grad = True
        scale_l2_w = False

        TOL = 1e-5

        sym_y = T.imatrix()

        # W is regularizable, b is not regularizable (correspondence with caffe)
        if scale_grad:
            self.net['conv1a'].b.tag.grad_scale = 2
            self.net['conv2a'].b.tag.grad_scale = 2
            self.net['conv3a'].b.tag.grad_scale = 2
            self.net['conv3b'].b.tag.grad_scale = 2
            self.net['conv4a'].b.tag.grad_scale = 2
            self.net['conv4b'].b.tag.grad_scale = 2
            self.net['conv5a'].b.tag.grad_scale = 2
            self.net['conv5b'].b.tag.grad_scale = 2
            self.net['fc6-1'].b.tag.grad_scale = 2
            self.net['fc8-1'].W.tag.grad_scale = 10
            self.net['fc8-1'].b.tag.grad_scale = 20

        output_train = lasagne.layers.get_output(self.net['prob'],
                                                 deterministic=False)
        output_eval = lasagne.layers.get_output(self.net['prob'],
                                                deterministic=True)

        ##############
        # compute cost
        ##############
        # compute the cost for training
        output_flat = T.reshape(
            output_train,
            (self.batch_size, self.clip_length, self.num_classes))
        cost = T.mean(ctc_cost.cost(output_flat + TOL, sym_y))

        # maybe it is necessary to add l2_penalty to the cost
        regularizable_params = lasagne.layers.get_all_params(
            self.net['prob'], regularizable=True)
        l2_w = 0.0005
        all_layers = lasagne.layers.get_all_layers(self.net['prob'])
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * l2_w
        cost += l2_penalty

        # compute the cost for evaluation
        output_eval_flat = T.reshape(
            output_eval,
            (self.num_batch_eval, self.clip_length, self.num_classes))
        cost_eval = T.mean(ctc_cost.cost(output_eval_flat + TOL, sym_y))

        trainable_params = lasagne.layers.get_all_params(self.net['prob'],
                                                         trainable=True)

        sh_lr = theano.shared(lasagne.utils.floatX(args.lr))

        ##################################################################
        # try to scale the gradients on the level of parameters like caffe
        # by now only change the code with sgd
        ##################################################################
        if scale_grad:
            grads = theano.grad(cost, trainable_params)
            for idx, param in enumerate(trainable_params):
                grad_scale = getattr(param.tag, 'grad_scale', 1)
                if grad_scale != 1:
                    grads[idx] *= grad_scale

        #################
        # compute updates
        #################
        # adam works with lr 0.001
        if args.optimizer == 'rmsprop':
            updates_opt = lasagne.updates.rmsprop(cost,
                                                  trainable_params,
                                                  learning_rate=sh_lr)
            updates = lasagne.updates.apply_momentum(updates_opt,
                                                     trainable_params,
                                                     momentum=0.9)

        elif args.optimizer == 'adam':
            updates_opt = lasagne.updates.adam(cost,
                                               trainable_params,
                                               learning_rate=sh_lr)
            updates = lasagne.updates.apply_momentum(updates_opt,
                                                     trainable_params,
                                                     momentum=0.9)

        elif args.optimizer == 'sgd':
            # Stochastic Gradient Descent (SGD) with momentum
            if scale_grad:
                updates = lasagne.updates.momentum(grads,
                                                   trainable_params,
                                                   learning_rate=sh_lr,
                                                   momentum=0.9)
            else:
                updates = lasagne.updates.momentum(cost,
                                                   trainable_params,
                                                   learning_rate=sh_lr,
                                                   momentum=0.9)

        elif args.optimizer == 'adadelta':
            updates_opt = lasagne.updates.adadelta(cost,
                                                   trainable_params,
                                                   learning_rate=sh_lr)
            updates = lasagne.updates.apply_momentum(updates_opt,
                                                     trainable_params,
                                                     momentum=0.9)

        elif args.optimizer == 'adagrad':
            updates_opt = lasagne.updates.adagrad(cost,
                                                  trainable_params,
                                                  learning_rate=sh_lr)
            updates = lasagne.updates.apply_momentum(updates_opt,
                                                     trainable_params,
                                                     momentum=0.9)

        #############################
        # set train and eval function
        #############################
        f_train = theano.function(
            [self.net['input'].input_var, sym_y, self.net['mask'].input_var],
            [cost, output_train],
            updates=updates)
        f_eval = theano.function(
            [self.net['input'].input_var, sym_y, self.net['mask'].input_var],
            [cost_eval, output_eval])

        return f_train, f_eval
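# Hypothetical usage of the returned functions (model, clip_batch, label_batch
# and mask_batch are assumed names for the class instance and numpy arrays
# shaped to match the 'input', target and 'mask' variables above):
# f_train, f_eval = model.build_model(Dir_features, args)
# train_cost, train_output = f_train(clip_batch, label_batch, mask_batch)
# eval_cost, eval_output = f_eval(clip_batch, label_batch, mask_batch)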
Example 13
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import LSTMLayer, InputLayer, DenseLayer,\
        NonlinearityLayer, ReshapeLayer, EmbeddingLayer, RecurrentLayer
    import theano
    import theano.tensor as T
    import numpy as np
    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50

    input_seq_len += target_seq_len

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)),
                       dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (10 in this case, i.e. num_classes)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()  # batch_size x input_seq_len (class indices)

    # setup Lasagne Recurrent network
    # The output from the network is shape
    #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
    #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp,
                           input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb,
                      num_units=num_rnn_units,
                      peepholes=True,
                      W_in_to_ingate=ini,
                      W_hid_to_ingate=ini,
                      b_ingate=zero,
                      W_in_to_forgetgate=ini,
                      W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini,
                      W_hid_to_cell=ini,
                      b_cell=zero,
                      W_in_to_outgate=ini,
                      W_hid_to_outgate=ini,
                      b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp,
                       num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank

    l_out_shp = ReshapeLayer(l_out,
                             (num_batch, input_seq_len, num_classes + 1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes + 1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print all_params

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and the linear output of the network
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # calculate the gradients of the CTC cost w.r.t. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad,
                                      all_params,
                                      learning_rate=sh_lr)

    train = theano.function(
        [x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost],
        updates=updates)

    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output

    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn + 1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print pred, true
Example 14
#    bidir_trans=GatedRecurrent, bottom_activation=None)


# ******************* output *******************
y_hat = recognizer.apply(x, x_m)
y_hat.name = 'outputs'
y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim=y_hat.ndim - 2)
y_hat_softmax.name = 'outputs_softmax'

# There are two cost functions, one for training and one for monitoring: the training
# cost is more stable for computing gradients and seems to be more memory efficient,
# but it does not compute the true CTC cost.
if conf.task == 'CTC':
    cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean()
    cost_train.name = "cost_train"

    cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean()
    cost_monitor.name = "cost_monitor"
elif conf.task == 'framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name = 'cost'
    cost_monitor = cost_train
else:
    raise ValueError(conf.task)


recognizer.initialize()
cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m])


weights = VariableFilter(roles=[WEIGHT])(cg.variables)
cg = apply_noise(cg, weights, conf.weight_noise)
Example 15
all_params = L.get_all_params(l_rnn_2, trainable=True)

# ## Costs, Gradients & Training Functions

# Cost functions
target_values = T.imatrix('target_output')
input_values = T.imatrix()

### Gradients ###
# pseudo cost - the CTC cross entropy between targets and the linear output - used in training
pseudo_cost = ctc_cost.pseudo_cost(target_values, output_lin_ctc)
pseudo_cost_grad = T.grad(pseudo_cost.sum() / batchsize, all_params)
pseudo_cost = pseudo_cost.mean()

# true costs
cost = ctc_cost.cost(target_values, network_output)
cost = cost.mean()

# Compute SGD updates for training
print("Computing updates ...")
updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, LEARNING_RATE)

# Theano functions for training and computing cost
print("Compiling functions ...")
train = theano.function(
        [l_in.input_var, target_values], [cost, pseudo_cost, network_output], updates=updates)
validate = theano.function([l_in.input_var, target_values], [cost, network_output])
predict = theano.function([l_in.input_var], network_output)

# ## Network Training
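# A minimal sketch of the training loop that could follow (X_train, y_train and
# NUM_EPOCHS are assumed names; batchsize and the compiled functions are defined
# above):
import numpy as np

for epoch in range(NUM_EPOCHS):
    epoch_costs = []
    for start in range(0, len(X_train), batchsize):
        xb = X_train[start:start + batchsize]
        yb = y_train[start:start + batchsize]
        batch_cost, batch_pseudo_cost, _ = train(xb, yb)
        epoch_costs.append(batch_cost)
    print("Epoch %d mean cost: %f" % (epoch, np.mean(epoch_costs)))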
Example 16
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import LSTMLayer, InputLayer, DenseLayer,\
        NonlinearityLayer, ReshapeLayer, EmbeddingLayer, RecurrentLayer
    import theano
    import theano.tensor as T
    import numpy as np
    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50

    input_seq_len += target_seq_len
    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-"*target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (10 in this case, i.e. num_classes)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random(
        (num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()  # batch_size x input_seq_len (class indices)



    # setup Lasagne Recurrent network
    # The output from the network is shape
    #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
    #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp,
                           input_size=num_classes+1,
                           output_size=num_classes+1,
                           W=np.identity(num_classes+1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb,
                      num_units=num_rnn_units,
                      peepholes=True,
                      W_in_to_ingate=ini,
                      W_hid_to_ingate=ini,
                      b_ingate=zero,
                      W_in_to_forgetgate=ini,
                      W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini,
                      W_hid_to_cell=ini,
                      b_cell=zero,
                      W_in_to_outgate=ini,
                      W_hid_to_outgate=ini,
                      b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch*input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes+1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank

    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes+1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes+1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print all_params

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and the linear output of the network
    pseudo_cost = ctc_cost.pseudo_cost(
        y, output_lin_ctc)

    # calculate the gradients of the CTC cost w.r.t. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, learning_rate=sh_lr)

    train = theano.function([x, y],
                            [output_lin_ctc, output_softmax, cost, pseudo_cost],
                            updates=updates)


    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class]*3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input))*[this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output

    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples//num_batch):
            idx = shuffle[i*num_batch:(i+1)*num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx],
                y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn+1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn+1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len-len(pred)) * " "
                print pred, true