def test_ctc_pseudo_cost_skip_softmax_stability():
    LENGTH = 500
    BATCHES = 40
    CLASSES = 2
    N_LABELS = 45
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask,
                                       skip_softmax=True)

    Y_hat = np.asarray(np.random.normal(0, 1, (LENGTH, BATCHES, CLASSES + 1)),
                       dtype=floatX)
    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (2 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    Y_mask[30:] = 0
    pseudo_grad = T.grad(pseudo_cost.sum(), y_hat)
    test_grad = pseudo_grad.eval({y_hat: Y_hat, y: Y,
                                  y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    y_hat_softmax = T.exp(y_hat) / T.exp(y_hat).sum(2)[:, :, None]
    pseudo_cost2 = ctc_cost.pseudo_cost(y, y_hat_softmax, y_mask, y_hat_mask,
                                        skip_softmax=False)
    pseudo_grad2 = T.grad(pseudo_cost2.sum(), y_hat)
    test_grad2 = pseudo_grad2.eval({y_hat: Y_hat, y: Y,
                                    y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    testing.assert_almost_equal(test_grad, test_grad2, decimal=4)
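
# Hedged aside (not part of the original tests): skip_softmax=True indicates that
# y_hat holds the raw linear activations, presumably so the softmax can be folded
# into the cost in log-space. A minimal numpy sketch of why that is numerically
# safer than exponentiating first:
#
#   logits = np.array([1000., 1001., 1002.])
#   naive = np.log(np.exp(logits) / np.exp(logits).sum())   # overflows to nan
#   shifted = logits - logits.max()
#   stable = shifted - np.log(np.exp(shifted).sum())        # finite log-softmax
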
def test_ctc_pseudo_cost():
    LENGTH = 500
    BATCHES = 40
    CLASSES = 2
    N_LABELS = 45
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX)
    Y_hat[:, :, 0] = .75
    Y_hat[:, :, 1] = .2
    Y_hat[:, :, 2] = .05
    Y_hat[3, 0, 0] = .3
    Y_hat[3, 0, 1] = .4
    Y_hat[3, 0, 2] = .3
    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (2 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    Y_mask[30:] = 0
    cost = pseudo_cost.eval({y_hat: Y_hat, y: Y,
                             y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    pseudo_grad = T.grad(ctc_cost.pseudo_cost(y, y_hat,
                                              y_mask, y_hat_mask).sum(),
                         y_hat)
    #test_grad2 = pseudo_grad.eval({y_hat: Y_hat, y: Y,
    #                               y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    # TODO: write some more meaningful asserts here
    assert cost.sum() > 0
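    # A few cheap extra checks one could add for the TODO above (hedged
    # suggestions, not part of the original test):
    # assert np.all(np.isfinite(cost))
    # assert np.all(np.isfinite(test_grad2))  # once the eval above is re-enabled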
Example #3
    def setup(self):
        # setup Lasagne Recurrent network
        # The output from the network is shape
        #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
        #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
        l_inp = InputLayer(shape=(self.num_batch, self.input_seq_len, self.num_inputs))
        l_mask = InputLayer(shape=(self.num_batch, self.input_seq_len))
        l_emb = EmbeddingLayer(l_inp, input_size=self.num_inputs, output_size=self.num_features)

        l_rnn = LSTMLayer(l_inp, num_units=self.num_units, peepholes=True, mask_input=l_mask)

        l_rnn_shp = ReshapeLayer(l_rnn, shape=(-1, self.num_units))
        l_out = DenseLayer(l_rnn_shp, num_units=self.num_outputs, nonlinearity=identity)
        l_out_shp = ReshapeLayer(l_out, shape=(-1, self.input_seq_len, self.num_outputs))

        # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
        #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

        l_out_softmax = NonlinearityLayer(l_out, nonlinearity=softmax)
        l_out_softmax_shp = ReshapeLayer(l_out_softmax, shape=(-1, self.input_seq_len, self.num_outputs))

        # calculate grad and cost
        output_lin_ctc = get_output(l_out_shp, {l_inp: self.x, l_mask: self.mask_x})
        output_softmax = get_output(l_out_softmax_shp, {l_inp: self.x, l_mask: self.mask_x})

        all_params = get_all_params(l_out_softmax_shp, trainable=True)  # don't learn the embedding layer

        # the CTC cross entropy between y and linear output network
        pseudo_cost = ctc_cost.pseudo_cost(self.y, output_lin_ctc, self.mask_y, self.mask_x)

        # calculate the gradients of the CTC cost wrt. the linear output of the network
        pseudo_grad = T.grad(pseudo_cost.sum() / self.num_batch, all_params)
        true_cost = ctc_cost.cost(self.y, output_softmax, self.mask_y, self.mask_x)
        cost = T.mean(true_cost)

        shared_lr = theano.shared(lasagne.utils.floatX(0.001))
        #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=shared_lr)
        #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
        updates = lasagne.updates.rmsprop(pseudo_grad, all_params, learning_rate=shared_lr)

        self.train = theano.function([self.x, self.mask_x, self.y, self.mask_y],
                                     [output_softmax, cost], updates=updates)
        self.test = theano.function([self.x, self.mask_x], [output_softmax])
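
        # Hedged usage sketch (batch-array names are illustrative, not from the original):
        # out_softmax, batch_cost = self.train(x_batch, x_mask_batch, y_batch, y_mask_batch)
        # (out_softmax,) = self.test(x_batch, x_mask_batch)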
Example #4
    def setup(self):
        # setup Lasagne Recurrent network
        # The output from the network is shape
        #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
        #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
        l_inp = InputLayer(shape=(self.num_batch, self.input_seq_len,
                                  self.num_inputs))
        l_mask = InputLayer(shape=(self.num_batch, self.input_seq_len))
        l_emb = EmbeddingLayer(l_inp,
                               input_size=self.num_inputs,
                               output_size=self.num_features)

        l_rnn = LSTMLayer(l_inp,
                          num_units=self.num_units,
                          peepholes=True,
                          mask_input=l_mask)

        l_rnn_shp = ReshapeLayer(l_rnn, shape=(-1, self.num_units))
        l_out = DenseLayer(l_rnn_shp,
                           num_units=self.num_outputs,
                           nonlinearity=identity)
        l_out_shp = ReshapeLayer(l_out,
                                 shape=(-1, self.input_seq_len,
                                        self.num_outputs))

        # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
        #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

        l_out_softmax = NonlinearityLayer(l_out, nonlinearity=softmax)
        l_out_softmax_shp = ReshapeLayer(l_out_softmax,
                                         shape=(-1, self.input_seq_len,
                                                self.num_outputs))

        # calculate grad and cost
        output_lin_ctc = get_output(l_out_shp, {
            l_inp: self.x,
            l_mask: self.mask_x
        })
        output_softmax = get_output(l_out_softmax_shp, {
            l_inp: self.x,
            l_mask: self.mask_x
        })

        all_params = get_all_params(
            l_out_softmax_shp, trainable=True)  # don't learn the embedding layer

        # the CTC cross entropy between y and linear output network
        pseudo_cost = ctc_cost.pseudo_cost(self.y, output_lin_ctc, self.mask_y,
                                           self.mask_x)

        # calculate the gradients of the CTC cost wrt. the linear output of the network
        pseudo_grad = T.grad(pseudo_cost.sum() / self.num_batch, all_params)
        true_cost = ctc_cost.cost(self.y, output_softmax, self.mask_y,
                                  self.mask_x)
        cost = T.mean(true_cost)

        shared_lr = theano.shared(lasagne.utils.floatX(0.001))
        #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=shared_lr)
        #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
        updates = lasagne.updates.rmsprop(pseudo_grad,
                                          all_params,
                                          learning_rate=shared_lr)

        self.train = theano.function(
            [self.x, self.mask_x, self.y, self.mask_y], [output_softmax, cost],
            updates=updates)
        self.test = theano.function([self.x, self.mask_x], [output_softmax])
Example #5
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (
        LSTMLayer,
        InputLayer,
        DenseLayer,
        NonlinearityLayer,
        ReshapeLayer,
        EmbeddingLayer,
        RecurrentLayer,
    )
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 1, 12
    num_classes = 5
    target_seq_len = 3
    num_rnn_units = 50

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype="int64")
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (5 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype("int32")

    y = T.imatrix("phonemes")
    x = T.imatrix()  # batchsize, input_seq_len, features

    print "num_batch =", num_batch, "input_seq_len =", input_seq_len
    print "num_classes =", num_classes
    # setup Lasagne Recurrent network
    # The output from the network is shape
    #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
    #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    netshape = lasagne.layers.get_output_shape(l_inp)
    print ("Layer l_inp shape:")
    print (netshape)
    l_emb = EmbeddingLayer(
        l_inp, input_size=num_classes + 1, output_size=num_classes + 1, W=np.identity(num_classes + 1).astype("float32")
    )
    netshape = lasagne.layers.get_output_shape(l_emb)
    print ("Layer l_emb shape:")
    print (netshape)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units)
    netshape = lasagne.layers.get_output_shape(l_rnn)
    print ("Layer l_rnn shape:")
    print (netshape)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    netshape = lasagne.layers.get_output_shape(l_rnn_shp)
    print ("Layer l_rnn_shp shape:")
    print (netshape)
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1, nonlinearity=lasagne.nonlinearities.identity)  # + blank

    netshape = lasagne.layers.get_output_shape(l_out)
    print ("Layer l_out shape:")
    print (netshape)
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_shp)
    print ("Layer l_out_shp shape:")
    print (netshape)

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    # l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(l_out, nonlinearity=lasagne.nonlinearities.softmax)
    netshape = lasagne.layers.get_output_shape(l_out_softmax)
    print ("Layer l_out_softmax shape:")
    print (netshape)
    l_out_softmax_shp = ReshapeLayer(l_out_softmax, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_softmax_shp)
    print ("Layer l_out_softmax_shp shape:")
    print (netshape)

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print "x type:", type(x)
    print "x shape", x.shape
    print "y type:", type(y)
    print "y shape", y.shape

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and linear output network
    # (num_batch,t,class+1)
    # output_lin_ctc shape (1,12,6)
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # calculate the gradients of the CTC cost wrt. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    # x shape (1,12)
    # y shape (1,3)

    train = theano.function([x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost], updates=updates)

    # Create test dataset
    num_samples = 10
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
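    # e.g. with num_classes = 5 and target_seq_len = 3, one sample could be
    #   input  [2, 2, 2, 5, 0, 0, 0, 5, 4, 4, 4, 5]   (5 is the blank/separator)
    #   target [2, 0, 4]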
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output

    input_arr = np.concatenate([input_lst]).astype("int32")
    y_arr = np.concatenate([output_lst]).astype("int32")

    print "y_arr shape:", y_arr.shape

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype="float32")
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype="float32")

    for nn in range(1000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch : (i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(input_arr[idx], y_arr[idx])
            print "x=", input_arr[idx]
            # x shape (1,12)
            print "x shape", input_arr[idx].shape
            print "y=", y_arr[idx]
            # y shape (1,3)
            print "y shape", y_arr[idx].shape
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            # testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            # testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn + 1) % 20 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print "pred =", pred, "true =", true
Example #6
#recognizer = SpeechRecognizer(
#    num_features=num_features, dims_bottom=[],
#    dims_bidir=conf.dims_transition, dims_top=[num_classes],
#    bidir_trans=GatedRecurrent, bottom_activation=None)

# ******************* output *******************
y_hat = recognizer.apply(x, x_m)
y_hat.name = 'outputs'
y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim=y_hat.ndim - 2)
y_hat_softmax.name = 'outputs_softmax'

# there are two cost functions, one for training and one for monitoring: the training
# cost is more stable for computing gradients and seems to be more memory efficient,
# but it does not compute the true cost.
if conf.task == 'CTC':
    cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean()
    cost_train.name = "cost_train"

    cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean()
    cost_monitor.name = "cost_monitor"
elif conf.task == 'framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name = 'cost'
    cost_monitor = cost_train
else:
    raise ValueError(conf.task)

recognizer.initialize()
cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m])

weights = VariableFilter(roles=[WEIGHT])(cg.variables)
Example #7
def main(paramFile="",num_epochs=5):
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    #y = T.matrix()
    label = T.matrix()
    blank_symbol = T.scalar()

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    network,ctcout = build_cnn(input_var)
    
    #jin
    if paramFile=="":
		print("Train a new network!")
    else:
		print("Load well trained parameters from "+paramFile)
		f = file(paramFile,'rb')
		params = cPickle.load(f)
		f.close()
		lasagne.layers.set_all_param_values(network,params)
	
    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize:
    y = lasagne.layers.get_output(ctcout)
    ctc_cost = CTC.pseudo_cost(label,y)
    params = lasagne.layers.get_all_params(ctcout, trainable=True)
    pseudo_cost_grad = T.grad(ctc_cost.sum(),params)
    updates = lasagne.updates.nesterov_momentum(
            pseudo_cost_grad, params, learning_rate=0.0001, momentum=0.9)
    train_fn = theano.function([input_var, label], ctc_cost, updates=updates,allow_input_downcast=True)

    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    #jin
    # return numpy.ndarray
    train_out = T.argmax(test_prediction, axis=1)
    train_acc = T.mean(T.eq(train_out, target_var),
                      dtype=theano.config.floatX)
    train_label = theano.function([input_var,target_var],[train_out,train_acc,test_prediction])
    val_out = T.argmax(test_prediction, axis=1)
    val_label = theano.function([input_var],val_out)
    
    # We iterate over epochs:
    #jin
    # train set and validation set
    dirpath = os.getcwd()
    print('dirpath = '+dirpath)
    train_dirpath = dirpath + '/train'
    test_dirpath = dirpath + '/test'
    total = len(os.listdir(train_dirpath)) / 2
    train_total_num = int(0.9 * total)
    validation_total_num = total - train_total_num
    print('Train num = ' + str(train_total_num))
    print('Validation num = '+str(validation_total_num))
    blank_symbol_num = 39  # index used as the CTC blank symbol (presumably the highest phoneme class index)

    for epoch in range(num_epochs):
        # change current directory
        os.chdir(train_dirpath)
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        counter = 0
        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in loadArray(train_dirpath):
            inputs, targets, batchNum = batch
            print('spectro shape:')
            print(inputs.shape)
            print('label shape:')
            print(targets.shape)
            label_without_blank = PER.phn2targetseq(targets,blank_symbol_num)
            #label_without_blank = label_without_blank[0,:]
            print('noblanklabel shape = '+str(label_without_blank.shape))
            counter += 1
            if counter < train_total_num:
                train_batches += batchNum
                # trainwrd = predicted output frames
                # wrd = predicted output phonemes
                trainwrd, acc, yy = train_label(inputs,targets)
                print("y shape = "+str(yy.shape))
                ctc_loss = train_fn(inputs, label_without_blank)
                train_err += ctc_loss
                #ctc_loss = ctc_fn(yy, label_without_blank, blank_symbol_num)
                print('ctc loss = '+str(ctc_loss))
                print('train acc = '+str(acc))
                wrd = PER.phn2word(trainwrd)
                print('train output word=')
                print(wrd)
                labelphn = PER.phn2word(targets)
                print('labelphn=')
                print(labelphn)
                
                print('  Train set completed : '+str(float(counter)/train_total_num*100))
            else:
                err, acc = val_fn(inputs, targets)
                val_err += err * batchNum
                val_acc += acc * batchNum
                val_batches += batchNum
                # valwrd = predicted output frames
                # wrd = predicted output phonemes
                valwrd = val_label(inputs)
                print('test acc = '+str(acc))
                print('test output word=')
                valwrd = PER.phn2word(valwrd)
                print(valwrd)
                labelphn = PER.phn2word(targets)
                print('labelphn=')
                print(labelphn)
                
                print('  Validation set completed : '+str(float(counter-train_total_num)/validation_total_num*100))

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))
        
        # change current directory
        os.chdir(dirpath)
        # store parameters
        print("  should store epoch {}".format(epoch+1))
        pythonName,suffix = os.path.splitext(__file__)
        param2store = lasagne.layers.get_all_param_values(network)
        storename = pythonName+"_"+str((epoch+1))+"_accu="+str(val_acc / val_batches * 100)+".save"
        with open(storename, 'wb') as f:
            cPickle.dump(param2store,f)
	
    # change current directory
    os.chdir(test_dirpath)
    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in loadArray(test_dirpath):
        inputs, targets, batchNum = batch
        err, acc = val_fn(inputs, targets)
        test_err += err*batchNum
        test_acc += acc*batchNum
        test_batches += batchNum
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
Example #8
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import LSTMLayer, InputLayer, DenseLayer,\
        NonlinearityLayer, ReshapeLayer, EmbeddingLayer, RecurrentLayer
    import theano
    import theano.tensor as T
    import numpy as np
    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50

    input_seq_len += target_seq_len

    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)),
                       dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (10 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()  # batchsize, input_seq_len, features

    # setup Lasagne Recurrent network
    # The output from the network is shape
    #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
    #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp,
                           input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb,
                      num_units=num_rnn_units,
                      peepholes=True,
                      W_in_to_ingate=ini,
                      W_hid_to_ingate=ini,
                      b_ingate=zero,
                      W_in_to_forgetgate=ini,
                      W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini,
                      W_hid_to_cell=ini,
                      b_cell=zero,
                      W_in_to_outgate=ini,
                      W_hid_to_outgate=ini,
                      b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp,
                       num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank

    l_out_shp = ReshapeLayer(l_out,
                             (num_batch, input_seq_len, num_classes + 1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes + 1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print all_params

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and linear output network
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)

    # calculate the gradients of the CTC cost wrt. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad,
                                      all_params,
                                      learning_rate=sh_lr)

    train = theano.function(
        [x, y], [output_lin_ctc, output_softmax, cost, pseudo_cost],
        updates=updates)

    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output

    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn + 1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print pred, true
Example #9
#recognizer = SpeechRecognizer(
#    num_features=num_features, dims_bottom=[],
#    dims_bidir=conf.dims_transition, dims_top=[num_classes],
#    bidir_trans=GatedRecurrent, bottom_activation=None)


# ******************* output *******************
y_hat = recognizer.apply(x,x_m)
y_hat.name = 'outputs'
y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim = y_hat.ndim - 2)
y_hat_softmax.name = 'outputs_softmax'

# there are two cost functions, one for training and one for monitoring: the training
# cost is more stable for computing gradients and seems to be more memory efficient,
# but it does not compute the true cost.
if conf.task=='CTC':
    cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean()
    cost_train.name = "cost_train"
    
    cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean()
    cost_monitor.name = "cost_monitor"
elif conf.task=='framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name='cost'
    cost_monitor = cost_train
else:
    raise ValueError(conf.task)


recognizer.initialize()
cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m])
Example #10
l_out_softmax = NonlinearityLayer(l_out, nonlinearity=soft)
l_out_softmax_shp = ReshapeLayer(l_out_softmax, (batchsize, seqlen, num_classes))

output_lin_ctc = L.get_output(l_out_shp)
network_output = L.get_output(l_out_softmax_shp)
all_params = L.get_all_params(l_rnn_2, trainable=True)

# ## Costs, Gradients & Training Functions

# Cost functions
target_values = T.imatrix('target_output')
input_values = T.imatrix()

### Gradients ###
# pseudo cost - CTC cross entropy between targets and the linear output - used in training
pseudo_cost = ctc_cost.pseudo_cost(target_values, output_lin_ctc)
pseudo_cost_grad = T.grad(pseudo_cost.sum() / batchsize, all_params)
pseudo_cost = pseudo_cost.mean()

# true costs
cost = ctc_cost.cost(target_values, network_output)
cost = cost.mean()

# Compute SGD updates for training
print("Computing updates ...")
updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, LEARNING_RATE)

# Theano functions for training and computing cost
print("Compiling functions ...")
train = theano.function(
        [l_in.input_var, target_values], [cost, pseudo_cost, network_output], updates=updates)
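
# Hedged usage sketch (batch arrays are illustrative, not defined in this snippet):
# cost_val, pseudo_cost_val, net_out = train(x_batch, y_batch)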
Example #11
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import LSTMLayer, InputLayer, DenseLayer,\
        NonlinearityLayer, ReshapeLayer, EmbeddingLayer, RecurrentLayer
    import theano
    import theano.tensor as T
    import numpy as np
    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50

    input_seq_len += target_seq_len
    def print_pred(y_hat):
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-"*target_seq_len

    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (10 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random(
        (num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()   # batchsize, input_seq_len, features



    # setup Lasagne Recurrent network
    # The output from the network is shape
    #  a) output_lin_ctc is the activation before softmax  (input_seq_len, batch_size, num_classes + 1)
    #  b) output_softmax is the output after softmax  (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp,
                           input_size=num_classes+1,
                           output_size=num_classes+1,
                           W=np.identity(num_classes+1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb,
                      num_units=num_rnn_units,
                      peepholes=True,
                      W_in_to_ingate=ini,
                      W_hid_to_ingate=ini,
                      b_ingate=zero,
                      W_in_to_forgetgate=ini,
                      W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini,
                      W_hid_to_cell=ini,
                      b_cell=zero,
                      W_in_to_outgate=ini,
                      W_hid_to_outgate=ini,
                      b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch*input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes+1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank

    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes+1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes+1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print all_params

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and linear output network
    pseudo_cost = ctc_cost.pseudo_cost(
        y, output_lin_ctc)

    # calculate the gradients of the CTC cost wrt. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, learning_rate=sh_lr)

    train = theano.function([x, y],
                            [output_lin_ctc, output_softmax, cost, pseudo_cost],
                            updates=updates)


    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class]*3 + [num_classes]
            this_output += [this_class]

        this_input += (input_seq_len - len(this_input))*[this_input[-1]]

        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output

    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples//num_batch):
            idx = shuffle[i*num_batch:(i+1)*num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx],
                y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)

        if (nn+1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr

        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn+1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len-len(pred)) * " "
                print pred, true