def test_ctc_pseudo_cost_skip_softmax_stability():
    LENGTH = 500
    BATCHES = 40
    CLASSES = 2
    N_LABELS = 45
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask,
                                       skip_softmax=True)

    Y_hat = np.asarray(np.random.normal(0, 1, (LENGTH, BATCHES, CLASSES + 1)),
                       dtype=floatX)
    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (2 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    Y_mask[30:] = 0

    # gradient wrt. the linear activations, softmax applied internally
    pseudo_grad = T.grad(pseudo_cost.sum(), y_hat)
    test_grad = pseudo_grad.eval({y_hat: Y_hat, y: Y,
                                  y_hat_mask: Y_hat_mask, y_mask: Y_mask})

    # same gradient, but with the softmax applied explicitly outside the cost
    y_hat_softmax = T.exp(y_hat) / T.exp(y_hat).sum(2)[:, :, None]
    pseudo_cost2 = ctc_cost.pseudo_cost(y, y_hat_softmax, y_mask, y_hat_mask,
                                        skip_softmax=False)
    pseudo_grad2 = T.grad(pseudo_cost2.sum(), y_hat)
    test_grad2 = pseudo_grad2.eval({y_hat: Y_hat, y: Y,
                                    y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    testing.assert_almost_equal(test_grad, test_grad2, decimal=4)
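The stability the test targets comes from not taking a log of an explicitly computed softmax inside the CTC recursions. The stand-alone NumPy sketch below (not part of the test suite) illustrates the underlying numerical issue, assuming the skip_softmax path works with a log-softmax in log-sum-exp form rather than log(exp(a) / sum(exp(a))).

# Minimal sketch, assuming the internal path uses a log-sum-exp style log-softmax.
import numpy as np

a = np.array([1000.0, 0.0, -1000.0])                 # extreme pre-softmax activations

naive = np.log(np.exp(a) / np.exp(a).sum())          # exp(1000) overflows -> nan / -inf
stable = a - (a.max() + np.log(np.exp(a - a.max()).sum()))   # log-softmax via log-sum-exp

print(naive)     # roughly [nan, -inf, -inf], with overflow warnings
print(stable)    # roughly [0., -1000., -2000.], all finite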
def test_ctc_pseudo_cost():
    LENGTH = 500
    BATCHES = 40
    CLASSES = 2
    N_LABELS = 45
    y_hat = T.tensor3('features')
    input_mask = T.matrix('features_mask')
    y_hat_mask = input_mask
    y = T.lmatrix('phonemes')
    y_mask = T.matrix('phonemes_mask')
    pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask)

    Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX)
    Y_hat[:, :, 0] = .75
    Y_hat[:, :, 1] = .2
    Y_hat[:, :, 2] = .05
    Y_hat[3, 0, 0] = .3
    Y_hat[3, 0, 1] = .4
    Y_hat[3, 0, 2] = .3
    Y = np.zeros((N_LABELS, BATCHES), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (2 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    Y_mask[30:] = 0

    cost = pseudo_cost.eval({y_hat: Y_hat, y: Y,
                             y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    pseudo_grad = T.grad(
        ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask).sum(), y_hat)
    #test_grad2 = pseudo_grad.eval({y_hat: Y_hat, y: Y,
    #                               y_hat_mask: Y_hat_mask, y_mask: Y_mask})
    # TODO: write some more meaningful asserts here
    assert cost.sum() > 0
def setup(self):
    # setup Lasagne recurrent network
    # The output from the network has shape
    # a) output_lin_ctc: the activation before softmax,
    #    (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax: the output after softmax,
    #    (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer(shape=(self.num_batch, self.input_seq_len,
                              self.num_inputs))
    l_mask = InputLayer(shape=(self.num_batch, self.input_seq_len))
    # note: l_emb is defined here but not connected to the rest of the network
    l_emb = EmbeddingLayer(l_inp, input_size=self.num_inputs,
                           output_size=self.num_features)
    l_rnn = LSTMLayer(l_inp, num_units=self.num_units, peepholes=True,
                      mask_input=l_mask)
    l_rnn_shp = ReshapeLayer(l_rnn, shape=(-1, self.num_units))
    l_out = DenseLayer(l_rnn_shp, num_units=self.num_outputs,
                       nonlinearity=identity)
    l_out_shp = ReshapeLayer(l_out, shape=(-1, self.input_seq_len,
                                           self.num_outputs))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(l_out, nonlinearity=softmax)
    l_out_softmax_shp = ReshapeLayer(l_out_softmax,
                                     shape=(-1, self.input_seq_len,
                                            self.num_outputs))

    # calculate grad and cost
    output_lin_ctc = get_output(l_out_shp,
                                {l_inp: self.x, l_mask: self.mask_x})
    output_softmax = get_output(l_out_softmax_shp,
                                {l_inp: self.x, l_mask: self.mask_x})
    all_params = get_all_params(l_out_softmax_shp,
                                trainable=True)  # don't learn the embedding layer

    # the CTC cross entropy between y and the linear output of the network
    pseudo_cost = ctc_cost.pseudo_cost(self.y, output_lin_ctc,
                                       self.mask_y, self.mask_x)
    # gradients of the CTC cost wrt. the linear output of the network
    pseudo_grad = T.grad(pseudo_cost.sum() / self.num_batch, all_params)
    true_cost = ctc_cost.cost(self.y, output_softmax,
                              self.mask_y, self.mask_x)
    cost = T.mean(true_cost)

    shared_lr = theano.shared(lasagne.utils.floatX(0.001))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=shared_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_grad, all_params,
                                      learning_rate=shared_lr)

    self.train = theano.function([self.x, self.mask_x, self.y, self.mask_y],
                                 [output_softmax, cost], updates=updates)
    self.test = theano.function([self.x, self.mask_x], [output_softmax])
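The reshape pattern above (flatten to (batch*time, units), apply one DenseLayer per timestep, reshape back) can be checked with a tiny NumPy sketch; this is an illustration only and uses stand-in arrays, not the layers defined in setup().

# Illustration of the (batch, time, units) -> (batch*time, units) -> dense -> reshape pattern.
import numpy as np

batch, time, units, classes = 2, 3, 4, 5
h = np.random.randn(batch, time, units)             # stand-in for the LSTM output
W = np.random.randn(units, classes)                 # stand-in for the dense weights

flat = h.reshape(-1, units)                         # (batch*time, units)
logits = flat.dot(W).reshape(batch, time, classes)  # back to (batch, time, classes)

# same result as applying the dense weights timestep by timestep
direct = np.einsum('btu,uc->btc', h, W)
assert np.allclose(logits, direct)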
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (LSTMLayer, InputLayer, DenseLayer,
                                NonlinearityLayer, ReshapeLayer,
                                EmbeddingLayer, RecurrentLayer)
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 1, 12
    num_classes = 5
    target_seq_len = 3
    num_rnn_units = 50

    def print_pred(y_hat):
        # greedy best-path decoding: collapse repeats, then drop blanks
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    # these arrays are not used further below
    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype="int64")
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (5 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype("int32")

    y = T.imatrix("phonemes")
    x = T.imatrix()   # batchsize, input_seq_len, features

    print "num_batch =", num_batch, "input_seq_len =", input_seq_len
    print "num_classes =", num_classes

    # setup Lasagne recurrent network
    # The output from the network has shape
    # a) output_lin_ctc: the activation before softmax,
    #    (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax: the output after softmax,
    #    (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    netshape = lasagne.layers.get_output_shape(l_inp)
    print("Layer l_inp shape:")
    print(netshape)
    l_emb = EmbeddingLayer(l_inp, input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype("float32"))
    netshape = lasagne.layers.get_output_shape(l_emb)
    print("Layer l_emb shape:")
    print(netshape)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units)
    netshape = lasagne.layers.get_output_shape(l_rnn)
    print("Layer l_rnn shape:")
    print(netshape)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    netshape = lasagne.layers.get_output_shape(l_rnn_shp)
    print("Layer l_rnn_shp shape:")
    print(netshape)
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank
    netshape = lasagne.layers.get_output_shape(l_out)
    print("Layer l_out shape:")
    print(netshape)
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_shp)
    print("Layer l_out_shp shape:")
    print(netshape)

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    netshape = lasagne.layers.get_output_shape(l_out_softmax)
    print("Layer l_out_softmax shape:")
    print(netshape)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes + 1))
    netshape = lasagne.layers.get_output_shape(l_out_softmax_shp)
    print("Layer l_out_softmax_shp shape:")
    print(netshape)

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer

    print "x type:", type(x)
    print "x shape", x.shape
    print "y type:", type(y)
    print "y shape", y.shape

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and the linear output of the network
    # output_lin_ctc shape: (num_batch, t, class + 1), here (1, 12, 6)
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)
    # gradients of the CTC cost wrt. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params,
                                      learning_rate=sh_lr)

    # x shape (1, 12)
    # y shape (1, 3)
    train = theano.function([x, y],
                            [output_lin_ctc, output_softmax, cost, pseudo_cost],
                            updates=updates)

    # Create test dataset
    num_samples = 10
    np.random.seed(1234)

    # create simple dataset of format
    # input   [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]
        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]
        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output
    input_arr = np.concatenate([input_lst]).astype("int32")
    y_arr = np.concatenate([output_lst]).astype("int32")
    print "y_arr shape:", y_arr.shape

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype="float32")
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype="float32")

    for nn in range(1000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            print "x=", input_arr[idx]           # x shape (1, 12)
            print "x shape", input_arr[idx].shape
            print "y=", y_arr[idx]               # y shape (1, 3)
            print "y shape", y_arr[idx].shape
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)
        if (nn + 1) % 20 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr
        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print "pred =", pred, "true =", true
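For reference, the greedy best-path decoding done by print_pred above can be restated as a stand-alone helper. This is an illustration only, not part of the original test.

# Collapse consecutive repeats, then drop the blank symbol.
def best_path_decode(frame_labels, blank):
    out = []
    for i, s in enumerate(frame_labels):
        if s != blank and (i == 0 or s != frame_labels[i - 1]):
            out.append(s)
    return out

# e.g. with blank = 5 (num_classes above):
print(best_path_decode([5, 2, 2, 5, 2, 3, 3, 5, 1, 1], blank=5))   # [2, 2, 3, 1]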
#recognizer = SpeechRecognizer(
#    num_features=num_features, dims_bottom=[],
#    dims_bidir=conf.dims_transition, dims_top=[num_classes],
#    bidir_trans=GatedRecurrent, bottom_activation=None)

# ******************* output *******************
y_hat = recognizer.apply(x, x_m)
y_hat.name = 'outputs'
y_hat_softmax = NDimensionalSoftmax().apply(y_hat, extra_ndim=y_hat.ndim - 2)
y_hat_softmax.name = 'outputs_softmax'

# There are separate cost functions for training and for monitoring: the training
# cost is more stable for computing gradients and also appears to be more memory
# efficient, but it does not compute the true CTC cost.
if conf.task == 'CTC':
    cost_train = ctc.pseudo_cost(y, y_hat, y_m, x_m).mean()
    cost_train.name = "cost_train"

    cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean()
    cost_monitor.name = "cost_monitor"
elif conf.task == 'framewise':
    cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m)
    cost_train.name = 'cost'
    cost_monitor = cost_train
else:
    raise ValueError(conf.task)

recognizer.initialize()
cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m])
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
def main(paramFile="", num_epochs=5):
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    #y = T.matrix()
    label = T.matrix()
    blank_symbol = T.scalar()

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    network, ctcout = build_cnn(input_var)

    #jin
    if paramFile == "":
        print("Train a new network!")
    else:
        print("Load well trained parameters from " + paramFile)
        f = file(paramFile, 'rb')
        params = cPickle.load(f)
        f.close()
        lasagne.layers.set_all_param_values(network, params)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize:
    y = lasagne.layers.get_output(ctcout)
    ctc_cost = CTC.pseudo_cost(label, y)
    params = lasagne.layers.get_all_params(ctcout, trainable=True)
    pseudo_cost_grad = T.grad(ctc_cost.sum(), params)
    updates = lasagne.updates.nesterov_momentum(
        pseudo_cost_grad, params, learning_rate=0.0001, momentum=0.9)
    train_fn = theano.function([input_var, label], ctc_cost,
                               updates=updates, allow_input_downcast=True)

    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")

    #jin
    # return numpy.ndarray
    train_out = T.argmax(test_prediction, axis=1)
    train_acc = T.mean(T.eq(train_out, target_var), dtype=theano.config.floatX)
    train_label = theano.function([input_var, target_var],
                                  [train_out, train_acc, test_prediction])
    val_out = T.argmax(test_prediction, axis=1)
    val_label = theano.function([input_var], val_out)

    # We iterate over epochs:
    #jin
    # train set and validation set
    dirpath = os.getcwd()
    print('dirpath = ' + dirpath)
    train_dirpath = dirpath + '/train'
    test_dirpath = dirpath + '/test'
    total = len(os.listdir(train_dirpath)) / 2
    train_total_num = int(0.9 * total)
    validation_total_num = total - train_total_num
    print('Train num = ' + str(train_total_num))
    print('Validation num = ' + str(validation_total_num))

    blank_symbol_num = 39

    for epoch in range(num_epochs):
        # change current directory
        os.chdir(train_dirpath)
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        counter = 0
        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0

        #for batch in loadArray(train_dirpath):
        for batch in loadArray(train_dirpath):
            inputs, targets, batchNum = batch
            print('spectro shape:')
            print(inputs.shape)
            print('label shape:')
            print(targets.shape)
            label_without_blank = PER.phn2targetseq(targets, blank_symbol_num)
            #label_without_blank = label_without_blank[0,:]
            print('noblanklabel shape = ' + str(label_without_blank.shape))
            counter += 1
            if counter < train_total_num:
                train_batches += batchNum
                # valwrd = predicted output frames
                # wrd = predicted output phonemes
                trainwrd, acc, yy = train_label(inputs, targets)
                print("y shape = " + str(yy.shape))
                ctc_loss = train_fn(inputs, label_without_blank)
                train_err += ctc_loss
                #ctc_loss = ctc_fn(yy, label_without_blank, blank_symbol_num)
                print('ctc loss = ' + str(ctc_loss))
                print('train acc = ' + str(acc))
                wrd = PER.phn2word(trainwrd)
                print('train output word=')
                print(wrd)
                labelphn = PER.phn2word(targets)
                print('labelphn=')
                print(labelphn)
                print(' Train set completed : '
                      + str(float(counter) / train_total_num * 100))
            else:
                err, acc = val_fn(inputs, targets)
                val_err += err * batchNum
                val_acc += acc * batchNum
                val_batches += batchNum
                # valwrd = predicted output frames
                # wrd = predicted output phonemes
                valwrd = val_label(inputs)
                print('test acc = ' + str(acc))
                print('test output word=')
                valwrd = PER.phn2word(valwrd)
                print(valwrd)
                labelphn = PER.phn2word(targets)
                print('labelphn=')
                print(labelphn)
                print(' Validation set completed : '
                      + str(float(counter - train_total_num)
                            / validation_total_num * 100))

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

        # change current directory
        os.chdir(dirpath)
        # store parameters
        print("  should store epoch {}".format(epoch + 1))
        pythonName, suffix = os.path.splitext(__file__)
        param2store = lasagne.layers.get_all_param_values(network)
        storename = (pythonName + "_" + str(epoch + 1) + "_accu="
                     + str(val_acc / val_batches * 100) + ".save")
        with file(storename, 'wb') as f:
            cPickle.dump(param2store, f)

    # change current directory
    os.chdir(test_dirpath)
    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in loadArray(test_dirpath):
        inputs, targets, batchNum = batch
        err, acc = val_fn(inputs, targets)
        test_err += err * batchNum
        test_acc += acc * batchNum
        test_batches += batchNum
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
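PER.phn2targetseq is not shown in this excerpt. Based on how label_without_blank is fed to train_fn, one plausible reading is that it collapses framewise phoneme labels into the blank-free CTC target sequence; the hypothetical stand-in below sketches that assumption and may well differ from the real helper.

import numpy as np

def framewise_to_targets(frame_labels, blank):
    # Hypothetical stand-in for PER.phn2targetseq: merge consecutive repeats
    # and drop the blank symbol to obtain a CTC target sequence.
    targets = []
    prev = None
    for s in frame_labels:
        if s != blank and s != prev:
            targets.append(s)
        prev = s
    return np.asarray(targets)[None, :]   # shape (1, target_len)

print(framewise_to_targets([3, 3, 39, 7, 7, 7, 39, 3], blank=39))   # [[3 7 3]]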
def test_lasagne_ctc():
    import lasagne
    from lasagne.layers import (LSTMLayer, InputLayer, DenseLayer,
                                NonlinearityLayer, ReshapeLayer,
                                EmbeddingLayer, RecurrentLayer)
    import theano
    import theano.tensor as T
    import numpy as np

    num_batch, input_seq_len = 10, 15
    num_classes = 10
    target_seq_len = 5
    num_rnn_units = 50
    input_seq_len += target_seq_len

    def print_pred(y_hat):
        # greedy best-path decoding: collapse repeats, then drop blanks
        blank_symbol = num_classes
        res = []
        for i, s in enumerate(y_hat):
            if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]):
                res += [s]
        if len(res) > 0:
            return "".join(map(str, list(res)))
        else:
            return "-" * target_seq_len

    # these arrays are not used further below
    Y_hat = np.asarray(np.random.normal(
        0, 1, (input_seq_len, num_batch, num_classes + 1)), dtype=floatX)
    Y = np.zeros((target_seq_len, num_batch), dtype='int64')
    Y[25:, :] = 1
    Y_hat_mask = np.ones((input_seq_len, num_batch), dtype=floatX)
    Y_hat_mask[-5:] = 0
    # default blank symbol is the highest class index (10 in this case)
    Y_mask = np.asarray(np.ones_like(Y), dtype=floatX)
    X = np.random.random((num_batch, input_seq_len)).astype('int32')

    y = T.imatrix('phonemes')
    x = T.imatrix()   # batchsize, input_seq_len, features

    # setup Lasagne recurrent network
    # The output from the network has shape
    # a) output_lin_ctc: the activation before softmax,
    #    (input_seq_len, batch_size, num_classes + 1)
    # b) output_softmax: the output after softmax,
    #    (batch_size, input_seq_len, num_classes + 1)
    l_inp = InputLayer((num_batch, input_seq_len))
    l_emb = EmbeddingLayer(l_inp, input_size=num_classes + 1,
                           output_size=num_classes + 1,
                           W=np.identity(num_classes + 1).astype('float32'))
    ini = lasagne.init.Uniform(0.1)
    zero = lasagne.init.Constant(0.0)
    cell = lasagne.init.Uniform(0.1)
    l_rnn = LSTMLayer(l_emb, num_units=num_rnn_units, peepholes=True,
                      W_in_to_ingate=ini, W_hid_to_ingate=ini, b_ingate=zero,
                      W_in_to_forgetgate=ini, W_hid_to_forgetgate=ini,
                      b_forgetgate=zero,
                      W_in_to_cell=ini, W_hid_to_cell=ini, b_cell=zero,
                      W_in_to_outgate=ini, W_hid_to_outgate=ini, b_outgate=zero,
                      cell_init=lasagne.init.Constant(0.),
                      hid_init=lasagne.init.Constant(0.),
                      W_cell_to_forgetgate=cell,
                      W_cell_to_ingate=cell,
                      W_cell_to_outgate=cell)
    l_rnn_shp = ReshapeLayer(l_rnn, (num_batch * input_seq_len, num_rnn_units))
    l_out = DenseLayer(l_rnn_shp, num_units=num_classes + 1,
                       nonlinearity=lasagne.nonlinearities.identity)  # + blank
    l_out_shp = ReshapeLayer(l_out, (num_batch, input_seq_len, num_classes + 1))

    # dimshuffle to shape format (input_seq_len, batch_size, num_classes + 1)
    #l_out_shp_ctc = lasagne.layers.DimshuffleLayer(l_out_shp, (1, 0, 2))

    l_out_softmax = NonlinearityLayer(
        l_out, nonlinearity=lasagne.nonlinearities.softmax)
    l_out_softmax_shp = ReshapeLayer(
        l_out_softmax, (num_batch, input_seq_len, num_classes + 1))

    output_lin_ctc = lasagne.layers.get_output(l_out_shp, x)
    output_softmax = lasagne.layers.get_output(l_out_softmax_shp, x)
    all_params = l_rnn.get_params(trainable=True)  # don't learn the embedding layer
    print all_params

    ###############
    #  GRADIENTS  #
    ###############

    # the CTC cross entropy between y and the linear output of the network
    pseudo_cost = ctc_cost.pseudo_cost(y, output_lin_ctc)
    # gradients of the CTC cost wrt. the linear output of the network
    pseudo_cost_grad = T.grad(pseudo_cost.sum() / num_batch, all_params)
    true_cost = ctc_cost.cost(y, output_softmax)
    cost = T.mean(true_cost)

    sh_lr = theano.shared(lasagne.utils.floatX(0.01))
    #updates = lasagne.updates.sgd(pseudo_cost_grad, all_params, learning_rate=sh_lr)
    #updates = lasagne.updates.apply_nesterov_momentum(updates, all_params, momentum=0.9)
    updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params,
                                      learning_rate=sh_lr)
    train = theano.function([x, y],
                            [output_lin_ctc, output_softmax, cost, pseudo_cost],
                            updates=updates)

    # Create test dataset
    num_samples = 1000
    np.random.seed(1234)

    # create simple dataset of format
    # input   [5,5,5,5,5,2,2,2,2,2,3,3,3,3,3,....,1,1,1,1]
    # targets [5,2,3,...,1]
    # etc...
    input_lst, output_lst = [], []
    for i in range(num_samples):
        this_input = []
        this_output = []
        for j in range(target_seq_len):
            this_class = np.random.randint(num_classes)
            this_input += [this_class] * 3 + [num_classes]
            this_output += [this_class]
        this_input += (input_seq_len - len(this_input)) * [this_input[-1]]
        input_lst.append(this_input)
        output_lst.append(this_output)
        print this_input, this_output
    input_arr = np.concatenate([input_lst]).astype('int32')
    y_arr = np.concatenate([output_lst]).astype('int32')

    y_mask_arr = np.ones((num_batch, target_seq_len), dtype='float32')
    input_mask_arr = np.ones((num_batch, input_seq_len), dtype='float32')

    for nn in range(10000):
        cost_lst = []
        shuffle = np.random.permutation(num_samples)
        for i in range(num_samples // num_batch):
            idx = shuffle[i * num_batch:(i + 1) * num_batch]
            _, output_softmax_val, cost, pseudo_cost_val = train(
                input_arr[idx], y_arr[idx])
            output_softmax_lst = output_softmax_val
            labels_lst = y_arr[idx]
            cost_lst += [cost]
            #testing.assert_almost_equal(pseudo_cost, pseudo_cost_old, decimal=4)
            #testing.assert_array_almost_equal(pseudo_cost_val, pseudo_cost_old_val)
        if (nn + 1) % 200 == 0:
            DECAY = 1.5
            new_lr = lasagne.utils.floatX(sh_lr.get_value() / DECAY)
            sh_lr.set_value(new_lr)
            print "----------------------->NEW LR:", new_lr
        print nn, "Mean cost:", np.mean(cost_lst)
        if (nn + 1) % 4 == 0:
            for jj in range(num_batch):
                pred = print_pred(np.argmax(output_softmax_val[jj], axis=-1))
                true = "".join(map(str, labels_lst[jj]))
                pred += (target_seq_len - len(pred)) * " "
                print pred, true
l_out_softmax = NonlinearityLayer(l_out, nonlinearity=soft)
l_out_softmax_shp = ReshapeLayer(l_out_softmax,
                                 (batchsize, seqlen, num_classes))

output_lin_ctc = L.get_output(l_out_shp)
network_output = L.get_output(l_out_softmax_shp)
all_params = L.get_all_params(l_rnn_2, trainable=True)

# ## Costs, Gradients & Training Functions

# Cost functions
target_values = T.imatrix('target_output')
input_values = T.imatrix()

### Gradients ###
# pseudo cost: the CTC cross entropy between the targets and the linear
# output of the network - used for training
pseudo_cost = ctc_cost.pseudo_cost(target_values, output_lin_ctc)
pseudo_cost_grad = T.grad(pseudo_cost.sum() / batchsize, all_params)
pseudo_cost = pseudo_cost.mean()

# true cost - used for monitoring
cost = ctc_cost.cost(target_values, network_output)
cost = cost.mean()

# Compute SGD updates for training
print("Computing updates ...")
updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, LEARNING_RATE)

# Theano functions for training and computing cost
print("Compiling functions ...")
train = theano.function(
    [l_in.input_var, target_values],
    [cost, pseudo_cost, network_output],
    updates=updates)
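A hedged usage sketch of the compiled train function above. It assumes l_in.input_var is an int32 matrix of shape (batchsize, seqlen), consistent with input_values = T.imatrix(), that targets are class indices excluding the blank, and that the blank is the last class index; the real shapes and sizes depend on layers and constants defined outside this excerpt.

import numpy as np

batchsize_, seqlen_, target_len_ = 4, 20, 5   # illustrative sizes only
x_batch = np.random.randint(0, num_classes - 1,
                            (batchsize_, seqlen_)).astype('int32')
y_batch = np.random.randint(0, num_classes - 1,
                            (batchsize_, target_len_)).astype('int32')

cost_val, pseudo_cost_val, probs = train(x_batch, y_batch)
print(probs.shape)   # (batchsize_, seqlen_, num_classes) per the reshape above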