def get_predictions(index, partition, input_to_softmax, model_path,
                    spectrogram_features=True):
    """ Return a model's predicted probability matrix and the ground truth
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    Returns:
        The predicted probability matrix (2D), the true transcription,
        and the audio path
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=spectrogram_features)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    # obtain the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))

    return (prediction[0], transcr, audio_path)
def get_predictions(index, partition, input_to_softmax, model_path):
    ''' Get the model's decoded predictions to calculate metrics '''
    # load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'valid':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "valid"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()

    label = transcr
    predicted = ''.join(int_sequence_to_text(pred_ints))
    return label, predicted
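# Hypothetical usage sketch (not from the original source): average the word
# error rate over the first few validation examples using the (label,
# predicted) pair returned by the get_predictions variant above. Assumes the
# same `wer` helper used elsewhere in this file; the weights path is an
# illustrative placeholder.
def evaluate_wer(model, model_path='results/model.h5', num_examples=10):
    total = 0.0
    for i in range(num_examples):
        label, predicted = get_predictions(index=i, partition='valid',
                                           input_to_softmax=model,
                                           model_path=model_path)
        total += wer(label, predicted)
    return total / num_examples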
def compare_predictions(index, partition, inputs_to_softmax=[],
                        model_paths=[], phn=False):
    """ Print several models' decoded predictions for the same example
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        inputs_to_softmax (list of Model): The acoustic models
        model_paths (list of str): Paths to the saved acoustic models' weights
        phn (bool): Use phoneme transcriptions instead of word transcriptions
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode each acoustic model's predictions
    pred_ints = []
    for model_path, input_to_softmax in zip(model_paths, inputs_to_softmax):
        input_to_softmax.load_weights(model_path)
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_int = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                    + 1).flatten().tolist()
        pred_ints.append(pred_int)

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    for i, pred_int in enumerate(pred_ints, start=1):
        print('Predicted transcription number', i, ':\n' + '\n'
              + ''.join(int_sequence_to_text(pred_int, phn)))
        print('-' * 80)
def predict_test(input_to_softmax, model_path, audio_range=100000):
    ''' Predict the testing set and write the results to disk.
    audio_range defaults to 100000; indices past the end of the test set
    are simply never reached.
    '''
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    transcr = data_gen.test_texts
    audio_path = data_gen.test_audio_paths
    input_to_softmax.load_weights(model_path)

    predictions = []
    try:
        for i in range(min(audio_range, len(audio_path))):
            data_point = data_gen.normalize(data_gen.featurize(audio_path[i]))
            prediction = input_to_softmax.predict(
                np.expand_dims(data_point, axis=0))
            output_length = [
                input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(
                prediction, output_length)[0][0]) + 1).flatten().tolist()
            pred = ''.join(int_sequence_to_text(pred_ints))
            predictions.append(pred)
    except Exception:
        # keep whatever was decoded before the failure
        pass

    predictions = ''.join(predictions)
    transcr = ''.join(transcr[:10])

    with open("predictions/predictions.txt", "w") as output:
        output.write(str(predictions))
    with open("predictions/truescr.txt", "w") as output:
        output.write(str(transcr))
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training and validation data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch and validation_steps
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax; the loss is
    # computed inside the model, so compile with a dummy lambda
    model = add_ctc_loss(input_to_softmax)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path,
                                   verbose=0)
    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)
    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
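# Hypothetical usage sketch: train an acoustic model with the train_model
# variant above. The import path follows the sample_models.py reference
# elsewhere in this file, and the cnn_rnn_model settings mirror the example
# near the end; the output file names are illustrative, not from the source.
from sample_models import cnn_rnn_model

model = cnn_rnn_model(input_dim=161, filters=200, kernel_size=11,
                      conv_stride=2, conv_border_mode='valid', units=200)
train_model(input_to_softmax=model,
            pickle_path='model_1.pickle',
            save_model_path='model_1.h5',
            spectrogram=True)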
def get_predictions(index, partition, trained_model, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        trained_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features from the dataset
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    print("Trained model output length:\n"
          + str(trained_model.output_length(data_point.shape[0])))

    # obtain and decode the acoustic model's predictions
    trained_model.load_weights(model_path)
    prediction = trained_model.predict(np.expand_dims(data_point, axis=0))
    output_length = [trained_model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    transcription = ''.join(int_sequence_to_text(pred_ints))

    # correction using the KenLM language model toolkit
    corrected_transcription = correction(transcription)

    print('-' * 80)
    print(repr(audio_path).replace(r"\\", r"/"))
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Raw prediction:\n' + str(prediction[0]))
    print('CTC-decoded predicted ints before conversion to text:\n'
          + str(pred_ints))
    print('Predicted transcription:\n' + '\n' + transcription)
    print('Predicted transcription with correction:\n'
          + corrected_transcription)
    print('-' * 80)
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make results directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # add checkpointer
    checkpointer = ModelCheckpoint(
        filepath=os.path.join('results', save_model_path), verbose=0)
    # train the model; show a TQDM progress bar when verbose < 0
    callbacks = ([TQDMNotebookCallback(), checkpointer]
                 if verbose < 0 else [checkpointer])
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=callbacks,
                               verbose=verbose)
    # save model loss
    with open(os.path.join('results', pickle_path), 'wb') as f:
        pickle.dump(hist.history, f)
def train_model(input_to_softmax,
                phn,
                pickle_path,
                save_model_path,
                train_json='JSON\\train_corpus',
                valid_json='JSON\\test_corpus',
                minibatch_size=10,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=20.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_test_data(valid_json)
    # calculate steps_per_epoch
    if phn:
        num_train_examples = len(audio_gen.train_phn_audio_paths)
    else:
        num_train_examples = len(audio_gen.train_wrd_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    if phn:
        num_valid_samples = len(audio_gen.test_phn_audio_paths)
    else:
        num_valid_samples = len(audio_gen.test_wrd_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make models/ directory, if necessary
    if not os.path.exists('models'):
        os.makedirs('models')
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='models/' + save_model_path,
                                   verbose=0)
    # train the model
    generator = audio_gen.next_train(phn)
    validation_data = audio_gen.next_test(phn)
    hist = model.fit_generator(generator=generator,
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=validation_data,
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)
    # save model loss
    with open('models/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                # another option for the optimizer:
                # optimizer=RMSprop(clipvalue=0.5),
                # clipnorm was originally set to 5; there are many exploding
                # gradients and clipnorm/clipvalue can help
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=1.0, clipvalue=0.5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=20.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path,
                                   verbose=0)
    keras.backend.get_session().run(tf.global_variables_initializer())
    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)
    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
def validation_sentences():
    """ Store ten validation sentences (indices 491-500) in a text file
    Params: None
    """
    # load the validation data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_validation_data()
    # append the true transcriptions, one per line
    with open('C:/Users/mribles/Desktop/corpus.txt', 'a') as f:
        for num in range(500, 490, -1):
            transcr = data_gen.valid_texts[num]
            f.write(transcr + '\n')
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Write a model's decoded predictions to a text file
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()
    print(len(data_gen.valid_texts))

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()

    if not data_gen.valid_texts[index]:
        return

    # append the true and predicted transcriptions to the results file
    with open(r'D:\DIPLOMSKA\results\predictions_cnn_rnn_12.txt',
              'a+', encoding='utf8') as fp:
        fp.write('True transcription:\n' + '\n' + transcr + '\n')
        fp.write('-' * 30 + '\n')
        fp.write('Predicted transcription:\n' + '\n'
                 + ''.join(int_sequence_to_text(pred_ints)) + '\n')
        fp.write('-' * 30 + '\n')
def get_predictions(index, partition, input_to_softmax, model_path,
                    spectrogram=True, mfcc_dim=13):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=spectrogram, mfcc_dim=mfcc_dim)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights('results/' + model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Predicted transcription:\n' + '\n'
          + ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
def train_my_model(model,
                   pickle_path,
                   save_model_path,
                   train_json='train_corpus.json',
                   valid_json='valid_corpus.json',
                   minibatch_size=20,
                   spectrogram=True,
                   epochs=20,
                   verbose=1,
                   sort_by_duration=False,
                   max_duration=40.0):
    """ Gabriel Freire: Train my own model
        sample_models.py > own_model(input_dim=161, output_dim=29)
    """
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    print("Num of training examples: {}".format(num_train_examples))
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    print("Num of validation examples: {}".format(num_valid_samples))
    # add ctc loss
    model = add_ctc_loss(model)
    # compile
    optimizer = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True,
                    clipnorm=5)
    # optimizer = Adam(lr=0.02, clipnorm=5, decay=1e-6)
    model.compile(loss=ctc, optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # make tensorboard/ directory, if necessary
    if not os.path.exists('tensorboard'):
        os.makedirs('tensorboard')
    # add checkpointer and tensorboard callbacks
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path,
                                   verbose=0)
    tensorboard = TensorBoard(
        log_dir='tensorboard/{}/'.format('cnn_rnn_own_model_events'),
        write_graph=False, write_images=True)
    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer, tensorboard],
                               verbose=verbose)
    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
def get_predictions(data_gen: AudioGenerator, model, partition, index,
                    omit_true=False, print_line=True):
    """ Print a model's decoded predictions
    Params:
        data_gen (AudioGenerator): Generator with the data already loaded
        model (Model): The acoustic model, with weights already loaded
        partition (str): One of 'train' or 'validation'
        index (int): The example you would like to visualize
    """
    if data_gen is None:
        print("Data Generator is None!")

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcription = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcription = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    output_length = [model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length)[0][0]) + 1).flatten().tolist()

    # display the true and predicted transcriptions
    input_type = "SPEC" if data_gen.spectrogram else "MFCC"
    if not omit_true:
        print('TRUE: ' + transcription)
    print('PRED ' + input_type + ': '
          + ''.join(int_sequence_to_text(pred_ints)))
    if print_line:
        print('-' * 82)
    return audio_path
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    # print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length, greedy=True)[0][0]) + 1).flatten().tolist()

    Audio(audio_path)
    print('-' * 80)
    b = "".join(int_sequence_to_text(pred_ints))
    a = transcr
    print("Greedy_predictions:\n" + '\n' + b)
    print('1. Word Error Rate for ASR ==', wer(a, b) * 100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',
          (endtime - starttime) / data_gen.valid_durations[index], '\n')
    print('-' * 80)
def lexicon_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions (token-passing decoding)
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    # print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))

    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    b = testline(prediction[0])
    a = transcr
    print("TokenPassing_predictions:\n")
    print(b + '\n')
    print('1. Word Error Rate for ASR ==', wer(a, b) * 100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',
          (endtime - starttime) / data_gen.valid_durations[index])
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=5),
                epochs=30,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make results/ directory, if necessary
    # if not os.path.exists('results'):
    #     os.makedirs('results')
    # if os.path.exists('/gdrive/My Drive/results/model_end.h5'):
    #     model.load_weights('/gdrive/My Drive/results/model_end.h5')
    # resume training from saved weights, if they exist
    resume_weights = '/gdrive/My Drive/results/rnn_model.hdf5'
    if os.path.isfile(resume_weights):
        print("Resumed model's weights from {}".format(resume_weights))
        model.load_weights(resume_weights)
    # add checkpointer
    checkpointer = ModelCheckpoint(
        filepath='/gdrive/My Drive/results/' + save_model_path,
        monitor='val_loss',
        verbose=0,
        save_best_only=False,
        save_weights_only=False,
        mode='auto',
        period=1)
    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)
def get_predictions_rec(input_to_softmax, a_path, model_path):
    # load the train and validation data (for normalization statistics)
    data_gen = AudioGenerator(spectrogram=False)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    audio_path = a_path
    data_point = data_gen.normalize(data_gen.featurize(audio_path))

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    return ('Predicted transcription:\n' + '\n'
            + ''.join(int_sequence_to_text(pred_ints)))
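# Hypothetical usage sketch: transcribe a freshly recorded clip with
# get_predictions_rec above. `model` stands for any MFCC-based acoustic
# model; both file paths are illustrative placeholders, not from the source.
text = get_predictions_rec(input_to_softmax=model,
                           a_path='recordings/sample.wav',
                           model_path='results/model.h5')
print(text)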
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=5),
                epochs=1,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path,
                                   verbose=0)
    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)
    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
def train_model_history(input_to_softmax,
                        train_json='train_corpus.json',
                        valid_json='valid_corpus.json',
                        minibatch_size=20,
                        spectrogram=True,
                        mfcc_dim=13,
                        optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                                      nesterov=True, clipnorm=5),
                        epochs=20,
                        verbose=1,
                        sort_by_duration=False,
                        max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # train the model and return the history
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[],
                               verbose=verbose)
    return hist
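# Hypothetical usage sketch: plot the training and validation CTC loss from
# the History object returned by train_model_history above. Assumes
# matplotlib is available and `model` is an acoustic model to train; the
# 'loss'/'val_loss' keys are the standard Keras history entries.
import matplotlib.pyplot as plt

hist = train_model_history(input_to_softmax=model, epochs=20)
plt.plot(hist.history['loss'], label='train')
plt.plot(hist.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('CTC loss')
plt.legend()
plt.show()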
def _predict(name="200_32_3.wav"):
    ''' Get the predicted results of a single sample
    :param name: file name of the sample to look up
    '''
    # load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()
    audio_path_valid = data_gen.valid_audio_paths
    audio_path_train = data_gen.train_audio_paths

    # find the example whose file name matches: first look in the
    # validation set, then fall back to the training set
    idx = -1
    partition = "valid"
    for i in range(len(audio_path_valid)):
        rets = audio_path_valid[i].split('/')
        if rets[-1] == name:
            idx = i
            break
    if idx == -1:
        for i in range(len(audio_path_train)):
            rets = audio_path_train[i].split('/')
            if rets[-1] == name:
                idx = i
                partition = "train"
                break

    start = datetime.now()
    label, predicted = get_predictions(index=idx,
                                       partition=partition,
                                       input_to_softmax=mmodel1(
                                           input_dim=13,
                                           filters=512,
                                           kernel_size=5,
                                           conv_stride=1,
                                           conv_border_mode='same',
                                           units=1024,
                                           output_dim=95),
                                       model_path='results/mmodel1.h5')
    time = datetime.now() - start
    return label, predicted, str(time)
def get_predictions(audio_path, input_to_softmax, model_path):
    """ Print a model's decoded prediction for a single audio file
    Params:
        audio_path (str): Path to the audio file to transcribe
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)

    # read and get features (not normalized yet)
    # audio_path = "./samples/16/19/16-19-0159.wav"
    data_point = data_gen.featurize(audio_path)

    # hardcoded normalization statistics (mean/std of the training features),
    # so the corpus does not need to be loaded at prediction time
    feats_mean = np.array([
        14.81652005, -0.1802923, -1.22285122, 0.87062853, -16.05643781,
        -14.03943633, -5.7298706, -15.52425927, -3.39637537, -3.85226744,
        -5.17435844, -2.13766871, -11.39111645
    ])
    feats_std = np.array([
        7.16816358, 14.58747728, 11.99928947, 15.69431836, 14.45918537,
        16.79930368, 13.98395715, 12.60133111, 11.61310503, 11.34526655,
        12.01205471, 13.41467652, 10.89021869
    ])

    # normalize data
    eps = 1e-14
    data_point = (data_point - feats_mean) / (feats_std + eps)
    # data_point = data_gen.normalize(data_gen.featurize(audio_path))

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    recognized_text = "".join(int_sequence_to_text(pred_ints))
    print(recognized_text)
def train_model(input_to_softmax,
                pickle_path,
                # save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                # optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                #               nesterov=True, clipnorm=5),
                optimizer='adam',
                epochs=20,
                verbose=1,
                sort_by_duration=True,
                max_duration=16.7):
    # create class instances for obtaining batches of data; the training
    # generator sorts by duration, the validation generator does not
    audio_gen_train = AudioGenerator(minibatch_size=minibatch_size,
                                     spectrogram=spectrogram,
                                     mfcc_dim=mfcc_dim,
                                     max_duration=max_duration,
                                     sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen_train.load_train_data(train_json)
    audio_gen_test = AudioGenerator(minibatch_size=minibatch_size,
                                    spectrogram=spectrogram,
                                    mfcc_dim=mfcc_dim,
                                    max_duration=30.0,
                                    sort_by_duration=False)
    audio_gen_test.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen_train.train_audio_paths)
    print('number of train examples ==', num_train_examples)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen_test.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    # model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
    #               optimizer=optimizer, metrics=['accuracy'])
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # add checkpointer; save a numbered checkpoint every 5 epochs
    # checkpointer = ModelCheckpoint(filepath='results/' + save_model_path,
    #                                verbose=0, period=1)
    filepath = "results/test-{epoch:02d}.hdf5"
    checkpointer = ModelCheckpoint(filepath, verbose=0, period=5)
    # train the model
    hist = model.fit_generator(generator=audio_gen_train.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen_test.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)
    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
def generate_corpus(desc_file):
    # outputs a list of training sentences
    data_sentences = AudioGenerator()
    data_sentences.load_train_data(desc_file=desc_file)
    sentences = data_sentences.train_texts
    return sentences
... and predicts a character sequence. The neural network will process each
frame of the spectrogram; the length of x != the length of y, so use
CTC (connectionist temporal classification).
'''

if __name__ == "__main__":
    # datasets
    partition = {
        'train': 'LibriSpeech/dev-clean/84/121123/84-121123.trans.txt',
        'validation': 'LibriSpeech/dev-clean/84/121550/84-121550.trans.txt'
    }
    # generators
    training_generator = AudioGenerator(descr_file=partition['train'],
                                        batch_size=20)
    validation_generator = AudioGenerator(descr_file=partition['validation'],
                                          batch_size=20)
    # get this model working first and then use lstm
    model = cnn_rnn_model(input_dim=161,
                          filters=200,
                          kernel_size=11,
                          conv_stride=2,
                          conv_border_mode='valid',
                          units=200)
    train_model(input_to_softmax=model,
                pickle_path='model_0.pickle',
                train_generator=training_generator,
                validation_generator=validation_generator,
                )
lr_scheduler = LearningRateScheduler(scheduler)
lr_tracker = SGDLearningRateTracker()
lr_plateau = ReduceLROnPlateau(monitor='val_acc',
                               mode='max',
                               patience=4,
                               factor=np.sqrt(0.1),
                               verbose=1,
                               min_lr=1e-6)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fold', type=str, default='0', help='which fold')
    FLAGS, _ = parser.parse_known_args()
    print('conduct train and test in fold {0}'.format(FLAGS.fold))

    train_generator = AudioGenerator(
        root_dir='../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=TRAIN_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='train',
        augmentation_prob=30,
    )
    # train_generator.steps_per_epoch = train_generator.steps_per_epoch * 2
    valid_generator = AudioGenerator(
        root_dir='../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=VALID_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='valid',
    )

    preds = np.zeros((len(fname_test), n_classes))
    for run in range(RUNS_IN_FOLD):
        print('fold {0} runs {1}'.format(FLAGS.fold, run))
        # use model check point callbacks
def get_predictions(indexes, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        indexes (list of int): The examples you would like to visualize
        partition (str): One of 'train', 'validation', or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # load the language model
    alphabet = ''.join(index_map.values())
    language_model = LanguageModel('data/word/corpus.txt', alphabet)

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    for index in indexes:
        # obtain the true transcription and the audio features
        if partition == 'validation':
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
        elif partition == 'train':
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
        elif partition == 'test':
            transcr = 'hello how are you'
            audio_path = '../datasets/AgentExpress/hello.wav.wav'
        else:
            raise Exception(
                'Invalid partition! Must be "train", "validation" or "test"')
        data_point = data_gen.normalize(data_gen.featurize(audio_path))

        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                     + 1).flatten().tolist()

        # play the audio file, and display the true transcription along
        # with the decodings from each decoding strategy
        print('-' * 80)
        Audio(audio_path)
        print('ground_truth:' + ' ' * 4 + transcr)
        print('best_path:' + ' ' * 7
              + ''.join(int_sequence_to_text(pred_ints)))
        pred_beam = ctcBeamSearch(prediction[0], alphabet, None)
        print('beam_search:' + ' ' * 5 + pred_beam)
        pred_beam_lm = ctcBeamSearch(prediction[0], alphabet, language_model)
        print('beam_search_lm:' + ' ' * 2 + pred_beam_lm)
        pred_token = ctcTokenPassing(prediction[0], alphabet,
                                     language_model.getWordList())
        print('token_passing:' + ' ' * 3 + pred_token)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# one-hot encode the string labels
label_enc = LabelEncoder()
enc = OneHotEncoder(sparse=False)
y_train_int = label_enc.fit_transform(y_train)
y_train_int = y_train_int.reshape(len(y_train_int), 1)
y_train_one_hot = enc.fit_transform(y_train_int)
y_test_int = label_enc.transform(y_test)
y_test_int = y_test_int.reshape(len(y_test_int), 1)
y_test_one_hot = enc.transform(y_test_int)

# create audio generators
audio_gen = AudioGenerator(batch_size=batch_size, fns=X_train_fn,
                           labels=y_train_one_hot, mode=mode)
valid_gen = AudioGenerator(batch_size=batch_size, fns=X_test_fn,
                           labels=y_test_one_hot, mode=mode)
l, Sxx = audio_gen.rnd_one_sample()
num_train = audio_gen.get_train_test_num()
num_test = valid_gen.get_train_test_num()
print(num_train, num_test)
step_per_epoch = num_train // batch_size
validation_step = num_test // batch_size
image_shape = Sxx.shape
print(image_shape)
def generate_corpus(desc_file):
    # outputs a list of sentences
    data_sentences = AudioGenerator()
    data_sentences.load_train_data(desc_file=desc_file)
    sentences = data_sentences.train_texts
    return sentences
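# Hypothetical usage sketch: dump the training transcriptions to a
# plain-text corpus, e.g. as input for the language-model decoding steps
# used elsewhere in this file (KenLM correction, ctcBeamSearch). The output
# path matches the 'data/word/corpus.txt' referenced above; the JSON file
# name is illustrative.
sentences = generate_corpus('train_corpus.json')
with open('data/word/corpus.txt', 'w') as f:
    f.write('\n'.join(sentences))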
def get_group_predictions(input_to_softmax, model_path, partition):
    starttime = time.time()
    wer_sum = 0
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    input_to_softmax.load_weights(model_path)

    # pick the partition to sample from
    if partition == 'validation':
        num_samples, max_index = 100, 2500
        texts, audio_paths = data_gen.valid_texts, data_gen.valid_audio_paths
    elif partition == 'train':
        num_samples, max_index = 1000, 10000
        texts, audio_paths = data_gen.train_texts, data_gen.train_audio_paths
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')

    num = num_samples - 1
    while num >= 0:
        index = random.randint(1, max_index)
        transcr = texts[index]
        audio_path = audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
        # obtain and decode the acoustic model's predictions
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length,
                                         greedy=False, beam_width=100,
                                         top_paths=1)[0][0])
                     + 1).flatten().tolist()
        b = "".join(int_sequence_to_text(pred_ints))
        a = transcr
        # only count samples with WER <= 1; otherwise draw another sample
        if wer(a, b) <= 1:
            print('index_%d' % index, ':')
            wer_sum = wer_sum + wer(a, b)
            print(wer(a, b))
            print("Transcription: ", a)
            print("Prediction: ", b)
            print('-' * 80)
            num = num - 1

    endtime = time.time()
    print('1. Average Word Error Rate for ASR ==', wer_sum / num_samples)
    print('2. Average Real Time Factor for ASR ==',
          (endtime - starttime) / num_samples, '\n')
##################################################
print('prepare test data')
d_test = pickle.load(open(test_dir + 'test_{0}.pkl'.format(FE_TYPE), 'rb'))
fname_test, X_test = d_test['fname'], d_test['data']
X_test = X_test.reshape(tuple(list(X_test.shape) + [1])).astype('float32')
del d_test
gc.collect()

##################################################
# make data generator
##################################################
print('prepare train data in fold {0}'.format(FLAGS.fold))
train_generator = AudioGenerator(
    root_dir='../data/input/train/audio/',
    k=FLAGS.fold,
    file_temp=TRAIN_SPLIT_FILE_TEMP,
    ori_batch_size=batch_size,
    train_or_valid='train',
    augmentation_prob=0,
)
# train_generator.steps_per_epoch = train_generator.steps_per_epoch * 2

print('prepare valid data in fold {0}'.format(FLAGS.fold))
valid_generator = AudioGenerator(
    root_dir='../data/input/train/audio/',
    k=FLAGS.fold,
    file_temp=VALID_SPLIT_FILE_TEMP,
    ori_batch_size=batch_size,
    train_or_valid='valid',
)

# prepare valid data
fname_valid = valid_generator.in_fold_data['fname']
truth_valid = valid_generator.in_fold_data['truth']
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # callbacks
    # Notes:
    #   added ReduceLROnPlateau to decrease the learning rate when the model
    #   doesn't improve for 2 epochs
    #   added EarlyStopping to stop the training when the model clearly overfits
    callbacks = [
        ModelCheckpoint(filepath='results/' + save_model_path, verbose=0),
        ReduceLROnPlateau(monitor="val_loss", factor=0.9, patience=2,
                          verbose=verbose, min_lr=0.001),
        EarlyStopping(patience=4)
    ]
    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=callbacks,  # originally [checkpointer]
                               verbose=verbose)
    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
def get_predictions(index, partition, input_to_softmax, model_path,
                    phn=False):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        phn (bool): Use phoneme transcriptions instead of word transcriptions
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    if not phn:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n'
              + ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n')
        # color each predicted phoneme green if it matches a neighbouring
        # true phoneme, red otherwise
        split_true = transcr.split(" ")
        split_pred = (''.join(int_sequence_to_text(pred_ints,
                                                   phn))).split(" ")
        print("\033[1;32m" + split_pred[0] + " ", end='')
        for i in range(1, len(split_true) - 1):
            if (split_true[i - 1] == split_pred[i]
                    or split_true[i] == split_pred[i]
                    or split_true[i + 1] == split_pred[i]):
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        print(split_pred[len(split_true) - 1] + " ", end='')
        displayAccuracy(split_true, split_pred, phn)
print('\nLR: {:.6f}\n'.format(lr))

lr_scheduler = LearningRateScheduler(scheduler)
lr_tracker = SGDLearningRateTracker()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fold', type=str, default='0', help='which fold')
    FLAGS, _ = parser.parse_known_args()
    print('conduct train and test in fold {0}'.format(FLAGS.fold))

    train_generator = AudioGenerator(
        root_dir='../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=TRAIN_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='train',
        augmentation_prob=50,
    )
    train_generator.steps_per_epoch = train_generator.steps_per_epoch * 2
    valid_generator = AudioGenerator(
        root_dir='../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=VALID_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='valid',
    )

    preds = np.zeros((len(fname_test), n_classes))
    for run in range(RUNS_IN_FOLD):
        print('fold {0} runs {1}'.format(FLAGS.fold, run))
        model = get_model()