import os
import pickle

import numpy as np
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD
from IPython.display import Audio

# Project-local helpers. The module names below are assumptions based on this
# project's usual layout; adjust them to match the repository.
from data_generator import AudioGenerator
from utils import int_sequence_to_text, displayAccuracy
from train_utils import add_ctc_loss


def predict_test(input_to_softmax, model_path, audio_range=100000):
    """ Predict transcriptions for the whole test set and write them to disk.
    Params:
        index (int): not used here; see get_predictions for single examples
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        audio_range (int): Upper bound on the number of test examples to
            decode; defaults to 100000 and is capped at the test-set size
    """
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    transcr = data_gen.test_texts
    audio_paths = data_gen.test_audio_paths
    input_to_softmax.load_weights(model_path)

    predictions = []
    # decode at most audio_range examples, never more than the test set holds
    for i in range(min(audio_range, len(audio_paths))):
        data_point = data_gen.normalize(data_gen.featurize(audio_paths[i]))
        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(
            prediction, output_length)[0][0]) + 1).flatten().tolist()
        predictions.append(''.join(int_sequence_to_text(pred_ints)))

    # keep only as many reference transcriptions as predictions were produced
    transcr = ''.join(transcr[:len(predictions)])
    predictions = ''.join(predictions)
    if not os.path.exists('predictions'):
        os.makedirs('predictions')
    with open("predictions/predictions.txt", "w") as output:
        output.write(predictions)
    with open("predictions/truescr.txt", "w") as output:
        output.write(transcr)

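# A minimal usage sketch for predict_test. `acoustic_model` stands for any
# compiled Keras acoustic model exposing an output_length attribute, and the
# weights file name is illustrative, not part of the original code.
def example_batch_decode(acoustic_model):
    """Sketch: decode the first 100 test examples and write the results to
    predictions/predictions.txt and predictions/truescr.txt."""
    predict_test(acoustic_model, 'models/example_weights.h5', audio_range=100)
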
def compare_predictions(index, partition, inputs_to_softmax=[], model_paths=[], phn=False):
    """ Print several models' decoded predictions for the same example
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        inputs_to_softmax (list of Model): The acoustic models to compare
        model_paths (list of str): Paths to the saved acoustic models' weights
        phn (bool): Decode phonemes if True, words otherwise
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode each acoustic model's predictions
    pred_ints = []
    for model_path, input_to_softmax in zip(model_paths, inputs_to_softmax):
        input_to_softmax.load_weights(model_path)
        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_int = (K.eval(K.ctc_decode(
            prediction, output_length)[0][0]) + 1).flatten().tolist()
        pred_ints.append(pred_int)

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    for i, pred_int in enumerate(pred_ints, start=1):
        print('Predicted transcription number', i, ':\n' + '\n' +
              ''.join(int_sequence_to_text(pred_int, phn)))
        print('-' * 80)

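# A minimal usage sketch for compare_predictions. Both model objects and both
# weight paths are illustrative assumptions; any trained acoustic models with
# an output_length attribute can be compared side by side on one example.
def example_comparison(model_a, model_b):
    """Sketch: print two models' decoded phoneme transcriptions for the same
    test example, one after the other."""
    compare_predictions(index=0,
                        partition='test',
                        inputs_to_softmax=[model_a, model_b],
                        model_paths=['models/model_a.h5', 'models/model_b.h5'],
                        phn=True)
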
def train_model(input_to_softmax,
                phn,
                pickle_path,
                save_model_path,
                train_json='JSON\\train_corpus',
                valid_json='JSON\\test_corpus',
                minibatch_size=10,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9,
                              nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=20.0):
    """ Train the acoustic model with CTC loss; save weights and loss history
    Params:
        input_to_softmax (Model): The acoustic model, ending in a softmax
        phn (bool): Train on phoneme transcriptions if True, words otherwise
        pickle_path (str): Filename (under models/) for the pickled loss history
        save_model_path (str): Filename (under models/) for the saved weights
    """
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training and validation data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_test_data(valid_json)

    # calculate steps_per_epoch
    if phn:
        num_train_examples = len(audio_gen.train_phn_audio_paths)
    else:
        num_train_examples = len(audio_gen.train_wrd_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size

    # calculate validation_steps
    if phn:
        num_valid_samples = len(audio_gen.test_phn_audio_paths)
    else:
        num_valid_samples = len(audio_gen.test_wrd_audio_paths)
    validation_steps = num_valid_samples // minibatch_size

    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # make models/ directory, if necessary
    if not os.path.exists('models'):
        os.makedirs('models')

    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='models/' + save_model_path, verbose=0)

    # train the model
    generator = audio_gen.next_train(phn)
    validation_data = audio_gen.next_test(phn)
    hist = model.fit_generator(generator=generator,
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=validation_data,
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)

    # save model loss history
    with open('models/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)

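# A minimal usage sketch for train_model. The file names are illustrative;
# train_model prefixes both with models/, so the weights land in
# models/example_weights.h5 and the loss history in models/example_loss.pickle.
def example_training_run(acoustic_model):
    """Sketch: train a phoneme-level model with the default SGD optimizer,
    assuming `acoustic_model` ends in a softmax over the phoneme set."""
    train_model(input_to_softmax=acoustic_model,
                phn=True,
                pickle_path='example_loss.pickle',
                save_model_path='example_weights.h5')
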
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        phn (bool): Decode phonemes if True, words otherwise
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length)[0][0]) + 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    if not phn:
        print('Predicted transcription:\n' + '\n' +
              ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('Predicted transcription:\n' + '\n')
        split_true = transcr.split(" ")
        split_pred = ''.join(int_sequence_to_text(pred_ints, phn)).split(" ")
        # colour each predicted token green if it matches the true token at
        # (or adjacent to) the same position, red otherwise; cap the loop at
        # the shorter sequence to avoid an IndexError on length mismatch
        n = min(len(split_true), len(split_pred))
        print("\033[1;32m" + split_pred[0] + " ", end='')
        for i in range(1, n - 1):
            if (split_true[i - 1] == split_pred[i] or
                    split_true[i] == split_pred[i] or
                    split_true[i + 1] == split_pred[i]):
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        print(split_pred[n - 1] + "\033[0m")  # reset the terminal colour
        displayAccuracy(split_true, split_pred, phn)

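# A minimal usage sketch for get_predictions. The weights path is an
# illustrative assumption matching the example_training_run sketch above.
def example_inspection(acoustic_model):
    """Sketch: display the true and predicted phoneme transcriptions for the
    first test example, with per-token colouring and an accuracy readout."""
    get_predictions(index=0,
                    partition='test',
                    input_to_softmax=acoustic_model,
                    model_path='models/example_weights.h5',
                    phn=True)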