def predict_test(input_to_softmax, model_path, audio_range=100000):
    ''' Decode the test set and write predictions and ground truth to disk.
        audio_range caps the number of utterances decoded; the default of
        100000 effectively means "the whole test set".
    '''
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    transcr = data_gen.test_texts
    audio_path = data_gen.test_audio_paths
    input_to_softmax.load_weights(model_path)
    predictions = []
    num_utterances = min(audio_range, len(audio_path))
    for i in range(num_utterances):
        data_point = data_gen.normalize(data_gen.featurize(audio_path[i]))
        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(
            prediction, output_length)[0][0]) + 1).flatten().tolist()
        predictions.append(''.join(int_sequence_to_text(pred_ints)))
    # write the decoded hypotheses and the matching references side by side
    with open("predictions/predictions.txt", "w") as output:
        output.write(''.join(predictions))
    with open("predictions/truescr.txt", "w") as output:
        output.write(''.join(transcr[:num_utterances]))
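# A usage sketch for predict_test(); the model variable and weights path are
# assumptions -- substitute whatever acoustic model this project builds:
# predict_test(input_to_softmax=model_end,
#              model_path='results/model_end.h5',
#              audio_range=100)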
def get_predictions(index, partition, input_to_softmax, model_path):
    ''' Get the model's decoded prediction for one example, to calculate metrics
    '''
    # load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'valid':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
    else:
        raise Exception('Invalid partition! Must be "train" or "valid"')
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions; the +1 shifts the
    # decoded indices back onto the 1-based character map that
    # int_sequence_to_text expects
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    label = transcr
    predicted = ''.join(int_sequence_to_text(pred_ints))
    return label, predicted
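# NOTE: several helpers below call a wer() function that is not defined in
# this section. A minimal sketch, assuming WER is Levenshtein (edit) distance
# over token sequences normalized by reference length -- swap in the project's
# own implementation (or a package such as jiwer) if one is provided:
def wer(reference, hypothesis):
    """Word (or token) error rate: edit distance / len(reference)."""
    # dp[i][j] = edit distance between reference[:i] and hypothesis[:j]
    dp = [[0] * (len(hypothesis) + 1) for _ in range(len(reference) + 1)]
    for i in range(len(reference) + 1):
        dp[i][0] = i
    for j in range(len(hypothesis) + 1):
        dp[0][j] = j
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[len(reference)][len(hypothesis)] / max(len(reference), 1)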
def calculate_wer2(input_to_softmax, model_path, words=False):
    # data_gen and valid_cache are expected as module-level globals: data_gen
    # holds the loaded validation set, and valid_cache holds the already
    # featurized, normalized validation inputs (see the sketch below)
    wers = []
    input_to_softmax.load_weights(model_path)
    # cap the evaluation at 100 utterances to keep the run short
    num_utterances = min(100, len(data_gen.valid_texts))
    for index in range(num_utterances):
        transcr = data_gen.valid_texts[index]
        data_point = valid_cache[index]
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                     + 1).flatten().tolist()
        pred = ''.join(int_sequence_to_text(pred_ints))
        # word-level WER if words=True, otherwise character-level
        if words:
            w = wer(transcr.split(), pred.split())
        else:
            w = wer(list(transcr), list(pred))
        wers.append(w)
        if index % 100 == 0:
            print(index, len(data_gen.valid_texts), wers[-1])
    print("FINAL WER:", sum(wers) / len(wers), "words:", words)
    return sum(wers) / len(wers)
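# A sketch of how the module-level globals calculate_wer2 relies on might be
# prepared; the exact feature settings here are assumptions:
# data_gen = AudioGenerator(spectrogram=True)
# data_gen.load_train_data()
# data_gen.load_validation_data()
# valid_cache = [data_gen.normalize(data_gen.featurize(path))
#                for path in data_gen.valid_audio_paths[:100]]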
def get_predictions_recorded(
    self,
    spectrogram=False,
    recordingpath='recordings/demo.wav',
):
    """ Print the model's decoded prediction for a live recording
    Params:
        spectrogram (bool): Use spectrogram features (True) or MFCC (False)
        recordingpath (str): Path to the recorded audio file
    """
    # load the train data (needed to fit the normalization statistics)
    data_gen = AudioGenerator(spectrogram=spectrogram)
    data_gen.load_train_data()
    self.audio_path = recordingpath
    # featurize the recorded audio (there is no true transcription here)
    data_point = data_gen.normalize(data_gen.featurize(recordingpath))
    # obtain and decode the acoustic model's predictions
    prediction = self.input_to_softmax.predict(
        np.expand_dims(data_point, axis=0))
    output_length = [
        self.input_to_softmax.output_length(data_point.shape[0])
    ]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    print('-' * 80)
    print('Predicted transcription:\n' + '\n'
          + ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
def compare_predictions(index, partition, inputs_to_softmax=(), model_paths=(),
                        phn=False):
    """ Print several models' decoded predictions for the same example
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        inputs_to_softmax (list of Model): The acoustic models to compare
        model_paths (list of str): Paths to the saved acoustic models' weights
        phn (bool): Use the phoneme transcriptions (True) or word ones (False)
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')
    # obtain and decode each acoustic model's predictions
    pred_ints = []
    for model_path, input_to_softmax in zip(model_paths, inputs_to_softmax):
        input_to_softmax.load_weights(model_path)
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_int = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                    + 1).flatten().tolist()
        pred_ints.append(pred_int)
    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    for i, pred_in in enumerate(pred_ints, start=1):
        print('Predicted transcription number', i, ':\n' + '\n'
              + ''.join(int_sequence_to_text(pred_in, phn)))
        print('-' * 80)
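# A usage sketch for compare_predictions(); the model variables and weight
# paths are assumptions:
# compare_predictions(index=5, partition='test',
#                     inputs_to_softmax=[model_1, model_2],
#                     model_paths=['results/model_1.h5', 'results/model_2.h5'],
#                     phn=True)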
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions with WER and real-time factor
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription, the audio features, and the duration
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        duration = data_gen.valid_durations[index]
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        duration = data_gen.train_durations[index]
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length, greedy=True)[0][0]) + 1).flatten().tolist()
    Audio(audio_path)
    print('-' * 80)
    b = "".join(int_sequence_to_text(pred_ints))
    a = transcr
    print("Greedy predictions:\n" + '\n' + b)
    # word-level WER: compare token sequences, not raw strings
    print('1. Word Error Rate for ASR ==', wer(a.split(), b.split()) * 100, '%')
    # real-time factor: processing time divided by the audio's duration
    endtime = time.time()
    print('2. Real Time Factor for ASR ==', (endtime - starttime) / duration, '\n')
    print('-' * 80)
def get_predictions(index, partition, trained_model, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        trained_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features from the dataset
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    print("Trained model output length:\n"
          + str(trained_model.output_length(data_point.shape[0])))
    # obtain and decode the acoustic model's predictions
    trained_model.load_weights(model_path)
    prediction = trained_model.predict(np.expand_dims(data_point, axis=0))
    output_length = [trained_model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    transcription = ''.join(int_sequence_to_text(pred_ints))
    # correction using the KenLM language model toolkit
    corrected_transcription = correction(transcription)
    print('-' * 80)
    print(repr(audio_path).replace(r"\\", r"/"))  # display with forward slashes
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Raw prediction:\n' + str(prediction[0]))
    print('CTC-decoded predicted ints before conversion to text:\n'
          + str(pred_ints))
    print('Predicted transcription:\n' + '\n' + transcription)
    print('Predicted transcription with correction:\n'
          + corrected_transcription)
    print('-' * 80)
def get_predictions_rec(input_to_softmax, a_path, model_path):
    """ Return the model's decoded prediction for an arbitrary audio file
    Params:
        input_to_softmax (Model): The acoustic model
        a_path (str): Path to the audio file to transcribe
        model_path (str): Path to saved acoustic model's weights
    """
    # the train/validation data is loaded only to fit normalization statistics
    data_gen = AudioGenerator(spectrogram=False)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    audio_path = a_path
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    return 'Predicted transcription:\n' + '\n' + ''.join(
        int_sequence_to_text(pred_ints))
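# A usage sketch for get_predictions_rec(); the wav path and model variable
# are assumptions:
# print(get_predictions_rec(model_end, 'recordings/demo.wav',
#                           'results/model_end.h5'))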
def get_predictions(index, partition, input_to_softmax, model_path,
                    spectrogram=True, mfcc_dim=13):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        spectrogram (bool): Use spectrogram features (True) or MFCC (False)
        mfcc_dim (int): Number of MFCC features, if used
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=spectrogram, mfcc_dim=mfcc_dim)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    # obtain and decode the acoustic model's predictions
    # (weights are looked up relative to the results/ directory)
    input_to_softmax.load_weights('results/' + model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Predicted transcription:\n' + '\n'
          + ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
def get_predictions(data_gen: AudioGenerator, model, partition, index,
                    omit_true=False, print_line=True):
    """ Print a model's decoded predictions
    Params:
        data_gen (AudioGenerator): A generator with train/validation data loaded
        model (Model): The acoustic model, with weights already loaded
        partition (str): One of 'train' or 'validation'
        index (int): The example you would like to visualize
        omit_true (bool): Skip printing the true transcription
        print_line (bool): Print a separator line after the output
    """
    # the caller supplies a ready data generator instead of loading one here
    if data_gen is None:
        raise ValueError("Data Generator is None!")
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcription = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
    elif partition == 'train':
        transcription = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    output_length = [model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length)[0][0]) + 1).flatten().tolist()
    # display the true and predicted transcriptions
    input_type = "SPEC" if data_gen.spectrogram else "MFCC"
    if not omit_true:
        print('TRUE: ' + transcription)
    print('PRED ' + input_type + ': ' + ''.join(int_sequence_to_text(pred_ints)))
    if print_line:
        print('-' * 82)
    return audio_path
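# The featurize -> normalize -> predict -> ctc_decode -> (+1) -> text pipeline
# recurs in every helper above; a minimal factored sketch of it (greedy
# decoding, names matching the surrounding code), offered as an assumption
# rather than as part of the original module:
def decode_audio(model, data_gen, audio_path):
    """Greedy CTC decode of a single audio file to text."""
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    output_length = [model.output_length(data_point.shape[0])]
    # the +1 shifts decoded indices back onto the 1-based character map
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    return ''.join(int_sequence_to_text(pred_ints))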
def test_gen():
    # build spectrogram and MFCC generators over the same duration-sorted
    # training corpus, one utterance per minibatch
    s_gen = AudioGeneratorCached(spectrogram=True, minibatch_size=1,
                                 sort_by_duration=True)
    s_gen.load_train_data('train_corpus.json')
    m_gen = AudioGeneratorCached(spectrogram=False, minibatch_size=1,
                                 sort_by_duration=True)
    m_gen.load_train_data('train_corpus.json')
    # walk the first few spectrogram minibatches and print their shapes
    # alongside the decoded label text (a side-by-side comparison with the
    # MFCC generator m_gen is left disabled)
    counter = 0
    for j in s_gen.next_train():
        print(j[0]['the_input'].shape, j[0]['the_labels'].shape, end=' ')
        # shift the label ints back onto the 1-based character map
        ints = [t + 1 for t in j[0]['the_labels'][0]]
        print("".join(int_sequence_to_text(ints)))
        counter += 1
        if counter > 30:
            break

# test_gen()
def get_predictions(audio_path, input_to_softmax, model_path):
    """ Print a model's decoded prediction for an audio file
    Params:
        audio_path (str): Path to the audio file to transcribe
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    # extract MFCC features (not normalized yet)
    data_point = data_gen.featurize(audio_path)
    # precomputed normalization statistics for the 13 MFCC dimensions,
    # hard-coded so the training set does not have to be reloaded here
    feats_mean = np.array([
        14.81652005, -0.1802923, -1.22285122, 0.87062853, -16.05643781,
        -14.03943633, -5.7298706, -15.52425927, -3.39637537, -3.85226744,
        -5.17435844, -2.13766871, -11.39111645
    ])
    feats_std = np.array([
        7.16816358, 14.58747728, 11.99928947, 15.69431836, 14.45918537,
        16.79930368, 13.98395715, 12.60133111, 11.61310503, 11.34526655,
        12.01205471, 13.41467652, 10.89021869
    ])
    # normalize the features
    eps = 1e-14
    data_point = (data_point - feats_mean) / (feats_std + eps)
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    recognized_text = "".join(int_sequence_to_text(pred_ints))
    print(recognized_text)
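# The hard-coded feats_mean / feats_std above are precomputed MFCC statistics;
# a sketch of how they might be regenerated from the training set (fit_train
# is an assumption about the AudioGenerator API used elsewhere in this file):
# data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
# data_gen.load_train_data()
# data_gen.fit_train(100)   # fits data_gen.feats_mean / data_gen.feats_std
# print(data_gen.feats_mean, data_gen.feats_std)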
def get_group_predictions(input_to_softmax, model_path, partition):
    """ Decode a random sample of utterances and report the average WER
        and the average decode time per utterance.
    """
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    input_to_softmax.load_weights(model_path)
    # pick the partition to sample from
    if partition == 'validation':
        texts, audio_paths = data_gen.valid_texts, data_gen.valid_audio_paths
        num_samples, max_index = 100, 2500
    elif partition == 'train':
        texts, audio_paths = data_gen.train_texts, data_gen.train_audio_paths
        num_samples, max_index = 1000, 10000
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    wer_sum = 0
    scored = 0
    for _ in range(num_samples):
        index = random.randint(1, max_index)
        transcr = texts[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_paths[index]))
        # obtain and decode the acoustic model's predictions (beam search)
        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False,
                                         beam_width=100, top_paths=1)[0][0])
                     + 1).flatten().tolist()
        a = transcr
        b = "".join(int_sequence_to_text(pred_ints))
        w = wer(a, b)
        # only score utterances with WER <= 1 (the original skipped outliers)
        if w <= 1:
            print('index_%d' % index, ':')
            wer_sum += w
            scored += 1
            print(w)
            print("Transcription: ", a)
            print("Prediction:    ", b)
            print('-' * 80)
    endtime = time.time()
    print('1. Average Word Error Rate for ASR ==', wer_sum / max(scored, 1))
    # note: this is average wall-clock decode time per utterance, not a true
    # real-time factor (which would divide by the audio duration)
    print('2. Average decode time per utterance ==',
          (endtime - starttime) / num_samples, '\n')
def get_predictions(indexes, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions under several decoding schemes
    Params:
        indexes (list of int): The examples you would like to visualize
        partition (str): One of 'train', 'validation' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # load the language model over the character alphabet
    alphabet = ''.join(index_map.values())
    language_model = LanguageModel('data/word/corpus.txt', alphabet)
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    for index in indexes:
        # obtain the true transcription and the audio features
        if partition == 'validation':
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
        elif partition == 'train':
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
        elif partition == 'test':
            # fixed demo utterance for the 'test' partition
            transcr = 'hello how are you'
            audio_path = '../datasets/AgentExpress/hello.wav.wav'
        else:
            raise Exception(
                'Invalid partition! Must be "train", "validation" or "test"')
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                     + 1).flatten().tolist()
        # play the audio file, and display the true and predicted transcriptions
        print('-' * 80)
        Audio(audio_path)
        print('ground_truth:' + ' ' * 4 + transcr)
        print('best_path:' + ' ' * 7 + ''.join(int_sequence_to_text(pred_ints)))
        pred_beam = ctcBeamSearch(prediction[0], alphabet, None)
        print('beam_search:' + ' ' * 5 + pred_beam)
        pred_beam_lm = ctcBeamSearch(prediction[0], alphabet, language_model)
        print('beam_search_lm:' + ' ' * 2 + pred_beam_lm)
        pred_token = ctcTokenPassing(prediction[0], alphabet,
                                     language_model.getWordList())
        print('token_passing:' + ' ' * 3 + pred_token)
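# A usage sketch comparing the four decoding schemes above on a few
# validation utterances (the model variable and weights path are assumptions):
# get_predictions(indexes=[0, 1, 2], partition='validation',
#                 input_to_softmax=model_end,
#                 model_path='results/model_end.h5')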
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        phn (bool): Use the phoneme transcriptions (True) or word ones (False)
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    # play the audio file, and display the true and predicted transcriptions
    if not phn:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n'
              + ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n')
        split_true = transcr.split(" ")
        split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
        # print each predicted phoneme in green (ANSI \033[1;32m) if it matches
        # the true phoneme at the same position or an immediate neighbour,
        # otherwise in red (\033[1;31m)
        print("\033[1;32m" + split_pred[0] + " ", end='')
        limit = min(len(split_true) - 1, len(split_pred))
        for i in range(1, limit):
            if (split_true[i - 1] == split_pred[i]
                    or split_true[i] == split_pred[i]
                    or split_true[i + 1] == split_pred[i]):
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        # print the final phoneme (guarding against a short prediction) and
        # reset the terminal colour
        if len(split_pred) > limit:
            print(split_pred[limit] + " ", end='')
        print("\033[0m")
        displayAccuracy(split_true, split_pred, phn)
    # (tail of the bidirectional_rnn_model definition: for this architecture
    # the output length equals the input length)
    model.output_length = lambda x: x
    print(model.summary())
    return model


model = bidirectional_rnn_model(
    input_dim=161,  # change to 13 if you would like to use MFCC features
    units=512 + 32)
print('load Model')
model.load_weights('results/model_20.h5')

data_gen = AudioGenerator()
print("Load file")
audio_path = 'output.wav'
data_point = data_gen.normalize(data_gen.featurize(audio_path))

print("Start prediction")
prediction = model.predict(np.expand_dims(data_point, axis=0), batch_size=1)
output_length = [model.output_length(data_point.shape[0])]
pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
             + 1).flatten().tolist()
print(prediction)
print(output_length)
print(pred_ints)
print('Predicted transcription:\n' + '\n'
      + ''.join(int_sequence_to_text(pred_ints)))
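# NOTE: data_gen.normalize() relies on feature statistics that are normally
# fitted from the training data; if they are unset when this script runs,
# something like the following is probably needed before normalize() (an
# assumption about the AudioGenerator API used elsewhere in this file):
# data_gen.load_train_data()   # fits feats_mean / feats_std from the train set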