def predict_test(input_to_softmax, model_path, audio_range=100000):
    ''' Decode the test set and write predictions and ground truth to disk.
        audio_range caps the number of utterances decoded; the default of
        100000 effectively means "the whole test set".
    '''
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    transcr = data_gen.test_texts
    audio_path = data_gen.test_audio_paths
    input_to_softmax.load_weights(model_path)
    predictions = []
    num_utterances = min(audio_range, len(audio_path))
    for i in range(num_utterances):
        data_point = data_gen.normalize(data_gen.featurize(audio_path[i]))
        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(
            prediction, output_length)[0][0]) + 1).flatten().tolist()
        predictions.append(''.join(int_sequence_to_text(pred_ints)))
    # write the decoded hypotheses and the matching references side by side
    with open("predictions/predictions.txt", "w") as output:
        output.write(''.join(predictions))
    with open("predictions/truescr.txt", "w") as output:
        output.write(''.join(transcr[:num_utterances]))
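# A usage sketch for predict_test(); the model variable and weights path are
# assumptions -- substitute whatever acoustic model this project builds:
# predict_test(input_to_softmax=model_end,
#              model_path='results/model_end.h5',
#              audio_range=100)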
def get_predictions(index, partition, input_to_softmax, model_path):
    ''' Get the model's decoded prediction for one example, to calculate metrics
    '''
    # load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'valid':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
    else:
        raise Exception('Invalid partition! Must be "train" or "valid"')
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions; the +1 shifts the
    # decoded indices back onto the 1-based character map that
    # int_sequence_to_text expects
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    label = transcr
    predicted = ''.join(int_sequence_to_text(pred_ints))
    return label, predicted
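# NOTE: several helpers below call a wer() function that is not defined in
# this section. A minimal sketch, assuming WER is Levenshtein (edit) distance
# over token sequences normalized by reference length -- swap in the project's
# own implementation (or a package such as jiwer) if one is provided:
def wer(reference, hypothesis):
    """Word (or token) error rate: edit distance / len(reference)."""
    # dp[i][j] = edit distance between reference[:i] and hypothesis[:j]
    dp = [[0] * (len(hypothesis) + 1) for _ in range(len(reference) + 1)]
    for i in range(len(reference) + 1):
        dp[i][0] = i
    for j in range(len(hypothesis) + 1):
        dp[0][j] = j
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[len(reference)][len(hypothesis)] / max(len(reference), 1)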
def calculate_wer2(input_to_softmax, model_path, words=False):
    # data_gen and valid_cache are expected as module-level globals: data_gen
    # holds the loaded validation set, and valid_cache holds the already
    # featurized, normalized validation inputs (see the sketch below)
    wers = []
    input_to_softmax.load_weights(model_path)
    # cap the evaluation at 100 utterances to keep the run short
    num_utterances = min(100, len(data_gen.valid_texts))
    for index in range(num_utterances):
        transcr = data_gen.valid_texts[index]
        data_point = valid_cache[index]
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                     + 1).flatten().tolist()
        pred = ''.join(int_sequence_to_text(pred_ints))
        # word-level WER if words=True, otherwise character-level
        if words:
            w = wer(transcr.split(), pred.split())
        else:
            w = wer(list(transcr), list(pred))
        wers.append(w)
        if index % 100 == 0:
            print(index, len(data_gen.valid_texts), wers[-1])
    print("FINAL WER:", sum(wers) / len(wers), "words:", words)
    return sum(wers) / len(wers)
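# A sketch of how the module-level globals calculate_wer2 relies on might be
# prepared; the exact feature settings here are assumptions:
# data_gen = AudioGenerator(spectrogram=True)
# data_gen.load_train_data()
# data_gen.load_validation_data()
# valid_cache = [data_gen.normalize(data_gen.featurize(path))
#                for path in data_gen.valid_audio_paths[:100]]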
def get_predictions_recorded(
    self,
    spectrogram=False,
    recordingpath='recordings/demo.wav',
):
    """ Print the model's decoded prediction for a live recording
    Params:
        spectrogram (bool): Use spectrogram features (True) or MFCC (False)
        recordingpath (str): Path to the recorded audio file
    """
    # load the train data (needed to fit the normalization statistics)
    data_gen = AudioGenerator(spectrogram=spectrogram)
    data_gen.load_train_data()
    self.audio_path = recordingpath
    # featurize the recorded audio (there is no true transcription here)
    data_point = data_gen.normalize(data_gen.featurize(recordingpath))
    # obtain and decode the acoustic model's predictions
    prediction = self.input_to_softmax.predict(
        np.expand_dims(data_point, axis=0))
    output_length = [
        self.input_to_softmax.output_length(data_point.shape[0])
    ]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    print('-' * 80)
    print('Predicted transcription:\n' + '\n'
          + ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
def compare_predictions(index, partition, inputs_to_softmax=(), model_paths=(),
                        phn=False):
    """ Print several models' decoded predictions for the same example
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        inputs_to_softmax (list of Model): The acoustic models to compare
        model_paths (list of str): Paths to the saved acoustic models' weights
        phn (bool): Use the phoneme transcriptions (True) or word ones (False)
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')
    # obtain and decode each acoustic model's predictions
    pred_ints = []
    for model_path, input_to_softmax in zip(model_paths, inputs_to_softmax):
        input_to_softmax.load_weights(model_path)
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_int = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                    + 1).flatten().tolist()
        pred_ints.append(pred_int)
    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    for i, pred_in in enumerate(pred_ints, start=1):
        print('Predicted transcription number', i, ':\n' + '\n'
              + ''.join(int_sequence_to_text(pred_in, phn)))
        print('-' * 80)
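# A usage sketch for compare_predictions(); the model variables and weight
# paths are assumptions:
# compare_predictions(index=5, partition='test',
#                     inputs_to_softmax=[model_1, model_2],
#                     model_paths=['results/model_1.h5', 'results/model_2.h5'],
#                     phn=True)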
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions with WER and real-time factor
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription, the audio features, and the duration
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        duration = data_gen.valid_durations[index]
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        duration = data_gen.train_durations[index]
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length, greedy=True)[0][0]) + 1).flatten().tolist()
    Audio(audio_path)
    print('-' * 80)
    b = "".join(int_sequence_to_text(pred_ints))
    a = transcr
    print("Greedy predictions:\n" + '\n' + b)
    # word-level WER: compare token sequences, not raw strings
    print('1. Word Error Rate for ASR ==', wer(a.split(), b.split()) * 100, '%')
    # real-time factor: processing time divided by the audio's duration
    endtime = time.time()
    print('2. Real Time Factor for ASR ==', (endtime - starttime) / duration, '\n')
    print('-' * 80)
def get_predictions(index, partition, trained_model, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        trained_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features from the dataset
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    print("Trained model output length:\n"
          + str(trained_model.output_length(data_point.shape[0])))
    # obtain and decode the acoustic model's predictions
    trained_model.load_weights(model_path)
    prediction = trained_model.predict(np.expand_dims(data_point, axis=0))
    output_length = [trained_model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    transcription = ''.join(int_sequence_to_text(pred_ints))
    # correction using the KenLM language model toolkit
    corrected_transcription = correction(transcription)
    print('-' * 80)
    print(repr(audio_path).replace(r"\\", r"/"))  # display with forward slashes
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Raw prediction:\n' + str(prediction[0]))
    print('CTC-decoded predicted ints before conversion to text:\n'
          + str(pred_ints))
    print('Predicted transcription:\n' + '\n' + transcription)
    print('Predicted transcription with correction:\n'
          + corrected_transcription)
    print('-' * 80)
def get_predictions_rec(input_to_softmax, a_path, model_path):
    """ Return the model's decoded prediction for an arbitrary audio file
    Params:
        input_to_softmax (Model): The acoustic model
        a_path (str): Path to the audio file to transcribe
        model_path (str): Path to saved acoustic model's weights
    """
    # the train/validation data is loaded only to fit normalization statistics
    data_gen = AudioGenerator(spectrogram=False)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    audio_path = a_path
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    return 'Predicted transcription:\n' + '\n' + ''.join(
        int_sequence_to_text(pred_ints))
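# A usage sketch for get_predictions_rec(); the wav path and model variable
# are assumptions:
# print(get_predictions_rec(model_end, 'recordings/demo.wav',
#                           'results/model_end.h5'))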
def get_predictions(index, partition, input_to_softmax, model_path,
                    spectrogram=True, mfcc_dim=13):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        spectrogram (bool): Use spectrogram features (True) or MFCC (False)
        mfcc_dim (int): Number of MFCC features, if used
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=spectrogram, mfcc_dim=mfcc_dim)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    # obtain and decode the acoustic model's predictions
    # (weights are looked up relative to the results/ directory)
    input_to_softmax.load_weights('results/' + model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Predicted transcription:\n' + '\n'
          + ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
def get_predictions(data_gen: AudioGenerator, model, partition, index,
                    omit_true=False, print_line=True):
    """ Print a model's decoded predictions
    Params:
        data_gen (AudioGenerator): A generator with train/validation data loaded
        model (Model): The acoustic model, with weights already loaded
        partition (str): One of 'train' or 'validation'
        index (int): The example you would like to visualize
        omit_true (bool): Skip printing the true transcription
        print_line (bool): Print a separator line after the output
    """
    # the caller supplies a ready data generator instead of loading one here
    if data_gen is None:
        raise ValueError("Data Generator is None!")
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcription = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
    elif partition == 'train':
        transcription = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    # obtain and decode the acoustic model's predictions
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    output_length = [model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length)[0][0]) + 1).flatten().tolist()
    # display the true and predicted transcriptions
    input_type = "SPEC" if data_gen.spectrogram else "MFCC"
    if not omit_true:
        print('TRUE: ' + transcription)
    print('PRED ' + input_type + ': ' + ''.join(int_sequence_to_text(pred_ints)))
    if print_line:
        print('-' * 82)
    return audio_path
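# The featurize -> normalize -> predict -> ctc_decode -> (+1) -> text pipeline
# recurs in every helper above; a minimal factored sketch of it (greedy
# decoding, names matching the surrounding code), offered as an assumption
# rather than as part of the original module:
def decode_audio(model, data_gen, audio_path):
    """Greedy CTC decode of a single audio file to text."""
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    output_length = [model.output_length(data_point.shape[0])]
    # the +1 shifts decoded indices back onto the 1-based character map
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    return ''.join(int_sequence_to_text(pred_ints))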
def test_gen():
    # build spectrogram and MFCC generators over the same duration-sorted
    # training corpus, one utterance per minibatch
    s_gen = AudioGeneratorCached(spectrogram=True, minibatch_size=1,
                                 sort_by_duration=True)
    s_gen.load_train_data('train_corpus.json')
    m_gen = AudioGeneratorCached(spectrogram=False, minibatch_size=1,
                                 sort_by_duration=True)
    m_gen.load_train_data('train_corpus.json')
    # walk the first few spectrogram minibatches and print their shapes
    # alongside the decoded label text (a side-by-side comparison with the
    # MFCC generator m_gen is left disabled)
    counter = 0
    for j in s_gen.next_train():
        print(j[0]['the_input'].shape, j[0]['the_labels'].shape, end=' ')
        # shift the label ints back onto the 1-based character map
        ints = [t + 1 for t in j[0]['the_labels'][0]]
        print("".join(int_sequence_to_text(ints)))
        counter += 1
        if counter > 30:
            break

# test_gen()
def get_predictions(audio_path, input_to_softmax, model_path):
    """ Print a model's decoded prediction for an audio file
    Params:
        audio_path (str): Path to the audio file to transcribe
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    # extract MFCC features (not normalized yet)
    data_point = data_gen.featurize(audio_path)
    # precomputed normalization statistics for the 13 MFCC dimensions,
    # hard-coded so the training set does not have to be reloaded here
    feats_mean = np.array([
        14.81652005, -0.1802923, -1.22285122, 0.87062853, -16.05643781,
        -14.03943633, -5.7298706, -15.52425927, -3.39637537, -3.85226744,
        -5.17435844, -2.13766871, -11.39111645
    ])
    feats_std = np.array([
        7.16816358, 14.58747728, 11.99928947, 15.69431836, 14.45918537,
        16.79930368, 13.98395715, 12.60133111, 11.61310503, 11.34526655,
        12.01205471, 13.41467652, 10.89021869
    ])
    # normalize the features
    eps = 1e-14
    data_point = (data_point - feats_mean) / (feats_std + eps)
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    recognized_text = "".join(int_sequence_to_text(pred_ints))
    print(recognized_text)
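# The hard-coded feats_mean / feats_std above are precomputed MFCC statistics;
# a sketch of how they might be regenerated from the training set (fit_train
# is an assumption about the AudioGenerator API used elsewhere in this file):
# data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
# data_gen.load_train_data()
# data_gen.fit_train(100)   # fits data_gen.feats_mean / data_gen.feats_std
# print(data_gen.feats_mean, data_gen.feats_std)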
def get_group_predictions(input_to_softmax, model_path, partition):
    """ Decode a random sample of utterances and report the average WER
        and the average decode time per utterance.
    """
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    input_to_softmax.load_weights(model_path)
    # pick the partition to sample from
    if partition == 'validation':
        texts, audio_paths = data_gen.valid_texts, data_gen.valid_audio_paths
        num_samples, max_index = 100, 2500
    elif partition == 'train':
        texts, audio_paths = data_gen.train_texts, data_gen.train_audio_paths
        num_samples, max_index = 1000, 10000
    else:
        raise Exception('Invalid partition! Must be "train" or "validation"')
    wer_sum = 0
    scored = 0
    for _ in range(num_samples):
        index = random.randint(1, max_index)
        transcr = texts[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_paths[index]))
        # obtain and decode the acoustic model's predictions (beam search)
        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False,
                                         beam_width=100, top_paths=1)[0][0])
                     + 1).flatten().tolist()
        a = transcr
        b = "".join(int_sequence_to_text(pred_ints))
        w = wer(a, b)
        # only score utterances with WER <= 1 (the original skipped outliers)
        if w <= 1:
            print('index_%d' % index, ':')
            wer_sum += w
            scored += 1
            print(w)
            print("Transcription: ", a)
            print("Prediction:    ", b)
            print('-' * 80)
    endtime = time.time()
    print('1. Average Word Error Rate for ASR ==', wer_sum / max(scored, 1))
    # note: this is average wall-clock decode time per utterance, not a true
    # real-time factor (which would divide by the audio duration)
    print('2. Average decode time per utterance ==',
          (endtime - starttime) / num_samples, '\n')
def get_predictions(indexes, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions under several decoding schemes
    Params:
        indexes (list of int): The examples you would like to visualize
        partition (str): One of 'train', 'validation' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # load the language model over the character alphabet
    alphabet = ''.join(index_map.values())
    language_model = LanguageModel('data/word/corpus.txt', alphabet)
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    for index in indexes:
        # obtain the true transcription and the audio features
        if partition == 'validation':
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
        elif partition == 'train':
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
        elif partition == 'test':
            # fixed demo utterance for the 'test' partition
            transcr = 'hello how are you'
            audio_path = '../datasets/AgentExpress/hello.wav.wav'
        else:
            raise Exception(
                'Invalid partition! Must be "train", "validation" or "test"')
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                     + 1).flatten().tolist()
        # play the audio file, and display the true and predicted transcriptions
        print('-' * 80)
        Audio(audio_path)
        print('ground_truth:' + ' ' * 4 + transcr)
        print('best_path:' + ' ' * 7 + ''.join(int_sequence_to_text(pred_ints)))
        pred_beam = ctcBeamSearch(prediction[0], alphabet, None)
        print('beam_search:' + ' ' * 5 + pred_beam)
        pred_beam_lm = ctcBeamSearch(prediction[0], alphabet, language_model)
        print('beam_search_lm:' + ' ' * 2 + pred_beam_lm)
        pred_token = ctcTokenPassing(prediction[0], alphabet,
                                     language_model.getWordList())
        print('token_passing:' + ' ' * 3 + pred_token)
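# A usage sketch comparing the four decoding schemes above on a few
# validation utterances (the model variable and weights path are assumptions):
# get_predictions(indexes=[0, 1, 2], partition='validation',
#                 input_to_softmax=model_end,
#                 model_path='results/model_end.h5')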
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        phn (bool): Use the phoneme transcriptions (True) or word ones (False)
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()
    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
                 + 1).flatten().tolist()
    # play the audio file, and display the true and predicted transcriptions
    if not phn:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n'
              + ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n')
        split_true = transcr.split(" ")
        split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
        # print each predicted phoneme in green (ANSI \033[1;32m) if it matches
        # the true phoneme at the same position or an immediate neighbour,
        # otherwise in red (\033[1;31m)
        print("\033[1;32m" + split_pred[0] + " ", end='')
        limit = min(len(split_true) - 1, len(split_pred))
        for i in range(1, limit):
            if (split_true[i - 1] == split_pred[i]
                    or split_true[i] == split_pred[i]
                    or split_true[i + 1] == split_pred[i]):
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        # print the final phoneme (guarding against a short prediction) and
        # reset the terminal colour
        if len(split_pred) > limit:
            print(split_pred[limit] + " ", end='')
        print("\033[0m")
        displayAccuracy(split_true, split_pred, phn)
    # (tail of the bidirectional_rnn_model definition: for this architecture
    # the output length equals the input length)
    model.output_length = lambda x: x
    print(model.summary())
    return model


model = bidirectional_rnn_model(
    input_dim=161,  # change to 13 if you would like to use MFCC features
    units=512 + 32)
print('load Model')
model.load_weights('results/model_20.h5')

data_gen = AudioGenerator()
print("Load file")
audio_path = 'output.wav'
data_point = data_gen.normalize(data_gen.featurize(audio_path))

print("Start prediction")
prediction = model.predict(np.expand_dims(data_point, axis=0), batch_size=1)
output_length = [model.output_length(data_point.shape[0])]
pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0])
             + 1).flatten().tolist()
print(prediction)
print(output_length)
print(pred_ints)
print('Predicted transcription:\n' + '\n'
      + ''.join(int_sequence_to_text(pred_ints)))
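# NOTE: data_gen.normalize() relies on feature statistics that are normally
# fitted from the training data; if they are unset when this script runs,
# something like the following is probably needed before normalize() (an
# assumption about the AudioGenerator API used elsewhere in this file):
# data_gen.load_train_data()   # fits feats_mean / feats_std from the train set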