Esempio n. 1
0
    def __init__(self,
                 data_valid,
                 model,
                 language,
                 target_dir,
                 num_epochs,
                 num_minutes=None,
                 save_progress=True,
                 early_stopping=False,
                 shuffle_data=True,
                 force_output=False,
                 lm_path=None,
                 vocab_path=None):
        """
        Will calculate WER and LER at epoch end and print out inferred transcriptions from validation set using the
        current model and weights
        :param data_valid: validation data
        :param model: compiled model
        :param language: language handed to the greedy (best-path) and beam search decoders
        :param target_dir: directory that identifies the current run (created if it does not exist yet)
        :param num_epochs: number of epochs; determines the number of rows of the WER/LER history
        :param num_minutes: optional value that becomes part of the base name of the result files
        :param save_progress: whether to save the model if a new WER/LER benchmark was achieved
        :param early_stopping: whether to stop the model if the WER/LER did not improve over the last 5 epochs
        :param shuffle_data: whether to shuffle the validation data before reporting
        :param force_output: whether to output the decoded inferences for all validation samples or only the ones
                             with WER < 0.6
        :param lm_path: optional path to a language model; if set, the LM and the vocabulary are loaded
        :param vocab_path: path to the vocabulary belonging to the LM (only read when lm_path is set)
        """
        super().__init__()
        self.data_valid = data_valid
        self.model = model
        self.language = language
        self.target_dir = target_dir
        self.num_epochs = num_epochs
        self.num_minutes = num_minutes
        self.save_progress = save_progress
        self.early_stopping = early_stopping
        self.shuffle_data = shuffle_data
        self.force_output = force_output
        # always define the LM-related attributes so they can be read safely even when no LM was requested
        self.lm_path, self.vocab_path = lm_path, vocab_path
        self.lm = None
        self.lm_vocab = None
        self.decoders = {}
        if lm_path:
            self.lm = load_lm(lm_path)
            self.lm_vocab = load_vocab(vocab_path)

        # exist_ok avoids the check-then-create race of a separate isdir() test
        makedirs(self.target_dir, exist_ok=True)

        self.decoder_greedy = BestPathDecoder(model, language)
        self.decoder_beam = BeamSearchDecoder(model, language)

        # WER/LER history: one row per epoch, one column per (metric, decoding strategy, LM use) combination
        columns = pd.MultiIndex.from_product(
            [metrics, decoding_strategies, lm_uses],
            names=['metric', 'decoding strategy', 'LM correction'])
        self.df_history = pd.DataFrame(index=np.arange(num_epochs),
                                       columns=columns)
        # base name for files that will be written to target directory
        self.base_name = 'model' + (f'_{self.num_minutes}_min'
                                    if self.num_minutes else '')
        print(f'base name for result files: {self.base_name}')
Esempio n. 2
0
def main(args):
    """
    Run the alignment pipeline for a single audio/transcript pair and store the results.

    The arguments are resolved by ``setup``, the optional LM and vocabulary are loaded, the audio is
    preprocessed and split into voiced segments, and the segments are aligned with ``pipeline``.
    Afterwards the statistics are calculated, the demo files are created and the statistics are printed
    and written to ``stats.csv`` in the target directory.

    :param args: parsed command line arguments (must provide force_realignment and align_endings)
    """
    print(create_args_str(args))
    lang, audio, trans, keras, ds, ds_alpha, ds_trie, lm, vocab, target_dir, normalize, gpu = setup(args)
    print(f'all artifacts will be saved to {target_dir}')

    # replace the paths by the loaded objects (or None if no path was given)
    lm = load_lm(lm) if lm else None
    vocab = load_vocab(vocab) if vocab else None

    audio_bytes, sample_rate, transcript, language = preprocess(audio, trans, lang, norm_transcript=normalize)
    voiced_segments = vad(audio_bytes, sample_rate)
    # NOTE(review): the language is hard-coded to 'en' although preprocess() returns one -- confirm whether
    # the returned `language` should be passed on instead
    df_alignments = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate, transcript=transcript,
                             language='en',
                             ds_path=ds, ds_alpha_path=ds_alpha, ds_trie_path=ds_trie,
                             keras_path=keras, lm=lm, vocab=vocab,
                             force_realignment=args.force_realignment, align_endings=args.align_endings,
                             target_dir=target_dir)

    df_stats = calculate_stats(df_alignments, ds, transcript)
    create_demo_files(target_dir, audio, transcript, df_alignments, df_stats)

    print()
    print_dataframe(df_stats)
    print()

    stats_csv = join(target_dir, 'stats.csv')
    print(f'Saving stats to {stats_csv}')
    # write the statistics (not the alignments) to the file announced as stats file
    df_stats.to_csv(stats_csv)
Esempio n. 3
0
def main(args):
    """
    Evaluate the Keras pipeline on German test entries and write a performance summary.

    The test entries are the distinct entries of the test set of the 'rl' corpus plus six fixed entries
    of the 'pc' corpus. Each entry is aligned with ``pipeline`` in its own sub-directory of the target
    directory; the per-entry statistics are concatenated and saved to ``performance.csv``.

    :param args: parsed command line arguments (must provide norm_transcript, force_realignment and
                 align_endings)
    """
    print(create_args_str(args))
    target_dir, keras_path, lm_path, vocab_path, gpu = setup(args)
    print(f'all results will be written to {target_dir}')

    lm = None
    if lm_path:
        lm = load_lm(lm_path)
    vocab = None
    if vocab_path:
        vocab = load_vocab(vocab_path)

    rl_corpus = get_corpus('rl', 'de')
    rl_corpus.summary()
    # several segments may belong to the same entry: keep every entry only once
    test_entries = list({segment.entry for segment in rl_corpus.test_set()})

    # add 6 entries from PodClub corpus
    pc_corpus = get_corpus('pc', 'de')
    pc_corpus.summary()
    podclub_ids = ['record1058', 'record1063', 'record1076',
                   'record1523', 'record1548', 'record1556']
    test_entries += [pc_corpus[record_id] for record_id in podclub_ids]

    stats = []
    for i, entry in enumerate(test_entries):
        print(f'entry {i + 1}/{len(test_entries)}')
        audio_file, sample_rate = entry.audio_path, entry.rate

        with open(entry.transcript_path, encoding='utf-8') as f:
            transcript = f.read()
        if args.norm_transcript:
            transcript = normalize(transcript, 'de')

        # every entry gets its own sub-directory named after the audio file (without extension)
        demo_id, _ = splitext(basename(audio_file))
        target_dir_entry = join(target_dir, demo_id)
        if not exists(target_dir_entry):
            makedirs(target_dir_entry)

        voiced_segments = []
        for segment in entry:
            voiced_segments.append(
                Voice(segment.audio, segment.rate, segment.start_frame, segment.end_frame))

        df_alignments = pipeline(voiced_segments=voiced_segments,
                                 sample_rate=sample_rate,
                                 transcript=transcript,
                                 language='de',
                                 keras_path=keras_path,
                                 lm=lm,
                                 vocab=vocab,
                                 force_realignment=args.force_realignment,
                                 align_endings=args.align_endings,
                                 target_dir=target_dir_entry)

        df_stats = calculate_stats(df_alignments, keras_path, transcript)

        # average similarity between the Keras alignments and the original alignments of the entry
        original_alignments = [segment.transcript for segment in entry.segments]
        similarities = [
            levenshtein_similarity(keras_alignment, original_alignment)
            for keras_alignment, original_alignment
            in zip(df_alignments['alignment'], original_alignments)
        ]
        df_stats['similarity'] = np.mean(similarities)

        create_demo_files(target_dir_entry, audio_file, transcript,
                          df_alignments, df_stats)
        stats.append(df_stats)

    df_keras = pd.concat(stats)
    csv_keras = join(target_dir, 'performance.csv')
    df_keras.to_csv(csv_keras)
    print(f'summary saved to {csv_keras}')

    visualize_pipeline_performance(csv_keras, csv_ds=None, silent=True)
    update_index(target_dir,
                 lang='de',
                 num_aligned=len(test_entries),
                 df_keras=df_keras,
                 keras_path=keras_path,
                 lm_path=lm_path,
                 vocab_path=vocab_path)
    K.clear_session()
Esempio n. 4
0
 def test_correction(self):
     """A misspelled English sentence is expected to be fully corrected using the LM and its vocabulary."""
     lm_binary = '/media/daniel/IP9/lm/ds_en/lm.binary'
     lm_vocab = '/media/daniel/IP9/lm/ds_en/lm_80k.vocab'
     lm = load_lm(lm_binary)
     vocab = load_vocab(lm_vocab)

     misspelled = 'i seee i sey saind the blnd manp to his deaf dauhgter'
     expected = 'i see i see said the blind man to his deaf daughter'

     actual = lm_util.correction(misspelled, 'en', lm=lm, vocab=vocab)
     assert_that(actual, is_(expected))
Esempio n. 5
0
 def test_load_lm(self):
     """Loading the LM and the vocabulary must both yield something other than None."""
     lm_binary = '/media/daniel/IP9/lm/ds_en/lm.binary'
     lm_vocab = '/media/daniel/IP9/lm/ds_en/lm_80k.vocab'

     # load both artifacts first, then check them in the same order as they were loaded
     loaded_lm = load_lm(lm_binary)
     loaded_vocab = load_vocab(lm_vocab)

     assert_that(loaded_lm, is_(not_(None)))
     assert_that(loaded_vocab, is_(not_(None)))
Esempio n. 6
0
def main(args):
    """
    Evaluate the DeepSpeech and the Keras pipeline on English audio/transcript samples.

    Every sample is preprocessed, split into voiced segments and aligned twice: once using the DeepSpeech
    model and once using the Keras model. Statistics are calculated for both runs together with the
    average Levenshtein similarity between the two sets of alignments. Demo files are written per
    sample and pipeline, and the concatenated statistics are saved to ``performance_keras.csv`` and
    ``performance_ds.csv`` in the target directory.

    :param args: parsed command line arguments (must provide force_realignment and align_endings)
    """
    print(create_args_str(args))
    demo_files, target_dir, keras_path, ds_path, ds_alpha, ds_trie, lm_path, vocab_path, normalize, gpu = setup(
        args)
    num_files = len(demo_files)
    print(
        f'Processing {num_files} audio/transcript samples. All results will be written to {target_dir}'
    )

    lm = load_lm(lm_path) if lm_path else None
    vocab = load_vocab(vocab_path) if vocab_path else None

    stats_keras, stats_ds = [], []
    for i, (audio, transcript) in enumerate(demo_files):
        print(
            '-----------------------------------------------------------------'
        )
        print(f'{i + 1}/{num_files}: Evaluating pipeline on {audio}')
        print(
            '-----------------------------------------------------------------'
        )
        # each sample gets one result directory per pipeline, named after the audio file
        demo_id = splitext(basename(audio))[0]
        target_dir_ds = join(target_dir, demo_id + '_ds')
        target_dir_keras = join(target_dir, demo_id + '_keras')

        audio_bytes, sample_rate, transcript, language = preprocess(
            audio, transcript, 'en', norm_transcript=normalize)
        voiced_segments = vad(audio_bytes, sample_rate)

        # pass the loaded LM and vocabulary exactly like the Keras call below does; previously the loaded
        # LM object was handed over as `lm_path` and the vocabulary was not passed at all
        df_alignments_ds = pipeline(voiced_segments=voiced_segments,
                                    sample_rate=sample_rate,
                                    transcript=transcript,
                                    language='en',
                                    ds_path=ds_path,
                                    ds_alpha_path=ds_alpha,
                                    ds_trie_path=ds_trie,
                                    lm=lm,
                                    vocab=vocab,
                                    force_realignment=args.force_realignment,
                                    align_endings=args.align_endings,
                                    target_dir=target_dir_ds)
        df_stats_ds = calculate_stats(df_alignments_ds, ds_path, transcript)

        df_alignments_keras = pipeline(
            voiced_segments=voiced_segments,
            sample_rate=sample_rate,
            transcript=transcript,
            language='en',
            keras_path=keras_path,
            lm=lm,
            vocab=vocab,
            force_realignment=args.force_realignment,
            align_endings=args.align_endings,
            target_dir=target_dir_keras)
        df_stats_keras = calculate_stats(df_alignments_keras, keras_path,
                                         transcript)

        # average similarity between Keras and DeepSpeech alignments
        av_similarity = np.mean([
            levenshtein_similarity(al_keras, al_ds)
            for (al_keras, al_ds) in zip(df_alignments_keras['alignment'],
                                         df_alignments_ds['alignment'])
        ])

        # the similarity is symmetric, so both statistics get the same value
        df_stats_ds['similarity'] = av_similarity
        df_stats_keras['similarity'] = av_similarity
        stats_ds.append(df_stats_ds)
        stats_keras.append(df_stats_keras)

        create_demo_files(target_dir_ds, audio, transcript, df_alignments_ds,
                          df_stats_ds)
        create_demo_files(target_dir_keras, audio, transcript,
                          df_alignments_keras, df_stats_keras)

    df_keras = pd.concat(stats_keras)
    csv_keras = join(target_dir, 'performance_keras.csv')
    df_keras.to_csv(csv_keras)

    df_ds = pd.concat(stats_ds)
    csv_ds = join(target_dir, 'performance_ds.csv')
    df_ds.to_csv(csv_ds)
    print(f'summary saved to {csv_keras}')

    visualize_pipeline_performance(csv_keras, csv_ds, silent=True)
    update_index(target_dir,
                 lang='en',
                 num_aligned=len(demo_files),
                 df_keras=df_keras,
                 keras_path=keras_path,
                 df_ds=df_ds,
                 ds_path=ds_path,
                 lm_path=lm_path,
                 vocab_path=vocab_path)

    print(f'Done! Demos have been saved to {target_dir}')