def __init__(self, data_valid, model, language, target_dir, num_epochs, num_minutes=None, save_progress=True,
             early_stopping=False, shuffle_data=True, force_output=False, lm_path=None, vocab_path=None):
    """
    Will calculate WER and LER at epoch end and print out inferred transcriptions from validation set using the
    current model and weights

    :param data_valid: validation data
    :param model: compiled model
    :param language: language handed to the best-path and beam search decoders
    :param target_dir: string that identifies the current run; used as a directory that is created if missing
    :param num_epochs: number of epochs; the WER/LER history gets one row per epoch
    :param num_minutes: optional number of minutes; if truthy it becomes part of the base name of result files
    :param save_progress: whether to save the model if a new WER/LER benchmark was achieved
    :param early_stopping: whether to stop the model if the WER/LER did not improve over the last 5 epochs
    :param shuffle_data: whether to shuffle the validation data before reporting
    :param force_output: whether to output the decoded inferences for all validation samples or only the ones
                         with WER < 0.6
    :param lm_path: optional path to a language model; if truthy the LM and the vocabulary are loaded
    :param vocab_path: path to the vocabulary, only read when lm_path is truthy
    """
    super().__init__()
    self.data_valid = data_valid
    self.model = model
    self.language = language
    self.target_dir = target_dir
    self.num_epochs = num_epochs
    self.num_minutes = num_minutes
    self.save_progress = save_progress
    self.early_stopping = early_stopping
    self.shuffle_data = shuffle_data
    self.force_output = force_output

    # always define the path attributes so that reading them never raises AttributeError when no LM is used
    self.lm_path, self.vocab_path = lm_path, vocab_path
    self.lm = None
    self.lm_vocab = None
    self.decoders = {}
    if lm_path:
        self.lm = load_lm(lm_path)
        self.lm_vocab = load_vocab(vocab_path)

    # exist_ok avoids the check-then-create race of a separate isdir() test
    makedirs(self.target_dir, exist_ok=True)

    self.decoder_greedy = BestPathDecoder(model, language)
    self.decoder_beam = BeamSearchDecoder(model, language)

    # WER/LER history
    columns = pd.MultiIndex.from_product([metrics, decoding_strategies, lm_uses],
                                         names=['metric', 'decoding strategy', 'LM correction'])
    self.df_history = pd.DataFrame(index=np.arange(num_epochs), columns=columns)

    # base name for files that will be written to target directory
    self.base_name = 'model' + (f'_{self.num_minutes}_min' if self.num_minutes else '')
    print(f'base name for result files: {self.base_name}')
def main(args):
    """
    Align a single audio/transcript pair and write all artifacts to the target directory.

    The settings are derived from the arguments with setup(). The optional LM and vocabulary are loaded, the
    audio/transcript are preprocessed, voiced segments are detected and aligned by the pipeline. Statistics are
    calculated for the alignments, demo files are created, and the statistics are printed and saved as
    'stats.csv' in the target directory.

    :param args: arguments as expected by setup(); force_realignment and align_endings are read directly
    """
    print(create_args_str(args))
    # gpu is part of the tuple returned by setup() but is not used in this function
    lang, audio, trans, keras, ds, ds_alpha, ds_trie, lm, vocab, target_dir, normalize, gpu = setup(args)
    print(f'all artifacts will be saved to {target_dir}')

    lm = load_lm(lm) if lm else None
    vocab = load_vocab(vocab) if vocab else None

    audio_bytes, sample_rate, transcript, language = preprocess(audio, trans, lang, norm_transcript=normalize)
    voiced_segments = vad(audio_bytes, sample_rate)

    # use the language reported by preprocess() instead of a hard-coded 'en', so that the configured language
    # is applied consistently to preprocessing and alignment
    df_alignments = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate, transcript=transcript,
                             language=language,
                             ds_path=ds, ds_alpha_path=ds_alpha, ds_trie_path=ds_trie,
                             keras_path=keras, lm=lm, vocab=vocab,
                             force_realignment=args.force_realignment, align_endings=args.align_endings,
                             target_dir=target_dir)

    df_stats = calculate_stats(df_alignments, ds, transcript)
    create_demo_files(target_dir, audio, transcript, df_alignments, df_stats)

    print()
    print_dataframe(df_stats)
    print()

    stats_csv = join(target_dir, 'stats.csv')
    print(f'Saving stats to {stats_csv}')
    # write the statistics (not the alignments) to stats.csv, as announced in the message above
    df_stats.to_csv(stats_csv)
def main(args):
    """
    Evaluate the Keras pipeline on German test entries and summarize its performance.

    The test entries are the distinct entries of the test set of the 'rl' corpus plus six hand-picked entries
    from the PodClub corpus. Each entry is aligned into its own sub-directory of the target directory. The
    per-entry statistics are concatenated and saved as 'performance.csv', visualized and added to the index.

    :param args: arguments as expected by setup(); norm_transcript, force_realignment and align_endings are
                 read directly
    """
    print(create_args_str(args))
    # gpu is part of the tuple returned by setup() but is not used in this function
    target_dir, keras_path, lm_path, vocab_path, gpu = setup(args)
    print(f'all results will be written to {target_dir}')

    lm = None
    if lm_path:
        lm = load_lm(lm_path)
    vocab = None
    if vocab_path:
        vocab = load_vocab(vocab_path)

    corpus = get_corpus('rl', 'de')
    corpus.summary()
    # several segments may share an entry: a set keeps every entry only once
    test_entries = list({segment.entry for segment in corpus.test_set()})

    # add 6 entries from PodClub corpus
    corpus = get_corpus('pc', 'de')
    corpus.summary()
    podclub_ids = ('record1058', 'record1063', 'record1076', 'record1523', 'record1548', 'record1556')
    test_entries.extend([corpus[record_id] for record_id in podclub_ids])

    stats = []
    for entry_no, entry in enumerate(test_entries, start=1):
        print(f'entry {entry_no}/{len(test_entries)}')

        audio_file, sample_rate = entry.audio_path, entry.rate
        with open(entry.transcript_path, encoding='utf-8') as transcript_file:
            transcript = transcript_file.read()
        if args.norm_transcript:
            transcript = normalize(transcript, 'de')

        # results of an entry go to a sub-directory named after its audio file (without extension)
        demo_id, _ = splitext(basename(audio_file))
        target_dir_entry = join(target_dir, demo_id)
        if not exists(target_dir_entry):
            makedirs(target_dir_entry)

        voiced_segments = [Voice(s.audio, s.rate, s.start_frame, s.end_frame) for s in entry]
        df_alignments = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate, transcript=transcript,
                                 language='de',
                                 keras_path=keras_path, lm=lm, vocab=vocab,
                                 force_realignment=args.force_realignment, align_endings=args.align_endings,
                                 target_dir=target_dir_entry)
        df_stats = calculate_stats(df_alignments, keras_path, transcript)

        # calculate average similarity between Keras-alignment and original alignments
        original_alignments = [s.transcript for s in entry.segments]
        similarities = [levenshtein_similarity(keras_alignment, original_alignment)
                        for keras_alignment, original_alignment
                        in zip(df_alignments['alignment'], original_alignments)]
        df_stats['similarity'] = np.mean(similarities)

        create_demo_files(target_dir_entry, audio_file, transcript, df_alignments, df_stats)
        stats.append(df_stats)

    df_keras = pd.concat(stats)
    csv_keras = join(target_dir, 'performance.csv')
    df_keras.to_csv(csv_keras)
    print(f'summary saved to {csv_keras}')

    visualize_pipeline_performance(csv_keras, csv_ds=None, silent=True)
    update_index(target_dir, lang='de', num_aligned=len(test_entries),
                 df_keras=df_keras, keras_path=keras_path,
                 lm_path=lm_path, vocab_path=vocab_path)
    K.clear_session()
def test_correction(self):
    """Check that lm_util.correction() repairs a misspelled English sentence using the LM and its vocabulary."""
    language_model = load_lm('/media/daniel/IP9/lm/ds_en/lm.binary')
    vocabulary = load_vocab('/media/daniel/IP9/lm/ds_en/lm_80k.vocab')
    misspelled = 'i seee i sey saind the blnd manp to his deaf dauhgter'
    expected = 'i see i see said the blind man to his deaf daughter'

    corrected = lm_util.correction(misspelled, 'en', lm=language_model, vocab=vocabulary)

    assert_that(corrected, is_(expected))
def test_load_lm(self):
    """Check that loading the language model and its vocabulary both yield something other than None."""
    loaded_artifacts = (
        load_lm('/media/daniel/IP9/lm/ds_en/lm.binary'),
        load_vocab('/media/daniel/IP9/lm/ds_en/lm_80k.vocab'),
    )
    for artifact in loaded_artifacts:
        assert_that(artifact, is_(not_(None)))
def main(args):
    """
    Evaluate the pipeline on English audio/transcript samples, once with DeepSpeech and once with Keras.

    Every sample is preprocessed and split into voiced segments once. The segments are then aligned by both
    pipeline variants into separate sub-directories ('<demo_id>_ds' and '<demo_id>_keras') of the target
    directory. The per-sample statistics of both variants (including the average similarity between their
    alignments) are concatenated and saved as 'performance_keras.csv' and 'performance_ds.csv', visualized and
    added to the index.

    :param args: arguments as expected by setup(); force_realignment and align_endings are read directly
    """
    print(create_args_str(args))
    # gpu is part of the tuple returned by setup() but is not used in this function
    demo_files, target_dir, keras_path, ds_path, ds_alpha, ds_trie, lm_path, vocab_path, normalize, gpu = setup(args)
    num_files = len(demo_files)
    print(f'Processing {num_files} audio/transcript samples. All results will be written to {target_dir}')

    lm = load_lm(lm_path) if lm_path else None
    vocab = load_vocab(vocab_path) if vocab_path else None

    separator = '-----------------------------------------------------------------'
    stats_keras, stats_ds = [], []
    for i, (audio, transcript) in enumerate(demo_files):
        print(separator)
        print(f'{i + 1}/{num_files}: Evaluating pipeline on {audio}')
        print(separator)

        demo_id = splitext(basename(audio))[0]
        target_dir_ds = join(target_dir, demo_id + '_ds')
        target_dir_keras = join(target_dir, demo_id + '_keras')

        # from here on `transcript` is whatever preprocess() returns for the transcript of this sample
        audio_bytes, sample_rate, transcript, language = preprocess(audio, transcript, 'en',
                                                                    norm_transcript=normalize)
        voiced_segments = vad(audio_bytes, sample_rate)

        # pass the LM path (not the loaded LM object) for the `lm_path` keyword
        # NOTE(review): the other pipeline() calls use lm=/vocab= -- confirm that the DeepSpeech variant
        # expects a path here
        df_alignments_ds = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate,
                                    transcript=transcript,
                                    language='en',
                                    ds_path=ds_path, ds_alpha_path=ds_alpha, ds_trie_path=ds_trie,
                                    lm_path=lm_path,
                                    force_realignment=args.force_realignment, align_endings=args.align_endings,
                                    target_dir=target_dir_ds)
        df_stats_ds = calculate_stats(df_alignments_ds, ds_path, transcript)

        df_alignments_keras = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate,
                                       transcript=transcript,
                                       language='en',
                                       keras_path=keras_path, lm=lm, vocab=vocab,
                                       force_realignment=args.force_realignment, align_endings=args.align_endings,
                                       target_dir=target_dir_keras)
        df_stats_keras = calculate_stats(df_alignments_keras, keras_path, transcript)

        # average similarity between Keras and DeepSpeech alignments
        av_similarity = np.mean([levenshtein_similarity(al_keras, al_ds)
                                 for (al_keras, al_ds)
                                 in zip(df_alignments_keras['alignment'], df_alignments_ds['alignment'])])
        df_stats_ds['similarity'] = av_similarity
        df_stats_keras['similarity'] = av_similarity
        stats_ds.append(df_stats_ds)
        stats_keras.append(df_stats_keras)

        create_demo_files(target_dir_ds, audio, transcript, df_alignments_ds, df_stats_ds)
        create_demo_files(target_dir_keras, audio, transcript, df_alignments_keras, df_stats_keras)

    df_keras = pd.concat(stats_keras)
    csv_keras = join(target_dir, 'performance_keras.csv')
    df_keras.to_csv(csv_keras)

    df_ds = pd.concat(stats_ds)
    csv_ds = join(target_dir, 'performance_ds.csv')
    df_ds.to_csv(csv_ds)
    print(f'summary saved to {csv_keras}')

    visualize_pipeline_performance(csv_keras, csv_ds, silent=True)
    update_index(target_dir, lang='en', num_aligned=len(demo_files),
                 df_keras=df_keras, keras_path=keras_path,
                 df_ds=df_ds, ds_path=ds_path,
                 lm_path=lm_path, vocab_path=vocab_path)

    print(f'Done! Demos have been saved to {target_dir}')