def segment_and_asr(audio_path):
    """
    Reads the audio file, calculates log-probabilities of tokens using a
    trained NeMo model and segments the audio file into smaller sections.

    :returns: log-probabilities of tokens in individual frames,
              step size of frames (in seconds),
              vocabulary (list of tokens),
              list of segments in the audio file (start, end in seconds)
    """
    sr = sox.file_info.sample_rate(audio_path)
    nsamples = np.round(sox.file_info.duration(audio_path) * sr)
    _, log_prob, vocab = infer(model_path, [audio_path], 1)
    step = np.round(nsamples / len(log_prob[0]))
    log_prob_step_sec = step / sr

    # Run VAD on the input file
    frames = read_frames_from_file(audio_path, model_format, frame_duration_ms=20)
    split = vad_split(frames, model_format, threshold=0.5, aggressiveness=2)
    segments = []
    for i, segment in enumerate(split):
        segment_buffer, time_start, time_end = segment
        time_length = time_end - time_start
        if stt_min_duration_ms and time_length < stt_min_duration_ms:
            print('Fragment {}: Audio too short for STT'.format(i))
            continue
        if stt_max_duration_ms and time_length > stt_max_duration_ms:
            print('Fragment {}: Audio too long for STT'.format(i))
            continue
        segments.append((time_start, time_end))
    return log_prob[0], log_prob_step_sec, vocab, segments
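
# A minimal usage sketch (not part of the pipeline itself), assuming the
# module-level `model_path`, `model_format`, `stt_min_duration_ms` and
# `stt_max_duration_ms` referenced above are already configured;
# 'speech.wav' is a hypothetical file name.
def example_segment_and_asr(audio_path='speech.wav'):
    log_probs, step_sec, vocab, segments = segment_and_asr(audio_path)
    # One row of token log-probabilities per frame, frames step_sec apart.
    print('{} frames every {:.4f}s, vocabulary of {} tokens'.format(
        len(log_probs), step_sec, len(vocab)))
    for start, end in segments:
        print('voiced segment: {} -> {}'.format(start, end))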
def main():
    # Debug helpers
    logging.basicConfig()
    logging.root.setLevel(args.loglevel if args.loglevel else 20)

    def progress(it=None, desc='Processing', total=None):
        logging.info(desc)
        return it if args.no_progress else log_progress(
            it, interval=args.progress_interval, total=total)

    def resolve(base_path, spec_path):
        if spec_path is None:
            return None
        if not path.isabs(spec_path):
            spec_path = path.join(base_path, spec_path)
        return spec_path

    def exists(file_path):
        if file_path is None:
            return False
        return os.path.isfile(file_path)

    to_prepare = []

    def enqueue_or_fail(audio, tlog, script, aligned, prefix=''):
        if exists(aligned) and not args.force:
            fail(prefix +
                 'Alignment file "{}" already exists - use --force to overwrite'
                 .format(aligned))
        if tlog is None:
            if args.ignore_missing:
                return
            fail(prefix + 'Missing transcription log path')
        if not exists(audio) and not exists(tlog):
            if args.ignore_missing:
                return
            fail(prefix +
                 'Both audio file "{}" and transcription log "{}" are missing'
                 .format(audio, tlog))
        if not exists(script):
            if args.ignore_missing:
                return
            fail(prefix + 'Missing script "{}"'.format(script))
        to_prepare.append((audio, tlog, script, aligned))

    if (args.audio or args.tlog) and args.script and args.aligned and not args.catalog:
        enqueue_or_fail(args.audio, args.tlog, args.script, args.aligned)
    elif args.catalog:
        if not exists(args.catalog):
            fail('Unable to load catalog file "{}"'.format(args.catalog))
        catalog = path.abspath(args.catalog)
        catalog_dir = path.dirname(catalog)
        with open(catalog, 'r', encoding='utf-8') as catalog_file:
            catalog_entries = json.load(catalog_file)
        for entry in progress(catalog_entries, desc='Reading catalog'):
            enqueue_or_fail(
                resolve(catalog_dir, entry['audio']),
                resolve(catalog_dir, entry['tlog']),
                resolve(catalog_dir, entry['script']),
                resolve(catalog_dir, entry['aligned']),
                prefix='Problem loading catalog "{}" - '.format(catalog))
    else:
        fail('You have to either specify a combination of '
             '"--audio/--tlog,--script,--aligned" or "--catalog"')

    logging.debug('Start')

    to_align = []
    output_graph_path = None
    for audio_path, tlog_path, script_path, aligned_path in to_prepare:
        if not exists(tlog_path):
            generated_scorer = False
            if output_graph_path is None:
                logging.debug('Looking for model files in "{}"...'.format(model_dir))
                output_graph_path = glob(model_dir + "/*.pbmm")[0]
                lang_scorer_path = glob(model_dir + "/*.scorer")[0]
            kenlm_path = 'dependencies/kenlm/build/bin'
            if not path.exists(kenlm_path):
                kenlm_path = None
            deepspeech_path = 'dependencies/deepspeech'
            if not path.exists(deepspeech_path):
                deepspeech_path = None
            if kenlm_path and deepspeech_path and not args.stt_no_own_lm:
                tc = read_script(script_path)
                if not tc.clean_text.strip():
                    logging.error('Cleaned transcript is empty for {}'.format(
                        path.basename(script_path)))
                    continue
                clean_text_path = script_path + '.clean'
                with open(clean_text_path, 'w', encoding='utf-8') as clean_text_file:
                    clean_text_file.write(tc.clean_text)
                scorer_path = script_path + '.scorer'
                if not path.exists(scorer_path):
                    # Generate LM
                    data_lower, vocab_str = convert_and_filter_topk(
                        scorer_path, clean_text_path, 500000)
                    build_lm(scorer_path, kenlm_path, 5, '85%', '0|0|1', True,
                             255, 8, 'trie', data_lower, vocab_str)
                    os.remove(scorer_path + '.' + 'lower.txt.gz')
                    os.remove(scorer_path + '.' + 'lm.arpa')
                    os.remove(scorer_path + '.' + 'lm_filtered.arpa')
                    os.remove(clean_text_path)
                    # Generate scorer
                    create_bundle(alphabet_path,
                                  scorer_path + '.' + 'lm.binary',
                                  scorer_path + '.' + 'vocab-500000.txt',
                                  scorer_path, False, 0.931289039105002,
                                  1.1834137581510284)
                    os.remove(scorer_path + '.' + 'lm.binary')
                    os.remove(scorer_path + '.' + 'vocab-500000.txt')
                generated_scorer = True
            else:
                scorer_path = lang_scorer_path

            logging.debug(
                'Loading acoustic model from "{}", alphabet from "{}" and scorer from "{}"...'
                .format(output_graph_path, alphabet_path, scorer_path))

            # Run VAD on the input file
            logging.debug('Transcribing VAD segments...')
            frames = read_frames_from_file(audio_path, model_format,
                                           args.audio_vad_frame_length)
            segments = vad_split(frames, model_format,
                                 num_padding_frames=args.audio_vad_padding,
                                 threshold=args.audio_vad_threshold,
                                 aggressiveness=args.audio_vad_aggressiveness)

            def pre_filter():
                for i, segment in enumerate(segments):
                    segment_buffer, time_start, time_end = segment
                    time_length = time_end - time_start
                    if args.stt_min_duration and time_length < args.stt_min_duration:
                        logging.info('Fragment {}: Audio too short for STT'.format(i))
                        continue
                    if args.stt_max_duration and time_length > args.stt_max_duration:
                        logging.info('Fragment {}: Audio too long for STT'.format(i))
                        continue
                    yield (time_start, time_end,
                           np.frombuffer(segment_buffer, dtype=np.int16))

            samples = list(progress(pre_filter(), desc='VAD splitting'))
            pool = multiprocessing.Pool(initializer=init_stt,
                                        initargs=(output_graph_path, scorer_path),
                                        processes=args.stt_workers)
            transcripts = list(progress(pool.imap(stt, samples),
                                        desc='Transcribing', total=len(samples)))

            fragments = []
            for time_start, time_end, segment_transcript in transcripts:
                if segment_transcript is None:
                    continue
                fragments.append({
                    'start': time_start,
                    'end': time_end,
                    'transcript': segment_transcript
                })
            logging.debug('Excluded {} empty transcripts'.format(
                len(transcripts) - len(fragments)))

            logging.debug('Writing transcription log to file "{}"...'.format(tlog_path))
            with open(tlog_path, 'w', encoding='utf-8') as tlog_file:
                tlog_file.write(json.dumps(fragments,
                                           indent=4 if args.output_pretty else None,
                                           ensure_ascii=False))
            # Remove scorer if generated
            if generated_scorer:
                os.remove(scorer_path)
        if not path.isfile(tlog_path):
            fail('Problem loading transcript from "{}"'.format(tlog_path))
        to_align.append((tlog_path, script_path, aligned_path))

    total_fragments = 0
    dropped_fragments = 0
    reasons = Counter()

    index = 0
    pool = multiprocessing.Pool(processes=args.align_workers)
    for aligned_file, file_total_fragments, file_dropped_fragments, file_reasons in \
            progress(pool.imap_unordered(align, to_align),
                     desc='Aligning', total=len(to_align)):
        if args.no_progress:
            index += 1
            logging.info('Aligned file {} of {} - wrote results to "{}"'.format(
                index, len(to_align), aligned_file))
        total_fragments += file_total_fragments
        dropped_fragments += file_dropped_fragments
        reasons += file_reasons

    logging.info('Aligned {} fragments'.format(total_fragments))
    if total_fragments > 0 and dropped_fragments > 0:
        logging.info('Dropped {} fragments {:0.2f}%:'.format(
            dropped_fragments, dropped_fragments * 100.0 / total_fragments))
        for key, number in reasons.most_common():
            logging.info(' - {}: {}'.format(key, number))
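
# A sketch of the `init_stt`/`stt` worker pair that the transcription pool in
# main() expects, assuming the `deepspeech` Python package. This is an
# illustration, not the project's actual implementation (which may, e.g.,
# return candidate transcripts instead of a single string).
def init_stt_example(output_graph_path, scorer_path):
    import deepspeech  # assumed dependency
    global _stt_model
    # Each pool worker loads the acoustic model and scorer exactly once.
    _stt_model = deepspeech.Model(output_graph_path)
    _stt_model.enableExternalScorer(scorer_path)


def stt_example(sample):
    # `sample` is a (time_start, time_end, int16 audio buffer) tuple as
    # produced by pre_filter() in main().
    time_start, time_end, audio = sample
    transcript = _stt_model.stt(audio)
    return time_start, time_end, transcript.strip() or None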
def main():
    # Debug helpers
    logging.basicConfig()
    logging.root.setLevel(args.loglevel if args.loglevel else 20)

    def progress(it=None, desc="Processing", total=None):
        logging.info(desc)
        return (it if args.no_progress else log_progress(
            it, interval=args.progress_interval, total=total))

    def resolve(base_path, spec_path):
        if spec_path is None:
            return None
        if not path.isabs(spec_path):
            spec_path = path.join(base_path, spec_path)
        return spec_path

    def exists(file_path):
        if file_path is None:
            return False
        return os.path.isfile(file_path)

    to_prepare = []

    def enqueue_or_fail(audio, tlog, script, aligned, prefix=""):
        if exists(aligned) and not args.force:
            fail(prefix +
                 'Alignment file "{}" already exists - use --force to overwrite'
                 .format(aligned))
        if tlog is None:
            if args.ignore_missing:
                return
            fail(prefix + "Missing transcription log path")
        if not exists(audio) and not exists(tlog):
            if args.ignore_missing:
                return
            fail(prefix +
                 'Both audio file "{}" and transcription log "{}" are missing'
                 .format(audio, tlog))
        if not exists(script):
            if args.ignore_missing:
                return
            fail(prefix + 'Missing script "{}"'.format(script))
        to_prepare.append((audio, tlog, script, aligned))

    if (args.audio or args.tlog) and args.script and args.aligned and not args.catalog:
        enqueue_or_fail(args.audio, args.tlog, args.script, args.aligned)
    elif args.catalog:
        if not exists(args.catalog):
            fail('Unable to load catalog file "{}"'.format(args.catalog))
        catalog = path.abspath(args.catalog)
        catalog_dir = path.dirname(catalog)
        with open(catalog, "r", encoding="utf-8") as catalog_file:
            catalog_entries = json.load(catalog_file)
        for entry in progress(catalog_entries, desc="Reading catalog"):
            enqueue_or_fail(
                resolve(catalog_dir, entry["audio"]),
                resolve(catalog_dir, entry["tlog"]),
                resolve(catalog_dir, entry["script"]),
                resolve(catalog_dir, entry["aligned"]),
                prefix='Problem loading catalog "{}" - '.format(catalog),
            )
    else:
        fail('You have to either specify a combination of '
             '"--audio/--tlog,--script,--aligned" or "--catalog"')

    logging.debug("Start")

    to_align = []
    output_graph_path = None
    for audio_path, tlog_path, script_path, aligned_path in to_prepare:
        if not exists(tlog_path):  # or args.force:
            generated_scorer = False
            if output_graph_path is None:
                logging.debug('Looking for model files in "{}"...'.format(model_dir))
                output_graph_path = glob(model_dir + "/*.pbmm")[0]
                lang_scorer_path = glob(model_dir + "/*.scorer")[0]
            kenlm_path = "/install/kenlm/build/bin"
            deepspeech_path = "third_party/DeepSpeech"
            if args.per_document_lm:
                assert path.exists(kenlm_path)
                assert path.exists(deepspeech_path)
                # The cleaned transcript has to exist before the per-document
                # LM is built from it.
                tc = read_script(script_path)
                if not tc.clean_text.strip():
                    logging.error("Cleaned transcript is empty for {}".format(
                        path.basename(script_path)))
                    continue
                clean_text_path = script_path + ".clean"
                with open(clean_text_path, "w", encoding="utf-8") as clean_text_file:
                    clean_text_file.write(tc.clean_text)
                scorer_path = script_path + ".scorer"
                if not path.exists(scorer_path):
                    data_lower, vocab_str = convert_and_filter_topk(
                        scorer_path, clean_text_path, 500000)
                    build_lm(scorer_path, kenlm_path, 5, "85%", "0|0|1", True,
                             255, 8, "trie", data_lower, vocab_str)
                    os.remove(scorer_path + "." + "lower.txt.gz")
                    os.remove(scorer_path + "." + "lm.arpa")
                    os.remove(scorer_path + "." + "lm_filtered.arpa")
                    os.remove(clean_text_path)
                    create_bundle(alphabet_path,
                                  scorer_path + "." + "lm.binary",
                                  scorer_path + "." + "vocab-500000.txt",
                                  scorer_path, False, 0.931289039105002,
                                  1.1834137581510284)
                    os.remove(scorer_path + "." + "lm.binary")
                    os.remove(scorer_path + "." + "vocab-500000.txt")
                generated_scorer = True
            else:
                scorer_path = lang_scorer_path

            logging.debug(
                'Loading acoustic model from "{}", alphabet from "{}" and scorer from "{}"...'
                .format(output_graph_path, alphabet_path, scorer_path))

            # Run VAD on the input file
            logging.debug("Transcribing VAD segments...")
            frames = read_frames_from_file(audio_path, model_format,
                                           args.audio_vad_frame_length)
            frames = list(frames)
            # Debug dump of the voiced frame buffers.
            with open("dsalign_voiced_buffers.npy", "wb") as fh:
                np.save(fh, frames)
            segments = vad_split(frames, model_format,
                                 num_padding_frames=args.audio_vad_padding,
                                 threshold=args.audio_vad_threshold,
                                 aggressiveness=args.audio_vad_aggressiveness)

            def pre_filter():
                for i, segment in enumerate(segments):
                    segment_buffer, time_start, time_end = segment
                    time_length = time_end - time_start
                    if args.stt_min_duration and time_length < args.stt_min_duration:
                        logging.info("Fragment {}: Audio too short for STT".format(i))
                        continue
                    if args.stt_max_duration and time_length > args.stt_max_duration:
                        logging.info("Fragment {}: Audio too long for STT".format(i))
                        continue
                    yield (time_start, time_end,
                           np.frombuffer(segment_buffer, dtype=np.int16))

            samples = list(progress(pre_filter(), desc="VAD splitting"))

            # This pool only parallelizes over the chunks within a single
            # document, which leaves little parallelism if we were to use a
            # GPU or TPU. It should be replaced with a queue of some sort.
            pool = multiprocessing.Pool(initializer=init_stt,
                                        initargs=(output_graph_path, scorer_path),
                                        processes=args.stt_workers)
            transcripts = list(progress(pool.imap(stt, samples),
                                        desc="Transcribing", total=len(samples)))

            fragments = []
            for time_start, time_end, segment_transcript in transcripts:
                if segment_transcript is None:
                    continue
                fragments.append({
                    "start": time_start,
                    "end": time_end,
                    "transcript": segment_transcript,
                })
            logging.debug("Excluded {} empty transcripts".format(
                len(transcripts) - len(fragments)))

            logging.debug('Writing transcription log to file "{}"...'.format(tlog_path))
            with open(tlog_path, "w", encoding="utf-8") as tlog_file:
                tlog_file.write(json.dumps(fragments,
                                           indent=4 if args.output_pretty else None,
                                           ensure_ascii=False))
            # Remove scorer if generated
            if generated_scorer:
                os.remove(scorer_path)
        if not path.isfile(tlog_path):
            fail('Problem loading transcript from "{}"'.format(tlog_path))
        to_align.append((tlog_path, script_path, aligned_path))

    total_fragments = 0
    dropped_fragments = 0
    reasons = Counter()

    index = 0
    pool = multiprocessing.Pool(processes=args.align_workers)
    for (aligned_file, file_total_fragments, file_dropped_fragments,
         file_reasons) in progress(pool.imap_unordered(align, to_align),
                                   desc="Aligning", total=len(to_align)):
        if args.no_progress:
            index += 1
            logging.info('Aligned file {} of {} - wrote results to "{}"'.format(
                index, len(to_align), aligned_file))
        total_fragments += file_total_fragments
        dropped_fragments += file_dropped_fragments
        reasons += file_reasons

    logging.info("Aligned {} fragments".format(total_fragments))
    if total_fragments > 0 and dropped_fragments > 0:
        logging.info("Dropped {} fragments {:0.2f}%:".format(
            dropped_fragments, dropped_fragments * 100.0 / total_fragments))
        for key, number in reasons.most_common():
            logging.info(" - {}: {}".format(key, number))
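
# A sketch for loading the "dsalign_voiced_buffers.npy" debug dump written by
# main() above. Depending on how NumPy coerced the list of frame buffers, the
# file may contain an object array, so allow_pickle=True is used defensively.
def example_load_voiced_buffers(dump_path="dsalign_voiced_buffers.npy"):
    with open(dump_path, "rb") as fh:
        voiced_frames = np.load(fh, allow_pickle=True)
    print("{} voiced frame buffers".format(len(voiced_frames)))
    return voiced_frames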
def main(audio_chunks_path, transcript_lst_path):
    # Debug helpers
    logging.basicConfig()
    logging.root.setLevel(args.loglevel if args.loglevel else 20)

    def progress(it=None, desc='Processing', total=None):
        logging.info(desc)
        return it if args.no_progress else log_progress(
            it, interval=args.progress_interval, total=total)

    def resolve(base_path, spec_path):
        if spec_path is None:
            return None
        if not path.isabs(spec_path):
            spec_path = path.join(base_path, spec_path)
        return spec_path

    def exists(file_path):
        if file_path is None:
            return False
        return os.path.isfile(file_path)

    to_prepare = []

    def enqueue_or_fail(audio, tlog, script, aligned, prefix=''):
        if exists(aligned) and not args.force:
            fail(prefix +
                 'Alignment file "{}" already exists - use --force to overwrite'
                 .format(aligned))
        if tlog is None:
            if args.ignore_missing:
                return
            fail(prefix + 'Missing transcription log path')
        if not exists(audio) and not exists(tlog):
            if args.ignore_missing:
                return
            fail(prefix +
                 'Both audio file "{}" and transcription log "{}" are missing'
                 .format(audio, tlog))
        if not exists(script):
            if args.ignore_missing:
                return
            fail(prefix + 'Missing script "{}"'.format(script))
        to_prepare.append((audio, tlog, script, aligned))

    if (args.audio or args.tlog) and args.script and args.aligned and not args.catalog:
        enqueue_or_fail(args.audio, args.tlog, args.script, args.aligned)
    elif args.catalog:
        if not exists(args.catalog):
            fail('Unable to load catalog file "{}"'.format(args.catalog))
        catalog = path.abspath(args.catalog)
        catalog_dir = path.dirname(catalog)
        with open(catalog, 'r', encoding='utf-8') as catalog_file:
            catalog_entries = json.load(catalog_file)
        for entry in progress(catalog_entries, desc='Reading catalog'):
            enqueue_or_fail(
                resolve(catalog_dir, entry['audio']),
                resolve(catalog_dir, entry['tlog']),
                resolve(catalog_dir, entry['script']),
                resolve(catalog_dir, entry['aligned']),
                prefix='Problem loading catalog "{}" - '.format(catalog))
    else:
        fail('You have to either specify a combination of '
             '"--audio/--tlog,--script,--aligned" or "--catalog"')

    logging.debug('Start')

    to_align = []
    for audio_path, tlog_path, script_path, aligned_path in to_prepare:
        if not exists(tlog_path):
            if not args.stt_no_own_lm:
                tc = read_script(script_path)
                if not tc.clean_text.strip():
                    logging.error('Cleaned transcript is empty for {}'.format(
                        path.basename(script_path)))
                    continue
                clean_text_path = script_path + '.clean'
                with open(clean_text_path, 'w', encoding='utf-8') as clean_text_file:
                    clean_text_file.write(tc.clean_text)

            # Run VAD on the input file
            logging.debug('Transcribing VAD segments...')
            frames = read_frames_from_file(audio_path, model_format,
                                           args.audio_vad_frame_length)
            segments = vad_split(frames, model_format,
                                 num_padding_frames=args.audio_vad_padding,
                                 threshold=args.audio_vad_threshold,
                                 aggressiveness=args.audio_vad_aggressiveness)

            def pre_filter():
                for i, segment in enumerate(segments):
                    segment_buffer, time_start, time_end = segment
                    time_length = time_end - time_start
                    if args.stt_min_duration and time_length < args.stt_min_duration:
                        logging.info('Fragment {}: Audio too short for STT'.format(i))
                        continue
                    if args.stt_max_duration and time_length > args.stt_max_duration:
                        logging.info('Fragment {}: Audio too long for STT'.format(i))
                        continue
                    yield (time_start, time_end,
                           np.frombuffer(segment_buffer, dtype=np.int16))

            samples = list(progress(pre_filter(), desc='VAD splitting'))

            # Write each voiced chunk as a WAV file and build the list file
            # the wav2letter decoder expects. The duration field is a random
            # placeholder and 'Welcome to Glib' a dummy reference transcript.
            # `wf` is assumed to be scipy.io.wavfile.
            with open(transcript_lst_path + '/transcript.lst', 'w') as f:
                for cnt, (time_start, time_end, audio) in enumerate(samples, start=1):
                    wf.write(audio_chunks_path + '/test' + str(cnt) + '.wav',
                             16000, audio)
                    f.write('{}{} /root/wav2letter/temp_audio/test{}.wav {} '
                            'Welcome to Glib'.format(
                                '' if cnt == 1 else '\n', cnt, cnt,
                                np.random.randint(500, 1000, 1)[0]))

            decoder_path = './root/wav2letter/build/Decoder'
            cfg_path = ('/root/wav2letter/recipes/models/streaming_convnets/'
                        'librispeech/decode_500ms_right_future_ngram_other.cfg')
            # Start the wav2letter container (if needed), copy the list file
            # and audio chunks in, run the decoder and copy the hypotheses out.
            os.system(
                '[ ! "$(docker ps -a | grep mycontainer)" ] && '
                'docker run -d --name mycontainer -i wav2letter-cpu-1')
            os.system(
                'docker start mycontainer && docker cp ' + transcript_lst_path +
                '/transcript.lst mycontainer:/root/wav2letter/lists/transcript.lst'
                ' && docker cp -a ' + audio_chunks_path +
                ' mycontainer:/root/wav2letter && '
                "docker exec -ti mycontainer /bin/bash -c 'export LD_LIBRARY_PATH="
                '/opt/intel/compilers_and_libraries_2018.5.274/linux/mkl/lib/intel64_lin'
                ' \n ' + decoder_path + ' --flagsfile=' + cfg_path +
                " \n rm -r /root/wav2letter/temp_audio \n exit' && "
                'docker cp mycontainer:/root/wav2letter/lists/transcript.lst.hyp ' +
                transcript_lst_path + '/transcript.lst.hyp && docker stop mycontainer')

            # Collect the decoder hypotheses (in reverse line order, dropping
            # each line's last three characters).
            decoder_trans = []
            with open(transcript_lst_path + '/transcript.lst.hyp', 'r') as f:
                for line in f:
                    decoder_trans.insert(0, line[:-3])

            transcripts = [(time_start, time_end, transcript)
                           for (time_start, time_end, _), transcript
                           in zip(samples, decoder_trans)]

            fragments = []
            for time_start, time_end, segment_transcript in transcripts:
                if segment_transcript is None:
                    continue
                fragments.append({
                    'start': time_start,
                    'end': time_end,
                    'transcript': segment_transcript
                })
            logging.debug('Excluded {} empty transcripts'.format(
                len(transcripts) - len(fragments)))

            logging.debug('Writing transcription log to file "{}"...'.format(tlog_path))
            with open(tlog_path, 'w', encoding='utf-8') as tlog_file:
                tlog_file.write(json.dumps(fragments,
                                           indent=4 if args.output_pretty else None,
                                           ensure_ascii=False))
        if not path.isfile(tlog_path):
            fail('Problem loading transcript from "{}"'.format(tlog_path))
        to_align.append((tlog_path, script_path, aligned_path))

    total_fragments = 0
    dropped_fragments = 0
    reasons = Counter()

    index = 0
    pool = multiprocessing.Pool(processes=args.align_workers)
    for aligned_file, file_total_fragments, file_dropped_fragments, file_reasons in \
            progress(pool.imap_unordered(align, to_align),
                     desc='Aligning', total=len(to_align)):
        if args.no_progress:
            index += 1
            logging.info('Aligned file {} of {} - wrote results to "{}"'.format(
                index, len(to_align), aligned_file))
        total_fragments += file_total_fragments
        dropped_fragments += file_dropped_fragments
        reasons += file_reasons

    logging.info('Aligned {} fragments'.format(total_fragments))
    if total_fragments > 0 and dropped_fragments > 0:
        logging.info('Dropped {} fragments {:0.2f}%:'.format(
            dropped_fragments, dropped_fragments * 100.0 / total_fragments))
        for key, number in reasons.most_common():
            logging.info(' - {}: {}'.format(key, number))

    logging.info('Cleaning up DSAlign temporary files')
    os.system('rm ' + audio_chunks_path + '/* && rm ' + transcript_lst_path + '/*')
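
# For reference, each line written to transcript.lst above has four
# space-separated fields in wav2letter's list-file layout:
#
#   <sample id> <audio path> <duration> <reference transcript>
#   1 /root/wav2letter/temp_audio/test1.wav 734 Welcome to Glib
#
# The duration is a random placeholder and 'Welcome to Glib' a dummy
# reference; only the decoder hypotheses in transcript.lst.hyp are consumed.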