def decode_ctc(words_path,
               graph_path,
               out_folder,
               featstrings,
               min_active=20,
               max_active=700,
               max_mem=500000,
               beam=5.0,
               latbeam=5.0,
               acwt=1.0,
               **kwargs):
    out_folder = f"{out_folder}/exp_files"
    acwt = 1.0
    assert graph_path.endswith("TLG.fst")
    assert words_path.endswith("words.txt")
    assert isinstance(featstrings, list)
    assert os.environ['EESEN_ROOT']
    latgen_faster_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/latgen-faster"

    chnk_id = 0
    for ck_data in tqdm(featstrings, desc="lattice generation chunk:"):
        finalfeats = f"ark,s,cs: "
        # Decode for each of the acoustic scales
        run_shell(
            f"{latgen_faster_bin} "
            + f"--max-active={max_active} "
            + f"--max-mem={max_mem} "
            + f"--beam={beam} "
            + f"--lattice-beam={latbeam} "
            + f"--acoustic-scale={acwt} "
            + f"--allow-partial=true "
            + f"--word-symbol-table={words_path} "
            + f"{graph_path} "
            + f"ark:{ck_data} \"ark:|gzip -c > {out_folder}/lat.{chnk_id}.gz\"")
        # advance the chunk index so each chunk writes its own lattice archive
        # instead of overwriting lat.0.gz
        chnk_id += 1

    transcripts_best, transcripts, lattice_confidence, lm_posterior, acoustic_posterior = \
        get_transcripts(words_path, out_folder)

    for t in transcripts_best:
        # if transcripts_best[t] != ['THREE'] and transcripts[t] != ['THREE']:
        # might give a different path for e.g. THREE and TREE
        assert transcripts_best[t] == transcripts[t], \
            f"{t}: {transcripts_best[t]} =!= {transcripts[t]}"

    assert len(transcripts) == len(lattice_confidence)
    transcripts = dict(transcripts)
    lattice_confidence = dict(lattice_confidence)
    lm_posterior = dict(lm_posterior)
    acoustic_posterior = dict(acoustic_posterior)

    result = {}
    for sample_id in transcripts:
        _lattice_confidence = lattice_confidence[sample_id] \
            if lattice_confidence[sample_id] != 10000000000.0 else float("inf")
        _lm_posterior = lm_posterior[sample_id]  # TODO normalize
        _acoustic_posterior = acoustic_posterior[sample_id]  # TODO normalize
        result[sample_id] = (transcripts[sample_id], _lattice_confidence,
                             _lm_posterior, _acoustic_posterior)
    return result
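
# Hedged usage sketch (not part of the original pipeline): how the dict returned by
# decode_ctc above might be consumed. The confidence threshold and the assumption that
# each transcript is a list of word strings are illustrative only.
def _example_consume_decode_ctc_result(result, confidence_threshold=2.0):
    """result maps sample_id -> (transcript, lattice_confidence, lm_post, ac_post)."""
    hits = {}
    for sample_id, (transcript, confidence, _lm_post, _ac_post) in result.items():
        # float("inf") means the lattice had only a single path (see lattice-confidence)
        if confidence >= confidence_threshold:
            hits[sample_id] = " ".join(transcript)
    return hits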
def score(data, lang_or_graph, dir, num_jobs, min_lmwt=1, max_lmwt=10):
    decoding_scripts_folder = os.path.join(
        os.getcwd(), __name__.split(".")[0])  # 'kaldi_decoding_scripts'
    # end configuration section.
    phonemap = "conf/phones.60-48-39.map"
    nj = num_jobs

    symtab = os.path.join(lang_or_graph, "words.txt")
    assert os.path.exists(symtab)
    assert os.path.exists(os.path.join(dir, "lat.1.gz"))
    assert os.path.exists(os.path.join(data, "text"))

    timit_norm_trans_script = os.path.join(decoding_scripts_folder,
                                           "local/timit_norm_trans.pl")
    assert os.path.exists(timit_norm_trans_script)
    assert os.access(timit_norm_trans_script, os.X_OK)
    int2sym_script = os.path.join(decoding_scripts_folder, "utils/int2sym.pl")
    assert os.path.exists(int2sym_script)
    assert os.access(int2sym_script, os.X_OK)
    phonemap = os.path.join(decoding_scripts_folder, phonemap)
    assert os.path.exists(phonemap)
    pl_cmd_script = os.path.join(decoding_scripts_folder, "utils/run.pl")
    assert os.path.exists(pl_cmd_script)
    assert os.access(pl_cmd_script, os.X_OK)

    os.makedirs(os.path.join(dir, "scoring", "log"))

    # Map reference to 39 phone classes:
    cmd = f"cat {data}/text | {timit_norm_trans_script} -i - -m {phonemap} -from 48 -to 39 > {dir}/scoring/test_filt.txt"
    run_shell(cmd)

    # Get the phone-sequence on the best-path.
    # run.pl's LMWT range below is inclusive, so generate .tra files up to and
    # including max_lmwt.
    for LMWT in range(min_lmwt, max_lmwt + 1):
        cmd = f"{pl_cmd_script} JOB=1:{nj} {dir}/scoring/log/best_path_basic.{LMWT}.JOB.log " + \
              f"lattice-best-path --lm-scale={LMWT} --word-symbol-table={symtab} --verbose=2 " + \
              f"\"ark:gunzip -c {dir}/lat.JOB.gz|\" ark,t:{dir}/scoring/{LMWT}.JOB.tra || exit 1;"
        run_shell(cmd)
        run_shell(
            f"cat {dir}/scoring/{LMWT}.*.tra | sort > {dir}/scoring/{LMWT}.tra")
        run_shell(f"rm {dir}/scoring/{LMWT}.*.tra")

    # Map hypothesis to 39 phone classes:
    cmd = f"{pl_cmd_script} LMWT={min_lmwt}:{max_lmwt} {dir}/scoring/log/score_basic.LMWT.log " + \
          f"cat {dir}/scoring/LMWT.tra \| " + \
          f"{int2sym_script} -f 2- {symtab} \| " + \
          f"{timit_norm_trans_script} -i - -m {phonemap} -from 48 -to 39 \| " + \
          f"compute-wer --text --mode=all ark:{dir}/scoring/test_filt.txt ark,p:- \">&\" {dir}/wer_LMWT || exit 1;"
    run_shell(cmd)
def build_1_gram_fst(arpa_lm_path, graph_dir):
    one_gram_arpa = []
    with gzip.open(arpa_lm_path, 'rb') as f:
        for line in f:
            line = line.decode("utf-8")
            if "\\2-grams:" in line:
                break
            elif not ("ngram 2=" in line or "ngram 3=" in line
                      or "ngram 4=" in line or "ngram 5=" in line
                      or "ngram 6=" in line):
                if line.startswith("-"):
                    # drop the backoff weight and flatten the probability for 1-grams
                    fake_prob = "-1.0"
                    word = line.split(maxsplit=2)[1]
                    _line = f"{fake_prob}\t{word}\n"
                else:
                    _line = line
                one_gram_arpa.append(_line)
            else:
                pass
    one_gram_arpa.append("\\end\\\n")

    # Note: despite the name, this file is the pruned (unigram-only) ARPA LM,
    # not a lexicon.
    pruned_lexicon_path = f"{graph_dir}/pruned_lexicon.txt"
    with open(pruned_lexicon_path, 'w') as f:
        f.writelines(one_gram_arpa)

    eesen_utils_path = os.path.join(os.getcwd(), "kws_decoder", "eesen_utils")
    assert os.path.exists(f"{eesen_utils_path}/s2eps.pl")
    assert os.access(f"{eesen_utils_path}/s2eps.pl", os.X_OK)
    assert os.path.exists(f"{eesen_utils_path}/eps2disambig.pl")
    assert os.access(f"{eesen_utils_path}/eps2disambig.pl", os.X_OK)

    run_shell(
        f"arpa2fst {pruned_lexicon_path} | fstprint | "
        # + f"utils/remove_oovs.pl {tmpdir}/oovs_{lm_suffix}.txt | "
        + f"{eesen_utils_path}/eps2disambig.pl | {eesen_utils_path}/s2eps.pl | fstcompile --isymbols={graph_dir}/words.txt "
        + f"--osymbols={graph_dir}/words.txt --keep_isymbols=false --keep_osymbols=false | "
        + f"fstrmepsilon | fstarcsort --sort_type=ilabel > {graph_dir}/G.fst")
    return os.path.abspath(f"{graph_dir}/G.fst")
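
# Illustration only (not used by the pipeline): what the unigram extraction in
# build_1_gram_fst does to a tiny, made-up ARPA fragment. Each 1-gram line
# "logprob<TAB>word<TAB>backoff" becomes "-1.0<TAB>word", the "ngram 2=" header and
# everything from "\2-grams:" onwards is discarded, and "\end\" is re-appended.
def _example_strip_unigram_backoff():
    arpa_lines = [
        "\\data\\\n",
        "ngram 1=3\n",
        "ngram 2=2\n",
        "\n",
        "\\1-grams:\n",
        "-1.2041\t<s>\t-0.30\n",
        "-0.9031\tHELLO\t-0.25\n",
        "-1.5051\tWORLD\n",
        "\n",
        "\\2-grams:\n",
        "-0.3010\t<s> HELLO\n",
        "\\end\\\n",
    ]
    out = []
    for line in arpa_lines:
        if "\\2-grams:" in line:
            break
        if "ngram 2=" in line:
            continue
        if line.startswith("-"):
            word = line.split(maxsplit=2)[1]
            out.append(f"-1.0\t{word}\n")
        else:
            out.append(line)
    out.append("\\end\\\n")
    return out  # a 1-gram-only ARPA header plus flattened unigram entries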
def build_grammar_fst(arpa_lm_path, graph_dir):
    eesen_utils_path = os.path.join(os.getcwd(), "kws_decoder", "eesen_utils")
    assert os.path.exists(f"{eesen_utils_path}/s2eps.pl")
    assert os.access(f"{eesen_utils_path}/s2eps.pl", os.X_OK)
    assert os.path.exists(f"{eesen_utils_path}/eps2disambig.pl")
    assert os.access(f"{eesen_utils_path}/eps2disambig.pl", os.X_OK)

    run_shell(
        f"gunzip -c {arpa_lm_path} | "
        + f"grep -v '<s> <s>' | "
        + f"grep -v '</s> <s>' | "
        + f"grep -v '</s> </s>' | "
        + f"arpa2fst - | fstprint | "
        # + f"utils/remove_oovs.pl {tmpdir}/oovs_{lm_suffix}.txt | "
        + f"{eesen_utils_path}/eps2disambig.pl | {eesen_utils_path}/s2eps.pl | fstcompile --isymbols={graph_dir}/words.txt "
        + f"--osymbols={graph_dir}/words.txt --keep_isymbols=false --keep_osymbols=false | "
        + f"fstrmepsilon | fstarcsort --sort_type=ilabel > {graph_dir}/G.fst")
    return os.path.abspath(f"{graph_dir}/G.fst")
def get_lab_count(label_opts, num_label, folder_lab_count):
    cmd = f'analyze-counts --print-args=False --verbose=0 --binary=false' \
          + f' --counts-dim={num_label} "ark:{label_opts}' \
          + f' {folder_lab_count}/final.mdl \\"ark:gunzip -c ' \
          + f'{folder_lab_count}/ali.*.gz |\\" ark:- |" -'
    # + f'{folder_lab_count}/ali.*.gz |\\" ark:- |" {count_file_path}'
    # count_file_path = "tmp.count"  # TODO save in file instead
    lab_count = run_shell(cmd)
    lab_count = lab_count.strip().strip('[]').strip()
    lab_count = [float(v) for v in lab_count.split()]
    return lab_count
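
# Hedged sketch: a common use of the label counts returned above is to turn them into
# log-priors and subtract them from the network's log-posteriors before decoding
# (pseudo-likelihoods in a hybrid setup). Whether this codebase applies the counts
# exactly this way is an assumption; the snippet only illustrates the arithmetic.
def _example_posteriors_to_pseudo_likelihoods(log_posteriors, lab_count):
    import numpy as np
    counts = np.asarray(lab_count, dtype=np.float64)
    counts = np.maximum(counts, 1e-10)  # guard against zero counts
    log_priors = np.log(counts / counts.sum())
    # log_posteriors has shape (num_frames, num_labels); broadcasting subtracts the
    # per-label prior from every frame
    return np.asarray(log_posteriors) - log_priors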
def decode_ctc(data,
               graphdir,
               out_folder,
               featstrings,
               min_active=20,
               max_active=5000,
               max_mem=500000,
               beam=17.0,
               latbeam=8.0,
               acwt=0.9,
               **kwargs):
    out_folder = f"{out_folder}/exp_files"
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    assert os.path.exists(f"{graphdir}/TLG.fst")
    assert os.path.exists(f"{graphdir}/words.txt")
    assert isinstance(featstrings, list)
    assert os.environ['EESEN_ROOT']
    latgen_faster_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/latgen-faster"

    chnk_id = 0
    for ck_data in tqdm(featstrings, desc="lattice generation chunk:"):
        finalfeats = f"ark,s,cs: "
        # Decode for each of the acoustic scales
        run_shell(
            f"{latgen_faster_bin} "
            + f"--max-active={max_active} "
            + f"--max-mem={max_mem} "
            + f"--beam={beam} "
            + f"--lattice-beam={latbeam} "
            + f"--acoustic-scale={acwt} "
            + f"--allow-partial=true "
            + f"--word-symbol-table={graphdir}/words.txt "
            + f"{graphdir}/TLG.fst "
            + f"ark:{ck_data} \"ark:|gzip -c > {out_folder}/lat.{chnk_id}.gz\"")
        # advance the chunk index so each chunk writes its own lattice archive
        chnk_id += 1

    score(data, f"{graphdir}/words.txt", out_folder)
def get_kaldi_feats(scp_file, out_dir, spk2utt, utt2spk):
    # Compute features
    fbank_config = "kaldi_decoding_scripts/conf/fbank.conf"
    compress = "true"
    name = "decoding"
    assert os.path.exists(fbank_config)

    out_scp = f"{out_dir}/raw_fbank_{name}.scp"
    out_ark = f"{out_dir}/raw_fbank_{name}.ark"
    run_shell(
        f"compute-fbank-feats --verbose=2 --config={fbank_config} scp,p:{scp_file} ark:- | "
        f"copy-feats --compress={compress} ark:- ark,scp:{out_ark},{out_scp}")

    # Compute normalization data
    cmvn_ark = f"{out_dir}/cmvn_{name}.ark"
    cmvn_scp = f"{out_dir}/cmvn_{name}.scp"
    run_shell(
        f"compute-cmvn-stats --spk2utt=ark:{spk2utt} scp:{out_scp} ark,scp:{cmvn_ark},{cmvn_scp}")

    # Load normalized features
    feature_opts = f"apply-cmvn --utt2spk=ark:{utt2spk} ark:{cmvn_ark} ark:- ark:- | add-deltas --delta-order=0 ark:- ark:- |"
    features_loaded = {
        k: m
        for k, m in kaldi_io.read_mat_ark(
            f'ark:copy-feats scp:{out_scp} ark:- | {feature_opts}')
    }
    return features_loaded
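
# Hedged usage sketch: get_kaldi_feats returns a dict mapping utterance id to a
# (num_frames, num_fbank_dims) numpy matrix (via kaldi_io.read_mat_ark). The file
# paths below are placeholders, not paths from this repository.
def _example_inspect_kaldi_feats():
    feats = get_kaldi_feats(
        scp_file="data/decoding/wav.scp",   # hypothetical
        out_dir="exp/decoding_feats",       # hypothetical
        spk2utt="data/decoding/spk2utt",    # hypothetical
        utt2spk="data/decoding/utt2spk")    # hypothetical
    for utt_id, mat in feats.items():
        print(utt_id, mat.shape)  # e.g. ('utt_0001', (412, 23))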
def build_kw_grammar_fst(keywords, words_file):
    if not isinstance(keywords[0], list):
        # expect each kw to be a list of words
        keywords = [kw.split(" ") for kw in keywords]

    with open(words_file, "r") as f:
        word_map = f.readlines()
    word_map = dict([line.strip().split(" ", 1) for line in word_map])
    assert EPS_SYM in word_map
    assert SIL_SYM in word_map
    assert UNK_SYM in word_map
    assert SPN_SYM in word_map

    keyword_fst_folder = "keyword_fsts"
    if os.path.isdir(keyword_fst_folder):
        shutil.rmtree(keyword_fst_folder)
    os.makedirs(keyword_fst_folder)

    write_fst([UNK_SYM], keyword_fst_folder, word_map)
    for kw in keywords:
        write_fst(kw, keyword_fst_folder, word_map)

    out_fst = f"{keyword_fst_folder}/UNION.fst"
    run_shell(f"fstcompile {keyword_fst_folder}/UNK.txt {out_fst}")
    for fst in glob(f"{keyword_fst_folder}/*.txt"):
        run_shell(f"fstcompile {fst} {fst[:-4]}.fst")
        run_shell(f"fstunion {fst[:-4]}.fst {out_fst} {out_fst}")

    run_shell(f"fstrmepsilon {out_fst} | fstarcsort --sort_type=ilabel - {out_fst}")

    # run_shell(f"fstprint --isymbols={words_file} --osymbols={words_file} {out_fst}")
    # run_shell(f"fstprint {out_fst}")
    # run_shell(
    #     f"fstdraw --isymbols={words_file} --osymbols={words_file} {out_fst} "
    #     + f"| dot -Tpdf -o {os.path.basename(out_fst)[:-4]}.pdf")

    return os.path.abspath(out_fst)
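
# The write_fst helper used above is not shown in this file. As a rough, hedged sketch,
# a linear keyword acceptor in OpenFst text format could be written like this: one arc
# per word with identical input/output labels, followed by a final-state line. Because
# fstcompile is invoked above without symbol tables, the labels must already be the
# integer ids from words.txt (word_map). The real helper may differ in detail.
def _example_write_linear_keyword_fst(keyword_words, out_dir, word_map):
    """keyword_words: e.g. ["TURN", "ON"]; word_map: word symbol -> integer id (as str)."""
    name = "_".join(keyword_words)
    lines = []
    for state, word in enumerate(keyword_words):
        # format: src  dst  ilabel  olabel   (acceptor, so ilabel == olabel)
        lines.append(f"{state} {state + 1} {word_map[word]} {word_map[word]}\n")
    lines.append(f"{len(keyword_words)}\n")  # mark the last state as final
    with open(os.path.join(out_dir, f"{name}.txt"), "w") as f:
        f.writelines(lines)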
def score(data,
          lang_or_graph,
          dir,
          num_jobs,
          stage=0,
          model=None,
          min_lmwt=1,
          max_lmwt=10,
          mbr_scale=1.0):
    decoding_scripts_folder = os.path.join(
        os.getcwd(), __name__.split(".")[0])  # 'kaldi_decoding_scripts'
    # end configuration section.

    if model is None:
        # assume model one level up from decoding dir.
        model = os.path.join(dir, "..", "final.mdl")

    KALDI_ROOT = os.environ['KALDI_ROOT']
    hubscr = f"{KALDI_ROOT}/tools/sctk/bin/hubscr.pl"
    assert os.path.exists(hubscr), "Cannot find scoring program hubscr"
    hubdir = os.path.dirname(hubscr)

    phonemap = "conf/phones.60-48-39.map"
    nj = num_jobs

    symtab = os.path.join(lang_or_graph, "words.txt")
    assert os.path.exists(symtab)
    assert os.path.exists(os.path.join(dir, "lat.1.gz"))
    assert os.path.exists(os.path.join(data, "text"))

    timit_norm_trans_script = os.path.join(decoding_scripts_folder,
                                           "local/timit_norm_trans.pl")
    assert os.path.exists(timit_norm_trans_script)
    assert os.access(timit_norm_trans_script, os.X_OK)
    int2sym_script = os.path.join(decoding_scripts_folder, "utils/int2sym.pl")
    assert os.path.exists(int2sym_script)
    assert os.access(int2sym_script, os.X_OK)
    phonemap = os.path.join(decoding_scripts_folder, phonemap)
    assert os.path.exists(phonemap)
    pl_cmd_script = os.path.join(decoding_scripts_folder, "utils/run.pl")
    assert os.path.exists(pl_cmd_script)  # TODO remove run.pl command
    assert os.access(pl_cmd_script, os.X_OK)

    os.makedirs(os.path.join(dir, "scoring", "log"))

    # Map reference to 39 phone classes, the silence is optional (.):
    cmd = f"{timit_norm_trans_script} -i {data}/stm -m {phonemap} -from 48 -to 39 | " \
          + f"sed 's: sil: (sil):g' > {dir}/scoring/stm_39phn"
    run_shell(cmd)
    copy(os.path.join(data, "glm"), os.path.join(dir, "scoring", "glm_39phn"))

    if stage <= 0:
        # Get the phone-sequence on the best-path.
        # run.pl's LMWT range in stage 1 is inclusive, so generate ctm files up to and
        # including max_lmwt.
        for LMWT in range(min_lmwt, max_lmwt + 1):
            acoustic_scale = 1 / LMWT * mbr_scale
            cmd = f"{pl_cmd_script} JOB=1:{nj} {dir}/scoring/log/best_path.{LMWT}.JOB.log " + \
                  f"lattice-align-phones {model} \"ark:gunzip -c {dir}/lat.JOB.gz|\" ark:- \| " + \
                  f"lattice-to-ctm-conf --acoustic-scale={acoustic_scale:.8f} --lm-scale={mbr_scale} ark:- {dir}/scoring/{LMWT}.JOB.ctm || exit 1;"
            run_shell(cmd)
            run_shell(f"cat {dir}/scoring/{LMWT}.*.ctm | sort > {dir}/scoring/{LMWT}.ctm")
            run_shell(f"rm {dir}/scoring/{LMWT}.*.ctm")

    if stage <= 1:
        # Map ctm to 39 phone classes:
        cmd = f"{pl_cmd_script} LMWT={min_lmwt}:{max_lmwt} {dir}/scoring/log/map_ctm.LMWT.log " + \
              f"mkdir {dir}/score_LMWT ';' " + \
              f"cat {dir}/scoring/LMWT.ctm \| " + \
              f"{int2sym_script} -f 5 {symtab} \| " + \
              f"{timit_norm_trans_script} -i - -m {phonemap} -from 48 -to 39 '>' " + \
              f"{dir}/scoring/LMWT.ctm_39phn || exit 1"
        run_shell(cmd)

        # Score the set...
        cmd = f"{pl_cmd_script} LMWT={min_lmwt}:{max_lmwt} {dir}/scoring/log/map_ctm.LMWT.log " + \
              f"cp {dir}/scoring/stm_39phn {dir}/score_LMWT/stm_39phn '&&' cp {dir}/scoring/LMWT.ctm_39phn {dir}/score_LMWT/ctm_39phn '&&' " + \
              f"{hubscr} -p {hubdir} -V -l english -h hub5 -g {dir}/scoring/glm_39phn -r {dir}/score_LMWT/stm_39phn {dir}/score_LMWT/ctm_39phn || exit 1;"
        run_shell(cmd)
def decode(
        alidir,
        data,
        graphdir,
        out_folder,
        featstrings,
        min_active=200,
        max_active=7000,
        max_mem=50000000,
        beam=13.0,
        latbeam=8.0,
        acwt=0.2,
        max_arcs=-1.0,
        scoring_type="std",  # none, std & basic so far
        scoring_opts=None,
        norm_vars=False,
        **kwargs):
    # TODO remove
    if scoring_opts == '"--min-lmwt 1 --max-lmwt 10"':
        scoring_opts = {"min_lmwt": 1, "max_lmwt": 10}
    if scoring_opts is None:
        scoring_opts = {"min_lmwt": 1, "max_lmwt": 10}

    assert isinstance(featstrings, list)
    num_threads = 1

    assert out_folder[-1] != '/'
    srcdir = os.path.dirname(out_folder)

    thread_string = "-parallel --num-threads={}".format(num_threads)

    if not os.path.isdir(os.path.join(out_folder, "log")):
        os.makedirs(os.path.join(out_folder, "log"))

    num_jobs = len(featstrings)
    with open(os.path.join(out_folder, "num_jobs"), "w") as f:
        f.write(str(num_jobs))

    assert os.path.exists(os.path.join(graphdir, "HCLG.fst"))

    JOB = 1
    for ck_data in featstrings:
        finalfeats = f"ark,s,cs: cat {ck_data} |"
        cmd = f'latgen-faster-mapped{thread_string} --min-active={min_active} ' + \
              f'--max-active={max_active} --max-mem={max_mem} ' + \
              f'--beam={beam} --lattice-beam={latbeam} ' + \
              f'--acoustic-scale={acwt} --allow-partial=true ' + \
              f'--word-symbol-table={graphdir}/words.txt {alidir}/final.mdl ' + \
              f'{graphdir}/HCLG.fst ' + \
              f'\"{finalfeats}\" \"ark:|gzip -c > {out_folder}/lat.{JOB}.gz\" &> {out_folder}/log/decode.{JOB}.log'
        run_shell(cmd)
        JOB += 1

    copy(os.path.join(alidir, "final.mdl"), srcdir)

    if scoring_type != "none":
        if scoring_type == "std":
            score(data, graphdir, out_folder, num_jobs, **scoring_opts)
        elif scoring_type == "basic":
            score_basic(data, graphdir, out_folder, num_jobs, **scoring_opts)
        elif scoring_type == "libri":
            score_libri(data, graphdir, out_folder, **scoring_opts)
        elif scoring_type == "just_transcript":
            return get_transcripts(data, graphdir, out_folder)
        else:
            raise ValueError
def make_ctc_decoding_graph(
        keywords,
        phn2idx,
        tmpdir,
        lexicon_path=f"{KALDI_ROOT}/egs/librispeech/s5/data/local/lm/librispeech-lexicon.txt",
        draw_G_L_fsts=False):
    tmpdir = os.path.join(tmpdir, "tmp")
    if os.path.isdir(tmpdir):
        shutil.rmtree(tmpdir)
    os.makedirs(tmpdir)
    graph_dir = os.path.join(tmpdir, "graph_dir")
    if os.path.isdir(graph_dir):
        shutil.rmtree(graph_dir)
    os.makedirs(graph_dir)

    keywords = [kw.upper() for kw in keywords]

    #######
    # check_units_txt(units_txt)

    if not os.path.exists(graph_dir):
        os.makedirs(graph_dir)
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    assert os.path.exists(lexicon_path)
    filter_lexicon(keywords, lexicon_path, f"{tmpdir}/lexicon.txt")

    # Add probabilities to the lexicon entries. There is in fact no point in doing this
    # here since all the entries have 1.0, but utils/make_lexicon_fst.pl requires a
    # probabilistic version, so we just leave it as it is.
    run_shell(
        f"perl -ape 's/(\S+\s+)(.+)/${{1}}1.0\\t$2/;' < {tmpdir}/lexicon.txt > {tmpdir}/lexiconp.txt"
    )

    # Add disambiguation symbols to the lexicon. This is necessary for determinizing
    # the composition of L.fst and G.fst. Without these symbols, determinization fails.
    eesen_utils_path = os.path.join(os.getcwd(), "kws_decoder", "eesen_utils")
    assert os.path.exists(f"{eesen_utils_path}/add_lex_disambig.pl")
    assert os.access(f"{eesen_utils_path}/add_lex_disambig.pl", os.X_OK)
    ndisambig = int(
        run_shell(
            f"{eesen_utils_path}/add_lex_disambig.pl {tmpdir}/lexiconp.txt {tmpdir}/lexiconp_disambig.txt"
        ).strip())
    assert isinstance(ndisambig, int)
    ndisambig += 1

    with open(f"{tmpdir}/disambig.list", "w") as f:
        f.writelines([f"#{n}\n" for n in range(ndisambig)])

    # Get the full list of CTC tokens used in the FST. These tokens include <eps>, the
    # blank <blk>, the actual labels (e.g. phonemes), and the disambiguation symbols.
    with open(f"{tmpdir}/units.list", "w") as f:
        for phn in phn2idx:
            # if "SIL" not in phn and "sil" not in phn:
            f.write(f"{phn.upper()}\n")
        # f.write(f"\n")
    # run_shell(f"cat {units_txt} | awk '{{print $1}}' > {tmpdir}/units.list")

    run_shell(
        f"(echo '<eps>'; echo '<blk>';) | cat - {tmpdir}/units.list {tmpdir}/disambig.list "
        + f"| awk '{{print $1 \" \" (NR-1)}}' > {graph_dir}/tokens.txt")

    with open(f"{graph_dir}/tokens.txt", "r") as f:
        token_lines = f.readlines()
    token_fst_txt = ctc_token_fst(token_lines)
    with open(f"{graph_dir}/tokens_fst.txt", "w") as f:
        f.writelines(token_fst_txt)

    run_shell(
        f"cat {graph_dir}/tokens_fst.txt | fstcompile --isymbols={graph_dir}/tokens.txt --osymbols={graph_dir}/tokens.txt "
        + f"--keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > {graph_dir}/T.fst")

    # Encode the words with indices. These will be used when compiling the lexicon and
    # language model FSTs.
    run_shell(f"""
    cat {tmpdir}/lexiconp.txt | awk '{{print $1}}' | sort | uniq | awk '
    BEGIN {{
    print "<eps> 0";
    }}
    {{
    printf("%s %d\\n", $1, NR);
    }}
    END {{
    printf("#0 %d\\n", NR+1);
    }}' > {graph_dir}/words.txt || exit 1;
    """)

    # Now compile the lexicon FST. Depending on the size of the lexicon, this may take
    # some time.
    token_disambig_symbol = int(
        run_shell(
            f"grep \#0 {graph_dir}/tokens.txt | awk '{{print $2}}'").strip())
    word_disambig_symbol = int(
        run_shell(
            f"grep \#0 {graph_dir}/words.txt | awk '{{print $2}}'").strip())

    # TODO why does piping not work?
    assert os.path.exists(f"{eesen_utils_path}/make_lexicon_fst.pl")
    assert os.access(f"{eesen_utils_path}/make_lexicon_fst.pl", os.X_OK)
    lexicon_fst = run_shell(
        f"{eesen_utils_path}/make_lexicon_fst.pl --pron-probs {tmpdir}/lexiconp_disambig.txt 0 \"SIL\" #{ndisambig}"
    )
    run_shell(
        f"echo \"{lexicon_fst}\" | "
        + f"fstcompile --isymbols={graph_dir}/tokens.txt --osymbols={graph_dir}/words.txt "
        + f"--keep_isymbols=false --keep_osymbols=false | "
        + f"fstaddselfloops \"echo {token_disambig_symbol} |\" \"echo {word_disambig_symbol} |\" | "
        + f"fstarcsort --sort_type=olabel > {graph_dir}/L.fst")

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={graph_dir}/tokens.txt "
            + f"--osymbols={graph_dir}/words.txt {graph_dir}/L.fst | dot -Tpdf -o /mnt/data/drawn_graphs/L.pdf"
        )

    ########## MkGraph

    grammar_fst_path = build_kw_grammar_fst(
        keywords, words_file=f"{graph_dir}/words.txt")
    shutil.copy(grammar_fst_path, f"{graph_dir}/G.fst")

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={graph_dir}/words.txt "
            + f"--osymbols={graph_dir}/words.txt {graph_dir}/G.fst | dot -Tpdf -o /mnt/data/drawn_graphs/G.pdf"
        )

    run_shell(
        f"fsttablecompose {graph_dir}/L.fst {graph_dir}/G.fst | fstdeterminizestar --use-log=true | "
        + f"fstminimizeencoded | fstarcsort --sort_type=ilabel > {graph_dir}/LG.fst"
    )
    run_shell(
        f"fsttablecompose {graph_dir}/T.fst {graph_dir}/LG.fst > {graph_dir}/TLG.fst"
    )

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={graph_dir}/tokens.txt "
            + f"--osymbols={graph_dir}/tokens.txt {graph_dir}/T.fst | dot -Tpdf -o /mnt/data/drawn_graphs/T.pdf"
        )
        run_shell(
            f"fstdraw --isymbols={graph_dir}/tokens.txt "
            + f"--osymbols={graph_dir}/words.txt {graph_dir}/TLG.fst | dot -Tpdf -o /mnt/data/drawn_graphs/TLG.pdf"
        )

    return os.path.abspath(graph_dir)
def get_transcripts(words_path, workdir):
    decoding_scripts_folder = os.path.join(
        os.getcwd(), __name__.split(".")[0])  # 'kaldi_decoding_scripts'
    int2sym_script = os.path.join(decoding_scripts_folder, "utils/int2sym.pl")

    assert len(glob(f"{workdir}/lat.*.gz")) > 0
    assert os.path.exists(int2sym_script)
    assert os.access(int2sym_script, os.X_OK)

    if not os.path.isdir(os.path.join(workdir, "scoring", "log")):
        os.makedirs(os.path.join(workdir, "scoring", "log"))

    for file in glob(f"{workdir}/lat.*.gz"):
        assert os.stat(file).st_size > 20, \
            f"{file} seems to be empty with size of {os.stat(file).st_size} bytes"

    # TODO think about whether each of these scalings and penalties makes sense
    # TODO look into lattice-to-post --acoustic-scale=0.1 ark:1.lats ark:- | \
    #      gmm-acc-stats 10.mdl "$feats" ark:- 1.acc for confidence/sensitivity
    # TODO look into lattice-to-fst --lm-scale=0.0 --acoustic-scale=0.0 ark:1.lats ark:1.words
    #      to visualize the lattice and how the pruned fst looks like

    # cmd = f"lattice-scale --inv-acoustic-scale={language_model_weigth} " \
    #       + f"\"ark:gunzip -c {workdir}/lat.*.gz |\" ark:- | " \
    #       + f"lattice-add-penalty --word-ins-penalty={word_ins_penalty} ark:- ark:- | " \
    #       + f"lattice-best-path --word-symbol-table={words_path} ark:- " \
    #       + f"ark,t:{workdir}/scoring/{language_model_weigth}.{word_ins_penalty}.tra"
    # run_shell(cmd)

    ali_out_file = "/dev/null"
    transcript_out_file = f"{workdir}/scoring/keywords.tra"
    lm_posterior_out_file = f"{workdir}/scoring/keywords.lm_post"
    acoustic_posterior_out_file = f"{workdir}/scoring/keywords.ac_post"

    # plt = True
    # if plt:  # only for word sequences, not useful for KWS
    #     run_shell(f"gunzip -c {workdir}/lat.*.gz | lattice-to-fst ark:- \"scp,p:echo $utterance_id $tmp_dir/$utterance_id.fst|\"")
    #     run_shell(f"gunzip -c {workdir}/lat.*.gz | lattice-to-fst ark:- ")

    run_shell(
        f"gunzip -c {workdir}/lat.*.gz | "
        + f"lattice-to-nbest ark:- ark,t:- | "
        + f"nbest-to-linear ark:- ark:{ali_out_file} ark,t:{transcript_out_file} "
        + f"ark,t:{lm_posterior_out_file} ark,t:{acoustic_posterior_out_file}")

    transcripts = int2sym(transcript_out_file, words_path)

    with open(lm_posterior_out_file, "r") as f:
        lm_posterior = f.readlines()
    lm_posterior = [
        line.strip().split(" ") for line in lm_posterior if line != ""
    ]
    lm_posterior = [(sample_id[:-2] if sample_id.endswith("-1") else sample_id,
                     float(posterior)) for sample_id, posterior in lm_posterior]

    with open(acoustic_posterior_out_file, "r") as f:
        acoustic_posterior = f.readlines()
    # acoustic_posterior = [line.split(" ") for line in acoustic_posterior.split("\n") if line != ""]
    acoustic_posterior = [
        line.strip().split(" ") for line in acoustic_posterior if line != ""
    ]
    acoustic_posterior = [
        (sample_id[:-2] if sample_id.endswith("-1") else sample_id,
         float(posterior)) for sample_id, posterior in acoustic_posterior
    ]

    run_shell(f"gunzip -c {workdir}/lat.*.gz | "
              + f"lattice-best-path --word-symbol-table={words_path} ark:- "
              + f"ark,t:{workdir}/scoring/keywords_best.tra")
    transcripts_best = int2sym(f"{workdir}/scoring/keywords_best.tra", words_path)

    lattice_confidence = run_shell(f"gunzip -c {workdir}/lat.*.gz | "
                                   + f"lattice-confidence ark:- ark,t:-")
    lattice_confidence = [
        line.strip().split(" ", 1) for line in lattice_confidence.split("\n")
        if line != ""
    ]
    lattice_confidence = [(sample_id, float(confidence))
                          for sample_id, confidence in lattice_confidence]

    # run_shell(f"cat {workdir}/scoring/keywords.tra")
    # run_shell(f"gunzip -c {workdir}/lat.*.gz | "
    #           + f"lattice-1best ark:- ark,t:{workdir}/scoring/keywords.lat")

    ali_model = '/mnt/data/pytorch-kaldi/tmp/graph_final/final.mdl'  # unused, left over from debugging
    # run_shell(f"gunzip -c {workdir}/lat.*.gz | "
    #           + f"lattice-to-nbest ark:- ark,t:- | nbest-to-linear ark:- ark,t:- | "
    #           + f"lattice-to-fst ark:- \"scp,p:echo tree_fc2411fe_nohash_2 /tmp/kaldi.UIEL/tree_fc2411fe_nohash_2.fst|\" ")
    # nbest-to-linear ark,t:1.ali 'ark,t:1.tra' ark,t:1.lm ark,t:1.ac
    # run_shell(f"fstdraw /tmp/kaldi.UIEL/tree_fc2411fe_nohash_2.fst")
    # run_shell(f"cat {workdir}/scoring/keywords.lat | "
    #           + f"lattice-to-fst")

    # Note on lattice-confidence:
    # It computes sentence-level lattice confidence measures for each lattice. The
    # output is simply the difference between the total costs of the best and
    # second-best paths in the lattice (or a very large value if the lattice had only
    # one path). Caution: this is not necessarily a very good confidence measure; you
    # almost certainly want to specify the acoustic scale. If the input is a
    # state-level lattice, you need to specify --read-compact-lattice=false, or the
    # confidences will be very small (and wrong). You can get word-level confidence
    # info from lattice-mbr-decode.

    return transcripts_best, transcripts, lattice_confidence, lm_posterior, acoustic_posterior
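
# The Python int2sym helper used in get_transcripts is not part of this file. A hedged
# sketch of the behaviour it appears to have (mirroring Kaldi's utils/int2sym.pl): read
# a .tra file of "utt_id id id ..." lines, map the integer ids back to words via
# words.txt, and return a dict of utt_id -> list of words. The "-1" suffix handling
# matches how the n-best utterance ids are stripped elsewhere in get_transcripts; the
# real helper may differ in detail.
def _example_int2sym(tra_path, words_path):
    id2word = {}
    with open(words_path, "r") as f:
        for line in f:
            word, word_id = line.strip().split(" ", 1)  # words.txt lines: "word id"
            id2word[word_id] = word
    results = {}
    with open(tra_path, "r") as f:
        for line in f:
            parts = line.strip().split(" ")
            if not parts or parts == [""]:
                continue
            utt_id, word_ids = parts[0], parts[1:]
            if utt_id.endswith("-1"):  # lattice-to-nbest appends the n-best index
                utt_id = utt_id[:-2]
            results[utt_id] = [id2word[w] for w in word_ids]
    return results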
def make_kaldi_decoding_graph(
        keywords,
        out_dir,
        train_graph_dir=f"{KALDI_ROOT}/egs/librispeech/s5/exp/tri4b",
        train_dict_folder=f"{KALDI_ROOT}/egs/librispeech/s5/data/local/dict_nosp",
        lexicon_path=f"{KALDI_ROOT}/egs/librispeech/s5/data/local/lm/librispeech-lexicon.txt",
        draw_G_L_fsts=True):
    libri_lexicon, lang_in_tmp, lang_tmp, final_lang_dir = \
        check_andsetup__dirs(out_dir, train_graph_dir, train_dict_folder, lexicon_path)

    keywords = [kw.upper() for kw in keywords]

    if not os.path.exists(os.path.join(out_dir, "utils/prepare_lang.sh")):
        os.symlink(f"{KALDI_ROOT}/egs/wsj/s5/utils", os.path.join(out_dir, "utils"))
        os.symlink(f"{KALDI_ROOT}/egs/wsj/s5/steps", os.path.join(out_dir, "steps"))

    filter_lexicon(keywords, libri_lexicon, out_folder=lang_in_tmp)

    # TODO explore unk fst
    # unk_fst_dir = os.path.join(out_dir, "unk_fst")
    # if not os.path.isdir(unk_fst_dir):
    #     os.makedirs(unk_fst_dir)
    ##### Optional UNK FST
    # in librispeech_workdir
    # or reduce num ngram option  ## using bigram only and num-ngrams is only 3336
    # num_extra_ngrams = 1000
    # run_shell(
    #     f"{out_dir}/utils/lang/make_unk_lm.sh --num_extra_ngrams 1000 --ngram-order 2 --cmd utils/run.pl {lang_in_tmp} {unk_fst_dir}")
    # TODO alternative simple phone loop

    cwd = os.getcwd()
    os.chdir(out_dir)  # necessary because the kaldi scripts expect it

    if not os.path.exists("path.sh"):
        with open("path.sh", "w") as f:
            f.writelines("\n".join([
                """export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH""",
                """[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1""",
                """. $KALDI_ROOT/tools/config/common_path.sh""",
                """export LC_ALL=C""",
                ""
            ]))

    prepare_lang_script = f"{out_dir}/utils/prepare_lang.sh"
    # run_shell(f"{prepare_lang_script} --unk-fst {unk_fst_dir}/unk_fst.txt {lang_in_tmp} \"{unk_sym}\" {lang_tmp} {final_lang_dir}")
    run_shell(
        f"{prepare_lang_script} {lang_in_tmp} \"{UNK_SYM}\" {lang_tmp} {final_lang_dir}"
    )

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={final_lang_dir}/phones.txt "
            + f"--osymbols={final_lang_dir}/words.txt {final_lang_dir}/L.fst | dot -Tpdf -o{out_dir}/L.pdf"
        )

    grammar_fst_path = build_kw_grammar_fst(
        keywords, words_file=f"{final_lang_dir}/words.txt")
    shutil.copy(grammar_fst_path, f"{final_lang_dir}/G.fst")

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={final_lang_dir}/words.txt "
            + f"--osymbols={final_lang_dir}/words.txt {final_lang_dir}/G.fst | dot -Tpdf -o{out_dir}/G.pdf"
        )

    run_shell(
        f"{out_dir}/utils/validate_lang.pl --skip-determinization-check {final_lang_dir}"
    )

    final_graph_dir = os.path.join(out_dir, "graph_final")
    if not os.path.isdir(final_graph_dir):
        os.makedirs(final_graph_dir)

    run_shell(
        f"{out_dir}/utils/mkgraph.sh {final_lang_dir} {train_graph_dir} {final_graph_dir}"
    )
    if not os.path.exists(os.path.join(final_graph_dir, "final.mdl")):
        os.symlink(f"{train_graph_dir}/final.mdl",
                   os.path.join(final_graph_dir, "final.mdl"))

    os.chdir(cwd)
    return os.path.abspath(final_graph_dir)
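
# Hedged end-to-end sketch (keywords and paths are placeholders): build the keyword
# decoding graph and hand its artifacts to the lattice-generating decode() defined
# later in this file. mkgraph.sh writes HCLG.fst and words.txt into the graph
# directory, and make_kaldi_decoding_graph symlinks final.mdl next to them, so the
# three paths can be derived from its return value. How featstrings are produced is
# outside the scope of this sketch.
def _example_build_graph_and_decode(featstrings):
    graph_dir = make_kaldi_decoding_graph(["turn on", "stop"],
                                          out_dir="exp/kws_graph")  # hypothetical
    return decode(
        alignment_model_path=f"{graph_dir}/final.mdl",
        words_path=f"{graph_dir}/words.txt",
        graph_path=f"{graph_dir}/HCLG.fst",
        out_folder="exp/kws_decode",  # hypothetical
        featstrings=featstrings)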
def score(
        data,
        words_path,
        dir,
        word_ins_penalty=None,
        min_acwt=1,
        max_acwt=20,
        acwt_factor=0.05
        # acwt_factor is the scaling factor for the acoustic scale. The scaling factor
        # for acoustic likelihoods needs to be 0.5 ~ 1.0. However, the job submission
        # script can only take integers as the job marker. That's why we set the acwt
        # to be integers (5 ~ 10), but scale them with 0.1 when they are actually used.
):
    if word_ins_penalty is None:
        word_ins_penalty = [0.0, 0.5, 1.0, 1.5, 2.0]

    # decoding_scripts_folder = os.path.join(os.getcwd(), __name__.split(".")[0])  # 'kaldi_decoding_scripts'
    # pl_cmd_script = os.path.join(decoding_scripts_folder, "utils/run.pl")
    # assert os.path.exists(pl_cmd_script)
    # assert os.access(pl_cmd_script, os.X_OK)
    # symtab = os.path.join(lang_or_graph, "words.txt")
    # assert os.path.exists(symtab)
    # assert os.path.exists(os.path.join(dir, "lat.1.gz"))
    # assert os.path.exists(os.path.join(data, "text"))
    # int2sym_script = os.path.join(decoding_scripts_folder, "utils/int2sym.pl")
    # assert os.path.exists(int2sym_script)
    # assert os.access(int2sym_script, os.X_OK)
    # if not os.path.isdir(os.path.join(dir, "scoring", "log")):
    #     os.makedirs(os.path.join(dir, "scoring", "log"))

    # --cmd "$decode_cmd" --nj 10 --beam 17.0 --lattice_beam 8.0 --max-active 5000 --acwt 0.9 \
    # --skip true --splice true --splice-opts "--left-context=1 --right-context=1" --skip-frames 3 --skip-offset 1 \
    # ${lang_dir}_test_${lm_suffix} $exp_base/$test $train_dir/decode_${test}_${lm_suffix} || exit 1;

    # words_path = "wrds.txt"

    if not os.path.exists(f"{dir}/scoring"):
        os.makedirs(f"{dir}/scoring")

    assert os.environ['EESEN_ROOT']
    lattice_scale_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-scale"
    lattice_add_penalty_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-add-penalty"
    lattice_best_path_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-best-path"

    # for wip in word_ins_penalty:
    #     for ACWT in range(min_acwt, max_acwt):
    #         run_shell(
    #             f"{lattice_scale_bin} --acoustic-scale={ACWT} --ascale-factor={acwt_factor} \"ark:gunzip -c {dir}/lat.*.gz|\" ark:- | "
    #             + f"{lattice_add_penalty_bin} --word-ins-penalty={wip} ark:- ark:- |"
    #             + f"{lattice_best_path_bin} --word-symbol-table={words_path} ark:- ark,t:{dir}/scoring/{ACWT}_{wip}_tra")

    # run_shell(f"cat {data}/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > {dir}/scoring/test_filt.txt")
    run_shell(
        f"cat {data}/text | sed 's:<UNK>::g' | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > {dir}/scoring/text_filt"
    )

    compute_wer_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/compute-wer"
    lattice_1best_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-1best"
    nbest_to_ctm_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/nbest-to-ctm"

    int2sym_script = os.path.join(os.getcwd(),
                                  "kaldi_decoding_scripts/utils/int2sym.pl")
    assert os.path.exists(int2sym_script)

    # for wip in word_ins_penalty:
    #     for ACWT in range(min_acwt, max_acwt):
    #         run_shell(f"cat {dir}/scoring/{ACWT}_{wip}_tra | {int2sym_script} -f 2- {words_path} | "
    #                   + f" sed 's:<UNK>::g' | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' |"
    #                   + f"{compute_wer_bin} --text --mode=present ark:{dir}/scoring/text_filt ark,p:- {dir}/details_{ACWT}_{wip} >& {dir}/wer_{ACWT}_{wip}")

    convert_ctm_script = os.path.join(os.getcwd(),
                                      "kws_decoder/eesen_utils/convert_ctm.pl")
    assert os.path.exists(convert_ctm_script)

    name = "test_name_"
    # for wip in word_ins_penalty:
    for ACWT in range(min_acwt, max_acwt):
        if not os.path.exists(f"{dir}/score_{ACWT}/"):
            os.makedirs(f"{dir}/score_{ACWT}/")
        run_shell(
            f"{lattice_1best_bin} --acoustic-scale={ACWT} --ascale-factor={acwt_factor} \"ark:gunzip -c {dir}/lat.*.gz|\" ark:- | "
            + f"{nbest_to_ctm_bin} ark:- - | "
            + f"{int2sym_script} -f 5 {words_path} | "
            + f"{convert_ctm_script} {data}/segments {data}/reco2file_and_channel"
        )
        run_shell(
            f"{lattice_1best_bin} --acoustic-scale={ACWT} --ascale-factor={acwt_factor} \"ark:gunzip -c {dir}/lat.*.gz|\" ark:- | "
            + f"{nbest_to_ctm_bin} ark:- - | "
            + f"{int2sym_script} -f 5 {words_path} | "
            + f"{convert_ctm_script} {data}/segments {data}/reco2file_and_channel "
            + f"> {dir}/score_{ACWT}/{name}.ctm")
def save_checkpoint(
        epoch,
        global_step,
        model,
        optimizers,
        lr_schedulers,
        seq_len_scheduler,
        config,
        checkpoint_dir,
        # monitor_best=None,
        dataset_sampler_state=None,
        save_best=None):
    """
    Save a training checkpoint.

    :param epoch: current epoch number
    :param global_step: current global step
    :param dataset_sampler_state: sampler state for intermediate (mid-epoch) saves
    :param save_best: if True, additionally save the checkpoint as 'checkpoint_best.pth'
    """
    assert dataset_sampler_state != save_best, "save_best is only done at the end of an epoch"

    # TODO figure out why shutil.disk_usage gives a different result than df
    # available_disk_space_in_gb = shutil.disk_usage(checkpoint_dir).free * 1e-9
    available_disk_space_in_gb = run_shell(f"df -h {checkpoint_dir}")
    # Brittle: assumes the 'Avail' column of `df -h` lands at this exact split index
    # and is reported in gigabytes.
    available_disk_space_in_gb = int(
        available_disk_space_in_gb.split("\n")[1].split(" ")[13][:-1])
    assert available_disk_space_in_gb > 5, \
        f"available_disk_space_in_gb of {available_disk_space_in_gb} is lower than 5GB. " \
        + f"Aborting the save in order to not corrupt the model files"

    torch_rng_state, python_rng_state, numpy_rng_state = get_rng_state()

    state = {
        'epoch': epoch,
        'global_step': global_step,
        'state_dict': model.state_dict(),
        'optimizers': {
            opti_name: optimizers[opti_name].state_dict()
            for opti_name in optimizers
        },
        'lr_schedulers': {
            lr_sched_name: lr_schedulers[lr_sched_name].state_dict()
            for lr_sched_name in lr_schedulers
        },
        'seq_len_scheduler': seq_len_scheduler,
        'dataset_sampler_state': dataset_sampler_state,
        # 'monitor_best': monitor_best,
        'config': config,
        'torch_rng_state': torch_rng_state,
        'python_rng_state': python_rng_state,
        'numpy_rng_state': numpy_rng_state,
    }

    if dataset_sampler_state is not None:
        # Intermediate save during a training epoch
        all_previous_checkpoints = glob(
            os.path.join(checkpoint_dir, 'checkpoint_e*_gs*.pth'))

        checkpoint_name = f'checkpoint_e{epoch}_gs{global_step}.pth'
        filename = os.path.join(checkpoint_dir, checkpoint_name)
        torch.save(state, filename)
        logger.info(f"Saved checkpoint: {filename}")

        for old_checkpoint in all_previous_checkpoints:
            if os.path.exists(old_checkpoint):
                os.remove(old_checkpoint)
                logger.info(f"Removed old checkpoint: {old_checkpoint}")
    else:
        # End-of-epoch save; keep only the last three epoch checkpoints
        checkpoint_name = f'checkpoint_e{epoch}.pth'
        filename = os.path.join(checkpoint_dir, checkpoint_name)
        torch.save(state, filename)
        logger.info(f"Saved checkpoint: {filename}")

        if epoch >= 3:
            filename_prev = os.path.join(checkpoint_dir,
                                         f'checkpoint_e{epoch - 3}.pth')
            if os.path.exists(filename_prev):
                os.remove(filename_prev)
                logger.info(f"Removed old checkpoint: {filename_prev}")

    if save_best is not None and save_best:
        checkpoint_name = 'checkpoint_best.pth'
        best_path = os.path.join(checkpoint_dir, checkpoint_name)
        torch.save(state, best_path)
        logger.info(f"Saved current best: {checkpoint_name}")

    # available_disk_space_in_gb = shutil.disk_usage(checkpoint_dir).free * 1e-9
    available_disk_space_in_gb = run_shell(f"df -h {checkpoint_dir}")
    available_disk_space_in_gb = int(
        available_disk_space_in_gb.split("\n")[1].split(" ")[13][:-1])
    assert available_disk_space_in_gb > 5, \
        f"available_disk_space_in_gb of {available_disk_space_in_gb} is lower than 5GB. " \
        + f"Aborting since the next checkpoint save would probably fail because of too little space -> no wasted training compute"
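
# The TODO in save_checkpoint mentions shutil.disk_usage as an alternative to parsing
# `df -h`. A hedged sketch of that approach (note that disk_usage reports free space
# for the filesystem containing the path, which may legitimately differ from what
# `df -h` prints, e.g. for bind mounts or quotas):
def _example_free_space_gb(path):
    import shutil
    return shutil.disk_usage(path).free / 1e9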
def decode(alignment_model_path,
           words_path,
           graph_path,
           out_folder,
           featstrings,
           min_active=20,
           max_active=700,
           max_mem=500000,
           beam=5.0,
           latbeam=5.0,
           acwt=1.0,
           max_arcs=-1.0,
           **kwargs):
    out_folder = f"{out_folder}/exp_files"
    assert isinstance(featstrings, list)
    num_threads = 4  # TODO more threads

    assert out_folder[-1] != '/'
    srcdir = os.path.dirname(out_folder)

    thread_string = f"-parallel --num-threads={num_threads}"

    if not os.path.isdir(os.path.join(out_folder, "log")):
        os.makedirs(os.path.join(out_folder, "log"))

    assert graph_path.endswith("HCLG.fst")
    assert words_path.endswith("words.txt")
    assert alignment_model_path.endswith("final.mdl")

    # TODO should we really just delete these files?
    if len(glob(f"{out_folder}/lat.*.gz")) > 0:
        for file in glob(f"{out_folder}/lat.*.gz"):
            os.remove(file)
    if len(glob(f"{out_folder}/log/decode.*.log")) > 0:
        for file in glob(f"{out_folder}/log/decode.*.log"):
            os.remove(file)

    chnk_id = 0
    for ck_data in tqdm(featstrings, desc="lattice generation chunk:"):
        assert not os.path.exists(f"{out_folder}/lat.{chnk_id}.gz")
        assert not os.path.exists(f"{out_folder}/log/decode.{chnk_id}.log")
        finalfeats = f"ark,s,cs: cat {ck_data} |"
        cmd = f'latgen-faster-mapped{thread_string} --min-active={min_active} ' \
              + f'--max-active={max_active} --max-mem={max_mem} ' \
              + f'--beam={beam} --lattice-beam={latbeam} ' \
              + f'--acoustic-scale={acwt}' \
              + f' --allow-partial=true ' \
              + f'--word-symbol-table={words_path} {alignment_model_path} ' \
              + f'{graph_path} ' \
              + f'\"{finalfeats}\" \"ark:|gzip -c > {out_folder}/lat.{chnk_id}.gz\"'
        run_shell(cmd)
        chnk_id += 1

    # TODO display the generated lattice for keywords

    copy(alignment_model_path, srcdir)

    transcripts_best, transcripts, lattice_confidence, lm_posterior, acoustic_posterior = \
        get_transcripts(words_path, out_folder)

    for t in transcripts_best:
        assert transcripts_best[t] == transcripts[t], \
            f"{t}: {transcripts_best[t]} =!= {transcripts[t]}"

    assert len(transcripts) == len(lattice_confidence)
    transcripts = dict(transcripts)
    lattice_confidence = dict(lattice_confidence)
    lm_posterior = dict(lm_posterior)
    acoustic_posterior = dict(acoustic_posterior)

    result = {}
    for sample_id in transcripts:
        _lattice_confidence = lattice_confidence[sample_id] \
            if lattice_confidence[sample_id] != 10000000000.0 else float("inf")
        _lm_posterior = lm_posterior[sample_id]  # TODO normalize
        _acoustic_posterior = acoustic_posterior[sample_id]  # TODO normalize
        result[sample_id] = (transcripts[sample_id], _lattice_confidence,
                             _lm_posterior, _acoustic_posterior)
    return result