Example no. 1
def decode_ctc(words_path,
               graph_path,
               out_folder,
               featstrings,
               min_active=20,
               max_active=700,
               max_mem=500000,
               beam=5.0,
               latbeam=5.0,
               acwt=1.0,
               **kwargs):
    out_folder = f"{out_folder}/exp_files"
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    acwt = 1.0  # NOTE: overrides the acwt argument; the acoustic scale is fixed to 1.0 here

    assert graph_path.endswith("TLG.fst")
    assert words_path.endswith("words.txt")

    assert isinstance(featstrings, list)

    assert os.environ['EESEN_ROOT']
    latgen_faster_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/latgen-faster"

    chnk_id = 0
    for ck_data in tqdm(featstrings, desc="lattice generation chunk:"):
        finalfeats = f"ark,s,cs: "

        # Decode for each of the acoustic scales
        run_shell(
            f"{latgen_faster_bin} " + f"--max-active={max_active} " +
            f"--max-mem={max_mem} " + f"--beam={beam} " +
            f"--lattice-beam={latbeam} " + f"--acoustic-scale={acwt} " +
            f"--allow-partial=true " + f"--word-symbol-table={words_path} " +
            f"{graph_path} " +
            f"ark:{ck_data} \"ark:|gzip -c > {out_folder}/lat.{chnk_id}.gz\"")

    transcripts_best, transcripts, lattice_confidence, lm_posterior, acoustic_posterior = get_transcripts(
        words_path, out_folder)

    for t in transcripts_best:
        # if transcripts_best[t] != ['THREE'] and transcripts[t] != [
        #     'THREE']:  # might give a different path for e.g. THREE and TREE
        assert transcripts_best[t] == transcripts[
            t], f"{t}: {transcripts_best[t]} =!= {transcripts[t]}"

    assert len(transcripts) == len(lattice_confidence)
    transcripts = dict(transcripts)
    lattice_confidence = dict(lattice_confidence)
    lm_posterior = dict(lm_posterior)
    acoustic_posterior = dict(acoustic_posterior)
    result = {}
    for sample_id in transcripts:
        _lattice_confidence = lattice_confidence[sample_id] \
            if 10000000000.0 != lattice_confidence[sample_id] else float("inf")
        _lm_posterior = lm_posterior[sample_id]  # TODO normalize
        _acoustic_posterior = acoustic_posterior[sample_id]  # TODO normalize
        result[sample_id] = (transcripts[sample_id], _lattice_confidence,
                             _lm_posterior, _acoustic_posterior)

    return result
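
A minimal usage sketch (not from the source; every path below is a placeholder and the featstrings entry is assumed to be a Kaldi ark file of CTC network outputs) showing how the returned dictionary can be consumed:

# Hypothetical call of the function above.
results = decode_ctc(
    words_path="/path/to/graph_dir/words.txt",
    graph_path="/path/to/graph_dir/TLG.fst",
    out_folder="/path/to/decode_out",
    featstrings=["/path/to/posteriors/chunk_0.ark"])
for utt_id, (words, confidence, lm_post, ac_post) in results.items():
    print(utt_id, " ".join(words), confidence)
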
Example no. 2
def score(data, lang_or_graph, dir, num_jobs, min_lmwt=1, max_lmwt=10):
    decoing_scripts_folder = os.path.join(
        os.getcwd(),
        __name__.split(".")[0])  # 'kaldi_decoding_scripts'

    # end configuration section.

    phonemap = "conf/phones.60-48-39.map"
    nj = num_jobs

    symtab = os.path.join(lang_or_graph, "words.txt")

    assert os.path.exists(symtab)
    assert os.path.exists(os.path.join(dir, "lat.1.gz"))
    assert os.path.exists(os.path.join(data, "text"))
    timit_norm_trans_script = os.path.join(decoing_scripts_folder,
                                           "local/timit_norm_trans.pl")
    assert os.path.exists(timit_norm_trans_script)
    assert os.access(timit_norm_trans_script, os.X_OK)
    int2sym_script = os.path.join(decoing_scripts_folder, "utils/int2sym.pl")
    assert os.path.exists(int2sym_script)
    assert os.access(int2sym_script, os.X_OK)
    phonemap = os.path.join(decoing_scripts_folder, phonemap)
    assert os.path.exists(phonemap)
    pl_cmd_script = os.path.join(decoing_scripts_folder, "utils/run.pl")
    assert os.path.exists(pl_cmd_script)
    assert os.access(pl_cmd_script, os.X_OK)

    os.makedirs(os.path.join(dir, "scoring", "log"))

    # Map reference to 39 phone classes:
    cmd = f"cat {data}/text | {timit_norm_trans_script} -i - -m {phonemap} -from 48 -to 39 > {dir}/scoring/test_filt.txt"
    run_shell(cmd)

    # Get the phone-sequence on the best-path:
    for LMWT in range(min_lmwt, max_lmwt + 1):  # inclusive of max_lmwt, matching the LMWT=min:max range used below
        cmd = f"{pl_cmd_script} JOB=1:{nj} {dir}/scoring/log/best_path_basic.{LMWT}.JOB.log " + \
              f"lattice-best-path --lm-scale={LMWT} --word-symbol-table={symtab} --verbose=2 \"ark:gunzip -c {dir}/lat.JOB.gz|\" ark,t:{dir}/scoring/{LMWT}.JOB.tra || exit 1;"
        run_shell(cmd)
        run_shell(
            f"cat {dir}/scoring/{LMWT}.*.tra | sort > {dir}/scoring/{LMWT}.tra"
        )
        run_shell(f"rm {dir}/scoring/{LMWT}.*.tra")

    # Map hypothesis to 39 phone classes:
    cmd = f"{pl_cmd_script} LMWT={min_lmwt}:{max_lmwt}{dir}/scoring/log/score_basic.LMWT.log " + \
          f"cat {dir}/scoring/LMWT.tra \| " + \
          f"{int2sym_script} -f 2- {symtab} \| " + \
          f"{timit_norm_trans_script} -i - -m {phonemap} -from 48 -to 39 \| " + \
          f"compute-wer --text --mode=all ark:{dir}/scoring/test_filt.txt ark,p:- \">&\" {dir}/wer_LMWT || exit 1;"
    run_shell(cmd)
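
The loop above leaves one wer_LMWT file per language-model weight in dir. A small follow-up sketch for picking the best weight; it assumes the standard compute-wer summary line starting with "%WER", which is not shown in the source:

# Hypothetical helper: scan the wer_* files written by score() and return the
# (LMWT, WER) pair with the lowest word error rate.
def pick_best_lmwt(dir, min_lmwt=1, max_lmwt=10):
    best = None
    for lmwt in range(min_lmwt, max_lmwt + 1):
        with open(f"{dir}/wer_{lmwt}") as f:
            for line in f:
                if line.startswith("%WER"):
                    wer = float(line.split()[1])
                    if best is None or wer < best[1]:
                        best = (lmwt, wer)
    return best
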
Example no. 3
def build_1_gram_fst(arpa_lm_path, graph_dir):
    one_gram_arpa = []

    with gzip.open(arpa_lm_path, 'rb') as f:
        for line in f:
            line = line.decode("utf-8")
            if "\\2-grams:" in line:
                break
            elif not ("ngram 2=" in line or "ngram 3=" in line or "ngram 4="
                      in line or "ngram 5=" in line or "ngram 6=" in line):
                if line.startswith("-"):
                    # drop the back-off weight and keep a flat log-probability for each 1-gram
                    fake_prob = "-1.0"
                    word = line.split(maxsplit=2)[1]

                    _line = f"{fake_prob}\t{word}\n"
                else:
                    _line = line
                one_gram_arpa.append(_line)

            else:
                pass

    one_gram_arpa.append("\\end\\\n")
    pruned_lexicon_path = f"{graph_dir}/pruned_lexicon.txt"
    with open(pruned_lexicon_path, 'w') as f:
        f.writelines(one_gram_arpa)

    eesen_utils_path = os.path.join(os.getcwd(), "kws_decoder", "eesen_utils")

    assert os.path.exists(f"{eesen_utils_path}/s2eps.pl")
    assert os.access(f"{eesen_utils_path}/s2eps.pl", os.X_OK)

    assert os.path.exists(f"{eesen_utils_path}/eps2disambig.pl")
    assert os.access(f"{eesen_utils_path}/eps2disambig.pl", os.X_OK)

    run_shell(
        f"arpa2fst {pruned_lexicon_path} | fstprint | "
        # + f"utils/remove_oovs.pl {tmpdir}/oovs_{lm_suffix}.txt | "
        f"{eesen_utils_path}/eps2disambig.pl | {eesen_utils_path}/s2eps.pl | "
        f"fstcompile --isymbols={graph_dir}/words.txt --osymbols={graph_dir}/words.txt "
        f"--keep_isymbols=false --keep_osymbols=false | "
        f"fstrmepsilon | fstarcsort --sort_type=ilabel > {graph_dir}/G.fst")

    return os.path.abspath(f"{graph_dir}/G.fst")
Example no. 4
def build_grammar_fst(arpa_lm_path, graph_dir):
    eesen_utils_path = os.path.join(os.getcwd(), "kws_decoder", "eesen_utils")

    assert os.path.exists(f"{eesen_utils_path}/s2eps.pl")
    assert os.access(f"{eesen_utils_path}/s2eps.pl", os.X_OK)

    assert os.path.exists(f"{eesen_utils_path}/eps2disambig.pl")
    assert os.access(f"{eesen_utils_path}/eps2disambig.pl", os.X_OK)

    run_shell(
        f"gunzip -c {arpa_lm_path} | "
        f"grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | "
        f"arpa2fst - | fstprint | "
        # + f"utils/remove_oovs.pl {tmpdir}/oovs_{lm_suffix}.txt | "
        f"{eesen_utils_path}/eps2disambig.pl | {eesen_utils_path}/s2eps.pl | "
        f"fstcompile --isymbols={graph_dir}/words.txt --osymbols={graph_dir}/words.txt "
        f"--keep_isymbols=false --keep_osymbols=false | "
        f"fstrmepsilon | fstarcsort --sort_type=ilabel > {graph_dir}/G.fst")

    return os.path.abspath(f"{graph_dir}/G.fst")
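
build_1_gram_fst first reduces the ARPA model to a flat unigram grammar, while build_grammar_fst compiles the full model after filtering ill-formed <s>/</s> n-grams; both end in the same fstcompile pipeline and return graph_dir/G.fst. A hedged usage sketch with placeholder paths:

# Hypothetical choice between the two builders; graph_dir is assumed to already
# contain the words.txt symbol table that fstcompile needs.
arpa_lm_path = "/path/to/lm.arpa.gz"
graph_dir = "/path/to/graph_dir"
use_unigram_only = True
if use_unigram_only:
    g_fst = build_1_gram_fst(arpa_lm_path, graph_dir)   # 1-gram section only, flat probabilities
else:
    g_fst = build_grammar_fst(arpa_lm_path, graph_dir)  # full ARPA model
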
Example no. 5
def get_lab_count(label_opts, num_label, folder_lab_count):
    cmd = f'analyze-counts --print-args=False --verbose=0 --binary=false' \
          + f' --counts-dim={num_label} "ark:{label_opts}' \
          + f' {folder_lab_count}/final.mdl \\"ark:gunzip -c ' \
          + f'{folder_lab_count}/ali.*.gz |\\" ark:- |" -'
    # + f'{folder_lab_count}/ali.*.gz |\\" ark:- |" {count_file_path}'
    # count_file_path = "tmp.count" #TODO save in file instead

    lab_count = run_shell(cmd)

    lab_count = lab_count.strip().strip('[]').strip()
    lab_count = [float(v) for v in lab_count.split()]
    return lab_count
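
The returned list holds raw per-label occupancy counts. A common follow-up, sketched here as an assumption rather than something the source does, is to turn the counts into log-priors that can be subtracted from the network's log-posteriors before decoding:

# Minimal sketch, assuming numpy is available; "ali-to-pdf" and the output
# dimension 3480 are hypothetical values for label_opts and num_label.
import numpy as np

counts = np.array(get_lab_count(label_opts="ali-to-pdf",
                                num_label=3480,
                                folder_lab_count="/path/to/ali_dir"))
log_priors = np.log(counts / counts.sum())
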
Example no. 6
def decode_ctc(data,
               graphdir,
               out_folder,
               featstrings,
               min_active=20,
               max_active=5000,
               max_mem=500000,
               beam=17.0,
               latbeam=8.0,
               acwt=0.9,
               **kwargs):
    out_folder = f"{out_folder}/exp_files"
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    assert os.path.exists(f"{graphdir}/TLG.fst")
    assert os.path.exists(f"{graphdir}/words.txt")

    assert isinstance(featstrings, list)

    assert os.environ['EESEN_ROOT']
    latgen_faster_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/latgen-faster"

    chnk_id = 0
    for ck_data in tqdm(featstrings, desc="lattice generation chunk:"):
        finalfeats = f"ark,s,cs: "

        # Decode for each of the acoustic scales
        run_shell(
            f"{latgen_faster_bin} " + f"--max-active={max_active} " +
            f"--max-mem={max_mem} " + f"--beam={beam} " +
            f"--lattice-beam={latbeam} " + f"--acoustic-scale={acwt} " +
            f"--allow-partial=true " +
            f"--word-symbol-table={graphdir}/words.txt " +
            f"{graphdir}/TLG.fst " +
            f"ark:{ck_data} \"ark:|gzip -c > {out_folder}/lat.{chnk_id}.gz\"")

        score(data, f"{graphdir}/words.txt", out_folder)
Example no. 7
def get_kaldi_feats(scp_file, out_dir, spk2utt, utt2spk):
    # Compute features
    fbank_config = "kaldi_decoding_scripts/conf/fbank.conf"
    compress = "true"
    name = "decoding"
    assert os.path.exists(fbank_config)
    out_scp = f"{out_dir}/raw_fbank_{name}.scp"
    out_ark = f"{out_dir}/raw_fbank_{name}.ark"
    run_shell(f"compute-fbank-feats --verbose=2 --config={fbank_config} scp,p:{scp_file} ark:- | \
    copy-feats --compress={compress} ark:- ark,scp:{out_ark},{out_scp}")

    # Compute normalization data
    cmvn_ark = f"{out_dir}/cmvn_{name}.ark"
    cmvn_scp = f"{out_dir}/cmvn_{name}.scp"
    run_shell(f"compute-cmvn-stats --spk2utt=ark:{spk2utt} scp:{out_scp} ark,scp:{cmvn_ark},{cmvn_scp}")

    # Load normalized
    feature_opts = f"apply-cmvn --utt2spk=ark:{utt2spk} ark:{cmvn_ark} ark:- ark:- | add-deltas --delta-order=0 ark:- ark:- |"
    features_loaded = \
        {k: m for k, m in
         kaldi_io.read_mat_ark(f'ark:copy-feats scp:{out_scp} ark:- | {feature_opts}')}

    return features_loaded
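
A usage sketch (all paths are placeholders) showing the shape of the return value, a dict that maps utterance ids to numpy feature matrices as yielded by kaldi_io.read_mat_ark:

# Hypothetical call; the scp/spk2utt/utt2spk files follow the usual Kaldi data-dir layout.
feats = get_kaldi_feats(scp_file="/data/test/wav.scp",
                        out_dir="/data/test/fbank",
                        spk2utt="/data/test/spk2utt",
                        utt2spk="/data/test/utt2spk")
for utt_id, mat in feats.items():
    print(utt_id, mat.shape)  # (num_frames, num_filterbank_bins)
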
Example no. 8
def build_kw_grammar_fst(keywords, words_file):
    if not isinstance(keywords[0], list):
        # expect each kw to be a list of words
        keywords = [kw.split(" ") for kw in keywords]

    with open(words_file, "r") as f:
        word_map = f.readlines()
        word_map = dict([line.strip().split(" ", 1) for line in word_map])
        assert EPS_SYM in word_map
        assert SIL_SYM in word_map
        assert UNK_SYM in word_map
        assert SPN_SYM in word_map

    keyword_fst_folder = "keyword_fsts"
    if os.path.isdir(keyword_fst_folder):
        shutil.rmtree(keyword_fst_folder)
    os.makedirs(keyword_fst_folder)

    write_fst([UNK_SYM], keyword_fst_folder, word_map)

    for kw in keywords:
        write_fst(kw, keyword_fst_folder, word_map)

    out_fst = f"{keyword_fst_folder}/UNION.fst"
    run_shell(f"fstcompile {keyword_fst_folder}/UNK.txt {out_fst}")

    for fst in glob(f"{keyword_fst_folder}/*.txt"):
        run_shell(f"fstcompile {fst} {fst[:-4]}.fst")
        run_shell(f"fstunion {fst[:-4]}.fst {out_fst} {out_fst}")

    run_shell(f"fstrmepsilon {out_fst} | fstarcsort --sort_type=ilabel - {out_fst}")

    # run_shell(f"fstprint --isymbols={words_file} --osymbols={words_file} {out_fst}")
    # run_shell(f"fstprint {out_fst}")
    # run_shell(
    #     f"fstdraw --isymbols={words_file} --osymbols={words_file} {out_fst} "
    #     + f"| dot -Tpdf -o {os.path.basename(out_fst)[:-4]}.pdf")

    return os.path.abspath(out_fst)
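
A usage sketch (keywords and the words.txt path are placeholders); keywords may be passed either as strings or as pre-split word lists, as handled at the top of the function:

# Hypothetical call: writes one FST per keyword plus an UNK entry and unions them.
union_fst = build_kw_grammar_fst(keywords=["TURN ON THE LIGHT", "STOP"],
                                 words_file="/path/to/graph_dir/words.txt")
print(union_fst)  # .../keyword_fsts/UNION.fst
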
Example no. 9
def score(data, lang_or_graph, dir, num_jobs, stage=0,
          model=None,
          min_lmwt=1,
          max_lmwt=10,
          mbr_scale=1.0
          ):
    decoing_scripts_folder = os.path.join(os.getcwd(), __name__.split(".")[0])  # 'kaldi_decoding_scripts'
    # end configuration section.

    if model is None:
        # assume model one level up from decoding dir.
        model = os.path.join(dir, "..", "final.mdl")

    KALDI_ROOT = os.environ['KALDI_ROOT']

    hubscr = f"{KALDI_ROOT}/tools/sctk/bin/hubscr.pl"
    assert os.path.exists(hubscr), "Cannot find scoring program hubscr"

    hubdir = os.path.dirname(hubscr)

    phonemap = "conf/phones.60-48-39.map"
    nj = num_jobs

    symtab = os.path.join(lang_or_graph, "words.txt")

    assert os.path.exists(symtab)
    assert os.path.exists(os.path.join(dir, "lat.1.gz"))
    assert os.path.exists(os.path.join(data, "text"))
    timit_norm_trans_script = os.path.join(decoing_scripts_folder, "local/timit_norm_trans.pl")
    assert os.path.exists(timit_norm_trans_script)
    assert os.access(timit_norm_trans_script, os.X_OK)
    int2sym_script = os.path.join(decoing_scripts_folder, "utils/int2sym.pl")
    assert os.path.exists(int2sym_script)
    assert os.access(int2sym_script, os.X_OK)
    phonemap = os.path.join(decoing_scripts_folder, phonemap)
    assert os.path.exists(phonemap)
    pl_cmd_script = os.path.join(decoing_scripts_folder, "utils/run.pl")
    assert os.path.exists(pl_cmd_script) #TODO remove run.pl command
    assert os.access(pl_cmd_script, os.X_OK)

    os.makedirs(os.path.join(dir, "scoring", "log"))

    # Map reference to 39 phone classes, the silence is optional (.):
    cmd = f"{timit_norm_trans_script} -i {data}/stm -m {phonemap} -from 48 -to 39 | \
     sed 's: sil: (sil):g' > {dir}/scoring/stm_39phn"
    run_shell(cmd)

    copy(os.path.join(data, "glm"), os.path.join(dir, "scoring", "glm_39phn"))

    if stage <= 0:
        # Get the phone-sequence on the best-path:
        for LMWT in range(min_lmwt, max_lmwt + 1):  # inclusive of max_lmwt, matching the LMWT=min:max range used below
            acoustic_scale = 1 / LMWT * mbr_scale
            cmd = f"{pl_cmd_script} JOB=1:{nj} {dir}/scoring/log/best_path.{LMWT}.JOB.log " + \
                  f"lattice-align-phones {model} \"ark:gunzip -c {dir}/lat.JOB.gz|\" ark:- \| " + \
                  f"lattice-to-ctm-conf --acoustic-scale={acoustic_scale:.8f} --lm-scale={mbr_scale} ark:- {dir}/scoring/{LMWT}.JOB.ctm || exit 1;"
            run_shell(cmd)
            run_shell(f"cat {dir}/scoring/{LMWT}.*.ctm | sort > {dir}/scoring/{LMWT}.ctm")
            run_shell(f"rm {dir}/scoring/{LMWT}.*.ctm")

    if stage <= 1:
        # Map ctm to 39 phone classes:
        cmd = f"{pl_cmd_script} LMWT={min_lmwt}:{max_lmwt} {dir}/scoring/log/map_ctm.LMWT.log " + \
              f"mkdir {dir}/score_LMWT ';' " + \
              f"cat {dir}/scoring/LMWT.ctm \| " + \
              f"{int2sym_script} -f 5 {symtab} \| " + \
              f"{timit_norm_trans_script} -i - -m {phonemap} -from 48 -to 39 '>' " + \
              f"{dir}/scoring/LMWT.ctm_39phn || exit 1"
        run_shell(cmd)

    # Score the set...
    cmd = f"{pl_cmd_script} LMWT={min_lmwt}:{max_lmwt} {dir}/scoring/log/map_ctm.LMWT.log " + \
          f"cp {dir}/scoring/stm_39phn {dir}/score_LMWT/stm_39phn '&&' cp {dir}/scoring/LMWT.ctm_39phn {dir}/score_LMWT/ctm_39phn '&&' " + \
          f"{hubscr} -p {hubdir} -V -l english -h hub5 -g {dir}/scoring/glm_39phn -r {dir}/score_LMWT/stm_39phn {dir}/score_LMWT/ctm_39phn || exit 1;"
    run_shell(cmd)
Example no. 10
def decode(
        alidir,
        data,
        graphdir,
        out_folder,
        featstrings,
        min_active=200,
        max_active=7000,
        max_mem=50000000,
        beam=13.0,
        latbeam=8.0,
        acwt=0.2,
        max_arcs=-1.0,
        scoring_type="std",  # none, std & basic so far
        scoring_opts=None,
        norm_vars=False,
        **kwargs):
    # TODO remove
    if scoring_opts == '"--min-lmwt 1 --max-lmwt 10"':
        scoring_opts = {"min_lmwt": 1, "max_lmwt": 10}
    if scoring_opts is None:
        scoring_opts = {"min_lmwt": 1, "max_lmwt": 10}
    assert isinstance(featstrings, list)
    num_threads = 1
    assert out_folder[-1] != '/'
    srcdir = os.path.dirname(out_folder)

    thread_string = "-parallel --num-threads={}".format(num_threads)

    if not os.path.isdir(os.path.join(out_folder, "log")):
        os.makedirs(os.path.join(out_folder, "log"))

    num_jobs = len(featstrings)

    with open(os.path.join(out_folder, "num_jobs"), "w") as f:
        f.write(str(num_jobs))

    assert os.path.exists(os.path.join(graphdir, "HCLG.fst"))

    JOB = 1
    for ck_data in featstrings:
        finalfeats = f"ark,s,cs: cat {ck_data} |"
        cmd = f'latgen-faster-mapped{thread_string} --min-active={min_active} ' + \
              f'--max-active={max_active} --max-mem={max_mem} ' + \
              f'--beam={beam} --lattice-beam={latbeam} ' + \
              f'--acoustic-scale={acwt} --allow-partial=true ' + \
              f'--word-symbol-table={graphdir}/words.txt {alidir}/final.mdl ' + \
              f'{graphdir}/HCLG.fst ' + \
              f'\"{finalfeats}\" \"ark:|gzip -c > {out_folder}/lat.{JOB}.gz\" &> {out_folder}/log/decode.{JOB}.log'
        run_shell(cmd)
        JOB += 1

    copy(os.path.join(alidir, "final.mdl"), srcdir)

    if scoring_type != "none":
        if scoring_type == "std":
            score(data, graphdir, out_folder, num_jobs, **scoring_opts)
        elif scoring_type == "basic":
            score_basic(data, graphdir, out_folder, num_jobs, **scoring_opts)
        elif scoring_type == "libri":
            score_libri(data, graphdir, out_folder, **scoring_opts)
        elif scoring_type == "just_transcript":
            return get_transcripts(data, graphdir, out_folder)
        else:
            raise ValueError(f"unknown scoring_type: {scoring_type}")
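
A usage sketch of the wrapper with placeholder paths; each featstrings entry is assumed to be a chunk file of log-likelihoods that can be cat'ed into a Kaldi ark pipe, and the scoring branch is selected via scoring_type:

# Hypothetical call; alidir must contain final.mdl and graphdir must contain
# HCLG.fst and words.txt, as used inside decode().
decode(alidir="/exp/tri4b_ali",
       data="/data/test",
       graphdir="/exp/tri4b/graph",
       out_folder="/exp/decode_test",
       featstrings=["/exp/loglikes/chunk_0.ark"],
       scoring_type="std",
       scoring_opts={"min_lmwt": 1, "max_lmwt": 10})
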
Example no. 11
def make_ctc_decoding_graph(
        keywords,
        phn2idx,
        tmpdir,
        lexicon_path=f"{KALDI_ROOT}/egs/librispeech/s5/data/local/lm/librispeech-lexicon.txt",
        draw_G_L_fsts=False):
    tmpdir = os.path.join(tmpdir, "tmp")
    if os.path.isdir(tmpdir):
        shutil.rmtree(tmpdir)
    os.makedirs(tmpdir)

    graph_dir = os.path.join(tmpdir, "graph_dir")
    if os.path.isdir(graph_dir):
        shutil.rmtree(graph_dir)
    os.makedirs(graph_dir)

    keywords = [kw.upper() for kw in keywords]

    #######

    # check_units_txt(units_txt)
    if not os.path.exists(graph_dir):
        os.makedirs(graph_dir)

    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    assert os.path.exists(lexicon_path)

    filter_lexicon(keywords, lexicon_path, f"{tmpdir}/lexicon.txt")

    # Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
    # But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
    run_shell(
        f"perl -ape 's/(\S+\s+)(.+)/${{1}}1.0\\t$2/;' < {tmpdir}/lexicon.txt > {tmpdir}/lexiconp.txt"
    )

    # Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
    # Without these symbols, determinization will fail.

    eesen_utils_path = os.path.join(os.getcwd(), "kws_decoder", "eesen_utils")

    assert os.path.exists(f"{eesen_utils_path}/add_lex_disambig.pl")
    assert os.access(f"{eesen_utils_path}/add_lex_disambig.pl", os.X_OK)
    ndisambig = int(
        run_shell(
            f"{eesen_utils_path}/add_lex_disambig.pl {tmpdir}/lexiconp.txt {tmpdir}/lexiconp_disambig.txt"
        ).strip())
    assert isinstance(ndisambig, int)
    ndisambig += 1

    with open(f"{tmpdir}/disambig.list", "w") as f:
        f.writelines([f"#{n}\n" for n in range(ndisambig)])

    # Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>, the actual labels (e.g.,
    # phonemes), and the disambiguation symbols.

    with open(f"{tmpdir}/units.list", "w") as f:
        for phn in phn2idx:
            # if "SIL" not in phn and "sil" not in phn:
            f.write(f"{phn.upper()}\n")
        # f.write(f"\n")

    # run_shell(f"cat {units_txt} | awk '{{print $1}}' > {tmpdir}/units.list")
    run_shell(
        f"(echo '<eps>'; echo '<blk>';) | cat - {tmpdir}/units.list {tmpdir}/disambig.list "
        + f"| awk '{{print $1 \" \" (NR-1)}}' > {graph_dir}/tokens.txt")

    with open(f"{graph_dir}/tokens.txt", "r") as f:
        token_lines = f.readlines()
    token_fst_txt = ctc_token_fst(token_lines)
    with open(f"{graph_dir}/tokens_fst.txt", "w") as f:
        f.writelines(token_fst_txt)

    run_shell(
        f"cat {graph_dir}/tokens_fst.txt | fstcompile --isymbols={graph_dir}/tokens.txt --osymbols={graph_dir}/tokens.txt \
       --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel > {graph_dir}/T.fst "
    )

    # Encode the words with indices. Will be used in lexicon and language model FST compiling.
    run_shell(f"""
    cat {tmpdir}/lexiconp.txt | awk '{{print $1}}' | sort | uniq  | awk '
      BEGIN {{
        print "<eps> 0";
      }} 
      {{
        printf("%s %d\\n", $1, NR);
      }}
      END {{
        printf("#0 %d\\n", NR+1);
      }}' > {graph_dir}/words.txt || exit 1;
    """)

    # Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
    token_disambig_symbol = int(
        run_shell(
            f"grep \#0 {graph_dir}/tokens.txt | awk '{{print $2}}'").strip())
    word_disambig_symbol = int(
        run_shell(
            f"grep \#0 {graph_dir}/words.txt | awk '{{print $2}}'").strip())

    # TODO why does piping not work?
    assert os.path.exists(f"{eesen_utils_path}/make_lexicon_fst.pl")
    assert os.access(f"{eesen_utils_path}/make_lexicon_fst.pl", os.X_OK)
    # the disambiguation symbol is quoted so the shell does not treat '#' as a comment start
    lexicon_fst = run_shell(
        f"{eesen_utils_path}/make_lexicon_fst.pl --pron-probs {tmpdir}/lexiconp_disambig.txt 0 \"SIL\" \"#{ndisambig}\""
    )

    run_shell(
        f"echo \"{lexicon_fst}\" | " +
        f"fstcompile --isymbols={graph_dir}/tokens.txt --osymbols={graph_dir}/words.txt "
        + f"--keep_isymbols=false --keep_osymbols=false | " +
        f"fstaddselfloops  \"echo {token_disambig_symbol} |\" \"echo {word_disambig_symbol} |\" | "
        + f"fstarcsort --sort_type=olabel > {graph_dir}/L.fst")

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={graph_dir}/tokens.txt " +
            f"--osymbols={graph_dir}/words.txt {graph_dir}/L.fst | dot -Tpdf -o /mnt/data/drawn_graphs/L.pdf"
        )

    ########## MkGraph

    grammar_fst_path = build_kw_grammar_fst(
        keywords, words_file=f"{graph_dir}/words.txt")
    shutil.copy(grammar_fst_path, f"{graph_dir}/G.fst")

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={graph_dir}/words.txt " +
            f"--osymbols={graph_dir}/words.txt {graph_dir}/G.fst | dot -Tpdf -o /mnt/data/drawn_graphs/G.pdf"
        )

    run_shell(
        f"fsttablecompose {graph_dir}/L.fst {graph_dir}/G.fst | fstdeterminizestar --use-log=true | "
        f"fstminimizeencoded | fstarcsort --sort_type=ilabel > {graph_dir}/LG.fst"
    )
    run_shell(
        f"fsttablecompose {graph_dir}/T.fst {graph_dir}/LG.fst > {graph_dir}/TLG.fst"
    )

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={graph_dir}/tokens.txt " +
            f"--osymbols={graph_dir}/tokens.txt {graph_dir}/T.fst | dot -Tpdf -o /mnt/data/drawn_graphs/T.pdf"
        )

        run_shell(
            f"fstdraw --isymbols={graph_dir}/tokens.txt " +
            f"--osymbols={graph_dir}/words.txt {graph_dir}/TLG.fst | dot -Tpdf -o /mnt/data/drawn_graphs/TLG.pdf"
        )

    return os.path.abspath(graph_dir)
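
The directory returned here contains TLG.fst and words.txt, which is exactly what the CTC decoder in Example no. 1 expects (it asserts that graph_path ends with TLG.fst and words_path with words.txt). A hedged end-to-end sketch with placeholder inputs:

# Hypothetical glue code: build the keyword graph, then decode CTC posteriors with it.
graph_dir = make_ctc_decoding_graph(keywords=["left", "right"],
                                    phn2idx={"AA": 0, "AE": 1},  # placeholder phone map
                                    tmpdir="/tmp/kws_graph")
results = decode_ctc(words_path=f"{graph_dir}/words.txt",
                     graph_path=f"{graph_dir}/TLG.fst",
                     out_folder="/tmp/kws_decode",
                     featstrings=["/tmp/posteriors/chunk_0.ark"])
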
Example no. 12
def get_transcripts(words_path, workdir):
    decoing_scripts_folder = os.path.join(
        os.getcwd(),
        __name__.split(".")[0])  # 'kaldi_decoding_scripts']
    int2sym_script = os.path.join(decoing_scripts_folder, "utils/int2sym.pl")

    assert len(glob(f"{workdir}/lat.*.gz")) > 0
    assert os.path.exists(int2sym_script)
    assert os.access(int2sym_script, os.X_OK)

    if not os.path.isdir(os.path.join(workdir, "scoring", "log")):
        os.makedirs(os.path.join(workdir, "scoring", "log"))

    for file in glob(f"{workdir}/lat.*.gz"):
        assert os.stat(
            file
        ).st_size > 20, f"{file} seems to be empty with size of {os.stat(file).st_size} bytes"

    # TODO think about if each of these scalings and penalties make sense
    # TODO look into lattice-to-post --acoustic-scale=0.1 ark:1.lats ark:- | \
    #     gmm-acc-stats 10.mdl "$feats" ark:- 1.acc for confidence/sensitivity
    # TODO look into   lattice-to-fst --lm-scale=0.0 --acoustic-scale=0.0 ark:1.lats ark:1.words
    # to visualize the lattice and how the pruned fst looks like

    # cmd = f"lattice-scale --inv-acoustic-scale={language_model_weigth} " \
    #       + f"\"ark:gunzip -c {workdir}/lat.*.gz |\" ark:- | " \
    #       + f"lattice-add-penalty --word-ins-penalty={word_ins_penalty} ark:- ark:- | " \
    #       + f"lattice-best-path --word-symbol-table={words_path} ark:- " \
    #       + f"ark,t:{workdir}/scoring/{language_model_weigth}.{word_ins_penalty}.tra"
    # run_shell(cmd)

    ali_out_file = "/dev/null"
    transcript_out_file = f"{workdir}/scoring/keywords.tra"
    lm_posterior_out_file = f"{workdir}/scoring/keywords.lm_post"
    acoustic_posterior_out_file = f"{workdir}/scoring/keywords.ac_post"

    # plt =True
    # if plt:
    # only for word sequences, not useful for KWS
    #     run_shell(f"gunzip -c {workdir}/lat.*.gz | lattice-to-fst ark:- \"scp,p:echo $utterance_id $tmp_dir/$utterance_id.fst|\"")
    # run_shell(f"gunzip -c {workdir}/lat.*.gz | lattice-to-fst ark:- ")

    run_shell(
        f"gunzip -c {workdir}/lat.*.gz | " +
        f"lattice-to-nbest ark:- ark,t:- | " +
        f"nbest-to-linear ark:- ark:{ali_out_file} ark,t:{transcript_out_file} "
        + f"ark,t:{lm_posterior_out_file} ark,t:{acoustic_posterior_out_file}")
    transcripts = int2sym(transcript_out_file, words_path)
    with open(lm_posterior_out_file, "r") as f:
        lm_posterior = f.readlines()
    lm_posterior = [
        line.strip().split(" ") for line in lm_posterior if line != ""
    ]
    lm_posterior = [(sample_id[:-2] if sample_id.endswith("-1") else sample_id,
                     float(posterior))
                    for sample_id, posterior in lm_posterior]

    with open(acoustic_posterior_out_file, "r") as f:
        acoustic_posterior = f.readlines()
    # acoustic_posterior = [line.split(" ") for line in acoustic_posterior.split("\n") if line != ""]

    acoustic_posterior = [
        line.strip().split(" ") for line in acoustic_posterior if line != ""
    ]
    acoustic_posterior = [
        (sample_id[:-2] if sample_id.endswith("-1") else sample_id,
         float(posterior)) for sample_id, posterior in acoustic_posterior
    ]

    run_shell(f"gunzip -c {workdir}/lat.*.gz | " +
              f"lattice-best-path --word-symbol-table={words_path} ark:- " +
              f"ark,t:{workdir}/scoring/keywords_best.tra")
    transcripts_best = int2sym(f"{workdir}/scoring/keywords_best.tra",
                               words_path)

    lattice_confidence = run_shell(f"gunzip -c {workdir}/lat.*.gz | " \
                                   + f"lattice-confidence ark:- ark,t:-")
    lattice_confidence = [
        line.strip().split(" ", 1) for line in lattice_confidence.split("\n")
        if line != ""
    ]
    lattice_confidence = [(sample_id, float(confidence))
                          for sample_id, confidence in lattice_confidence]

    # run_shell(f"cat {workdir}/scoring/keywords.tra")

    # run_shell(f"gunzip -c {workdir}/lat.*.gz | " \
    #           + f"lattice-1best ark:- ark,t:{workdir}/scoring/keywords.lat")

    # ali_model = '/mnt/data/pytorch-kaldi/tmp/graph_final/final.mdl'
    # run_shell(f"gunzip -c {workdir}/lat.*.gz | " \
    #           + f"lattice-to-nbest ark:- ark,t:- | nbest-to-linear ark:- ark,t:- | " +
    #           f" lattice-to-fst ark:- \"scp,p:echo tree_fc2411fe_nohash_2 /tmp/kaldi.UIEL/tree_fc2411fe_nohash_2.fst|\" ")

    # nbest-to-linear ark,t:1.ali 'ark,t:1.tra' ark,t:1.lm ark,t:1.ac

    # run_shell(f"fstdraw /tmp/kaldi.UIEL/tree_fc2411fe_nohash_2.fst")

    # run_shell(f"cat {workdir}/scoring/keywords.lat | " \
    #           + f" lattice-to-fst")

    # TODO lattice-confidence
    # Compute sentence-level lattice confidence measures for each lattice.
    # The output is simply the difference between the total costs of the best and
    # second-best paths in the lattice (or a very large value if the lattice
    # had only one path).  Caution: this is not necessarily a very good confidence
    # measure.  You almost certainly want to specify the acoustic scale.
    # If the input is a state-level lattice, you need to specify
    # --read-compact-lattice=false, or the confidences will be very small
    # (and wrong).  You can get word-level confidence info from lattice-mbr-decode.

    return transcripts_best, transcripts, lattice_confidence, lm_posterior, acoustic_posterior
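
The five return values are parallel per-utterance structures: transcripts_best and transcripts map utterance ids to word lists, while lattice_confidence, lm_posterior and acoustic_posterior are lists of (utterance_id, float) pairs. A short sketch (assumed call with placeholder paths) of merging them, mirroring what decode_ctc and decode do with this output:

# Hypothetical post-processing of the returned structures.
best, hyps, confidences, lm_post, ac_post = get_transcripts(
    words_path="/path/to/graph_dir/words.txt",
    workdir="/path/to/decode_out/exp_files")
confidences = dict(confidences)
for utt_id, words in dict(hyps).items():
    print(utt_id, " ".join(words), confidences[utt_id])
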
Example no. 13
def make_kaldi_decoding_graph(
        keywords,
        out_dir,
        train_graph_dir=f"{KALDI_ROOT}/egs/librispeech/s5/exp/tri4b",
        train_dict_folder=f"{KALDI_ROOT}/egs/librispeech/s5/data/local/dict_nosp",
        lexicon_path=f"{KALDI_ROOT}/egs/librispeech/s5/data/local/lm/librispeech-lexicon.txt",
        draw_G_L_fsts=True):
    libri_lexicon, lang_in_tmp, lang_tmp, final_lang_dir = \
        check_andsetup__dirs(out_dir, train_graph_dir, train_dict_folder, lexicon_path)

    keywords = [kw.upper() for kw in keywords]

    if not os.path.exists(os.path.join(out_dir, "utils/prepare_lang.sh")):
        os.symlink(f"{KALDI_ROOT}/egs/wsj/s5/utils",
                   os.path.join(out_dir, "utils"))
        os.symlink(f"{KALDI_ROOT}/egs/wsj/s5/steps",
                   os.path.join(out_dir, "steps"))

    filter_lexicon(keywords, libri_lexicon, out_folder=lang_in_tmp)

    # TODO explore unk fst

    # unk_fst_dir = os.path.join(out_dir, "unk_fst")
    # if not os.path.isdir(unk_fst_dir):
    #     os.makedirs(unk_fst_dir)

    #####Optional  UNK FST
    # in librispeech_workdir
    # or reduce num ngram option
    ## using bigram only and num-ngrams is only 3336
    # num_extra_ngrams=1000

    # run_shell(
    #     f"{out_dir}/utils/lang/make_unk_lm.sh --num_extra_ngrams 1000 --ngram-order 2 --cmd utils/run.pl {lang_in_tmp} {unk_fst_dir}")

    # TODO alternative simple phone loop

    cwd = os.getcwd()
    os.chdir(out_dir)  # necessary because the kaldi scripts expect it

    if not os.path.exists("path.sh"):
        with open("path.sh", "w") as f:
            f.writelines("\n".join([
                """export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH""",
                """[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1""",
                """. $KALDI_ROOT/tools/config/common_path.sh""",
                """export LC_ALL=C""", ""
            ]))

    prepare_lang_script = f"{out_dir}/utils/prepare_lang.sh"
    # run_shell(f"{prepare_lang_script} --unk-fst {unk_fst_dir}/unk_fst.txt {lang_in_tmp} \"{unk_sym}\" {lang_tmp} {final_lang_dir}")
    run_shell(
        f"{prepare_lang_script} {lang_in_tmp} \"{UNK_SYM}\" {lang_tmp} {final_lang_dir}"
    )

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={final_lang_dir}/phones.txt " +
            f"--osymbols={final_lang_dir}/words.txt {final_lang_dir}/L.fst | dot -Tpdf -o{out_dir}/L.pdf"
        )

    grammar_fst_path = build_kw_grammar_fst(
        keywords, words_file=f"{final_lang_dir}/words.txt")
    shutil.copy(grammar_fst_path, f"{final_lang_dir}/G.fst")

    if draw_G_L_fsts:
        run_shell(
            f"fstdraw --isymbols={final_lang_dir}/words.txt " +
            f"--osymbols={final_lang_dir}/words.txt {final_lang_dir}/G.fst | dot -Tpdf -o{out_dir}/G.pdf"
        )

    run_shell(
        f"{out_dir}/utils/validate_lang.pl --skip-determinization-check {final_lang_dir}"
    )

    final_graph_dir = os.path.join(out_dir, "graph_final")
    if not os.path.isdir(final_graph_dir):
        os.makedirs(final_graph_dir)

    run_shell(
        f"{out_dir}/utils/mkgraph.sh {final_lang_dir} {train_graph_dir} {final_graph_dir}"
    )

    if not os.path.exists(os.path.join(final_graph_dir, "final.mdl")):
        os.symlink(f"{train_graph_dir}/final.mdl",
                   os.path.join(final_graph_dir, "final.mdl"))

    os.chdir(cwd)

    return os.path.abspath(final_graph_dir)
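
utils/mkgraph.sh leaves HCLG.fst in graph_final (and, in the standard Kaldi script, copies words.txt next to it), and final.mdl is symlinked alongside, which matches what the lattice decoder in Example no. 16 expects. A hedged chaining sketch with placeholder inputs:

# Hypothetical glue code between the graph builder and the HCLG lattice decoder.
graph_dir = make_kaldi_decoding_graph(keywords=["left", "right"],
                                      out_dir="/tmp/kws_kaldi_graph")
results = decode(alignment_model_path=f"{graph_dir}/final.mdl",
                 words_path=f"{graph_dir}/words.txt",
                 graph_path=f"{graph_dir}/HCLG.fst",
                 out_folder="/tmp/kws_kaldi_decode",
                 featstrings=["/tmp/loglikes/chunk_0.ark"])
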
Example no. 14
def score(
    data,
    words_path,
    dir,
    word_ins_penalty=None,
    min_acwt=1,
    max_acwt=20,
    acwt_factor=0.05  # the scaling factor for the acoustic scale. The scaling factor for acoustic likelihoods
    # needs to be 0.5 ~1.0. However, the job submission script can only take integers as the
    # job marker. That's why we set the acwt to be integers (5 ~ 10), but scale them with 0.1
    # when they are actually used.
):
    if word_ins_penalty is None:
        word_ins_penalty = [0.0, 0.5, 1.0, 1.5, 2.0]
    # decoing_scripts_folder = os.path.join(os.getcwd(), __name__.split(".")[0])  # 'kaldi_decoding_scripts'
    # pl_cmd_script = os.path.join(decoing_scripts_folder, "utils/run.pl")
    # assert os.path.exists(pl_cmd_script)
    # assert os.access(pl_cmd_script, os.X_OK)
    # symtab = os.path.join(lang_or_graph, "words.txt")
    # assert os.path.exists(symtab)
    # assert os.path.exists(os.path.join(dir, "lat.1.gz"))
    # assert os.path.exists(os.path.join(data, "text"))
    # int2sym_script = os.path.join(decoing_scripts_folder, "utils/int2sym.pl")
    # assert os.path.exists(int2sym_script)
    # assert os.access(int2sym_script, os.X_OK)
    # if not os.path.isdir(os.path.join(dir, "scoring", "log")):
    #     os.makedirs(os.path.join(dir, "scoring", "log"))

    # --cmd "$decode_cmd" --nj 10 --beam 17.0 --lattice_beam 8.0 --max-active 5000 --acwt 0.9 \
    #                   --skip true --splice true --splice-opts "--left-context=1 --right-context=1" --skip-frames 3 --skip-offset 1 \
    #                                   ${lang_dir}_test_${lm_suffix} $exp_base/$test $train_dir/decode_${test}_${lm_suffix} || exit 1;

    # words_path = "wrds.txt"

    if not os.path.exists(f"{dir}/scoring"):
        os.makedirs(f"{dir}/scoring")

    assert os.environ['EESEN_ROOT']
    lattice_scale_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-scale"
    lattice_add_penalty_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-add-penalty"
    lattice_best_path_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-best-path"

    # for wip in word_ins_penalty:
    #     for ACWT in range(min_acwt, max_acwt):
    #         run_shell(
    #             f"{lattice_scale_bin} --acoustic-scale={ACWT} --ascale-factor={acwt_factor}  \"ark:gunzip -c {dir}/lat.*.gz|\" ark:- | "
    #             + f"{lattice_add_penalty_bin} --word-ins-penalty={wip} ark:- ark:- |"
    #             + f"{lattice_best_path_bin} --word-symbol-table={words_path} ark:- ark,t:{dir}/scoring/{ACWT}_{wip}_tra")

    # run_shell(f"cat {data}/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > {dir}/scoring/test_filt.txt")
    run_shell(
        f"cat {data}/text | sed 's:<UNK>::g' | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > {dir}/scoring/text_filt"
    )

    compute_wer_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/compute-wer"
    lattice_1best_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/lattice-1best"
    nbest_to_ctm_bin = f"{os.environ['EESEN_ROOT']}/src/decoderbin/nbest-to-ctm"

    int2sym_script = os.path.join(os.getcwd(),
                                  "kaldi_decoding_scripts/utils/int2sym.pl")
    assert os.path.exists(int2sym_script)

    # for wip in word_ins_penalty:
    #     for ACWT in range(min_acwt, max_acwt):
    #         run_shell(f"cat {dir}/scoring/{ACWT}_{wip}_tra | {int2sym_script} -f 2- {words_path} | "
    #                   + f" sed 's:<UNK>::g' | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' |"
    #                   + f"{compute_wer_bin} --text --mode=present ark:{dir}/scoring/text_filt  ark,p:- {dir}/details_{ACWT}_{wip} >& {dir}/wer_{ACWT}_{wip}")

    convert_ctm_script = os.path.join(
        os.getcwd(), "kws_decoder/eesen_utils/convert_ctm.pl")
    assert os.path.exists(convert_ctm_script)
    name = "test_name_"
    # for wip in word_ins_penalty:
    for ACWT in range(min_acwt, max_acwt):
        if not os.path.exists(f"{dir}/score_{ACWT}/"):
            os.makedirs(f"{dir}/score_{ACWT}/")

        run_shell(
            f"{lattice_1best_bin} --acoustic-scale={ACWT} --ascale-factor={acwt_factor} \"ark:gunzip -c {dir}/lat.*.gz|\" ark:- | "
            + f"{nbest_to_ctm_bin} ark:- - | " +
            f"{int2sym_script} -f 5 {words_path}  | " +
            f"{convert_ctm_script} {data}/segments {data}/reco2file_and_channel"
        )

        run_shell(
            f"{lattice_1best_bin} --acoustic-scale={ACWT} --ascale-factor={acwt_factor} \"ark:gunzip -c {dir}/lat.*.gz|\" ark:- | "
            + f"{nbest_to_ctm_bin} ark:- - | " +
            f"{int2sym_script} -f 5 {words_path}  | " +
            f"{convert_ctm_script} {data}/segments {data}/reco2file_and_channel "
            + f"> {dir}/score_{ACWT}/{name}.ctm")
Example no. 15
def save_checkpoint(
        epoch,
        global_step,
        model,
        optimizers,
        lr_schedulers,
        seq_len_scheduler,
        config,
        checkpoint_dir,  # monitor_best=None,
        dataset_sampler_state=None,
        save_best=None):
    """
    Saving checkpoints

    :param epoch: current epoch number
    :param log: logging information of the epoch
    :param save_best: if True, rename the saved checkpoint to 'model_best.pth'
    """
    assert dataset_sampler_state != save_best, "save_best is only done at the end of an epoch"

    # TODO figure out why shutil.disk_usage gives different result to df

    # available_disk_space_in_gb = shutil.disk_usage(checkpoint_dir).free * 1e-9
    available_disk_space_in_gb = run_shell(f"df -h {checkpoint_dir}")
    available_disk_space_in_gb = int(
        available_disk_space_in_gb.split("\n")[1].split()[3][:-1])  # 'Avail' column, e.g. '42G' -> 42

    assert available_disk_space_in_gb > 5, \
        f"available_disk_space_in_gb of {available_disk_space_in_gb} is lower than 5GB. " \
        + f"Aborting the save in order to not corrupt the model files"

    torch_rng_state, python_rng_state, numpy_rng_state = get_rng_state()

    state = {
        'epoch': epoch,
        'global_step': global_step,
        'state_dict': model.state_dict(),
        'optimizers': {
            opti_name: optimizers[opti_name].state_dict()
            for opti_name in optimizers
        },
        'lr_schedulers': {
            lr_sched_name: lr_schedulers[lr_sched_name].state_dict()
            for lr_sched_name in lr_schedulers
        },
        'seq_len_scheduler': seq_len_scheduler,
        'dataset_sampler_state': dataset_sampler_state,
        # 'monitor_best': monitor_best,
        'config': config,
        'torch_rng_state': torch_rng_state,
        'python_rng_state': python_rng_state,
        'numpy_rng_state': numpy_rng_state,
    }
    if dataset_sampler_state is not None:
        # Intermediate save during training epoch
        all_previous_checkpoints = glob(
            os.path.join(checkpoint_dir, 'checkpoint_e*_gs*.pth'))
        checkpoint_name = f'checkpoint_e{epoch}_gs{global_step}.pth'

        filename = os.path.join(checkpoint_dir, checkpoint_name)
        torch.save(state, filename)
        logger.info(f"Saved checkpoint: {filename}")

        for old_checkpoint in all_previous_checkpoints:
            if os.path.exists(old_checkpoint):
                os.remove(old_checkpoint)
                logger.info(f"Removed old checkpoint: {old_checkpoint} ")

    else:
        checkpoint_name = f'checkpoint_e{epoch}.pth'

        filename = os.path.join(checkpoint_dir, checkpoint_name)
        torch.save(state, filename)
        logger.info(f"Saved checkpoint: {filename}")

        if epoch >= 3:
            filename_prev = os.path.join(checkpoint_dir,
                                         f'checkpoint_e{epoch - 3}.pth')
            if os.path.exists(filename_prev):
                os.remove(filename_prev)
                logger.info(f"Removed old checkpoint: {filename_prev} ")

        if save_best is not None and save_best:
            checkpoint_name = f'checkpoint_best.pth'

            best_path = os.path.join(checkpoint_dir, checkpoint_name)
            torch.save(state, best_path)
            logger.info(f"Saved current best: {checkpoint_name}")

    # available_disk_space_in_gb = shutil.disk_usage(checkpoint_dir).free * 1e-9
    available_disk_space_in_gb = run_shell(f"df -h {checkpoint_dir}")
    available_disk_space_in_gb = int(
        available_disk_space_in_gb.split("\n")[1].split()[3][:-1])  # 'Avail' column, e.g. '42G' -> 42
    assert available_disk_space_in_gb > 5, \
        f"available_disk_space_in_gb of {available_disk_space_in_gb} is lower than 5GB. " \
        + f"Aborting since the next checkpoint save would probably fail for lack of space -> no wasted training compute"
Example no. 16
def decode(alignment_model_path,
           words_path,
           graph_path,
           out_folder,
           featstrings,
           min_active=20,
           max_active=700,
           max_mem=500000,
           beam=5.0,
           latbeam=5.0,
           acwt=1.0,
           max_arcs=-1.0,
           **kwargs):
    out_folder = f"{out_folder}/exp_files"
    assert isinstance(featstrings, list)
    num_threads = 4  # TODO more threads
    assert out_folder[-1] != '/'
    srcdir = os.path.dirname(out_folder)

    thread_string = f"-parallel --num-threads={num_threads}"

    if not os.path.isdir(os.path.join(out_folder, "log")):
        os.makedirs(os.path.join(out_folder, "log"))

    assert graph_path.endswith("HCLG.fst")
    assert words_path.endswith("words.txt")
    assert alignment_model_path.endswith("final.mdl")

    # TODO should we really just delete these files?
    if len(glob(f"{out_folder}/lat.*.gz")) > 0:
        for file in glob(f"{out_folder}/lat.*.gz"):
            os.remove(file)
    if len(glob(f"{out_folder}/log/decode.*.log")) > 0:
        for file in glob(f"{out_folder}/log/decode.*.log"):
            os.remove(file)

    chnk_id = 0
    for ck_data in tqdm(featstrings, desc="lattice generation chunk:"):
        assert not os.path.exists(f"{out_folder}/lat.{chnk_id}.gz")
        assert not os.path.exists(f"{out_folder}/log/decode.{chnk_id}.log")
        finalfeats = f"ark,s,cs: cat {ck_data} |"
        cmd = f'latgen-faster-mapped{thread_string} --min-active={min_active} ' \
              + f'--max-active={max_active} --max-mem={max_mem} ' \
              + f'--beam={beam} --lattice-beam={latbeam} ' \
              + f'--acoustic-scale={acwt}' \
              + f' --allow-partial=true ' \
              + f'--word-symbol-table={words_path} {alignment_model_path} ' \
              + f'{graph_path} ' \
              + f'\"{finalfeats}\" \"ark:|gzip -c > {out_folder}/lat.{chnk_id}.gz\"'
        run_shell(cmd)
        chnk_id += 1

        # TODO display the generated lattice for keywords

    copy(alignment_model_path, srcdir)
    transcripts_best, transcripts, lattice_confidence, lm_posterior, acoustic_posterior = get_transcripts(
        words_path, out_folder)

    for t in transcripts_best:
        assert transcripts_best[t] == transcripts[
            t], f"{t}: {transcripts_best[t]} =!= {transcripts[t]}"

    assert len(transcripts) == len(lattice_confidence)
    transcripts = dict(transcripts)
    lattice_confidence = dict(lattice_confidence)
    lm_posterior = dict(lm_posterior)
    acoustic_posterior = dict(acoustic_posterior)
    result = {}
    for sample_id in transcripts:
        _lattice_confidence = lattice_confidence[sample_id] \
            if 10000000000.0 != lattice_confidence[sample_id] else float("inf")
        _lm_posterior = lm_posterior[sample_id]  # TODO normalize
        _acoustic_posterior = acoustic_posterior[sample_id]  # TODO normalize
        result[sample_id] = (transcripts[sample_id], _lattice_confidence,
                             _lm_posterior, _acoustic_posterior)

    return result
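
The returned mapping has the same layout as in Example no. 1: utterance id -> (word list, lattice confidence, LM posterior, acoustic posterior). A closing sketch (hypothetical keyword and confidence threshold) of filtering it for keyword hits:

# Hypothetical keyword-spotting filter on the decode() output; the threshold is illustrative.
results = decode(alignment_model_path="/graph/final.mdl",
                 words_path="/graph/words.txt",
                 graph_path="/graph/HCLG.fst",
                 out_folder="/tmp/decode",
                 featstrings=["/tmp/loglikes/chunk_0.ark"])
hits = {utt: conf for utt, (words, conf, _lm, _ac) in results.items()
        if "LEFT" in words and conf > 2.0}
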