def create_basic_work_dir_structure(data_dir, wav16_dir, mfcc_dir, work_dir,
                                    language_model_dir, kaldi_root):
    """Set up the directories and symlinks of a Kaldi work dir.

    Calls ``misc.mkdirs`` for ``<data_dir>/local/dict``, ``wav16_dir`` and
    ``mfcc_dir``, then calls ``misc.symlink`` three times: once for
    ``language_model_dir`` and ``<work_dir>/lm``, and once each for the
    ``steps`` and ``utils`` directories of ``<kaldi_root>/egs/wsj/s5`` and
    their same-named counterparts under ``work_dir``.
    """
    # FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir)
    for directory in ('%s/local/dict' % data_dir, wav16_dir, mfcc_dir):
        misc.mkdirs(directory)

    misc.symlink(language_model_dir, '%s/lm' % work_dir)

    # steps/ and utils/ both come from the WSJ s5 recipe directory.
    wsj_recipe_dir = '%s/egs/wsj/s5' % kaldi_root
    for script_dir in ('steps', 'utils'):
        misc.symlink(wsj_recipe_dir + '/' + script_dir,
                     '%s/%s' % (work_dir, script_dir))
# Example #2
# 0
        continue

    # Walk SRCDIR/<subset>/<speaker>/<book_id> and build one destination
    # folder under DESTDIR per (speaker, book_id) pair.
    for speaker in os.listdir(SRCDIR + '/' + subset):
        for book_id in os.listdir(SRCDIR + '/' + subset + '/' + speaker):

            folder = 'librispeech%s-%s' % (speaker, book_id)
            dstdir = '%s/%s' % (DESTDIR, folder)

            misc.mkdirs('%s/flac' % dstdir)
            misc.mkdirs('%s/etc' % dstdir)

            # Every line of the source transcript is written unchanged to
            # <dstdir>/etc/prompts-original.
            promptsfn = '%s/etc/prompts-original' % dstdir
            transfn = '%s/%s/%s/%s/%s-%s.trans.txt' % (
                SRCDIR, subset, speaker, book_id, speaker, book_id)

            with codecs.open(promptsfn, 'w', 'utf8') as promptsf:
                with codecs.open(transfn, 'r', 'utf8') as transf:
                    for line in transf:
                        # The first whitespace-separated token of a line is
                        # used as the base name of the matching .flac file.
                        # NOTE(review): a blank line would make parts[0]
                        # raise IndexError -- confirm the transcripts never
                        # contain one.
                        parts = line.split()
                        promptsf.write(line)

                        flac_src = '%s/%s/%s/%s/%s.flac' % (
                            SRCDIR, subset, speaker, book_id, parts[0])
                        flac_dst = '%s/flac/%s.flac' % (dstdir, parts[0])

                        logging.debug(' %s -> %s' % (flac_src, flac_dst))

                        # The audio is linked rather than copied.
                        misc.symlink(flac_src, flac_dst)

            logging.debug('%s written.' % promptsfn)
#

# Start from an empty work dir: remove any previous one, then recreate it.
# NOTE(review): WORKDIR is interpolated unquoted into a shell command --
# confirm it never contains spaces or shell metacharacters.
cmd = 'rm -rf %s' % WORKDIR
logging.info(cmd)
os.system(cmd)
misc.mkdirs(WORKDIR)

#
# copy scripts
#

misc.copy_file ('data/src/speech/kaldi-run-segmentation.sh', '%s/run-segmentation.sh' % WORKDIR)

misc.copy_file ('data/src/speech/kaldi-cmd.sh', '%s/cmd.sh' % WORKDIR)
# path.sh is rendered from a template with kaldi_root passed in.
misc.render_template ('data/src/speech/kaldi-path.sh.template', '%s/path.sh' % WORKDIR, kaldi_root=kaldi_root)
# steps/ and utils/ are linked from the Kaldi WSJ s5 recipe directory.
misc.symlink ('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % WORKDIR)
misc.symlink ('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % WORKDIR)

#
# create skeleton dst model
#

misc.mkdirs ('%s/exp'  % WORKDIR)

# Copy the model, data and conf trees of modelfn into the work dir.
# The exit status of each os.system() call is not checked.
cmd = "cp -r '%s/model' '%s/exp/tri2b_adapt'" % (modelfn, WORKDIR)
logging.info(cmd)
os.system(cmd)
cmd = "cp -r '%s/data'  '%s/data'" % (modelfn, WORKDIR)
logging.info(cmd)
os.system(cmd)
# NOTE(review): this command is only built here; presumably it is logged
# and executed right after this excerpt -- confirm.
cmd = "cp -r '%s/conf'  '%s/conf'" % (modelfn, WORKDIR)
#

# Read the settings this script needs from the [speech] section of
# .speechrc.
config = misc.load_config('.speechrc')

kaldi_root = config.get("speech", "kaldi_root")
wav16_dir = config.get("speech", "wav16")

#
# create basic work dir structure
#

# FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)
# NOTE(review): the first argument is a relative path climbing five levels
# up; presumably it is resolved relative to the link's location -- confirm
# against the actual depth of work_dir.
misc.symlink('../../../../../%s' % language_model_dir, '%s/lm' % work_dir)
# steps/ and utils/ are linked from the Kaldi WSJ s5 recipe directory.
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
# generate speech and text corpora
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dictionary)
logging.info("loading lexicon...done.")

# add_all is True exactly when a sequitur model path was supplied.
add_all = bool(sequitur_model_path)
# Example #5
# 0
# Load the transcripts for the selected language and split them into the
# full set plus train and test subsets; options.debug is passed as the
# split's limit.
logging.info ( "loading transcripts...")
transcripts = Transcripts(lang=options.lang)
ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all)
logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#


misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)

# steps/ and utils/ are linked from the Kaldi WSJ s5 recipe directory.
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
# kaldi data part
#

def export_kaldi_data (destdirfn, tsdict):

    global wav16_dir

    logging.info ( "Exporting to %s..." % destdirfn)

    misc.mkdirs(destdirfn)

    with open(destdirfn+'wav.scp','w') as wavscpf,  \
Exemple #6
0
def _copy_named_files(src_dir, dst_dir, file_names):
    """Copy each name in *file_names* from *src_dir* to *dst_dir*, keeping the name."""
    for file_name in file_names:
        misc.copy_file('%s/%s' % (src_dir, file_name),
                       '%s/%s' % (dst_dir, file_name))


def _run_bash_script(work_dir, script_name, *script_args):
    """Run ``bash <script_name> <script_args...>`` with *work_dir* as cwd.

    The command is passed as an argument list, so neither *work_dir* nor the
    script arguments go through shell word splitting. A non-zero exit status
    is logged and returned; it is not raised.
    """
    import subprocess  # local import: the file's import block is not visible here

    argv = ['bash', script_name] + list(script_args)
    logging.info('running %s in %s', ' '.join(argv), work_dir)
    status = subprocess.call(argv, cwd=work_dir)
    if status != 0:
        logging.error('%s exited with status %d', script_name, status)
    return status


def kaldi_adapt_lm(kaldi_root, src_model_dir, lm_fn, work_dir, dst_model_name):
    """Prepare *work_dir* for a language model adaptation and run it.

    Copies the dictionary, acoustic model, optional extractor and feature
    configuration from *src_model_dir* into *work_dir*, copies *lm_fn* to
    ``<work_dir>/lm.arpa``, installs the run scripts from the ``templates``
    directory next to this module, and then runs ``run-adaptation.sh``
    followed by ``model-dist.sh <dst_model_name>`` inside *work_dir*.

    Args:
        kaldi_root: directory that must contain ``egs/wsj/s5/steps``.
        src_model_dir: source model directory; ``data/local/dict``,
            ``model`` and ``conf`` are read from it, plus ``extractor`` and
            ``ivectors_test_hires/conf`` when ``extractor`` exists.
        lm_fn: language model file, copied to ``<work_dir>/lm.arpa``.
        work_dir: directory that is populated and in which the scripts run.
        dst_model_name: single argument handed to ``model-dist.sh``.

    Raises:
        Exception: if ``<kaldi_root>/egs/wsj/s5/steps`` does not exist.
    """
    steps_path = '%s/egs/wsj/s5/steps' % kaldi_root
    if not os.path.exists(steps_path):
        raise Exception('%s does not exist - is kaldi really installed in %s ?' % (steps_path, kaldi_root))

    tmpl_dir = os.path.dirname(os.path.abspath(__file__)) + '/templates'

    #
    # copy dictionary and phoneme sets from original model
    #

    logging.info("copying dictionary and phoneme sets from original model...")

    dict_dir = '%s/data/local/dict' % work_dir
    misc.mkdirs(dict_dir)
    _copy_named_files('%s/data/local/dict' % src_model_dir, dict_dir,
                      ['lexicon.txt',
                       'nonsilence_phones.txt',
                       'silence_phones.txt',
                       'optional_silence.txt',
                       'extra_questions.txt'])

    #
    # language model
    #

    misc.copy_file(lm_fn, '%s/lm.arpa' % work_dir)

    #
    # create skeleton dst model
    #

    logging.info("creating skeleton destination model...")

    src_am_dir = '%s/model' % src_model_dir
    adapt_dir = '%s/exp/adapt' % work_dir
    misc.mkdirs(adapt_dir)
    _copy_named_files(src_am_dir, adapt_dir, ['final.mdl', 'cmvn_opts', 'tree'])

    # These files are copied only when the source model provides them.
    for optional_file in ['final.mat', 'splice_opts', 'final.occs', 'full.mat']:
        if os.path.exists('%s/%s' % (src_am_dir, optional_file)):
            _copy_named_files(src_am_dir, adapt_dir, [optional_file])

    if os.path.exists('%s/extractor' % src_model_dir):

        extractor_dir = '%s/exp/extractor' % work_dir
        misc.mkdirs(extractor_dir)
        _copy_named_files('%s/extractor' % src_model_dir, extractor_dir,
                          ['final.mat',
                           'global_cmvn.stats',
                           'final.dubm',
                           'final.ie',
                           'splice_opts'])

        ivectors_conf_dir = '%s/exp/ivectors_test_hires/conf' % work_dir
        misc.mkdirs(ivectors_conf_dir)
        # Name the destination file explicitly, like every other copy here,
        # instead of relying on copy_file accepting a directory target.
        _copy_named_files('%s/ivectors_test_hires/conf' % src_model_dir,
                          ivectors_conf_dir, ['splice.conf'])

    conf_dir = '%s/conf' % work_dir
    misc.mkdirs(conf_dir)
    _copy_named_files('%s/conf' % src_model_dir, conf_dir,
                      ['mfcc.conf', 'mfcc_hires.conf', 'online_cmvn.conf'])

    #
    # copy scripts and config files
    #

    misc.copy_file('%s/kaldi-run-adaptation.sh' % tmpl_dir, '%s/run-adaptation.sh' % work_dir)
    misc.copy_file('%s/kaldi-cmd.sh' % tmpl_dir, '%s/cmd.sh' % work_dir)
    misc.render_template('%s/kaldi-path.sh.template' % tmpl_dir, '%s/path.sh' % work_dir, kaldi_root=kaldi_root)
    misc.copy_file('%s/kaldi-model-dist.sh' % tmpl_dir, '%s/model-dist.sh' % work_dir)

    misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
    misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

    #
    # run the adaptation, then package the resulting model
    #

    # As before, both scripts are always run and a failure does not raise;
    # the difference is that a non-zero exit status is now logged.
    _run_bash_script(work_dir, 'run-adaptation.sh')
    _run_bash_script(work_dir, 'model-dist.sh', dst_model_name)