def create_basic_work_dir_structure(data_dir, wav16_dir, mfcc_dir, work_dir, language_model_dir, kaldi_root):
    """Create the initial directories and symlinks of a kaldi work dir.

    Makes ``<data_dir>/local/dict``, ``wav16_dir`` and ``mfcc_dir`` via
    ``misc.mkdirs``, then calls ``misc.symlink`` three times so that
    ``<work_dir>/lm`` refers to ``language_model_dir`` and
    ``<work_dir>/steps`` / ``<work_dir>/utils`` refer to the directories of
    the same name below ``<kaldi_root>/egs/wsj/s5``.
    """
    # FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir)
    for new_dir in ('%s/local/dict' % data_dir, wav16_dir, mfcc_dir):
        misc.mkdirs(new_dir)

    # (first argument, second argument) pairs handed to misc.symlink, in order
    link_specs = (
        (language_model_dir, '%s/lm' % work_dir),
        ('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir),
        ('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir),
    )
    for link_src, link_dst in link_specs:
        misc.symlink(link_src, link_dst)
continue for speaker in os.listdir(SRCDIR + '/' + subset): for book_id in os.listdir(SRCDIR + '/' + subset + '/' + speaker): folder = 'librispeech%s-%s' % (speaker, book_id) dstdir = '%s/%s' % (DESTDIR, folder) misc.mkdirs('%s/flac' % dstdir) misc.mkdirs('%s/etc' % dstdir) promptsfn = '%s/etc/prompts-original' % dstdir transfn = '%s/%s/%s/%s/%s-%s.trans.txt' % ( SRCDIR, subset, speaker, book_id, speaker, book_id) with codecs.open(promptsfn, 'w', 'utf8') as promptsf: with codecs.open(transfn, 'r', 'utf8') as transf: for line in transf: parts = line.split() promptsf.write(line) flac_src = '%s/%s/%s/%s/%s.flac' % ( SRCDIR, subset, speaker, book_id, parts[0]) flac_dst = '%s/flac/%s.flac' % (dstdir, parts[0]) logging.debug(' %s -> %s' % (flac_src, flac_dst)) misc.symlink(flac_src, flac_dst) logging.debug('%s written.' % promptsfn)
#
# start from an empty work dir: remove any previous one, then recreate it
#

cmd = 'rm -rf %s' % WORKDIR
logging.info(cmd)
os.system(cmd)
misc.mkdirs(WORKDIR)

#
# copy scripts
#

for script_src, script_dst in (
        ('data/src/speech/kaldi-run-segmentation.sh', '%s/run-segmentation.sh' % WORKDIR),
        ('data/src/speech/kaldi-cmd.sh', '%s/cmd.sh' % WORKDIR)):
    misc.copy_file(script_src, script_dst)

misc.render_template('data/src/speech/kaldi-path.sh.template',
                     '%s/path.sh' % WORKDIR,
                     kaldi_root=kaldi_root)

for link_src, link_dst in (
        ('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % WORKDIR),
        ('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % WORKDIR)):
    misc.symlink(link_src, link_dst)

#
# create skeleton dst model
#

misc.mkdirs('%s/exp' % WORKDIR)

# model and data trees are copied right away (logged, then executed)
for cmd in ("cp -r '%s/model' '%s/exp/tri2b_adapt'" % (modelfn, WORKDIR),
            "cp -r '%s/data' '%s/data'" % (modelfn, WORKDIR)):
    logging.info(cmd)
    os.system(cmd)

# the conf copy command is only prepared here; the code that follows this
# section is expected to log and run `cmd`
cmd = "cp -r '%s/conf' '%s/conf'" % (modelfn, WORKDIR)
#
# read the kaldi install location and the 16kHz wav dir from .speechrc
#

config = misc.load_config('.speechrc')
kaldi_root = config.get("speech", "kaldi_root")
wav16_dir = config.get("speech", "wav16")

#
# create basic work dir structure
#

# FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir)
for new_dir in ('%s/local/dict' % data_dir, wav16_dir, mfcc_dir):
    misc.mkdirs(new_dir)

# the language model link uses a path relative to the link's location,
# the kaldi script dirs are linked by their path below kaldi_root
for link_src, link_dst in (
        ('../../../../../%s' % language_model_dir, '%s/lm' % work_dir),
        ('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir),
        ('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)):
    misc.symlink(link_src, link_dst)

#
# generate speech and text corpora
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dictionary)
logging.info("loading lexicon...done.")

# add_all is switched on exactly when a sequitur model path was given
add_all = bool(sequitur_model_path)
logging.info ( "loading transcripts...") transcripts = Transcripts(lang=options.lang) ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all) logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test))) # # create work_dir structure # misc.mkdirs('%s/lexicon' % data_dir) misc.mkdirs('%s/local/dict' % data_dir) misc.mkdirs(wav16_dir) misc.mkdirs(mfcc_dir) misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir) misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir) # # kaldi data part # def export_kaldi_data (destdirfn, tsdict): global wav16_dir logging.info ( "Exporting to %s..." % destdirfn) misc.mkdirs(destdirfn) with open(destdirfn+'wav.scp','w') as wavscpf, \
def kaldi_adapt_lm(kaldi_root, src_model_dir, lm_fn, work_dir, dst_model_name):
    """Adapt an existing kaldi model to a new language model.

    Populates ``work_dir`` with the dictionary, the model files, the optional
    ``extractor`` files, the feature configs and the helper scripts (taken
    from ``src_model_dir`` and from the ``templates`` directory next to this
    module), then runs ``run-adaptation.sh`` followed by ``model-dist.sh``
    inside ``work_dir``.

    Args:
        kaldi_root: Root of the kaldi installation; ``egs/wsj/s5/steps`` must
            exist below it.
        src_model_dir: Directory of the model to adapt. ``data/local/dict``,
            ``model`` and ``conf`` are read from it; ``extractor`` and
            ``ivectors_test_hires/conf`` are read only if ``extractor`` exists.
        lm_fn: Language model file, copied to ``<work_dir>/lm.arpa``.
        work_dir: Directory that is filled and in which the scripts are run.
        dst_model_name: Passed as the single argument to ``model-dist.sh``.

    Raises:
        Exception: If the kaldi ``steps`` directory is missing, or if one of
            the two scripts exits with a non-zero status (previously such
            failures were ignored and the next step ran anyway).
    """
    steps_path = '%s/egs/wsj/s5/steps' % kaldi_root
    if not os.path.exists(steps_path):
        raise Exception('%s does not exist - is kaldi really installed in %s ?' % (steps_path, kaldi_root))

    tmpl_dir = os.path.dirname(os.path.abspath(__file__)) + '/templates'

    #
    # copy dictionary and phoneme sets from original model
    #

    logging.info("copying dictionary and phoneme sets from original model...")

    dict_dir = '%s/data/local/dict' % work_dir
    misc.mkdirs(dict_dir)
    _copy_named_files('%s/data/local/dict' % src_model_dir, dict_dir,
                      ('lexicon.txt',
                       'nonsilence_phones.txt',
                       'silence_phones.txt',
                       'optional_silence.txt',
                       'extra_questions.txt'))

    #
    # language model
    #

    misc.copy_file(lm_fn, '%s/lm.arpa' % work_dir)

    #
    # create skeleton dst model
    #

    logging.info("creating skeleton destination model...")

    src_am_dir = '%s/model' % src_model_dir
    adapt_dir = '%s/exp/adapt' % work_dir
    misc.mkdirs(adapt_dir)
    _copy_named_files(src_am_dir, adapt_dir, ('final.mdl', 'cmvn_opts', 'tree'))

    # these files are not present in every model, copy only what is there
    for optional_file in ('final.mat', 'splice_opts', 'final.occs', 'full.mat'):
        if os.path.exists('%s/%s' % (src_am_dir, optional_file)):
            _copy_named_files(src_am_dir, adapt_dir, (optional_file,))

    if os.path.exists('%s/extractor' % src_model_dir):
        extractor_dir = '%s/exp/extractor' % work_dir
        misc.mkdirs(extractor_dir)
        _copy_named_files('%s/extractor' % src_model_dir, extractor_dir,
                          ('final.mat',
                           'global_cmvn.stats',
                           'final.dubm',
                           'final.ie',
                           'splice_opts'))

        ivectors_conf_dir = '%s/exp/ivectors_test_hires/conf' % work_dir
        misc.mkdirs(ivectors_conf_dir)
        # name the destination file explicitly, like every other copy here,
        # instead of relying on copy_file accepting a directory as target
        _copy_named_files('%s/ivectors_test_hires/conf' % src_model_dir,
                          ivectors_conf_dir, ('splice.conf',))

    # NOTE(review): the conf copies are assumed to be unconditional (not part
    # of the extractor branch above) - confirm against the original layout
    conf_dir = '%s/conf' % work_dir
    misc.mkdirs(conf_dir)
    _copy_named_files('%s/conf' % src_model_dir, conf_dir,
                      ('mfcc.conf', 'mfcc_hires.conf', 'online_cmvn.conf'))

    #
    # copy scripts and config files
    #

    misc.copy_file('%s/kaldi-run-adaptation.sh' % tmpl_dir, '%s/run-adaptation.sh' % work_dir)
    misc.copy_file('%s/kaldi-cmd.sh' % tmpl_dir, '%s/cmd.sh' % work_dir)
    misc.render_template('%s/kaldi-path.sh.template' % tmpl_dir, '%s/path.sh' % work_dir, kaldi_root=kaldi_root)
    misc.copy_file('%s/kaldi-model-dist.sh' % tmpl_dir, '%s/model-dist.sh' % work_dir)

    misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
    misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

    #
    # run the adaptation, then the model distribution script
    #

    _run_bash_script(work_dir, 'run-adaptation.sh')
    _run_bash_script(work_dir, 'model-dist.sh', dst_model_name)


def _copy_named_files(src_dir, dst_dir, file_names):
    """Copy each of file_names from src_dir to the same name in dst_dir."""
    for file_name in file_names:
        misc.copy_file('%s/%s' % (src_dir, file_name), '%s/%s' % (dst_dir, file_name))


def _run_bash_script(work_dir, script, *script_args):
    """Run `/bin/bash script args...` with work_dir as cwd, raise on failure."""
    # local import: the module's import block is not visible from here
    import subprocess

    # An argument list (no shell string) keeps work_dir and the arguments
    # intact even if they contain spaces or shell metacharacters; cwd replaces
    # the former pushd/popd pair.
    argv = ['/bin/bash', script] + list(script_args)
    logging.info('%s (cwd=%s)' % (' '.join(argv), work_dir))

    status = subprocess.call(argv, cwd=work_dir)
    if status != 0:
        raise Exception('%s failed with exit status %d in %s' % (' '.join(argv), status, work_dir))