def generate_speech_and_text_corpora(data_dir, wav16_dir, debug, sequitur_model_path, lexicon_file_name, audio_corpora, prompt_words):

    logging.info("loading lexicon...")
    lex = Lexicon(file_name=lexicon_file_name)
    logging.info("loading lexicon...done.")

    logging.info("loading transcripts...")

    if sequitur_model_path:
        add_all = True
    else:
        add_all = False

    ts_all   = {}
    ts_train = {}
    ts_test  = {}

    transcript_objs = []

    for audio_corpus in audio_corpora:

        transcripts = Transcripts(corpus_name=audio_corpus)
        ts_all_, ts_train_, ts_test_ = transcripts.split(limit=debug, add_all=add_all)

        logging.info("loading transcripts from %s (%d train, %d test) ..." % (audio_corpus, len(ts_train_), len(ts_test_)))

        ts_all.update(ts_all_)
        ts_train.update(ts_train_)
        ts_test.update(ts_test_)

        transcript_objs.append(transcripts)

    logging.info("loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

    export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, ts_train)
    export_kaldi_data(wav16_dir, audio_corpora, '%s/test/'  % data_dir, ts_test)

    if sequitur_model_path:
        for transcript_obj in transcript_objs:
            lex = add_missing_words(transcript_obj, lex, sequitur_model_path)

    ps, utt_dict = export_dictionary(ts_all, lex, '%s/local/dict/lexicon.txt' % data_dir, prompt_words)

    write_nonsilence_phones(ps, '%s/local/dict/nonsilence_phones.txt' % data_dir)
    write_silence_phones('%s/local/dict/silence_phones.txt' % data_dir)
    write_optional_silence('%s/local/dict/optional_silence.txt' % data_dir)
    write_extra_questions(ps, '%s/local/dict/extra_questions.txt' % data_dir)

    create_training_data_for_language_model(transcript_objs, utt_dict, data_dir)
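#
# Hypothetical invocation sketch for generate_speech_and_text_corpora() above,
# illustrating the kind of arguments it expects. All paths, the corpus name,
# the sequitur model path and the (empty) prompt word set are placeholder
# assumptions for illustration only, not values taken from this source.
#
# generate_speech_and_text_corpora(data_dir            = 'data/dst/asr-models/kaldi/my-model/data',
#                                  wav16_dir           = 'wav16',
#                                  debug               = 0,
#                                  sequitur_model_path = 'data/models/sequitur/my-dict/model',
#                                  lexicon_file_name   = 'dict-de.ipa',
#                                  audio_corpora       = ['voxforge_de'],
#                                  prompt_words        = set())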
ts_filter = args[0].decode('utf8')

#
# load transcripts
#

print "loading transcripts..."
transcripts = Transcripts()
print "loading transcripts...done."

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

#
# load prompts
#

prompt_tokens    = []
prompt_token_idx = 0

if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokenize(line))

    print "%s read. %d tokens." % (options.promptsfn, len(prompt_tokens))
if len(args) < 2:
    parser.print_usage()
    sys.exit(1)

lex_name    = args[0]
corpus_name = args[1]

sequitur_model = SEQUITUR_MODEL % lex_name

#
# load lexicon, transcripts
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=lex_name)
logging.info("loading lexicon...done.")

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

#
# find missing words
#

missing = {}  # word -> count

num = len(transcripts)
cnt = 0
(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

dict_name = options.dict_name
workdir   = 'data/dst/dict-models/%s/sequitur' % dict_name

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

#
# export
#

misc.mkdirs(workdir)

with codecs.open('%s/train.lex' % workdir, 'w', 'utf8') as trainf, \
     codecs.open('%s/test.lex'  % workdir, 'w', 'utf8') as testf,  \
     codecs.open('%s/all.lex'   % workdir, 'w', 'utf8') as allf:

    cnt = 0
    for word in lex:
#
# FIXME: unused, remove
misc.mkdirs('%s/lexicon' % data_dir)

misc.mkdirs('%s/local/dict' % data_dir)

misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)

misc.symlink('../../../../../%s' % language_model_dir, '%s/lm' % work_dir)
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
# generate speech and text corpora
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dictionary)
logging.info("loading lexicon...done.")

if sequitur_model_path:
    add_all = True
else:
    add_all = False

ts_all   = {}
ts_train = {}
ts_test  = {}

transcript_objs = []

for audio_corpus in audio_corpora:

    logging.info("loading transcripts from %s ..." % audio_corpus)
logging.info(repr(wrt))

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.dict_name)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens    = []
prompt_token_idx = 0

if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokwrt(line))

    logging.info("%s read. %d tokens." % (options.promptsfn, len(prompt_tokens)))
else:
    logging.basicConfig(level=logging.INFO)

if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

inputfn  = args[0]
outputfn = os.path.splitext(args[0])[0] + ".prompt"

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(DICT)
logging.info("loading lexicon...done.")

lex_edit = LexEdit(lex)

#
# load wrt
#

wrt = {}
if os.path.exists(options.wrt):
    logging.info("loading %s" % options.wrt)
    with codecs.open(options.wrt, 'r', 'utf8') as wrtf:
vf_login  = config.get("speech", "vf_login")
extrasdir = config.get("speech", "extrasdir_%s" % lang)

#
# TTS (for audio output)
#

tts = TTS('local', 0, locale='de', voice='bits3', engine='espeak')

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.lang)
logging.info("loading lexicon...done.")

#
# main ui loop
#

next_segment()

while segmentfn:

    print
    print segmentfn
    print prompt

    # any words not covered by our lexicon?
#

work_dir   = WORKDIR % options.lang
kaldi_root = config.get("speech", "kaldi_root")

data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

wav16_dir = config.get("speech", "wav16_dir_%s" % options.lang)

#
# load lexicon, transcripts
#

logging.info("loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info("loading lexicon...done.")

logging.info("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all)
logging.info("loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#

misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
dst_model = args[3]

dst_dir = 'data/dst/asr-models/kaldi/%s' % dst_model

#
# config
#

kaldi_root = config.get("speech", "kaldi_root")

#
# load lexicon, transcripts
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

#
# cleanup leftovers from previous runs
#

cmd = 'rm -rf %s' % dst_dir
logging.info(cmd)
os.system(cmd)

#
# dictionary export
#

misc.mkdirs('%s/data/local/dict' % dst_dir)
logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens    = []
prompt_token_idx = 0

if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokenize(line, lang=options.lang))

    logging.info("%s read. %d tokens." % (options.promptsfn, len(prompt_tokens)))
(options, args) = parser.parse_args()

if len(args) < 1:
    parser.print_usage()
    print
    sys.exit(1)

lex_tokens = map(lambda x: x.decode('utf8'), args)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

#
# curses
#

locale.setlocale(locale.LC_ALL, "")

stdscr = curses.initscr()
curses.noecho()
curses.cbreak()
stdscr.keypad(1)

#
# config
(options, args) = parser.parse_args()

if len(args) < 1:
    parser.print_usage()
    print
    sys.exit(1)

lex_tokens = map(lambda x: x.decode('utf8'), args)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon(LEXICON_NAME)
print "loading lexicon...done."

#
# curses
#

locale.setlocale(locale.LC_ALL, "")

stdscr = curses.initscr()
curses.noecho()
curses.cbreak()
stdscr.keypad(1)

#
# config
else:
    logging.basicConfig(level=logging.INFO)

if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

inputfn  = args[0]
outputfn = os.path.splitext(args[0])[0] + ".prompt"

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(lang=LANG)
logging.info("loading lexicon...done.")

lex_edit = LexEdit(lex)

#
# load wrt
#

wrt = {}
if os.path.exists(options.wrt):
    logging.info("loading %s" % options.wrt)
    with codecs.open(options.wrt, 'r', 'utf8') as wrtf:
(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

lang    = options.lang
workdir = 'data/dst/speech/%s/sequitur' % lang

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(lang=lang)
logging.info("loading lexicon...done.")

#
# export
#

misc.mkdirs(workdir)

with codecs.open('%s/train.lex' % workdir, 'w', 'utf8') as trainf, \
     codecs.open('%s/test.lex'  % workdir, 'w', 'utf8') as testf,  \
     codecs.open('%s/all.lex'   % workdir, 'w', 'utf8') as allf:

    cnt = 0
    for word in lex:
config = utils.load_config()

work_dir   = WORKDIR % LANG
kaldi_root = config.get("speech", "kaldi_root")

data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

wav16_dir = config.get("speech", "wav16_dir_de")

#
# load lexicon, transcripts
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

print "loading transcripts..."
transcripts = Transcripts()
ts_all, ts_train, ts_test = transcripts.split(limit=DEBUG_LIMIT, add_all=add_all)
print "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test))

#
# create work_dir structure
#

utils.mkdirs('%s/lexicon' % data_dir)
utils.mkdirs('%s/local/dict' % data_dir)
utils.mkdirs(wav16_dir)
        return ipa_m

    return None

# token = u"abakteriell"
# ipa_r = u"'ʔaːb-ak-'teː-ʁiː-'ɛl"
# ipa_w = u"ʔabakteːʁiː'ɛl"
# print merge_check(token, ipa_r, ipa_w)
# sys.exit(0)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon('dict-de.ipa')
print "loading lexicon...done."

#
# load wiktionary
#

print "loading wiktionary..."

wiktionary = {}

with codecs.open(DICTFN, 'r', 'utf8') as dictf:
    for line in dictf:
        parts = line.strip().split(';')
        if len(parts) != 2:
            # print "Failed to parse line %s" % line.strip()
            continue
logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=options.lang)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.lang)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens    = []
prompt_token_idx = 0

if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokenize(line, lang=options.lang))

    logging.info("%s read. %d tokens." % (options.promptsfn, len(prompt_tokens)))
parser.add_option ("-v", "--verbose", action="store_true", dest="verbose", help="enable verbose logging") (options, args) = parser.parse_args() if options.verbose: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) # # load lexicon # print "loading lexicon..." lex = Lexicon(LEXICON_NAME) print "loading lexicon...done." # # check # cnt = 0 failed_tokens = [] for token in lex: if token == u'nspc': continue
(options, args) = parser.parse_args()

if len(args) < 1:
    parser.print_usage()
    print
    sys.exit(1)

lex_tokens = map(lambda x: x.decode('utf8'), args)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon(options.dict_name)
print "loading lexicon...done."

#
# curses
#

locale.setlocale(locale.LC_ALL, "")

stdscr = curses.initscr()
curses.noecho()
curses.cbreak()
stdscr.keypad(1)

#
# config