Example #1
def generate_speech_and_text_corpora(data_dir,
                                     wav16_dir,
                                     debug,
                                     sequitur_model_path,
                                     lexicon_file_name,
                                     audio_corpora,
                                     prompt_words):
    logging.info("loading lexicon...")
    lex = Lexicon(file_name=lexicon_file_name)
    logging.info("loading lexicon...done.")
    logging.info("loading transcripts...")

    if sequitur_model_path:
        add_all = True
    else:
        add_all = False

    ts_all = {}
    ts_train = {}
    ts_test = {}
    transcript_objs = []
    for audio_corpus in audio_corpora:
        transcripts = Transcripts(corpus_name=audio_corpus)

        ts_all_, ts_train_, ts_test_ = transcripts.split(limit=debug, add_all=add_all)

        logging.info("loading transcripts from %s (%d train, %d test) ..." % (audio_corpus, len(ts_train_), len(ts_test_)))

        ts_all.update(ts_all_)
        ts_train.update(ts_train_)
        ts_test.update(ts_test_)
        transcript_objs.append(transcripts)

    logging.info("loading transcripts (%d train, %d test) ...done." % (
        len(ts_train), len(ts_test)))

    export_kaldi_data(wav16_dir, audio_corpora, '%s/train/' % data_dir, ts_train)
    export_kaldi_data(wav16_dir, audio_corpora, '%s/test/' % data_dir, ts_test)

    if sequitur_model_path:
        for transcript_obj in transcript_objs:
            lex = add_missing_words(transcript_obj, lex, sequitur_model_path)

    ps, utt_dict = export_dictionary(ts_all,
                                     lex,
                                     '%s/local/dict/lexicon.txt' % data_dir,
                                     prompt_words)
    write_nonsilence_phones(
        ps, '%s/local/dict/nonsilence_phones.txt' % data_dir)

    write_silence_phones('%s/local/dict/silence_phones.txt' % data_dir)
    write_optional_silence('%s/local/dict/optional_silence.txt' % data_dir)
    write_extra_questions(ps, '%s/local/dict/extra_questions.txt' % data_dir)
    create_training_data_for_language_model(transcript_objs, utt_dict, data_dir)
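# Usage sketch (not from the original source): all paths and corpus names
# below are hypothetical, and the helper functions above are assumed to be
# defined in this module.
generate_speech_and_text_corpora(data_dir='data/dst/asr-models/kaldi/de/data',
                                 wav16_dir='/var/tmp/wav16',
                                 debug=0,
                                 sequitur_model_path='data/models/sequitur-dict-de.ipa',
                                 lexicon_file_name='dict-de.ipa',
                                 audio_corpora=['voxforge_de'],
                                 prompt_words=False)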
Example #2
    ts_filter = args[0].decode('utf8')

#
# load transcripts
#

print "loading transcripts..."
transcripts = Transcripts()
print "loading transcripts...done."

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

#
# load prompts
#

prompt_tokens = []
prompt_token_idx = 0
if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokenize(line))

    print "%s read. %d tokens." % (options.promptsfn, len(prompt_tokens))
Example #3
if len(args) < 2:
    parser.print_usage()
    sys.exit(1)

lex_name = args[0]
corpus_name = args[1]

sequitur_model = SEQUITUR_MODEL % lex_name

#
# load lexicon, transcripts
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=lex_name)
logging.info("loading lexicon...done.")

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

#
# find missing words
#

missing = {}  # word -> count

num = len(transcripts)
cnt = 0
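# Sketch of the counting loop these counters set up (assumed, not part of the
# excerpt): it presumes transcripts entries carry a whitespace-tokenized 'ts'
# field and that Lexicon supports "in" membership tests.
for utt_id in transcripts:
    cnt += 1
    for word in transcripts[utt_id]['ts'].split():
        if word not in lex:
            missing[word] = missing.get(word, 0) + 1
    if cnt % 1000 == 0:
        logging.info("%d/%d done, %d missing words so far" % (cnt, num, len(missing)))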
Example #4
(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)
dict_name = options.dict_name
workdir = 'data/dst/dict-models/%s/sequitur' % dict_name

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

#
# export
#

misc.mkdirs(workdir)

with codecs.open('%s/train.lex' % workdir, 'w', 'utf8') as trainf, \
     codecs.open('%s/test.lex' % workdir, 'w', 'utf8') as testf, \
     codecs.open('%s/all.lex' % workdir, 'w', 'utf8') as allf:

    cnt = 0

    for word in lex:
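        # Loop body sketch (the excerpt is cut off here): write every entry to
        # all.lex and every tenth one to test.lex, the rest to train.lex; the
        # 90/10 ratio and the lex[word]['ipa'] field are assumptions.
        cnt += 1
        entry = u'%s %s\n' % (word, lex[word]['ipa'])
        allf.write(entry)
        if cnt % 10 == 0:
            testf.write(entry)
        else:
            trainf.write(entry)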
Example #5
#

# FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)
misc.symlink('../../../../../%s' % language_model_dir, '%s/lm' % work_dir)
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
# generate speech and text corpora
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dictionary)
logging.info("loading lexicon...done.")

if sequitur_model_path:
    add_all = True
else:
    add_all = False

ts_all = {}
ts_train = {}
ts_test = {}
transcript_objs = []
for audio_corpus in audio_corpora:

    logging.info("loading transcripts from %s ..." % audio_corpus)
Example #6
logging.info(repr(wrt))

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=corpus_name)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.dict_name)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens    = []
prompt_token_idx = 0
if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokwrt(line))

    logging.info("%s read. %d tokens." % (options.promptsfn, len(prompt_tokens)))
Example #7
else:
    logging.basicConfig(level=logging.INFO)

if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

inputfn = args[0]
outputfn = os.path.splitext(args[0])[0] + ".prompt"

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(DICT)
logging.info("loading lexicon...done.")

lex_edit = LexEdit(lex)

#
# load wrt
#

wrt = {}

if os.path.exists(options.wrt):
    logging.info("loading %s" % options.wrt)

    with codecs.open(options.wrt, 'r', 'utf8') as wrtf:
Example #8
vf_login = config.get("speech", "vf_login")
extrasdir = config.get("speech", "extrasdir_%s" % lang)

#
# TTS (for audio output)
#

tts = TTS('local', 0, locale='de', voice='bits3', engine='espeak')

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.lang)
logging.info("loading lexicon...done.")

#
# main ui loop
#

next_segment()

while segmentfn:

    print
    print segmentfn
    print prompt

    # any words not covered by our lexicon?
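    # Coverage-check sketch (assumed, not in the excerpt): tokenize the prompt
    # and report any tokens the lexicon lacks, relying on the same tokenize()
    # helper and Lexicon "in" test used in the other examples.
    oov = [t for t in tokenize(prompt) if t not in lex]
    if oov:
        print "not in lexicon: %s" % u", ".join(oov)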
Example #9
#

work_dir    = WORKDIR % options.lang
kaldi_root  = config.get("speech", "kaldi_root")

data_dir    = "%s/data" % work_dir
mfcc_dir    = "%s/mfcc" % work_dir

wav16_dir   = config.get("speech", "wav16_dir_%s" % options.lang)

#
# load lexicon, transcripts
#

logging.info ( "loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info ( "loading lexicon...done.")

logging.info ( "loading transcripts...")
transcripts = Transcripts(lang=options.lang)
ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all)
logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#


misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
Example #10
dst_model = args[3]

dst_dir = 'data/dst/asr-models/kaldi/%s' % dst_model

#
# config
#

kaldi_root = config.get("speech", "kaldi_root")

#
# load lexicon, transcripts
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

#
# cleanup leftovers from previous runs
#

cmd = 'rm -rf %s' % dst_dir
logging.info(cmd)
os.system(cmd)
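# Equivalent cleanup without spawning a shell, sketched with the standard
# library instead of os.system (shutil would need to be imported up top):
#
#     import shutil
#     shutil.rmtree(dst_dir, ignore_errors=True)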

#
# dictionary export
#

misc.mkdirs('%s/data/local/dict' % dst_dir)
Example #11
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens = []
prompt_token_idx = 0
if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokenize(line, lang=options.lang))

    logging.info("%s read. %d tokens." %
                 (options.promptsfn, len(prompt_tokens)))
Example #12
(options, args) = parser.parse_args()

if len(args) < 1:
    parser.print_usage()
    print
    sys.exit(1)

lex_tokens = map(lambda x: x.decode('utf8'), args)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

#
# curses
#

locale.setlocale(locale.LC_ALL, "")

stdscr = curses.initscr()
curses.noecho()
curses.cbreak()
stdscr.keypad(1)
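# Note (not from the source): this setup leaves the terminal raw if the script
# dies, so a common guard runs the UI inside try/finally; main_loop below is
# hypothetical. curses.wrapper(main_loop) bundles the same setup and teardown.
#
#     try:
#         main_loop(stdscr)
#     finally:
#         stdscr.keypad(0)
#         curses.nocbreak()
#         curses.echo()
#         curses.endwin()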

#
# config
Example #13
(options, args) = parser.parse_args()

if len(args) < 1:
    parser.print_usage()
    print
    sys.exit(1)

lex_tokens = map(lambda x: x.decode('utf8'), args)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon(LEXICON_NAME)
print "loading lexicon...done."

#
# curses
#

locale.setlocale(locale.LC_ALL, "")

stdscr = curses.initscr()
curses.noecho()
curses.cbreak()
stdscr.keypad(1)

#
# config
Example #14
else:
    logging.basicConfig(level=logging.INFO)

if len(args) != 1:
    parser.print_usage()
    sys.exit(1)

inputfn = args[0]
outputfn = os.path.splitext(args[0])[0] + ".prompt"

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(lang=LANG)
logging.info("loading lexicon...done.")

lex_edit = LexEdit(lex)

#
# load wrt
#

wrt = {}

if os.path.exists(options.wrt):
    logging.info("loading %s" % options.wrt)

    with codecs.open(options.wrt, 'r', 'utf8') as wrtf:
Example #15
(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)
lang = options.lang
workdir = 'data/dst/speech/%s/sequitur' % lang

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(lang=lang)
logging.info("loading lexicon...done.")

#
# export
#

misc.mkdirs(workdir)

with codecs.open('%s/train.lex' % workdir, 'w', 'utf8') as trainf, \
     codecs.open('%s/test.lex' % workdir, 'w', 'utf8') as testf, \
     codecs.open('%s/all.lex' % workdir, 'w', 'utf8') as allf:

    cnt = 0

    for word in lex:
Example #16
config = utils.load_config()

work_dir = WORKDIR % LANG
kaldi_root = config.get("speech", "kaldi_root")

data_dir = "%s/data" % work_dir
mfcc_dir = "%s/mfcc" % work_dir

wav16_dir = config.get("speech", "wav16_dir_de")

#
# load lexicon, transcripts
#

print "loading lexicon..."
lex = Lexicon()
print "loading lexicon...done."

print "loading transcripts..."
transcripts = Transcripts()
ts_all, ts_train, ts_test = transcripts.split(limit=DEBUG_LIMIT,
                                              add_all=add_all)
print "loading transcripts (%d train, %d test) ...done." % (len(ts_train),
                                                            len(ts_test))

#
# create work_dir structure
#

utils.mkdirs('%s/lexicon' % data_dir)
utils.mkdirs('%s/local/dict' % data_dir)
utils.mkdirs(wav16_dir)
Example #17
        return ipa_m
    return None


# token = u"abakteriell"
# ipa_r = u"'ʔaːb-ak-'teː-ʁiː-'ɛl"
# ipa_w = u"ʔabakteːʁiː'ɛl"
# print merge_check(token, ipa_r, ipa_w)
# sys.exit(0)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon('dict-de.ipa')
print "loading lexicon...done."

#
# load wiktionary
#

print "loading wiktionary..."
wiktionary = {}
with codecs.open(DICTFN, 'r', 'utf8') as dictf:
    for line in dictf:
        parts = line.strip().split(';')
        if len(parts) != 2:
            # print "Failed to parse line %s" % line.strip()
            continue
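        # Plausible continuation (the excerpt is truncated): store the pair,
        # assuming each line maps a word to its IPA transcription.
        wiktionary[parts[0].strip()] = parts[1].strip()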
Example #18
    logging.basicConfig(level=logging.INFO)

#
# load transcripts
#

logging.info("loading transcripts...")
transcripts = Transcripts(corpus_name=options.lang)
logging.info("loading transcripts...done.")

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=options.lang)
logging.info("loading lexicon...done.")

#
# load prompts
#

prompt_tokens = []
prompt_token_idx = 0
if options.promptsfn:
    with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf:
        for line in promptsf:
            prompt_tokens.extend(tokenize(line, lang=options.lang))

    logging.info("%s read. %d tokens." %
                 (options.promptsfn, len(prompt_tokens)))
Example #19
parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                   help="enable verbose logging")

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon(LEXICON_NAME)
print "loading lexicon...done."

#
# check
#

cnt = 0

failed_tokens = []

for token in lex:

    if token == u'nspc':
        continue
Example #20
(options, args) = parser.parse_args()

if len(args) < 1:
    parser.print_usage()
    print
    sys.exit(1)

lex_tokens = map(lambda x: x.decode('utf8'), args)

#
# load lexicon
#

print "loading lexicon..."
lex = Lexicon(options.dict_name)
print "loading lexicon...done."

#
# curses
#

locale.setlocale(locale.LC_ALL, "")

stdscr = curses.initscr()
curses.noecho()
curses.cbreak()
stdscr.keypad(1)

#
# config