# Example no. 1
def word_to_phone_mlf(model, dict, word_mlf, phone_mlf, mono_list):
    """
    Convert the word-level mlf to a phone level mlf with HLEd

    model: run configuration; this function reads model.exp (directory for
        the HLEd script and log), model.logfh (log handle) and model.local
        (1 = run the command directly with os.system, otherwise util.run)
    dict: path to the pronunciation dictionary passed to HLEd via -d
    word_mlf: path to the input word-level MLF
    phone_mlf: path where HLEd writes the phone-level MLF
    mono_list: path where the list of phones is written, one per line

    Returns the number of distinct phones found in the phone MLF.
    """

    ## NOTE(review): a missing input is only logged here; processing continues
    ## and will normally fail when the phone MLF is read below. Confirm whether
    ## the caller expects a hard stop instead.
    if not os.path.isfile(word_mlf):
        util.log_write(model.logfh, 'No word MLF file here [%s]' % word_mlf)

    if not os.path.isfile(dict):
        util.log_write(model.logfh, 'No dict file here [%s]' % dict)

    ## Create mkphones0.led (closed before HLEd runs so the script is flushed)
    led_file = '%s/mkphones0.led' % model.exp
    with open(led_file, 'w') as fh:
        fh.write('EX\nIS sil sil\n')

    ## Convert the word level MLF into a phone MLF
    cmd_log = '%s/hhed_word_to_phone.log' % model.exp
    cmd = 'HLEd -A -T 1 -l "*"'
    cmd += ' -d %s' % dict
    cmd += ' -i %s' % phone_mlf
    cmd += ' %s %s > %s' % (led_file, word_mlf, cmd_log)

    ## Same execution convention as the other HTK steps in this file
    if model.local == 1: os.system(cmd)
    else: util.run(cmd, model.exp)

    ## Create list of phones (appearing in the phone MLF).
    ## Only purely alphabetic lines are phones; headers, label names and
    ## the '.' terminators are filtered out by isalpha().
    monophones = set()
    with open(phone_mlf) as fh:
        for line in fh:
            phone = line.strip()
            if phone.isalpha(): monophones.add(phone)

    ## Sorted so the phone list is reproducible from run to run
    monophones = sorted(monophones)
    with open(mono_list, 'w') as fh:
        for phone in monophones:
            fh.write('%s\n' % phone)

    return len(monophones)
# Example no. 3
def make_mlf_from_transcripts(model, orig_dict, setup, data_path, word_mlf, mfc_list, skip_oov=True):
    """
    An MLF is an HTK-formatted transcription file. This is created
    from the word-level transcripts in setup.

    model: run configuration; reads model.verbose and model.logfh
    orig_dict: path to the dictionary; the first token of every non-comment,
        non-blank line is taken (upper-cased) as a known word
    setup: path to the transcript file (read with gzip if it ends in 'gz').
        Each line is split on whitespace: field 0 is the wav path and
        fields 2 onward are the transcript words (field 1 is not used here)
    data_path: passed to coding.get_mfc_name_from_wav to derive the MFC name
    word_mlf: output path for the word-level MLF
    mfc_list: output path for the list of MFC files of the kept utterances
    skip_oov: if True, drop any utterance containing an out-of-dictionary
        word (other than bracketed tokens and '.', which are just removed)

    Returns (count, words): the number of utterances written and the set of
    words written to the MLF.

    NOTE(review): the accumulation of curr / mlf / mfcs / words was
    reconstructed; in particular `words` is collected from kept utterances
    only. Confirm against the callers' expectations.
    """
    replace_escaped_words = True

    ## Load the dictionary words
    with open(orig_dict) as fh:
        dict_words = set(entry.split()[0].upper() for entry in fh.read().splitlines()
                         if not entry.startswith('#') and len(entry.strip()) > 0)
    words = set()

    if setup.endswith('gz'): setup_fh = gzip.open(setup)
    else: setup_fh = open(setup)

    ## Create MLF-format entries for each utterance
    mfcs = []
    mlf = ['#!MLF!#']
    count = 0
    try:
        for line in setup_fh:
            items = line.strip().split()
            ## Blank lines carry no utterance
            if not items: continue

            skip = False
            wav = items[0]
            mfc = coding.get_mfc_name_from_wav(wav, data_path)
            curr = ['"*/%s.lab"' % os.path.basename(wav).split('.')[0]]
            for word in [w.upper() for w in items[2:]]:
                if replace_escaped_words and '\\' in word:
                    ## Try the word with its escape sequence removed
                    new_word = re.sub(r'\\[^A-Za-z0-9]*', r'', word)
                    if new_word in dict_words: word = new_word
                if word not in dict_words:
                    ## Don't include bracketed words or periods in the labels
                    if word.startswith('[') and word.endswith(']'): continue
                    if word == '.': continue
                    if model.verbose > 0: util.log_write(model.logfh, 'not in dictionary [%s]' % word)
                    ## Remove the utterance if there are other non-dictionary words
                    if skip_oov: skip = True

                ## Digit-initial labels are prefixed with '_' in the MLF
                if word[0].isdigit(): word = '_' + word
                curr.append(word)

            ## Check for empty transcriptions (only the label line is present)
            if len(curr) <= 1: skip = True
            if skip: continue

            words.update(curr[1:])
            mlf.extend(curr)
            ## Each MLF entry is terminated by a lone period
            mlf.append('.')
            mfcs.append(mfc)
            count += 1
    finally:
        setup_fh.close()

    ## Write the MLF
    with open(word_mlf, 'w') as fh:
        fh.write('\n'.join(mlf) + '\n')

    ## Create a new MFC list file
    with open(mfc_list, 'w') as fh:
        for mfc in mfcs: fh.write('%s\n' % mfc)

    return count, words
# Example no. 4
def _srilm_train_and_score(vocab, lm_order, text_file, lm, lm_dir, params=''):
    """Train an LM with ngram-count (plus optional extra params) and return
    the perplexity that `ngram -ppl` reports on text_file."""
    cmd = 'ngram-count -vocab %s -order %d -text %s -lm %s' % (vocab, lm_order, text_file, lm)
    if params: cmd += ' %s' % params
    util.run(cmd, lm_dir)
    cmd = 'ngram -order %d -lm %s -ppl %s -debug 0' % (lm_order, lm, text_file)
    res = util.run(cmd, lm_dir)
    ## The value returned by util.run is handed to grep as the file to search;
    ## the perplexity is the sixth token of the 'zeroprobs' line
    return float(os.popen('grep zeroprobs %s' % res).read().split()[5])


def build_lm_from_mlf(model, word_mlf, dictionary, vocab, lm_dir, lm, lm_order, target_ppl_ratio=None):
    """
    Build a language model using SRILM
    Use the transcripts in the word mlf
    Output to lm
    Output intermediate files in lm_dir
    Return perplexity on the training text

    model: run configuration; reads model.logfh
    word_mlf: path to the word-level MLF used as training text
    dictionary: path to the dictionary; only words present in both the MLF
        and the dictionary are written to the vocab file
    vocab: output path for the vocabulary (one word per line)
    lm_dir: directory for training.txt and for running the SRILM commands
    lm: output path for the language model
    lm_order: n-gram order
    target_ppl_ratio: if falsy, return the perplexity of the model built
        without a count cutoff. Otherwise search over the -gt<order>min
        cutoff until the training perplexity is within 1 of
        (baseline perplexity * target_ppl_ratio), the cutoff stops changing,
        or more than 10 iterations have run.
    """

    with open(dictionary) as fh:
        dict_words = set(entry.split()[0].upper() for entry in fh.read().splitlines()
                         if not entry.startswith('#') and len(entry.strip()) > 0)

    ## Prepare to build an LM by creating a file with one sentence per line,
    ## collecting the MLF vocabulary in the same pass
    text_file = '%s/training.txt' % lm_dir
    text, curr = [], []
    mlf_vocab = set()
    with open(word_mlf) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith('#!MLF'): continue
            if line.startswith('"') and '.lab' in line: continue
            if line == '.':
                ## A lone period terminates the current utterance
                text.append(' '.join(curr))
                curr = []
            elif line:
                curr.append(line)
                mlf_vocab.add(line)

    ## Write the vocab shared by the MLF and the dictionary
    with open(vocab, 'w') as fh:
        for word in sorted(mlf_vocab.intersection(dict_words)): fh.write(word + '\n')

    with open(text_file, 'w') as fh:
        for sentence in text: fh.write(sentence + '\n')

    ## Build a language model
    cutoff, cutoff_min, cutoff_max = 5, 1, 50
    iters, prev_cutoff = 0, 0

    ppl = _srilm_train_and_score(vocab, lm_order, text_file, lm, lm_dir)
    if not target_ppl_ratio: return ppl
    util.log_write(model.logfh, '  cutoff [%d] gives ppl [%1.2f]' % (1, ppl))
    target_ppl = ppl * target_ppl_ratio

    ## Bisect on the count cutoff between cutoff_min and cutoff_max
    while True:
        iters += 1
        params = '-gt%dmin %d' % (lm_order, cutoff)
        ppl = _srilm_train_and_score(vocab, lm_order, text_file, lm, lm_dir, params)

        if not target_ppl or abs(ppl - target_ppl) < 1: break
        if cutoff == prev_cutoff or iters > 10: break
        prev_cutoff = cutoff
        util.log_write(model.logfh, '  cutoff [%d] gives ppl [%1.2f]' % (cutoff, ppl))

        if ppl < target_ppl:
            ## Perplexity too low: prune more aggressively
            cutoff_min = cutoff
            cutoff = (cutoff + cutoff_max) // 2
        else:
            ## Perplexity too high: prune less
            cutoff_max = cutoff
            cutoff = (cutoff + cutoff_min) // 2

    ## Return perplexity on the training data
    return ppl
# Example no. 9
def tie_states_search(model, output_dir, model_dir, mono_list, tri_list,
                      tied_list):
    """
    Tie HMM states using decision tree clustering

    Repeatedly runs HHEd with different TB thresholds, bisecting between
    tb_min and tb_max, until the number of <MEAN> entries in the clustered
    MMF is within 1% of model.triphone_states.

    model: run configuration; reads model.exp, model.dt_ro, model.dt_tb,
        model.tree_questions, model.states, model.triphone_states,
        model.local and model.logfh
    output_dir: directory for tree.hed, the trees file, the HHEd log and
        the clustered MMF
    model_dir: directory holding the input MMF and stats file
    mono_list: path to the monophone list
    tri_list: path to the triphone list passed to HHEd
    tied_list: path given to the HHEd CO command
        NOTE(review): this parameter was missing from the visible signature
        but is used in the body; confirm name and position against callers.

    Returns output_dir.
    """

    tree_hed = '%s/tree.hed' % output_dir
    tree_output = '%s/trees' % output_dir
    hhed_log = '%s/hhed_cluster.log' % output_dir
    all_tri_list = '%s/all_tri.list' % model.exp

    ## Decision tree parameters
    ro = model.dt_ro
    tb = model.dt_tb
    tb_min = 100.0
    tb_max = 10000.0

    ## Create the full list of possible triphones
    with open(mono_list) as fh:
        phones = fh.read().splitlines()
    non_sp_phones = [p for p in phones if p not in ['sp', 'sil']]
    with open(all_tri_list, 'w') as fh:
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' % p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' % (p1, p2))
                fh.write('%s-%s+sil\n' % (p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' % (p2, p1, p3))

    ## The question set does not change between attempts
    with open(model.tree_questions) as fh:
        questions = fh.read()

    ## Search over tb arguments to get the right number states
    num_states = 0
    attempts = 0
    prev_tb = 0
    while True:

        os.system('rm -f %s %s %s' % (tree_hed, tree_output, hhed_log))

        ## Set up decision tree clustering
        with open(tree_hed, 'w') as fh:
            fh.write('RO %d %s/stats\n' % (ro, model_dir))
            fh.write('TR 0\n')
            fh.write('%s\n' % questions)
            fh.write('TR 12\n')
            for p in non_sp_phones:
                ## States 2 .. model.states - 1 (first and last are skipped)
                for s in range(2, model.states):
                    fh.write(
                        'TB %d "ST_%s_%d_" {(%s,*-%s+*,%s+*,*-%s).state[%d]}\n' %
                        (tb, p, s, p, p, p, p, s))
            fh.write('TR 1\n')
            fh.write('AU "%s"\n' % all_tri_list)
            fh.write('CO "%s"\n' % tied_list)
            fh.write('ST "%s"\n' % tree_output)

        ## Use HHEd to cluster
        cmd = 'HHEd -A -T 1 -H %s/MMF' % model_dir
        cmd += ' -M %s' % output_dir
        cmd += ' %s %s > %s' % (tree_hed, tri_list, hhed_log)

        if model.local == 1: os.system(cmd)
        else: util.run(cmd, output_dir)

        ## Count the lines containing <MEAN> in the clustered model
        num_states = int(
            os.popen('grep -c "<MEAN>" %s/MMF' % output_dir).read().strip())

        ## Close enough (within 1% of the goal)
        if abs(
                float(num_states - model.triphone_states) /
                model.triphone_states) <= 0.01:
            util.log_write(
                model.logfh,
                ' current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        ## The threshold has stopped moving
        if abs(prev_tb - tb) <= 0.01:
            util.log_write(
                model.logfh,
                ' Could not converge. Stopping. Current states [%d] tb [%1.2f]'
                % (num_states, tb))
            break

        attempts += 1
        prev_tb = tb
        if num_states < model.triphone_states:
            ## Too few states: move the threshold toward tb_min
            tb = (tb_min + tb) / 2
            tb_max = prev_tb
        else:
            ## Too many states: move the threshold toward tb_max
            tb = (tb_max + tb) / 2
            tb_min = prev_tb
        util.log_write(
            model.logfh,
            ' [%d] goal [%d] current states [%d] tb [%1.2f] -> [%1.2f] [%1.1f %1.1f]'
            % (attempts, model.triphone_states, num_states, prev_tb, tb,
               tb_min, tb_max))

        if attempts > 50:
            util.log_write(model.logfh,
                           ' Goal not reached after 50 tries. Exiting.')
            ## NOTE(review): this leaves the search loop and returns the last
            ## clustering; confirm whether the process should terminate instead.
            break

    return output_dir
# Example no. 10
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list,
          dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    model: run configuration; reads model.setup_length, model.local,
        model.jobs, model.verbose and model.logfh
    root_dir: parent directory; work is done in <root_dir>/Align
    mfc_list: path to the list of MFC files. It is copied to mfc_old.list and
        then REWRITTEN in place with the failed alignments removed
    model_dir: directory holding the MMF used for alignment
    word_mlf: path to the word-level MLF
    new_mlf: output path for the merged, aligned MLF
    model_list: path to the HMM list passed to HVite
    dict: path to the pronunciation dictionary passed to HVite
    align_config: path to the HVite config file

    Returns the alignment output directory.
    """
    import glob

    output_dir = '%s/Align' % root_dir
    ## NOTE(review): the directory is created if missing so the copy and
    ## split below have somewhere to write; confirm whether callers create it.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    utts_per_split = max(100, (1 + (model.setup_length // 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(split_list, split_output):
        #-o SWT
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % split_output
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % split_list
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % split_output
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands, one per split of the MFC list
    cmds = []
    outputs = []
    for split_list in sorted(glob.glob('%s/mfc.list.*' % output_dir)):
        split_output = split_list.replace('mfc.list', 'align.output')
        outputs.append(split_output)
        cmds.append(hvite(split_list, split_output))

    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (
        new_mlf, merge_sil, ' '.join(outputs), output_dir)

    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    bad_count = 0
    ## A missing merged MLF is treated as "nothing aligned", as grep would
    mlf_labels = set()
    if os.path.isfile(new_mlf):
        with open(new_mlf) as fh:
            mlf_labels = set(os.path.basename(line.strip()).split('.')[0]
                             for line in fh if '.lab' in line)
    with open(mfc_list) as fh:
        mfc_labels = fh.read().splitlines()
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_labels:
            utt_id = os.path.basename(mfc).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in mlf_labels:
                if model.verbose > 0:
                    util.log_write(model.logfh, 'removed bad alignment [%s]' % utt_id)
                bad_count += 1
            else:
                ## Keep only utterances that have an alignment
                fh.write(mfc + '\n')
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' %
              (output_dir, output_dir))
    return output_dir
