Example #1
0
def word_to_phone_mlf(model, dict, word_mlf, phone_mlf, mono_list):
    """
    Convert the word-level mlf to a phone level mlf with HLEd

    Arguments:
      model     -- experiment object; model.exp is the directory that receives
                   mkphones0.led and the HLEd log, model.logfh / model.log are
                   only used to report a missing input file
      dict      -- path to the pronunciation dictionary (HLEd -d)
      word_mlf  -- path to the existing word-level MLF
      phone_mlf -- path of the phone-level MLF produced by HLEd (HLEd -i)
      mono_list -- path of the phone list written here, one phone per line

    Returns the number of distinct phones found in phone_mlf. A phone is any
    line of phone_mlf that is purely alphabetic after stripping whitespace.
    """

    if not os.path.isfile(word_mlf):
        util.log_write(model.logfh, 'No word MLF file here [%s]' % word_mlf)
        util.exit(model.log)

    if not os.path.isfile(dict):
        util.log_write(model.logfh, 'No dict file here [%s]' % dict)
        util.exit(model.log)

    ## Create mkphones0.led
    led_file = '%s/mkphones0.led' % model.exp
    with open(led_file, 'w') as fh:
        fh.write('EX\nIS sil sil\n')

    ## Convert the word level MLF into a phone MLF
    cmd_log = '%s/hhed_word_to_phone.log' % model.exp
    cmd = 'HLEd -A -T 1 -l "*"'
    cmd += ' -d %s' % dict
    cmd += ' -i %s' % phone_mlf
    cmd += ' %s %s > %s' % (led_file, word_mlf, cmd_log)
    os.system(cmd)

    ## Create list of phones (appearing in the phone MLF)
    ## Header, label and terminator lines contain non-letters, so isalpha()
    ## filters them out
    monophones = set()
    with open(phone_mlf) as fh:
        for line in fh:
            phone = line.strip()
            if phone.isalpha():
                monophones.add(phone)
    monophones = sorted(monophones)

    with open(mono_list, 'w') as fh:
        for phone in monophones:
            fh.write('%s\n' % phone)

    return len(monophones)
Example #2
0
def word_to_phone_mlf(model, dict, word_mlf, phone_mlf, mono_list):
    """
    Convert the word-level mlf to a phone level mlf with HLEd

    model supplies the experiment directory (model.exp) where the HLEd edit
    script and log are written, plus model.logfh / model.log for reporting a
    missing input. dict and word_mlf are input paths that must exist;
    phone_mlf is the MLF that HLEd writes; mono_list receives the sorted,
    de-duplicated phones (one per line).

    Returns the number of phones written to mono_list.
    """

    ## Both inputs are required before HLEd can be run
    if not os.path.isfile(word_mlf):
        util.log_write(model.logfh, 'No word MLF file here [%s]' % word_mlf)
        util.exit(model.log)

    if not os.path.isfile(dict):
        util.log_write(model.logfh, 'No dict file here [%s]' % dict)
        util.exit(model.log)

    ## Create mkphones0.led
    led_file = '%s/mkphones0.led' % model.exp
    with open(led_file, 'w') as led_fh:
        led_fh.write('EX\nIS sil sil\n')

    ## Convert the word level MLF into a phone MLF
    cmd_log = '%s/hhed_word_to_phone.log' % model.exp
    cmd = 'HLEd -A -T 1 -l "*" -d %s -i %s %s %s > %s' % (
        dict, phone_mlf, led_file, word_mlf, cmd_log)
    os.system(cmd)

    ## Create list of phones (appearing in the phone MLF)
    ## Only purely alphabetic lines count as phones
    with open(phone_mlf) as mlf_fh:
        stripped = (line.strip() for line in mlf_fh)
        monophones = sorted(set(p for p in stripped if p.isalpha()))

    with open(mono_list, 'w') as list_fh:
        list_fh.writelines('%s\n' % phone for phone in monophones)

    return len(monophones)
Example #3
0
def make_mlf_from_transcripts(model, orig_dict, setup, data_path, word_mlf, mfc_list, skip_oov=True):
    """
    An MLF is an HTK-formatted transcription file. This is created
    from the word-level transcripts in setup.

    Arguments:
      model     -- experiment object; model.verbose / model.logfh are used to
                   report out-of-dictionary words
      orig_dict -- path to the dictionary; the first field of every non-blank,
                   non-'#' line is taken (upper-cased) as a known word
      setup     -- path to the transcript file (read with gzip if the name
                   ends in 'gz'); per line the first field is the wav path and
                   fields from the third onward are the transcript words
      data_path -- passed through to coding.get_mfc_name_from_wav
      word_mlf  -- output path for the word-level MLF
      mfc_list  -- output path for the list of MFC files of kept utterances
      skip_oov  -- if True, drop any utterance containing a word that is not
                   in the dictionary (bracketed words and '.' are silently
                   removed instead and never cause a drop)

    Returns (count, words): the number of utterances kept and the set of all
    MLF entries written for them. Note that this set also contains the label
    lines and the '.' terminator, not just transcript words.
    """

    replace_escaped_words = True

    ## Load the dictionary words
    with open(orig_dict) as fh:
        dict_words = set(entry.split()[0].upper() for entry in fh.read().splitlines()
                         if not entry.startswith('#') and len(entry.strip()) > 0)
    words = set()

    if setup.endswith('gz'): setup_reader = gzip.open
    else: setup_reader = open

    ## Create MLF-format entries for each utterance
    mfcs = []
    mlf = ['#!MLF!#']
    count = 0
    with setup_reader(setup) as setup_fh:
        for line in setup_fh:
            items = line.strip().split()
            ## A blank line has no wav field; skip it instead of failing on items[0]
            if not items: continue

            skip = False
            wav = items[0]
            mfc = coding.get_mfc_name_from_wav(wav, data_path)
            curr = ['"*/%s.lab"' % os.path.basename(wav).split('.')[0]]
            for word in [item.upper() for item in items[2:]]:
                ## Try the word with its escape sequences stripped, but only
                ## adopt that form if the dictionary knows it
                if replace_escaped_words and '\\' in word:
                    new_word = re.sub(r'\\[^A-Za-z0-9]*', r'', word)
                    if new_word in dict_words: word = new_word

                if word not in dict_words:
                    ## Don't include bracketed words or periods in the labels
                    if word.startswith('[') and word.endswith(']'): continue
                    if word == '.': continue
                    if model.verbose > 0: util.log_write(model.logfh, 'not in dictionary [%s]' % word)

                    ## Remove the utterance if there are other non-dictionary words
                    if skip_oov: skip = True

                ## Labels starting with a digit get a leading underscore
                if word[0].isdigit(): word = '_' + word
                curr.append(word)

            ## Check for empty transcriptions
            if len(curr) <= 1: skip = True

            curr.append('.')
            if not skip:
                mlf.extend(curr)
                words.update(curr)
                mfcs.append(mfc)
                count += 1

    ## Write the MLF
    with open(word_mlf, 'w') as fh:
        fh.write('\n'.join(mlf) + '\n')

    ## Create a new MFC list file
    with open(mfc_list, 'w') as fh:
        for mfc in mfcs: fh.write('%s\n' % mfc)

    return count, words
Example #4
0
def build_lm_from_mlf(model, word_mlf, dictionary, vocab, lm_dir, lm, lm_order, target_ppl_ratio=None):
    """
    Build a language model using SRILM
    Use the transcripts in the word mlf
    Output to lm
    Output intermediate files in lm_dir
    Return perplexity on the training text

    Arguments:
      model            -- experiment object; only model.logfh is used (logging)
      word_mlf         -- path to the word-level MLF holding the transcripts
      dictionary       -- path to the dictionary; the first field of each
                          non-blank, non-'#' line (upper-cased) is a known word
      vocab            -- output path for the vocabulary (MLF words that are
                          also in the dictionary, sorted, one per line)
      lm_dir           -- directory for training.txt and the util.run jobs
      lm               -- output path of the language model
      lm_order         -- n-gram order
      target_ppl_ratio -- if set, search for a count cutoff (-gtNmin) whose
                          training perplexity is within 1 of
                          ratio * (perplexity without a cutoff); at most 11
                          search iterations are run
    """

    with open(dictionary) as fh:
        dict_words = set(entry.split()[0].upper() for entry in fh.read().splitlines()
                         if not entry.startswith('#') and len(entry.strip()) > 0)

    ## Extract a vocab from the MLF
    cmd = 'cat %s | grep ".lab" -v | grep "MLF" -v | sort | uniq' % word_mlf
    mlf_vocab = set(os.popen(cmd).read().splitlines())
    with open(vocab, 'w') as fh:
        for word in sorted(mlf_vocab.intersection(dict_words)):
            fh.write(word + '\n')

    ## Prepare to build an LM by creating a file with one sentence per line
    text_file = '%s/training.txt' % lm_dir
    text, curr = [], []
    with open(word_mlf) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith('#!MLF'): continue
            if line.startswith('"') and '.lab' in line: continue
            ## A lone '.' terminates the current utterance
            if line == '.':
                text.append(' '.join(curr))
                curr = []
                continue
            curr.append(line)

    with open(text_file, 'w') as fh:
        fh.write('\n'.join(text))

    def train_and_score(extra_params=''):
        ## Train the LM (optionally with extra ngram-count options) and return
        ## its perplexity on the training text
        cmd = 'ngram-count -vocab %s -order %d -text %s -lm %s' % (vocab, lm_order, text_file, lm)
        if extra_params: cmd += ' %s' % extra_params
        util.run(cmd, lm_dir)
        cmd = 'ngram -order %d -lm %s -ppl %s -debug 0' % (lm_order, lm, text_file)
        res = util.run(cmd, lm_dir)
        ## The perplexity is the 6th field of the 'zeroprobs' line in the output
        return float(os.popen('grep zeroprobs %s' % res).read().split()[5])

    ## Build a language model
    cutoff, cutoff_min, cutoff_max = 5, 1, 50
    iters, prev_cutoff = 0, 0

    ppl = train_and_score()
    if not target_ppl_ratio: return ppl
    util.log_write(model.logfh, '  cutoff [%d] gives ppl [%1.2f]' % (1, ppl))
    target_ppl = ppl * target_ppl_ratio

    ## Bisect on the cutoff between cutoff_min and cutoff_max
    while True:
        iters += 1
        ppl = train_and_score('-gt%dmin %d' % (lm_order, cutoff))

        if not target_ppl or abs(ppl - target_ppl) < 1: break
        if cutoff == prev_cutoff or iters > 10: break
        prev_cutoff = cutoff
        util.log_write(model.logfh, '  cutoff [%d] gives ppl [%1.2f]' % (cutoff, ppl))

        ## Floor division keeps the cutoff an integer on Python 3 as well, so
        ## the "cutoff stopped moving" test above can still trigger
        if ppl < target_ppl:
            cutoff_min = cutoff
            cutoff = (cutoff + cutoff_max) // 2
        else:
            cutoff_max = cutoff
            cutoff = (cutoff + cutoff_min) // 2

    ## Return perplexity on the training data
    return ppl
Example #5
0
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict,
                       model_list, gold_mlf):
    """
    Decode the utterances in mfc_list to lattices with HDecode, prune the
    utterances that produced no lattice, and score the recognition output.

    Arguments:
      model      -- experiment object; reads model.verbose, model.local,
                    model.jobs and model.logfh
      output_dir -- directory receiving the config, per-split lattice
                    directories, logs, train_recog.mlf and results.log
      model_dir  -- directory containing the MMF to decode with
      mfc_list   -- path to the MFC list; REWRITTEN in place to keep only
                    utterances with a .lat file (the previous content is
                    copied to <output_dir>/mfc_old.list)
      lm         -- language model path (HDecode -w)
      dict       -- dictionary path
      model_list -- HMM list path
      gold_mlf   -- reference MLF used by HResults

    Returns nothing; the HResults report is printed to stdout.
    """

    sys.stderr.write('Decoding to lattices\n')
    output_mlf = '%s/train_recog.mlf' % output_dir
    results_log = '%s/results.log' % output_dir

    ## Create a config file to use with HDecode
    ## Disabled: HLANGMODFILTER = "gunzip -c $.gz"
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = [
        'HNETFILTER = "gunzip -c < $.gz"',
        'HNETOFILTER = "gzip -c > $.gz"',
        'RAWMITFORMAT = T',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z',
        'GCFREQ = 50',
        'HLAT:TRACE = 19',
        'HLVNET:TRACE = 1',
        'HLVREC:TRACE = 1',
        'HLVLM:TRACE = 1',
        'LATPRUNEBEAM = 500.0',
        'MAXLMLA = 3.0',
        'BUILDLATSENTEND = T',
        'FORCELATOUT = F',
        'STARTWORD = <s>',
        'ENDWORD = </s>',
    ]
    with open(hdecode_config, 'w') as fh:
        fh.write('\n'.join(config_lines) + '\n')

    ## HDecode parameters
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(split_list, lat_dir):
        ## Build the HDecode command for one split of the MFC list
        cmd = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w %s' % lm
        cmd += ' -S %s' % split_list
        cmd += ' -l %s/' % lat_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_list))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        lat_dir = '%s/%s' % (output_dir, split_mfc.get_key(split_file))
        if not os.path.isdir(lat_dir): os.makedirs(lat_dir)
        cmds.append(hdecode(split_file, lat_dir))

    if model.local == 1:
        ## Run sequentially, echoing each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    ## A set makes the per-utterance membership test constant time
    lat_ids = set(os.path.basename(f).split('.')[0]
                  for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh, open(mfc_list, 'w') as new_fh:
        for mfc in old_fh:
            utt_id = os.path.basename(mfc.strip()).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in lat_ids:
                if model.verbose > 1:
                    util.log_write(model.logfh, 'removed bad lat [%s]' % utt_id)
                bad_count += 1
            else:
                new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)

    ## Create an MLF from the recognition output
    rec_files = util.get_files(output_dir, r'.*\.rec')
    os.system('rm -f %s' % output_mlf)
    with open(output_mlf, 'w') as fh:
        fh.write('#!MLF!#\n')
        for rec_file in rec_files:
            fh.write('"%s"\n' % rec_file)
            with open(rec_file) as rec_fh:
                for line in rec_fh:
                    ## Sentence boundary markers are not part of the transcript
                    if '<s>' in line or '</s>' in line: continue
                    fh.write(line)
            fh.write('.\n')

    ## Evaluate
    cmd = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    print(os.popen('cat ' + results_log).read())
Example #6
0
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list, lm,
                       dict, model_list):
    """
    Run HDecode.mod over existing lattices to produce phone-marked lattices,
    then prune utterances that produced no lattice from mfc_list.

    Arguments:
      model       -- experiment object; reads model.verbose, model.local,
                     model.jobs and model.logfh
      lattice_dir -- directory holding the input lattices, one sub-directory
                     per split key (the key with underscores removed is tried
                     as a fallback)
      output_dir  -- directory receiving the config, logs and output lattices
      model_dir   -- directory containing the MMF
      mfc_list    -- path to the MFC list; REWRITTEN in place to keep only
                     utterances with a .lat file under output_dir (the
                     previous content is copied to <output_dir>/mfc_old.list)
      lm          -- unused here: -w is passed to HDecode.mod without an argument
      dict        -- dictionary path
      model_list  -- HMM list path
    """

    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    ## Disabled: HLANGMODFILTER = "gunzip -c $.gz"
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = [
        'HNETFILTER = "gunzip -c < $.gz"',
        'HNETOFILTER = "gzip -c > $.gz"',
        'RAWMITFORMAT = T',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z',
        'GCFREQ = 50',
        'HLAT:TRACE = 19',
        'HLVNET:TRACE = 1',
        'HLVREC:TRACE = 1',
        'HLVLM:TRACE = 1',
        'LATPRUNEBEAM = 500.0',
        'MAXLMLA = 3.0',
        'BUILDLATSENTEND = T',
        'FORCELATOUT = F',
        'STARTWORD = <s>',
        'ENDWORD = </s>',
    ]
    with open(hdecode_config, 'w') as fh:
        fh.write('\n'.join(config_lines) + '\n')

    ## HDecode parameters
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(split_list, path):
        ## Build the HDecode.mod command for one split of the MFC list
        input_dir = '%s/%s/' % (lattice_dir, path)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' % (lattice_dir, path.replace('_', ''))
        cmd = 'HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f' % beam
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w'  # no language model argument is passed
        cmd += ' -S %s' % split_list
        cmd += ' -l %s/%s/' % (output_dir, path)
        cmd += ' -L %s' % input_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_list))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        key = split_mfc.get_key(split_file)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output): os.makedirs(new_output)

        cmds.append(hdecode_mod(split_file, key))

    if model.local == 1:
        ## Run sequentially, echoing each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode_mod.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    ## A set makes the per-utterance membership test constant time
    lat_ids = set(os.path.basename(f).split('.')[0]
                  for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh, open(mfc_list, 'w') as new_fh:
        for mfc in old_fh:
            utt_id = os.path.basename(mfc.strip()).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in lat_ids:
                if model.verbose > 1:
                    util.log_write(model.logfh, 'removed bad lat [%s]' % utt_id)
                bad_count += 1
            else:
                new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)
Example #7
0
def tie_states_search(model, output_dir, model_dir, mono_list, tri_list, tied_list):
    """
    Tie HMM states using decision tree clustering

    HHEd is run repeatedly with different TB thresholds (bisection between
    100 and 10000, starting from model.dt_tb) until the number of <MEAN>
    entries in the clustered MMF is within 1% of model.triphone_states.

    Arguments:
      model      -- experiment object; reads model.exp, model.dt_ro,
                    model.dt_tb, model.states, model.tree_questions,
                    model.triphone_states, model.local and model.logfh
      output_dir -- directory (re)created for the clustered model and logs
      model_dir  -- directory containing the input MMF and stats file
      mono_list  -- path to the monophone list
      tri_list   -- path to the triphone list given to HHEd
      tied_list  -- path of the tied list written by HHEd (CO command)

    Returns output_dir. The search stops without reaching the goal when the
    threshold stops moving; after more than 50 attempts the process exits
    with a non-zero status.
    """

    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' % output_dir
    tree_output = '%s/trees' % output_dir
    hhed_log = '%s/hhed_cluster.log' % output_dir
    all_tri_list = '%s/all_tri.list' % model.exp

    ## Decision tree parameters
    ro = model.dt_ro
    tb = model.dt_tb
    tb_min = 100.0
    tb_max = 10000.0

    ## Create the full list of possible triphones
    with open(mono_list) as fh:
        phones = fh.read().splitlines()
    non_sp_phones = [p for p in phones if p not in ['sp', 'sil']]
    with open(all_tri_list, 'w') as fh:
        fh.write('sp\n')
        fh.write('sil\n')
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' % p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' % (p1, p2))
                fh.write('%s-%s+sil\n' % (p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' % (p2, p1, p3))

    ## The question set does not change between attempts; read it once
    with open(model.tree_questions) as fh:
        tree_questions = fh.read()

    ## Search over tb arguments to get the right number states
    num_states = 0
    attempts = 0
    prev_tb = 0
    while True:

        os.system('rm -f %s %s %s' % (tree_hed, tree_output, hhed_log))

        ## Set up decision tree clustering
        with open(tree_hed, 'w') as fh:
            fh.write('RO %d %s/stats\n' % (ro, model_dir))
            fh.write('TR 0\n')
            fh.write('%s\n' % tree_questions)
            fh.write('TR 12\n')
            for p in non_sp_phones:
                ## States 2 .. model.states-1 (the first and last are skipped)
                for s in range(2, model.states):
                    fh.write('TB %d "ST_%s_%d_" {(%s,*-%s+*,%s+*,*-%s).state[%d]}\n' % (tb, p, s, p, p, p, p, s))
            fh.write('TR 1\n')
            fh.write('AU "%s"\n' % all_tri_list)
            fh.write('CO "%s"\n' % tied_list)
            fh.write('ST "%s"\n' % tree_output)

        ## Use HHEd to cluster
        cmd = 'HHEd -A -T 1 -H %s/MMF' % model_dir
        cmd += ' -M %s' % output_dir
        cmd += ' %s %s > %s' % (tree_hed, tri_list, hhed_log)

        if model.local == 1: os.system(cmd)
        else: util.run(cmd, output_dir)
        num_states = int(os.popen('grep -c "<MEAN>" %s/MMF' % output_dir).read().strip())

        ## Close enough to the goal (within 1%)
        if abs(float(num_states - model.triphone_states) / model.triphone_states) <= 0.01:
            util.log_write(model.logfh, ' current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        ## The threshold has stopped moving, so further attempts cannot help
        if abs(prev_tb - tb) <= 0.01:
            util.log_write(model.logfh, ' Could not converge. Stopping. Current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        attempts += 1
        prev_tb = tb
        if num_states < model.triphone_states:
            ## Too few states: lower the threshold
            tb = (tb_min + tb) / 2
            tb_max = prev_tb
        else:
            ## Too many states: raise the threshold
            tb = (tb_max + tb) / 2
            tb_min = prev_tb
        util.log_write(model.logfh, ' [%d] goal [%d] current states [%d] tb [%1.2f] -> [%1.2f] [%1.1f %1.1f]' % (attempts, model.triphone_states, num_states, prev_tb, tb, tb_min, tb_max))

        if attempts > 50:
            util.log_write(model.logfh, ' Goal not reached after 50 tries. Exiting.')
            ## Non-zero status so that callers can see this as a failure
            sys.exit(1)

    return output_dir
Example #8
0
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list, dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    Arguments:
      model        -- experiment object; reads model.setup_length,
                      model.local, model.jobs, model.verbose and model.logfh
      root_dir     -- parent directory; work happens in <root_dir>/Align,
                      which is (re)created
      mfc_list     -- path to the MFC list; REWRITTEN in place to keep only
                      utterances that have a label entry in new_mlf (the
                      previous content is copied to Align/mfc_old.list)
      model_dir    -- directory containing the MMF
      word_mlf     -- word-level MLF to align against (HVite -I)
      new_mlf      -- output MLF written by HLEd after merging silences
      model_list   -- HMM list path
      dict         -- dictionary path
      align_config -- HVite configuration file (HVite -C)

    Returns the Align output directory.
    """

    output_dir = '%s/Align' % root_dir
    util.create_new_dir(output_dir)
    ## Floor division keeps the split size an integer on Python 2 and 3 alike
    utts_per_split = max(100, (1 + (model.setup_length // 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(split_list, split_output):
        ## Build the HVite command for one split of the MFC list
        #-o SWT
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % split_output
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % split_list
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % split_output
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands
    cmds = []
    outputs = []
    split_files = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    for split_file in split_files:
        split_output = split_file.replace('mfc.list', 'align.output')
        outputs.append(split_output)
        cmds.append(hvite(split_file, split_output))

    if model.local == 1:
        ## Run sequentially, echoing each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (new_mlf, merge_sil, ' '.join(outputs), output_dir)

    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    bad_count = 0
    ## The backslash is doubled so grep receives a literal "\.lab"
    mlf_labels = os.popen('grep "\\.lab" %s' % new_mlf).read().splitlines()
    mlf_labels = set([os.path.basename(s).split('.')[0] for s in mlf_labels])
    ## Read the whole list before reopening the same path for writing
    with open(mfc_list) as fh:
        mfc_labels = fh.read().splitlines()
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_labels:
            utt_id = os.path.basename(mfc).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in mlf_labels:
                if model.verbose > 0: util.log_write(model.logfh, 'removed bad alignment [%s]' % utt_id)
                bad_count += 1
            else: fh.write(mfc + '\n')
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' % (output_dir, output_dir))
    return output_dir
Example #9
0
def tie_states_search(model, output_dir, model_dir, mono_list, tri_list,
                      tied_list):
    """
    Tie HMM states using decision tree clustering

    The TB threshold passed to HHEd is adjusted by bisection (bounds 100 and
    10000, initial value model.dt_tb) until the count of <MEAN> entries in
    the clustered MMF differs from model.triphone_states by at most 1%.

    model supplies exp, dt_ro, dt_tb, states, tree_questions,
    triphone_states, local and logfh. output_dir is (re)created and receives
    the HHEd script, trees, log and clustered model; model_dir holds the
    input MMF and stats; mono_list and tri_list are input lists; tied_list is
    written by HHEd.

    Returns output_dir. Gives up (still returning) once the threshold stops
    changing, and exits the process with a non-zero status after more than
    50 attempts.
    """

    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' % output_dir
    tree_output = '%s/trees' % output_dir
    hhed_log = '%s/hhed_cluster.log' % output_dir
    all_tri_list = '%s/all_tri.list' % model.exp

    ## Decision tree parameters
    ro = model.dt_ro
    tb = model.dt_tb
    tb_min = 100.0
    tb_max = 10000.0

    ## Create the full list of possible triphones
    with open(mono_list) as fh:
        phones = fh.read().splitlines()
    non_sp_phones = [p for p in phones if p not in ['sp', 'sil']]
    with open(all_tri_list, 'w') as fh:
        fh.write('sp\n')
        fh.write('sil\n')
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' % p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' % (p1, p2))
                fh.write('%s-%s+sil\n' % (p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' % (p2, p1, p3))

    ## Loop-invariant: load the question set a single time
    with open(model.tree_questions) as fh:
        tree_questions = fh.read()

    ## Search over tb arguments to get the right number states
    num_states = 0
    attempts = 0
    prev_tb = 0
    while True:

        os.system('rm -f %s %s %s' % (tree_hed, tree_output, hhed_log))

        ## Set up decision tree clustering
        with open(tree_hed, 'w') as fh:
            fh.write('RO %d %s/stats\n' % (ro, model_dir))
            fh.write('TR 0\n')
            fh.write('%s\n' % tree_questions)
            fh.write('TR 12\n')
            for p in non_sp_phones:
                ## Skip the first and last state of each model
                for s in range(2, model.states):
                    fh.write(
                        'TB %d "ST_%s_%d_" {(%s,*-%s+*,%s+*,*-%s).state[%d]}\n' %
                        (tb, p, s, p, p, p, p, s))
            fh.write('TR 1\n')
            fh.write('AU "%s"\n' % all_tri_list)
            fh.write('CO "%s"\n' % tied_list)
            fh.write('ST "%s"\n' % tree_output)

        ## Use HHEd to cluster
        cmd = 'HHEd -A -T 1 -H %s/MMF' % model_dir
        cmd += ' -M %s' % output_dir
        cmd += ' %s %s > %s' % (tree_hed, tri_list, hhed_log)

        if model.local == 1: os.system(cmd)
        else: util.run(cmd, output_dir)
        num_states = int(
            os.popen('grep -c "<MEAN>" %s/MMF' % output_dir).read().strip())

        ## Success: within 1% of the requested number of states
        relative_error = abs(
            float(num_states - model.triphone_states) / model.triphone_states)
        if relative_error <= 0.01:
            util.log_write(
                model.logfh,
                ' current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        ## No progress possible once the threshold no longer changes
        if abs(prev_tb - tb) <= 0.01:
            util.log_write(
                model.logfh,
                ' Could not converge. Stopping. Current states [%d] tb [%1.2f]'
                % (num_states, tb))
            break

        attempts += 1
        prev_tb = tb
        if num_states < model.triphone_states:
            ## Too few states: move the threshold down
            tb = (tb_min + tb) / 2
            tb_max = prev_tb
        else:
            ## Too many states: move the threshold up
            tb = (tb_max + tb) / 2
            tb_min = prev_tb
        util.log_write(
            model.logfh,
            ' [%d] goal [%d] current states [%d] tb [%1.2f] -> [%1.2f] [%1.1f %1.1f]'
            % (attempts, model.triphone_states, num_states, prev_tb, tb,
               tb_min, tb_max))

        if attempts > 50:
            util.log_write(model.logfh,
                           ' Goal not reached after 50 tries. Exiting.')
            ## Report the failure through the exit status
            sys.exit(1)

    return output_dir
Example #10
0
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list,
          dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    The MFC list is split into chunks, HVite is run on each chunk (serially
    when model.local == 1, otherwise through util.run_parallel), the chunk
    outputs are merged into new_mlf by HLEd with sp/sil merge rules, and
    utterances without a label entry in new_mlf are removed from mfc_list.

    model supplies setup_length, local, jobs, verbose and logfh. Work files
    live in <root_dir>/Align, which is (re)created; mfc_list is rewritten in
    place after a copy is saved as Align/mfc_old.list. model_dir holds the
    MMF; word_mlf, model_list, dict and align_config are passed to HVite.

    Returns the Align directory path.
    """

    output_dir = '%s/Align' % root_dir
    util.create_new_dir(output_dir)
    ## '//' so the value formatted into 'split -l' is an integer on Python 3 too
    utts_per_split = max(100, (1 + (model.setup_length // 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(chunk_list, chunk_output):
        ## Build the HVite command for one chunk of the MFC list
        #-o SWT
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % chunk_output
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % chunk_list
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % chunk_output
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands
    chunk_lists = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    outputs = [c.replace('mfc.list', 'align.output') for c in chunk_lists]
    cmds = [hvite(c, o) for c, o in zip(chunk_lists, outputs)]

    if model.local == 1:
        ## Run sequentially, echoing each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (
        new_mlf, merge_sil, ' '.join(outputs), output_dir)

    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    bad_count = 0
    ## Escaped backslash: grep still receives the pattern "\.lab"
    label_lines = os.popen('grep "\\.lab" %s' % new_mlf).read().splitlines()
    mlf_labels = set(os.path.basename(s).split('.')[0] for s in label_lines)
    ## The list is fully read before the same path is truncated for writing
    with open(mfc_list) as fh:
        mfc_labels = fh.read().splitlines()
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_labels:
            utt_id = os.path.basename(mfc).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in mlf_labels:
                if model.verbose > 0:
                    util.log_write(model.logfh,
                                   'removed bad alignment [%s]' % utt_id)
                bad_count += 1
            else:
                fh.write(mfc + '\n')
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' %
              (output_dir, output_dir))
    return output_dir
Example #11
0
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict, model_list, gold_mlf):
    """
    Decode the utterances in mfc_list to lattices with HDecode.

    Steps: write an HDecode config into output_dir, run one HDecode job per
    split of the MFC list (serially when model.local == 1, otherwise through
    util.run_parallel), rewrite mfc_list in place so it only keeps utterances
    that have a .lat file (the previous list is saved as
    <output_dir>/mfc_old.list), collect the .rec files into
    <output_dir>/train_recog.mlf, and score that MLF against gold_mlf with
    HResults. The HResults report is written to <output_dir>/results.log and
    printed.

    model supplies verbose, local, jobs and logfh; model_dir holds the MMF;
    lm, dict and model_list are passed to HDecode. Returns nothing.
    """

    sys.stderr.write('Decoding to lattices\n')
    output_mlf = '%s/train_recog.mlf' % output_dir
    results_log = '%s/results.log' % output_dir

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    with open(hdecode_config, 'w') as fh:
        ## Disabled: HLANGMODFILTER = "gunzip -c $.gz"
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        fh.write('GCFREQ = 50\n')
        fh.write('HLAT:TRACE = 19\n')
        fh.write('HLVNET:TRACE = 1\n')
        fh.write('HLVREC:TRACE = 1\n')
        fh.write('HLVLM:TRACE = 1\n')
        fh.write('LATPRUNEBEAM = 500.0\n')
        fh.write('MAXLMLA = 3.0\n')
        fh.write('BUILDLATSENTEND = T\n')
        fh.write('FORCELATOUT = F\n')
        fh.write('STARTWORD = <s>\n')
        fh.write('ENDWORD = </s>\n')

    ## HDecode parameters
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(split_list, lat_dir):
        ## Build the HDecode command for one split of the MFC list
        cmd = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w %s' % lm
        cmd += ' -S %s' % split_list
        cmd += ' -l %s/' % lat_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0: cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_list))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        lat_dir = '%s/%s' % (output_dir, split_mfc.get_key(split_file))
        if not os.path.isdir(lat_dir): os.makedirs(lat_dir)
        cmds.append(hdecode(split_file, lat_dir))

    if model.local == 1:
        ## Run sequentially, echoing each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    ## Held in a set so each lookup below is constant time
    lat_ids = set(os.path.basename(f).split('.')[0] for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh, open(mfc_list, 'w') as new_fh:
        for mfc in old_fh:
            utt_id = os.path.basename(mfc.strip()).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in lat_ids:
                if model.verbose > 1: util.log_write(model.logfh, 'removed bad lat [%s]' % utt_id)
                bad_count += 1
            else: new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)

    ## Create an MLF from the recognition output
    rec_files = util.get_files(output_dir, r'.*\.rec')
    os.system('rm -f %s' % output_mlf)
    with open(output_mlf, 'w') as fh:
        fh.write('#!MLF!#\n')
        for rec_file in rec_files:
            fh.write('"%s"\n' % rec_file)
            with open(rec_file) as rec_fh:
                for line in rec_fh:
                    ## Drop the sentence start/end marker lines
                    if '<s>' in line or '</s>' in line: continue
                    fh.write(line)
            fh.write('.\n')

    ## Evaluate
    cmd = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    print(os.popen('cat ' + results_log).read())
Example #12
0
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list, lm, dict, model_list):
    """
    Run HDecode.mod over existing lattices to produce phone-marked lattices.

    Steps: write an HDecode config into output_dir, run one HDecode.mod job
    per split of the MFC list (serially when model.local == 1, otherwise
    through util.run_parallel), then rewrite mfc_list in place so it only
    keeps utterances that have a .lat file under output_dir (the previous
    list is saved as <output_dir>/mfc_old.list).

    Input lattices for a split are read from <lattice_dir>/<key>/, falling
    back to the key with underscores removed when that directory is missing.
    model supplies verbose, local, jobs and logfh; model_dir holds the MMF;
    dict and model_list are passed to HDecode.mod. The lm argument is not
    used: -w is given without a value. Returns nothing.
    """

    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    with open(hdecode_config, 'w') as fh:
        ## Disabled: HLANGMODFILTER = "gunzip -c $.gz"
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        fh.write('GCFREQ = 50\n')
        fh.write('HLAT:TRACE = 19\n')
        fh.write('HLVNET:TRACE = 1\n')
        fh.write('HLVREC:TRACE = 1\n')
        fh.write('HLVLM:TRACE = 1\n')
        fh.write('LATPRUNEBEAM = 500.0\n')
        fh.write('MAXLMLA = 3.0\n')
        fh.write('BUILDLATSENTEND = T\n')
        fh.write('FORCELATOUT = F\n')
        fh.write('STARTWORD = <s>\n')
        fh.write('ENDWORD = </s>\n')

    ## HDecode parameters
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(split_list, path):
        ## Build the HDecode.mod command for one split of the MFC list
        input_dir = '%s/%s/' % (lattice_dir, path)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' % (lattice_dir, path.replace('_', ''))
        cmd = 'HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f' % beam
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w'  # no language model argument is passed
        cmd += ' -S %s' % split_list
        cmd += ' -l %s/%s/' % (output_dir, path)
        cmd += ' -L %s' % input_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0: cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_list))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        key = split_mfc.get_key(split_file)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output): os.makedirs(new_output)

        cmds.append(hdecode_mod(split_file, key))

    if model.local == 1:
        ## Run sequentially, echoing each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode_mod.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    ## Held in a set so each lookup below is constant time
    lat_ids = set(os.path.basename(f).split('.')[0] for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh, open(mfc_list, 'w') as new_fh:
        for mfc in old_fh:
            utt_id = os.path.basename(mfc.strip()).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in lat_ids:
                if model.verbose > 1: util.log_write(model.logfh, 'removed bad lat [%s]' % utt_id)
                bad_count += 1
            else: new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)