def word_to_phone_mlf(model, dict, word_mlf, phone_mlf, mono_list):
    """
    Convert the word-level MLF to a phone-level MLF with HLEd.

    After HLEd has run, the phone MLF is scanned and every line that is
    purely alphabetic is treated as a phone; the sorted set of those phones
    is written to mono_list, one per line.

    model     -- object providing exp (directory that receives the generated
                 mkphones0.led script and the HLEd log) and logfh / log,
                 which are only used when an input file is missing
    dict      -- path of the dictionary handed to HLEd -d (the name shadows
                 the builtin but is part of the public signature)
    word_mlf  -- path of the existing word-level MLF
    phone_mlf -- path of the phone-level MLF that HLEd writes (-i)
    mono_list -- path of the phone list file to create

    Returns the number of distinct phones written to mono_list.
    """
    # NOTE(review): log_write receives model.logfh while exit receives
    # model.log -- confirm that both attributes exist on the model object.
    if not os.path.isfile(word_mlf):
        util.log_write(model.logfh, 'No word MLF file here [%s]' % word_mlf)
        util.exit(model.log)
    if not os.path.isfile(dict):
        util.log_write(model.logfh, 'No dict file here [%s]' % dict)
        util.exit(model.log)

    ## Create mkphones0.led
    led_file = '%s/mkphones0.led' % model.exp
    with open(led_file, 'w') as fh:
        fh.write('EX\nIS sil sil\n')

    ## Convert the word level MLF into a phone MLF
    cmd_log = '%s/hhed_word_to_phone.log' % model.exp
    cmd = 'HLEd -A -T 1 -l "*"'
    cmd += ' -d %s' % dict
    cmd += ' -i %s' % phone_mlf
    cmd += ' %s %s > %s' % (led_file, word_mlf, cmd_log)
    os.system(cmd)

    ## Create list of phones (appearing in the phone MLF)
    # isalpha() filters out the MLF header, the quoted label lines and the
    # '.' terminators, none of which are purely alphabetic.
    monophones = set()
    with open(phone_mlf) as fh:
        for line in fh:
            phone = line.strip()
            if phone.isalpha():
                monophones.add(phone)
    monophones = sorted(monophones)

    with open(mono_list, 'w') as fh:
        for phone in monophones:
            fh.write('%s\n' % phone)

    return len(monophones)
def word_to_phone_mlf(model, dict, word_mlf, phone_mlf, mono_list):
    """
    Convert the word-level MLF to a phone-level MLF with HLEd.

    After HLEd has run, the phone MLF is scanned and every line that is
    purely alphabetic is treated as a phone; the sorted set of those phones
    is written to mono_list, one per line.

    model     -- object providing exp (directory that receives the generated
                 mkphones0.led script and the HLEd log) and logfh / log,
                 which are only used when an input file is missing
    dict      -- path of the dictionary handed to HLEd -d (the name shadows
                 the builtin but is part of the public signature)
    word_mlf  -- path of the existing word-level MLF
    phone_mlf -- path of the phone-level MLF that HLEd writes (-i)
    mono_list -- path of the phone list file to create

    Returns the number of distinct phones written to mono_list.
    """
    # NOTE(review): log_write receives model.logfh while exit receives
    # model.log -- confirm that both attributes exist on the model object.
    if not os.path.isfile(word_mlf):
        util.log_write(model.logfh, 'No word MLF file here [%s]' % word_mlf)
        util.exit(model.log)
    if not os.path.isfile(dict):
        util.log_write(model.logfh, 'No dict file here [%s]' % dict)
        util.exit(model.log)

    ## Create mkphones0.led
    led_file = '%s/mkphones0.led' % model.exp
    with open(led_file, 'w') as fh:
        fh.write('EX\nIS sil sil\n')

    ## Convert the word level MLF into a phone MLF
    cmd_log = '%s/hhed_word_to_phone.log' % model.exp
    cmd = 'HLEd -A -T 1 -l "*"'
    cmd += ' -d %s' % dict
    cmd += ' -i %s' % phone_mlf
    cmd += ' %s %s > %s' % (led_file, word_mlf, cmd_log)
    os.system(cmd)

    ## Create list of phones (appearing in the phone MLF)
    # isalpha() filters out the MLF header, the quoted label lines and the
    # '.' terminators, none of which are purely alphabetic.
    monophones = set()
    with open(phone_mlf) as fh:
        for line in fh:
            phone = line.strip()
            if phone.isalpha():
                monophones.add(phone)
    monophones = sorted(monophones)

    with open(mono_list, 'w') as fh:
        for phone in monophones:
            fh.write('%s\n' % phone)

    return len(monophones)
def make_mlf_from_transcripts(model, orig_dict, setup, data_path, word_mlf,
                              mfc_list, skip_oov=True):
    """
    An MLF is an HTK-formatted transcription file. This is created from the
    word-level transcripts in setup.

    Each non-blank line of setup is split on whitespace: the first token is
    the wav path and the tokens from the third one onwards are the
    transcript (the second token is not used here). Transcript words are
    upper-cased before the dictionary lookup.

    model     -- object providing verbose and logfh (used to report
                 out-of-dictionary words)
    orig_dict -- path of the dictionary; the first token of every
                 non-comment, non-blank line is taken as a known word
    setup     -- path of the setup file; opened with gzip when the name
                 ends in 'gz'
    data_path -- passed through to coding.get_mfc_name_from_wav
    word_mlf  -- path of the MLF file to write
    mfc_list  -- path of the MFC list file to write (one entry per kept
                 utterance)
    skip_oov  -- when true, an utterance containing a word that is not in
                 the dictionary is dropped entirely

    Returns (count, words): the number of utterances kept and the set of
    every entry written to the MLF for them.
    """
    replace_escaped_words = True

    ## Load the dictionary words
    with open(orig_dict) as fh:
        dict_words = set(entry.split()[0].upper()
                         for entry in fh.read().splitlines()
                         if not entry.startswith('#') and entry.strip())
    words = set()

    if setup.endswith('gz'):
        setup_fh = gzip.open(setup)
    else:
        setup_fh = open(setup)

    ## Create MLF-format entries for each utterance
    mfcs = []
    mlf = ['#!MLF!#']
    count = 0
    try:
        for line in setup_fh:
            items = line.strip().split()
            # A blank line has no wav path; skip it instead of failing on
            # items[0].
            if not items:
                continue
            skip = False
            wav = items[0]
            mfc = coding.get_mfc_name_from_wav(wav, data_path)
            curr = ['"*/%s.lab"' % os.path.basename(wav).split('.')[0]]
            trans = [item.upper() for item in items[2:]]
            for word in trans:
                # Try the word with its backslash escape stripped, but only
                # adopt that form when the dictionary knows it.
                if replace_escaped_words and '\\' in word:
                    new_word = re.sub(r'\\[^A-Za-z0-9]*', r'', word)
                    if new_word in dict_words:
                        word = new_word
                if word not in dict_words:
                    ## Don't include bracketed words or periods in the labels
                    if word.startswith('[') and word.endswith(']'):
                        continue
                    if word == '.':
                        continue
                    if model.verbose > 0:
                        util.log_write(model.logfh,
                                       'not in dictionary [%s]' % word)
                    ## Remove the utterance if there are other
                    ## non-dictionary words
                    if skip_oov:
                        skip = True
                # Labels that start with a digit get a '_' prefix.
                if word[0].isdigit():
                    word = '_' + word
                curr.append(word)

            ## Check for empty transcriptions
            if len(curr) <= 1:
                skip = True
            curr.append('.')

            if not skip:
                mlf.extend(curr)
                # NOTE(review): curr also holds the '"*/x.lab"' header and
                # the '.' terminator, so both end up in the returned set --
                # confirm callers expect that.
                words.update(curr)
                mfcs.append(mfc)
                count += 1
    finally:
        setup_fh.close()

    ## Write the MLF
    with open(word_mlf, 'w') as fh:
        fh.write('\n'.join(mlf) + '\n')

    ## Create a new MFC list file
    with open(mfc_list, 'w') as fh:
        for mfc in mfcs:
            fh.write('%s\n' % mfc)

    return count, words
def build_lm_from_mlf(model, word_mlf, dictionary, vocab, lm_dir, lm,
                      lm_order, target_ppl_ratio=None):
    """
    Build a language model using SRILM

    Use the transcripts in the word mlf
    Output to lm
    Output intermediate files in lm_dir
    Return perplexity on the training text

    model            -- object providing logfh for progress messages
    word_mlf         -- path of the word-level MLF holding the transcripts
    dictionary       -- path of the dictionary; only MLF words that also
                        occur here (upper-cased first token) go into vocab
    vocab            -- path of the vocabulary file to write
    lm_dir           -- directory for training.txt; also passed to util.run
    lm               -- path of the language model file to write
    lm_order         -- n-gram order handed to ngram-count / ngram
    target_ppl_ratio -- when falsy, a single model is built and its
                        perplexity returned; otherwise the -gt<order>min
                        cutoff is bisected between 1 and 50 until the
                        perplexity is within 1 of
                        (initial perplexity * target_ppl_ratio), the cutoff
                        stops changing, or more than 10 models were built
    """
    with open(dictionary) as fh:
        dict_words = set(entry.split()[0].upper()
                         for entry in fh.read().splitlines()
                         if not entry.startswith('#') and entry.strip())

    ## Prepare to build an LM by creating a file with one sentence per line
    text_file = '%s/training.txt' % lm_dir
    text, curr = [], []

    ## Extract a vocab from the MLF
    cmd = ('cat %s | grep ".lab" -v | grep "MLF" -v | sort | uniq'
           % word_mlf)
    mlf_vocab = set(os.popen(cmd).read().splitlines())
    mlf_dict_vocab = sorted(mlf_vocab.intersection(dict_words))
    with open(vocab, 'w') as fh:
        for word in mlf_dict_vocab:
            fh.write(word + '\n')

    # One sentence per utterance: words are collected until the '.' line
    # that ends each MLF entry.
    with open(word_mlf) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith('#!MLF'):
                continue
            if line.startswith('"') and '.lab' in line:
                continue
            if line == '.':
                text.append(' '.join(curr))
                curr = []
                continue
            curr.append(line)

    with open(text_file, 'w') as fh:
        fh.write('\n'.join(text))

    def train_and_score(params=None):
        """Train the LM (optionally with extra ngram-count parameters) and
        return its perplexity on the training text."""
        train_cmd = ('ngram-count -vocab %s -order %d -text %s -lm %s'
                     % (vocab, lm_order, text_file, lm))
        if params is not None:
            train_cmd += ' %s' % params
        util.run(train_cmd, lm_dir)

        ppl_cmd = ('ngram -order %d -lm %s -ppl %s -debug 0'
                   % (lm_order, lm, text_file))
        res = util.run(ppl_cmd, lm_dir)
        # util.run's return value is used as a file path here; the sixth
        # whitespace-separated token of its 'zeroprobs' line is the ppl.
        return float(os.popen('grep zeroprobs %s' % res).read().split()[5])

    ## Build a language model
    cutoff, cutoff_min, cutoff_max = 5, 1, 50
    iters, prev_cutoff = 0, 0

    ppl = train_and_score()
    if not target_ppl_ratio:
        return ppl

    util.log_write(model.logfh, ' cutoff [%d] gives ppl [%1.2f]' % (1, ppl))
    target_ppl = ppl * target_ppl_ratio

    while True:
        iters += 1
        ppl = train_and_score('-gt%dmin %d' % (lm_order, cutoff))

        if not target_ppl or abs(ppl - target_ppl) < 1:
            break
        if cutoff == prev_cutoff or iters > 10:
            break

        prev_cutoff = cutoff
        util.log_write(model.logfh,
                       ' cutoff [%d] gives ppl [%1.2f]' % (cutoff, ppl))

        # Bisect on the cutoff; '//' keeps it an integer regardless of the
        # interpreter's division semantics.
        if ppl < target_ppl:
            cutoff_min = cutoff
            cutoff = (cutoff + cutoff_max) // 2
        else:
            cutoff_max = cutoff
            cutoff = (cutoff + cutoff_min) // 2

    ## Return perplexity on the training data
    return ppl
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict,
                       model_list, gold_mlf):
    """
    Decode the utterances in mfc_list to lattices with HDecode and score
    the recognition output with HResults.

    Steps:
      1. write hdecode.config into output_dir
      2. split mfc_list with SplitList and build one HDecode command per
         split; run them in-process when model.local == 1, otherwise write
         them to hdecode.commands and hand that to util.run_parallel
      3. copy mfc_list to output_dir/mfc_old.list, then rewrite mfc_list
         keeping only utterances whose id matches a .lat file returned by
         util.get_files
      4. concatenate every .rec file returned by util.get_files into
         output_dir/train_recog.mlf (lines containing <s> or </s> dropped)
      5. run HResults against gold_mlf into output_dir/results.log and
         print that log

    model      -- object providing verbose, local, jobs and logfh
    output_dir -- directory for the config, per-split outputs, MLF and logs
    model_dir  -- directory containing the MMF passed to HDecode -H
    mfc_list   -- path of the MFC list; rewritten in place (see step 3)
    lm         -- language model path passed to HDecode -w
    dict       -- dictionary path (the name shadows the builtin but is part
                  of the public signature)
    model_list -- model list path passed to HDecode and HResults
    gold_mlf   -- reference MLF passed to HResults -I

    Returns None.
    """
    sys.stderr.write('Decoding to lattices\n')

    output_mlf = '%s/train_recog.mlf' % output_dir
    results_log = '%s/results.log' % output_dir

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = [
        'HNETFILTER = "gunzip -c < $.gz"',
        'HNETOFILTER = "gzip -c > $.gz"',
        'RAWMITFORMAT = T',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z',
        'GCFREQ = 50',
        'HLAT:TRACE = 19',
        'HLVNET:TRACE = 1',
        'HLVREC:TRACE = 1',
        'HLVLM:TRACE = 1',
        'LATPRUNEBEAM = 500.0',
        'MAXLMLA = 3.0',
        'BUILDLATSENTEND = T',
        'FORCELATOUT = F',
        'STARTWORD = <s>',
        'ENDWORD = </s>',
    ]
    with open(hdecode_config, 'w') as fh:
        fh.write('\n'.join(config_lines) + '\n')

    ## HDecode parameters
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(split_file, lat_dir):
        """Build the HDecode command line for one split of the MFC list."""
        cmd = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w %s' % lm
        cmd += ' -S %s' % split_file
        cmd += ' -l %s/' % lat_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_file))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        lat_dir = '%s/%s' % (output_dir, split_mfc.get_key(split_file))
        if not os.path.isdir(lat_dir):
            os.makedirs(lat_dir)
        cmds.append(hdecode(split_file, lat_dir))

    if model.local == 1:
        for cmd in cmds:
            # Single-argument print() behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    # A set keeps the per-utterance membership test O(1).
    lat_ids = set(os.path.basename(f).split('.')[0]
                  for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh:
        with open(mfc_list, 'w') as new_fh:
            for mfc in old_fh:
                utt_id = os.path.basename(mfc.strip()).split('.')[0]
                ## Check for missing transcriptions
                if utt_id not in lat_ids:
                    if model.verbose > 1:
                        util.log_write(model.logfh,
                                       'removed bad lat [%s]' % utt_id)
                    bad_count += 1
                else:
                    new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)

    ## Create an MLF from the recognition output
    outputs = util.get_files(output_dir, r'.*\.rec')
    # Remove any stale MLF synchronously (the equivalent of 'rm -f') so the
    # removal cannot race with the open() that follows.
    try:
        os.remove(output_mlf)
    except OSError:
        pass
    with open(output_mlf, 'w') as mlf_fh:
        mlf_fh.write('#!MLF!#\n')
        for rec_file in outputs:
            mlf_fh.write('"%s"\n' % rec_file)
            with open(rec_file) as rec_fh:
                for line in rec_fh:
                    if '<s>' in line or '</s>' in line:
                        continue
                    mlf_fh.write(line)
            mlf_fh.write('.\n')

    ## Evaluate
    cmd = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    with open(results_log) as fh:
        print(fh.read())
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list,
                       lm, dict, model_list):
    """
    Run HDecode.mod over existing lattices to produce new lattices in
    output_dir.

    Steps:
      1. write hdecode.config into output_dir
      2. split mfc_list with SplitList and build one HDecode.mod command per
         split; each reads lattices from lattice_dir/<key>/ (falling back to
         the key with underscores removed when that directory is missing)
         and writes to output_dir/<key>/
      3. run the commands in-process when model.local == 1, otherwise write
         them to hdecode_mod.commands and hand that to util.run_parallel
      4. copy mfc_list to output_dir/mfc_old.list, then rewrite mfc_list
         keeping only utterances whose id matches a .lat file returned by
         util.get_files for output_dir

    model       -- object providing verbose, local, jobs and logfh
    lattice_dir -- directory holding the input lattices, one sub-directory
                   per split key
    output_dir  -- directory for the config, new lattices and logs
    model_dir   -- directory containing the MMF passed to HDecode.mod -H
    mfc_list    -- path of the MFC list; rewritten in place (see step 4)
    lm          -- accepted for interface compatibility; not placed on the
                   command line (-w is given without an argument)
    dict        -- dictionary path (the name shadows the builtin but is
                   part of the public signature)
    model_list  -- model list path passed to HDecode.mod

    Returns None.
    """
    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = [
        'HNETFILTER = "gunzip -c < $.gz"',
        'HNETOFILTER = "gzip -c > $.gz"',
        'RAWMITFORMAT = T',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z',
        'GCFREQ = 50',
        'HLAT:TRACE = 19',
        'HLVNET:TRACE = 1',
        'HLVREC:TRACE = 1',
        'HLVLM:TRACE = 1',
        'LATPRUNEBEAM = 500.0',
        'MAXLMLA = 3.0',
        'BUILDLATSENTEND = T',
        'FORCELATOUT = F',
        'STARTWORD = <s>',
        'ENDWORD = </s>',
    ]
    with open(hdecode_config, 'w') as fh:
        fh.write('\n'.join(config_lines) + '\n')

    ## HDecode parameters
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(split_file, key):
        """Build the HDecode.mod command line for one split."""
        input_dir = '%s/%s/' % (lattice_dir, key)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' % (lattice_dir, key.replace('_', ''))
        cmd = ('HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s'
               % hdecode_config)
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f' % beam
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        # -w is passed bare; the lm argument is deliberately left off.
        cmd += ' -w'
        cmd += ' -S %s' % split_file
        cmd += ' -l %s/%s/' % (output_dir, key)
        cmd += ' -L %s' % input_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_file))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        key = split_mfc.get_key(split_file)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hdecode_mod(split_file, key))

    if model.local == 1:
        for cmd in cmds:
            # Single-argument print() behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode_mod.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    # A set keeps the per-utterance membership test O(1).
    lat_ids = set(os.path.basename(f).split('.')[0]
                  for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh:
        with open(mfc_list, 'w') as new_fh:
            for mfc in old_fh:
                utt_id = os.path.basename(mfc.strip()).split('.')[0]
                ## Check for missing transcriptions
                if utt_id not in lat_ids:
                    if model.verbose > 1:
                        util.log_write(model.logfh,
                                       'removed bad lat [%s]' % utt_id)
                    bad_count += 1
                else:
                    new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)
def tie_states_search(model, output_dir, model_dir, mono_list, tri_list,
                      tied_list):
    """
    Tie HMM states using decision tree clustering

    HHEd is run repeatedly with different TB thresholds, bisecting between
    100.0 and 10000.0, until the number of "<MEAN>" entries in
    output_dir/MMF is within 1% of model.triphone_states. The search also
    stops when the threshold changes by no more than 0.01 between
    iterations, and exits the process after more than 50 attempts.

    model      -- object providing exp, dt_ro, dt_tb, tree_questions,
                  states, triphone_states, local and logfh
    output_dir -- directory (re)created via util.create_new_dir; receives
                  tree.hed, trees, the HHEd log and the clustered MMF
    model_dir  -- directory holding the input MMF and stats file
    mono_list  -- path of the monophone list, one phone per line
    tri_list   -- path of the triphone list handed to HHEd
    tied_list  -- path of the tied list that HHEd writes (CO command)

    Also writes model.exp/all_tri.list with every triphone built from the
    phones in mono_list other than 'sp' and 'sil'.

    Returns output_dir.
    """
    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' % output_dir
    tree_output = '%s/trees' % output_dir
    hhed_log = '%s/hhed_cluster.log' % output_dir
    all_tri_list = '%s/all_tri.list' % model.exp

    ## Decision tree parameters
    ro = model.dt_ro
    tb = model.dt_tb
    tb_min = 100.0
    tb_max = 10000.0

    ## Create the full list of possible triphones
    with open(mono_list) as fh:
        phones = fh.read().splitlines()
    non_sp_phones = [p for p in phones if p not in ('sp', 'sil')]

    with open(all_tri_list, 'w') as fh:
        fh.write('sp\n')
        fh.write('sil\n')
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' % p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' % (p1, p2))
                fh.write('%s-%s+sil\n' % (p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' % (p2, p1, p3))

    # The question set does not change between iterations; read it once.
    with open(model.tree_questions) as fh:
        questions = fh.read()

    # All states except the first and the last one.
    inner_states = range(2, model.states)
    goal = model.triphone_states

    ## Search over tb arguments to get the right number states
    num_states = 0
    attempts = 0
    prev_tb = 0
    while True:
        os.system('rm -f %s %s %s' % (tree_hed, tree_output, hhed_log))

        ## Set up decision tree clustering
        with open(tree_hed, 'w') as fh:
            fh.write('RO %d %s/stats\n' % (ro, model_dir))
            fh.write('TR 0\n')
            fh.write('%s\n' % questions)
            fh.write('TR 12\n')
            for p in non_sp_phones:
                for s in inner_states:
                    # The threshold is written as a float: formatting it
                    # with %d would truncate the bisection value while the
                    # convergence test below works at 0.01 resolution.
                    fh.write(
                        'TB %.2f "ST_%s_%d_" '
                        '{(%s,*-%s+*,%s+*,*-%s).state[%d]}\n'
                        % (tb, p, s, p, p, p, p, s))
            fh.write('TR 1\n')
            fh.write('AU "%s"\n' % all_tri_list)
            fh.write('CO "%s"\n' % tied_list)
            fh.write('ST "%s"\n' % tree_output)

        ## Use HHEd to cluster
        cmd = 'HHEd -A -T 1 -H %s/MMF' % model_dir
        cmd += ' -M %s' % output_dir
        cmd += ' %s %s > %s' % (tree_hed, tri_list, hhed_log)
        if model.local == 1:
            os.system(cmd)
        else:
            util.run(cmd, output_dir)

        num_states = int(
            os.popen('grep -c "<MEAN>" %s/MMF' % output_dir).read().strip())

        if abs(float(num_states - goal) / goal) <= 0.01:
            util.log_write(
                model.logfh,
                ' current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        if abs(prev_tb - tb) <= 0.01:
            util.log_write(
                model.logfh,
                ' Could not converge. Stopping. '
                'Current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        attempts += 1
        prev_tb = tb
        # Too few states: move the threshold towards tb_min; too many:
        # move it towards tb_max.
        if num_states < goal:
            tb = (tb_min + tb) / 2.0
            tb_max = prev_tb
        else:
            tb = (tb_max + tb) / 2.0
            tb_min = prev_tb
        util.log_write(
            model.logfh,
            ' [%d] goal [%d] current states [%d] tb [%1.2f] -> [%1.2f] '
            '[%1.1f %1.1f]'
            % (attempts, goal, num_states, prev_tb, tb, tb_min, tb_max))

        if attempts > 50:
            util.log_write(model.logfh,
                           ' Goal not reached after 50 tries. Exiting.')
            # Non-zero status so callers and shell wrappers see the failure.
            sys.exit(1)

    return output_dir
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf,
          model_list, dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    Steps:
      1. (re)create root_dir/Align via util.create_new_dir and copy
         mfc_list to Align/mfc_old.list
      2. split mfc_list into mfc.list.NNNN chunks with unix split and build
         one HVite command per chunk
      3. run the commands in-process when model.local == 1, otherwise write
         them to hvite.commands and hand that to util.run_parallel
      4. merge the per-chunk outputs into new_mlf with HLEd, applying the
         silence merge rules written to merge_sp_sil.led
      5. rewrite mfc_list keeping only utterances that have a .lab entry in
         new_mlf
      6. delete the chunk files and per-chunk outputs

    model        -- object providing setup_length, local, jobs, verbose and
                    logfh
    root_dir     -- parent directory of the Align working directory
    mfc_list     -- path of the MFC list; rewritten in place (see step 5)
    model_dir    -- directory containing the MMF passed to HVite -H
    word_mlf     -- word-level MLF passed to HVite -I
    new_mlf      -- path of the merged MLF that HLEd writes
    model_list   -- model list path passed to HVite
    dict         -- dictionary path (the name shadows the builtin but is
                    part of the public signature)
    align_config -- config file passed to HVite -C

    Returns the Align working directory path.
    """
    output_dir = '%s/Align' % root_dir
    util.create_new_dir(output_dir)
    # '//' keeps the chunk size an integer regardless of the interpreter's
    # division semantics.
    utts_per_split = max(100, (1 + (model.setup_length // 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(split_file, out_file):
        """Build the HVite command line for one chunk of the MFC list."""
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % out_file
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % split_file
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % out_file
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands
    cmds = []
    outputs = []
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    for split_file in inputs:
        out_file = split_file.replace('mfc.list', 'align.output')
        outputs.append(out_file)
        cmds.append(hvite(split_file, out_file))

    if model.local == 1:
        for cmd in cmds:
            # Single-argument print() behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (
        new_mlf, merge_sil, ' '.join(outputs), output_dir)
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    bad_count = 0
    # Raw string: the backslash has to reach grep unchanged.
    label_lines = os.popen(r'grep "\.lab" %s' % new_mlf).read().splitlines()
    mlf_labels = set(os.path.basename(s).split('.')[0] for s in label_lines)

    with open(mfc_list) as fh:
        mfc_labels = fh.read().splitlines()
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_labels:
            utt_id = os.path.basename(mfc).split('.')[0]
            ## Check for missing transcriptions
            if utt_id not in mlf_labels:
                if model.verbose > 0:
                    util.log_write(model.logfh,
                                   'removed bad alignment [%s]' % utt_id)
                bad_count += 1
            else:
                fh.write(mfc + '\n')
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*'
              % (output_dir, output_dir))
    return output_dir
def tie_states_search(model, output_dir, model_dir, mono_list, tri_list,
                      tied_list):
    """
    Tie HMM states using decision tree clustering

    HHEd is run repeatedly with different TB thresholds, bisecting between
    100.0 and 10000.0, until the number of "<MEAN>" entries in
    output_dir/MMF is within 1% of model.triphone_states. The search also
    stops when the threshold changes by no more than 0.01 between
    iterations, and exits the process after more than 50 attempts.

    model      -- object providing exp, dt_ro, dt_tb, tree_questions,
                  states, triphone_states, local and logfh
    output_dir -- directory (re)created via util.create_new_dir; receives
                  tree.hed, trees, the HHEd log and the clustered MMF
    model_dir  -- directory holding the input MMF and stats file
    mono_list  -- path of the monophone list, one phone per line
    tri_list   -- path of the triphone list handed to HHEd
    tied_list  -- path of the tied list that HHEd writes (CO command)

    Also writes model.exp/all_tri.list with every triphone built from the
    phones in mono_list other than 'sp' and 'sil'.

    Returns output_dir.
    """
    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' % output_dir
    tree_output = '%s/trees' % output_dir
    hhed_log = '%s/hhed_cluster.log' % output_dir
    all_tri_list = '%s/all_tri.list' % model.exp

    ## Decision tree parameters
    ro = model.dt_ro
    tb = model.dt_tb
    tb_min = 100.0
    tb_max = 10000.0

    ## Create the full list of possible triphones
    with open(mono_list) as fh:
        phones = fh.read().splitlines()
    non_sp_phones = [p for p in phones if p not in ('sp', 'sil')]

    with open(all_tri_list, 'w') as fh:
        fh.write('sp\n')
        fh.write('sil\n')
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' % p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' % (p1, p2))
                fh.write('%s-%s+sil\n' % (p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' % (p2, p1, p3))

    # The question set does not change between iterations; read it once.
    with open(model.tree_questions) as fh:
        questions = fh.read()

    # All states except the first and the last one.
    inner_states = range(2, model.states)
    goal = model.triphone_states

    ## Search over tb arguments to get the right number states
    num_states = 0
    attempts = 0
    prev_tb = 0
    while True:
        os.system('rm -f %s %s %s' % (tree_hed, tree_output, hhed_log))

        ## Set up decision tree clustering
        with open(tree_hed, 'w') as fh:
            fh.write('RO %d %s/stats\n' % (ro, model_dir))
            fh.write('TR 0\n')
            fh.write('%s\n' % questions)
            fh.write('TR 12\n')
            for p in non_sp_phones:
                for s in inner_states:
                    # The threshold is written as a float: formatting it
                    # with %d would truncate the bisection value while the
                    # convergence test below works at 0.01 resolution.
                    fh.write(
                        'TB %.2f "ST_%s_%d_" '
                        '{(%s,*-%s+*,%s+*,*-%s).state[%d]}\n'
                        % (tb, p, s, p, p, p, p, s))
            fh.write('TR 1\n')
            fh.write('AU "%s"\n' % all_tri_list)
            fh.write('CO "%s"\n' % tied_list)
            fh.write('ST "%s"\n' % tree_output)

        ## Use HHEd to cluster
        cmd = 'HHEd -A -T 1 -H %s/MMF' % model_dir
        cmd += ' -M %s' % output_dir
        cmd += ' %s %s > %s' % (tree_hed, tri_list, hhed_log)
        if model.local == 1:
            os.system(cmd)
        else:
            util.run(cmd, output_dir)

        num_states = int(
            os.popen('grep -c "<MEAN>" %s/MMF' % output_dir).read().strip())

        if abs(float(num_states - goal) / goal) <= 0.01:
            util.log_write(
                model.logfh,
                ' current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        if abs(prev_tb - tb) <= 0.01:
            util.log_write(
                model.logfh,
                ' Could not converge. Stopping. '
                'Current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        attempts += 1
        prev_tb = tb
        # Too few states: move the threshold towards tb_min; too many:
        # move it towards tb_max.
        if num_states < goal:
            tb = (tb_min + tb) / 2.0
            tb_max = prev_tb
        else:
            tb = (tb_max + tb) / 2.0
            tb_min = prev_tb
        util.log_write(
            model.logfh,
            ' [%d] goal [%d] current states [%d] tb [%1.2f] -> [%1.2f] '
            '[%1.1f %1.1f]'
            % (attempts, goal, num_states, prev_tb, tb, tb_min, tb_max))

        if attempts > 50:
            util.log_write(model.logfh,
                           ' Goal not reached after 50 tries. Exiting.')
            # Non-zero status so callers and shell wrappers see the failure.
            sys.exit(1)

    return output_dir
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf,
          model_list, dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    Steps:
      1. (re)create root_dir/Align via util.create_new_dir and copy
         mfc_list to Align/mfc_old.list
      2. split mfc_list into mfc.list.NNNN chunks with unix split and build
         one HVite command per chunk
      3. run the commands in-process when model.local == 1, otherwise write
         them to hvite.commands and hand that to util.run_parallel
      4. merge the per-chunk outputs into new_mlf with HLEd, applying the
         silence merge rules written to merge_sp_sil.led
      5. rewrite mfc_list keeping only utterances that have a .lab entry in
         new_mlf
      6. delete the chunk files and per-chunk outputs

    model        -- object providing setup_length, local, jobs, verbose and
                    logfh
    root_dir     -- parent directory of the Align working directory
    mfc_list     -- path of the MFC list; rewritten in place (see step 5)
    model_dir    -- directory containing the MMF passed to HVite -H
    word_mlf     -- word-level MLF passed to HVite -I
    new_mlf      -- path of the merged MLF that HLEd writes
    model_list   -- model list path passed to HVite
    dict         -- dictionary path (the name shadows the builtin but is
                    part of the public signature)
    align_config -- config file passed to HVite -C

    Returns the Align working directory path.
    """
    output_dir = '%s/Align' % root_dir
    util.create_new_dir(output_dir)
    # '//' keeps the chunk size an integer regardless of the interpreter's
    # division semantics.
    utts_per_split = max(100, (1 + (model.setup_length // 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(split_file, out_file):
        """Build the HVite command line for one chunk of the MFC list."""
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % out_file
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % split_file
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % out_file
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands
    cmds = []
    outputs = []
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    for split_file in inputs:
        out_file = split_file.replace('mfc.list', 'align.output')
        outputs.append(out_file)
        cmds.append(hvite(split_file, out_file))

    if model.local == 1:
        for cmd in cmds:
            # Single-argument print() behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (
        new_mlf, merge_sil, ' '.join(outputs), output_dir)
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    bad_count = 0
    # Raw string: the backslash has to reach grep unchanged.
    label_lines = os.popen(r'grep "\.lab" %s' % new_mlf).read().splitlines()
    mlf_labels = set(os.path.basename(s).split('.')[0] for s in label_lines)

    with open(mfc_list) as fh:
        mfc_labels = fh.read().splitlines()
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_labels:
            utt_id = os.path.basename(mfc).split('.')[0]
            ## Check for missing transcriptions
            if utt_id not in mlf_labels:
                if model.verbose > 0:
                    util.log_write(model.logfh,
                                   'removed bad alignment [%s]' % utt_id)
                bad_count += 1
            else:
                fh.write(mfc + '\n')
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*'
              % (output_dir, output_dir))
    return output_dir
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict,
                       model_list, gold_mlf):
    """
    Decode the utterances in mfc_list to lattices with HDecode and score
    the recognition output with HResults.

    Steps:
      1. write hdecode.config into output_dir
      2. split mfc_list with SplitList and build one HDecode command per
         split; run them in-process when model.local == 1, otherwise write
         them to hdecode.commands and hand that to util.run_parallel
      3. copy mfc_list to output_dir/mfc_old.list, then rewrite mfc_list
         keeping only utterances whose id matches a .lat file returned by
         util.get_files
      4. concatenate every .rec file returned by util.get_files into
         output_dir/train_recog.mlf (lines containing <s> or </s> dropped)
      5. run HResults against gold_mlf into output_dir/results.log and
         print that log

    model      -- object providing verbose, local, jobs and logfh
    output_dir -- directory for the config, per-split outputs, MLF and logs
    model_dir  -- directory containing the MMF passed to HDecode -H
    mfc_list   -- path of the MFC list; rewritten in place (see step 3)
    lm         -- language model path passed to HDecode -w
    dict       -- dictionary path (the name shadows the builtin but is part
                  of the public signature)
    model_list -- model list path passed to HDecode and HResults
    gold_mlf   -- reference MLF passed to HResults -I

    Returns None.
    """
    sys.stderr.write('Decoding to lattices\n')

    output_mlf = '%s/train_recog.mlf' % output_dir
    results_log = '%s/results.log' % output_dir

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = [
        'HNETFILTER = "gunzip -c < $.gz"',
        'HNETOFILTER = "gzip -c > $.gz"',
        'RAWMITFORMAT = T',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z',
        'GCFREQ = 50',
        'HLAT:TRACE = 19',
        'HLVNET:TRACE = 1',
        'HLVREC:TRACE = 1',
        'HLVLM:TRACE = 1',
        'LATPRUNEBEAM = 500.0',
        'MAXLMLA = 3.0',
        'BUILDLATSENTEND = T',
        'FORCELATOUT = F',
        'STARTWORD = <s>',
        'ENDWORD = </s>',
    ]
    with open(hdecode_config, 'w') as fh:
        fh.write('\n'.join(config_lines) + '\n')

    ## HDecode parameters
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(split_file, lat_dir):
        """Build the HDecode command line for one split of the MFC list."""
        cmd = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w %s' % lm
        cmd += ' -S %s' % split_file
        cmd += ' -l %s/' % lat_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_file))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        lat_dir = '%s/%s' % (output_dir, split_mfc.get_key(split_file))
        if not os.path.isdir(lat_dir):
            os.makedirs(lat_dir)
        cmds.append(hdecode(split_file, lat_dir))

    if model.local == 1:
        for cmd in cmds:
            # Single-argument print() behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    # A set keeps the per-utterance membership test O(1).
    lat_ids = set(os.path.basename(f).split('.')[0]
                  for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh:
        with open(mfc_list, 'w') as new_fh:
            for mfc in old_fh:
                utt_id = os.path.basename(mfc.strip()).split('.')[0]
                ## Check for missing transcriptions
                if utt_id not in lat_ids:
                    if model.verbose > 1:
                        util.log_write(model.logfh,
                                       'removed bad lat [%s]' % utt_id)
                    bad_count += 1
                else:
                    new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)

    ## Create an MLF from the recognition output
    outputs = util.get_files(output_dir, r'.*\.rec')
    # Remove any stale MLF synchronously (the equivalent of 'rm -f') so the
    # removal cannot race with the open() that follows.
    try:
        os.remove(output_mlf)
    except OSError:
        pass
    with open(output_mlf, 'w') as mlf_fh:
        mlf_fh.write('#!MLF!#\n')
        for rec_file in outputs:
            mlf_fh.write('"%s"\n' % rec_file)
            with open(rec_file) as rec_fh:
                for line in rec_fh:
                    if '<s>' in line or '</s>' in line:
                        continue
                    mlf_fh.write(line)
            mlf_fh.write('.\n')

    ## Evaluate
    cmd = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    with open(results_log) as fh:
        print(fh.read())
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list,
                       lm, dict, model_list):
    """
    Run HDecode.mod over existing lattices to produce new lattices in
    output_dir.

    Steps:
      1. write hdecode.config into output_dir
      2. split mfc_list with SplitList and build one HDecode.mod command per
         split; each reads lattices from lattice_dir/<key>/ (falling back to
         the key with underscores removed when that directory is missing)
         and writes to output_dir/<key>/
      3. run the commands in-process when model.local == 1, otherwise write
         them to hdecode_mod.commands and hand that to util.run_parallel
      4. copy mfc_list to output_dir/mfc_old.list, then rewrite mfc_list
         keeping only utterances whose id matches a .lat file returned by
         util.get_files for output_dir

    model       -- object providing verbose, local, jobs and logfh
    lattice_dir -- directory holding the input lattices, one sub-directory
                   per split key
    output_dir  -- directory for the config, new lattices and logs
    model_dir   -- directory containing the MMF passed to HDecode.mod -H
    mfc_list    -- path of the MFC list; rewritten in place (see step 4)
    lm          -- accepted for interface compatibility; not placed on the
                   command line (-w is given without an argument)
    dict        -- dictionary path (the name shadows the builtin but is
                   part of the public signature)
    model_list  -- model list path passed to HDecode.mod

    Returns None.
    """
    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = [
        'HNETFILTER = "gunzip -c < $.gz"',
        'HNETOFILTER = "gzip -c > $.gz"',
        'RAWMITFORMAT = T',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z',
        'GCFREQ = 50',
        'HLAT:TRACE = 19',
        'HLVNET:TRACE = 1',
        'HLVREC:TRACE = 1',
        'HLVLM:TRACE = 1',
        'LATPRUNEBEAM = 500.0',
        'MAXLMLA = 3.0',
        'BUILDLATSENTEND = T',
        'FORCELATOUT = F',
        'STARTWORD = <s>',
        'ENDWORD = </s>',
    ]
    with open(hdecode_config, 'w') as fh:
        fh.write('\n'.join(config_lines) + '\n')

    ## HDecode parameters
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(split_file, key):
        """Build the HDecode.mod command line for one split."""
        input_dir = '%s/%s/' % (lattice_dir, key)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' % (lattice_dir, key.replace('_', ''))
        cmd = ('HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s'
               % hdecode_config)
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f' % beam
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        # -w is passed bare; the lm argument is deliberately left off.
        cmd += ' -w'
        cmd += ' -S %s' % split_file
        cmd += ' -l %s/%s/' % (output_dir, key)
        cmd += ' -L %s' % input_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_file))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    for split_file in split_mfc.get_files():
        key = split_mfc.get_key(split_file)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hdecode_mod(split_file, key))

    if model.local == 1:
        for cmd in cmds:
            # Single-argument print() behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode_mod.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    # A set keeps the per-utterance membership test O(1).
    lat_ids = set(os.path.basename(f).split('.')[0]
                  for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh:
        with open(mfc_list, 'w') as new_fh:
            for mfc in old_fh:
                utt_id = os.path.basename(mfc.strip()).split('.')[0]
                ## Check for missing transcriptions
                if utt_id not in lat_ids:
                    if model.verbose > 1:
                        util.log_write(model.logfh,
                                       'removed bad lat [%s]' % utt_id)
                    bad_count += 1
                else:
                    new_fh.write(mfc)
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)