Ejemplo n.º 1
0
def lattice_rescore(log_id, lat_dir, lat_dir_out, lm, lm_scale):
    global num_tasks

    clean_split_dir(lat_dir_out)

    rescore = ["lattice-tool"]

    lattice_scp = lat_dir+'/lattices.scp'

    with open(lattice_scp, 'w') as lattice_scp_file:
        for lattice_file in glob.iglob(lat_dir + '/*.lat.gz'):
            print >> lattice_scp_file, lattice_file

    max_tasks = split_file(lattice_scp, num_tasks)

    rescore.extend(["-order", '10',
                    '-read-htk',
                    '-htk-lmscale', lm_scale,
                    '-in-lattice-list', lattice_scp+'.part.%t',
                    '-lm', lm,
                    '-out-lattice-dir', lat_dir_out+'.part.%t',
                    '-write-htk',
                    '-debug', '1'])

    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job([str(part) for part in rescore], {'numtasks': max_tasks,
                                    'ostream': ostream,
                                    'estream': estream,
                                    'timelimit': '00:15:00'})

    merge_split_dir(lat_dir_out)
    clean_split_file(lattice_scp)
Ejemplo n.º 2
0
def HDecode(log_id,  scp_file, model_dir, dict, phones_list, language_model,  label_dir, num_tokens, out_mlf, configs, lm_scale, beam, end_beam, max_pruning, adapt_dirs = None, num_speaker_chars = 3):
    global num_tasks, extra_HTK_options

    max_tasks = split_file(scp_file, num_tasks)

    HDecode = ["HDecode"]
    HDecode.extend(extra_HTK_options)

    for config in configs:
        HDecode.extend(['-C', config])

    if adapt_dirs is not None:
        for source_dir, extension in adapt_dirs:
            if extension is None:
                HDecode.extend(['-J', source_dir])
            else:
                HDecode.extend(['-J', source_dir, extension])

    if adapt_dirs is not None and len(adapt_dirs) > 0:
        HDecode.extend(['-m'])
        pattern = '*.%%%'
        if num_speaker_chars > 0:
            pattern = "*/" + ('%' * num_speaker_chars) + "*.*"
        HDecode.extend(["-h", pattern])
        
    HDecode.extend(['-S', scp_file+ ".part.%t",
                '-H', model_dir + "/macros",
                '-H', model_dir + "/hmmdefs",
                '-z', 'lat',
                '-o', 'ST',
                '-i', out_mlf+'.part.%t',
                '-l', label_dir,
                '-w', language_model,
                '-n', num_tokens,
                '-s', "{0:.1f}".format(lm_scale),
                '-t', "{0:.1f}".format(beam),
                '-v', "{0:.1f}".format(end_beam),
                '-u', max_pruning,
                '-p', '0.0',
                dict,
                phones_list])

    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job([str(part) for part in HDecode], {'numtasks': min(max_tasks, num_tasks),
                                    'ostream': ostream,
                                    'estream': estream,
                                    'memlimit': '1200',
                                    'timelimit': '04:00:00'} )

    merge_mlf_files(out_mlf)
    # remove splitted scp files
    clean_split_file(scp_file)
Ejemplo n.º 3
0
def recode_audio(log_id, scp_file, data_set, amr):
    global num_tasks

    split_file(scp_file, num_tasks)
    recode_audio = ["recode_audio.py"]
    recode_audio.extend(["-s", data_set,
                         "-S", scp_file+ ".part.%t"])
    if amr:
        recode_audio.append('-a')
        
    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job(recode_audio, {'numtasks': num_tasks,
                                    'ostream': ostream,
                                    'estream': estream})

    clean_split_file(scp_file)
Ejemplo n.º 4
0
def HCopy(log_id, scp_file, config):
    global num_tasks, extra_HTK_options
    
    split_file(scp_file, num_tasks)
    
    HCopy = ["HCopy"]
    HCopy.extend(extra_HTK_options)
    HCopy.extend(["-C", config,
                "-S", scp_file+ ".part.%t"])
    
    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job(HCopy, {'numtasks': num_tasks,
                                    'ostream': ostream,
                                    'estream': estream})    
                                    
    clean_split_file(scp_file)
Ejemplo n.º 5
0
def HVite(log_id, scp_file, hmm_dir, dict, phones_list, word_transcriptions, new_transcriptions, ext = 'lab', config = None, pruning = None):
    global num_tasks, extra_HTK_options, default_config_file, default_HERest_pruning
    
    if config is None: config = default_config_file
    if pruning is None: pruning = default_HERest_pruning

    tasks = max(1,int(num_tasks / 10))
    # divide scp files over HVite tasks
    max_tasks = split_file(scp_file, tasks)
    
    HVite = ["HVite"]
    HVite.extend(extra_HTK_options)

    HVite.extend(["-S", scp_file+ ".part.%t",
                    "-i", new_transcriptions + ".part.%t", 
                    "-l", '*',
                    "-C", config,
                    "-o", "ST",
                    "-a",
                    "-H", hmm_dir + "/macros",
                    "-H", hmm_dir + "/hmmdefs",
                    "-m",
                    "-y", "lab",
                    "-X", ext,
                    "-I", word_transcriptions])
    
    HVite.extend(["-t"])
    HVite.extend(pruning)
    
    HVite.extend([dict,
                phones_list])
    
    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job(HVite, {'numtasks': max_tasks,
                                    'ostream': ostream,
                                    'estream': estream})
                                    
    with open(new_transcriptions, 'w') as mlffile:
        print >> mlffile, '#!MLF!#'
        for file in glob.glob(new_transcriptions+".part.*"):
            for line in open(file):
                if not line.startswith('#!MLF!#'): 
                    print >> mlffile, line.rstrip()
            os.remove(file)
    
    clean_split_file(scp_file)
Ejemplo n.º 6
0
def HLEd(log_id, input_transcriptions, led_file, selector, phones_list, output_transcriptions, dict = None):
    global num_tasks, extra_HTK_options
    HLEd = ["HLEd"]
    HLEd.extend(extra_HTK_options)
                    
    if dict:
        HLEd.extend(["-d", dict])
        
    HLEd.extend(["-n", phones_list,
                "-l", selector,
                "-i", output_transcriptions,
                led_file,
                input_transcriptions])

    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job(HLEd, {'numtasks': 1,
                                    'ostream': ostream,
                                    'estream': estream})
Ejemplo n.º 7
0
def HCompV(log_id, scpfile, target_hmm_dir, protofile, min_variance, config = None):
    global num_tasks, extra_HTK_options, default_config_file
    
    if config is None: config = default_config_file
    
    HCompV = ["HCompV"]
    HCompV.extend(extra_HTK_options)
    HCompV.extend(["-C", config,
                "-f", min_variance,
                "-m",
                "-S", scpfile,
                "-M", target_hmm_dir,
                protofile])
    
    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job(HCompV, {'numtasks': 1,
                                    'ostream': ostream,
                                    'estream': estream,
                                    'timelimit': '04:00:00'})            
Ejemplo n.º 8
0
def lattice_decode(log_id ,lat_dir, out_mlf, lm_scale):
    global num_tasks

    decode = ["lattice-tool"]

    lattice_scp = lat_dir+'/lattices.scp'

    with open(lattice_scp, 'w') as lattice_scp_file:
        for lattice_file in glob.iglob(lat_dir + '/*.lat.gz'):
            print >> lattice_scp_file, lattice_file

    max_tasks = split_file(lattice_scp, max(1,int(num_tasks/10)))

    decode.extend(['-read-htk',
                    '-htk-lmscale', lm_scale,
                    '-in-lattice-list', lattice_scp+'.part.%t',
                    '-viterbi-decode'])

    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job([str(part) for part in decode], {'numtasks': max_tasks,
                                    'ostream': ostream,
                                    'estream': estream,
                                    'timelimit': '00:15:00'})

    with open(out_mlf, 'w') as out_mlf_file:
        print >> out_mlf_file, "#!MLF!#"
        for file in glob.iglob(ostream.replace('%c', 'lattice-tool').replace('%t', '*').replace('%j', '*')):
            if file.endswith('parent'):
                continue
            for line in open(file):
                if line.startswith('#!MLF!#'):
                    continue
                else:
                    filename, transcription = line.split(None, 1)
                    transcription = transcription.replace(r'\344', u'ä'.encode('iso-8859-1')).replace(r'\366', u'ö'.encode('iso-8859-1'))
                    print >> out_mlf_file, '"*/%s.rec"' % os.path.splitext(os.path.basename(filename))[0]
                    transcription = transcription.rstrip().lstrip()
                    for word in transcription.split():
                        print >> out_mlf_file, word
                    print >> out_mlf_file, "."

    clean_split_file(lattice_scp)
Ejemplo n.º 9
0
def HHEd(log_id, source_hmm_dir, target_hmm_dir, hed, phones_list, w_flag = None):
    global extra_HTK_options
    
    HHEd = ["HHEd"]
    HHEd.extend(extra_HTK_options)
    HHEd.extend(["-H", source_hmm_dir + "/macros",
                "-H", source_hmm_dir + "/hmmdefs",
                "-M", target_hmm_dir])

    if w_flag is not None:
        HHEd.extend(['-w', w_flag])


    HHEd.extend([hed,
                phones_list])
    
    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job(HHEd, {'numtasks': 1,
                                    'ostream': ostream,
                                    'estream': estream})
Ejemplo n.º 10
0
def cdgen(log_id, monophones, tiedlist, mmf, outfsm):
    cdgen = ["cdgen"]

    cdgen.extend(['-monoListFName', monophones,
                  '-silMonophone', 'sil',
                  '-pauseMonophone', 'sp',
                  '-tiedListFName', tiedlist,
                  '-htkModelsFName', mmf,
                  '-cdSepChars', '-+',
                  '-cdType', 'xwrdtri',
                  '-fsmFName', '%s.fsm' % outfsm,
                  '-inSymsFName', '%s.insyms' % outfsm,
                  '-outSymsFName', '%s.outsyms' % outfsm,
                  ])

    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job([str(part) for part in cdgen], {'numtasks': 1,
                                    'ostream': ostream,
                                    'estream': estream,
                                    })
Ejemplo n.º 11
0
def lexgen(log_id, monophones, dict_hvite, outfsm):

    lexgen = ["lexgen"]

    lexgen.extend(['-lexFName', dict_hvite,
                   '-sentStartWord', '<s>',
                   '-sentEndWord', '</s>',
                   '-monoListFName', monophones,
                   '-silMonophone', 'sil',
                   '-pauseMonophone', 'sp',
                   '-fsmFName', '%s.fsm' % outfsm,
                   '-inSymsFName', '%s.insyms' % outfsm,
                   '-outSymsFName', '%s.outsyms' % outfsm,
                   '-addPronunsWithEndSil',
                   '-addPronunsWithEndPause',
                   '-addPhiLoop',
                   '-outputAuxPhones'
                   ])

    ostream, estream = _get_output_stream_names(log_id)
    job_runner.submit_job([str(part) for part in lexgen], {'numtasks': 1,
                                    'ostream': ostream,
                                    'estream': estream,
                                    })
Ejemplo n.º 12
0
def HERest(log_id, scp_file, source_hmm_dir, target_hmm_dir, phones_list, transcriptions, stats = False, config = None, transform_dir = None, num_pattern_chars = 3, pruning = None, binary = True):
    global num_tasks, extra_HTK_options, default_config_file, default_HERest_pruning
    
    if config is None: config = default_config_file
    if pruning is None: pruning = default_HERest_pruning
    
    # divide scp files over HERest tasks
    keep_together = False
    if transform_dir is not None:
        keep_together = True
    max_tasks = split_file(scp_file, num_tasks, keep_together, num_pattern_chars)

    HERest = ["HERest"]
    HERest.extend(extra_HTK_options)


    if type(config).__name__=='list':
        for c in config:
            HERest.extend(["-C", c])
    else:
        HERest.extend(["-C", config])

    HERest.extend(["-I", transcriptions,
                    "-H", source_hmm_dir + "/macros",
                    "-H", source_hmm_dir + "/hmmdefs",
                    "-M", target_hmm_dir])

    if transform_dir is not None:
        pattern = "*/" + ('%' * num_pattern_chars) + "*.*"
        HERest.extend(["-J", transform_dir, 'cmllr',
                       "-J", transform_dir,
                    "-h", pattern,
                    '-a'])
    
    HERest.extend(["-t"])
    HERest.extend(pruning)
    
    # copy merge_command now because the last options are different
    HERest_merge = list(HERest)
    
    HERest.extend(["-S", scp_file+ ".part.%t",
                    "-p", "%t",
                    phones_list])
    
    for file in glob.glob(target_hmm_dir+"/*.acc"): os.remove(file)

    ostream, estream = _get_output_stream_names(log_id)
    
    job_runner.submit_job(HERest, {'numtasks': max_tasks,
                                    'ostream': ostream,
                                    'estream': estream,
                                    'priority': 1} )

    if len(glob.glob(target_hmm_dir+"/*.acc")) != max_tasks:
        sys.exit("At least one acc file missing")
    
    if stats:
        HERest_merge.extend(["-s", target_hmm_dir + "/stats", "-w", "1.1"])

    if binary:
        HERest_merge.extend(["-B"])
        
    HERest_merge.extend(["-p", str(0),
                        phones_list])
    HERest_merge.extend(glob.glob(target_hmm_dir+"/*.acc"))
    
    job_runner.submit_job(HERest_merge,  {'numtasks': 1,
                                    'ostream': ostream,
                                    'estream': estream})

    # remove acc files
    for file in glob.glob(target_hmm_dir+"/*.acc"): os.remove(file)
    
    # remove splitted scp files
    clean_split_file(scp_file)
Ejemplo n.º 13
0
def HERest_estimate_transform(log_id, scp_file, source_hmm_dir, target_dir, phones_list, transcriptions,  max_adap_sentences = None, config = [], num_chars = 3, target_extension = 'cmllr', input_transform_dirs = [], use_parent = False, parent_transform_dirs = [], pruning = None, min_mix_weigth = 0.1, prune_treshold = 20.0):
    global num_tasks, extra_HTK_options, default_config_file, default_HERest_pruning

    if config is None: config = default_config_file
    if pruning is None: pruning = default_HERest_pruning

    # divide scp files over HERest tasks
    max_tasks = split_file(scp_file, num_tasks, True, num_chars)

    HERest = ["HERest"]
    HERest.extend(extra_HTK_options)

    if use_parent:
        HERest.extend(["-a"])    

    if type(config).__name__=='list':
        for c in config:
            HERest.extend(["-C", c])
    else:
        HERest.extend(["-C", config])

    pattern = '*.%%%'
    if num_chars > 0:
        pattern = "*/" + ('%' * num_chars) + "*.*"

    HERest.extend(["-h", pattern,
                    "-I", transcriptions,
                    "-H", source_hmm_dir + "/macros",
                    "-H", source_hmm_dir + "/hmmdefs",
                    "-K", target_dir, target_extension,
                    "-S", scp_file+ ".part.%t",
                    "-w", str(min_mix_weigth),
                    "-m", '0',
                    "-u", "a",
                    "-c", str(prune_treshold)])

#                        "-M", target_dir,
    if max_adap_sentences is not None:
        HERest.extend(['-l', max_adap_sentences])
    for source_dir, extension in input_transform_dirs:
        if extension is None:
            HERest.extend(['-J', source_dir])
        else:
            HERest.extend(['-J', source_dir, extension])

    for source_dir, extension in parent_transform_dirs:
        if extension is None:
            HERest.extend(['-E', source_dir])
        else:
            HERest.extend(['-E', source_dir, extension])       



    HERest.extend(["-t"])
    HERest.extend(pruning)

    HERest.extend([phones_list])

    ostream, estream = _get_output_stream_names(log_id)

    job_runner.submit_job(HERest, {'numtasks': min(max_tasks, num_tasks),
                                    'ostream': ostream,
                                    'estream': estream} )

    # remove splitted scp files
    clean_split_file(scp_file)